From e08afeb7e69f45e4ab9fbb8530fe433484b96606 Mon Sep 17 00:00:00 2001
From: Brian King
Date: Tue, 23 Jun 2009 17:14:01 -0500
Subject: [PATCH 01/58] [SCSI] ibmvscsi: Fix module load hang

Fixes a regression seen in the ibmvscsi driver when using the VSCSI
server in SLES 9 and SLES 10. The VSCSI server in these releases has a
bug in which it does not send responses to unknown MADs. Check the OS
Type field in the adapter info response and do not send these
unsupported commands when talking to an older server.

Signed-off-by: Brian King
Signed-off-by: James Bottomley
---
 drivers/scsi/ibmvscsi/ibmvscsi.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 869a11bdccbd..9928704e235f 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -1095,9 +1095,14 @@ static void adapter_info_rsp(struct srp_event_struct *evt_struct)
 			       MAX_INDIRECT_BUFS);
 			hostdata->host->sg_tablesize = MAX_INDIRECT_BUFS;
 		}
+
+		if (hostdata->madapter_info.os_type == 3) {
+			enable_fast_fail(hostdata);
+			return;
+		}
 	}
 
-	enable_fast_fail(hostdata);
+	send_srp_login(hostdata);
 }
 
 /**

From 87a2d34b0372dcf6bc4caf4d97a7889f5e62a1af Mon Sep 17 00:00:00 2001
From: Roel Kluin
Date: Tue, 23 Jun 2009 01:06:40 +0200
Subject: [PATCH 02/58] [SCSI] fnic: remove redundant BUG_ONs and fix checks
 on unsigned

The shost sg tablesize is set to FNIC_MAX_SG_DESC_CNT and fnic uses
scsi_dma_map, so both BUG_ONs can be removed.

scsi_dma_map() may return -ENOMEM, so sg_count should be an int to
catch that.

Signed-off-by: Roel Kluin
Signed-off-by: James Bottomley
---
 drivers/scsi/fnic/fnic_scsi.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index eabf36502856..bfc996971b81 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -245,7 +245,7 @@ static inline int fnic_queue_wq_copy_desc(struct fnic *fnic,
					  struct vnic_wq_copy *wq,
					  struct fnic_io_req *io_req,
					  struct scsi_cmnd *sc,
-					  u32 sg_count)
+					  int sg_count)
 {
	struct scatterlist *sg;
	struct fc_rport *rport = starget_to_rport(scsi_target(sc->device));
@@ -260,9 +260,6 @@ static inline int fnic_queue_wq_copy_desc(struct fnic *fnic,
	char msg[2];
 
	if (sg_count) {
-		BUG_ON(sg_count < 0);
-		BUG_ON(sg_count > FNIC_MAX_SG_DESC_CNT);
-
		/* For each SGE, create a device desc entry */
		desc = io_req->sgl_list;
		for_each_sg(scsi_sglist(sc), sg, sg_count, i) {
@@ -344,7 +341,7 @@ int fnic_queuecommand(struct scsi_cmnd *sc, void (*done)(struct scsi_cmnd *))
	struct fnic *fnic;
	struct vnic_wq_copy *wq;
	int ret;
-	u32 sg_count;
+	int sg_count;
	unsigned long flags;
	unsigned long ptr;
 
From e3f47cc74bddea8121560026185ede4770170043 Mon Sep 17 00:00:00 2001
From: Abhijeet Joglekar
Date: Wed, 24 Jun 2009 07:42:25 -0700
Subject: [PATCH 03/58] [SCSI] fnic: use DMA_BIT_MASK(nn) instead of
 deprecated DMA_nnBIT_MASK

Robert Love reported a warning while building fnic_main.c:

drivers/scsi/fnic/fnic_main.c:478: warning: `DMA_nnBIT_MASK' is deprecated.
Replaced use of DMA_nnBIT_MASK by DMA_BIT_MASK(nn).

Signed-off-by: Abhijeet Joglekar
Signed-off-by: James Bottomley
---
 drivers/scsi/fnic/fnic_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/fnic/fnic_main.c b/drivers/scsi/fnic/fnic_main.c
index a84072865fc2..2c266c01dc5a 100644
--- a/drivers/scsi/fnic/fnic_main.c
+++ b/drivers/scsi/fnic/fnic_main.c
@@ -473,16 +473,16 @@ static int __devinit fnic_probe(struct pci_dev *pdev,
	 * limitation for the device.  Try 40-bit first, and
	 * fail to 32-bit.
	 */
-	err = pci_set_dma_mask(pdev, DMA_40BIT_MASK);
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(40));
	if (err) {
-		err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
		if (err) {
			shost_printk(KERN_ERR, fnic->lport->host,
				     "No usable DMA configuration "
				     "aborting\n");
			goto err_out_release_regions;
		}
-		err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
		if (err) {
			shost_printk(KERN_ERR, fnic->lport->host,
				     "Unable to obtain 32-bit DMA "
@@ -490,7 +490,7 @@ static int __devinit fnic_probe(struct pci_dev *pdev,
			goto err_out_release_regions;
		}
	} else {
-		err = pci_set_consistent_dma_mask(pdev, DMA_40BIT_MASK);
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(40));
		if (err) {
			shost_printk(KERN_ERR, fnic->lport->host,
				     "Unable to obtain 40-bit DMA "

From d3a263a8168f78874254ea9da9595cfb0f3e96d7 Mon Sep 17 00:00:00 2001
From: James Bottomley
Date: Wed, 24 Jun 2009 19:55:22 +0000
Subject: [PATCH 04/58] [SCSI] zalon: fix oops on attach failure

I recently discovered on my zalon that if the attachment fails because
of a bus misconfiguration (I scrapped my HVD array, so the card is now
unterminated) then the system oopses. The reason is that if
ncr_attach() returns NULL (signalling failure), that NULL is passed by
the failure goto straight into ncr_detach(), which oopses. The fix is
just to return -ENODEV in this case.
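
As an illustration (not part of this patch), the resulting probe shape
follows the usual convention: fail with a negative errno while nothing
has been claimed yet, and use goto-based teardown only once there is
something to tear down. Sketch only; release_host() stands in for the
driver's real teardown:

	static int example_probe(struct parisc_device *dev)
	{
		struct Scsi_Host *host;

		host = ncr_attach(&zalon7xx_template, unit, &device);
		if (!host)
			return -ENODEV;	/* nothing attached: no teardown */

		if (request_irq(dev->irq, ncr53c8xx_intr, IRQF_SHARED,
				"zalon", host))
			goto fail;	/* host exists: unwind safely */

		return 0;

	fail:
		release_host(host);	/* hypothetical teardown helper */
		return -ENODEV;
	}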
Cc: Stable Tree
Signed-off-by: James Bottomley
---
 drivers/scsi/zalon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/zalon.c b/drivers/scsi/zalon.c
index 97f3158fa7b5..27e84e4b1fa9 100644
--- a/drivers/scsi/zalon.c
+++ b/drivers/scsi/zalon.c
@@ -134,7 +134,7 @@ zalon_probe(struct parisc_device *dev)
 
	host = ncr_attach(&zalon7xx_template, unit, &device);
	if (!host)
-		goto fail;
+		return -ENODEV;
 
	if (request_irq(dev->irq, ncr53c8xx_intr, IRQF_SHARED, "zalon", host)) {
		dev_printk(KERN_ERR, &dev->dev, "irq problem with %d, detaching\n ",

From 39562e783928e3ea9ee2cbce99a756ab48d3c06a Mon Sep 17 00:00:00 2001
From: Christof Schmitt
Date: Fri, 26 Jun 2009 16:30:43 +0200
Subject: [PATCH 05/58] [SCSI] FC transport: Locking fix for common-code FC
 pass-through patch

Fix this:

------------[ cut here ]------------
Badness at block/blk-core.c:244
CPU: 0 Tainted: G W 2.6.31-rc1-00004-gd3a263a #3
Process zfcp_wq (pid: 901, task: 000000002fb7a038, ksp: 000000002f02bc78)
Krnl PSW : 0704300180000000 00000000002141ba (blk_remove_plug+0xb2/0xb8)
           R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:3 PM:0 EA:3
Krnl GPRS: 0000000000000001 0000000000000001 0000000022811440 0000000022811798
           000000000027ff4e 0000000000000000 0000000000000000 000000002f00f000
           070000000006a0f4 000000002af70000 000000002af2a800 00000000228d1c00
           0000000022811440 000000000050c708 000000002f02bca8 000000002f02bc80
Krnl Code: 00000000002141b0: b9140022      lgfr  %r2,%r2
           00000000002141b4: 07fe          bcr   15,%r14
           00000000002141b6: a7f40001      brc   15,2141b8
          >00000000002141ba: a7f4ffbe      brc   15,214136
           00000000002141be: 0707          bcr   0,%r7
           00000000002141c0: ebaff0680024  stmg  %r10,%r15,104(%r15)
           00000000002141c6: c0d00017c2a9  larl  %r13,50c718
           00000000002141cc: a7f13fc0      tmll  %r15,16320
Call Trace:
([<000000000050e7d8>] C.272.16122+0x88/0x110)
 [<00000000002141ec>] __blk_run_queue+0x2c/0x154
 [<000000000028013a>] fc_remote_port_add+0x85e/0x95c
 [<000000000037596e>] zfcp_scsi_rport_work+0xe6/0x148
 [<000000000006908c>] worker_thread+0x25c/0x318
 [<000000000006f10c>] kthread+0x94/0x9c
 [<000000000001c2b2>] kernel_thread_starter+0x6/0xc
 [<000000000001c2ac>] kernel_thread_starter+0x0/0xc
INFO: lockdep is turned off.
Last Breaking-Event-Address:
 [<00000000002141b6>] blk_remove_plug+0xae/0xb8

The FC pass-through support triggers the WARN_ON(!irqs_disabled()) in
blk_plug_device. Since blk_plug_device requires being called with
disabled interrupts, use spin_lock_irqsave in fc_bsg_goose_queue to
disable the interrupts before calling into the block layer.
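
As a general note (not part of this patch): a lock that may be taken
where interrupts must be disabled has to use the irqsave variant from
process context, since plain spin_lock() leaves interrupts enabled. A
minimal sketch of the pattern, with made-up names:

	static DEFINE_SPINLOCK(example_lock);

	static void example_process_context(void)
	{
		unsigned long flags;

		/* disables interrupts and remembers their prior state,
		 * so callees that assert irqs_disabled() are satisfied */
		spin_lock_irqsave(&example_lock, flags);
		/* ... call into code that requires disabled irqs ... */
		spin_unlock_irqrestore(&example_lock, flags);
	}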
Signed-off-by: Christof Schmitt
Acked-by: James Smart
Signed-off-by: James Bottomley
---
 drivers/scsi/scsi_transport_fc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 2eee9e6e4fe8..292c02f810d0 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3670,13 +3670,14 @@ static void
 fc_bsg_goose_queue(struct fc_rport *rport)
 {
	int flagset;
+	unsigned long flags;
 
	if (!rport->rqst_q)
		return;
 
	get_device(&rport->dev);
 
-	spin_lock(rport->rqst_q->queue_lock);
+	spin_lock_irqsave(rport->rqst_q->queue_lock, flags);
	flagset = test_bit(QUEUE_FLAG_REENTER, &rport->rqst_q->queue_flags) &&
		  !test_bit(QUEUE_FLAG_REENTER, &rport->rqst_q->queue_flags);
	if (flagset)
@@ -3684,7 +3685,7 @@ fc_bsg_goose_queue(struct fc_rport *rport)
	__blk_run_queue(rport->rqst_q);
	if (flagset)
		queue_flag_clear(QUEUE_FLAG_REENTER, rport->rqst_q);
-	spin_unlock(rport->rqst_q->queue_lock);
+	spin_unlock_irqrestore(rport->rqst_q->queue_lock, flags);
 
	put_device(&rport->dev);
 }

From a222ad1a4b2e3ca177a538482c99c519c1ce94d1 Mon Sep 17 00:00:00 2001
From: Karen Xie
Date: Fri, 26 Jun 2009 15:17:29 -0700
Subject: [PATCH 06/58] [SCSI] cxgb3i: fix connection error when vlan is
 enabled

There is a bug when VLAN is configured on the cxgb3 interface: the
iSCSI connection would be denied with the message "cxgb3i: NOT going
through cxgbi device."

This patch adds code to get the real egress net_device when VLAN is
configured.

Signed-off-by: Karen Xie
Reviewed-by: Mike Christie
Signed-off-by: James Bottomley
---
 drivers/scsi/cxgb3i/cxgb3i_iscsi.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/scsi/cxgb3i/cxgb3i_iscsi.c b/drivers/scsi/cxgb3i/cxgb3i_iscsi.c
index 74369a3f963b..c399f485aa7d 100644
--- a/drivers/scsi/cxgb3i/cxgb3i_iscsi.c
+++ b/drivers/scsi/cxgb3i/cxgb3i_iscsi.c
@@ -13,6 +13,7 @@
 
 #include
 #include
+#include <linux/if_vlan.h>
 #include
 #include
 #include
@@ -184,6 +185,9 @@ static struct cxgb3i_hba *cxgb3i_hba_find_by_netdev(struct net_device *ndev)
	struct cxgb3i_adapter *snic;
	int i;
 
+	if (ndev->priv_flags & IFF_802_1Q_VLAN)
+		ndev = vlan_dev_real_dev(ndev);
+
	read_lock(&cxgb3i_snic_rwlock);
	list_for_each_entry(snic, &cxgb3i_snic_list, list_head) {
		for (i = 0; i < snic->hba_cnt; i++) {

From bf92df30df909710c498d05620e2df1be1ef779b Mon Sep 17 00:00:00 2001
From: Yu Zhao
Date: Mon, 29 Jun 2009 11:31:45 +0800
Subject: [PATCH 07/58] intel-iommu: Only avoid flushing device IOTLB for
 domain ID 0 in caching mode

In caching mode, domain ID 0 is reserved for non-present to present
mapping flush. Device IOTLB doesn't need to be flushed in this case.

Previously we were avoiding the flush for domain zero, even if the
IOMMU wasn't in caching mode and domain zero wasn't special.

Signed-off-by: Yu Zhao
Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 420afa887283..3cad7006ed8e 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1054,7 +1054,12 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
	else
		iommu->flush.flush_iotlb(iommu, did, addr, mask,
						DMA_TLB_PSI_FLUSH);
-	if (did)
+
+	/*
+	 * In caching mode, domain ID 0 is reserved for non-present to present
+	 * mapping flush. Device IOTLB doesn't need to be flushed in this case.
+	 */
+	if (!cap_caching_mode(iommu->cap) || did)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
 }

From b213203e475212a69ad6fedfb73464087e317148 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Fri, 26 Jun 2009 18:50:28 +0100
Subject: [PATCH 08/58] intel-iommu: Create new iommu_domain_identity_map()
 function

We'll want to do this to a _domain_ (the si_domain) rather than a PCI
device.

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 62 +++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 26 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 3cad7006ed8e..3a4f347e2f88 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1849,25 +1849,12 @@ error:
 
 static int iommu_identity_mapping;
 
-static int iommu_prepare_identity_map(struct pci_dev *pdev,
-				      unsigned long long start,
-				      unsigned long long end)
+static int iommu_domain_identity_map(struct dmar_domain *domain,
+				     unsigned long long start,
+				     unsigned long long end)
 {
-	struct dmar_domain *domain;
	unsigned long size;
	unsigned long long base;
-	int ret;
-
-	printk(KERN_INFO
-	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
-	       pci_name(pdev), start, end);
-	if (iommu_identity_mapping)
-		domain = si_domain;
-	else
-		/* page table init */
-		domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
-	if (!domain)
-		return -ENOMEM;
 
	/* The address might not be aligned */
	base = start & PAGE_MASK;
@@ -1876,31 +1863,54 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev,
	if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
			IOVA_PFN(base + size) - 1)) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
-		ret = -ENOMEM;
-		goto error;
+		return -ENOMEM;
	}
 
-	pr_debug("Mapping reserved region %lx@%llx for %s\n",
-		 size, base, pci_name(pdev));
+	pr_debug("Mapping reserved region %lx@%llx for domain %d\n",
+		 size, base, domain->id);
+
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, base, base + size);
 
-	ret = domain_page_mapping(domain, base, base, size,
-				  DMA_PTE_READ|DMA_PTE_WRITE);
+	return domain_page_mapping(domain, base, base, size,
+				   DMA_PTE_READ|DMA_PTE_WRITE);
+}
+
+static int iommu_prepare_identity_map(struct pci_dev *pdev,
+				      unsigned long long start,
+				      unsigned long long end)
+{
+	struct dmar_domain *domain;
+	int ret;
+
+	printk(KERN_INFO
+	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
+	       pci_name(pdev), start, end);
+
+	if (iommu_identity_mapping)
+		domain = si_domain;
+	else
+		/* page table init */
+		domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+	if (!domain)
+		return -ENOMEM;
+
+	ret = iommu_domain_identity_map(domain, start, end);
	if (ret)
		goto error;
 
	/* context entry init */
	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
-	if (!ret)
-		return 0;
-error:
+	if (ret)
+		goto error;
+
+	return 0;
+
+ error:
	domain_exit(domain);
	return ret;
-
 }
 
 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,

From c7ab48d2acaf959e4d59c3f55d12fdb7ca9afd7c Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Fri, 26 Jun 2009 19:10:36 +0100
Subject: [PATCH 09/58] intel-iommu: Clean up identity mapping code, remove
 CONFIG_DMAR_GFX_WA

There's no need for the GFX workaround now we have 'iommu=pt' for the
cases where people really care about performance. There's no need to
have a special case for just one type of device.
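
(A sketch of the idea, not driver code: an identity mapping just means
iova == phys for every usable page, so one shared set of page tables
can serve all such devices. map_page() and the pfn bounds are
illustrative names only.)

	for (pfn = first_pfn; pfn <= last_pfn; pfn++)
		map_page(si_domain, /* iova */ pfn, /* phys */ pfn, prot);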
This also speeds up the iommu=pt path and reduces memory usage by
setting up the si_domain _once_ and then using it for all devices,
rather than giving each device its own private page tables.

Signed-off-by: David Woodhouse
---
 arch/x86/Kconfig          |  15 +-----
 drivers/pci/intel-iommu.c | 107 ++++++++++--------------------------
 2 files changed, 34 insertions(+), 88 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d1430ef6b4f9..c07f72205909 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1913,25 +1913,14 @@ config DMAR_DEFAULT_ON
	  recommended you say N here while the DMAR code remains
	  experimental.
 
-config DMAR_GFX_WA
-	def_bool y
-	prompt "Support for Graphics workaround"
-	depends on DMAR
-	---help---
-	  Current Graphics drivers tend to use physical address
-	  for DMA and avoid using DMA APIs. Setting this config
-	  option permits the IOMMU driver to set a unity map for
-	  all the OS-visible memory. Hence the driver can continue
-	  to use physical addresses for DMA.
-
 config DMAR_FLOPPY_WA
	def_bool y
	depends on DMAR
	---help---
-	  Floppy disk drivers are know to bypass DMA API calls
+	  Floppy disk drivers are known to bypass DMA API calls
	  thereby failing to work when IOMMU is enabled. This
	  workaround will setup a 1:1 mapping for the first
-	  16M to make floppy (an ISA device) work.
+	  16MiB to make floppy (an ISA device) work.
 
 config INTR_REMAP
	bool "Support for Interrupt Remapping (EXPERIMENTAL)"
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 3a4f347e2f88..fc121967cb5b 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1889,11 +1889,7 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev,
	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
	       pci_name(pdev), start, end);
 
-	if (iommu_identity_mapping)
-		domain = si_domain;
-	else
-		/* page table init */
-		domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;
 
@@ -1922,64 +1918,6 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
		rmrr->end_address + 1);
 }
 
-struct iommu_prepare_data {
-	struct pci_dev *pdev;
-	int ret;
-};
-
-static int __init iommu_prepare_work_fn(unsigned long start_pfn,
-					unsigned long end_pfn, void *datax)
-{
-	struct iommu_prepare_data *data;
-
-	data = (struct iommu_prepare_data *)datax;
-
-	data->ret = iommu_prepare_identity_map(data->pdev,
-					       start_pfn<<PAGE_SHIFT,
-					       end_pfn<<PAGE_SHIFT);
-	return data->ret;
-
-}
-
-static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
-{
-	int nid;
-	struct iommu_prepare_data data;
-
-	data.pdev = pdev;
-	data.ret = 0;
-
-	for_each_online_node(nid) {
-		work_with_active_regions(nid, iommu_prepare_work_fn, &data);
-		if (data.ret)
-			return data.ret;
-	}
-	return data.ret;
-}
-
-#ifdef CONFIG_DMAR_GFX_WA
-static void __init iommu_prepare_gfx_mapping(void)
-{
-	struct pci_dev *pdev = NULL;
-	int ret;
-
-	for_each_pci_dev(pdev) {
-		if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
-		    !IS_GFX_DEVICE(pdev))
-			continue;
-		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
-		       pci_name(pdev));
-		ret = iommu_prepare_with_active_regions(pdev);
-		if (ret)
-			printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
-	}
-}
-#else /* !CONFIG_DMAR_GFX_WA */
-static inline void iommu_prepare_gfx_mapping(void)
-{
-	return;
-}
-#endif
-
 #ifdef CONFIG_DMAR_FLOPPY_WA
 static inline void iommu_prepare_isa(void)
 {
@@ -1990,12 +1928,12 @@ static inline void iommu_prepare_isa(void)
	if (!pdev)
		return;
 
-	printk(KERN_INFO
-	       "IOMMU: Prepare 0-16M unity mapping for LPC\n");
+	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
 
	if (ret)
-		printk(KERN_ERR "IOMMU: Failed to create 0-64M identity map, "
-		       "floppy might not work\n");
+		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
+		       "floppy might not work\n");
 
 }
 #else
@@ -2023,16 +1961,30 @@ static int __init init_context_pass_through(void)
 }
 
 static int md_domain_init(struct dmar_domain *domain, int guest_width);
+
+static int __init si_domain_work_fn(unsigned long start_pfn,
+				    unsigned long end_pfn, void *datax)
+{
+	int *ret = datax;
+
+	*ret = iommu_domain_identity_map(si_domain,
+					 (uint64_t)start_pfn << PAGE_SHIFT,
+					 (uint64_t)end_pfn << PAGE_SHIFT);
+	return *ret;
+
+}
+
 static int si_domain_init(void)
 {
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
-	int ret = 0;
+	int nid, ret = 0;
 
	si_domain = alloc_domain();
	if (!si_domain)
		return -EFAULT;
 
+	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
 
	for_each_active_iommu(iommu, drhd) {
		ret = iommu_attach_domain(si_domain, iommu);
@@ -2049,6 +2001,12 @@ static int si_domain_init(void)
 
	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
 
+	for_each_online_node(nid) {
+		work_with_active_regions(nid, si_domain_work_fn, &ret);
+		if (ret)
+			return ret;
+	}
+
	return 0;
 }
 
@@ -2102,13 +2060,14 @@ static int iommu_prepare_static_identity_mapping(void)
	if (ret)
		return -EFAULT;
 
-	printk(KERN_INFO "IOMMU: Setting identity map:\n");
	for_each_pci_dev(pdev) {
-		ret = iommu_prepare_with_active_regions(pdev);
-		if (ret) {
-			printk(KERN_INFO "1:1 mapping to one domain failed.\n");
-			return -EFAULT;
-		}
+		printk(KERN_INFO "IOMMU: identity mapping for device %s\n",
+		       pci_name(pdev));
+
+		ret = domain_context_mapping(si_domain, pdev,
+					     CONTEXT_TT_MULTI_LEVEL);
+		if (ret)
+			return ret;
		ret = domain_add_dev_info(si_domain, pdev);
		if (ret)
			return ret;
@@ -2299,8 +2258,6 @@ int __init init_dmars(void)
		}
	}
 
-	iommu_prepare_gfx_mapping();
-
	iommu_prepare_isa();
 }

From dd4e831960e4f0214480fa96a53ca9bb7dd04927 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 27 Jun 2009 16:21:20 +0100
Subject: [PATCH 10/58] intel-iommu: Change dma_set_pte_addr() to
 dma_set_pte_pfn()

Add some helpers for converting between VT-d and normal system pfns,
since system pages can be larger than VT-d pages.

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index fc121967cb5b..852f40a913d4 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -65,6 +65,26 @@
 #define PHYSICAL_PAGE_MASK PAGE_MASK
 #endif
 
+/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
+   are never going to work. */
+static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
+{
+	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+
+static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
+{
+	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+static inline unsigned long page_to_dma_pfn(struct page *pg)
+{
+	return mm_to_dma_pfn(page_to_pfn(pg));
+}
+static inline unsigned long virt_to_dma_pfn(void *p)
+{
+	return page_to_dma_pfn(virt_to_page(p));
+}
+
 /* global iommu list, set NULL for ignored DMAR units */
 static struct intel_iommu **g_iommus;
 
@@ -207,9 +227,9 @@ static inline u64 dma_pte_addr(struct dma_pte *pte)
	return (pte->val & VTD_PAGE_MASK);
 }
 
-static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
+static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 {
-	pte->val |= (addr & VTD_PAGE_MASK);
+	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 }
 
 static inline bool dma_pte_present(struct dma_pte *pte)
@@ -702,7 +722,7 @@ static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
			return NULL;
		}
		domain_flush_cache(domain, tmp_page, PAGE_SIZE);
-		dma_set_pte_addr(pte, virt_to_phys(tmp_page));
+		dma_set_pte_pfn(pte, virt_to_dma_pfn(tmp_page));
		/*
		 * high level table always sets r/w, last level page
		 * table control read/write
@@ -1648,7 +1668,7 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
		 * touches the iova range
		 */
		BUG_ON(dma_pte_addr(pte));
-		dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
+		dma_set_pte_pfn(pte, start_pfn);
		dma_set_pte_prot(pte, prot);
		if (prot & DMA_PTE_SNP)
			dma_set_pte_snp(pte);

From 77dfa56c94d2855a25ff552b74980a5538e129f8 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 27 Jun 2009 16:40:08 +0100
Subject: [PATCH 11/58] intel-iommu: Change address_level_offset() to
 pfn_level_offset()

We're shifting the inputs for now, but that'll change...
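
For reference, the arithmetic behind these helpers: with 4KiB VT-d
pages and 9 index bits per level, level N covers address bits
[12 + 9*(N-1), 12 + 9*N); working in pfns simply drops the low 12
bits. A standalone sketch of the math (userspace C, mirroring the
helpers above for illustration only):

	#include <stdio.h>

	#define LEVEL_STRIDE	9
	#define LEVEL_MASK	((1UL << LEVEL_STRIDE) - 1)

	static unsigned int level_to_offset_bits(int level)
	{
		return 12 + (level - 1) * LEVEL_STRIDE;	/* address form */
	}

	static int pfn_level_offset(unsigned long pfn, int level)
	{
		/* pfn form: the low 12 bits are already shifted away */
		return (pfn >> (level_to_offset_bits(level) - 12)) & LEVEL_MASK;
	}

	int main(void)
	{
		unsigned long pfn = 0x12345678;

		for (int level = 1; level <= 4; level++)
			printf("level %d index: 0x%x\n",
			       level, pfn_level_offset(pfn, level));
		return 0;
	}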
Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 852f40a913d4..529c1c13048f 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -671,9 +671,9 @@ static inline unsigned int level_to_offset_bits(int level)
	return (12 + (level - 1) * LEVEL_STRIDE);
 }
 
-static inline int address_level_offset(u64 addr, int level)
+static inline int pfn_level_offset(unsigned long pfn, int level)
 {
-	return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
+	return (pfn >> (level_to_offset_bits(level) - 12)) & LEVEL_MASK;
 }
 
 static inline u64 level_mask(int level)
@@ -708,7 +708,7 @@ static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
	while (level > 0) {
		void *tmp_page;
 
-		offset = address_level_offset(addr, level);
+		offset = pfn_level_offset(addr >> VTD_PAGE_SHIFT, level);
		pte = &parent[offset];
		if (level == 1)
			break;
@@ -749,7 +749,7 @@ static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
	parent = domain->pgd;
	while (level <= total) {
-		offset = address_level_offset(addr, total);
+		offset = pfn_level_offset(addr >> VTD_PAGE_SHIFT, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

From 90dcfb5eb2fd427b16135a14f176a6902750b6b4 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 27 Jun 2009 17:14:59 +0100
Subject: [PATCH 12/58] intel-iommu: Change dma_addr_level_pte() to
 dma_pfn_level_pte()

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 529c1c13048f..edd39d348a98 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -740,8 +740,9 @@ static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
 }
 
 /* return address's pte at specific level */
-static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
-					  int level)
+static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
+					 unsigned long pfn,
+					 int level)
 {
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
@@ -749,7 +750,7 @@
	parent = domain->pgd;
	while (level <= total) {
-		offset = pfn_level_offset(addr >> VTD_PAGE_SHIFT, total);
+		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;
@@ -768,7 +769,7 @@
	struct dma_pte *pte = NULL;
 
	/* get last level pte */
-	pte = dma_addr_level_pte(domain, addr, 1);
+	pte = dma_pfn_level_pte(domain, addr >> VTD_PAGE_SHIFT, 1);
 
	if (pte) {
		dma_clear_pte(pte);
@@ -817,7 +818,8 @@
		while (tmp < end) {
-			pte = dma_addr_level_pte(domain, tmp, level);
+			pte = dma_pfn_level_pte(domain, tmp >> VTD_PAGE_SHIFT,
+						level);
			if (pte) {
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(pte)));

From a75f7cf94f01717c5103138319b96752ee2a2be9 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 27 Jun 2009 17:44:39 +0100
Subject: [PATCH 13/58] intel-iommu: Make dma_pte_clear_one() take pfn not
 address

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index edd39d348a98..40eae2097aca 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -764,12 +764,12 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 }
 
 /* clear one page's page table */
-static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
+static void dma_pte_clear_one(struct dmar_domain *domain, unsigned long pfn)
 {
	struct dma_pte *pte = NULL;
 
	/* get last level pte */
-	pte = dma_pfn_level_pte(domain, addr >> VTD_PAGE_SHIFT, 1);
+	pte = dma_pfn_level_pte(domain, pfn, 1);
 
	if (pte) {
		dma_clear_pte(pte);
@@ -792,7 +792,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
 
	/* we don't need lock here, nobody else touches the iova range */
	while (npages--) {
-		dma_pte_clear_one(domain, start);
+		dma_pte_clear_one(domain, start >> VTD_PAGE_SHIFT);
		start += VTD_PAGE_SIZE;
	}
 }

From 66eae8469e4e4ba6f4ca7ef82103c78f6d645583 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 27 Jun 2009 19:00:32 +0100
Subject: [PATCH 14/58] intel-iommu: Don't just mask out too-big physical
 addresses; BUG() instead

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 40eae2097aca..ad367f53a2bb 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -700,8 +700,7 @@ static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
	unsigned long flags;
 
	BUG_ON(!domain->pgd);
-
-	addr &= (((u64)1) << addr_width) - 1;
+	BUG_ON(addr >> addr_width);
	parent = domain->pgd;
 
	spin_lock_irqsave(&domain->mapping_lock, flags);
@@ -783,8 +782,9 @@ static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
	int addr_width = agaw_to_width(domain->agaw);
	int npages;
 
-	start &= (((u64)1) << addr_width) - 1;
-	end &= (((u64)1) << addr_width) - 1;
+	BUG_ON(start >> addr_width);
+	BUG_ON((end-1) >> addr_width);
+
	/* in case it's partial page */
	start &= PAGE_MASK;
	end = PAGE_ALIGN(end);
@@ -807,8 +807,8 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
	int level;
	u64 tmp;
 
-	start &= (((u64)1) << addr_width) - 1;
-	end &= (((u64)1) << addr_width) - 1;
+	BUG_ON(start >> addr_width);
+	BUG_ON(end >> addr_width);
 
	/* we don't need lock here, nobody else touches the iova range */
	level = 2;
@@ -1654,7 +1654,7 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
	int index;
	int addr_width = agaw_to_width(domain->agaw);
 
-	hpa &= (((u64)1) << addr_width) - 1;
+	BUG_ON(hpa >> addr_width);
 
	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

From 04b18e65dd5a3e544f07f4bcfa8fb52704a1833b Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 27 Jun 2009 19:15:01 +0100
Subject: [PATCH 15/58] intel-iommu: Make dma_pte_clear_range() use pfns

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index ad367f53a2bb..d4217f737159 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -779,21 +779,17 @@ static void dma_pte_clear_one(struct dmar_domain *domain, unsigned long pfn)
 /* clear last level pte, a tlb flush should be followed */
 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
 {
-	int addr_width = agaw_to_width(domain->agaw);
-	int npages;
+	unsigned long start_pfn = IOVA_PFN(start);
+	unsigned long end_pfn = IOVA_PFN(end-1);
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 
-	BUG_ON(start >> addr_width);
-	BUG_ON((end-1) >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && end_pfn >> addr_width);
 
-	/* in case it's partial page */
-	start &= PAGE_MASK;
-	end = PAGE_ALIGN(end);
-	npages = (end - start) / VTD_PAGE_SIZE;
-
-	/* we don't need lock here, nobody else touches the iova range */
-	while (npages--) {
-		dma_pte_clear_one(domain, start >> VTD_PAGE_SHIFT);
-		start += VTD_PAGE_SIZE;
+	/* we don't need lock here; nobody else touches the iova range */
+	while (start_pfn <= end_pfn) {
+		dma_pte_clear_one(domain, start_pfn);
+		start_pfn++;
	}
 }

From 595badf5d65d50300319e6178e6df005ea501f70 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 27 Jun 2009 22:09:11 +0100
Subject: [PATCH 16/58] intel-iommu: Make dma_pte_clear_range() take pfns as
 argument

Noting that this is now an _inclusive_ range.

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index d4217f737159..ff8b7ce4a013 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -56,6 +56,7 @@
 #define MAX_AGAW_WIDTH 64
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
+#define DOMAIN_MAX_PFN(gaw)  ((((u64)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
 
 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
 #define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
@@ -777,17 +778,17 @@ static void dma_pte_clear_one(struct dmar_domain *domain, unsigned long pfn)
 }
 
 /* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
+static void dma_pte_clear_range(struct dmar_domain *domain,
+				unsigned long start_pfn,
+				unsigned long last_pfn)
 {
-	unsigned long start_pfn = IOVA_PFN(start);
-	unsigned long end_pfn = IOVA_PFN(end-1);
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 
	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
-	BUG_ON(addr_width < BITS_PER_LONG && end_pfn >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 
	/* we don't need lock here; nobody else touches the iova range */
-	while (start_pfn <= end_pfn) {
+	while (start_pfn <= last_pfn) {
		dma_pte_clear_one(domain, start_pfn);
		start_pfn++;
	}
@@ -1424,7 +1425,7 @@ static void domain_exit(struct dmar_domain *domain)
	end = end & (~PAGE_MASK);
 
	/* clear ptes */
-	dma_pte_clear_range(domain, 0, end);
+	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);
@@ -1890,7 +1891,8 @@ static int iommu_domain_identity_map(struct dmar_domain *domain,
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
-	dma_pte_clear_range(domain, base, base + size);
+	dma_pte_clear_range(domain, base >> VTD_PAGE_SHIFT,
+			    (base + size - 1) >> VTD_PAGE_SHIFT);
 
	return domain_page_mapping(domain, base, base, size,
				   DMA_PTE_READ|DMA_PTE_WRITE);
@@ -2618,7 +2620,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
		 pci_name(pdev), size, (unsigned long long)start_addr);
 
	/* clear the whole page */
-	dma_pte_clear_range(domain, start_addr, start_addr + size);
+	dma_pte_clear_range(domain, start_addr >> VTD_PAGE_SHIFT,
+			    (start_addr + size - 1) >> VTD_PAGE_SHIFT);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
	if (intel_iommu_strict) {
@@ -2710,7 +2713,8 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
	start_addr = iova->pfn_lo << PAGE_SHIFT;
 
	/* clear the whole page */
-	dma_pte_clear_range(domain, start_addr, start_addr + size);
+	dma_pte_clear_range(domain, start_addr >> VTD_PAGE_SHIFT,
+			    (start_addr + size - 1) >> VTD_PAGE_SHIFT);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
 
@@ -2792,8 +2796,9 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
				  size, prot);
		if (ret) {
			/* clear the page */
-			dma_pte_clear_range(domain, start_addr,
-					    start_addr + offset);
+			dma_pte_clear_range(domain,
+					    start_addr >> VTD_PAGE_SHIFT,
+					    (start_addr + offset - 1) >> VTD_PAGE_SHIFT);
			/* free page tables */
			dma_pte_free_pagetable(domain, start_addr,
					       start_addr + offset);
@@ -3382,7 +3387,7 @@ static void vm_domain_exit(struct dmar_domain *domain)
	end = end & (~VTD_PAGE_MASK);
 
	/* clear ptes */
-	dma_pte_clear_range(domain, 0, end);
+	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);
@@ -3526,7 +3531,8 @@ static void intel_iommu_unmap_range(struct iommu_domain *domain,
	/* The address might not be aligned */
	base = iova & VTD_PAGE_MASK;
	size = VTD_PAGE_ALIGN(size);
-	dma_pte_clear_range(dmar_domain, base, base + size);
+	dma_pte_clear_range(dmar_domain, base >> VTD_PAGE_SHIFT,
+			    (base + size - 1) >> VTD_PAGE_SHIFT);
 
	if (dmar_domain->max_addr == base + size)
		dmar_domain->max_addr = base;

From 6660c63a79a639b86e3a709e25a8c4fc3ab24770 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 27 Jun 2009 22:41:00 +0100
Subject: [PATCH 17/58] intel-iommu: Make dma_pte_free_pagetable() use pfns

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 40 +++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index ff8b7ce4a013..1526864a9d6f 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -669,27 +669,27 @@ static inline int width_to_agaw(int width)
 
 static inline unsigned int level_to_offset_bits(int level)
 {
-	return (12 + (level - 1) * LEVEL_STRIDE);
+	return (level - 1) * LEVEL_STRIDE;
 }
 
 static inline int pfn_level_offset(unsigned long pfn, int level)
 {
-	return (pfn >> (level_to_offset_bits(level) - 12)) & LEVEL_MASK;
+	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 }
 
-static inline u64 level_mask(int level)
+static inline unsigned long level_mask(int level)
 {
-	return ((u64)-1 << level_to_offset_bits(level));
+	return -1UL << level_to_offset_bits(level);
 }
 
-static inline u64 level_size(int level)
+static inline unsigned long level_size(int level)
 {
-	return ((u64)1 << level_to_offset_bits(level));
+	return 1UL << level_to_offset_bits(level);
 }
 
-static inline u64 align_to_level(u64 addr, int level)
+static inline unsigned long align_to_level(unsigned long pfn, int level)
 {
-	return ((addr + level_size(level) - 1) & level_mask(level));
+	return (pfn + level_size(level) - 1) & level_mask(level);
 }
 
 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
@@ -798,25 +798,29 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   u64 start, u64 end)
 {
-	int addr_width = agaw_to_width(domain->agaw);
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+	unsigned long start_pfn = start >> VTD_PAGE_SHIFT;
+	unsigned long last_pfn = (end-1) >> VTD_PAGE_SHIFT;
	struct dma_pte *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
-	u64 tmp;
+	unsigned long tmp;
 
-	BUG_ON(start >> addr_width);
-	BUG_ON(end >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 
	/* we don't need lock here, nobody else touches the iova range */
	level = 2;
	while (level <= total) {
-		tmp = align_to_level(start, level);
-		if (tmp >= end || (tmp + level_size(level) > end))
+		tmp = align_to_level(start_pfn, level);
+
+		/* Only clear this pte/pmd if we're asked to clear its
+		   _whole_ range */
+		if (tmp + level_size(level) - 1 > last_pfn)
			return;
 
-		while (tmp < end) {
-			pte = dma_pfn_level_pte(domain, tmp >> VTD_PAGE_SHIFT,
-						level);
+		while (tmp <= last_pfn) {
+			pte = dma_pfn_level_pte(domain, tmp, level);
			if (pte) {
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(pte)));
@@ -828,7 +832,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
		level++;
	}
	/* free pgd */
-	if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
+	if (start == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}

From d794dc9b302c2781c571c10dedb8094e223d31b8 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 28 Jun 2009 00:27:49 +0100
Subject: [PATCH 18/58] intel-iommu: Make dma_pte_free_pagetable() take pfns
 as argument

With some cleanup of intel_unmap_page(), intel_unmap_sg() and
vm_domain_exit() to no longer play with 64-bit addresses.

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 70 ++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 41 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 1526864a9d6f..fc593adb049a 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -796,11 +796,10 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 
 /* free page table pages. last level pte should already be cleared */
 static void dma_pte_free_pagetable(struct dmar_domain *domain,
-				   u64 start, u64 end)
+				   unsigned long start_pfn,
+				   unsigned long last_pfn)
 {
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-	unsigned long start_pfn = start >> VTD_PAGE_SHIFT;
-	unsigned long last_pfn = (end-1) >> VTD_PAGE_SHIFT;
	struct dma_pte *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
@@ -832,7 +831,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
		level++;
	}
	/* free pgd */
-	if (start == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
@@ -1416,7 +1415,6 @@ static void domain_exit(struct dmar_domain *domain)
 {
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
-	u64 end;
 
	/* Domain 0 is reserved, so dont process it */
	if (!domain)
@@ -1425,14 +1423,12 @@ static void domain_exit(struct dmar_domain *domain)
	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);
-	end = DOMAIN_MAX_ADDR(domain->gaw);
-	end = end & (~PAGE_MASK);
 
	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
	/* free page tables */
-	dma_pte_free_pagetable(domain, 0, end);
+	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
	for_each_active_iommu(iommu, drhd)
		if (test_bit(iommu->seq_id, &domain->iommu_bmp))
@@ -2601,7 +2597,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
 {
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
-	unsigned long start_addr;
+	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;
 
@@ -2617,20 +2613,22 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
	if (!iova)
		return;
 
-	start_addr = iova->pfn_lo << PAGE_SHIFT;
-	size = aligned_size((u64)dev_addr, size);
+	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
+	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
 
-	pr_debug("Device %s unmapping: %zx@%llx\n",
-		 pci_name(pdev), size, (unsigned long long)start_addr);
+	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
+		 pci_name(pdev), start_pfn, last_pfn);
 
	/* clear the whole page */
-	dma_pte_clear_range(domain, start_addr >> VTD_PAGE_SHIFT,
-			    (start_addr + size - 1) >> VTD_PAGE_SHIFT);
+	dma_pte_clear_range(domain, start_pfn, last_pfn);
+
	/* free page tables */
-	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
+	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+
	if (intel_iommu_strict) {
-		iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
-				      size >> VTD_PAGE_SHIFT);
+		iommu_flush_iotlb_psi(iommu, domain->id,
+				      start_pfn << VTD_PAGE_SHIFT,
+				      last_pfn - start_pfn + 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
@@ -2688,14 +2686,10 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
 {
-	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
-	unsigned long start_addr;
+	unsigned long start_pfn, last_pfn;
	struct iova *iova;
-	size_t size = 0;
-	phys_addr_t addr;
-	struct scatterlist *sg;
	struct intel_iommu *iommu;
 
	if (iommu_no_mapping(pdev))
@@ -2709,21 +2703,19 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (!iova)
		return;
-	for_each_sg(sglist, sg, nelems, i) {
-		addr = page_to_phys(sg_page(sg)) + sg->offset;
-		size += aligned_size((u64)addr, sg->length);
-	}
 
-	start_addr = iova->pfn_lo << PAGE_SHIFT;
+	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
+	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
 
	/* clear the whole page */
-	dma_pte_clear_range(domain, start_addr >> VTD_PAGE_SHIFT,
-			    (start_addr + size - 1) >> VTD_PAGE_SHIFT);
-	/* free page tables */
-	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
+	dma_pte_clear_range(domain, start_pfn, last_pfn);
 
-	iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
-			      size >> VTD_PAGE_SHIFT);
+	/* free page tables */
+	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+
+	iommu_flush_iotlb_psi(iommu, domain->id,
+			      start_pfn << VTD_PAGE_SHIFT,
+			      (last_pfn - start_pfn + 1));
 
	/* free iova */
	__free_iova(&domain->iovad, iova);
@@ -2804,8 +2796,8 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
					    start_addr >> VTD_PAGE_SHIFT,
					    (start_addr + offset - 1) >> VTD_PAGE_SHIFT);
			/* free page tables */
-			dma_pte_free_pagetable(domain, start_addr,
-					       start_addr + offset);
+			dma_pte_free_pagetable(domain, start_addr >> VTD_PAGE_SHIFT,
+					       (start_addr + offset - 1) >> VTD_PAGE_SHIFT);
			/* free iova */
			__free_iova(&domain->iovad, iova);
			return 0;
@@ -3378,8 +3370,6 @@ static void iommu_free_vm_domain(struct dmar_domain *domain)
 
 static void vm_domain_exit(struct dmar_domain *domain)
 {
-	u64 end;
-
	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;
@@ -3387,14 +3377,12 @@ static void vm_domain_exit(struct dmar_domain *domain)
	vm_domain_remove_all_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);
-	end = DOMAIN_MAX_ADDR(domain->gaw);
-	end = end & (~VTD_PAGE_MASK);
 
	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
	/* free page tables */
-	dma_pte_free_pagetable(domain, 0, end);
+	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
	iommu_free_vm_domain(domain);
	free_domain_mem(domain);

From 163cc52ccd2cc5c5ae4e1c886f6fde8547feed2a Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 28 Jun 2009 00:51:17 +0100
Subject: [PATCH 19/58] intel-iommu: Clean up intel_iommu_unmap_range()

Use unaligned address for domain->max_addr. That algorithm isn't ideal
anyway -- we should probably just look at the last iova in the tree.
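
The conversion idiom these patches keep applying, for reference: the
byte range [base, base + size) becomes the inclusive pfn range
[base >> SHIFT, (base + size - 1) >> SHIFT], which stays correct even
when base or size is unaligned. Standalone illustration (not driver
code):

	#include <stdio.h>

	#define VTD_PAGE_SHIFT 12

	int main(void)
	{
		/* unaligned byte range crossing one page boundary */
		unsigned long long base = 0x1ffc, size = 0x10;

		unsigned long start_pfn = base >> VTD_PAGE_SHIFT;
		unsigned long last_pfn = (base + size - 1) >> VTD_PAGE_SHIFT;

		/* prints "pfns 1..2": both touched pages, inclusive */
		printf("pfns %lu..%lu\n", start_pfn, last_pfn);
		return 0;
	}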
Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index fc593adb049a..21dc77311863 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3491,7 +3491,7 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;
 
-	max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
+	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		int min_agaw;
		u64 end;
@@ -3518,16 +3518,12 @@ static void intel_iommu_unmap_range(struct iommu_domain *domain,
				    unsigned long iova, size_t size)
 {
	struct dmar_domain *dmar_domain = domain->priv;
-	dma_addr_t base;
 
-	/* The address might not be aligned */
-	base = iova & VTD_PAGE_MASK;
-	size = VTD_PAGE_ALIGN(size);
-	dma_pte_clear_range(dmar_domain, base >> VTD_PAGE_SHIFT,
-			    (base + size - 1) >> VTD_PAGE_SHIFT);
+	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
+			    (iova + size - 1) >> VTD_PAGE_SHIFT);
 
-	if (dmar_domain->max_addr == base + size)
-		dmar_domain->max_addr = base;
+	if (dmar_domain->max_addr == iova + size)
+		dmar_domain->max_addr = iova;
 }
 
 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,

From b026fd28ea23af24a3eea6e5be3f3d0193a8e87d Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 28 Jun 2009 10:37:25 +0100
Subject: [PATCH 20/58] intel-iommu: Change addr_to_dma_pte() to
 pfn_to_dma_pte()

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 21dc77311863..dfbabd151a9c 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -692,23 +692,24 @@ static inline unsigned long align_to_level(unsigned long pfn, int level)
	return (pfn + level_size(level) - 1) & level_mask(level);
 }
 
-static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
+static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
+				      unsigned long pfn)
 {
-	int addr_width = agaw_to_width(domain->agaw);
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;
	unsigned long flags;
 
	BUG_ON(!domain->pgd);
-	BUG_ON(addr >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
	parent = domain->pgd;
 
	spin_lock_irqsave(&domain->mapping_lock, flags);
	while (level > 0) {
		void *tmp_page;
 
-		offset = pfn_level_offset(addr >> VTD_PAGE_SHIFT, level);
+		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (level == 1)
			break;
@@ -1660,7 +1661,7 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
	end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
	index = 0;
	while (start_pfn < end_pfn) {
-		pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
+		pte = pfn_to_dma_pte(domain, (iova >> VTD_PAGE_SHIFT) + index);
		if (!pte)
			return -ENOMEM;
		/* We don't need lock here, nobody else
@@ -3533,7 +3534,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
	struct dma_pte *pte;
	u64 phys = 0;
 
-	pte = addr_to_dma_pte(dmar_domain, iova);
+	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
	if (pte)
		phys = dma_pte_addr(pte);

From 1c5a46ed49e37f56f8aa9000bb1c2ac59670c372 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 28 Jun 2009 10:53:37 +0100
Subject: [PATCH 21/58] intel-iommu: Clean up address handling in
 domain_page_mapping()

No more masking and alignment; just use pfns.

Signed-off-by: David Woodhouse
---
 drivers/pci/intel-iommu.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index dfbabd151a9c..f08d7865fe00 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1647,20 +1647,18 @@ static int
 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
		    u64 hpa, size_t size, int prot)
 {
-	u64 start_pfn, end_pfn;
+	unsigned long start_pfn = hpa >> VTD_PAGE_SHIFT;
+	unsigned long last_pfn = (hpa + size - 1) >> VTD_PAGE_SHIFT;
	struct dma_pte *pte;
-	int index;
-	int addr_width = agaw_to_width(domain->agaw);
+	int index = 0;
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 
-	BUG_ON(hpa >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 
	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;
-	iova &= PAGE_MASK;
-	start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
-	end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
-	index = 0;
-	while (start_pfn < end_pfn) {
+
+	while (start_pfn <= last_pfn) {
		pte = pfn_to_dma_pte(domain, (iova >> VTD_PAGE_SHIFT) + index);
		if (!pte)
			return -ENOMEM;

From 61df744314079e8cb8cdec75f517cf0e704e41ef Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 28 Jun 2009 11:55:58 +0100
Subject: [PATCH 22/58] intel-iommu: Introduce domain_pfn_mapping()

... and use it in the trivial cases; the other callers want individual
(and bisectable) attention, since I screwed them up the first time...

Make the BUG_ON() happen on too-large virtual address rather than
physical address, too. That's the one we care about.
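
For reference, converting an (iova, hpa, size) byte triple into the
pfn triple the new helper takes looks like this (standalone sketch of
the arithmetic only, not driver code):

	#include <stdio.h>

	#define VTD_PAGE_SHIFT 12

	int main(void)
	{
		unsigned long long iova = 0x100000, hpa = 0x2000, size = 0x3000;

		unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT;
		unsigned long phys_pfn = hpa >> VTD_PAGE_SHIFT;
		/* inclusive last pfn minus first pfn, plus one */
		unsigned long nr_pages =
			((hpa + size - 1) >> VTD_PAGE_SHIFT) - phys_pfn + 1;

		printf("iov_pfn=%#lx phys_pfn=%#lx nr_pages=%lu\n",
		       iov_pfn, phys_pfn, nr_pages);
		return 0;
	}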
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f08d7865fe00..7540ef91d5f7 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1643,40 +1643,48 @@ static int domain_context_mapped(struct pci_dev *pdev) tmp->devfn); } -static int -domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, - u64 hpa, size_t size, int prot) +static int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phys_pfn, unsigned long nr_pages, + int prot) { - unsigned long start_pfn = hpa >> VTD_PAGE_SHIFT; - unsigned long last_pfn = (hpa + size - 1) >> VTD_PAGE_SHIFT; struct dma_pte *pte; - int index = 0; int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; - BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); + BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; - while (start_pfn <= last_pfn) { - pte = pfn_to_dma_pte(domain, (iova >> VTD_PAGE_SHIFT) + index); + while (nr_pages--) { + pte = pfn_to_dma_pte(domain, iov_pfn); if (!pte) return -ENOMEM; /* We don't need lock here, nobody else * touches the iova range */ BUG_ON(dma_pte_addr(pte)); - dma_set_pte_pfn(pte, start_pfn); + dma_set_pte_pfn(pte, phys_pfn); dma_set_pte_prot(pte, prot); if (prot & DMA_PTE_SNP) dma_set_pte_snp(pte); domain_flush_cache(domain, pte, sizeof(*pte)); - start_pfn++; - index++; + iov_pfn++; + phys_pfn++; } return 0; } +static int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, + u64 hpa, size_t size, int prot) +{ + unsigned long first_pfn = hpa >> VTD_PAGE_SHIFT; + unsigned long last_pfn = (hpa + size - 1) >> VTD_PAGE_SHIFT; + + return domain_pfn_mapping(domain, iova >> VTD_PAGE_SHIFT, first_pfn, + last_pfn - first_pfn + 1, prot); + +} + static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) { if (!iommu) @@ -1893,8 +1901,10 @@ static int iommu_domain_identity_map(struct dmar_domain *domain, dma_pte_clear_range(domain, base >> VTD_PAGE_SHIFT, (base + size - 1) >> VTD_PAGE_SHIFT); - return domain_page_mapping(domain, base, base, size, - DMA_PTE_READ|DMA_PTE_WRITE); + return domain_pfn_mapping(domain, base >> VTD_PAGE_SHIFT, + base >> VTD_PAGE_SHIFT, + size >> VTD_PAGE_SHIFT, + DMA_PTE_READ|DMA_PTE_WRITE); } static int iommu_prepare_identity_map(struct pci_dev *pdev, From 0ab36de274ab094c3992b50c9c48c5c89072ec94 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 14:01:43 +0100 Subject: [PATCH 23/58] intel-iommu: Use domain_pfn_mapping() in __intel_map_single() Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 7540ef91d5f7..dccd0a7b7a5f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2477,14 +2477,12 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, return 0; iommu = domain_get_iommu(domain); - size = aligned_size((u64)paddr, size); + size = aligned_size(paddr, size) >> VTD_PAGE_SHIFT; - iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); + iova = __intel_alloc_iova(hwdev, domain, size << VTD_PAGE_SHIFT, pdev->dma_mask); if (!iova) goto error; - start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; - /* * Check if 
DMAR supports zero-length reads on write only * mappings.. @@ -2500,20 +2498,20 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, * might have two guest_addr mapping to the same host paddr, but this * is not a big problem */ - ret = domain_page_mapping(domain, start_paddr, - ((u64)paddr) & PHYSICAL_PAGE_MASK, - size, prot); + ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo), + paddr >> VTD_PAGE_SHIFT, size, prot); if (ret) goto error; + start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; + /* it's a non-present to present mapping. Only flush if caching mode */ if (cap_caching_mode(iommu->cap)) - iommu_flush_iotlb_psi(iommu, 0, start_paddr, - size >> VTD_PAGE_SHIFT); + iommu_flush_iotlb_psi(iommu, 0, start_paddr, size); else iommu_flush_write_buffer(iommu); - return start_paddr + ((u64)paddr & (~PAGE_MASK)); + return start_paddr + (paddr & (~PAGE_MASK)); error: if (iova) From ad05122162b67f64d5a1c6d35e001f7a88619b88 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 14:22:28 +0100 Subject: [PATCH 24/58] intel-iommu: Use domain_pfn_mapping() in intel_iommu_map_range() Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index dccd0a7b7a5f..a490b39ca3d5 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3516,8 +3516,11 @@ static int intel_iommu_map_range(struct iommu_domain *domain, } dmar_domain->max_addr = max_addr; } - - ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot); + /* Round up size to next multiple of PAGE_SIZE, if it and + the low bits of hpa would take us onto the next page */ + size = aligned_size(hpa, size) >> VTD_PAGE_SHIFT; + ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, + hpa >> VTD_PAGE_SHIFT, size, prot); return ret; } From b536d24d212c994a7d98469ea3a8891573d45fd4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 14:49:31 +0100 Subject: [PATCH 25/58] intel-iommu: Clean up intel_map_sg(), remove domain_page_mapping() Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 54 ++++++++++++++------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index a490b39ca3d5..bc49b121c667 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1674,17 +1674,6 @@ static int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return 0; } -static int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, - u64 hpa, size_t size, int prot) -{ - unsigned long first_pfn = hpa >> VTD_PAGE_SHIFT; - unsigned long last_pfn = (hpa + size - 1) >> VTD_PAGE_SHIFT; - - return domain_pfn_mapping(domain, iova >> VTD_PAGE_SHIFT, first_pfn, - last_pfn - first_pfn + 1, prot); - -} - static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) { if (!iommu) @@ -2745,17 +2734,16 @@ static int intel_nontranslate_map_sg(struct device *hddev, static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t addr; int i; struct pci_dev *pdev = to_pci_dev(hwdev); struct dmar_domain *domain; size_t size = 0; int prot = 0; - size_t offset = 0; + size_t offset_pfn = 0; struct iova *iova = NULL; int ret; struct scatterlist *sg; - unsigned long start_addr; + unsigned long start_vpfn; struct intel_iommu *iommu; 
BUG_ON(dir == DMA_NONE); @@ -2768,10 +2756,8 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne iommu = domain_get_iommu(domain); - for_each_sg(sglist, sg, nelems, i) { - addr = page_to_phys(sg_page(sg)) + sg->offset; - size += aligned_size((u64)addr, sg->length); - } + for_each_sg(sglist, sg, nelems, i) + size += aligned_size(sg->offset, sg->length); iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); if (!iova) { @@ -2789,36 +2775,34 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) prot |= DMA_PTE_WRITE; - start_addr = iova->pfn_lo << PAGE_SHIFT; - offset = 0; + start_vpfn = mm_to_dma_pfn(iova->pfn_lo); + offset_pfn = 0; for_each_sg(sglist, sg, nelems, i) { - addr = page_to_phys(sg_page(sg)) + sg->offset; - size = aligned_size((u64)addr, sg->length); - ret = domain_page_mapping(domain, start_addr + offset, - ((u64)addr) & PHYSICAL_PAGE_MASK, - size, prot); + int nr_pages = aligned_size(sg->offset, sg->length) >> VTD_PAGE_SHIFT; + ret = domain_pfn_mapping(domain, start_vpfn + offset_pfn, + page_to_dma_pfn(sg_page(sg)), + nr_pages, prot); if (ret) { /* clear the page */ - dma_pte_clear_range(domain, - start_addr >> VTD_PAGE_SHIFT, - (start_addr + offset - 1) >> VTD_PAGE_SHIFT); + dma_pte_clear_range(domain, start_vpfn, + start_vpfn + offset_pfn); /* free page tables */ - dma_pte_free_pagetable(domain, start_addr >> VTD_PAGE_SHIFT, - (start_addr + offset - 1) >> VTD_PAGE_SHIFT); + dma_pte_free_pagetable(domain, start_vpfn, + start_vpfn + offset_pfn); /* free iova */ __free_iova(&domain->iovad, iova); return 0; } - sg->dma_address = start_addr + offset + - ((u64)addr & (~PAGE_MASK)); + sg->dma_address = ((dma_addr_t)(start_vpfn + offset_pfn) + << VTD_PAGE_SHIFT) + sg->offset; sg->dma_length = sg->length; - offset += size; + offset_pfn += nr_pages; } /* it's a non-present to present mapping. 
Only flush if caching mode */ if (cap_caching_mode(iommu->cap)) - iommu_flush_iotlb_psi(iommu, 0, start_addr, - offset >> VTD_PAGE_SHIFT); + iommu_flush_iotlb_psi(iommu, 0, start_vpfn << VTD_PAGE_SHIFT, + offset_pfn); else iommu_flush_write_buffer(iommu); From 88cb6a7424d9465faf6caaaadff5af0766c93991 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 15:03:06 +0100 Subject: [PATCH 26/58] intel-iommu: Change aligned_size() to aligned_nrpages() Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index bc49b121c667..22add36fd731 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2320,11 +2320,13 @@ error: return ret; } -static inline u64 aligned_size(u64 host_addr, size_t size) +static inline unsigned long aligned_nrpages(unsigned long host_addr, + size_t size) { - u64 addr; - addr = (host_addr & (~PAGE_MASK)) + size; - return PAGE_ALIGN(addr); + host_addr &= ~PAGE_MASK; + host_addr += size + PAGE_SIZE - 1; + + return host_addr >> VTD_PAGE_SHIFT; } struct iova * @@ -2466,7 +2468,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, return 0; iommu = domain_get_iommu(domain); - size = aligned_size(paddr, size) >> VTD_PAGE_SHIFT; + size = aligned_nrpages(paddr, size); iova = __intel_alloc_iova(hwdev, domain, size << VTD_PAGE_SHIFT, pdev->dma_mask); if (!iova) @@ -2757,9 +2759,10 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne iommu = domain_get_iommu(domain); for_each_sg(sglist, sg, nelems, i) - size += aligned_size(sg->offset, sg->length); + size += aligned_nrpages(sg->offset, sg->length); - iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); + iova = __intel_alloc_iova(hwdev, domain, size << VTD_PAGE_SHIFT, + pdev->dma_mask); if (!iova) { sglist->dma_length = 0; return 0; @@ -2778,7 +2781,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne start_vpfn = mm_to_dma_pfn(iova->pfn_lo); offset_pfn = 0; for_each_sg(sglist, sg, nelems, i) { - int nr_pages = aligned_size(sg->offset, sg->length) >> VTD_PAGE_SHIFT; + int nr_pages = aligned_nrpages(sg->offset, sg->length); ret = domain_pfn_mapping(domain, start_vpfn + offset_pfn, page_to_dma_pfn(sg_page(sg)), nr_pages, prot); @@ -3502,7 +3505,7 @@ static int intel_iommu_map_range(struct iommu_domain *domain, } /* Round up size to next multiple of PAGE_SIZE, if it and the low bits of hpa would take us onto the next page */ - size = aligned_size(hpa, size) >> VTD_PAGE_SHIFT; + size = aligned_nrpages(hpa, size); ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, hpa >> VTD_PAGE_SHIFT, size, prot); return ret; From 03d6a2461ab1704c171ce21081c5022378ef7a91 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 15:33:46 +0100 Subject: [PATCH 27/58] intel-iommu: Make iommu_flush_iotlb_psi() take pfn as argument Most of its callers are having to shift for themselves anyway, so we might as well do it in iommu_flush_iotlb_psi(). 
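Roughly, the new shape of the helper is the following standalone sketch (names and the page-shift value are illustrative stand-ins, not the driver's actual definitions): the caller hands over a pfn, and the helper derives both the address and the invalidation mask itself.

    #include <stdint.h>

    #define PAGE_SHIFT_STANDIN 12   /* stand-in for VTD_PAGE_SHIFT */

    /* Sketch: callers pass a pfn; the helper shifts once and computes the
     * mask as ilog2(roundup_pow_of_two(pages)), as the patch does. */
    static void flush_iotlb_psi_sketch(unsigned long pfn, unsigned int pages)
    {
        uint64_t addr = (uint64_t)pfn << PAGE_SHIFT_STANDIN;
        unsigned int mask = 0;

        while ((1u << mask) < pages)   /* ilog2 of the next power of two */
            mask++;
        /* ... hand addr and mask to the hardware invalidation here ... */
        (void)addr; (void)mask;
    }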
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 22add36fd731..6afe44cb6815 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1058,11 +1058,11 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain, } static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int pages) + unsigned long pfn, unsigned int pages) { unsigned int mask = ilog2(__roundup_pow_of_two(pages)); + uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; - BUG_ON(addr & (~VTD_PAGE_MASK)); BUG_ON(pages == 0); /* @@ -2494,15 +2494,15 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, if (ret) goto error; - start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; - /* it's a non-present to present mapping. Only flush if caching mode */ if (cap_caching_mode(iommu->cap)) - iommu_flush_iotlb_psi(iommu, 0, start_paddr, size); + iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size); else iommu_flush_write_buffer(iommu); - return start_paddr + (paddr & (~PAGE_MASK)); + start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; + start_paddr += paddr & ~PAGE_MASK; + return start_paddr; error: if (iova) @@ -2624,8 +2624,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, dma_pte_free_pagetable(domain, start_pfn, last_pfn); if (intel_iommu_strict) { - iommu_flush_iotlb_psi(iommu, domain->id, - start_pfn << VTD_PAGE_SHIFT, + iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, last_pfn - start_pfn + 1); /* free iova */ __free_iova(&domain->iovad, iova); @@ -2711,8 +2710,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, /* free page tables */ dma_pte_free_pagetable(domain, start_pfn, last_pfn); - iommu_flush_iotlb_psi(iommu, domain->id, - start_pfn << VTD_PAGE_SHIFT, + iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, (last_pfn - start_pfn + 1)); /* free iova */ @@ -2804,8 +2802,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne /* it's a non-present to present mapping. Only flush if caching mode */ if (cap_caching_mode(iommu->cap)) - iommu_flush_iotlb_psi(iommu, 0, start_vpfn << VTD_PAGE_SHIFT, - offset_pfn); + iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn); else iommu_flush_write_buffer(iommu); From 1a4a45516d7a57de0691352d899d7008f2e090d1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 16:00:42 +0100 Subject: [PATCH 28/58] intel-iommu: Remove last use of PHYSICAL_PAGE_MASK, for reserving PCI BARs This is fairly broken anyway -- it doesn't take hotplug into account. We should probably be checking page_is_ram() instead. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 6afe44cb6815..a55f5fb06b14 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -62,9 +62,6 @@ #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32)) #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64)) -#ifndef PHYSICAL_PAGE_MASK -#define PHYSICAL_PAGE_MASK PAGE_MASK -#endif /* VT-d pages must always be _smaller_ than MM pages. Otherwise things are never going to work. 
*/ @@ -1307,7 +1304,6 @@ static void dmar_init_reserved_ranges(void) struct pci_dev *pdev = NULL; struct iova *iova; int i; - u64 addr, size; init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN); @@ -1330,12 +1326,9 @@ static void dmar_init_reserved_ranges(void) r = &pdev->resource[i]; if (!r->flags || !(r->flags & IORESOURCE_MEM)) continue; - addr = r->start; - addr &= PHYSICAL_PAGE_MASK; - size = r->end - addr; - size = PAGE_ALIGN(size); - iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr), - IOVA_PFN(size + addr) - 1); + iova = reserve_iova(&reserved_iova_list, + IOVA_PFN(r->start), + IOVA_PFN(r->end)); if (!iova) printk(KERN_ERR "Reserve iova failed\n"); } From c5395d5c4a82159889cb650de93b591ea51d8c56 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 16:35:56 +0100 Subject: [PATCH 29/58] intel-iommu: Clean up iommu_domain_identity_map() Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index a55f5fb06b14..c5caf7d63a0f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1861,31 +1861,25 @@ static int iommu_domain_identity_map(struct dmar_domain *domain, unsigned long long start, unsigned long long end) { - unsigned long size; - unsigned long long base; + unsigned long first_vpfn = start >> VTD_PAGE_SHIFT; + unsigned long last_vpfn = end >> VTD_PAGE_SHIFT; - /* The address might not be aligned */ - base = start & PAGE_MASK; - size = end - base; - size = PAGE_ALIGN(size); - if (!reserve_iova(&domain->iovad, IOVA_PFN(base), - IOVA_PFN(base + size) - 1)) { + if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn), + dma_to_mm_pfn(last_vpfn))) { printk(KERN_ERR "IOMMU: reserve iova failed\n"); return -ENOMEM; } - pr_debug("Mapping reserved region %lx@%llx for domain %d\n", - size, base, domain->id); + pr_debug("Mapping reserved region %llx-%llx for domain %d\n", + start, end, domain->id); /* * RMRR range might have overlap with physical memory range, * clear it first */ - dma_pte_clear_range(domain, base >> VTD_PAGE_SHIFT, - (base + size - 1) >> VTD_PAGE_SHIFT); + dma_pte_clear_range(domain, first_vpfn, last_vpfn); - return domain_pfn_mapping(domain, base >> VTD_PAGE_SHIFT, - base >> VTD_PAGE_SHIFT, - size >> VTD_PAGE_SHIFT, + return domain_pfn_mapping(domain, first_vpfn, first_vpfn, + last_vpfn - first_vpfn + 1, DMA_PTE_READ|DMA_PTE_WRITE); } From 310a5ab93cb4ce29367238f682affd9ac352f4d0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 18:52:20 +0100 Subject: [PATCH 30/58] intel-iommu: Performance improvement for dma_pte_clear_range() It's a bit silly to repeatedly call domain_flush_cache() for each PTE individually, as we clear it. Instead, batch them up and flush a whole range at a time. We might as well refrain from recalculating the PTE address from scratch each time round the loop too. 
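The batching pattern can be modelled in self-contained userspace C; here a flat array and a printf stand in for the real page-table walk and cache flush, so every name below is illustrative rather than the driver's:

    #include <stdint.h>
    #include <stdio.h>

    struct pte { uint64_t val; };

    #define PTES_PER_FLUSH 512   /* model: one 4KiB page of 8-byte PTEs */

    static void flush(void *start, size_t bytes)  /* stands in for domain_flush_cache() */
    {
        printf("flush %zu bytes\n", bytes);
    }

    static void clear_range(struct pte *ptes, unsigned long start, unsigned long last)
    {
        while (start <= last) {
            struct pte *first = &ptes[start], *p = first;
            do {                          /* clear a run sharing one PTE page */
                p++->val = 0;
                start++;
            } while (start <= last && (start % PTES_PER_FLUSH) != 0);
            flush(first, (char *)p - (char *)first);   /* one flush per run */
        }
    }

    int main(void)
    {
        static struct pte table[2048];
        clear_range(table, 5, 1300);
        return 0;
    }

The key property is that the flush cost is now proportional to the number of PTE pages touched, not the number of PTEs cleared.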
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index c5caf7d63a0f..ba7e37f7111a 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -761,34 +761,33 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, return NULL; } -/* clear one page's page table */ -static void dma_pte_clear_one(struct dmar_domain *domain, unsigned long pfn) -{ - struct dma_pte *pte = NULL; - - /* get last level pte */ - pte = dma_pfn_level_pte(domain, pfn, 1); - - if (pte) { - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - } -} - /* clear last level pte, a tlb flush should be followed */ static void dma_pte_clear_range(struct dmar_domain *domain, unsigned long start_pfn, unsigned long last_pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + struct dma_pte *first_pte, *pte; BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); /* we don't need lock here; nobody else touches the iova range */ while (start_pfn <= last_pfn) { - dma_pte_clear_one(domain, start_pfn); - start_pfn++; + first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1); + if (!pte) { + start_pfn = align_to_level(start_pfn + 1, 2); + continue; + } + while (start_pfn <= last_pfn && + (unsigned long)pte >> VTD_PAGE_SHIFT == + (unsigned long)first_pte >> VTD_PAGE_SHIFT) { + dma_clear_pte(pte); + start_pfn++; + pte++; + } + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); } } From 6f6a00e40aa3fdd3b29c30e3ef1fc9690506bc03 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 20:38:49 +0100 Subject: [PATCH 31/58] intel-iommu: Performance improvement for domain_pfn_mapping() As with dma_pte_clear_range(), don't keep flushing a single PTE at a time. And also micro-optimise the setting of PTE values rather than using the helper functions to do all the masking. 
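The micro-optimisation amounts to composing the whole 64-bit entry once and storing it, instead of going through per-field helpers that each read-modify-write the PTE. A hedged sketch (the bit positions are illustrative, not necessarily the hardware's):

    #include <stdint.h>

    #define PTE_READ  (1ULL << 0)
    #define PTE_WRITE (1ULL << 1)
    #define PTE_SNP   (1ULL << 11)
    #define PG_SHIFT  12

    /* mask prot down to the known bits once, outside the loop, then
     * build each entry with a single expression */
    static uint64_t make_pte(unsigned long phys_pfn, uint64_t prot)
    {
        prot &= PTE_READ | PTE_WRITE | PTE_SNP;
        return ((uint64_t)phys_pfn << PG_SHIFT) | prot;
    }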
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index ba7e37f7111a..f8074236bcce 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1639,7 +1639,7 @@ static int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phys_pfn, unsigned long nr_pages, int prot) { - struct dma_pte *pte; + struct dma_pte *first_pte = NULL, *pte = NULL; int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); @@ -1647,19 +1647,27 @@ static int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; + prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; + while (nr_pages--) { - pte = pfn_to_dma_pte(domain, iov_pfn); - if (!pte) - return -ENOMEM; + if (!pte) { + first_pte = pte = pfn_to_dma_pte(domain, iov_pfn); + if (!pte) + return -ENOMEM; + } /* We don't need lock here, nobody else * touches the iova range */ BUG_ON(dma_pte_addr(pte)); - dma_set_pte_pfn(pte, phys_pfn); - dma_set_pte_prot(pte, prot); - if (prot & DMA_PTE_SNP) - dma_set_pte_snp(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); + pte->val = (phys_pfn << VTD_PAGE_SHIFT) | prot; + pte++; + if (!nr_pages || + (unsigned long)pte >> VTD_PAGE_SHIFT != + (unsigned long)first_pte >> VTD_PAGE_SHIFT) { + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); + pte = NULL; + } iov_pfn++; phys_pfn++; } From 875764de6f0ddb23d270c29357d5a339232a0488 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 28 Jun 2009 21:20:51 +0100 Subject: [PATCH 32/58] intel-iommu: Simplify __intel_alloc_iova() There's no need for the separate iommu_alloc_iova() function, and certainly not for it to be global. Remove the underscores while we're at it. 
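The allocation policy after the cleanup is: clamp the device mask to what the domain can address, try the sub-4GiB space first (unless forcedac), and only then fall back to the full mask. A standalone sketch, with a dummy allocator standing in for alloc_iova():

    #include <stdint.h>
    #include <stdbool.h>

    #define MASK32 0xffffffffULL

    /* dummy allocator: pretends the 32-bit space is exhausted */
    static unsigned long try_alloc(uint64_t limit, unsigned long nrpages)
    {
        (void)nrpages;
        return limit > MASK32 ? 0x123456UL : 0;
    }

    static unsigned long alloc_iova_sketch(uint64_t dev_mask, uint64_t domain_max,
                                           unsigned long nrpages, bool forcedac)
    {
        uint64_t mask = dev_mask < domain_max ? dev_mask : domain_max;
        unsigned long pfn = 0;

        if (!forcedac && mask > MASK32)      /* prefer addresses below 4GiB */
            pfn = try_alloc(MASK32, nrpages);
        if (!pfn)                            /* then retry with the full mask */
            pfn = try_alloc(mask, nrpages);
        return pfn;
    }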
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 47 ++++++++++++++------------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f8074236bcce..11a23201445a 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2323,43 +2323,31 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr, return host_addr >> VTD_PAGE_SHIFT; } -struct iova * -iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end) -{ - struct iova *piova; - - /* Make sure it's in range */ - end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end); - if (!size || (IOVA_START_ADDR + size > end)) - return NULL; - - piova = alloc_iova(&domain->iovad, - size >> PAGE_SHIFT, IOVA_PFN(end), 1); - return piova; -} - -static struct iova * -__intel_alloc_iova(struct device *dev, struct dmar_domain *domain, - size_t size, u64 dma_mask) +static struct iova *intel_alloc_iova(struct device *dev, + struct dmar_domain *domain, + unsigned long nrpages, uint64_t dma_mask) { struct pci_dev *pdev = to_pci_dev(dev); struct iova *iova = NULL; - if (dma_mask <= DMA_BIT_MASK(32) || dmar_forcedac) - iova = iommu_alloc_iova(domain, size, dma_mask); - else { + /* Restrict dma_mask to the width that the iommu can handle */ + dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask); + + if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { /* * First try to allocate an io virtual address in * DMA_BIT_MASK(32) and if that fails then try allocating * from higher range */ - iova = iommu_alloc_iova(domain, size, DMA_BIT_MASK(32)); - if (!iova) - iova = iommu_alloc_iova(domain, size, dma_mask); + iova = alloc_iova(&domain->iovad, nrpages, + IOVA_PFN(DMA_BIT_MASK(32)), 1); + if (iova) + return iova; } - - if (!iova) { - printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev)); + iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1); + if (unlikely(!iova)) { + printk(KERN_ERR "Allocating %ld-page iova for %s failed", + nrpages, pci_name(pdev)); return NULL; } @@ -2464,7 +2452,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, iommu = domain_get_iommu(domain); size = aligned_nrpages(paddr, size); - iova = __intel_alloc_iova(hwdev, domain, size << VTD_PAGE_SHIFT, pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); if (!iova) goto error; @@ -2753,8 +2741,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne for_each_sg(sglist, sg, nelems, i) size += aligned_nrpages(sg->offset, sg->length); - iova = __intel_alloc_iova(hwdev, domain, size << VTD_PAGE_SHIFT, - pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); if (!iova) { sglist->dma_length = 0; return 0; From e1605495c716ef4eebdb7606bcd1b593f28e2837 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 29 Jun 2009 11:17:38 +0100 Subject: [PATCH 33/58] intel-iommu: Introduce domain_sg_mapping() to speed up intel_map_sg() Instead of calling domain_pfn_mapping() repeatedly with single or small numbers of pages, just pass the sglist in. It can optimise the number of cache flushes like domain_pfn_mapping() does, and gives a huge speedup for large scatterlists. 
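The core trick is a single pass over the IOVA range that also walks the scatterlist: a residual page count for the current element tells the loop when to advance to the next one, so all PTEs can be written (and flushed in batches) in one sweep. A simplified standalone model, not the kernel's scatterlist API:

    #include <stdint.h>
    #include <stddef.h>

    #define PG_SHIFT 12
    #define PG_SIZE  (1UL << PG_SHIFT)

    struct seg { uint64_t phys; size_t offset, length; };  /* simplified sg entry */

    /* seg_res counts the pages left in the current segment; when it
     * hits zero we move on, exactly in step with the IOVA cursor. */
    static void map_segments(uint64_t *ptes, unsigned long iov_pfn,
                             const struct seg *sg, unsigned long nr_pages,
                             uint64_t prot)
    {
        unsigned long seg_res = 0;
        uint64_t pteval = 0;

        while (nr_pages--) {
            if (!seg_res) {              /* start of a new segment */
                seg_res = (sg->offset + sg->length + PG_SIZE - 1) >> PG_SHIFT;
                pteval = (sg->phys & ~(uint64_t)(PG_SIZE - 1)) | prot;
            }
            ptes[iov_pfn++] = pteval;
            pteval += PG_SIZE;
            if (!--seg_res)
                sg++;                    /* segment exhausted, advance */
        }
    }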
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 83 +++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 11a23201445a..28bd5f2d78fc 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1635,6 +1635,56 @@ static int domain_context_mapped(struct pci_dev *pdev) tmp->devfn); } +static int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + struct scatterlist *sg, unsigned long nr_pages, + int prot) +{ + struct dma_pte *first_pte = NULL, *pte = NULL; + uint64_t pteval; + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + unsigned long sg_res = 0; + + BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); + + if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) + return -EINVAL; + + prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; + + while (nr_pages--) { + if (!sg_res) { + sg_res = (sg->offset + sg->length + VTD_PAGE_SIZE - 1) >> VTD_PAGE_SHIFT; + sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset; + sg->dma_length = sg->length; + pteval = page_to_phys(sg_page(sg)) | prot; + } + if (!pte) { + first_pte = pte = pfn_to_dma_pte(domain, iov_pfn); + if (!pte) + return -ENOMEM; + } + /* We don't need lock here, nobody else + * touches the iova range + */ + BUG_ON(dma_pte_addr(pte)); + pte->val = pteval; + pte++; + if (!nr_pages || + (unsigned long)pte >> VTD_PAGE_SHIFT != + (unsigned long)first_pte >> VTD_PAGE_SHIFT) { + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); + pte = NULL; + } + iov_pfn++; + pteval += VTD_PAGE_SIZE; + sg_res--; + if (!sg_res) + sg = sg_next(sg); + } + return 0; +} + static int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phys_pfn, unsigned long nr_pages, int prot) @@ -2758,27 +2808,18 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne prot |= DMA_PTE_WRITE; start_vpfn = mm_to_dma_pfn(iova->pfn_lo); - offset_pfn = 0; - for_each_sg(sglist, sg, nelems, i) { - int nr_pages = aligned_nrpages(sg->offset, sg->length); - ret = domain_pfn_mapping(domain, start_vpfn + offset_pfn, - page_to_dma_pfn(sg_page(sg)), - nr_pages, prot); - if (ret) { - /* clear the page */ - dma_pte_clear_range(domain, start_vpfn, - start_vpfn + offset_pfn); - /* free page tables */ - dma_pte_free_pagetable(domain, start_vpfn, - start_vpfn + offset_pfn); - /* free iova */ - __free_iova(&domain->iovad, iova); - return 0; - } - sg->dma_address = ((dma_addr_t)(start_vpfn + offset_pfn) - << VTD_PAGE_SHIFT) + sg->offset; - sg->dma_length = sg->length; - offset_pfn += nr_pages; + + ret = domain_sg_mapping(domain, start_vpfn, sglist, mm_to_dma_pfn(size), prot); + if (unlikely(ret)) { + /* clear the page */ + dma_pte_clear_range(domain, start_vpfn, + start_vpfn + size - 1); + /* free page tables */ + dma_pte_free_pagetable(domain, start_vpfn, + start_vpfn + size - 1); + /* free iova */ + __free_iova(&domain->iovad, iova); + return 0; } /* it's a non-present to present mapping. 
Only flush if caching mode */ From 9051aa0268dc1c3e42cd79a802b0af1f2bfcadae Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 29 Jun 2009 12:30:54 +0100 Subject: [PATCH 34/58] intel-iommu: Combine domain_pfn_mapping() and domain_sg_mapping() Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 62 ++++++++++++++------------------------- 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 28bd5f2d78fc..14308533b1cb 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1635,14 +1635,14 @@ static int domain_context_mapped(struct pci_dev *pdev) tmp->devfn); } -static int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - struct scatterlist *sg, unsigned long nr_pages, - int prot) +static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + struct scatterlist *sg, unsigned long phys_pfn, + unsigned long nr_pages, int prot) { struct dma_pte *first_pte = NULL, *pte = NULL; - uint64_t pteval; + phys_addr_t uninitialized_var(pteval); int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; - unsigned long sg_res = 0; + unsigned long sg_res; BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); @@ -1651,6 +1651,13 @@ static int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; + if (sg) + sg_res = 0; + else { + sg_res = nr_pages + 1; + pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; + } + while (nr_pages--) { if (!sg_res) { sg_res = (sg->offset + sg->length + VTD_PAGE_SIZE - 1) >> VTD_PAGE_SHIFT; @@ -1685,43 +1692,18 @@ static int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return 0; } -static int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, - int prot) +static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + struct scatterlist *sg, unsigned long nr_pages, + int prot) { - struct dma_pte *first_pte = NULL, *pte = NULL; - int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); +} - BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); - - if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) - return -EINVAL; - - prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; - - while (nr_pages--) { - if (!pte) { - first_pte = pte = pfn_to_dma_pte(domain, iov_pfn); - if (!pte) - return -ENOMEM; - } - /* We don't need lock here, nobody else - * touches the iova range - */ - BUG_ON(dma_pte_addr(pte)); - pte->val = (phys_pfn << VTD_PAGE_SHIFT) | prot; - pte++; - if (!nr_pages || - (unsigned long)pte >> VTD_PAGE_SHIFT != - (unsigned long)first_pte >> VTD_PAGE_SHIFT) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - pte = NULL; - } - iov_pfn++; - phys_pfn++; - } - return 0; +static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phys_pfn, unsigned long nr_pages, + int prot) +{ + return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); } static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) From 1bf20f0dc5629032ddd07617139d9fbca66c1642 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 29 Jun 2009 22:06:43 +0100 Subject: [PATCH 35/58] intel-iommu: dump mappings but don't die on pte already set Signed-off-by: David 
Woodhouse --- drivers/pci/intel-iommu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 14308533b1cb..40ce5a03f18f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1673,7 +1673,16 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, /* We don't need lock here, nobody else * touches the iova range */ - BUG_ON(dma_pte_addr(pte)); + if (unlikely(dma_pte_addr(pte))) { + static int dumps = 5; + printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx)\n", + iov_pfn, pte->val); + if (dumps) { + dumps--; + debug_dma_dump_mappings(NULL); + } + WARN_ON(1); + } pte->val = pteval; pte++; if (!nr_pages || From 3d7b0e4154b4963d6bd39991ec8eaa09caeb3994 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 30 Jun 2009 03:38:09 +0100 Subject: [PATCH 36/58] intel-iommu: Don't free too much in dma_pte_free_pagetable() The loop condition was wrong -- we should free a PMD only if its _entire_ range is within the range we're intending to clear. The early-termination condition was right, but not the loop. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 40ce5a03f18f..35bdd2a06caa 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -815,7 +815,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, if (tmp + level_size(level) - 1 > last_pfn) return; - while (tmp <= last_pfn) { + while (tmp + level_size(level) - 1 <= last_pfn) { pte = dma_pfn_level_pte(domain, tmp, level); if (pte) { free_pgtable_page( From f3a0a52fff4dbfdea2dccc908d00c038481d888e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 30 Jun 2009 03:40:07 +0100 Subject: [PATCH 37/58] intel-iommu: Performance improvement for dma_pte_free_pagetable() As with other functions, batch the CPU data cache flushes and don't keep recalculating PTE addresses. 
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 35bdd2a06caa..ec7e032d5ab5 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -797,7 +797,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, unsigned long last_pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; - struct dma_pte *pte; + struct dma_pte *first_pte, *pte; int total = agaw_to_level(domain->agaw); int level; unsigned long tmp; @@ -805,25 +805,32 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); - /* we don't need lock here, nobody else touches the iova range */ + /* We don't need lock here; nobody else touches the iova range */ level = 2; while (level <= total) { tmp = align_to_level(start_pfn, level); - /* Only clear this pte/pmd if we're asked to clear its - _whole_ range */ + /* If we can't even clear one PTE at this level, we're done */ if (tmp + level_size(level) - 1 > last_pfn) return; while (tmp + level_size(level) - 1 <= last_pfn) { - pte = dma_pfn_level_pte(domain, tmp, level); - if (pte) { - free_pgtable_page( - phys_to_virt(dma_pte_addr(pte))); - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); + first_pte = pte = dma_pfn_level_pte(domain, tmp, level); + if (!pte) { + tmp = align_to_level(tmp + 1, level + 1); + continue; } - tmp += level_size(level); + while (tmp + level_size(level) - 1 <= last_pfn && + (unsigned long)pte >> VTD_PAGE_SHIFT == + (unsigned long)first_pte >> VTD_PAGE_SHIFT) { + free_pgtable_page(phys_to_virt(dma_pte_addr(pte))); + dma_clear_pte(pte); + pte++; + tmp += level_size(level); + } + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); + } level++; } From 3238c0c4d68d9a9022b411a11a4b933fbdb53a14 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 1 Jul 2009 18:56:16 +0100 Subject: [PATCH 38/58] intel-iommu: Make iommu=pt work on i386 too Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 47630479b067..1a041bcf506b 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -211,11 +211,11 @@ static __init int iommu_setup(char *p) #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) swiotlb = 1; +#endif if (!strncmp(p, "pt", 2)) { iommu_pass_through = 1; return 1; } -#endif gart_parse_options(p); From c85994e4771025ef2a66533eb1a4c6c2217b9cda Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 1 Jul 2009 19:21:24 +0100 Subject: [PATCH 39/58] intel-iommu: Ensure that PTE writes are 64-bit atomic, even on i386 Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index ec7e032d5ab5..eea1006c860a 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -222,7 +222,12 @@ static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot) static inline u64 dma_pte_addr(struct dma_pte *pte) { - return (pte->val & VTD_PAGE_MASK); +#ifdef CONFIG_64BIT + return pte->val & VTD_PAGE_MASK; +#else + /* Must have a full atomic 64-bit read */ + return 
__cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK; +#endif } static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn) @@ -712,6 +717,8 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, break; if (!dma_pte_present(pte)) { + uint64_t pteval; + tmp_page = alloc_pgtable_page(); if (!tmp_page) { @@ -719,15 +726,15 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, flags); return NULL; } - domain_flush_cache(domain, tmp_page, PAGE_SIZE); - dma_set_pte_pfn(pte, virt_to_dma_pfn(tmp_page)); - /* - * high level table always sets r/w, last level page - * table control read/write - */ - dma_set_pte_readable(pte); - dma_set_pte_writable(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); + domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); + pteval = (virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; + if (cmpxchg64(&pte->val, 0ULL, pteval)) { + /* Someone else set it while we were thinking; use theirs. */ + free_pgtable_page(tmp_page); + } else { + dma_pte_addr(pte); + domain_flush_cache(domain, pte, sizeof(*pte)); + } } parent = phys_to_virt(dma_pte_addr(pte)); level--; @@ -1666,6 +1673,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, } while (nr_pages--) { + uint64_t tmp; + if (!sg_res) { sg_res = (sg->offset + sg->length + VTD_PAGE_SIZE - 1) >> VTD_PAGE_SHIFT; sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset; @@ -1680,17 +1689,17 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, /* We don't need lock here, nobody else * touches the iova range */ - if (unlikely(dma_pte_addr(pte))) { + tmp = cmpxchg64(&pte->val, 0ULL, pteval); + if (tmp) { static int dumps = 5; - printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx)\n", - iov_pfn, pte->val); + printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", + iov_pfn, tmp, (unsigned long long)pteval); if (dumps) { dumps--; debug_dma_dump_mappings(NULL); } WARN_ON(1); } - pte->val = pteval; pte++; if (!nr_pages || (unsigned long)pte >> VTD_PAGE_SHIFT != From 206a73c102fc480ba072a9388bc2142c303113aa Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 1 Jul 2009 19:30:28 +0100 Subject: [PATCH 40/58] intel-iommu: Kill superfluous mapping_lock Since we're using cmpxchg64() anyway (because that's the only way to do an atomic 64-bit store on i386), we might as well ditch the extra locking and just use cmpxchg64() to ensure that we don't add the page twice. 
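The lock-free idiom here is: build the new page off to the side, publish it with a single compare-and-swap, and throw it away if another thread won the race. A userspace model using C11 atomics (the kernel uses cmpxchg64(); everything below is a stand-in):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct pte { _Atomic uint64_t val; };

    static void *install_table(struct pte *pte)
    {
        uint64_t cur = atomic_load(&pte->val);
        if (cur)
            return (void *)(uintptr_t)cur;        /* already populated */

        void *page = calloc(1, 4096);
        if (!page)
            return NULL;

        uint64_t expected = 0;
        if (!atomic_compare_exchange_strong(&pte->val, &expected,
                                            (uint64_t)(uintptr_t)page)) {
            free(page);                           /* lost the race; use theirs */
            return (void *)(uintptr_t)expected;   /* CAS left the winner here */
        }
        return page;
    }

Because the only shared write is the CAS itself, no lock is needed: at most one thread's page is ever published, and the losers clean up after themselves.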
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index eea1006c860a..02223e2e27d4 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -267,7 +267,6 @@ struct dmar_domain { struct iova_domain iovad; /* iova's that belong to this domain */ struct dma_pte *pgd; /* virtual address */ - spinlock_t mapping_lock; /* page table lock */ int gaw; /* max guest address width */ /* adjusted guest address width, 0 is level 2 30-bit */ @@ -701,13 +700,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, struct dma_pte *parent, *pte = NULL; int level = agaw_to_level(domain->agaw); int offset; - unsigned long flags; BUG_ON(!domain->pgd); BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width); parent = domain->pgd; - spin_lock_irqsave(&domain->mapping_lock, flags); while (level > 0) { void *tmp_page; @@ -721,11 +718,9 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, tmp_page = alloc_pgtable_page(); - if (!tmp_page) { - spin_unlock_irqrestore(&domain->mapping_lock, - flags); + if (!tmp_page) return NULL; - } + domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = (virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; if (cmpxchg64(&pte->val, 0ULL, pteval)) { @@ -740,7 +735,6 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, level--; } - spin_unlock_irqrestore(&domain->mapping_lock, flags); return pte; } @@ -1375,7 +1369,6 @@ static int domain_init(struct dmar_domain *domain, int guest_width) unsigned long sagaw; init_iova_domain(&domain->iovad, DMA_32BIT_PFN); - spin_lock_init(&domain->mapping_lock); spin_lock_init(&domain->iommu_lock); domain_reserve_special_ranges(domain); @@ -3336,7 +3329,6 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) int adjust_width; init_iova_domain(&domain->iovad, DMA_32BIT_PFN); - spin_lock_init(&domain->mapping_lock); spin_lock_init(&domain->iommu_lock); domain_reserve_special_ranges(domain); From 85b98276f2ffa66b25ae6328b00bfadfd74b74e7 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 1 Jul 2009 19:27:53 +0100 Subject: [PATCH 41/58] intel-iommu: Warn about unmatched unmap requests This would have found the bug in i386 pci_unmap_addr() a long time ago. We shouldn't just silently return without doing anything. 
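The guard pattern, sketched for userspace (WARN_ONCE() itself is a kernel macro; this rough single-threaded equivalent uses a GCC statement expression):

    #include <stdio.h>

    #define warn_once(cond, ...) ({                       \
            static int warned__;                          \
            int cond__ = !!(cond);                        \
            if (cond__ && !warned__) {                    \
                    warned__ = 1;                         \
                    fprintf(stderr, __VA_ARGS__);         \
            }                                             \
            cond__;                                       \
    })

    /* usage: bail out loudly, but only complain the first time */
    static void unmap_sketch(unsigned long long addr, int found)
    {
        if (warn_once(!found, "unmatched unmap at 0x%llx\n", addr))
            return;
        /* ... actual teardown would go here ... */
    }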
Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 02223e2e27d4..2bbc3fc88dc9 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2640,7 +2640,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, iommu = domain_get_iommu(domain); iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr)); - if (!iova) + if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n", + (unsigned long long)dev_addr)) return; start_pfn = mm_to_dma_pfn(iova->pfn_lo); @@ -2730,7 +2731,8 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, iommu = domain_get_iommu(domain); iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address)); - if (!iova) + if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n", + (unsigned long long)sglist[0].dma_address)) return; start_pfn = mm_to_dma_pfn(iova->pfn_lo); From 7766a3fb905f0b078b05f5d6a6be8df4c64b9f51 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 1 Jul 2009 20:27:03 +0100 Subject: [PATCH 42/58] intel-iommu: Use cmpxchg64_local() for setting PTEs Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 2bbc3fc88dc9..2c1b2babfdc5 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1682,7 +1682,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, /* We don't need lock here, nobody else * touches the iova range */ - tmp = cmpxchg64(&pte->val, 0ULL, pteval); + tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); if (tmp) { static int dumps = 5; printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", From 75e6bf9638992dfc0fec9c3ca10444c8e0d6a638 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 2 Jul 2009 11:21:16 +0100 Subject: [PATCH 43/58] intel-iommu: Introduce first_pte_in_page() to simplify PTE-setting loops On Wed, 2009-07-01 at 16:59 -0700, Linus Torvalds wrote: > I also _really_ hate how you do > > (unsigned long)pte >> VTD_PAGE_SHIFT == > (unsigned long)first_pte >> VTD_PAGE_SHIFT Kill this, in favour of just looking to see if the incremented pte pointer has 'wrapped' onto the next page. Which means we have to check it _after_ incrementing it, not before. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 2c1b2babfdc5..dcf0295a9b60 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -240,6 +240,11 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } +static inline int first_pte_in_page(struct dma_pte *pte) +{ + return !((unsigned long)pte & ~VTD_PAGE_MASK); +} + /* * This domain is a statically identity mapping domain. * 1. This domain creats a static 1:1 mapping to all usable memory. 
@@ -780,13 +785,12 @@ static void dma_pte_clear_range(struct dmar_domain *domain, start_pfn = align_to_level(start_pfn + 1, 2); continue; } - while (start_pfn <= last_pfn && - (unsigned long)pte >> VTD_PAGE_SHIFT == - (unsigned long)first_pte >> VTD_PAGE_SHIFT) { + do { dma_clear_pte(pte); start_pfn++; pte++; - } + } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); + domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); } @@ -821,14 +825,14 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, tmp = align_to_level(tmp + 1, level + 1); continue; } - while (tmp + level_size(level) - 1 <= last_pfn && - (unsigned long)pte >> VTD_PAGE_SHIFT == - (unsigned long)first_pte >> VTD_PAGE_SHIFT) { + do { free_pgtable_page(phys_to_virt(dma_pte_addr(pte))); dma_clear_pte(pte); pte++; tmp += level_size(level); - } + } while (!first_pte_in_page(pte) && + tmp + level_size(level) - 1 <= last_pfn); + domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); @@ -1694,9 +1698,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, WARN_ON(1); } pte++; - if (!nr_pages || - (unsigned long)pte >> VTD_PAGE_SHIFT != - (unsigned long)first_pte >> VTD_PAGE_SHIFT) { + if (!nr_pages || first_pte_in_page(pte)) { domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); pte = NULL; From 6a43e574c5af7d9bd084992b1c9c3cdbc3b6c0e9 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 2 Jul 2009 12:02:34 +0100 Subject: [PATCH 44/58] intel-iommu: Don't keep freeing page zero in dma_pte_free_pagetable() Check dma_pte_present() and only free the page if there _is_ one. Kind of surprising that there was no warning about this. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index dcf0295a9b60..53075424a434 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -826,8 +826,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, continue; } do { - free_pgtable_page(phys_to_virt(dma_pte_addr(pte))); - dma_clear_pte(pte); + if (dma_pte_present(pte)) { + free_pgtable_page(phys_to_virt(dma_pte_addr(pte))); + dma_clear_pte(pte); + } pte++; tmp += level_size(level); } while (!first_pte_in_page(pte) && From bdae997f44535ac4ebe1e055ffe59eeee946f453 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Wed, 1 Jul 2009 21:56:38 -0700 Subject: [PATCH 45/58] fs/notify/inotify: decrement user inotify count on close The per-user inotify_devs value is incremented each time a new file is allocated, but never decremented. This led to inotify_init failing after a limited number of calls. 
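The bug class is an unbalanced resource counter: incremented on creation, never decremented on release, so the per-user limit eventually locks the user out for good. A minimal standalone model of the balanced version:

    #include <stdatomic.h>

    struct user { atomic_int inotify_devs; };

    static int dev_open(struct user *u, int max)
    {
        if (atomic_fetch_add(&u->inotify_devs, 1) >= max) {
            atomic_fetch_sub(&u->inotify_devs, 1);   /* over limit: back out */
            return -1;
        }
        return 0;
    }

    static void dev_release(struct user *u)
    {
        atomic_fetch_sub(&u->inotify_devs, 1);       /* the missing decrement */
    }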
Signed-off-by: Keith Packard Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ff231ad23895..ff27a2965844 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -296,12 +296,15 @@ static int inotify_fasync(int fd, struct file *file, int on) static int inotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; + struct user_struct *user = group->inotify_data.user; fsnotify_clear_marks_by_group(group); /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_put_group(group); + atomic_dec(&user->inotify_devs); + return 0; } From f597bb19ccd034cbcf05e1194238e2c8d9505a8a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 27 Jun 2009 21:06:22 -0400 Subject: [PATCH 46/58] Btrfs: don't log the inode in file_write while growing the file --- fs/btrfs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 126477eaecf5..7c3cd248d8d6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -151,7 +151,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, } if (end_pos > isize) { i_size_write(inode, end_pos); - btrfs_update_inode(trans, root, inode); + /* we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ } err = btrfs_end_transaction(trans, root); out_unlock: From c8a894d77de4a1e0a544577fd4eabc9aacd453a8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 27 Jun 2009 21:07:03 -0400 Subject: [PATCH 47/58] Btrfs: fix the file clone ioctl for preallocated extents --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index eff18f5b5362..9f4db848db10 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1028,7 +1028,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, struct btrfs_file_extent_item); comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG) { + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { disko = btrfs_file_extent_disk_bytenr(leaf, extent); diskl = btrfs_file_extent_disk_num_bytes(leaf, @@ -1051,7 +1052,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.objectid = inode->i_ino; new_key.offset = key.offset + destoff - off; - if (type == BTRFS_FILE_EXTENT_REG) { + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); if (ret) From a970b0a16cc416a509d5ae8b1d70978664e6f4fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 27 Jun 2009 21:07:34 -0400 Subject: [PATCH 48/58] Btrfs: account for space we may use in fallocate Using Eric Sandeen's xfstest for fallocate, you can easily trigger an ENOSPC panic on btrfs. This is because we do not account for data we may use when doing the fallocate. This patch fixes the problem by properly reserving space, and then just freeing it when we are done. The reservation stuff was made with delalloc in mind, so it's a little crude for this case, but it keeps the box from panicking.
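The shape of the fix is reserve-up-front, release-on-every-exit: take the reservation before any allocation can happen, and route both the success and failure paths through the release. A hedged standalone sketch in which dummy helpers stand in for the btrfs ones, and the free pool and reservation are merged into one struct for brevity:

    struct space { long reserved, free; };

    static int reserve(struct space *s, long bytes)
    {
        if (s->free < bytes)
            return -1;              /* fail cleanly instead of panicking later */
        s->free -= bytes;
        s->reserved += bytes;
        return 0;
    }

    static void unreserve(struct space *s, long bytes)
    {
        s->reserved -= bytes;
        s->free += bytes;
    }

    static int allocate_extents(struct space *s, long bytes)
    {
        (void)s; (void)bytes;       /* dummy: pretend the allocation worked */
        return 0;
    }

    static int fallocate_sketch(struct space *s, long bytes)
    {
        int ret = reserve(s, bytes);
        if (ret)
            return ret;
        ret = allocate_extents(s, bytes);
        unreserve(s, bytes);        /* like out_free: runs on success and failure */
        return ret;
    }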
Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b68330f8585..1eacc78f6614 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5103,6 +5103,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; struct btrfs_trans_handle *trans; + struct btrfs_root *root; int ret; alloc_start = offset & ~mask; @@ -5121,6 +5122,13 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + root = BTRFS_I(inode)->root; + + ret = btrfs_check_data_free_space(root, inode, + alloc_end - alloc_start); + if (ret) + goto out; + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -5128,7 +5136,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); if (!trans) { ret = -EIO; - goto out_free; + goto out_free; } /* the extent lock is ordered inside the running @@ -5189,6 +5197,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, GFP_NOFS); btrfs_end_transaction(trans, BTRFS_I(inode)->root); +out_free: + btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; From 2c47e605a91dde6b0514f689645e7ab336c8592a Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Sat, 27 Jun 2009 21:07:35 -0400 Subject: [PATCH 49/58] Btrfs: update backrefs while dropping snapshot The new backref format has a restriction on the type of backref item. If a tree block isn't referenced by its owner tree, full backrefs must be used for the pointers in it. When a tree block loses its owner tree's reference, backrefs for the pointers in it should be updated to full backrefs. Current btrfs_drop_snapshot misses the code that updates backrefs, so it's unsafe for general use. This patch adds backref update code to btrfs_drop_snapshot. It isn't a problem in the restricted form in which btrfs_drop_snapshot is used today, but for general snapshot deletion this update is required.
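The control flow introduced here is a small state machine carried in walk_control: the walk runs in DROP_REFERENCE mode until it meets a shared block whose backrefs need fixing, switches to UPDATE_BACKREF for that subtree, and drops back once the walk climbs past the block where it switched. A rough standalone sketch of just the stage handling (everything simplified; nothing below is the btrfs API):

    enum stage { DROP_REFERENCE = 1, UPDATE_BACKREF = 2 };

    struct walk_control_sketch {
        enum stage stage;
        int level;          /* current depth in the tree */
        int shared_level;   /* where we switched to UPDATE_BACKREF */
        int update_ref;     /* caller asked for backref fixups */
    };

    /* going down: decide whether this block flips us into backref mode */
    static int down_proc(struct walk_control_sketch *wc, unsigned refs)
    {
        if (wc->stage == DROP_REFERENCE && wc->update_ref && refs > 1) {
            wc->stage = UPDATE_BACKREF;     /* fix backrefs under this block */
            wc->shared_level = wc->level;
        }
        /* in DROP_REFERENCE mode, a shared block means stop descending */
        return wc->stage == DROP_REFERENCE && refs > 1;
    }

    /* coming back up: once past the shared block, resume dropping refs */
    static void up_proc(struct walk_control_sketch *wc)
    {
        if (wc->stage == UPDATE_BACKREF && wc->level >= wc->shared_level) {
            wc->stage = DROP_REFERENCE;
            wc->shared_level = -1;
        }
    }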
Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +- fs/btrfs/extent-tree.c | 574 ++++++++++++++++++++++++++++------------- fs/btrfs/relocation.c | 5 +- fs/btrfs/transaction.c | 4 +- 4 files changed, 400 insertions(+), 186 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03441a99ea38..a404ecc53eb1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2076,8 +2076,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root - *root); +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index edc7d208c5ce..cd64cfc14f7f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -990,15 +990,13 @@ static inline int extent_ref_type(u64 parent, u64 owner) return type; } -static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) { - int level; - BUG_ON(!path->keep_locks); - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + for (; level < BTRFS_MAX_LEVEL; level++) { if (!path->nodes[level]) break; - btrfs_assert_tree_locked(path->nodes[level]); if (path->slots[level] + 1 >= btrfs_header_nritems(path->nodes[level])) continue; @@ -1158,7 +1156,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, * For simplicity, we just do not add new inline back * ref if there is any kind of item for this block */ - if (find_next_key(path, &key) == 0 && key.objectid == bytenr && + if (find_next_key(path, 0, &key) == 0 && + key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { err = -EAGAIN; goto out; @@ -4128,6 +4127,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } +#if 0 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf) { @@ -4171,8 +4171,6 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } -#if 0 - static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_leaf_ref *ref) @@ -4553,262 +4551,471 @@ out: } #endif +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; +}; + +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 + /* - * helper function for drop_subtree, this function is similar to - * walk_down_tree. The main difference is that it checks reference - * counts while tree blocks are locked. + * hepler to process tree block while walking down the tree. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block. if the block is shared and + * we need update back refs for the subtree rooted at the + * block, this function changes wc->stage to UPDATE_BACKREF + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. 
*/ +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + struct btrfs_key key; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; + + /* + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ + if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + } + + if (wc->stage == DROP_REFERENCE && + wc->update_ref && wc->refs[level] > 1) { + BUG_ON(eb == root->node); + BUG_ON(path->slots[level] > 0); + if (level == 0) + btrfs_item_key_to_cpu(eb, &key, path->slots[level]); + else + btrfs_node_key_to_cpu(eb, &key, path->slots[level]); + if (btrfs_header_owner(eb) == root->root_key.objectid && + btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { + wc->stage = UPDATE_BACKREF; + wc->shared_level = level; + } + } + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; + + if (path->locks[level] && !wc->keep_locks) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; + } + + /* wc->stage == UPDATE_BACKREF */ + if (!(wc->flags[level] & flag)) { + BUG_ON(!path->locks[level]); + ret = btrfs_inc_ref(trans, root, eb, 1); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + ret = btrfs_set_disk_extent_flags(trans, root, eb->start, + eb->len, flag, 0); + BUG_ON(ret); + wc->flags[level] |= flag; + } + + /* + * the block is shared by multiple trees, so it's not good to + * keep the tree lock + */ + if (path->locks[level] && level > 0) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; +} + +/* + * hepler to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. + * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. + */ +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int ret = 0; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; + + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; + + BUG_ON(wc->refs[level] <= 1); + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; + + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; + + /* + * check reference count again if the block isn't locked. + * we should start walking down the tree again if reference + * count is one. 
+		 */
+		if (!path->locks[level]) {
+			BUG_ON(level == 0);
+			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
+			path->locks[level] = 1;
+
+			ret = btrfs_lookup_extent_info(trans, root,
+						       eb->start, eb->len,
+						       &wc->refs[level],
+						       &wc->flags[level]);
+			BUG_ON(ret);
+			BUG_ON(wc->refs[level] == 0);
+			if (wc->refs[level] == 1) {
+				btrfs_tree_unlock(eb);
+				path->locks[level] = 0;
+				return 1;
+			}
+		} else {
+			BUG_ON(level != 0);
+		}
+	}
+
+	/* wc->stage == DROP_REFERENCE */
+	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+
+	if (wc->refs[level] == 1) {
+		if (level == 0) {
+			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+				ret = btrfs_dec_ref(trans, root, eb, 1);
+			else
+				ret = btrfs_dec_ref(trans, root, eb, 0);
+			BUG_ON(ret);
+		}
+		/* make block locked assertion in clean_tree_block happy */
+		if (!path->locks[level] &&
+		    btrfs_header_generation(eb) == trans->transid) {
+			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
+			path->locks[level] = 1;
+		}
+		clean_tree_block(trans, root, eb);
+	}
+
+	if (eb == root->node) {
+		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+			parent = eb->start;
+		else
+			BUG_ON(root->root_key.objectid !=
+			       btrfs_header_owner(eb));
+	} else {
+		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+			parent = path->nodes[level + 1]->start;
+		else
+			BUG_ON(root->root_key.objectid !=
+			       btrfs_header_owner(path->nodes[level + 1]));
+	}
+
+	ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
+				root->root_key.objectid, level, 0);
+	BUG_ON(ret);
+out:
+	wc->refs[level] = 0;
+	wc->flags[level] = 0;
+	return ret;
+}
+
 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
-				   struct btrfs_path *path, int *level)
+				   struct btrfs_path *path,
+				   struct walk_control *wc)
 {
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
-	struct extent_buffer *parent;
 	u64 bytenr;
 	u64 ptr_gen;
-	u64 refs;
-	u64 flags;
 	u32 blocksize;
+	int level = wc->level;
 	int ret;
 
-	cur = path->nodes[*level];
-	ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len,
-				       &refs, &flags);
-	BUG_ON(ret);
-	if (refs > 1)
-		goto out;
+	while (level >= 0) {
+		cur = path->nodes[level];
+		BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
 
-	BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
-
-	while (*level >= 0) {
-		cur = path->nodes[*level];
-		if (*level == 0) {
-			ret = btrfs_drop_leaf_ref(trans, root, cur);
-			BUG_ON(ret);
-			clean_tree_block(trans, root, cur);
+		ret = walk_down_proc(trans, root, path, wc);
+		if (ret > 0)
 			break;
-		}
-		if (path->slots[*level] >= btrfs_header_nritems(cur)) {
-			clean_tree_block(trans, root, cur);
-			break;
-		}
-		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
-		blocksize = btrfs_level_size(root, *level - 1);
-		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+
+		if (level == 0)
+			break;
+
+		bytenr = btrfs_node_blockptr(cur, path->slots[level]);
+		blocksize = btrfs_level_size(root, level - 1);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
 
 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		btrfs_tree_lock(next);
 		btrfs_set_lock_blocking(next);
 
-		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
-					       &refs, &flags);
-		BUG_ON(ret);
-		if (refs > 1) {
-			parent = path->nodes[*level];
-			ret = btrfs_free_extent(trans, root, bytenr,
-						blocksize, parent->start,
-						btrfs_header_owner(parent),
-						*level - 1, 0);
-			BUG_ON(ret);
-			path->slots[*level]++;
-			btrfs_tree_unlock(next);
-			free_extent_buffer(next);
-			continue;
-		}
-
-		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
-
-		*level = btrfs_header_level(next);
-		path->nodes[*level] = next;
-		path->slots[*level] = 0;
-		path->locks[*level] = 1;
-		cond_resched();
+		level--;
+		BUG_ON(level != btrfs_header_level(next));
+		path->nodes[level] = next;
+		path->slots[level] = 0;
+		path->locks[level] = 1;
+		wc->level = level;
 	}
-out:
-	if (path->nodes[*level] == root->node)
-		parent = path->nodes[*level];
-	else
-		parent = path->nodes[*level + 1];
-	bytenr = path->nodes[*level]->start;
-	blocksize = path->nodes[*level]->len;
-
-	ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start,
-				btrfs_header_owner(parent), *level, 0);
-	BUG_ON(ret);
-
-	if (path->locks[*level]) {
-		btrfs_tree_unlock(path->nodes[*level]);
-		path->locks[*level] = 0;
-	}
-	free_extent_buffer(path->nodes[*level]);
-	path->nodes[*level] = NULL;
-	*level += 1;
-	cond_resched();
 	return 0;
 }
 
-/*
- * helper for dropping snapshots. This walks back up the tree in the path
- * to find the first node higher up where we haven't yet gone through
- * all the slots
- */
 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
-				 int *level, int max_level)
+				 struct walk_control *wc, int max_level)
 {
-	struct btrfs_root_item *root_item = &root->root_item;
-	int i;
-	int slot;
+	int level = wc->level;
 	int ret;
 
-	for (i = *level; i < max_level && path->nodes[i]; i++) {
-		slot = path->slots[i];
-		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
-			/*
-			 * there is more work to do in this level.
-			 * Update the drop_progress marker to reflect
-			 * the work we've done so far, and then bump
-			 * the slot number
-			 */
-			path->slots[i]++;
-			WARN_ON(*level == 0);
-			if (max_level == BTRFS_MAX_LEVEL) {
-				btrfs_node_key(path->nodes[i],
-					       &root_item->drop_progress,
-					       path->slots[i]);
-				root_item->drop_level = i;
-			}
-			*level = i;
+	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
+	while (level < max_level && path->nodes[level]) {
+		wc->level = level;
+		if (path->slots[level] + 1 <
+		    btrfs_header_nritems(path->nodes[level])) {
+			path->slots[level]++;
 			return 0;
 		} else {
-			struct extent_buffer *parent;
+			ret = walk_up_proc(trans, root, path, wc);
+			if (ret > 0)
+				return 0;
 
-			/*
-			 * this whole node is done, free our reference
-			 * on it and go up one level
-			 */
-			if (path->nodes[*level] == root->node)
-				parent = path->nodes[*level];
-			else
-				parent = path->nodes[*level + 1];
-
-			clean_tree_block(trans, root, path->nodes[i]);
-			ret = btrfs_free_extent(trans, root,
-						path->nodes[i]->start,
-						path->nodes[i]->len,
-						parent->start,
-						btrfs_header_owner(parent),
-						*level, 0);
-			BUG_ON(ret);
-			if (path->locks[*level]) {
-				btrfs_tree_unlock(path->nodes[i]);
-				path->locks[i] = 0;
+			if (path->locks[level]) {
+				btrfs_tree_unlock(path->nodes[level]);
+				path->locks[level] = 0;
 			}
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-			*level = i + 1;
+			free_extent_buffer(path->nodes[level]);
+			path->nodes[level] = NULL;
+			level++;
 		}
 	}
 	return 1;
 }
 
 /*
- * drop the reference count on the tree rooted at 'snap'. This traverses
- * the tree freeing any blocks that have a ref count of zero after being
- * decremented.
+ * drop a subvolume tree.
+ *
+ * this function traverses the tree freeing any blocks that are only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found, this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also makes sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
  */
-int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root)
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
 {
-	int ret = 0;
-	int wret;
-	int level;
 	struct btrfs_path *path;
-	int update_count;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	struct btrfs_root_item *root_item = &root->root_item;
+	struct walk_control *wc;
+	struct btrfs_key key;
+	int err = 0;
+	int ret;
+	int level;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	level = btrfs_header_level(root->node);
+	wc = kzalloc(sizeof(*wc), GFP_NOFS);
+	BUG_ON(!wc);
+
+	trans = btrfs_start_transaction(tree_root, 1);
+
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+		level = btrfs_header_level(root->node);
 		path->nodes[level] = btrfs_lock_root_node(root);
 		btrfs_set_lock_blocking(path->nodes[level]);
 		path->slots[level] = 0;
 		path->locks[level] = 1;
+		memset(&wc->update_progress, 0,
+		       sizeof(wc->update_progress));
 	} else {
-		struct btrfs_key key;
-		struct btrfs_disk_key found_key;
-		struct extent_buffer *node;
-
 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+		memcpy(&wc->update_progress, &key,
+		       sizeof(wc->update_progress));
+
 		level = root_item->drop_level;
+		BUG_ON(level == 0);
 		path->lowest_level = level;
-		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (wret < 0) {
-			ret = wret;
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		path->lowest_level = 0;
+		if (ret < 0) {
+			err = ret;
 			goto out;
 		}
-		node = path->nodes[level];
-		btrfs_node_key(node, &found_key, path->slots[level]);
-		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
-			       sizeof(found_key)));
+		btrfs_node_key_to_cpu(path->nodes[level], &key,
+				      path->slots[level]);
+		WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
+
 		/*
 		 * unlock our path, this is safe because only this
 		 * function is allowed to delete this snapshot
 		 */
 		btrfs_unlock_up_safe(path, 0);
-	}
-	while (1) {
-		unsigned long update;
-		wret = walk_down_tree(trans, root, path, &level);
-		if (wret > 0)
-			break;
-		if (wret < 0)
-			ret = wret;
 
-		wret = walk_up_tree(trans, root, path, &level,
-				    BTRFS_MAX_LEVEL);
-		if (wret > 0)
-			break;
-		if (wret < 0)
-			ret = wret;
-		if (trans->transaction->in_commit ||
-		    trans->transaction->delayed_refs.flushing) {
-			ret = -EAGAIN;
+		level = btrfs_header_level(root->node);
+		while (1) {
+			btrfs_tree_lock(path->nodes[level]);
+			btrfs_set_lock_blocking(path->nodes[level]);
+
+			ret = btrfs_lookup_extent_info(trans, root,
+						path->nodes[level]->start,
+						path->nodes[level]->len,
+						&wc->refs[level],
+						&wc->flags[level]);
+			BUG_ON(ret);
+			BUG_ON(wc->refs[level] == 0);
+
+			if (level == root_item->drop_level)
+				break;
+
+			btrfs_tree_unlock(path->nodes[level]);
+			WARN_ON(wc->refs[level] != 1);
+			level--;
+		}
+	}
+
+	wc->level = level;
+	wc->shared_level = -1;
+	wc->stage = DROP_REFERENCE;
+	wc->update_ref = update_ref;
+	wc->keep_locks = 0;
+
+	while (1) {
+		ret = walk_down_tree(trans, root, path, wc);
+		if (ret < 0) {
+			err = ret;
 			break;
 		}
 
-		for (update_count = 0; update_count < 16; update_count++) {
+		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
+		if (ret < 0) {
+			err = ret;
+			break;
+		}
+
+		if (ret > 0) {
+			BUG_ON(wc->stage != DROP_REFERENCE);
+			break;
+		}
+
+		if (wc->stage == DROP_REFERENCE) {
+			level = wc->level;
+			btrfs_node_key(path->nodes[level],
+				       &root_item->drop_progress,
+				       path->slots[level]);
+			root_item->drop_level = level;
+		}
+
+		BUG_ON(wc->level == 0);
+		if (trans->transaction->in_commit ||
+		    trans->transaction->delayed_refs.flushing) {
+			ret = btrfs_update_root(trans, tree_root,
+						&root->root_key,
+						root_item);
+			BUG_ON(ret);
+
+			btrfs_end_transaction(trans, tree_root);
+			trans = btrfs_start_transaction(tree_root, 1);
+		} else {
+			unsigned long update;
 			update = trans->delayed_ref_updates;
 			trans->delayed_ref_updates = 0;
 			if (update)
-				btrfs_run_delayed_refs(trans, root, update);
-			else
-				break;
+				btrfs_run_delayed_refs(trans, tree_root,
+						       update);
 		}
 	}
+	btrfs_release_path(root, path);
+	BUG_ON(err);
+
+	ret = btrfs_del_root(trans, tree_root, &root->root_key);
+	BUG_ON(ret);
+
+	free_extent_buffer(root->node);
+	free_extent_buffer(root->commit_root);
+	kfree(root);
 out:
+	btrfs_end_transaction(trans, tree_root);
+	kfree(wc);
 	btrfs_free_path(path);
-	return ret;
+	return err;
 }
 
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct extent_buffer *node,
 			struct extent_buffer *parent)
 {
 	struct btrfs_path *path;
+	struct walk_control *wc;
 	int level;
 	int parent_level;
 	int ret = 0;
 	int wret;
 
+	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	wc = kzalloc(sizeof(*wc), GFP_NOFS);
+	BUG_ON(!wc);
+
 	btrfs_assert_tree_locked(parent);
 	parent_level = btrfs_header_level(parent);
 	extent_buffer_get(parent);
@@ -4817,24 +5024,33 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	btrfs_assert_tree_locked(node);
 	level = btrfs_header_level(node);
-	extent_buffer_get(node);
 	path->nodes[level] = node;
 	path->slots[level] = 0;
+	path->locks[level] = 1;
+
+	wc->refs[parent_level] = 1;
+	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+	wc->level = level;
+	wc->shared_level = -1;
+	wc->stage = DROP_REFERENCE;
+	wc->update_ref = 0;
+	wc->keep_locks = 1;
 
 	while (1) {
-		wret = walk_down_tree(trans, root, path, &level);
-		if (wret < 0)
+		wret = walk_down_tree(trans, root, path, wc);
+		if (wret < 0) {
 			ret = wret;
-		if (wret != 0)
 			break;
+		}
 
-		wret = walk_up_tree(trans, root, path, &level, parent_level);
+		wret = walk_up_tree(trans, root, path, wc, parent_level);
 		if (wret < 0)
 			ret = wret;
 		if (wret != 0)
 			break;
 	}
 
+	kfree(wc);
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b23dc209ae10..008397934778 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1788,7 +1788,7 @@ static void merge_func(struct btrfs_work *work)
 		btrfs_end_transaction(trans, root);
 	}
 
-	btrfs_drop_dead_root(reloc_root);
+	btrfs_drop_snapshot(reloc_root, 0);
 
 	if (atomic_dec_and_test(async->num_pending))
 		complete(async->done);
@@ -2075,9 +2075,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
 		BUG_ON(ret);
-
-		btrfs_tree_unlock(eb);
-		free_extent_buffer(eb);
 	}
 	if (!lowest) {
 		btrfs_tree_unlock(upper->eb);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4e83457ea253..2dbf1c1f56ee 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -593,6 +593,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	return 0;
 }
 
+#if 0
 /*
  * when dropping snapshots, we generate a ton of delayed refs, and it makes
  * sense not to join the transaction while it is trying to flush the current
@@ -681,6 +682,7 @@ int btrfs_drop_dead_root(struct btrfs_root *root)
 	btrfs_btree_balance_dirty(tree_root, nr);
 	return ret;
 }
+#endif
 
 /*
  * new snapshots need to be created at a very specific time in the
@@ -1081,7 +1083,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 	while (!list_empty(&list)) {
 		root = list_entry(list.next, struct btrfs_root, root_list);
 		list_del_init(&root->root_list);
-		btrfs_drop_dead_root(root);
+		btrfs_drop_snapshot(root, 0);
 	}
 	return 0;
 }

From 9427216476d4fa75103f39d4b228c47d56ba20da Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Thu, 2 Jul 2009 12:26:06 -0400
Subject: [PATCH 50/58] Btrfs: honor nodatacow/sum mount options for new files

The btrfs attr patches unconditionally inherited the inode flags field
without honoring nodatacow and nodatasum. This fix makes sure we properly
record the nodatacow/sum mount options in new inodes.

Signed-off-by: Chris Mason
---
 fs/btrfs/inode.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1eacc78f6614..a48c084f6d3a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3587,12 +3587,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 1;
 	BTRFS_I(inode)->block_group =
 			btrfs_find_block_group(root, 0, alloc_hint, owner);
-	if ((mode & S_IFREG)) {
-		if (btrfs_test_opt(root, NODATASUM))
-			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-		if (btrfs_test_opt(root, NODATACOW))
-			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
-	}
 
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -3647,6 +3641,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
 	btrfs_inherit_iflags(inode, dir);
 
+	if ((mode & S_IFREG)) {
+		if (btrfs_test_opt(root, NODATASUM))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+		if (btrfs_test_opt(root, NODATACOW))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+	}
+
 	insert_inode_hash(inode);
 	inode_tree_add(inode);
 	return inode;

From 9b627e9bf49ebfeb060dfae0435bdba06cf27cb8 Mon Sep 17 00:00:00 2001
From: Jiri Slaby
Date: Thu, 2 Jul 2009 13:50:58 -0400
Subject: [PATCH 51/58] Btrfs: fix use after free in btrfs_start_workers fail path

worker memory is already freed on one fail path in btrfs_start_workers,
but is still dereferenced. Switch the dereference and kfree.

Signed-off-by: Jiri Slaby
Signed-off-by: Chris Mason
---
 fs/btrfs/async-thread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7f88628a1a72..6e4f6c50a120 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -299,8 +299,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 					   "btrfs-%s-%d", workers->name,
 					   workers->num_workers + i);
 		if (IS_ERR(worker->task)) {
-			kfree(worker);
 			ret = PTR_ERR(worker->task);
+			kfree(worker);
 			goto fail;
 		}

From 68f5a38c3ea4ae9cc7a40f86ff6d6d031583d93a Mon Sep 17 00:00:00 2001
From: Hu Tao
Date: Thu, 2 Jul 2009 13:55:45 -0400
Subject: [PATCH 52/58] Btrfs: fix error message formatting

Make an error msg look nicer by inserting a space between a number and
the word that follows it.
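The underlying C rule is worth spelling out: adjacent string literals are
concatenated at compile time, so a format string split across source lines
silently loses the word break unless one fragment carries an explicit
trailing space. A minimal user-space sketch of the failure mode (hypothetical
values, not the actual btrfs message):

#include <stdio.h>

int main(void)
{
	unsigned long long may_use = 42, total = 100;

	/* adjacent literals concatenate; without a trailing space this
	 * prints "42 may use100 total" */
	printf("%llu may use" "%llu total\n", may_use, total);

	/* with the trailing space it prints "42 may use 100 total" */
	printf("%llu may use " "%llu total\n", may_use, total);
	return 0;
}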
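The use-after-free fixed in the btrfs_start_workers patch above follows a
general pattern: any value still needed from an object must be read out
before the object is freed. A minimal sketch of the corrected ordering,
using simplified user-space types rather than the actual driver structures:

#include <stdio.h>
#include <stdlib.h>

struct worker { long task_err; };

static long start_worker(void)
{
	struct worker *worker = malloc(sizeof(*worker));
	if (!worker)
		return -1;
	worker->task_err = -12;	/* pretend the thread start failed */

	/* read the error out of the object *before* freeing it; the
	 * original bug freed the object first, then dereferenced it */
	long ret = worker->task_err;
	free(worker);
	return ret;
}

int main(void)
{
	printf("start_worker() = %ld\n", start_worker());
	return 0;
}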
Signed-off-by: Hu Tao
Signed-off-by: Jiri Kosina
Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cd64cfc14f7f..a5aca3997d42 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2696,7 +2696,7 @@ again:
 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
 		       ", %llu bytes_used, %llu bytes_reserved, "
-		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
 		       "%llu total\n", (unsigned long long)bytes,
 		       (unsigned long long)data_sinfo->bytes_delalloc,
 		       (unsigned long long)data_sinfo->bytes_used,

From 44e18e9eda1f5c318a888989d929188eea93c8dc Mon Sep 17 00:00:00 2001
From: Magnus Damm
Date: Fri, 3 Jul 2009 08:39:34 +0000
Subject: [PATCH 53/58] sh-sci: update receive error handling for muxed irqs

This patch updates the receive error code for muxed interrupts in
the sh-sci driver.

Receive error interrupts may be generated by the hardware if the RE or
REIE bits in SCSCR are set. Update the muxed interrupt handling code
to acknowledge error interrupts if RE or REIE is set, instead of only
acknowledging if REIE is set. Without this patch, error interrupts may
be generated but never acked, resulting in a "nobody cared" crash.

Signed-off-by: Magnus Damm
Signed-off-by: Paul Mundt
---
 drivers/serial/sh-sci.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 66f52674ca0c..12bd64684f12 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -707,12 +707,13 @@ static irqreturn_t sci_br_interrupt(int irq, void *ptr)
 
 static irqreturn_t sci_mpxed_interrupt(int irq, void *ptr)
 {
-	unsigned short ssr_status, scr_status;
+	unsigned short ssr_status, scr_status, err_enabled;
 	struct uart_port *port = ptr;
 	irqreturn_t ret = IRQ_NONE;
 
 	ssr_status = sci_in(port, SCxSR);
 	scr_status = sci_in(port, SCSCR);
+	err_enabled = scr_status & (SCI_CTRL_FLAGS_REIE | SCI_CTRL_FLAGS_RIE);
 
 	/* Tx Interrupt */
 	if ((ssr_status & 0x0020) && (scr_status & SCI_CTRL_FLAGS_TIE))
@@ -721,10 +722,10 @@ static irqreturn_t sci_mpxed_interrupt(int irq, void *ptr)
 	if ((ssr_status & 0x0002) && (scr_status & SCI_CTRL_FLAGS_RIE))
 		ret = sci_rx_interrupt(irq, ptr);
 	/* Error Interrupt */
-	if ((ssr_status & 0x0080) && (scr_status & SCI_CTRL_FLAGS_REIE))
+	if ((ssr_status & 0x0080) && err_enabled)
 		ret = sci_er_interrupt(irq, ptr);
 	/* Break Interrupt */
-	if ((ssr_status & 0x0010) && (scr_status & SCI_CTRL_FLAGS_REIE))
+	if ((ssr_status & 0x0010) && err_enabled)
 		ret = sci_br_interrupt(irq, ptr);
 
 	return ret;

From 64614e66fbfcaa57598c1678d699520dfbf4f531 Mon Sep 17 00:00:00 2001
From: Magnus Damm
Date: Fri, 3 Jul 2009 09:12:46 +0000
Subject: [PATCH 54/58] usb: allow sh7724 to enable on-chip r8a66597

The sh7724 processor has two on-chip r8a66597 blocks, so add it to
the list of processors for SUPERH_ON_CHIP_R8A66597.
Signed-off-by: Magnus Damm
Signed-off-by: Paul Mundt
---
 drivers/usb/host/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
index 1576a0520adf..0c03471f0d41 100644
--- a/drivers/usb/host/Kconfig
+++ b/drivers/usb/host/Kconfig
@@ -337,10 +337,10 @@ config USB_R8A66597_HCD
 
 config SUPERH_ON_CHIP_R8A66597
 	boolean "Enable SuperH on-chip R8A66597 USB"
-	depends on USB_R8A66597_HCD && (CPU_SUBTYPE_SH7366 || CPU_SUBTYPE_SH7723)
+	depends on USB_R8A66597_HCD && (CPU_SUBTYPE_SH7366 || CPU_SUBTYPE_SH7723 || CPU_SUBTYPE_SH7724)
 	help
 	   This driver enables support for the on-chip R8A66597 in the
-	   SH7366 and SH7723 processors.
+	   SH7366, SH7723 and SH7724 processors.
 
 config USB_WHCI_HCD
 	tristate "Wireless USB Host Controller Interface (WHCI) driver (EXPERIMENTAL)"

From 9731f4a202f29ff402ea2ddad7ff6f3a559c0c82 Mon Sep 17 00:00:00 2001
From: Magnus Damm
Date: Fri, 3 Jul 2009 09:40:03 +0000
Subject: [PATCH 55/58] sh: add r8a66597 usb0 host to the se7724 board

Add USB host support for port CN27 on the Solution Engine 7724 board.

The r8a66597-hcd driver is hooked up as a platform device and some
registers are configured to enable the USB host function. The hardware
driving the USB port is the on-chip USB0 block in the sh7724 processor
configured as USB host controller.

Signed-off-by: Magnus Damm
Signed-off-by: Paul Mundt
---
 arch/sh/boards/mach-se/7724/setup.c | 38 +++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c
index c050a8d76dfd..8fed45a2fb85 100644
--- a/arch/sh/boards/mach-se/7724/setup.c
+++ b/arch/sh/boards/mach-se/7724/setup.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include