Re: [PATCH v6 RESED 1/2] dma: replace zone_dma_bits by zone_dma_limit
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: 2024-08-26 19:28:34
Also in:
linux-arm-kernel, linux-iommu, linux-rockchip, linux-s390, lkml
Dear All, On 11.08.2024 09:09, Baruch Siach wrote:
From: Catalin Marinas <catalin.marinas@arm.com> Hardware DMA limit might not be power of 2. When RAM range starts above 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit can not encode this limit. Use plain address for DMA zone limit. Since DMA zone can now potentially span beyond 4GB physical limit of DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Co-developed-by: Baruch Siach <baruch@tkos.co.il> Signed-off-by: Baruch Siach <baruch@tkos.co.il> ---
This patch landed recently in linux-next as commit ba0fb44aed47
("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my
tests I found that it introduces the following warning on ARM64/Rockchip
based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts):
------------[ cut here ]------------
dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096
overflow (mask ffffffff, bus limit 0).
WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308
Modules linked in:
CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278
Hardware name: Hardkernel ODROID-M1 (DT)
pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : swiotlb_map+0x2f0/0x308
lr : swiotlb_map+0x2f0/0x308
...
Call trace:
swiotlb_map+0x2f0/0x308
dma_direct_map_sg+0x9c/0x2e4
__dma_map_sg_attrs+0x28/0x94
dma_map_sg_attrs+0x10/0x24
dw_mci_pre_dma_transfer+0xb8/0xf4
dw_mci_pre_req+0x50/0x68
mmc_blk_mq_issue_rq+0x3e0/0x964
mmc_mq_queue_rq+0x118/0x2b4
blk_mq_dispatch_rq_list+0x21c/0x714
__blk_mq_sched_dispatch_requests+0x490/0x58c
blk_mq_sched_dispatch_requests+0x30/0x6c
blk_mq_run_hw_queue+0x284/0x40c
blk_mq_flush_plug_list.part.0+0x190/0x974
blk_mq_flush_plug_list+0x1c/0x2c
__blk_flush_plug+0xe4/0x140
blk_finish_plug+0x38/0x4c
__ext4_get_inode_loc+0x22c/0x654
__ext4_get_inode_loc_noinmem+0x40/0xa8
__ext4_iget+0x154/0xcc0
ext4_get_journal_inode+0x30/0x110
ext4_load_and_init_journal+0x9c/0xaf0
ext4_fill_super+0x1fec/0x2d90
get_tree_bdev+0x140/0x1d8
ext4_get_tree+0x18/0x24
vfs_get_tree+0x28/0xe8
path_mount+0x3e8/0xb7c
init_mount+0x68/0xac
do_mount_root+0x108/0x1dc
mount_root_generic+0x100/0x330
mount_root+0x160/0x2d0
initrd_load+0x1f0/0x2a0
prepare_namespace+0x4c/0x29c
kernel_init_freeable+0x4b4/0x50c
kernel_init+0x20/0x1d8
ret_from_fork+0x10/0x20
irq event stamp: 1305682
hardirqs last enabled at (1305681): [<ffff8000800e332c>]
console_unlock+0x124/0x130
hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c
softirqs last enabled at (1305678): [<ffff80008005be1c>]
handle_softirqs+0x4cc/0x4e4
softirqs last disabled at (1305665): [<ffff8000800105b0>]
__do_softirq+0x14/0x20
---[ end trace 0000000000000000 ]---
This "bus limit 0" seems to be a bit suspicious to me as well as the
fact that swiotlb is used for the MMC DMA. I will investigate this
further tomorrow. The board boots fine though.
quoted hunk ↗ jump to hunk
arch/arm64/mm/init.c | 30 +++++++++++++++--------------- arch/powerpc/mm/mem.c | 5 ++++- arch/s390/mm/init.c | 2 +- include/linux/dma-direct.h | 2 +- kernel/dma/direct.c | 6 +++--- kernel/dma/pool.c | 4 ++-- kernel/dma/swiotlb.c | 6 +++--- 7 files changed, 29 insertions(+), 26 deletions(-)diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9b5ab6818f7f..c45e2152ca9e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c@@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void) } /* - * Return the maximum physical address for a zone accessible by the given bits - * limit. If DRAM starts above 32-bit, expand the zone to the maximum + * Return the maximum physical address for a zone given its limit. + * If DRAM starts above 32-bit, expand the zone to the maximum * available memory, otherwise cap it at 32-bit. */ -static phys_addr_t __init max_zone_phys(unsigned int zone_bits) +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) { - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); phys_addr_t phys_start = memblock_start_of_DRAM(); if (phys_start > U32_MAX) - zone_mask = PHYS_ADDR_MAX; - else if (phys_start > zone_mask) - zone_mask = U32_MAX; + zone_limit = PHYS_ADDR_MAX; + else if (phys_start > zone_limit) + zone_limit = U32_MAX; - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; } static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; - unsigned int __maybe_unused acpi_zone_dma_bits; - unsigned int __maybe_unused dt_zone_dma_bits; - phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); + phys_addr_t __maybe_unused acpi_zone_dma_limit; + phys_addr_t __maybe_unused dt_zone_dma_limit; + phys_addr_t __maybe_unused dma32_phys_limit = + max_zone_phys(DMA_BIT_MASK(32)); #ifdef CONFIG_ZONE_DMA - acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); - dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); - zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); - arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); + acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address(); + dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL); + zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit); + arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index d325217ab201..05b7f702b3f7 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c@@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void) * everything else. GFP_DMA32 page allocations automatically fall back to * ZONE_DMA. * - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by * ZONE_DMA.@@ -230,6 +230,7 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); + int zone_dma_bits; #ifdef CONFIG_HIGHMEM unsigned long v = __fix_to_virt(FIX_KMAP_END);@@ -256,6 +257,8 @@ void __init paging_init(void) else zone_dma_bits = 31; + zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); + #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = min(max_low_pfn, 1UL << (zone_dma_bits - PAGE_SHIFT));diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ddcd39ef4346..91fc2b91adfc 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c@@ -97,7 +97,7 @@ void __init paging_init(void) vmem_map_init(); sparse_init(); - zone_dma_bits = 31; + zone_dma_limit = DMA_BIT_MASK(31); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn;diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index edbe13d00776..d7e30d4f7503 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h@@ -12,7 +12,7 @@ #include <linux/mem_encrypt.h> #include <linux/swiotlb.h> -extern unsigned int zone_dma_bits; +extern u64 zone_dma_limit; /* * Record the mapping of CPU physical to DMA addresses for a given region.diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 4480a3cd92e0..f2ba074a6a54 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c@@ -20,7 +20,7 @@ * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ -unsigned int zone_dma_bits __ro_after_init = 24; +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys)@@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) * zones. */ *phys_limit = dma_to_phys(dev, dma_limit); - if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) + if (*phys_limit <= zone_dma_limit) return GFP_DMA; if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32;@@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask) * part of the check. */ if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); + min_mask = min_t(u64, min_mask, zone_dma_limit); return mask >= phys_to_dma_unencrypted(dev, min_mask); }diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index d10613eb0f63..7b04f7575796 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c@@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) /* CMA can't cross zone boundaries, see cma_activate_area() */ end = cma_get_base(cma) + size - 1; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) - return end <= DMA_BIT_MASK(zone_dma_bits); + return end <= zone_dma_limit; if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) - return end <= DMA_BIT_MASK(32); + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); return true; }diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index df68d29740a0..abcf3fa63a56 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c@@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, if (!remap) io_tlb_default_mem.can_grow = true; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); + io_tlb_default_mem.phys_limit = zone_dma_limit; else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); else io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); #endif@@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, } gfp &= ~GFP_ZONEMASK; - if (phys_limit <= DMA_BIT_MASK(zone_dma_bits)) + if (phys_limit <= zone_dma_limit) gfp |= __GFP_DMA; else if (phys_limit <= DMA_BIT_MASK(32)) gfp |= __GFP_DMA32;
Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland