Project import generated by Copybara.
GitOrigin-RevId: 2de72294c8e408329dc6ac31b7d92e07b6d457fb
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/csf/mali_kbase_csf_kcpu.c b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/csf/mali_kbase_csf_kcpu.c
index b7b0040..ea1a947 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/csf/mali_kbase_csf_kcpu.c
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/csf/mali_kbase_csf_kcpu.c
@@ -80,7 +80,14 @@
* on the physical pages tracking object. When the last
* reference to the tracking object is dropped the pages
* would be unpinned if they weren't unpinned before.
+ *
+ * Region should be CPU cached: abort if it isn't.
*/
+ if (WARN_ON(!(reg->flags & KBASE_REG_CPU_CACHED))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = kbase_jd_user_buf_pin_pages(kctx, reg);
if (ret)
goto out;
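
The check added above leans on the fact that WARN_ON() evaluates to its condition, so a non-CPU-cached region both triggers a kernel warning and makes the import fail with -EINVAL in one statement. Below is a minimal standalone sketch of that pattern; the demo_* names and the flag value are assumptions made for illustration only, not the driver's definitions.

    #include <linux/bug.h>
    #include <linux/errno.h>

    /* Hypothetical stand-ins for the kbase region type and flag. */
    #define DEMO_REG_CPU_CACHED (1u << 3)

    struct demo_region {
            unsigned long flags;
    };

    static int demo_validate_region(struct demo_region *reg)
    {
            /* WARN_ON() prints a warning with a backtrace and returns the
             * condition, so the caller is told to abort the import. */
            if (WARN_ON(!(reg->flags & DEMO_REG_CPU_CACHED)))
                    return -EINVAL;

            return 0;
    }
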
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.c b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.c
index a620b8d..9b26701 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.c
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -2050,6 +2050,7 @@
src = ((unsigned char *)kmap(gpu_page)) + offset;
dst = ((unsigned char *)kmap(cpu_page)) + offset;
}
+
memcpy(dst, src, size);
kunmap(gpu_page);
kunmap(cpu_page);
@@ -4890,10 +4891,7 @@
struct page **pages;
struct tagged_addr *pa;
long i, dma_mapped_pages;
- unsigned long address;
struct device *dev;
- unsigned long offset_within_page;
- unsigned long remaining_size;
unsigned long gwt_mask = ~0;
/* Calls to this function are inherently asynchronous, with respect to
* MMU operations.
@@ -4909,20 +4907,33 @@
alloc = reg->gpu_alloc;
pa = kbase_get_gpu_phy_pages(reg);
- address = alloc->imported.user_buf.address;
pinned_pages = alloc->nents;
pages = alloc->imported.user_buf.pages;
dev = kctx->kbdev->dev;
- offset_within_page = address & ~PAGE_MASK;
- remaining_size = alloc->imported.user_buf.size;
+ /* Manual CPU cache synchronization.
+ *
+ * The driver disables automatic CPU cache synchronization because the
+ * memory pages that enclose the imported region may also contain
+ * sub-regions that are not imported but are allocated and used
+ * by the user process. This may be the case for memory at the beginning
+ * of the first page and at the end of the last page. Automatic CPU cache
+ * synchronization would force some operations on those memory allocations,
+ * unbeknown to the user process: in particular, a CPU cache invalidate
+ * upon unmapping would destroy the content of dirty CPU caches and cause
+ * the user process to lose CPU writes to the non-imported sub-regions.
+ *
+ * When the GPU claims ownership of the imported memory buffer, the driver
+ * shall commit CPU writes for all of the pages that enclose the imported
+ * region; otherwise the initial content of memory would be wrong.
+ */
for (i = 0; i < pinned_pages; i++) {
- unsigned long map_size =
- MIN(PAGE_SIZE - offset_within_page, remaining_size);
- dma_addr_t dma_addr = dma_map_page(dev, pages[i],
- offset_within_page, map_size,
- DMA_BIDIRECTIONAL);
-
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+ dma_addr_t dma_addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+ dma_addr_t dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+ DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
err = dma_mapping_error(dev, dma_addr);
if (err)
goto unwind;
@@ -4930,8 +4941,7 @@
alloc->imported.user_buf.dma_addrs[i] = dma_addr;
pa[i] = as_tagged(page_to_phys(pages[i]));
- remaining_size -= map_size;
- offset_within_page = 0;
+ dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
}
#ifdef CONFIG_MALI_CINSTR_GWT
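
The comment in the map loop above boils down to one pattern: map the whole page without implicit CPU cache maintenance, then commit CPU writes explicitly with dma_sync_single_for_device(). A minimal sketch of that pattern follows; demo_map_user_page() is a hypothetical helper name, while the dma_* calls and the 5.4.0 version gate are the ones the patch uses.

    #include <linux/dma-mapping.h>
    #include <linux/version.h>

    /* Map one pinned user page for the GPU and hand ownership to the device.
     * Returns a DMA address; check it with dma_mapping_error(). */
    static dma_addr_t demo_map_user_page(struct device *dev, struct page *page)
    {
    #if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
            /* Pre-5.4 kernels: the patch falls back to a plain whole-page map. */
            dma_addr_t dma_addr =
                    dma_map_page(dev, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
    #else
            /* 5.4+: skip the implicit CPU cache maintenance when mapping. */
            dma_addr_t dma_addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
                                                     DMA_BIDIRECTIONAL,
                                                     DMA_ATTR_SKIP_CPU_SYNC);
    #endif
            if (dma_mapping_error(dev, dma_addr))
                    return dma_addr;

            /* Commit dirty CPU cache lines for the whole page so the GPU sees
             * the correct initial content of the imported region. */
            dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
            return dma_addr;
    }
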
@@ -4949,19 +4959,28 @@
/* fall down */
unwind:
alloc->nents = 0;
- offset_within_page = address & ~PAGE_MASK;
- remaining_size = alloc->imported.user_buf.size;
dma_mapped_pages = i;
- /* Run the unmap loop in the same order as map loop */
+ /* Run the unmap loop in the same order as the map loop, and perform CPU
+ * cache synchronization again to write the content of dirty CPU caches back
+ * to memory. This is a precautionary measure in case a GPU job has taken
+ * advantage of a partially GPU-mapped range to write and corrupt the
+ * content of memory, either inside or outside the imported region.
+ *
+ * Notice that this error recovery path doesn't try to be optimal and just
+ * flushes the entire page range.
+ */
for (i = 0; i < dma_mapped_pages; i++) {
- unsigned long unmap_size =
- MIN(PAGE_SIZE - offset_within_page, remaining_size);
+ dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
- dma_unmap_page(kctx->kbdev->dev,
- alloc->imported.user_buf.dma_addrs[i],
- unmap_size, DMA_BIDIRECTIONAL);
- remaining_size -= unmap_size;
- offset_within_page = 0;
+ dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+ dma_unmap_page(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i], PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+#else
+ dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
+#endif
}
/* The user buffer could already have been previously pinned before
@@ -5002,12 +5021,94 @@
#endif
for (i = 0; i < alloc->imported.user_buf.nr_pages; i++) {
+ unsigned long imported_size = MIN(remaining_size, PAGE_SIZE - offset_within_page);
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
unsigned long unmap_size =
- MIN(remaining_size, PAGE_SIZE - offset_within_page);
+ MIN(remaining_size, PAGE_SIZE - offset_within_page);
+#endif
+ /* Notice: this is a temporary variable that is used for DMA sync
+ * operations, and that could be incremented by an offset if the
+ * current page contains both imported and non-imported memory
+ * sub-regions.
+ *
+ * It is valid to add an offset to this value, because the offset
+ * always stays within the physically contiguous dma-mapped range
+ * and there is no need to translate it to a physical address first.
+ *
+ * This variable is not going to be used for the actual DMA unmap
+ * operation, which shall always use the original DMA address of the
+ * whole memory page.
+ */
dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
+ /* Manual CPU cache synchronization.
+ *
+ * When the GPU returns ownership of the buffer to the CPU, the driver
+ * needs to treat imported and non-imported memory differently.
+ *
+ * The first case to consider is the non-imported sub-regions at the
+ * beginning of the first page and at the end of the last page. For these
+ * sub-regions the CPU cache shall be committed with a clean+invalidate,
+ * in order to keep the last CPU write.
+ *
+ * The imported region requires the opposite treatment: this memory has
+ * been legitimately mapped and used by the GPU, hence GPU writes shall be
+ * committed to memory, while the CPU cache shall be invalidated to make
+ * sure that the CPU reads the correct memory content.
+ *
+ * The following diagram shows the expected value of the variables
+ * used in this loop in the corner case of an imported region enclosed
+ * by a single memory page:
+ *
+ * page boundary ->|-----------| <- dma_addr (initial value)
+ * | |
+ * | - - - - - | <- offset_within_page
+ * |XXXXXXXXXXX|\
+ * |XXXXXXXXXXX| \
+ * |XXXXXXXXXXX| }- imported_size
+ * |XXXXXXXXXXX| /
+ * |XXXXXXXXXXX|/
+ * | - - - - - | <- offset_within_page + imported_size
+ * | |\
+ * | | }- PAGE_SIZE - imported_size - offset_within_page
+ * | |/
+ * page boundary ->|-----------|
+ *
+ * If the imported region is enclosed by more than one page, then
+ * offset_within_page = 0 for any page after the first.
+ */
+
+ /* Only for first page: handle non-imported range at the beginning. */
+ if (offset_within_page > 0) {
+ dma_sync_single_for_device(kctx->kbdev->dev, dma_addr, offset_within_page,
+ DMA_BIDIRECTIONAL);
+ dma_addr += offset_within_page;
+ }
+
+ /* For every page: handle imported range. */
+ if (imported_size > 0)
+ dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr, imported_size,
+ DMA_BIDIRECTIONAL);
+
+ /* Only for last page (that may coincide with first page):
+ * handle non-imported range at the end.
+ */
+ if ((imported_size + offset_within_page) < PAGE_SIZE) {
+ dma_addr += imported_size;
+ dma_sync_single_for_device(kctx->kbdev->dev, dma_addr,
+ PAGE_SIZE - imported_size - offset_within_page,
+ DMA_BIDIRECTIONAL);
+ }
+
+ /* Notice: use the original DMA address to unmap the whole memory page. */
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
dma_unmap_page(kctx->kbdev->dev, dma_addr, unmap_size,
- DMA_BIDIRECTIONAL);
+ DMA_BIDIRECTIONAL);
+#else
+ dma_unmap_page_attrs(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i],
+ PAGE_SIZE, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
+
if (writeable)
set_page_dirty_lock(pages[i]);
#if !MALI_USE_CSF
@@ -5015,7 +5116,7 @@
pages[i] = NULL;
#endif
- remaining_size -= unmap_size;
+ remaining_size -= imported_size;
offset_within_page = 0;
}
#if !MALI_USE_CSF
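
The ASCII diagram in the unmap hunk above describes a page that mixes imported and non-imported bytes. The loop then handles each page in three optional steps: write back the non-imported head for the device, hand the imported span back to the CPU, and write back the non-imported tail. A standalone sketch of that per-page sequence follows; demo_sync_partial_page_for_cpu() is a hypothetical helper name, while the dma_sync_* calls and the ranges mirror the patch.

    #include <linux/dma-mapping.h>

    /* Per-page CPU cache maintenance when the GPU returns ownership of a
     * user-buffer import. offset_within_page is non-zero only for the first
     * page; imported_size is how many bytes of this page belong to the import. */
    static void demo_sync_partial_page_for_cpu(struct device *dev, dma_addr_t dma_addr,
                                               unsigned long offset_within_page,
                                               unsigned long imported_size)
    {
            /* Non-imported head of the first page: write back CPU caches so
             * the user process does not lose its own writes there. */
            if (offset_within_page > 0) {
                    dma_sync_single_for_device(dev, dma_addr, offset_within_page,
                                               DMA_BIDIRECTIONAL);
                    dma_addr += offset_within_page;
            }

            /* Imported span: invalidate CPU caches so the CPU reads what the
             * GPU wrote. */
            if (imported_size > 0)
                    dma_sync_single_for_cpu(dev, dma_addr, imported_size,
                                            DMA_BIDIRECTIONAL);

            /* Non-imported tail of the last page: same treatment as the head. */
            if (offset_within_page + imported_size < PAGE_SIZE)
                    dma_sync_single_for_device(dev, dma_addr + imported_size,
                                               PAGE_SIZE - imported_size - offset_within_page,
                                               DMA_BIDIRECTIONAL);
    }

The actual unmap then uses the page's original DMA address and PAGE_SIZE, with DMA_ATTR_SKIP_CPU_SYNC on 5.4+ kernels, exactly as the hunk does.
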
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.h b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.h
index 1aebde8..8157ab4 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.h
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
*
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem_linux.c b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem_linux.c
index a701ba8..031cdab 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem_linux.c
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem_linux.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -1531,10 +1531,16 @@
int zone = KBASE_REG_ZONE_CUSTOM_VA;
bool shared_zone = false;
u32 cache_line_alignment = kbase_get_cache_line_alignment(kctx->kbdev);
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
unsigned long offset_within_page;
unsigned long remaining_size;
+ unsigned long map_size;
+ unsigned long unmap_size;
+#endif
struct kbase_alloc_import_user_buf *user_buf;
struct page **pages = NULL;
+ struct tagged_addr *pa;
+ struct device *dev;
int write;
/* Flag supported only for dma-buf imported memory */
@@ -1676,29 +1682,49 @@
reg->gpu_alloc->nents = 0;
reg->extension = 0;
+ pa = kbase_get_gpu_phy_pages(reg);
+ dev = kctx->kbdev->dev;
+
if (pages) {
- struct device *dev = kctx->kbdev->dev;
- struct tagged_addr *pa = kbase_get_gpu_phy_pages(reg);
-
/* Top bit signifies that this was pinned on import */
- user_buf->current_mapping_usage_count |= PINNED_ON_IMPORT;
-
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
offset_within_page = user_buf->address & ~PAGE_MASK;
remaining_size = user_buf->size;
- for (i = 0; i < faulted_pages; i++) {
- unsigned long map_size =
- MIN(PAGE_SIZE - offset_within_page, remaining_size);
- dma_addr_t dma_addr = dma_map_page(dev, pages[i],
- offset_within_page, map_size, DMA_BIDIRECTIONAL);
+ map_size = MIN(PAGE_SIZE - offset_within_page, remaining_size);
+#endif
+ user_buf->current_mapping_usage_count |= PINNED_ON_IMPORT;
+ /* Manual CPU cache synchronization.
+ *
+ * The driver disables automatic CPU cache synchronization because the
+ * memory pages that enclose the imported region may also contain
+ * sub-regions that are not imported but are allocated and used
+ * by the user process. This may be the case for memory at the beginning
+ * of the first page and at the end of the last page. Automatic CPU cache
+ * synchronization would force some operations on those memory allocations,
+ * unbeknown to the user process: in particular, a CPU cache invalidate
+ * upon unmapping would destroy the content of dirty CPU caches and cause
+ * the user process to lose CPU writes to the non-imported sub-regions.
+ *
+ * When the GPU claims ownership of the imported memory buffer, the driver
+ * shall commit CPU writes for all of the pages that enclose the imported
+ * region; otherwise the initial content of memory would be wrong.
+ */
+ for (i = 0; i < faulted_pages; i++) {
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+ dma_addr_t dma_addr = dma_map_page(dev, pages[i],
+ offset_within_page, map_size, DMA_BIDIRECTIONAL);
+#else
+ dma_addr_t dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
+#endif
if (dma_mapping_error(dev, dma_addr))
goto unwind_dma_map;
user_buf->dma_addrs[i] = dma_addr;
pa[i] = as_tagged(page_to_phys(pages[i]));
- remaining_size -= map_size;
- offset_within_page = 0;
+ dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
}
reg->gpu_alloc->nents = faulted_pages;
@@ -1707,19 +1733,26 @@
return reg;
unwind_dma_map:
- offset_within_page = user_buf->address & ~PAGE_MASK;
- remaining_size = user_buf->size;
dma_mapped_pages = i;
- /* Run the unmap loop in the same order as map loop */
+ /* Run the unmap loop in the same order as the map loop, and perform CPU
+ * cache synchronization again to write the content of dirty CPU caches back
+ * to memory. This precautionary measure is retained to keep this code
+ * aligned with kbase_jd_user_buf_map() and to allow for a potential refactor
+ * in the future.
+ */
for (i = 0; i < dma_mapped_pages; i++) {
- unsigned long unmap_size =
- MIN(PAGE_SIZE - offset_within_page, remaining_size);
+ dma_addr_t dma_addr = user_buf->dma_addrs[i];
+ dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+ unmap_size = MIN(PAGE_SIZE - offset_within_page, remaining_size);
dma_unmap_page(kctx->kbdev->dev,
user_buf->dma_addrs[i],
unmap_size, DMA_BIDIRECTIONAL);
- remaining_size -= unmap_size;
- offset_within_page = 0;
+#else
+ dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
+#endif
}
fault_mismatch:
if (pages) {
@@ -1739,7 +1772,6 @@
no_region:
bad_size:
return NULL;
-
}
@@ -2022,7 +2054,10 @@
/* Remove COHERENT_SYSTEM flag if coherent mem is unavailable */
*flags &= ~BASE_MEM_COHERENT_SYSTEM;
}
-
+ if (((*flags & BASE_MEM_CACHED_CPU) == 0) && (type == BASE_MEM_IMPORT_TYPE_USER_BUFFER)) {
+ dev_warn(kctx->kbdev->dev, "USER_BUFFER must be CPU cached");
+ goto bad_flags;
+ }
if ((padding != 0) && (type != BASE_MEM_IMPORT_TYPE_UMM)) {
dev_warn(kctx->kbdev->dev,
"padding is only supported for UMM");
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mmu/mali_kbase_mmu.c b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mmu/mali_kbase_mmu.c
index 4fe577d..a247de0 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mmu/mali_kbase_mmu.c
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mmu/mali_kbase_mmu.c
@@ -1341,6 +1341,7 @@
kbase_gpu_vm_unlock(kctx);
} else {
int ret = -ENOMEM;
+ const u8 group_id = region->gpu_alloc->group_id;
kbase_gpu_vm_unlock(kctx);
@@ -1352,8 +1353,7 @@
if (grow_2mb_pool) {
/* Round page requirement up to nearest 2 MB */
struct kbase_mem_pool *const lp_mem_pool =
- &kctx->mem_pools.large[
- region->gpu_alloc->group_id];
+ &kctx->mem_pools.large[group_id];
pages_to_grow = (pages_to_grow +
((1 << lp_mem_pool->order) - 1))
@@ -1364,8 +1364,7 @@
} else {
#endif
struct kbase_mem_pool *const mem_pool =
- &kctx->mem_pools.small[
- region->gpu_alloc->group_id];
+ &kctx->mem_pools.small[group_id];
ret = kbase_mem_pool_grow(mem_pool,
pages_to_grow);
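
For reference, the 2 MB branch above rounds the page requirement up to whole large-pool entries by adding (1 << order) - 1 before shifting; the shift by lp_mem_pool->order that completes the expression falls just outside the quoted context. A small worked example, assuming order = 9 (512 small pages per 2 MB entry); the numbers are illustrative only.

    #include <stddef.h>

    /* Round a small-page requirement up to whole large-pool entries. */
    static size_t demo_round_up_to_pool_entries(size_t pages_to_grow, unsigned int order)
    {
            return (pages_to_grow + ((1u << order) - 1)) >> order;
    }

    /* Example: with order = 9, a request for 700 small pages becomes
     * (700 + 511) >> 9 == 2 large-pool entries. */
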