Project import generated by Copybara.

GitOrigin-RevId: 2de72294c8e408329dc6ac31b7d92e07b6d457fb
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/csf/mali_kbase_csf_kcpu.c b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/csf/mali_kbase_csf_kcpu.c
index b7b0040..ea1a947 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/csf/mali_kbase_csf_kcpu.c
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/csf/mali_kbase_csf_kcpu.c
@@ -80,7 +80,14 @@
 		 * on the physical pages tracking object. When the last
 		 * reference to the tracking object is dropped the pages
 		 * would be unpinned if they weren't unpinned before.
+		 *
+		 * Region should be CPU cached: abort if it isn't.
 		 */
+		if (WARN_ON(!(reg->flags & KBASE_REG_CPU_CACHED))) {
+			ret = -EINVAL;
+			goto out;
+		}
+
 		ret = kbase_jd_user_buf_pin_pages(kctx, reg);
 		if (ret)
 			goto out;
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.c b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.c
index a620b8d..9b26701 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.c
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
  *
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
  *
  * This program is free software and is provided to you under the terms of the
  * GNU General Public License version 2 as published by the Free Software
@@ -2050,6 +2050,7 @@
 			src = ((unsigned char *)kmap(gpu_page)) + offset;
 			dst = ((unsigned char *)kmap(cpu_page)) + offset;
 		}
+
 		memcpy(dst, src, size);
 		kunmap(gpu_page);
 		kunmap(cpu_page);
@@ -4890,10 +4891,7 @@
 	struct page **pages;
 	struct tagged_addr *pa;
 	long i, dma_mapped_pages;
-	unsigned long address;
 	struct device *dev;
-	unsigned long offset_within_page;
-	unsigned long remaining_size;
 	unsigned long gwt_mask = ~0;
 	/* Calls to this function are inherently asynchronous, with respect to
 	 * MMU operations.
@@ -4909,20 +4907,33 @@
 
 	alloc = reg->gpu_alloc;
 	pa = kbase_get_gpu_phy_pages(reg);
-	address = alloc->imported.user_buf.address;
 	pinned_pages = alloc->nents;
 	pages = alloc->imported.user_buf.pages;
 	dev = kctx->kbdev->dev;
-	offset_within_page = address & ~PAGE_MASK;
-	remaining_size = alloc->imported.user_buf.size;
 
+	/* Manual CPU cache synchronization.
+	 *
+	 * The driver disables automatic CPU cache synchronization because the
+	 * memory pages that enclose the imported region may also contain
+	 * sub-regions which are not imported and that are allocated and used
+	 * by the user process. This may be the case of memory at the beginning
+	 * of the first page and at the end of the last page. Automatic CPU cache
+	 * synchronization would force some operations on those memory allocations,
+	 * unbeknown to the user process: in particular, a CPU cache invalidate
+	 * upon unmapping would destroy the content of dirty CPU caches and cause
+	 * the user process to lose CPU writes to the non-imported sub-regions.
+	 *
+	 * When the GPU claims ownership of the imported memory buffer, it shall
+	 * commit CPU writes for the whole of every page that encloses the imported
+	 * region, otherwise the initial content of memory would be wrong.
+	 */
 	for (i = 0; i < pinned_pages; i++) {
-		unsigned long map_size =
-			MIN(PAGE_SIZE - offset_within_page, remaining_size);
-		dma_addr_t dma_addr = dma_map_page(dev, pages[i],
-				offset_within_page, map_size,
-				DMA_BIDIRECTIONAL);
-
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+		dma_addr_t dma_addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+		dma_addr_t dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+							 DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
 		err = dma_mapping_error(dev, dma_addr);
 		if (err)
 			goto unwind;
@@ -4930,8 +4941,7 @@
 		alloc->imported.user_buf.dma_addrs[i] = dma_addr;
 		pa[i] = as_tagged(page_to_phys(pages[i]));
 
-		remaining_size -= map_size;
-		offset_within_page = 0;
+		dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
 	}
 
 #ifdef CONFIG_MALI_CINSTR_GWT
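
   [Note on the hunk above] The mapping loop is the core of the change: automatic CPU cache
   maintenance is suppressed (on kernels that have dma_map_page_attrs(), via DMA_ATTR_SKIP_CPU_SYNC)
   and the driver instead writes the whole page back explicitly before the GPU takes ownership.
   A minimal standalone sketch of that pattern, with a hypothetical helper name and simplified
   bookkeeping compared to kbase_jd_user_buf_map(), could look like this:

   #include <linux/dma-mapping.h>
   #include <linux/mm.h>
   #include <linux/version.h>

   /* Sketch only: map pinned user pages for the device while keeping CPU cache
    * maintenance under manual control. The real driver also records the handles
    * in alloc->imported.user_buf.dma_addrs[] and tags the physical addresses for
    * the GPU page tables.
    */
   static int sketch_map_user_pages(struct device *dev, struct page **pages,
   				 long nr_pages, dma_addr_t *dma_addrs)
   {
   	long i, mapped;

   	for (i = 0; i < nr_pages; i++) {
   #if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
   		dma_addr_t dma_addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE,
   						   DMA_BIDIRECTIONAL);
   #else
   		dma_addr_t dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
   							 DMA_BIDIRECTIONAL,
   							 DMA_ATTR_SKIP_CPU_SYNC);
   #endif
   		if (dma_mapping_error(dev, dma_addr))
   			goto unwind;

   		dma_addrs[i] = dma_addr;
   		/* Commit dirty CPU cache lines for the whole page before the
   		 * GPU takes ownership of it.
   		 */
   		dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE,
   					   DMA_BIDIRECTIONAL);
   	}
   	return 0;

   unwind:
   	/* Unmap in the same order as the map loop, writing dirty CPU cache
   	 * lines back once more before tearing the mappings down.
   	 */
   	mapped = i;
   	for (i = 0; i < mapped; i++) {
   		dma_sync_single_for_device(dev, dma_addrs[i], PAGE_SIZE,
   					   DMA_BIDIRECTIONAL);
   #if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
   		dma_unmap_page(dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
   #else
   		dma_unmap_page_attrs(dev, dma_addrs[i], PAGE_SIZE,
   				     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
   #endif
   	}
   	return -ENOMEM;
   }
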
@@ -4949,19 +4959,28 @@
 	/* fall down */
 unwind:
 	alloc->nents = 0;
-	offset_within_page = address & ~PAGE_MASK;
-	remaining_size = alloc->imported.user_buf.size;
 	dma_mapped_pages = i;
-	/* Run the unmap loop in the same order as map loop */
+	/* Run the unmap loop in the same order as the map loop, and perform again
+	 * CPU cache synchronization to re-write the content of dirty CPU caches
+	 * to memory. This is a precautionary measure in case a GPU job has taken
+	 * advantage of a partially GPU-mapped range to write and corrupt the
+	 * content of memory, either inside or outside the imported region.
+	 *
+	 * Notice that this error recovery path doesn't try to be optimal and just
+	 * flushes the entire page range.
+	 */
 	for (i = 0; i < dma_mapped_pages; i++) {
-		unsigned long unmap_size =
-			MIN(PAGE_SIZE - offset_within_page, remaining_size);
+		dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
 
-		dma_unmap_page(kctx->kbdev->dev,
-				alloc->imported.user_buf.dma_addrs[i],
-				unmap_size, DMA_BIDIRECTIONAL);
-		remaining_size -= unmap_size;
-		offset_within_page = 0;
+		dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+		dma_unmap_page(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i], PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
+#else
+		dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+				     DMA_ATTR_SKIP_CPU_SYNC);
+#endif
 	}
 
 	/* The user buffer could already have been previously pinned before
@@ -5002,12 +5021,94 @@
 #endif
 
 	for (i = 0; i < alloc->imported.user_buf.nr_pages; i++) {
+		unsigned long imported_size = MIN(remaining_size, PAGE_SIZE - offset_within_page);
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
 		unsigned long unmap_size =
-			MIN(remaining_size, PAGE_SIZE - offset_within_page);
+						MIN(remaining_size, PAGE_SIZE - offset_within_page);
+#endif
+		/* Notice: this is a temporary variable that is used for DMA sync
+		 * operations, and that could be incremented by an offset if the
+		 * current page contains both imported and non-imported memory
+		 * sub-regions.
+		 *
+		 * It is valid to add an offset to this value, because the offset
+		 * is always kept within the physically contiguous dma-mapped range
+		 * and there's no need to translate to physical address to offset it.
+		 *
+		 * This variable is not going to be used for the actual DMA unmap
+		 * operation, that shall always use the original DMA address of the
+		 * whole memory page.
+		 */
 		dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
 
+		/* Manual CPU cache synchronization.
+		 *
+		 * When the GPU returns ownership of the buffer to the CPU, the driver
+		 * needs to treat imported and non-imported memory differently.
+		 *
+		 * The first case to consider is non-imported sub-regions at the
+		 * beginning of the first page and at the end of last page. For these
+		 * sub-regions: CPU cache shall be committed with a clean+invalidate,
+		 * in order to keep the last CPU write.
+		 *
+		 * Imported region prefers the opposite treatment: this memory has been
+		 * legitimately mapped and used by the GPU, hence GPU writes shall be
+		 * committed to memory, while CPU cache shall be invalidated to make
+		 * sure that CPU reads the correct memory content.
+		 *
+		 * The following diagram shows the expected value of the variables
+		 * used in this loop in the corner case of an imported region enclosed
+		 * by a single memory page:
+		 *
+		 * page boundary ->|---------- | <- dma_addr (initial value)
+		 *                 |           |
+		 *                 | - - - - - | <- offset_within_page
+		 *                 |XXXXXXXXXXX|\
+		 *                 |XXXXXXXXXXX| \
+		 *                 |XXXXXXXXXXX|  }- imported_size
+		 *                 |XXXXXXXXXXX| /
+		 *                 |XXXXXXXXXXX|/
+		 *                 | - - - - - | <- offset_within_page + imported_size
+		 *                 |           |\
+		 *                 |           | }- PAGE_SIZE - imported_size - offset_within_page
+		 *                 |           |/
+		 * page boundary ->|-----------|
+		 *
+		 * If the imported region is enclosed by more than one page, then
+		 * offset_within_page = 0 for any page after the first.
+		 */
+
+		/* Only for first page: handle non-imported range at the beginning. */
+		if (offset_within_page > 0) {
+			dma_sync_single_for_device(kctx->kbdev->dev, dma_addr, offset_within_page,
+						   DMA_BIDIRECTIONAL);
+			dma_addr += offset_within_page;
+		}
+
+		/* For every page: handle imported range. */
+		if (imported_size > 0)
+			dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr, imported_size,
+						DMA_BIDIRECTIONAL);
+
+		/* Only for last page (that may coincide with first page):
+		 * handle non-imported range at the end.
+		 */
+		if ((imported_size + offset_within_page) < PAGE_SIZE) {
+			dma_addr += imported_size;
+			dma_sync_single_for_device(kctx->kbdev->dev, dma_addr,
+						   PAGE_SIZE - imported_size - offset_within_page,
+						   DMA_BIDIRECTIONAL);
+		}
+
+		/* Notice: use the original DMA address to unmap the whole memory page. */
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
 		dma_unmap_page(kctx->kbdev->dev, dma_addr, unmap_size,
-				DMA_BIDIRECTIONAL);
+						DMA_BIDIRECTIONAL);
+#else
+		dma_unmap_page_attrs(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i],
+				     PAGE_SIZE, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
+
 		if (writeable)
 			set_page_dirty_lock(pages[i]);
 #if !MALI_USE_CSF
@@ -5015,7 +5116,7 @@
 		pages[i] = NULL;
 #endif
 
-		remaining_size -= unmap_size;
+		remaining_size -= imported_size;
 		offset_within_page = 0;
 	}
 #if !MALI_USE_CSF
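
   [Note on the hunk above] The unmap path splits each page into up to three DMA sync operations,
   matching the diagram in the comment: non-imported bytes at the head and tail of the page are
   written back to memory so that CPU writes survive, while the imported bytes are invalidated so
   that the CPU sees what the GPU wrote. A condensed sketch of that per-page decision (hypothetical
   helper; the real loop also advances remaining_size and resets offset_within_page after the first
   page) could be:

   #include <linux/dma-mapping.h>
   #include <linux/mm.h>

   /* Sketch only: CPU cache maintenance for one page of an imported user
    * buffer, performed before the page is unmapped with DMA_ATTR_SKIP_CPU_SYNC.
    * dma_addr is the page's original DMA handle and may be advanced locally
    * because the mapping is physically contiguous within the page.
    */
   static void sketch_sync_partial_page(struct device *dev, dma_addr_t dma_addr,
   				     unsigned long offset_within_page,
   				     unsigned long imported_size)
   {
   	/* Non-imported head (first page only): keep the CPU's own writes. */
   	if (offset_within_page > 0) {
   		dma_sync_single_for_device(dev, dma_addr, offset_within_page,
   					   DMA_BIDIRECTIONAL);
   		dma_addr += offset_within_page;
   	}

   	/* Imported range: make the GPU's writes visible to the CPU. */
   	if (imported_size > 0)
   		dma_sync_single_for_cpu(dev, dma_addr, imported_size,
   					DMA_BIDIRECTIONAL);

   	/* Non-imported tail (last page only): keep the CPU's own writes. */
   	if (offset_within_page + imported_size < PAGE_SIZE) {
   		dma_addr += imported_size;
   		dma_sync_single_for_device(dev, dma_addr,
   					   PAGE_SIZE - imported_size -
   						   offset_within_page,
   					   DMA_BIDIRECTIONAL);
   	}
   }

   The actual dma_unmap_page_attrs() call still uses the page's original DMA handle and PAGE_SIZE,
   so the whole page mapping is torn down regardless of how the sync ranges were split.
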
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.h b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.h
index 1aebde8..8157ab4 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.h
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
  *
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
  *
  * This program is free software and is provided to you under the terms of the
  * GNU General Public License version 2 as published by the Free Software
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem_linux.c b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem_linux.c
index a701ba8..031cdab 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem_linux.c
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mali_kbase_mem_linux.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
  *
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
  *
  * This program is free software and is provided to you under the terms of the
  * GNU General Public License version 2 as published by the Free Software
@@ -1531,10 +1531,16 @@
 	int zone = KBASE_REG_ZONE_CUSTOM_VA;
 	bool shared_zone = false;
 	u32 cache_line_alignment = kbase_get_cache_line_alignment(kctx->kbdev);
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
 	unsigned long offset_within_page;
 	unsigned long remaining_size;
+	unsigned long map_size;
+	unsigned long unmap_size;
+#endif
 	struct kbase_alloc_import_user_buf *user_buf;
 	struct page **pages = NULL;
+	struct tagged_addr *pa;
+	struct device *dev;
 	int write;
 
 	/* Flag supported only for dma-buf imported memory */
@@ -1676,29 +1682,49 @@
 	reg->gpu_alloc->nents = 0;
 	reg->extension = 0;
 
+	pa = kbase_get_gpu_phy_pages(reg);
+	dev = kctx->kbdev->dev;
+
 	if (pages) {
-		struct device *dev = kctx->kbdev->dev;
-		struct tagged_addr *pa = kbase_get_gpu_phy_pages(reg);
-
 		/* Top bit signifies that this was pinned on import */
-		user_buf->current_mapping_usage_count |= PINNED_ON_IMPORT;
-
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
 		offset_within_page = user_buf->address & ~PAGE_MASK;
 		remaining_size = user_buf->size;
-		for (i = 0; i < faulted_pages; i++) {
-			unsigned long map_size =
-				MIN(PAGE_SIZE - offset_within_page, remaining_size);
-			dma_addr_t dma_addr = dma_map_page(dev, pages[i],
-				offset_within_page, map_size, DMA_BIDIRECTIONAL);
+		map_size = MIN(PAGE_SIZE - offset_within_page, remaining_size);
+#endif
+		user_buf->current_mapping_usage_count |= PINNED_ON_IMPORT;
 
+		/* Manual CPU cache synchronization.
+		 *
+		 * The driver disables automatic CPU cache synchronization because the
+		 * memory pages that enclose the imported region may also contain
+		 * sub-regions which are not imported and that are allocated and used
+		 * by the user process. This may be the case of memory at the beginning
+		 * of the first page and at the end of the last page. Automatic CPU cache
+		 * synchronization would force some operations on those memory allocations,
+		 * unbeknown to the user process: in particular, a CPU cache invalidate
+		 * upon unmapping would destroy the content of dirty CPU caches and cause
+		 * the user process to lose CPU writes to the non-imported sub-regions.
+		 *
+		 * When the GPU claims ownership of the imported memory buffer, it shall
+		 * commit CPU writes for the whole of every page that encloses the imported
+		 * region, otherwise the initial content of memory would be wrong.
+		 */
+		for (i = 0; i < faulted_pages; i++) {
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+			dma_addr_t dma_addr = dma_map_page(dev, pages[i], offset_within_page,
+							   map_size, DMA_BIDIRECTIONAL);
+#else
+			dma_addr_t dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+								 DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
 			if (dma_mapping_error(dev, dma_addr))
 				goto unwind_dma_map;
 
 			user_buf->dma_addrs[i] = dma_addr;
 			pa[i] = as_tagged(page_to_phys(pages[i]));
 
-			remaining_size -= map_size;
-			offset_within_page = 0;
+			dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
 		}
 
 		reg->gpu_alloc->nents = faulted_pages;
@@ -1707,19 +1733,26 @@
 	return reg;
 
 unwind_dma_map:
-	offset_within_page = user_buf->address & ~PAGE_MASK;
-	remaining_size = user_buf->size;
 	dma_mapped_pages = i;
-	/* Run the unmap loop in the same order as map loop */
+	/* Run the unmap loop in the same order as the map loop, and perform again
+	 * CPU cache synchronization to re-write the content of dirty CPU caches
+	 * to memory. This precautionary measure is retained here to keep this code
+	 * aligned with kbase_jd_user_buf_map() and to allow for a potential refactor
+	 * in the future.
+	 */
 	for (i = 0; i < dma_mapped_pages; i++) {
-		unsigned long unmap_size =
-			MIN(PAGE_SIZE - offset_within_page, remaining_size);
+		dma_addr_t dma_addr = user_buf->dma_addrs[i];
 
+		dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+		unmap_size = MIN(PAGE_SIZE - offset_within_page, remaining_size);
 		dma_unmap_page(kctx->kbdev->dev,
 				user_buf->dma_addrs[i],
 				unmap_size, DMA_BIDIRECTIONAL);
-		remaining_size -= unmap_size;
-		offset_within_page = 0;
+#else
+		dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+				     DMA_ATTR_SKIP_CPU_SYNC);
+#endif
 	}
 fault_mismatch:
 	if (pages) {
@@ -1739,7 +1772,6 @@
 no_region:
 bad_size:
 	return NULL;
-
 }
 
 
@@ -2022,7 +2054,10 @@
 		/* Remove COHERENT_SYSTEM flag if coherent mem is unavailable */
 		*flags &= ~BASE_MEM_COHERENT_SYSTEM;
 	}
-
+	if (((*flags & BASE_MEM_CACHED_CPU) == 0) && (type == BASE_MEM_IMPORT_TYPE_USER_BUFFER)) {
+		dev_warn(kctx->kbdev->dev, "USER_BUFFER must be CPU cached");
+		goto bad_flags;
+	}
 	if ((padding != 0) && (type != BASE_MEM_IMPORT_TYPE_UMM)) {
 		dev_warn(kctx->kbdev->dev,
 				"padding is only supported for UMM");
diff --git a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mmu/mali_kbase_mmu.c b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mmu/mali_kbase_mmu.c
index 4fe577d..a247de0 100644
--- a/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mmu/mali_kbase_mmu.c
+++ b/bifrost/r38p2/kernel/drivers/gpu/arm/midgard/mmu/mali_kbase_mmu.c
@@ -1341,6 +1341,7 @@
 		kbase_gpu_vm_unlock(kctx);
 	} else {
 		int ret = -ENOMEM;
+		const u8 group_id = region->gpu_alloc->group_id;
 
 		kbase_gpu_vm_unlock(kctx);
 
@@ -1352,8 +1353,7 @@
 			if (grow_2mb_pool) {
 				/* Round page requirement up to nearest 2 MB */
 				struct kbase_mem_pool *const lp_mem_pool =
-					&kctx->mem_pools.large[
-					region->gpu_alloc->group_id];
+					&kctx->mem_pools.large[group_id];
 
 				pages_to_grow = (pages_to_grow +
 					((1 << lp_mem_pool->order) - 1))
@@ -1364,8 +1364,7 @@
 			} else {
 #endif
 				struct kbase_mem_pool *const mem_pool =
-					&kctx->mem_pools.small[
-					region->gpu_alloc->group_id];
+					&kctx->mem_pools.small[group_id];
 
 				ret = kbase_mem_pool_grow(mem_pool,
 					pages_to_grow);