Thread (69 messages) 69 messages, 9 authors, 2011-08-08

Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core

From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: 2011-07-18 04:02:19
Also in: lkml
Subsystem: libata subsystem (serial and parallel ata drivers), memory management, memory management - core, the rest · Maintainers: Damien Le Moal, Niklas Cassel, Andrew Morton, David Hildenbrand, Linus Torvalds

On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:
In fact, with such a flag, we could probably avoid the ifdef entirely, and
always go toward the PTE fixup path when called in such a fixup case, my gut
feeling is that this is going to be seldom enough not to hurt x86 measurably
but we'll have to try it out.

That leads to that even less tested patch:
 And here's a version that builds (still not tested :-)

Shan, can you verify whether that fixes the problem for you ?

I also had a cursory glance at the ARM code and it seems to rely on the
same stuff as embedded powerpc does for dirty/young updates, so in
theory it should exhibit the same problem.

I suspect the scenario is rare enough in practice in embedded workloads
that nobody noticed until now.

Cheers,
Ben.
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 06c564e..e8ef0e6 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -1296,7 +1296,7 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 				DRV_NAME);
 	if (rc)
 		return rc;
-	iomap = pcim_iomap_table(pdev);
+	iomap = pcim_iomap_table~(pdev);
 
 	/* apply workaround for completion IRQ loss on PCI-X errata */
 	if (pi.flags & SIL24_FLAG_PCIX_IRQ_WOC) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..02adff7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..94b1d3f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
+				      unsigned long address,
+				      pte_t *ptep, int write)
+{
+	pte_t entry = *ptep;
+
+	if (write)
+		pte_mkdirty(entry);
+	entry = pte_mkyoung(entry);
+	if (ptep_set_access_flags(vma, address, ptep, entry, write)) {
+		update_mmu_cache(vma, address, ptep);
+	} else {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write)
+			flush_tlb_fix_spurious_fault(vma, address);
+	}
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -1514,16 +1537,22 @@ split_fallthrough:
 
 	if (flags & FOLL_GET)
 		get_page(page);
-	if (flags & FOLL_TOUCH) {
-		if ((flags & FOLL_WRITE) &&
-		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
-		/*
-		 * pte_mkyoung() would be more correct here, but atomic care
-		 * is needed to avoid losing the dirty bit: it is easier to use
-		 * mark_page_accessed().
-		 */
-		mark_page_accessed(page);
+
+	if (!pte_young(pte) || ((flags & FOLL_WRITE) && !pte_dirty(pte))) {
+		if (flags & FOLL_FIXFAULT)
+			handle_pte_sw_young_dirty(vma, address, ptep,
+						  flags & FOLL_WRITE);
+		else if (flags & FOLL_TOUCH) {
+			if ((flags & FOLL_WRITE) &&
+			    !pte_dirty(pte) && !PageDirty(page))
+				set_page_dirty(page);
+			/*
+			 * pte_mkyoung() would be more correct here, but atomic care
+			 * is needed to avoid losing the dirty bit: it is easier to use
+			 * mark_page_accessed().
+			 */
+			mark_page_accessed(page);
+		}
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
@@ -3358,21 +3387,8 @@ int handle_pte_fault(struct mm_struct *mm,
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
 					pte, pmd, ptl, entry);
-		entry = pte_mkdirty(entry);
-	}
-	entry = pte_mkyoung(entry);
-	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
-		update_mmu_cache(vma, address, pte);
-	} else {
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vma, address);
 	}
+	handle_pte_sw_young_dirty(vma, address, pte, flags & FAULT_FLAG_WRITE);
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 0;
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help