arm64/mm: Change THP helpers to comply with generic MM semantics
author Anshuman Khandual <anshuman.khandual@arm.com>
Wed, 9 Sep 2020 04:53:02 +0000 (10:23 +0530)
committer Will Deacon <will@kernel.org>
Fri, 11 Sep 2020 13:57:30 +0000 (14:57 +0100)
pmd_present() and pmd_trans_huge() are expected to behave in the following
manner during various phases of a given PMD. It is derived from a previous
detailed discussion on this topic [1] and present THP documentation [2].

pmd_present(pmd):

- Returns true if pmd refers to system RAM with a valid pmd_page(pmd)
- Returns false if pmd refers to a migration or swap entry

pmd_trans_huge(pmd):

- Returns true if pmd refers to system RAM and is a trans huge mapping

-------------------------------------------------------------------------
| PMD states | pmd_present | pmd_trans_huge |
-------------------------------------------------------------------------
| Mapped | Yes | Yes |
-------------------------------------------------------------------------
| Splitting | Yes | Yes |
-------------------------------------------------------------------------
| Migration/Swap | No | No |
-------------------------------------------------------------------------

The problem:

A PMD is first invalidated with pmdp_invalidate() before it is split. This
invalidation clears PMD_SECT_VALID as shown below.

PMD Split -> pmdp_invalidate() -> pmd_mkinvalid -> Clears PMD_SECT_VALID

Once PMD_SECT_VALID gets cleared, pmd_present() returns false on the PMD
entry. Another bit apart from PMD_SECT_VALID is needed to re-affirm
pmd_present() as true during the THP split process. To comply with the
above-mentioned semantics, pmd_trans_huge() should also check pmd_present()
first before testing for the presence of an actual transparent huge mapping.

The solution:

Ideally PMD_TYPE_SECT should have been used here instead. But it shares the
bit position with PMD_SECT_VALID which is used for THP invalidation. Hence
it will not be there for pmd_present() check after pmdp_invalidate().

A new software-defined bit, PMD_PRESENT_INVALID (bit 59), can be set on the
PMD entry during invalidation. It lets pmd_present() return true and
indicates that the entry still points to memory.

This bit is transient. During the split process it will be overridden by a
page table page representing normal pages in place of erstwhile huge page.
Other pmdp_invalidate() callers always write a fresh PMD value on the entry
overriding this transient PMD_PRESENT_INVALID bit, which makes it safe.

[1]: https://lkml.org/lkml/2018/10/17/231
[2]: https://www.kernel.org/doc/Documentation/vm/transhuge.txt

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Suzuki Poulose <suzuki.poulose@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/1599627183-14453-2-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
arch/arm64/include/asm/pgtable-prot.h
arch/arm64/include/asm/pgtable.h

index 4d867c6446c4844c035a1a38d5660e5b70f7085c..2df4b75fce3c4575f3609f04da1d390c478f6ddd 100644 (file)
 #define PTE_DEVMAP             (_AT(pteval_t, 1) << 57)
 #define PTE_PROT_NONE          (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */
 
+/*
+ * This bit indicates that the entry is present i.e. pmd_page()
+ * still points to a valid huge page in memory even if the pmd
+ * has been invalidated.
+ */
+#define PMD_PRESENT_INVALID    (_AT(pteval_t, 1) << 59) /* only when !PMD_SECT_VALID */
+
 #ifndef __ASSEMBLY__
 
 #include <asm/cpufeature.h>
index d5d3fbe739534f7523a8f5e28bb9afb7364520fd..d8258ae8fce09cdc991b5ccdc881cb71098c8499 100644 (file)
@@ -145,6 +145,18 @@ static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
        return pte;
 }
 
+static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot) /* Return pmd with every bit that is set in prot cleared. */
+{
+       pmd_val(pmd) &= ~pgprot_val(prot);
+       return pmd; /* pmd is a by-value parameter; the modified copy is returned */
+}
+
+static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot) /* Return pmd with every bit that is set in prot also set. */
+{
+       pmd_val(pmd) |= pgprot_val(prot);
+       return pmd; /* pmd is a by-value parameter; the modified copy is returned */
+}
+
 static inline pte_t pte_wrprotect(pte_t pte)
 {
        pte = clear_pte_bit(pte, __pgprot(PTE_WRITE));
@@ -363,15 +375,24 @@ static inline int pmd_protnone(pmd_t pmd)
 }
 #endif
 
+#define pmd_present_invalid(pmd)     (!!(pmd_val(pmd) & PMD_PRESENT_INVALID)) /* 1 if PMD_PRESENT_INVALID is set in pmd, else 0 */
+
+static inline int pmd_present(pmd_t pmd) /* Nonzero if pmd is present when viewed as a pte, or has PMD_PRESENT_INVALID set. */
+{
+       return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd); /* second test keeps a pmd processed by pmd_mkinvalid() reported as present */
+}
+
 /*
  * THP definitions.
  */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define pmd_trans_huge(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
+static inline int pmd_trans_huge(pmd_t pmd) /* Nonzero for a non-zero, present pmd whose PMD_TABLE_BIT is clear. */
+{
+       return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); /* pmd_present() is checked before the table-bit test */
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-#define pmd_present(pmd)       pte_present(pmd_pte(pmd))
 #define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)         pte_young(pmd_pte(pmd))
 #define pmd_valid(pmd)         pte_valid(pmd_pte(pmd))
@@ -381,7 +402,14 @@ static inline int pmd_protnone(pmd_t pmd)
 #define pmd_mkclean(pmd)       pte_pmd(pte_mkclean(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)       pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mkinvalid(pmd)     (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
+
+static inline pmd_t pmd_mkinvalid(pmd_t pmd) /* Return pmd with PMD_SECT_VALID cleared and PMD_PRESENT_INVALID set. */
+{
+       pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID)); /* lets pmd_present() stay true once the valid bit is dropped below */
+       pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID));
+
+       return pmd;
+}
 
 #define pmd_thp_or_huge(pmd)   (pmd_huge(pmd) || pmd_trans_huge(pmd))