s390/mm: add page table dumper
authorHeiko Carstens <heiko.carstens@de.ibm.com>
Thu, 4 Oct 2012 12:46:12 +0000 (14:46 +0200)
committerMartin Schwidefsky <schwidefsky@de.ibm.com>
Tue, 9 Oct 2012 12:16:58 +0000 (14:16 +0200)
This is more or less the same as the x86 page table dumper which was
merged four years ago: 926e5392 "x86: add code to dump the (kernel)
page tables for visual inspection by kernel developers".

We add a file at /sys/kernel/debug/kernel_page_tables for debugging
purposes so it's quite easy to see the kernel page table layout and
possible odd mappings:

---[ Identity Mapping ]---
0x0000000000000000-0x0000000000100000        1M PTE RW
---[ Kernel Image Start ]---
0x0000000000100000-0x0000000000800000        7M PMD RO
0x0000000000800000-0x00000000008a9000      676K PTE RO
0x00000000008a9000-0x0000000000900000      348K PTE RW
0x0000000000900000-0x0000000001500000       12M PMD RW
---[ Kernel Image End ]---
0x0000000001500000-0x0000000280000000    10219M PMD RW
0x0000000280000000-0x000003d280000000     3904G PUD I
---[ vmemmap Area ]---
0x000003d280000000-0x000003d288c00000      140M PTE RW
0x000003d288c00000-0x000003d300000000     1908M PMD I
0x000003d300000000-0x000003e000000000       52G PUD I
---[ vmalloc Area ]---
0x000003e000000000-0x000003e000009000       36K PTE RW
0x000003e000009000-0x000003e0000ee000      916K PTE I
0x000003e0000ee000-0x000003e000146000      352K PTE RW
0x000003e000146000-0x000003e000200000      744K PTE I
0x000003e000200000-0x000003e080000000     2046M PMD I
0x000003e080000000-0x0000040000000000      126G PUD I

This usually only makes sense for kernel developers. The output
with CONFIG_DEBUG_PAGEALLOC is not very helpful, because of the
huge number of mapped out pages, however I decided for the time
being to not add a !DEBUG_PAGEALLOC dependency.
Maybe it's helpful for somebody even with that option.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
arch/s390/Kconfig.debug
arch/s390/mm/Makefile
arch/s390/mm/dump_pagetables.c [new file with mode: 0644]

index d76cef3fef37bac6f3df5ad6d336a30cdfc8233f..fc32a2df497464cf7707f6843ba55a5d60f56341 100644 (file)
@@ -31,6 +31,18 @@ config DEBUG_STRICT_USER_COPY_CHECKS
 
          If unsure, or if you run an older (pre 4.4) gcc, say N.
 
+config S390_PTDUMP
+       bool "Export kernel pagetable layout to userspace via debugfs"
+       depends on DEBUG_KERNEL
+       select DEBUG_FS
+       ---help---
+         Say Y here if you want to show the kernel pagetable layout in a
+         debugfs file. This information is only useful for kernel developers
+         who are working in architecture specific areas of the kernel.
+         It is probably not a good idea to enable this feature in a production
+         kernel.
+         If in doubt, say "N"
+
 config DEBUG_SET_MODULE_RONX
        def_bool y
        depends on MODULES
index 0f5536b0c1a1c5368ff9976ff70beb873e92675b..1bea6d1f55ab334b86db2042d84c84e5f01231df 100644 (file)
@@ -7,3 +7,4 @@ obj-y    := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \
 obj-$(CONFIG_CMM) += cmm.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_DEBUG_SET_MODULE_RONX) += pageattr.o
+obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
new file mode 100644 (file)
index 0000000..cd1c62d
--- /dev/null
@@ -0,0 +1,219 @@
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+
+static unsigned long max_addr;  /* end (exclusive) of the walked address range; set by pt_dump_init() */
+
+struct addr_marker {   /* a named boundary within the dumped address range */
+       unsigned long start_address;    /* address at which the named area begins */
+       const char *name;               /* text printed between "---[ " and " ]---" */
+};
+
+enum address_markers_idx {     /* indices into address_markers[] */
+       IDENTITY_NR = 0,
+       KERNEL_START_NR,
+       KERNEL_END_NR,
+       VMEMMAP_NR,     /* start address is assigned in pt_dump_init() */
+       VMALLOC_NR,     /* start address is assigned in pt_dump_init() */
+};
+
+static struct addr_marker address_markers[] = {        /* stepped through in order by note_page() */
+       [IDENTITY_NR]     = {0, "Identity Mapping"},
+       [KERNEL_START_NR] = {(unsigned long)&_stext, "Kernel Image Start"},
+       [KERNEL_END_NR]   = {(unsigned long)&_end, "Kernel Image End"},
+       [VMEMMAP_NR]      = {0, "vmemmap Area"},       /* placeholder 0, real start set at init */
+       [VMALLOC_NR]      = {0, "vmalloc Area"},       /* placeholder 0, real start set at init */
+       { -1, NULL }    /* terminator: -1 converts to the largest unsigned long value */
+};
+
+struct pg_state {      /* running state of one dump, shared by the walkers and note_page() */
+       int level;                      /* level of the series being collected; 0 = nothing noted yet */
+       unsigned int current_prot;      /* protection bits of the series being collected */
+       unsigned long start_address;    /* address at which the current series began */
+       unsigned long current_address;  /* address of the entry being visited; set by the walkers */
+       const struct addr_marker *marker;       /* marker that was printed most recently */
+};
+
+/*
+ * Finish an output line: print the name of the page table level followed by
+ * "I" for an invalid entry, or "RO" / "RW" depending on the read-only bit.
+ * level indexes the name table below (0 = "ASCE" ... 4 = "PTE").
+ */
+static void print_prot(struct seq_file *m, unsigned int pr, int level)
+{
+       static const char * const table_name[] =
+               { "ASCE", "PGD", "PUD", "PMD", "PTE" };
+       const char *access = "RW";
+
+       seq_printf(m, "%s ", table_name[level]);
+
+       /* Invalid entries carry no meaningful access information. */
+       if (pr & _PAGE_INVALID) {
+               seq_printf(m, "I\n");
+               return;
+       }
+
+       if (pr & _PAGE_RO)
+               access = "RO";
+       seq_printf(m, "%s\n", access);
+}
+
+static void note_page(struct seq_file *m, struct pg_state *st,
+                    unsigned int new_prot, int level)
+{
+       static const char units[] = "KMGTPE";   /* size suffixes, each 1024 times the previous one */
+       int width = sizeof(unsigned long) * 2;  /* number of hex digits in a full unsigned long */
+       const char *unit = units;
+       unsigned int prot, cur;
+       unsigned long delta;
+
+       /*
+        * Record one page table entry for the dump. The caller stores the
+        * address of the entry in st->current_address before calling;
+        * new_prot and level describe the entry itself. Consecutive entries
+        * with identical protection and level are collected into a single
+        * output line, which is written to m once the series ends.
+        *
+        * If we have a "break" in the series, we need to flush the state
+        * that we have now. "break" is either changing perms, levels or
+        * address space marker.
+        */
+       prot = new_prot;
+       cur = st->current_prot;
+
+       if (!st->level) {
+               /* First entry: level 0 means nothing was noted so far; start_address is left untouched */
+               st->current_prot = new_prot;
+               st->level = level;
+               st->marker = address_markers;
+               seq_printf(m, "---[ %s ]---\n", st->marker->name);
+       } else if (prot != cur || level != st->level ||
+                  st->current_address >= st->marker[1].start_address) {
+               /* Print the actual finished series */
+               seq_printf(m, "0x%0*lx-0x%0*lx",
+                          width, st->start_address,
+                          width, st->current_address);
+               delta = (st->current_address - st->start_address) >> 10;        /* size of the series in KB */
+               while (!(delta & 0x3ff) && unit[1]) {   /* scale up while divisible by 1024 and a larger unit exists */
+                       delta >>= 10;
+                       unit++;
+               }
+               seq_printf(m, "%9lu%c ", delta, *unit);
+               print_prot(m, st->current_prot, st->level);
+               if (st->current_address >= st->marker[1].start_address) {       /* reached the next named area */
+                       st->marker++;   /* advances by one marker per call only */
+                       seq_printf(m, "---[ %s ]---\n", st->marker->name);
+               }
+               st->start_address = st->current_address;        /* the next series begins here */
+               st->current_prot = new_prot;
+               st->level = level;
+       }
+}
+
+/*
+ * The actual page table walker functions. In order to keep the implementation
+ * of print_prot() short, we only check and pass _PAGE_INVALID and _PAGE_RO
+ * flags to note_page() if a region, segment or page table entry is invalid or
+ * read-only.
+ * After all it's just a hint that the current level being walked contains an
+ * invalid or read-only entry.
+ */
+/*
+ * Visit the page table entries below pmd, starting at addr, and report the
+ * read-only and invalid bits of each one to note_page() as level 4. The walk
+ * ends after PTRS_PER_PTE entries or as soon as addr reaches max_addr.
+ */
+static void walk_pte_level(struct seq_file *m, struct pg_state *st,
+                          pmd_t *pmd, unsigned long addr)
+{
+       int idx = 0;
+
+       while (idx < PTRS_PER_PTE && addr < max_addr) {
+               unsigned int flags;
+               pte_t *entry;
+
+               /* note_page() reads the address from the shared state. */
+               st->current_address = addr;
+               entry = pte_offset_kernel(pmd, addr);
+               flags = pte_val(*entry) & (_PAGE_RO | _PAGE_INVALID);
+               note_page(m, st, flags, 4);
+
+               addr += PAGE_SIZE;
+               idx++;
+       }
+}
+
+/*
+ * Visit the segment table entries below pud, starting at addr. An empty
+ * entry is reported as invalid at level 3, a large entry is reported at
+ * level 3 with its read-only bit, and any other entry is descended into via
+ * walk_pte_level(). The walk ends after PTRS_PER_PMD entries or as soon as
+ * addr reaches max_addr.
+ */
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
+                          pud_t *pud, unsigned long addr)
+{
+       unsigned int flags;
+       pmd_t *entry;
+       int n;
+
+       for (n = 0; n < PTRS_PER_PMD && addr < max_addr;
+            n++, addr += PMD_SIZE) {
+               st->current_address = addr;
+               entry = pmd_offset(pud, addr);
+
+               if (pmd_none(*entry)) {
+                       note_page(m, st, _PAGE_INVALID, 3);
+                       continue;
+               }
+               if (!pmd_large(*entry)) {
+                       walk_pte_level(m, st, entry, addr);
+                       continue;
+               }
+               /*
+                * NOTE(review): print_prot() tests _PAGE_RO, so this relies
+                * on _SEGMENT_ENTRY_RO having the same bit value - confirm.
+                */
+               flags = pmd_val(*entry) & _SEGMENT_ENTRY_RO;
+               note_page(m, st, flags, 3);
+       }
+}
+
+/*
+ * Visit the entries below pgd, starting at addr. An empty entry is reported
+ * as invalid at level 2; any other entry is descended into via
+ * walk_pmd_level(). The walk ends after PTRS_PER_PUD entries or as soon as
+ * addr reaches max_addr.
+ */
+static void walk_pud_level(struct seq_file *m, struct pg_state *st,
+                          pgd_t *pgd, unsigned long addr)
+{
+       pud_t *entry;
+       int n;
+
+       for (n = 0; n < PTRS_PER_PUD && addr < max_addr;
+            n++, addr += PUD_SIZE) {
+               st->current_address = addr;
+               entry = pud_offset(pgd, addr);
+
+               if (pud_none(*entry))
+                       note_page(m, st, _PAGE_INVALID, 2);
+               else
+                       walk_pmd_level(m, st, entry, addr);
+       }
+}
+
+/*
+ * Entry point of the walk: start with a zeroed state at address 0 and visit
+ * the top-level kernel page table entries until PTRS_PER_PGD entries were
+ * seen or max_addr is reached. An empty entry is reported as invalid at
+ * level 1; any other entry is descended into via walk_pud_level().
+ */
+static void walk_pgd_level(struct seq_file *m)
+{
+       struct pg_state state;
+       unsigned long addr;
+       pgd_t *entry;
+       int n;
+
+       memset(&state, 0, sizeof(state));
+       for (n = 0, addr = 0; n < PTRS_PER_PGD && addr < max_addr;
+            n++, addr += PGDIR_SIZE) {
+               state.current_address = addr;
+               entry = pgd_offset_k(addr);
+
+               if (pgd_none(*entry))
+                       note_page(m, &state, _PAGE_INVALID, 1);
+               else
+                       walk_pud_level(m, &state, entry, addr);
+       }
+
+       /*
+        * A final call with level 0 differs from every collected level and
+        * therefore makes note_page() print the series still pending.
+        */
+       state.current_address = max_addr;
+       note_page(m, &state, 0, 0);
+}
+
+static int ptdump_show(struct seq_file *m, void *v)     /* show callback handed to single_open(); v is not used */
+{
+       walk_pgd_level(m);      /* write the complete page table dump into m */
+       return 0;               /* always reports success */
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)  /* .open handler of ptdump_fops; inode is not used */
+{
+       return single_open(filp, ptdump_show, NULL);    /* bind ptdump_show to the file; NULL is passed as data */
+}
+
+static const struct file_operations ptdump_fops = {     /* operations of the file created in pt_dump_init() */
+       .open           = ptdump_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,       /* counterpart of single_open() used in ptdump_open() */
+};
+
+/*
+ * One-time setup of the page table dumper: compute the upper bound of the
+ * walk, fill in the marker addresses that are not compile-time constants and
+ * create the "kernel_page_tables" file in the debugfs root directory.
+ *
+ * The function is only ever run through device_initcall(), so it is marked
+ * __init to let its text be discarded once initialization has finished.
+ *
+ * Always returns 0.
+ */
+static int __init pt_dump_init(void)
+{
+       /*
+        * Figure out the maximum virtual address being accessible with the
+        * kernel ASCE. We need this to keep the page table walker functions
+        * from accessing non-existent entries.
+        *
+        * The type field extracted below selects how many address bits are
+        * translated: 31 bits plus 11 bits for each increment of the type.
+        *
+        * NOTE(review): a type value of 3 would yield a shift count of 64,
+        * which is undefined for a 64-bit unsigned long - confirm that the
+        * kernel ASCE never uses that type.
+        */
+#ifdef CONFIG_64BIT
+       max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2;
+       max_addr = 1UL << (max_addr * 11 + 31);
+#else
+       max_addr = 1UL << 31;
+#endif
+       /* These two marker addresses are filled in here rather than in the static table. */
+       address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
+       address_markers[VMALLOC_NR].start_address = VMALLOC_START;
+       /* Mode 0400: readable by the owner only. */
+       debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+       return 0;
+}
+device_initcall(pt_dump_init);