Merge tag 'powerpc-6.6-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 31 Aug 2023 19:43:10 +0000 (12:43 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 31 Aug 2023 19:43:10 +0000 (12:43 -0700)
Pull powerpc updates from Michael Ellerman:

 - Add HOTPLUG_SMT support (/sys/devices/system/cpu/smt) and honour the
   configured SMT state when hotplugging CPUs into the system (see the
   usage sketch after this list)

 - Combine final TLB flush and lazy TLB mm shootdown IPIs when using the
   Radix MMU to avoid a broadcast TLBIE flush on exit

 - Drop the exclusion between ptrace/perf watchpoints, and drop the now
   unused associated arch hooks

 - Add support for the "nohlt" command line option to disable CPU idle

 - Add support for -fpatchable-function-entry for ftrace, with GCC >=
   13.1

 - Rework memory block size determination, and support 256MB size on
   systems with GPUs that have hotpluggable memory

 - Various other small features and fixes
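
The first item is the most directly user-visible change: with HOTPLUG_SMT
enabled, powerpc now exposes the generic SMT control files under
/sys/devices/system/cpu/smt.  Below is a minimal illustrative sketch (not
part of this merge) of how userspace might query and change that state; it
relies only on the generic "control" and "active" attributes of the
HOTPLUG_SMT ABI, and the helper itself (smt_state.c) is hypothetical.

    /* smt_state.c - query (and optionally set) the SMT control state.
     *
     * Illustration only; uses the generic HOTPLUG_SMT sysfs attributes
     * documented in Documentation/ABI/testing/sysfs-devices-system-cpu.
     */
    #include <stdio.h>
    #include <string.h>

    #define SMT_DIR "/sys/devices/system/cpu/smt"

    static int read_attr(const char *name, char *buf, size_t len)
    {
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), SMT_DIR "/%s", name);
        f = fopen(path, "r");
        if (!f)
            return -1;
        if (!fgets(buf, len, f))
            buf[0] = '\0';
        fclose(f);
        buf[strcspn(buf, "\n")] = '\0';   /* strip trailing newline */
        return 0;
    }

    int main(int argc, char **argv)
    {
        char val[64];

        if (read_attr("control", val, sizeof(val)) == 0)
            printf("smt/control: %s\n", val);  /* e.g. on, off, forceoff */
        if (read_attr("active", val, sizeof(val)) == 0)
            printf("smt/active:  %s\n", val);  /* 1 if any sibling online */

        /*
         * Optionally request a new state, e.g. "./smt_state off" (root
         * only).  With SMT_NUM_THREADS_DYNAMIC (selected by powerpc in
         * this merge) a thread count such as "2" may also be accepted.
         */
        if (argc > 1) {
            FILE *f = fopen(SMT_DIR "/control", "w");

            if (!f) {
                perror("smt/control");
                return 1;
            }
            fprintf(f, "%s\n", argv[1]);
            fclose(f);
        }
        return 0;
    }

The initial state can also be chosen at boot time via the nosmt parameter,
whose documentation the hunks below extend to cover PPC.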

Thanks to Andrew Donnellan, Aneesh Kumar K.V, Arnd Bergmann, Athira
Rajeev, Benjamin Gray, Christophe Leroy, Frederic Barrat, Gautam
Menghani, Geoff Levand, Hari Bathini, Immad Mir, Jialin Zhang, Joel
Stanley, Jordan Niethe, Justin Stitt, Kajol Jain, Kees Cook, Krzysztof
Kozlowski, Laurent Dufour, Liang He, Linus Walleij, Mahesh Salgaonkar,
Masahiro Yamada, Michal Suchanek, Nageswara R Sastry, Nathan Chancellor,
Nathan Lynch, Naveen N Rao, Nicholas Piggin, Nick Desaulniers, Omar
Sandoval, Randy Dunlap, Reza Arbab, Rob Herring, Russell Currey, Sourabh
Jain, Thomas Gleixner, Trevor Woerner, Uwe Kleine-König, Vaibhav Jain,
Xiongfeng Wang, Yuan Tan, Zhang Rui, and Zheng Zengkai.

* tag 'powerpc-6.6-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (135 commits)
  macintosh/ams: linux/platform_device.h is needed
  powerpc/xmon: Reapply "Relax frame size for clang"
  powerpc/mm/book3s64: Use 256M as the upper limit with coherent device memory attached
  powerpc/mm/book3s64: Fix build error with SPARSEMEM disabled
  powerpc/iommu: Fix notifiers being shared by PCI and VIO buses
  powerpc/mpc5xxx: Add missing fwnode_handle_put()
  powerpc/config: Disable SLAB_DEBUG_ON in skiroot
  powerpc/pseries: Remove unused hcall tracing instruction
  powerpc/pseries: Fix hcall tracepoints with JUMP_LABEL=n
  powerpc: dts: add missing space before {
  powerpc/eeh: Use pci_dev_id() to simplify the code
  powerpc/64s: Move CPU -mtune options into Kconfig
  powerpc/powermac: Fix unused function warning
  powerpc/pseries: Rework lppaca_shared_proc() to avoid DEBUG_PREEMPT
  powerpc: Don't include lppaca.h in paca.h
  powerpc/pseries: Move hcall_vphn() prototype into vphn.h
  powerpc/pseries: Move VPHN constants into vphn.h
  cxl: Drop unused detach_spa()
  powerpc: Drop zalloc_maybe_bootmem()
  powerpc/powernv: Use struct opal_prd_msg in more places
  ...

27 files changed:
Documentation/admin-guide/kernel-parameters.txt
arch/powerpc/Kconfig
arch/powerpc/configs/pmac32_defconfig
arch/powerpc/configs/ppc64_defconfig
arch/powerpc/configs/ppc6xx_defconfig
arch/powerpc/crypto/Kconfig
arch/powerpc/include/asm/book3s/32/pgtable.h
arch/powerpc/include/asm/nohash/32/pgtable.h
arch/powerpc/include/asm/nohash/64/pgtable.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/include/asm/processor.h
arch/powerpc/kernel/head_64.S
arch/powerpc/kernel/trace/ftrace_entry.S
arch/powerpc/mm/book3s64/pgtable.c
arch/powerpc/mm/book3s64/radix_pgtable.c
arch/powerpc/mm/book3s64/radix_tlb.c
arch/powerpc/mm/init_64.c
arch/powerpc/platforms/8xx/adder875.c
arch/powerpc/platforms/8xx/mpc885ads_setup.c
arch/powerpc/platforms/8xx/tqm8xx_setup.c
arch/powerpc/platforms/Kconfig.cputype
arch/powerpc/platforms/pseries/hotplug-memory.c
arch/powerpc/sysdev/fsl_soc.c
arch/powerpc/xmon/xmon.c
drivers/macintosh/ams/ams.h
drivers/net/ethernet/freescale/fs_enet/fs_enet.h
drivers/net/ethernet/freescale/fs_enet/mac-fcc.c

        arm64.nobti     [ARM64] Unconditionally disable Branch Target
                        Identification support
  
 -      arm64.nopauth   [ARM64] Unconditionally disable Pointer Authentication
 -                      support
 +      arm64.nomops    [ARM64] Unconditionally disable Memory Copy and Memory
 +                      Set instructions support
  
        arm64.nomte     [ARM64] Unconditionally disable Memory Tagging Extension
                        support
  
 -      arm64.nosve     [ARM64] Unconditionally disable Scalable Vector
 -                      Extension support
 +      arm64.nopauth   [ARM64] Unconditionally disable Pointer Authentication
 +                      support
  
        arm64.nosme     [ARM64] Unconditionally disable Scalable Matrix
                        Extension support
  
 -      arm64.nomops    [ARM64] Unconditionally disable Memory Copy and Memory
 -                      Set instructions support
 +      arm64.nosve     [ARM64] Unconditionally disable Scalable Vector
 +                      Extension support
  
        ataflop=        [HW,M68k]
  
                        others).
  
        ccw_timeout_log [S390]
 -                      See Documentation/s390/common_io.rst for details.
 +                      See Documentation/arch/s390/common_io.rst for details.
  
        cgroup_disable= [KNL] Disable a particular controller or optional feature
                        Format: {name of the controller(s) or feature(s) to disable}
                        Setting checkreqprot to 1 is deprecated.
  
        cio_ignore=     [S390]
 -                      See Documentation/s390/common_io.rst for details.
 +                      See Documentation/arch/s390/common_io.rst for details.
  
        clearcpuid=X[,X...] [X86]
                        Disable CPUID feature X for the kernel. See
                        kernel/dma/contiguous.c
  
        cma_pernuma=nn[MG]
 -                      [ARM64,KNL,CMA]
 +                      [KNL,CMA]
                        Sets the size of kernel per-numa memory area for
                        contiguous memory allocations. A value of 0 disables
                        per-numa CMA altogether. And If this option is not
                        which is located in node nid, if the allocation fails,
                        they will fallback to the global default memory area.
  
 +      numa_cma=<node>:nn[MG][,<node>:nn[MG]]
 +                      [KNL,CMA]
 +                      Sets the size of kernel numa memory area for
 +                      contiguous memory allocations. It will reserve CMA
 +                      area for the specified node.
 +
 +                      With numa CMA enabled, DMA users on node nid will
 +                      first try to allocate buffer from the numa area
 +                      which is located in node nid, if the allocation fails,
 +                      they will fallback to the global default memory area.
 +
        cmo_free_hint=  [PPC] Format: { yes | no }
                        Specify whether pages are marked as being inactive
                        when they are freed.  This is used in CMO environments
                        Format: off | on
                        default: on
  
 +      gather_data_sampling=
 +                      [X86,INTEL] Control the Gather Data Sampling (GDS)
 +                      mitigation.
 +
 +                      Gather Data Sampling is a hardware vulnerability which
 +                      allows unprivileged speculative access to data which was
 +                      previously stored in vector registers.
 +
 +                      This issue is mitigated by default in updated microcode.
 +                      The mitigation may have a performance impact but can be
 +                      disabled. On systems without the microcode mitigation
 +                      disabling AVX serves as a mitigation.
 +
 +                      force:  Disable AVX to mitigate systems without
 +                              microcode mitigation. No effect if the microcode
 +                              mitigation is present. Known to cause crashes in
 +                              userspace with buggy AVX enumeration.
 +
 +                      off:    Disable GDS mitigation.
 +
        gcov_persist=   [GCOV] When non-zero (default), profiling data for
                        kernel modules is saved and remains accessible via
                        debugfs, even when the module is unloaded/reloaded.
  
        kvm-intel.flexpriority=
                        [KVM,Intel] Control KVM's use of FlexPriority feature
 -                      (TPR shadow). Default is 1 (enabled). Disalbe by KVM if
 +                      (TPR shadow). Default is 1 (enabled). Disable by KVM if
                        hardware lacks support for it.
  
        kvm-intel.nested=
        locktorture.torture_type= [KNL]
                        Specify the locking implementation to test.
  
 +      locktorture.writer_fifo= [KNL]
 +                      Run the write-side locktorture kthreads at
 +                      sched_set_fifo() real-time priority.
 +
        locktorture.verbose= [KNL]
                        Enable additional printk() statements.
  
                        [KNL,SH] Allow user to override the default size for
                        per-device physically contiguous DMA buffers.
  
 -      memhp_default_state=online/offline
 +      memhp_default_state=online/offline/online_kernel/online_movable
                        [KNL] Set the initial state for the memory hotplug
                        onlining policy. If not specified, the default value is
                        set according to the
                                Disable all optional CPU mitigations.  This
                                improves system performance, but it may also
                                expose users to several CPU vulnerabilities.
 -                              Equivalent to: nopti [X86,PPC]
 -                                             if nokaslr then kpti=0 [ARM64]
 -                                             nospectre_v1 [X86,PPC]
 -                                             nobp=0 [S390]
 -                                             nospectre_v2 [X86,PPC,S390,ARM64]
 -                                             spectre_v2_user=off [X86]
 -                                             spec_store_bypass_disable=off [X86,PPC]
 -                                             ssbd=force-off [ARM64]
 -                                             nospectre_bhb [ARM64]
 +                              Equivalent to: if nokaslr then kpti=0 [ARM64]
 +                                             gather_data_sampling=off [X86]
 +                                             kvm.nx_huge_pages=off [X86]
                                               l1tf=off [X86]
                                               mds=off [X86]
 -                                             tsx_async_abort=off [X86]
 -                                             kvm.nx_huge_pages=off [X86]
 -                                             srbds=off [X86,INTEL]
 +                                             mmio_stale_data=off [X86]
                                               no_entry_flush [PPC]
                                               no_uaccess_flush [PPC]
 -                                             mmio_stale_data=off [X86]
 +                                             nobp=0 [S390]
 +                                             nopti [X86,PPC]
 +                                             nospectre_bhb [ARM64]
 +                                             nospectre_v1 [X86,PPC]
 +                                             nospectre_v2 [X86,PPC,S390,ARM64]
                                               retbleed=off [X86]
 +                                             spec_store_bypass_disable=off [X86,PPC]
 +                                             spectre_v2_user=off [X86]
 +                                             srbds=off [X86,INTEL]
 +                                             ssbd=force-off [ARM64]
 +                                             tsx_async_abort=off [X86]
  
                                Exceptions:
                                               This does not have any effect on
  
        nohibernate     [HIBERNATION] Disable hibernation and resume.
  
-       nohlt           [ARM,ARM64,MICROBLAZE,MIPS,SH] Forces the kernel to
+       nohlt           [ARM,ARM64,MICROBLAZE,MIPS,PPC,SH] Forces the kernel to
                        busy wait in do_idle() and not use the arch_cpu_idle()
                        implementation; requires CONFIG_GENERIC_IDLE_POLL_SETUP
                        to be effective. This is useful on platforms where the
        nosmp           [SMP] Tells an SMP kernel to act as a UP kernel,
                        and disable the IO APIC.  legacy for "maxcpus=0".
  
-       nosmt           [KNL,MIPS,S390] Disable symmetric multithreading (SMT).
+       nosmt           [KNL,MIPS,PPC,S390] Disable symmetric multithreading (SMT).
                        Equivalent to smt=1.
  
-                       [KNL,X86] Disable symmetric multithreading (SMT).
+                       [KNL,X86,PPC] Disable symmetric multithreading (SMT).
                        nosmt=force: Force disable SMT, cannot be undone
                                     via the sysfs control file.
  
                        timeout < 0: reboot immediately
                        Format: <timeout>
  
 -      panic_print=    Bitmask for printing system info when panic happens.
 -                      User can chose combination of the following bits:
 -                      bit 0: print all tasks info
 -                      bit 1: print system memory info
 -                      bit 2: print timer info
 -                      bit 3: print locks info if CONFIG_LOCKDEP is on
 -                      bit 4: print ftrace buffer
 -                      bit 5: print all printk messages in buffer
 -                      bit 6: print all CPUs backtrace (if available in the arch)
 -                      *Be aware* that this option may print a _lot_ of lines,
 -                      so there are risks of losing older messages in the log.
 -                      Use this option carefully, maybe worth to setup a
 -                      bigger log buffer with "log_buf_len" along with this.
 -
        panic_on_taint= Bitmask for conditionally calling panic() in add_taint()
                        Format: <hex>[,nousertaint]
                        Hexadecimal bitmask representing the set of TAINT flags
        panic_on_warn=1 panic() instead of WARN().  Useful to cause kdump
                        on a WARN().
  
 +      panic_print=    Bitmask for printing system info when panic happens.
 +                      User can chose combination of the following bits:
 +                      bit 0: print all tasks info
 +                      bit 1: print system memory info
 +                      bit 2: print timer info
 +                      bit 3: print locks info if CONFIG_LOCKDEP is on
 +                      bit 4: print ftrace buffer
 +                      bit 5: print all printk messages in buffer
 +                      bit 6: print all CPUs backtrace (if available in the arch)
 +                      *Be aware* that this option may print a _lot_ of lines,
 +                      so there are risks of losing older messages in the log.
 +                      Use this option carefully, maybe worth to setup a
 +                      bigger log buffer with "log_buf_len" along with this.
 +
        parkbd.port=    [HW] Parallel port number the keyboard adapter is
                        connected to, default is 0.
                        Format: <parport#>
                        mode 0, bit 1 is for mode 1, and so on.  Mode 0 only
                        allowed by default.
  
 -      pause_on_oops=
 +      pause_on_oops=<int>
                        Halt all CPUs after the first oops has been printed for
                        the specified number of seconds.  This is to be used if
                        your oopses keep scrolling off the screen.
                        test until boot completes in order to avoid
                        interference.
  
 +      rcuscale.kfree_by_call_rcu= [KNL]
 +                      In kernels built with CONFIG_RCU_LAZY=y, test
 +                      call_rcu() instead of kfree_rcu().
 +
 +      rcuscale.kfree_mult= [KNL]
 +                      Instead of allocating an object of size kfree_obj,
 +                      allocate one of kfree_mult * sizeof(kfree_obj).
 +                      Defaults to 1.
 +
        rcuscale.kfree_rcu_test= [KNL]
                        Set to measure performance of kfree_rcu() flooding.
  
                        Number of loops doing rcuscale.kfree_alloc_num number
                        of allocations and frees.
  
 +      rcuscale.minruntime= [KNL]
 +                      Set the minimum test run time in seconds.  This
 +                      does not affect the data-collection interval,
 +                      but instead allows better measurement of things
 +                      like CPU consumption.
 +
        rcuscale.nreaders= [KNL]
                        Set number of RCU readers.  The value -1 selects
                        N, where N is the number of CPUs.  A value
                        the same as for rcuscale.nreaders.
                        N, where N is the number of CPUs
  
 -      rcuscale.perf_type= [KNL]
 +      rcuscale.scale_type= [KNL]
                        Specify the RCU implementation to test.
  
        rcuscale.shutdown= [KNL]
                        in microseconds.  The default of zero says
                        no holdoff.
  
 +      rcuscale.writer_holdoff_jiffies= [KNL]
 +                      Additional write-side holdoff between grace
 +                      periods, but in jiffies.  The default of zero
 +                      says no holdoff.
 +
        rcutorture.fqs_duration= [KNL]
                        Set duration of force_quiescent_state bursts
                        in microseconds.
                        number avoids disturbing real-time workloads,
                        but lengthens grace periods.
  
 +      rcupdate.rcu_task_lazy_lim= [KNL]
 +                      Number of callbacks on a given CPU that will
 +                      cancel laziness on that CPU.  Use -1 to disable
 +                      cancellation of laziness, but be advised that
 +                      doing so increases the danger of OOM due to
 +                      callback flooding.
 +
        rcupdate.rcu_task_stall_info= [KNL]
                        Set initial timeout in jiffies for RCU task stall
                        informational messages, which give some indication
                        A change in value does not take effect until
                        the beginning of the next grace period.
  
 +      rcupdate.rcu_tasks_lazy_ms= [KNL]
 +                      Set timeout in milliseconds RCU Tasks asynchronous
 +                      callback batching for call_rcu_tasks().
 +                      A negative value will take the default.  A value
 +                      of zero will disable batching.  Batching is
 +                      always disabled for synchronize_rcu_tasks().
 +
 +      rcupdate.rcu_tasks_rude_lazy_ms= [KNL]
 +                      Set timeout in milliseconds RCU Tasks
 +                      Rude asynchronous callback batching for
 +                      call_rcu_tasks_rude().  A negative value
 +                      will take the default.  A value of zero will
 +                      disable batching.  Batching is always disabled
 +                      for synchronize_rcu_tasks_rude().
 +
 +      rcupdate.rcu_tasks_trace_lazy_ms= [KNL]
 +                      Set timeout in milliseconds RCU Tasks
 +                      Trace asynchronous callback batching for
 +                      call_rcu_tasks_trace().  A negative value
 +                      will take the default.  A value of zero will
 +                      disable batching.  Batching is always disabled
 +                      for synchronize_rcu_tasks_trace().
 +
        rcupdate.rcu_self_test= [KNL]
                        Run the RCU early boot self tests
  
                        Useful for devices that are detected asynchronously
                        (e.g. USB and MMC devices).
  
 +      rootwait=       [KNL] Maximum time (in seconds) to wait for root device
 +                      to show up before attempting to mount the root
 +                      filesystem.
 +
        rproc_mem=nn[KMG][@address]
                        [KNL,ARM,CMA] Remoteproc physical memory block.
                        Memory area to be used by remote processor image,
                        Not specifying this option is equivalent to
                        spectre_v2_user=auto.
  
 +      spec_rstack_overflow=
 +                      [X86] Control RAS overflow mitigation on AMD Zen CPUs
 +
 +                      off             - Disable mitigation
 +                      microcode       - Enable microcode mitigation only
 +                      safe-ret        - Enable sw-only safe RET mitigation (default)
 +                      ibpb            - Enable mitigation by issuing IBPB on
 +                                        kernel entry
 +                      ibpb-vmexit     - Issue IBPB only on VMEXIT
 +                                        (cloud-specific mitigation)
 +
        spec_store_bypass_disable=
                        [HW] Control Speculative Store Bypass (SSB) Disable mitigation
                        (Speculative Store Bypass vulnerability)
                        -1: disable all critical trip points in all thermal zones
                        <degrees C>: override all critical trip points
  
 -      thermal.nocrt=  [HW,ACPI]
 -                      Set to disable actions on ACPI thermal zone
 -                      critical and hot trip points.
 -
        thermal.off=    [HW,ACPI]
                        1: disable ACPI thermal control
  
                        This will guarantee that all the other pcrs
                        are saved.
  
 +      tpm_tis.interrupts= [HW,TPM]
 +                      Enable interrupts for the MMIO based physical layer
 +                      for the FIFO interface. By default it is set to false
 +                      (0). For more information about TPM hardware interfaces
 +                      defined by Trusted Computing Group (TCG) see
 +                      https://trustedcomputinggroup.org/resource/pc-client-platform-tpm-profile-ptp-specification/
 +
        tp_printk       [FTRACE]
                        Have the tracepoints sent to printk as well as the
                        tracing ring buffer. This is useful for early boot up
diff --combined arch/powerpc/Kconfig
@@@ -157,7 -157,6 +157,7 @@@ config PP
        select ARCH_HAS_UBSAN_SANITIZE_ALL
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
        select ARCH_KEEP_MEMBLOCK
 +      select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU
        select ARCH_MIGHT_HAVE_PC_PARPORT
        select ARCH_MIGHT_HAVE_PC_SERIO
        select ARCH_OPTIONAL_KERNEL_RWX         if ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_WANT_IPC_PARSE_VERSION
        select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
        select ARCH_WANT_LD_ORPHAN_WARN
 +      select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP   if PPC_RADIX_MMU
        select ARCH_WANTS_MODULES_DATA_IN_VMALLOC       if PPC_BOOK3S_32 || PPC_8xx
        select ARCH_WEAK_RELEASE_ACQUIRE
        select BINFMT_ELF
        select DYNAMIC_FTRACE                   if FUNCTION_TRACER
        select EDAC_ATOMIC_SCRUB
        select EDAC_SUPPORT
+       select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if ARCH_USING_PATCHABLE_FUNCTION_ENTRY
        select GENERIC_ATOMIC64                 if PPC32
        select GENERIC_CLOCKEVENTS_BROADCAST    if SMP
        select GENERIC_CMOS_UPDATE
        select GENERIC_CPU_VULNERABILITIES      if PPC_BARRIER_NOSPEC
        select GENERIC_EARLY_IOREMAP
        select GENERIC_GETTIMEOFDAY
+       select GENERIC_IDLE_POLL_SETUP
 +      select GENERIC_IOREMAP
        select GENERIC_IRQ_SHOW
        select GENERIC_IRQ_SHOW_LEVEL
        select GENERIC_PCI_IOMAP                if PCI
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DEBUG_STACKOVERFLOW
        select HAVE_DYNAMIC_FTRACE
-       select HAVE_DYNAMIC_FTRACE_WITH_ARGS    if MPROFILE_KERNEL || PPC32
-       select HAVE_DYNAMIC_FTRACE_WITH_REGS    if MPROFILE_KERNEL || PPC32
+       select HAVE_DYNAMIC_FTRACE_WITH_ARGS    if ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
+       select HAVE_DYNAMIC_FTRACE_WITH_REGS    if ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
        select HAVE_EBPF_JIT
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
        select HAVE_FAST_GUP
        select HAVE_MOD_ARCH_SPECIFIC
        select HAVE_NMI                         if PERF_EVENTS || (PPC64 && PPC_BOOK3S)
        select HAVE_OPTPROBES
-       select HAVE_OBJTOOL                     if PPC32 || MPROFILE_KERNEL
+       select HAVE_OBJTOOL                     if ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
        select HAVE_OBJTOOL_MCOUNT              if HAVE_OBJTOOL
        select HAVE_PERF_EVENTS
        select HAVE_PERF_EVENTS_NMI             if PPC64
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_VIRT_CPU_ACCOUNTING
        select HAVE_VIRT_CPU_ACCOUNTING_GEN
+       select HOTPLUG_SMT                      if HOTPLUG_CPU
+       select SMT_NUM_THREADS_DYNAMIC
        select HUGETLB_PAGE_SIZE_VARIABLE       if PPC_BOOK3S_64 && HUGETLB_PAGE
        select IOMMU_HELPER                     if PPC64
        select IRQ_DOMAIN
@@@ -554,6 -555,13 +558,13 @@@ config MPROFILE_KERNE
        def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -mlittle-endian) if CPU_LITTLE_ENDIAN
        def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -mbig-endian) if CPU_BIG_ENDIAN
  
+ config ARCH_USING_PATCHABLE_FUNCTION_ENTRY
+       depends on FUNCTION_TRACER && (PPC32 || PPC64_ELF_ABI_V2)
+       depends on $(cc-option,-fpatchable-function-entry=2)
+       def_bool y if PPC32
+       def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh $(CC) -mlittle-endian) if PPC64 && CPU_LITTLE_ENDIAN
+       def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh $(CC) -mbig-endian) if PPC64 && CPU_BIG_ENDIAN
  config HOTPLUG_CPU
        bool "Support for enabling/disabling CPUs"
        depends on SMP && (PPC_PSERIES || \
@@@ -592,21 -600,41 +603,21 @@@ config PPC64_SUPPORTS_MEMORY_FAILUR
        default "y" if PPC_POWERNV
        select ARCH_SUPPORTS_MEMORY_FAILURE
  
 -config KEXEC
 -      bool "kexec system call"
 -      depends on PPC_BOOK3S || PPC_E500 || (44x && !SMP)
 -      select KEXEC_CORE
 -      help
 -        kexec is a system call that implements the ability to shutdown your
 -        current kernel, and to start another kernel.  It is like a reboot
 -        but it is independent of the system firmware.   And like a reboot
 -        you can start any kernel with it, not just Linux.
 -
 -        The name comes from the similarity to the exec system call.
 -
 -        It is an ongoing process to be certain the hardware in a machine
 -        is properly shutdown, so do not be surprised if this code does not
 -        initially work for you.  As of this writing the exact hardware
 -        interface is strongly in flux, so no good recommendation can be
 -        made.
 -
 -config KEXEC_FILE
 -      bool "kexec file based system call"
 -      select KEXEC_CORE
 -      select HAVE_IMA_KEXEC if IMA
 -      select KEXEC_ELF
 -      depends on PPC64
 -      depends on CRYPTO=y
 -      depends on CRYPTO_SHA256=y
 -      help
 -        This is a new version of the kexec system call. This call is
 -        file based and takes in file descriptors as system call arguments
 -        for kernel and initramfs as opposed to a list of segments as is the
 -        case for the older kexec call.
 +config ARCH_SUPPORTS_KEXEC
 +      def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP)
 +
 +config ARCH_SUPPORTS_KEXEC_FILE
 +      def_bool PPC64 && CRYPTO=y && CRYPTO_SHA256=y
  
 -config ARCH_HAS_KEXEC_PURGATORY
 +config ARCH_SUPPORTS_KEXEC_PURGATORY
        def_bool KEXEC_FILE
  
 +config ARCH_SELECTS_KEXEC_FILE
 +      def_bool y
 +      depends on KEXEC_FILE
 +      select KEXEC_ELF
 +      select HAVE_IMA_KEXEC if IMA
 +
  config PPC64_BIG_ENDIAN_ELF_ABI_V2
        # Option is available to BFD, but LLD does not support ELFv1 so this is
        # always true there.
@@@ -666,13 -694,14 +677,13 @@@ config RELOCATABLE_TES
          loaded at, which tends to be non-zero and therefore test the
          relocation code.
  
 -config CRASH_DUMP
 -      bool "Build a dump capture kernel"
 -      depends on PPC64 || PPC_BOOK3S_32 || PPC_85xx || (44x && !SMP)
 +config ARCH_SUPPORTS_CRASH_DUMP
 +      def_bool PPC64 || PPC_BOOK3S_32 || PPC_85xx || (44x && !SMP)
 +
 +config ARCH_SELECTS_CRASH_DUMP
 +      def_bool y
 +      depends on CRASH_DUMP
        select RELOCATABLE if PPC64 || 44x || PPC_85xx
 -      help
 -        Build a kernel suitable for use as a dump capture kernel.
 -        The same kernel binary can be used as production kernel and dump
 -        capture kernel.
  
  config FA_DUMP
        bool "Firmware-assisted dump"
@@@ -1126,12 -1155,6 +1137,6 @@@ config FSL_GT
        help
          Freescale General-purpose Timers support
  
- config PCI_8260
-       bool
-       depends on PCI && 8260
-       select PPC_INDIRECT_PCI
-       default y
  config FSL_RIO
        bool "Freescale Embedded SRIO Controller support"
        depends on RAPIDIO = y && HAVE_RAPIDIO
@@@ -176,8 -176,9 +176,9 @@@ CONFIG_MOUSE_APPLETOUCH=
  # CONFIG_SERIO_I8042 is not set
  # CONFIG_SERIO_SERPORT is not set
  CONFIG_SERIAL_8250=m
- CONFIG_SERIAL_PMACZILOG=m
+ CONFIG_SERIAL_PMACZILOG=y
  CONFIG_SERIAL_PMACZILOG_TTYS=y
+ CONFIG_SERIAL_PMACZILOG_CONSOLE=y
  CONFIG_NVRAM=y
  CONFIG_I2C_CHARDEV=m
  CONFIG_APM_POWER=y
@@@ -254,7 -255,7 +255,7 @@@ CONFIG_LEDS_TRIGGER_DEFAULT_ON=
  CONFIG_EXT2_FS=y
  CONFIG_EXT4_FS=y
  CONFIG_EXT4_FS_POSIX_ACL=y
 -CONFIG_AUTOFS4_FS=m
 +CONFIG_AUTOFS_FS=m
  CONFIG_FUSE_FS=m
  CONFIG_ISO9660_FS=y
  CONFIG_JOLIET=y
@@@ -327,7 -327,7 +327,7 @@@ CONFIG_BTRFS_FS=
  CONFIG_BTRFS_FS_POSIX_ACL=y
  CONFIG_NILFS2_FS=m
  CONFIG_FS_DAX=y
 -CONFIG_AUTOFS4_FS=m
 +CONFIG_AUTOFS_FS=m
  CONFIG_FUSE_FS=m
  CONFIG_OVERLAY_FS=m
  CONFIG_ISO9660_FS=y
@@@ -390,8 -390,11 +390,11 @@@ CONFIG_CRYPTO_SHA256=
  CONFIG_CRYPTO_WP512=m
  CONFIG_CRYPTO_LZO=m
  CONFIG_CRYPTO_CRC32C_VPMSUM=m
+ CONFIG_CRYPTO_CRCT10DIF_VPMSUM=m
+ CONFIG_CRYPTO_VPMSUM_TESTER=m
  CONFIG_CRYPTO_MD5_PPC=m
  CONFIG_CRYPTO_SHA1_PPC=m
+ CONFIG_CRYPTO_AES_GCM_P10=m
  CONFIG_CRYPTO_DEV_NX=y
  CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
  CONFIG_CRYPTO_DEV_VMX=y
@@@ -183,7 -183,6 +183,6 @@@ CONFIG_IP_NF_MATCH_TTL=
  CONFIG_IP_NF_FILTER=m
  CONFIG_IP_NF_TARGET_REJECT=m
  CONFIG_IP_NF_MANGLE=m
- CONFIG_IP_NF_TARGET_CLUSTERIP=m
  CONFIG_IP_NF_TARGET_ECN=m
  CONFIG_IP_NF_TARGET_TTL=m
  CONFIG_IP_NF_RAW=m
@@@ -969,7 -968,7 +968,7 @@@ CONFIG_XFS_POSIX_ACL=
  CONFIG_GFS2_FS=m
  CONFIG_FS_DAX=y
  CONFIG_QUOTA_NETLINK_INTERFACE=y
 -CONFIG_AUTOFS4_FS=m
 +CONFIG_AUTOFS_FS=m
  CONFIG_FUSE_FS=m
  CONFIG_ISO9660_FS=y
  CONFIG_JOLIET=y
@@@ -100,7 -100,7 +100,7 @@@ config CRYPTO_AES_GCM_P1
        select CRYPTO_LIB_AES
        select CRYPTO_ALGAPI
        select CRYPTO_AEAD
-       default m
+       select CRYPTO_SKCIPHER
        help
          AEAD cipher: AES cipher algorithms (FIPS-197)
          GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D)
          Support for cryptographic acceleration instructions on Power10 or
          later CPU. This module supports stitched acceleration for AES/GCM.
  
 +config CRYPTO_CHACHA20_P10
 +      tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)"
 +      depends on PPC64 && CPU_LITTLE_ENDIAN
 +      select CRYPTO_SKCIPHER
 +      select CRYPTO_LIB_CHACHA_GENERIC
 +      select CRYPTO_ARCH_HAVE_LIB_CHACHA
 +      help
 +        Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
 +        stream cipher algorithms
 +
 +        Architecture: PowerPC64
 +        - Power10 or later
 +        - Little-endian
 +
 +config CRYPTO_POLY1305_P10
 +      tristate "Hash functions: Poly1305 (P10 or later)"
 +      depends on PPC64 && CPU_LITTLE_ENDIAN
 +      select CRYPTO_HASH
 +      select CRYPTO_LIB_POLY1305_GENERIC
 +      help
 +        Poly1305 authenticator algorithm (RFC7539)
 +
 +        Architecture: PowerPC64
 +        - Power10 or later
 +        - Little-endian
 +
  endmenu
@@@ -462,6 -462,11 +462,6 @@@ static inline pte_t pfn_pte(unsigned lo
                     pgprot_val(pgprot));
  }
  
 -static inline unsigned long pte_pfn(pte_t pte)
 -{
 -      return pte_val(pte) >> PTE_RPN_SHIFT;
 -}
 -
  /* Generic modifiers for PTE bits */
  static inline pte_t pte_wrprotect(pte_t pte)
  {
@@@ -493,7 -498,7 +493,7 @@@ static inline pte_t pte_mkpte(pte_t pte
        return pte;
  }
  
 -static inline pte_t pte_mkwrite(pte_t pte)
 +static inline pte_t pte_mkwrite_novma(pte_t pte)
  {
        return __pte(pte_val(pte) | _PAGE_RW);
  }
@@@ -536,58 -541,43 +536,43 @@@ static inline pte_t pte_modify(pte_t pt
  
  
  /* This low level function performs the actual PTE insertion
-  * Setting the PTE depends on the MMU type and other factors. It's
-  * an horrible mess that I'm not going to try to clean up now but
-  * I'm keeping it in one place rather than spread around
+  * Setting the PTE depends on the MMU type and other factors.
+  *
+  * First case is 32-bit in UP mode with 32-bit PTEs, we need to preserve
+  * the _PAGE_HASHPTE bit since we may not have invalidated the previous
+  * translation in the hash yet (done in a subsequent flush_tlb_xxx())
+  * and see we need to keep track that this PTE needs invalidating.
+  *
+  * Second case is 32-bit with 64-bit PTE.  In this case, we
+  * can just store as long as we do the two halves in the right order
+  * with a barrier in between. This is possible because we take care,
+  * in the hash code, to pre-invalidate if the PTE was already hashed,
+  * which synchronizes us with any concurrent invalidation.
+  * In the percpu case, we fallback to the simple update preserving
+  * the hash bits (ie, same as the non-SMP case).
+  *
+  * Third case is 32-bit in SMP mode with 32-bit PTEs. We use the
+  * helper pte_update() which does an atomic update. We need to do that
+  * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
+  * per-CPU PTE such as a kmap_atomic, we also do a simple update preserving
+  * the hash bits instead.
   */
  static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, pte_t pte, int percpu)
  {
- #if defined(CONFIG_SMP) && !defined(CONFIG_PTE_64BIT)
-       /* First case is 32-bit Hash MMU in SMP mode with 32-bit PTEs. We use the
-        * helper pte_update() which does an atomic update. We need to do that
-        * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
-        * per-CPU PTE such as a kmap_atomic, we do a simple update preserving
-        * the hash bits instead (ie, same as the non-SMP case)
-        */
-       if (percpu)
-               *ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-                             | (pte_val(pte) & ~_PAGE_HASHPTE));
-       else
-               pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, pte_val(pte), 0);
+       if ((!IS_ENABLED(CONFIG_SMP) && !IS_ENABLED(CONFIG_PTE_64BIT)) || percpu) {
+               *ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE) |
+                             (pte_val(pte) & ~_PAGE_HASHPTE));
+       } else if (IS_ENABLED(CONFIG_PTE_64BIT)) {
+               if (pte_val(*ptep) & _PAGE_HASHPTE)
+                       flush_hash_entry(mm, ptep, addr);
  
- #elif defined(CONFIG_PTE_64BIT)
-       /* Second case is 32-bit with 64-bit PTE.  In this case, we
-        * can just store as long as we do the two halves in the right order
-        * with a barrier in between. This is possible because we take care,
-        * in the hash code, to pre-invalidate if the PTE was already hashed,
-        * which synchronizes us with any concurrent invalidation.
-        * In the percpu case, we also fallback to the simple update preserving
-        * the hash bits
-        */
-       if (percpu) {
-               *ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-                             | (pte_val(pte) & ~_PAGE_HASHPTE));
-               return;
+               asm volatile("stw%X0 %2,%0; eieio; stw%X1 %L2,%1" :
+                            "=m" (*ptep), "=m" (*((unsigned char *)ptep+4)) :
+                            "r" (pte) : "memory");
+       } else {
+               pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, pte_val(pte), 0);
        }
-       if (pte_val(*ptep) & _PAGE_HASHPTE)
-               flush_hash_entry(mm, ptep, addr);
-       __asm__ __volatile__("\
-               stw%X0 %2,%0\n\
-               eieio\n\
-               stw%X1 %L2,%1"
-       : "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
-       : "r" (pte) : "memory");
- #else
-       /* Third case is 32-bit hash table in UP mode, we need to preserve
-        * the _PAGE_HASHPTE bit since we may not have invalidated the previous
-        * translation in the hash yet (done in a subsequent flush_tlb_xxx())
-        * and see we need to keep track that this PTE needs invalidating
-        */
-       *ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-                     | (pte_val(pte) & ~_PAGE_HASHPTE));
- #endif
  }
  
  /*
@@@ -170,8 -170,8 +170,8 @@@ void unmap_kernel_page(unsigned long va
  #define pte_clear(mm, addr, ptep) \
        do { pte_update(mm, addr, ptep, ~0, 0, 0); } while (0)
  
 -#ifndef pte_mkwrite
 -static inline pte_t pte_mkwrite(pte_t pte)
 +#ifndef pte_mkwrite_novma
 +static inline pte_t pte_mkwrite_novma(pte_t pte)
  {
        return __pte(pte_val(pte) | _PAGE_RW);
  }
@@@ -355,7 -355,7 +355,7 @@@ static inline int pte_young(pte_t pte
  #define pmd_pfn(pmd)          (pmd_val(pmd) >> PAGE_SHIFT)
  #else
  #define pmd_page_vaddr(pmd)   \
-       ((unsigned long)(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1)))
+       ((const void *)(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1)))
  #define pmd_pfn(pmd)          (__pa(pmd_val(pmd)) >> PAGE_SHIFT)
  #endif
  
@@@ -85,7 -85,7 +85,7 @@@
  #ifndef __ASSEMBLY__
  /* pte_clear moved to later in this file */
  
 -static inline pte_t pte_mkwrite(pte_t pte)
 +static inline pte_t pte_mkwrite_novma(pte_t pte)
  {
        return __pte(pte_val(pte) | _PAGE_RW);
  }
@@@ -127,7 -127,7 +127,7 @@@ static inline pte_t pmd_pte(pmd_t pmd
  #define       pmd_bad(pmd)            (!is_kernel_addr(pmd_val(pmd)) \
                                 || (pmd_val(pmd) & PMD_BAD_BITS))
  #define       pmd_present(pmd)        (!pmd_none(pmd))
- #define pmd_page_vaddr(pmd)   (pmd_val(pmd) & ~PMD_MASKED_BITS)
+ #define pmd_page_vaddr(pmd)   ((const void *)(pmd_val(pmd) & ~PMD_MASKED_BITS))
  extern struct page *pmd_page(pmd_t pmd);
  #define pmd_pfn(pmd)          (page_to_pfn(pmd_page(pmd)))
  
@@@ -41,12 -41,6 +41,12 @@@ struct mm_struct
  
  #ifndef __ASSEMBLY__
  
 +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 +              pte_t pte, unsigned int nr);
 +#define set_ptes set_ptes
 +#define update_mmu_cache(vma, addr, ptep) \
 +      update_mmu_cache_range(NULL, vma, addr, ptep, 1)
 +
  #ifndef MAX_PTRS_PER_PGD
  #define MAX_PTRS_PER_PGD PTRS_PER_PGD
  #endif
  /* Keep these as a macros to avoid include dependency mess */
  #define pte_page(x)           pfn_to_page(pte_pfn(x))
  #define mk_pte(page, pgprot)  pfn_pte(page_to_pfn(page), (pgprot))
 +
 +static inline unsigned long pte_pfn(pte_t pte)
 +{
 +      return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT;
 +}
 +
  /*
   * Select all bits except the pfn
   */
@@@ -72,9 -60,9 +72,9 @@@ static inline pgprot_t pte_pgprot(pte_
  }
  
  #ifndef pmd_page_vaddr
- static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+ static inline const void *pmd_page_vaddr(pmd_t pmd)
  {
-       return ((unsigned long)__va(pmd_val(pmd) & ~PMD_MASKED_BITS));
+       return __va(pmd_val(pmd) & ~PMD_MASKED_BITS);
  }
  #define pmd_page_vaddr pmd_page_vaddr
  #endif
@@@ -170,30 -158,13 +170,30 @@@ static inline pgtable_t pmd_pgtable(pmd
  }
  
  #ifdef CONFIG_PPC64
 -#define is_ioremap_addr is_ioremap_addr
 -static inline bool is_ioremap_addr(const void *x)
 +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size);
 +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
 +                         unsigned long page_size);
 +/*
 + * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details
 + * some of the restrictions. We don't check for PMD_SIZE because our
 + * vmemmap allocation code can fallback correctly. The pageblock
 + * alignment requirement is met using altmap->reserve blocks.
 + */
 +#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
 +static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
  {
 -      unsigned long addr = (unsigned long)x;
 -
 -      return addr >= IOREMAP_BASE && addr < IOREMAP_END;
 +      if (!radix_enabled())
 +              return false;
 +      /*
 +       * With 4K page size and 2M PMD_SIZE, we can align
 +       * things better with memory block size value
 +       * starting from 128MB. Hence align things with PMD_SIZE.
 +       */
 +      if (IS_ENABLED(CONFIG_PPC_4K_PAGES))
 +              return IS_ALIGNED(vmemmap_size, PMD_SIZE);
 +      return true;
  }
 +
  #endif /* CONFIG_PPC64 */
  
  #endif /* __ASSEMBLY__ */
@@@ -172,11 -172,6 +172,6 @@@ struct thread_struct 
        unsigned int    align_ctl;      /* alignment handling control */
  #ifdef CONFIG_HAVE_HW_BREAKPOINT
        struct perf_event *ptrace_bps[HBP_NUM_MAX];
-       /*
-        * Helps identify source of single-step exception and subsequent
-        * hw-breakpoint enablement
-        */
-       struct perf_event *last_hit_ubp[HBP_NUM_MAX];
  #endif /* CONFIG_HAVE_HW_BREAKPOINT */
        struct arch_hw_breakpoint hw_brk[HBP_NUM_MAX]; /* hardware breakpoint info */
        unsigned long   trap_nr;        /* last trap # on this thread */
@@@ -393,6 -388,7 +388,6 @@@ int validate_sp_size(unsigned long sp, 
   */
  #define ARCH_HAS_PREFETCH
  #define ARCH_HAS_PREFETCHW
 -#define ARCH_HAS_SPINLOCK_PREFETCH
  
  static inline void prefetch(const void *x)
  {
@@@ -410,6 -406,8 +405,6 @@@ static inline void prefetchw(const voi
        __asm__ __volatile__ ("dcbtst 0,%0" : : "r" (x));
  }
  
 -#define spin_lock_prefetch(x) prefetchw(x)
 -
  /* asm stubs */
  extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val);
  extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
@@@ -40,7 -40,6 +40,6 @@@
  #include <asm/hw_irq.h>
  #include <asm/cputhreads.h>
  #include <asm/ppc-opcode.h>
- #include <asm/export.h>
  #include <asm/feature-fixups.h>
  #ifdef CONFIG_PPC_BOOK3S
  #include <asm/exception-64s.h>
@@@ -375,7 -374,8 +374,7 @@@ _GLOBAL(generic_secondary_smp_init
        beq     20f
  
        /* start the specified thread */
 -      LOAD_REG_ADDR(r5, fsl_secondary_thread_init)
 -      ld      r4, 0(r5)
 +      LOAD_REG_ADDR(r5, DOTSYM(fsl_secondary_thread_init))
        bl      book3e_start_thread
  
        /* stop the current thread */
@@@ -3,12 -3,12 +3,12 @@@
   * Split from ftrace_64.S
   */
  
+ #include <linux/export.h>
  #include <linux/magic.h>
  #include <asm/ppc_asm.h>
  #include <asm/asm-offsets.h>
  #include <asm/ftrace.h>
  #include <asm/ppc-opcode.h>
- #include <asm/export.h>
  #include <asm/thread_info.h>
  #include <asm/bug.h>
  #include <asm/ptrace.h>
@@@ -33,9 -33,6 +33,9 @@@
   * and then arrange for the ftrace function to be called.
   */
  .macro        ftrace_regs_entry allregs
 +      /* Create a minimal stack frame for representing B */
 +      PPC_STLU        r1, -STACK_FRAME_MIN_SIZE(r1)
 +
        /* Create our stack frame + pt_regs */
        PPC_STLU        r1,-SWITCH_FRAME_SIZE(r1)
  
@@@ -45,7 -42,7 +45,7 @@@
  
  #ifdef CONFIG_PPC64
        /* Save the original return address in A's stack frame */
 -      std     r0, LRSAVE+SWITCH_FRAME_SIZE(r1)
 +      std     r0, LRSAVE+SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE(r1)
        /* Ok to continue? */
        lbz     r3, PACA_FTRACE_ENABLED(r13)
        cmpdi   r3, 0
@@@ -80,8 -77,6 +80,8 @@@
        mflr    r7
        /* Save it as pt_regs->nip */
        PPC_STL r7, _NIP(r1)
 +      /* Also save it in B's stackframe header for proper unwind */
 +      PPC_STL r7, LRSAVE+SWITCH_FRAME_SIZE(r1)
        /* Save the read LR in pt_regs->link */
        PPC_STL r0, _LINK(r1)
  
  #endif
  
        /* Pop our stack frame */
 -      addi r1, r1, SWITCH_FRAME_SIZE
 +      addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
  
  #ifdef CONFIG_LIVEPATCH_64
          /* Based on the cmpd above, if the NIP was altered handle livepatch */
@@@ -254,3 -249,70 +254,70 @@@ livepatch_handler
        /* Return to original caller of live patched function */
        blr
  #endif /* CONFIG_LIVEPATCH */
+ #ifndef CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY
+ _GLOBAL(mcount)
+ _GLOBAL(_mcount)
+ EXPORT_SYMBOL(_mcount)
+       mflr    r12
+       mtctr   r12
+       mtlr    r0
+       bctr
+ #endif
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ _GLOBAL(return_to_handler)
+       /* need to save return values */
+ #ifdef CONFIG_PPC64
+       std     r4,  -32(r1)
+       std     r3,  -24(r1)
+       /* save TOC */
+       std     r2,  -16(r1)
+       std     r31, -8(r1)
+       mr      r31, r1
+       stdu    r1, -112(r1)
+       /*
+        * We might be called from a module.
+        * Switch to our TOC to run inside the core kernel.
+        */
+       LOAD_PACA_TOC()
+ #else
+       stwu    r1, -16(r1)
+       stw     r3, 8(r1)
+       stw     r4, 12(r1)
+ #endif
+       bl      ftrace_return_to_handler
+       nop
+       /* return value has real return address */
+       mtlr    r3
+ #ifdef CONFIG_PPC64
+       ld      r1, 0(r1)
+       ld      r4,  -32(r1)
+       ld      r3,  -24(r1)
+       ld      r2,  -16(r1)
+       ld      r31, -8(r1)
+ #else
+       lwz     r3, 8(r1)
+       lwz     r4, 12(r1)
+       addi    r1, r1, 16
+ #endif
+       /* Jump back to real return address */
+       blr
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+ .pushsection ".tramp.ftrace.text","aw",@progbits;
+ .globl ftrace_tramp_text
+ ftrace_tramp_text:
+       .space 32
+ .popsection
+ .pushsection ".tramp.ftrace.init","aw",@progbits;
+ .globl ftrace_tramp_init
+ ftrace_tramp_init:
+       .space 32
+ .popsection
@@@ -9,6 -9,7 +9,7 @@@
  #include <linux/memremap.h>
  #include <linux/pkeys.h>
  #include <linux/debugfs.h>
+ #include <linux/proc_fs.h>
  #include <misc/cxl-base.h>
  
  #include <asm/pgalloc.h>
@@@ -64,39 -65,11 +65,39 @@@ int pmdp_set_access_flags(struct vm_are
        return changed;
  }
  
 +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 +                        pud_t *pudp, pud_t entry, int dirty)
 +{
 +      int changed;
 +#ifdef CONFIG_DEBUG_VM
 +      WARN_ON(!pud_devmap(*pudp));
 +      assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
 +#endif
 +      changed = !pud_same(*(pudp), entry);
 +      if (changed) {
 +              /*
 +               * We can use MMU_PAGE_1G here, because only radix
 +               * path look at the psize.
 +               */
 +              __ptep_set_access_flags(vma, pudp_ptep(pudp),
 +                                      pud_pte(entry), address, MMU_PAGE_1G);
 +      }
 +      return changed;
 +}
 +
 +
  int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long address, pmd_t *pmdp)
  {
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
  }
 +
 +int pudp_test_and_clear_young(struct vm_area_struct *vma,
 +                            unsigned long address, pud_t *pudp)
 +{
 +      return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
 +}
 +
  /*
   * set a new huge pmd. We should not be called for updating
   * an existing pmd entry. That should go via pmd_hugepage_update.
@@@ -118,23 -91,6 +119,23 @@@ void set_pmd_at(struct mm_struct *mm, u
        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
  }
  
 +void set_pud_at(struct mm_struct *mm, unsigned long addr,
 +              pud_t *pudp, pud_t pud)
 +{
 +#ifdef CONFIG_DEBUG_VM
 +      /*
 +       * Make sure hardware valid bit is not set. We don't do
 +       * tlb flush for this update.
 +       */
 +
 +      WARN_ON(pte_hw_valid(pud_pte(*pudp)));
 +      assert_spin_locked(pud_lockptr(mm, pudp));
 +      WARN_ON(!(pud_large(pud)));
 +#endif
 +      trace_hugepage_set_pud(addr, pud_val(pud));
 +      return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
 +}
 +
  static void do_serialize(void *arg)
  {
        /* We've taken the IPI, so try to trim the mask while here */
@@@ -192,35 -148,11 +193,35 @@@ pmd_t pmdp_huge_get_and_clear_full(stru
        return pmd;
  }
  
 +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
 +                                 unsigned long addr, pud_t *pudp, int full)
 +{
 +      pud_t pud;
 +
 +      VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
 +      VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
 +                !pud_present(*pudp));
 +      pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
 +      /*
 +       * if it not a fullmm flush, then we can possibly end up converting
 +       * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
 +       * Make sure we flush the tlb in this case.
 +       */
 +      if (!full)
 +              flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
 +      return pud;
 +}
 +
  static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
  {
        return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
  }
  
 +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
 +{
 +      return __pud(pud_val(pud) | pgprot_val(pgprot));
 +}
 +
  /*
   * At some point we should be able to get rid of
   * pmd_mkhuge() and mk_huge_pmd() when we update all the
@@@ -235,15 -167,6 +236,15 @@@ pmd_t pfn_pmd(unsigned long pfn, pgprot
        return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
  }
  
 +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
 +{
 +      unsigned long pudv;
 +
 +      pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
 +
 +      return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
 +}
 +
  pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
  {
        return pfn_pmd(page_to_pfn(page), pgprot);
@@@ -384,22 -307,22 +385,22 @@@ static pmd_t *get_pmd_from_cache(struc
  static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
  {
        void *ret = NULL;
 -      struct page *page;
 +      struct ptdesc *ptdesc;
        gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
  
        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;
 -      page = alloc_page(gfp);
 -      if (!page)
 +      ptdesc = pagetable_alloc(gfp, 0);
 +      if (!ptdesc)
                return NULL;
 -      if (!pgtable_pmd_page_ctor(page)) {
 -              __free_pages(page, 0);
 +      if (!pagetable_pmd_ctor(ptdesc)) {
 +              pagetable_free(ptdesc);
                return NULL;
        }
  
 -      atomic_set(&page->pt_frag_refcount, 1);
 +      atomic_set(&ptdesc->pt_frag_refcount, 1);
  
 -      ret = page_address(page);
 +      ret = ptdesc_address(ptdesc);
        /*
         * if we support only one fragment just return the
         * allocated page.
  
        spin_lock(&mm->page_table_lock);
        /*
 -       * If we find pgtable_page set, we return
 +       * If we find ptdesc_page set, we return
         * the allocated page with single fragment
         * count.
         */
        if (likely(!mm->context.pmd_frag)) {
 -              atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
 +              atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
                mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
        }
        spin_unlock(&mm->page_table_lock);
@@@ -435,15 -358,15 +436,15 @@@ pmd_t *pmd_fragment_alloc(struct mm_str
  
  void pmd_fragment_free(unsigned long *pmd)
  {
 -      struct page *page = virt_to_page(pmd);
 +      struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
  
 -      if (PageReserved(page))
 -              return free_reserved_page(page);
 +      if (pagetable_is_reserved(ptdesc))
 +              return free_reserved_ptdesc(ptdesc);
  
 -      BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
 -      if (atomic_dec_and_test(&page->pt_frag_refcount)) {
 -              pgtable_pmd_page_dtor(page);
 -              __free_page(page);
 +      BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
 +      if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
 +              pagetable_pmd_dtor(ptdesc);
 +              pagetable_free(ptdesc);
        }
  }
  
@@@ -37,7 -37,6 +37,6 @@@
  #include <mm/mmu_decl.h>
  
  unsigned int mmu_base_pid;
- unsigned long radix_mem_block_size __ro_after_init;
  
  static __ref void *early_alloc_pgtable(unsigned long size, int nid,
                        unsigned long region_start, unsigned long region_end)
@@@ -300,7 -299,7 +299,7 @@@ static int __meminit create_physical_ma
        bool prev_exec, exec = false;
        pgprot_t prot;
        int psize;
-       unsigned long max_mapping_size = radix_mem_block_size;
+       unsigned long max_mapping_size = memory_block_size;
  
        if (debug_pagealloc_enabled_or_kfence())
                max_mapping_size = PAGE_SIZE;
@@@ -502,58 -501,6 +501,6 @@@ static int __init radix_dt_scan_page_si
        return 1;
  }
  
- #ifdef CONFIG_MEMORY_HOTPLUG
- static int __init probe_memory_block_size(unsigned long node, const char *uname, int
-                                         depth, void *data)
- {
-       unsigned long *mem_block_size = (unsigned long *)data;
-       const __be32 *prop;
-       int len;
-       if (depth != 1)
-               return 0;
-       if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
-               return 0;
-       prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
-       if (!prop || len < dt_root_size_cells * sizeof(__be32))
-               /*
-                * Nothing in the device tree
-                */
-               *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-       else
-               *mem_block_size = of_read_number(prop, dt_root_size_cells);
-       return 1;
- }
- static unsigned long __init radix_memory_block_size(void)
- {
-       unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-       /*
-        * OPAL firmware feature is set by now. Hence we are ok
-        * to test OPAL feature.
-        */
-       if (firmware_has_feature(FW_FEATURE_OPAL))
-               mem_block_size = 1UL * 1024 * 1024 * 1024;
-       else
-               of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
-       return mem_block_size;
- }
- #else   /* CONFIG_MEMORY_HOTPLUG */
- static unsigned long __init radix_memory_block_size(void)
- {
-       return 1UL * 1024 * 1024 * 1024;
- }
- #endif /* CONFIG_MEMORY_HOTPLUG */
  void __init radix__early_init_devtree(void)
  {
        int rc;
                mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
                        psize_to_rpti_pgsize(MMU_PAGE_64K);
        }
-       /*
-        * Max mapping size used when mapping pages. We don't use
-        * ppc_md.memory_block_size() here because this get called
-        * early and we don't have machine probe called yet. Also
-        * the pseries implementation only check for ibm,lmb-size.
-        * All hypervisor supporting radix do expose that device
-        * tree node.
-        */
-       radix_mem_block_size = radix_memory_block_size();
        return;
  }
  
@@@ -601,6 -538,17 +538,6 @@@ void __init radix__early_init_mmu(void
  #else
        mmu_virtual_psize = MMU_PAGE_4K;
  #endif
 -
 -#ifdef CONFIG_SPARSEMEM_VMEMMAP
 -      /* vmemmap mapping */
 -      if (mmu_psize_defs[MMU_PAGE_2M].shift) {
 -              /*
 -               * map vmemmap using 2M if available
 -               */
 -              mmu_vmemmap_psize = MMU_PAGE_2M;
 -      } else
 -              mmu_vmemmap_psize = mmu_virtual_psize;
 -#endif
  #endif
        /*
         * initialize page table size
@@@ -733,58 -681,8 +670,58 @@@ static void free_pud_table(pud_t *pud_s
        p4d_clear(p4d);
  }
  
 -static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 -                           unsigned long end, bool direct)
 +#ifdef CONFIG_SPARSEMEM_VMEMMAP
 +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
 +{
 +      unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
 +
 +      return !vmemmap_populated(start, PMD_SIZE);
 +}
 +
 +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
 +{
 +      unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
 +
 +      return !vmemmap_populated(start, PAGE_SIZE);
 +
 +}
 +#endif
 +
 +static void __meminit free_vmemmap_pages(struct page *page,
 +                                       struct vmem_altmap *altmap,
 +                                       int order)
 +{
 +      unsigned int nr_pages = 1 << order;
 +
 +      if (altmap) {
 +              unsigned long alt_start, alt_end;
 +              unsigned long base_pfn = page_to_pfn(page);
 +
 +              /*
 +               * With 2M vmemmap mapping we can have things set up
 +               * such that even though an altmap is specified we never
 +               * use the altmap.
 +               */
 +              alt_start = altmap->base_pfn;
 +              alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
 +
 +              if (base_pfn >= alt_start && base_pfn < alt_end) {
 +                      vmem_altmap_free(altmap, nr_pages);
 +                      return;
 +              }
 +      }
 +
 +      if (PageReserved(page)) {
 +              /* allocated from memblock */
 +              while (nr_pages--)
 +                      free_reserved_page(page++);
 +      } else
 +              free_pages((unsigned long)page_address(page), order);
 +}
 +
 +static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
 +                                     unsigned long end, bool direct,
 +                                     struct vmem_altmap *altmap)
  {
        unsigned long next, pages = 0;
        pte_t *pte;
                if (!pte_present(*pte))
                        continue;
  
 -              if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
 -                      /*
 -                       * The vmemmap_free() and remove_section_mapping()
 -                       * codepaths call us with aligned addresses.
 -                       */
 -                      WARN_ONCE(1, "%s: unaligned range\n", __func__);
 -                      continue;
 +              if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
 +                      if (!direct)
 +                              free_vmemmap_pages(pte_page(*pte), altmap, 0);
 +                      pte_clear(&init_mm, addr, pte);
 +                      pages++;
                }
 -
 -              pte_clear(&init_mm, addr, pte);
 -              pages++;
 +#ifdef CONFIG_SPARSEMEM_VMEMMAP
 +              else if (!direct && vmemmap_page_is_unused(addr, next)) {
 +                      free_vmemmap_pages(pte_page(*pte), altmap, 0);
 +                      pte_clear(&init_mm, addr, pte);
 +              }
 +#endif
        }
        if (direct)
                update_page_count(mmu_virtual_psize, -pages);
  }
  
  static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 -                                     unsigned long end, bool direct)
 +                                     unsigned long end, bool direct,
 +                                     struct vmem_altmap *altmap)
  {
        unsigned long next, pages = 0;
        pte_t *pte_base;
                        continue;
  
                if (pmd_is_leaf(*pmd)) {
 -                      if (!IS_ALIGNED(addr, PMD_SIZE) ||
 -                          !IS_ALIGNED(next, PMD_SIZE)) {
 -                              WARN_ONCE(1, "%s: unaligned range\n", __func__);
 -                              continue;
 +                      if (IS_ALIGNED(addr, PMD_SIZE) &&
 +                          IS_ALIGNED(next, PMD_SIZE)) {
 +                              if (!direct)
 +                                      free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
 +                              pte_clear(&init_mm, addr, (pte_t *)pmd);
 +                              pages++;
                        }
 -                      pte_clear(&init_mm, addr, (pte_t *)pmd);
 -                      pages++;
 +#ifdef CONFIG_SPARSEMEM_VMEMMAP
 +                      else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
 +                              free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
 +                              pte_clear(&init_mm, addr, (pte_t *)pmd);
 +                      }
 +#endif
                        continue;
                }
  
                pte_base = (pte_t *)pmd_page_vaddr(*pmd);
 -              remove_pte_table(pte_base, addr, next, direct);
 +              remove_pte_table(pte_base, addr, next, direct, altmap);
                free_pte_table(pte_base, pmd);
        }
        if (direct)
  }
  
  static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
 -                                     unsigned long end, bool direct)
 +                                     unsigned long end, bool direct,
 +                                     struct vmem_altmap *altmap)
  {
        unsigned long next, pages = 0;
        pmd_t *pmd_base;
                }
  
                pmd_base = pud_pgtable(*pud);
 -              remove_pmd_table(pmd_base, addr, next, direct);
 +              remove_pmd_table(pmd_base, addr, next, direct, altmap);
                free_pmd_table(pmd_base, pud);
        }
        if (direct)
                update_page_count(MMU_PAGE_1G, -pages);
  }
  
 -static void __meminit remove_pagetable(unsigned long start, unsigned long end,
 -                                     bool direct)
 +static void __meminit
 +remove_pagetable(unsigned long start, unsigned long end, bool direct,
 +               struct vmem_altmap *altmap)
  {
        unsigned long addr, next;
        pud_t *pud_base;
                }
  
                pud_base = p4d_pgtable(*p4d);
 -              remove_pud_table(pud_base, addr, next, direct);
 +              remove_pud_table(pud_base, addr, next, direct, altmap);
                free_pud_table(pud_base, p4d);
        }
  
@@@ -943,7 -831,7 +880,7 @@@ int __meminit radix__create_section_map
  
  int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
  {
 -      remove_pagetable(start, end, true);
 +      remove_pagetable(start, end, true, NULL);
        return 0;
  }
  #endif /* CONFIG_MEMORY_HOTPLUG */
@@@ -975,429 -863,10 +912,429 @@@ int __meminit radix__vmemmap_create_map
        return 0;
  }
  
 +
 +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
 +{
 +      if (radix_enabled())
 +              return __vmemmap_can_optimize(altmap, pgmap);
 +
 +      return false;
 +}
 +
 +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
 +                              unsigned long addr, unsigned long next)
 +{
 +      int large = pmd_large(*pmdp);
 +
 +      if (large)
 +              vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
 +
 +      return large;
 +}
 +
 +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
 +                             unsigned long addr, unsigned long next)
 +{
 +      pte_t entry;
 +      pte_t *ptep = pmdp_ptep(pmdp);
 +
 +      VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
 +      entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
 +      set_pte_at(&init_mm, addr, ptep, entry);
 +      asm volatile("ptesync": : :"memory");
 +
 +      vmemmap_verify(ptep, node, addr, next);
 +}
 +
 +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
 +                                                   int node,
 +                                                   struct vmem_altmap *altmap,
 +                                                   struct page *reuse)
 +{
 +      pte_t *pte = pte_offset_kernel(pmdp, addr);
 +
 +      if (pte_none(*pte)) {
 +              pte_t entry;
 +              void *p;
 +
 +              if (!reuse) {
 +                      /*
 +                       * make sure we don't create altmap mappings
 +                       * covering things outside the device.
 +                       */
 +                      if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
 +                              altmap = NULL;
 +
 +                      p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
 +                      if (!p && altmap)
 +                              p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
 +                      if (!p)
 +                              return NULL;
 +                      pr_debug("PAGE_SIZE vmemmap mapping\n");
 +              } else {
 +                      /*
 +                       * When a PTE/PMD entry is freed from the init_mm
 +                       * there's a free_pages() call to this page allocated
 +                       * above. Thus this get_page() is paired with the
 +                       * put_page_testzero() on the freeing path.
 +                       * This can only be called by certain ZONE_DEVICE paths,
 +                       * and through vmemmap_populate_compound_pages() when
 +                       * slab is available.
 +                       */
 +                      get_page(reuse);
 +                      p = page_to_virt(reuse);
 +                      pr_debug("Tail page reuse vmemmap mapping\n");
 +              }
 +
 +              VM_BUG_ON(!PAGE_ALIGNED(addr));
 +              entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
 +              set_pte_at(&init_mm, addr, pte, entry);
 +              asm volatile("ptesync": : :"memory");
 +      }
 +      return pte;
 +}
 +
 +static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
 +                                     unsigned long address)
 +{
 +      pud_t *pud;
 +
 +      /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
 +      if (unlikely(p4d_none(*p4dp))) {
 +              if (unlikely(!slab_is_available())) {
 +                      pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
 +                      p4d_populate(&init_mm, p4dp, pud);
 +                      /* go to the pud_offset */
 +              } else
 +                      return pud_alloc(&init_mm, p4dp, address);
 +      }
 +      return pud_offset(p4dp, address);
 +}
 +
 +static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
 +                                     unsigned long address)
 +{
 +      pmd_t *pmd;
 +
 +      /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
 +      if (unlikely(pud_none(*pudp))) {
 +              if (unlikely(!slab_is_available())) {
 +                      pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
 +                      pud_populate(&init_mm, pudp, pmd);
 +              } else
 +                      return pmd_alloc(&init_mm, pudp, address);
 +      }
 +      return pmd_offset(pudp, address);
 +}
 +
 +static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
 +                                     unsigned long address)
 +{
 +      pte_t *pte;
 +
 +      /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
 +      if (unlikely(pmd_none(*pmdp))) {
 +              if (unlikely(!slab_is_available())) {
 +                      pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
 +                      pmd_populate(&init_mm, pmdp, pte);
 +              } else
 +                      return pte_alloc_kernel(pmdp, address);
 +      }
 +      return pte_offset_kernel(pmdp, address);
 +}
 +
 +
 +
 +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
 +                                    struct vmem_altmap *altmap)
 +{
 +      unsigned long addr;
 +      unsigned long next;
 +      pgd_t *pgd;
 +      p4d_t *p4d;
 +      pud_t *pud;
 +      pmd_t *pmd;
 +      pte_t *pte;
 +
 +      for (addr = start; addr < end; addr = next) {
 +              next = pmd_addr_end(addr, end);
 +
 +              pgd = pgd_offset_k(addr);
 +              p4d = p4d_offset(pgd, addr);
 +              pud = vmemmap_pud_alloc(p4d, node, addr);
 +              if (!pud)
 +                      return -ENOMEM;
 +              pmd = vmemmap_pmd_alloc(pud, node, addr);
 +              if (!pmd)
 +                      return -ENOMEM;
 +
 +              if (pmd_none(READ_ONCE(*pmd))) {
 +                      void *p;
 +
 +                      /*
 +                       * Keep it simple by checking addr for PMD_SIZE alignment
 +                       * and verifying the device boundary condition.
 +                       * For us to use a pmd mapping, both addr and pfn should
 +                       * be aligned. We skip if addr is not aligned and for
 +                       * the pfn we hope there is extra area in the altmap that
 +                       * can help to find an aligned block. This can result
 +                       * in altmap block allocation failures, in which case
 +                       * we fall back to RAM for the vmemmap allocation.
 +                       */
 +                      if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
 +                                     altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
 +                              /*
 +                               * make sure we don't create altmap mappings
 +                               * covering things outside the device.
 +                               */
 +                              goto base_mapping;
 +                      }
 +
 +                      p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
 +                      if (p) {
 +                              vmemmap_set_pmd(pmd, p, node, addr, next);
 +                              pr_debug("PMD_SIZE vmemmap mapping\n");
 +                              continue;
 +                      } else if (altmap) {
 +                              /*
 +                               * A vmemmap block allocation can fail due to
 +                               * alignment requirements when we try to align
 +                               * things aggressively, thereby running out of
 +                               * space. Try base mapping on failure.
 +                               */
 +                              goto base_mapping;
 +                      }
 +              } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
 +                      /*
 +                       * If a huge mapping exists due to an early call to
 +                       * vmemmap_populate(), let's try to use that.
 +                       */
 +                      continue;
 +              }
 +base_mapping:
 +              /*
 +               * Not able to allocate higher order memory to back the memmap,
 +               * or we found a pointer to a pte page. Allocate base page
 +               * size vmemmap.
 +               */
 +              pte = vmemmap_pte_alloc(pmd, node, addr);
 +              if (!pte)
 +                      return -ENOMEM;
 +
 +              pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
 +              if (!pte)
 +                      return -ENOMEM;
 +
 +              vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 +              next = addr + PAGE_SIZE;
 +      }
 +      return 0;
 +}
 +
 +static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
 +                                                       struct vmem_altmap *altmap,
 +                                                       struct page *reuse)
 +{
 +      pgd_t *pgd;
 +      p4d_t *p4d;
 +      pud_t *pud;
 +      pmd_t *pmd;
 +      pte_t *pte;
 +
 +      pgd = pgd_offset_k(addr);
 +      p4d = p4d_offset(pgd, addr);
 +      pud = vmemmap_pud_alloc(p4d, node, addr);
 +      if (!pud)
 +              return NULL;
 +      pmd = vmemmap_pmd_alloc(pud, node, addr);
 +      if (!pmd)
 +              return NULL;
 +      if (pmd_leaf(*pmd))
 +              /*
 +               * The second page is mapped as a hugepage due to a nearby request.
 +               * Force our mapping to page size without deduplication
 +               */
 +              return NULL;
 +      pte = vmemmap_pte_alloc(pmd, node, addr);
 +      if (!pte)
 +              return NULL;
 +      radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
 +      vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 +
 +      return pte;
 +}
 +
 +static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
 +                                                  unsigned long pfn_offset, int node)
 +{
 +      pgd_t *pgd;
 +      p4d_t *p4d;
 +      pud_t *pud;
 +      pmd_t *pmd;
 +      pte_t *pte;
 +      unsigned long map_addr;
 +
 +      /* the second vmemmap page which we use for duplication */
 +      map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
 +      pgd = pgd_offset_k(map_addr);
 +      p4d = p4d_offset(pgd, map_addr);
 +      pud = vmemmap_pud_alloc(p4d, node, map_addr);
 +      if (!pud)
 +              return NULL;
 +      pmd = vmemmap_pmd_alloc(pud, node, map_addr);
 +      if (!pmd)
 +              return NULL;
 +      if (pmd_leaf(*pmd))
 +              /*
 +               * The second page is mapped as a hugepage due to a nearby request.
 +               * Force our mapping to page size without deduplication
 +               */
 +              return NULL;
 +      pte = vmemmap_pte_alloc(pmd, node, map_addr);
 +      if (!pte)
 +              return NULL;
 +      /*
 +       * Check if there exists a mapping to the left
 +       */
 +      if (pte_none(*pte)) {
 +              /*
 +               * Populate the head page vmemmap page.
 +               * It can fall in a different pmd, hence
 +               * vmemmap_populate_address()
 +               */
 +              pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
 +              if (!pte)
 +                      return NULL;
 +              /*
 +               * Populate the tail pages vmemmap page
 +               */
 +              pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
 +              if (!pte)
 +                      return NULL;
 +              vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
 +              return pte;
 +      }
 +      return pte;
 +}
 +
 +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 +                                            unsigned long start,
 +                                            unsigned long end, int node,
 +                                            struct dev_pagemap *pgmap)
 +{
 +      /*
 +       * We want to map things with base page size mappings so that
 +       * we can save space in the vmemmap. We could have a huge mapping
 +       * covering both edges.
 +       */
 +      unsigned long addr;
 +      unsigned long addr_pfn = start_pfn;
 +      unsigned long next;
 +      pgd_t *pgd;
 +      p4d_t *p4d;
 +      pud_t *pud;
 +      pmd_t *pmd;
 +      pte_t *pte;
 +
 +      for (addr = start; addr < end; addr = next) {
 +
 +              pgd = pgd_offset_k(addr);
 +              p4d = p4d_offset(pgd, addr);
 +              pud = vmemmap_pud_alloc(p4d, node, addr);
 +              if (!pud)
 +                      return -ENOMEM;
 +              pmd = vmemmap_pmd_alloc(pud, node, addr);
 +              if (!pmd)
 +                      return -ENOMEM;
 +
 +              if (pmd_leaf(READ_ONCE(*pmd))) {
 +                      /* existing huge mapping. Skip the range */
 +                      addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
 +                      next = pmd_addr_end(addr, end);
 +                      continue;
 +              }
 +              pte = vmemmap_pte_alloc(pmd, node, addr);
 +              if (!pte)
 +                      return -ENOMEM;
 +              if (!pte_none(*pte)) {
 +                      /*
 +                       * This could be because we already have a compound
 +                       * page whose VMEMMAP_RESERVE_NR pages were mapped and
 +                       * this request falls within those pages.
 +                       */
 +                      addr_pfn += 1;
 +                      next = addr + PAGE_SIZE;
 +                      continue;
 +              } else {
 +                      unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
 +                      unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
 +                      pte_t *tail_page_pte;
 +
 +                      /*
 +                       * If the address is aligned to the huge page size, it is
 +                       * the head mapping.
 +                       */
 +                      if (pfn_offset == 0) {
 +                              /* Populate the head page vmemmap page */
 +                              pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
 +                              if (!pte)
 +                                      return -ENOMEM;
 +                              vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 +
 +                              /*
 +                               * Populate the tail pages vmemmap page
 +                               * It can fall in a different pmd, hence
 +                               * vmemmap_populate_address()
 +                               */
 +                              pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
 +                              if (!pte)
 +                                      return -ENOMEM;
 +
 +                              addr_pfn += 2;
 +                              next = addr + 2 * PAGE_SIZE;
 +                              continue;
 +                      }
 +                      /*
 +                       * Get the 2nd mapping details.
 +                       * Also create it if it doesn't exist.
 +                       */
 +                      tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
 +                      if (!tail_page_pte) {
 +
 +                              pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
 +                              if (!pte)
 +                                      return -ENOMEM;
 +                              vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 +
 +                              addr_pfn += 1;
 +                              next = addr + PAGE_SIZE;
 +                              continue;
 +                      }
 +
 +                      pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
 +                      if (!pte)
 +                              return -ENOMEM;
 +                      vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 +
 +                      addr_pfn += 1;
 +                      next = addr + PAGE_SIZE;
 +                      continue;
 +              }
 +      }
 +      return 0;
 +}
 +
 +
  #ifdef CONFIG_MEMORY_HOTPLUG
  void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
  {
 -      remove_pagetable(start, start + page_size, false);
 +      remove_pagetable(start, start + page_size, true, NULL);
 +}
 +
 +void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
 +                             struct vmem_altmap *altmap)
 +{
 +      remove_pagetable(start, end, false, altmap);
  }
  #endif
  #endif
@@@ -1430,24 -899,7 +1367,24 @@@ unsigned long radix__pmd_hugepage_updat
  #endif
  
        old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
 -      trace_hugepage_update(addr, old, clr, set);
 +      trace_hugepage_update_pmd(addr, old, clr, set);
 +
 +      return old;
 +}
 +
 +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
 +                                       pud_t *pudp, unsigned long clr,
 +                                       unsigned long set)
 +{
 +      unsigned long old;
 +
 +#ifdef CONFIG_DEBUG_VM
 +      WARN_ON(!pud_devmap(*pudp));
 +      assert_spin_locked(pud_lockptr(mm, pudp));
 +#endif
 +
 +      old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
 +      trace_hugepage_update_pud(addr, old, clr, set);
  
        return old;
  }
@@@ -1528,17 -980,6 +1465,17 @@@ pmd_t radix__pmdp_huge_get_and_clear(st
        return old_pmd;
  }
  
 +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
 +                                   unsigned long addr, pud_t *pudp)
 +{
 +      pud_t old_pud;
 +      unsigned long old;
 +
 +      old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
 +      old_pud = __pud(old);
 +      return old_pud;
 +}
 +
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
  void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
@@@ -127,21 -127,6 +127,6 @@@ static __always_inline void __tlbie_pid
        trace_tlbie(0, 0, rb, rs, ric, prs, r);
  }
  
- static __always_inline void __tlbie_pid_lpid(unsigned long pid,
-                                            unsigned long lpid,
-                                            unsigned long ric)
- {
-       unsigned long rb, rs, prs, r;
-       rb = PPC_BIT(53); /* IS = 1 */
-       rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
-       prs = 1; /* process scoped */
-       r = 1;   /* radix format */
-       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(0, 0, rb, rs, ric, prs, r);
- }
  static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
  {
        unsigned long rb,rs,prs,r;
@@@ -202,23 -187,6 +187,6 @@@ static __always_inline void __tlbie_va(
        trace_tlbie(0, 0, rb, rs, ric, prs, r);
  }
  
- static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
-                                           unsigned long lpid,
-                                           unsigned long ap, unsigned long ric)
- {
-       unsigned long rb, rs, prs, r;
-       rb = va & ~(PPC_BITMASK(52, 63));
-       rb |= ap << PPC_BITLSHIFT(58);
-       rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
-       prs = 1; /* process scoped */
-       r = 1;   /* radix format */
-       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(0, 0, rb, rs, ric, prs, r);
- }
  static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
                                            unsigned long ap, unsigned long ric)
  {
@@@ -264,22 -232,6 +232,6 @@@ static inline void fixup_tlbie_va_range
        }
  }
  
- static inline void fixup_tlbie_va_range_lpid(unsigned long va,
-                                            unsigned long pid,
-                                            unsigned long lpid,
-                                            unsigned long ap)
- {
-       if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
-               asm volatile("ptesync" : : : "memory");
-               __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
-       }
-       if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
-               asm volatile("ptesync" : : : "memory");
-               __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
-       }
- }
  static inline void fixup_tlbie_pid(unsigned long pid)
  {
        /*
        }
  }
  
- static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
- {
-       /*
-        * We can use any address for the invalidation, pick one which is
-        * probably unused as an optimisation.
-        */
-       unsigned long va = ((1UL << 52) - 1);
-       if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
-               asm volatile("ptesync" : : : "memory");
-               __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
-       }
-       if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
-               asm volatile("ptesync" : : : "memory");
-               __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
-                               RIC_FLUSH_TLB);
-       }
- }
  static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
                                       unsigned long ap)
  {
@@@ -416,31 -348,6 +348,6 @@@ static inline void _tlbie_pid(unsigned 
        asm volatile("eieio; tlbsync; ptesync": : :"memory");
  }
  
- static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
-                                  unsigned long ric)
- {
-       asm volatile("ptesync" : : : "memory");
-       /*
-        * Workaround the fact that the "ric" argument to __tlbie_pid
-        * must be a compile-time contraint to match the "i" constraint
-        * in the asm statement.
-        */
-       switch (ric) {
-       case RIC_FLUSH_TLB:
-               __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
-               fixup_tlbie_pid_lpid(pid, lpid);
-               break;
-       case RIC_FLUSH_PWC:
-               __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
-               break;
-       case RIC_FLUSH_ALL:
-       default:
-               __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
-               fixup_tlbie_pid_lpid(pid, lpid);
-       }
-       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
- }
  struct tlbiel_pid {
        unsigned long pid;
        unsigned long ric;
@@@ -566,20 -473,6 +473,6 @@@ static inline void __tlbie_va_range(uns
        fixup_tlbie_va_range(addr - page_size, pid, ap);
  }
  
- static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
-                                        unsigned long pid, unsigned long lpid,
-                                        unsigned long page_size,
-                                        unsigned long psize)
- {
-       unsigned long addr;
-       unsigned long ap = mmu_get_ap(psize);
-       for (addr = start; addr < end; addr += page_size)
-               __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
-       fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
- }
  static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
                                      unsigned long psize, unsigned long ric)
  {
@@@ -660,18 -553,6 +553,6 @@@ static inline void _tlbie_va_range(unsi
        asm volatile("eieio; tlbsync; ptesync": : :"memory");
  }
  
- static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
-                                       unsigned long pid, unsigned long lpid,
-                                       unsigned long page_size,
-                                       unsigned long psize, bool also_pwc)
- {
-       asm volatile("ptesync" : : : "memory");
-       if (also_pwc)
-               __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
-       __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
-       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
- }
  static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
                                unsigned long start, unsigned long end,
                                unsigned long pid, unsigned long page_size,
@@@ -820,7 -701,7 +701,7 @@@ void exit_lazy_flush_tlb(struct mm_stru
         * that's what the caller expects.
         */
        if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
-               atomic_dec(&mm->context.active_cpus);
+               dec_mm_active_cpus(mm);
                cpumask_clear_cpu(cpu, mm_cpumask(mm));
                always_flush = true;
        }
@@@ -987,7 -868,6 +868,7 @@@ void radix__flush_tlb_mm(struct mm_stru
                }
        }
        preempt_enable();
 +      mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
  }
  EXPORT_SYMBOL(radix__flush_tlb_mm);
  
@@@ -1021,7 -901,6 +902,7 @@@ static void __flush_all_mm(struct mm_st
                        _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
        }
        preempt_enable();
 +      mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
  }
  
  void radix__flush_all_mm(struct mm_struct *mm)
@@@ -1230,7 -1109,6 +1111,7 @@@ static inline void __radix__flush_tlb_r
        }
  out:
        preempt_enable();
 +      mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
  }
  
  void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@@ -1316,7 -1194,35 +1197,35 @@@ void radix__tlb_flush(struct mmu_gathe
         * See the comment for radix in arch_exit_mmap().
         */
        if (tlb->fullmm) {
-               __flush_all_mm(mm, true);
+               if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+                       /*
+                        * Shootdown based lazy tlb mm refcounting means we
+                        * have to IPI everyone in the mm_cpumask anyway soon
+                        * when the mm goes away, so might as well do it as
+                        * part of the final flush now.
+                        *
+                        * If lazy shootdown was improved to reduce IPIs (e.g.,
+                        * by batching), then it may end up being better to use
+                        * tlbies here instead.
+                        */
+                       preempt_disable();
+                       smp_mb(); /* see radix__flush_tlb_mm */
+                       exit_flush_lazy_tlbs(mm);
+                       _tlbiel_pid(mm->context.id, RIC_FLUSH_ALL);
+                       /*
+                        * It should not be possible to have coprocessors still
+                        * attached here.
+                        */
+                       if (WARN_ON_ONCE(atomic_read(&mm->context.copros) > 0))
+                               __flush_all_mm(mm, true);
+                       preempt_enable();
+               } else {
+                       __flush_all_mm(mm, true);
+               }
        } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
                if (!tlb->freed_tables)
                        radix__flush_tlb_mm(mm);
@@@ -1395,7 -1301,6 +1304,7 @@@ static void __radix__flush_tlb_range_ps
        }
  out:
        preempt_enable();
 +      mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
  }
  
  void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
@@@ -1465,13 -1370,6 +1374,13 @@@ void radix__flush_pmd_tlb_range(struct 
  }
  EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
  
 +void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
 +                              unsigned long start, unsigned long end)
 +{
 +      radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
 +}
 +EXPORT_SYMBOL(radix__flush_pud_tlb_range);
 +
  void radix__flush_tlb_all(void)
  {
        unsigned long rb,prs,r,rs;
  }
  
  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ static __always_inline void __tlbie_pid_lpid(unsigned long pid,
+                                            unsigned long lpid,
+                                            unsigned long ric)
+ {
+       unsigned long rb, rs, prs, r;
+       rb = PPC_BIT(53); /* IS = 1 */
+       rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(0, 0, rb, rs, ric, prs, r);
+ }
+ static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
+                                           unsigned long lpid,
+                                           unsigned long ap, unsigned long ric)
+ {
+       unsigned long rb, rs, prs, r;
+       rb = va & ~(PPC_BITMASK(52, 63));
+       rb |= ap << PPC_BITLSHIFT(58);
+       rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(0, 0, rb, rs, ric, prs, r);
+ }
+ static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
+ {
+       /*
+        * We can use any address for the invalidation, pick one which is
+        * probably unused as an optimisation.
+        */
+       unsigned long va = ((1UL << 52) - 1);
+       if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+               asm volatile("ptesync" : : : "memory");
+               __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+       }
+       if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+               asm volatile("ptesync" : : : "memory");
+               __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
+                               RIC_FLUSH_TLB);
+       }
+ }
+ static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
+                                  unsigned long ric)
+ {
+       asm volatile("ptesync" : : : "memory");
+       /*
+        * Work around the fact that the "ric" argument to __tlbie_pid
+        * must be a compile-time constraint to match the "i" constraint
+        * in the asm statement.
+        */
+       switch (ric) {
+       case RIC_FLUSH_TLB:
+               __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+               fixup_tlbie_pid_lpid(pid, lpid);
+               break;
+       case RIC_FLUSH_PWC:
+               __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+               break;
+       case RIC_FLUSH_ALL:
+       default:
+               __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+               fixup_tlbie_pid_lpid(pid, lpid);
+       }
+       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+ }
+ static inline void fixup_tlbie_va_range_lpid(unsigned long va,
+                                            unsigned long pid,
+                                            unsigned long lpid,
+                                            unsigned long ap)
+ {
+       if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+               asm volatile("ptesync" : : : "memory");
+               __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+       }
+       if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+               asm volatile("ptesync" : : : "memory");
+               __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
+       }
+ }
+ static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
+                                        unsigned long pid, unsigned long lpid,
+                                        unsigned long page_size,
+                                        unsigned long psize)
+ {
+       unsigned long addr;
+       unsigned long ap = mmu_get_ap(psize);
+       for (addr = start; addr < end; addr += page_size)
+               __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
+       fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
+ }
+ static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
+                                       unsigned long pid, unsigned long lpid,
+                                       unsigned long page_size,
+                                       unsigned long psize, bool also_pwc)
+ {
+       asm volatile("ptesync" : : : "memory");
+       if (also_pwc)
+               __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+       __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
+       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+ }
  /*
   * Performs process-scoped invalidations for a given LPID
   * as part of H_RPT_INVALIDATE hcall.
@@@ -40,6 -40,7 +40,7 @@@
  #include <linux/of_fdt.h>
  #include <linux/libfdt.h>
  #include <linux/memremap.h>
+ #include <linux/memory.h>
  
  #include <asm/pgalloc.h>
  #include <asm/page.h>
@@@ -92,7 -93,7 +93,7 @@@ static struct page * __meminit vmemmap_
   * a page table lookup here because with the hash translation we don't keep
   * vmemmap details in linux page table.
   */
 -static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
 +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
  {
        struct page *start;
        unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size;
@@@ -183,8 -184,8 +184,8 @@@ static __meminit int vmemmap_list_popul
        return 0;
  }
  
 -static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
 -                              unsigned long page_size)
 +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
 +                         unsigned long page_size)
  {
        unsigned long nr_pfn = page_size / sizeof(struct page);
        unsigned long start_pfn = page_to_pfn((struct page *)start);
        return false;
  }
  
 -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 -              struct vmem_altmap *altmap)
 +static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
 +                                      struct vmem_altmap *altmap)
  {
        bool altmap_alloc;
        unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
        return 0;
  }
  
 +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 +                             struct vmem_altmap *altmap)
 +{
 +
 +#ifdef CONFIG_PPC_BOOK3S_64
 +      if (radix_enabled())
 +              return radix__vmemmap_populate(start, end, node, altmap);
 +#endif
 +
 +      return __vmemmap_populate(start, end, node, altmap);
 +}
 +
  #ifdef CONFIG_MEMORY_HOTPLUG
  static unsigned long vmemmap_list_free(unsigned long start)
  {
        return vmem_back->phys;
  }
  
 -void __ref vmemmap_free(unsigned long start, unsigned long end,
 -              struct vmem_altmap *altmap)
 +static void __ref __vmemmap_free(unsigned long start, unsigned long end,
 +                               struct vmem_altmap *altmap)
  {
        unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
        unsigned long page_order = get_order(page_size);
        start = ALIGN_DOWN(start, page_size);
        if (altmap) {
                alt_start = altmap->base_pfn;
 -              alt_end = altmap->base_pfn + altmap->reserve +
 -                        altmap->free + altmap->alloc + altmap->align;
 +              alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
        }
  
        pr_debug("vmemmap_free %lx...%lx\n", start, end);
                vmemmap_remove_mapping(start, page_size);
        }
  }
 +
 +void __ref vmemmap_free(unsigned long start, unsigned long end,
 +                      struct vmem_altmap *altmap)
 +{
 +#ifdef CONFIG_PPC_BOOK3S_64
 +      if (radix_enabled())
 +              return radix__vmemmap_free(start, end, altmap);
 +#endif
 +      return __vmemmap_free(start, end, altmap);
 +}
 +
  #endif
  void register_page_bootmem_memmap(unsigned long section_nr,
                                  struct page *start_page, unsigned long size)
@@@ -493,6 -472,130 +494,130 @@@ static int __init dt_scan_mmu_pid_width
        return 1;
  }
  
+ /*
+  * Outside hotplug the kernel uses this value to map the kernel direct map
+  * with radix. To be compatible with older kernels, let's keep this value
+  * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map
+  * things with 1GB size in the case where we don't support hotplug.
+  */
+ #ifndef CONFIG_MEMORY_HOTPLUG
+ #define DEFAULT_MEMORY_BLOCK_SIZE     SZ_16M
+ #else
+ #define DEFAULT_MEMORY_BLOCK_SIZE     MIN_MEMORY_BLOCK_SIZE
+ #endif
+ static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size)
+ {
+       unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE;
+       for (; *block_size > min_memory_block_size; *block_size >>= 2) {
+               if ((mem_size & *block_size) == 0)
+                       break;
+       }
+ }
+ static int __init probe_memory_block_size(unsigned long node, const char *uname, int
+                                         depth, void *data)
+ {
+       const char *type;
+       unsigned long *block_size = (unsigned long *)data;
+       const __be32 *reg, *endp;
+       int l;
+       if (depth != 1)
+               return 0;
+       /*
+        * If we have dynamic-reconfiguration-memory node, use the
+        * lmb value.
+        */
+       if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
+               const __be32 *prop;
+               prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l);
+               if (!prop || l < dt_root_size_cells * sizeof(__be32))
+                       /*
+                        * Nothing in the device tree
+                        */
+                       *block_size = DEFAULT_MEMORY_BLOCK_SIZE;
+               else
+                       *block_size = of_read_number(prop, dt_root_size_cells);
+               /*
+                * We have found the final value. Don't probe further.
+                */
+               return 1;
+       }
+       /*
+        * Find all the device tree nodes of memory type and make sure
+        * the area can be mapped using the memory block size value
+        * we end up using. We start with 1G value and keep reducing
+        * we end up using. We start with a 1G value and keep reducing
+        * it such that we can map the entire area using memory_block_size.
+        * This will be used on powernv and older pseries that don't
+        * have the ibm,lmb-size node.
+        * For example, with P5 we can end up with
+        * memory@0 -> 128MB
+        * memory@128M -> 64M
+        * This will end up using a 64MB memory block size value.
+       type = of_get_flat_dt_prop(node, "device_type", NULL);
+       if (type == NULL || strcmp(type, "memory") != 0)
+               return 0;
+       reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
+       if (!reg)
+               reg = of_get_flat_dt_prop(node, "reg", &l);
+       if (!reg)
+               return 0;
+       endp = reg + (l / sizeof(__be32));
+       while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+               const char *compatible;
+               u64 size;
+               dt_mem_next_cell(dt_root_addr_cells, &reg);
+               size = dt_mem_next_cell(dt_root_size_cells, &reg);
+               if (size) {
+                       update_memory_block_size(block_size, size);
+                       continue;
+               }
+               /*
+                * ibm,coherent-device-memory with linux,usable-memory = 0
+               * Force 256MiB block size. Workaround for GPUs on P9 PowerNV.
+               * linux,usable-memory == 0 implies driver-managed memory and
+               * we can't use a large memory block size due to hotplug/unplug
+               * limitations.
+                */
+               compatible = of_get_flat_dt_prop(node, "compatible", NULL);
+               if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) {
+                       if (*block_size > SZ_256M)
+                               *block_size = SZ_256M;
+                       /*
+                        * We keep 256M as the upper limit with GPU present.
+                        */
+                       return 0;
+               }
+       }
+       /* continue looking for other memory device types */
+       return 0;
+ }
+ /*
+  * Start with a 1G memory block size. Early init will
+  * fix this up with the correct value.
+  */
+ unsigned long memory_block_size __ro_after_init = 1UL << 30;
+ static void __init early_init_memory_block_size(void)
+ {
+       /*
+        * We need to probe memory_block_size early so that
+        * radix__early_init_mmu() can use this as the limit for
+        * the mapping page size.
+        */
+       of_scan_flat_dt(probe_memory_block_size, &memory_block_size);
+ }
  void __init mmu_early_init_devtree(void)
  {
        bool hvmode = !!(mfmsr() & MSR_HV);
        if (!hvmode)
                early_check_vec5();
  
+       early_init_memory_block_size();
        if (early_radix_enabled()) {
                radix__early_init_devtree();
  
@@@ -7,12 -7,13 +7,12 @@@
   */
  
  #include <linux/init.h>
 -#include <linux/fs_enet_pd.h>
  #include <linux/of_platform.h>
  
  #include <asm/time.h>
  #include <asm/machdep.h>
  #include <asm/cpm1.h>
- #include <asm/fs_pd.h>
+ #include <asm/8xx_immap.h>
  #include <asm/udbg.h>
  
  #include "mpc8xx.h"
@@@ -21,6 -21,7 +21,6 @@@
  #include <linux/device.h>
  #include <linux/delay.h>
  
 -#include <linux/fs_enet_pd.h>
  #include <linux/fs_uart_pd.h>
  #include <linux/fsl_devices.h>
  #include <linux/mii.h>
@@@ -36,7 -37,6 +36,6 @@@
  #include <asm/time.h>
  #include <asm/8xx_immap.h>
  #include <asm/cpm1.h>
- #include <asm/fs_pd.h>
  #include <asm/udbg.h>
  
  #include "mpc885ads.h"
@@@ -24,6 -24,7 +24,6 @@@
  #include <linux/device.h>
  #include <linux/delay.h>
  
 -#include <linux/fs_enet_pd.h>
  #include <linux/fs_uart_pd.h>
  #include <linux/fsl_devices.h>
  #include <linux/mii.h>
@@@ -38,7 -39,6 +38,6 @@@
  #include <asm/time.h>
  #include <asm/8xx_immap.h>
  #include <asm/cpm1.h>
- #include <asm/fs_pd.h>
  #include <asm/udbg.h>
  
  #include "mpc8xx.h"
@@@ -94,7 -94,6 +94,7 @@@ config PPC_BOOK3S_6
        select PPC_FPU
        select PPC_HAVE_PMU_SUPPORT
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 +      select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
        select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
        select ARCH_ENABLE_SPLIT_PMD_PTLOCK
        select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
@@@ -276,6 -275,13 +276,13 @@@ config TARGET_CP
        default "e500mc" if E500MC_CPU
        default "powerpc" if POWERPC_CPU
  
+ config TUNE_CPU
+       string
+       depends on POWERPC64_CPU
+       default "-mtune=power10" if $(cc-option,-mtune=power10)
+       default "-mtune=power9"  if $(cc-option,-mtune=power9)
+       default "-mtune=power8"  if $(cc-option,-mtune=power8)
  config PPC_BOOK3S
        def_bool y
        depends on PPC_BOOK3S_32 || PPC_BOOK3S_64
  #include <asm/drmem.h>
  #include "pseries.h"
  
- unsigned long pseries_memory_block_size(void)
- {
-       struct device_node *np;
-       u64 memblock_size = MIN_MEMORY_BLOCK_SIZE;
-       struct resource r;
-       np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-       if (np) {
-               int len;
-               int size_cells;
-               const __be32 *prop;
-               size_cells = of_n_size_cells(np);
-               prop = of_get_property(np, "ibm,lmb-size", &len);
-               if (prop && len >= size_cells * sizeof(__be32))
-                       memblock_size = of_read_number(prop, size_cells);
-               of_node_put(np);
-       } else  if (machine_is(pseries)) {
-               /* This fallback really only applies to pseries */
-               unsigned int memzero_size = 0;
-               np = of_find_node_by_path("/memory@0");
-               if (np) {
-                       if (!of_address_to_resource(np, 0, &r))
-                               memzero_size = resource_size(&r);
-                       of_node_put(np);
-               }
-               if (memzero_size) {
-                       /* We now know the size of memory@0, use this to find
-                        * the first memoryblock and get its size.
-                        */
-                       char buf[64];
-                       sprintf(buf, "/memory@%x", memzero_size);
-                       np = of_find_node_by_path(buf);
-                       if (np) {
-                               if (!of_address_to_resource(np, 0, &r))
-                                       memblock_size = resource_size(&r);
-                               of_node_put(np);
-                       }
-               }
-       }
-       return memblock_size;
- }
  static void dlpar_free_property(struct property *prop)
  {
        kfree(prop->name);
@@@ -283,7 -235,7 +235,7 @@@ static int dlpar_offline_lmb(struct drm
  
  static int pseries_remove_memblock(unsigned long base, unsigned long memblock_size)
  {
-       unsigned long block_sz, start_pfn;
+       unsigned long start_pfn;
        int sections_per_block;
        int i;
  
        if (!pfn_valid(start_pfn))
                goto out;
  
-       block_sz = pseries_memory_block_size();
-       sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+       sections_per_block = memory_block_size / MIN_MEMORY_BLOCK_SIZE;
  
        for (i = 0; i < sections_per_block; i++) {
                __remove_memory(base, MIN_MEMORY_BLOCK_SIZE);
@@@ -354,7 -305,6 +305,6 @@@ static int dlpar_add_lmb(struct drmem_l
  static int dlpar_remove_lmb(struct drmem_lmb *lmb)
  {
        struct memory_block *mem_block;
-       unsigned long block_sz;
        int rc;
  
        if (!lmb_is_removable(lmb))
                return rc;
        }
  
-       block_sz = pseries_memory_block_size();
-       __remove_memory(lmb->base_addr, block_sz);
+       __remove_memory(lmb->base_addr, memory_block_size);
        put_device(&mem_block->dev);
  
        /* Update memory regions for memory remove */
-       memblock_remove(lmb->base_addr, block_sz);
+       memblock_remove(lmb->base_addr, memory_block_size);
  
        invalidate_lmb_associativity_index(lmb);
        lmb->flags &= ~DRCONF_MEM_ASSIGNED;
@@@ -637,7 -585,7 +585,7 @@@ static int dlpar_add_lmb(struct drmem_l
                nid = first_online_node;
  
        /* Add the memory */
 -      rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
 +      rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY);
        if (rc) {
                invalidate_lmb_associativity_index(lmb);
                return rc;
  #include <linux/device.h>
  #include <linux/platform_device.h>
  #include <linux/of.h>
- #include <linux/of_platform.h>
  #include <linux/phy.h>
  #include <linux/spi/spi.h>
  #include <linux/fsl_devices.h>
 -#include <linux/fs_enet_pd.h>
  #include <linux/fs_uart_pd.h>
  #include <linux/reboot.h>
  
@@@ -36,6 -36,8 +35,6 @@@
  #include <asm/cpm2.h>
  #include <asm/fsl_hcalls.h>   /* For the Freescale hypervisor */
  
 -extern void init_fcc_ioports(struct fs_platform_info*);
 -extern void init_fec_ioports(struct fs_platform_info*);
  extern void init_smc_ioports(struct fs_uart_platform_info*);
  static phys_addr_t immrbase = -1;
  
diff --combined arch/powerpc/xmon/xmon.c
@@@ -58,6 -58,7 +58,7 @@@
  #ifdef CONFIG_PPC64
  #include <asm/hvcall.h>
  #include <asm/paca.h>
+ #include <asm/lppaca.h>
  #endif
  
  #include "nonstdio.h"
@@@ -1084,7 -1085,7 +1085,7 @@@ cmds(struct pt_regs *excp
                                memzcan();
                                break;
                        case 'i':
 -                              show_mem(0, NULL);
 +                              show_mem();
                                break;
                        default:
                                termch = cmd;
@@@ -3303,7 -3304,7 +3304,7 @@@ static void show_pte(unsigned long addr
  {
        unsigned long tskv = 0;
        struct task_struct *volatile tsk = NULL;
-       struct mm_struct *mm;
+       struct mm_struct *volatile mm;
        pgd_t *pgdp;
        p4d_t *p4dp;
        pud_t *pudp;
@@@ -3828,9 -3829,9 +3829,9 @@@ static void dump_tlb_44x(void
  #ifdef CONFIG_PPC_BOOK3E_64
  static void dump_tlb_book3e(void)
  {
-       u32 mmucfg, pidmask, lpidmask;
+       u32 mmucfg;
        u64 ramask;
-       int i, tlb, ntlbs, pidsz, lpidsz, rasz, lrat = 0;
+       int i, tlb, ntlbs, pidsz, lpidsz, rasz;
        int mmu_version;
        static const char *pgsz_names[] = {
                "  1K",
        pidsz = ((mmucfg >> 6) & 0x1f) + 1;
        lpidsz = (mmucfg >> 24) & 0xf;
        rasz = (mmucfg >> 16) & 0x7f;
-       if ((mmu_version > 1) && (mmucfg & 0x10000))
-               lrat = 1;
        printf("Book3E MMU MAV=%d.0,%d TLBs,%d-bit PID,%d-bit LPID,%d-bit RA\n",
               mmu_version, ntlbs, pidsz, lpidsz, rasz);
-       pidmask = (1ul << pidsz) - 1;
-       lpidmask = (1ul << lpidsz) - 1;
        ramask = (1ull << rasz) - 1;
  
        for (tlb = 0; tlb < ntlbs; tlb++) {
@@@ -6,8 -6,10 +6,9 @@@
  #include <linux/input.h>
  #include <linux/kthread.h>
  #include <linux/mutex.h>
+ #include <linux/platform_device.h>
  #include <linux/spinlock.h>
  #include <linux/types.h>
 -#include <linux/of_device.h>
  
  enum ams_irq {
        AMS_IRQ_FREEFALL = 0x01,
@@@ -2,7 -2,6 +2,7 @@@
  #ifndef FS_ENET_H
  #define FS_ENET_H
  
 +#include <linux/clk.h>
  #include <linux/mii.h>
  #include <linux/netdevice.h>
  #include <linux/types.h>
@@@ -10,8 -9,8 +10,6 @@@
  #include <linux/phy.h>
  #include <linux/dma-mapping.h>
  
- #include <asm/fs_pd.h>
 -#include <linux/fs_enet_pd.h>
--
  #ifdef CONFIG_CPM1
  #include <asm/cpm1.h>
  #endif
@@@ -118,23 -117,6 +116,23 @@@ struct phy_info 
  #define ENET_RX_ALIGN  16
  #define ENET_RX_FRSIZE L1_CACHE_ALIGN(PKT_MAXBUF_SIZE + ENET_RX_ALIGN - 1)
  
 +struct fs_platform_info {
 +      /* device specific information */
 +      u32 cp_command;         /* CPM page/sblock/mcn */
 +
 +      u32 dpram_offset;
 +
 +      struct device_node *phy_node;
 +
 +      int rx_ring, tx_ring;   /* number of buffers on rx      */
 +      int rx_copybreak;       /* limit we copy small frames   */
 +      int napi_weight;        /* NAPI weight                  */
 +
 +      int use_rmii;           /* use RMII mode                */
 +
 +      struct clk *clk_per;    /* 'per' clock for register access */
 +};
 +
  struct fs_enet_private {
        struct napi_struct napi;
        struct device *dev;     /* pointer back to the device (must be initialized first) */
@@@ -209,6 -191,11 +207,6 @@@ void fs_cleanup_bds(struct net_device *
  #define PFX DRV_MODULE_NAME   ": "
  
  /***************************************************************************/
 -
 -int fs_enet_platform_init(void);
 -void fs_enet_platform_cleanup(void);
 -
 -/***************************************************************************/
  /* buffer descriptor access macros */
  
  /* access macros */
  #include <linux/platform_device.h>
  #include <linux/phy.h>
  #include <linux/of_address.h>
 -#include <linux/of_device.h>
  #include <linux/of_irq.h>
  #include <linux/gfp.h>
  #include <linux/pgtable.h>
  
  #include <asm/immap_cpm2.h>
- #include <asm/mpc8260.h>
  #include <asm/cpm2.h>
  
  #include <asm/irq.h>
@@@ -105,7 -105,7 +104,7 @@@ static int do_pd_setup(struct fs_enet_p
                goto out_ep;
  
        fep->fcc.mem = (void __iomem *)cpm2_immr;
 -      fpi->dpram_offset = cpm_dpalloc(128, 32);
 +      fpi->dpram_offset = cpm_muram_alloc(128, 32);
        if (IS_ERR_VALUE(fpi->dpram_offset)) {
                ret = fpi->dpram_offset;
                goto out_fcccp;
@@@ -547,7 -547,7 +546,7 @@@ static void tx_restart(struct net_devic
        }
        /* Now update the TBPTR and dirty flag to the current buffer */
        W32(ep, fen_genfcc.fcc_tbptr,
 -              (uint) (((void *)recheck_bd - fep->ring_base) +
 +              (uint)(((void __iomem *)recheck_bd - fep->ring_base) +
                fep->ring_mem_addr));
        fep->dirty_tx = recheck_bd;