Merge tag '5.13-rc-smb3-part3' of git://git.samba.org/sfrench/cifs-2.6
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 9 May 2021 20:19:29 +0000 (13:19 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 9 May 2021 20:19:29 +0000 (13:19 -0700)
Pull cifs fixes from Steve French:
 "Three small SMB3 chmultichannel related changesets (also for stable)
  from the SMB3 test event this week.

  The other fixes are still in review/testing"

* tag '5.13-rc-smb3-part3' of git://git.samba.org/sfrench/cifs-2.6:
  smb3: if max_channels set to more than one channel request multichannel
  smb3: do not attempt multichannel to server which does not support it
  smb3: when mounting with multichannel include it in requested capabilities

1001 files changed:
.gitignore
CREDITS
Documentation/ABI/testing/sysfs-class-net-qmi
Documentation/ABI/testing/sysfs-devices-system-cpu
Documentation/ABI/testing/sysfs-driver-input-exc3000
Documentation/ABI/testing/sysfs-kernel-mm-cma [new file with mode: 0644]
Documentation/admin-guide/devices.txt
Documentation/admin-guide/kernel-parameters.txt
Documentation/admin-guide/mm/memory-hotplug.rst
Documentation/admin-guide/mm/userfaultfd.rst
Documentation/admin-guide/reporting-issues.rst
Documentation/arm64/booting.rst
Documentation/arm64/elf_hwcaps.rst
Documentation/arm64/tagged-address-abi.rst
Documentation/core-api/symbol-namespaces.rst
Documentation/dev-tools/gdb-kernel-debugging.rst
Documentation/devicetree/bindings/.gitignore
Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml
Documentation/devicetree/bindings/display/renesas,du.yaml
Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml
Documentation/devicetree/bindings/input/atmel,maxtouch.yaml
Documentation/devicetree/bindings/input/iqs626a.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt [deleted file]
Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/mms114.txt [deleted file]
Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/interrupt-controller/idt,32434-pic.yaml
Documentation/devicetree/bindings/media/renesas,vin.yaml
Documentation/devicetree/bindings/mtd/tango-nand.txt [deleted file]
Documentation/devicetree/bindings/net/renesas,etheravb.yaml
Documentation/devicetree/bindings/pci/rcar-pci-host.yaml
Documentation/devicetree/bindings/pci/tango-pcie.txt [deleted file]
Documentation/devicetree/bindings/riscv/microchip.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/serial/8250.yaml
Documentation/devicetree/bindings/vendor-prefixes.yaml
Documentation/driver-api/vfio.rst
Documentation/firmware-guide/acpi/gpio-properties.rst
Documentation/input/devices/rotary-encoder.rst
Documentation/input/joydev/joystick-api.rst
Documentation/input/joydev/joystick.rst
Documentation/process/changes.rst
Documentation/riscv/index.rst
Documentation/riscv/vm-layout.rst [new file with mode: 0644]
Documentation/scheduler/sched-domains.rst
Documentation/translations/it_IT/process/changes.rst
Documentation/translations/zh_CN/index.rst
Documentation/x86/x86_64/5level-paging.rst
MAINTAINERS
Makefile
arch/.gitignore
arch/alpha/include/asm/io.h
arch/alpha/kernel/pc873xx.c
arch/alpha/lib/csum_partial_copy.c
arch/arc/Kconfig
arch/arm/Kconfig
arch/arm/boot/compressed/Makefile
arch/arm/configs/dove_defconfig
arch/arm/configs/footbridge_defconfig
arch/arm/configs/magician_defconfig
arch/arm/configs/moxart_defconfig
arch/arm/configs/mps2_defconfig
arch/arm/configs/mvebu_v5_defconfig
arch/arm/configs/xcep_defconfig
arch/arm/include/asm/bug.h
arch/arm/include/asm/io.h
arch/arm/include/asm/kexec.h
arch/arm/include/asm/memory.h
arch/arm/include/asm/set_memory.h
arch/arm/include/uapi/asm/Kbuild
arch/arm/include/uapi/asm/unistd.h
arch/arm/kernel/asm-offsets.c
arch/arm/kernel/entry-common.S
arch/arm/kernel/hw_breakpoint.c
arch/arm/kernel/machine_kexec.c
arch/arm/kernel/process.c
arch/arm/kernel/smccc-call.S
arch/arm/kernel/suspend.c
arch/arm/kernel/traps.c
arch/arm/mach-footbridge/Kconfig
arch/arm/mach-footbridge/Makefile
arch/arm/mach-footbridge/personal-pci.c [deleted file]
arch/arm/mach-footbridge/personal.c [deleted file]
arch/arm/mm/cache-v7.S
arch/arm/mm/dump.c
arch/arm/mm/init.c
arch/arm/mm/proc-v7.S
arch/arm/mm/ptdump_debugfs.c
arch/arm/probes/kprobes/test-arm.c
arch/arm/probes/kprobes/test-core.h
arch/arm/tools/Makefile
arch/arm/tools/syscallhdr.sh [deleted file]
arch/arm/tools/syscalltbl.sh [deleted file]
arch/arm64/Kconfig
arch/arm64/include/asm/daifflags.h
arch/arm64/include/asm/kernel-pgtable.h
arch/arm64/include/asm/memory.h
arch/arm64/include/asm/sparsemem.h
arch/arm64/kernel/alternative.c
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/cpuidle.c
arch/arm64/kernel/entry-common.c
arch/arm64/kernel/entry.S
arch/arm64/kernel/process.c
arch/arm64/kernel/stacktrace.c
arch/arm64/kernel/vdso/vdso.lds.S
arch/arm64/kernel/vdso32/Makefile
arch/arm64/mm/hugetlbpage.c
arch/arm64/mm/init.c
arch/arm64/mm/mmu.c
arch/arm64/mm/ptdump.c
arch/h8300/include/asm/bitops.h
arch/hexagon/Makefile
arch/hexagon/configs/comet_defconfig
arch/hexagon/include/asm/futex.h
arch/hexagon/include/asm/io.h
arch/hexagon/include/asm/timex.h
arch/hexagon/kernel/hexagon_ksyms.c
arch/hexagon/kernel/ptrace.c
arch/hexagon/lib/Makefile
arch/hexagon/lib/divsi3.S [new file with mode: 0644]
arch/hexagon/lib/memcpy_likely_aligned.S [new file with mode: 0644]
arch/hexagon/lib/modsi3.S [new file with mode: 0644]
arch/hexagon/lib/udivsi3.S [new file with mode: 0644]
arch/hexagon/lib/umodsi3.S [new file with mode: 0644]
arch/ia64/Kconfig
arch/ia64/include/asm/io.h
arch/ia64/include/asm/uaccess.h
arch/ia64/mm/hugetlbpage.c
arch/m68k/Makefile
arch/m68k/atari/time.c
arch/m68k/configs/amcore_defconfig
arch/m68k/include/asm/bitops.h
arch/m68k/include/asm/io_mm.h
arch/mips/Kconfig
arch/mips/Makefile
arch/mips/include/asm/io.h
arch/mips/mm/hugetlbpage.c
arch/openrisc/configs/or1ksim_defconfig
arch/parisc/Kconfig
arch/parisc/Makefile
arch/parisc/include/asm/io.h
arch/parisc/include/asm/pdc_chassis.h
arch/parisc/mm/hugetlbpage.c
arch/powerpc/Kconfig
arch/powerpc/boot/wrapper
arch/powerpc/include/asm/io.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/pci-bridge.h
arch/powerpc/include/asm/pci.h
arch/powerpc/kernel/module.c
arch/powerpc/kexec/file_load_64.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/powerpc/lib/Makefile
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/platforms/Kconfig.cputype
arch/powerpc/platforms/powernv/Makefile
arch/powerpc/platforms/powernv/memtrace.c
arch/powerpc/platforms/powernv/npu-dma.c [deleted file]
arch/powerpc/platforms/powernv/opal-call.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci.c
arch/powerpc/platforms/powernv/pci.h
arch/powerpc/platforms/pseries/pci.c
arch/riscv/Kconfig
arch/riscv/Kconfig.erratas [new file with mode: 0644]
arch/riscv/Kconfig.socs
arch/riscv/Makefile
arch/riscv/boot/Makefile
arch/riscv/boot/dts/Makefile
arch/riscv/boot/dts/microchip/Makefile [new file with mode: 0644]
arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts [new file with mode: 0644]
arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi [new file with mode: 0644]
arch/riscv/boot/loader.lds.S
arch/riscv/configs/defconfig
arch/riscv/errata/Makefile [new file with mode: 0644]
arch/riscv/errata/alternative.c [new file with mode: 0644]
arch/riscv/errata/sifive/Makefile [new file with mode: 0644]
arch/riscv/errata/sifive/errata.c [new file with mode: 0644]
arch/riscv/errata/sifive/errata_cip_453.S [new file with mode: 0644]
arch/riscv/include/asm/alternative-macros.h [new file with mode: 0644]
arch/riscv/include/asm/alternative.h [new file with mode: 0644]
arch/riscv/include/asm/asm.h
arch/riscv/include/asm/csr.h
arch/riscv/include/asm/elf.h
arch/riscv/include/asm/errata_list.h [new file with mode: 0644]
arch/riscv/include/asm/ftrace.h
arch/riscv/include/asm/kexec.h [new file with mode: 0644]
arch/riscv/include/asm/page.h
arch/riscv/include/asm/pgtable.h
arch/riscv/include/asm/sbi.h
arch/riscv/include/asm/sections.h
arch/riscv/include/asm/set_memory.h
arch/riscv/include/asm/smp.h
arch/riscv/include/asm/string.h
arch/riscv/include/asm/syscall.h
arch/riscv/include/asm/tlbflush.h
arch/riscv/include/asm/vendorid_list.h [new file with mode: 0644]
arch/riscv/kernel/Makefile
arch/riscv/kernel/crash_dump.c [new file with mode: 0644]
arch/riscv/kernel/crash_save_regs.S [new file with mode: 0644]
arch/riscv/kernel/entry.S
arch/riscv/kernel/head.S
arch/riscv/kernel/head.h
arch/riscv/kernel/kexec_relocate.S [new file with mode: 0644]
arch/riscv/kernel/machine_kexec.c [new file with mode: 0644]
arch/riscv/kernel/mcount.S
arch/riscv/kernel/module.c
arch/riscv/kernel/probes/kprobes.c
arch/riscv/kernel/sbi.c
arch/riscv/kernel/setup.c
arch/riscv/kernel/smp.c
arch/riscv/kernel/smpboot.c
arch/riscv/kernel/syscall_table.c
arch/riscv/kernel/time.c
arch/riscv/kernel/traps.c
arch/riscv/kernel/vdso.c
arch/riscv/kernel/vdso/Makefile
arch/riscv/kernel/vmlinux-xip.lds.S [new file with mode: 0644]
arch/riscv/kernel/vmlinux.lds.S
arch/riscv/mm/fault.c
arch/riscv/mm/init.c
arch/riscv/mm/kasan_init.c
arch/riscv/mm/physaddr.c
arch/riscv/mm/ptdump.c
arch/riscv/net/bpf_jit_comp64.c
arch/riscv/net/bpf_jit_core.c
arch/s390/Kconfig
arch/s390/configs/debug_defconfig
arch/s390/configs/defconfig
arch/s390/include/asm/cpu_mcf.h
arch/s390/include/asm/entry-common.h
arch/s390/include/asm/io.h
arch/s390/include/asm/pci.h
arch/s390/kernel/perf_cpum_cf.c
arch/s390/kernel/perf_cpum_cf_common.c
arch/s390/kernel/perf_cpum_cf_diag.c
arch/s390/kernel/setup.c
arch/s390/kernel/syscall.c
arch/s390/kernel/traps.c
arch/s390/mm/hugetlbpage.c
arch/s390/pci/pci.c
arch/s390/pci/pci_event.c
arch/sh/Kconfig
arch/sh/Makefile
arch/sh/configs/edosk7705_defconfig
arch/sh/configs/se7206_defconfig
arch/sh/configs/sh2007_defconfig
arch/sh/configs/sh7724_generic_defconfig
arch/sh/configs/sh7770_generic_defconfig
arch/sh/configs/sh7785lcr_32bit_defconfig
arch/sh/include/asm/bitops.h
arch/sh/include/asm/io.h
arch/sh/kernel/syscalls/Makefile
arch/sh/kernel/syscalls/syscallhdr.sh [deleted file]
arch/sh/kernel/syscalls/syscalltbl.sh [deleted file]
arch/sh/mm/Kconfig
arch/sh/mm/hugetlbpage.c
arch/sparc/configs/sparc64_defconfig
arch/sparc/include/asm/Kbuild
arch/sparc/include/asm/io_64.h
arch/sparc/kernel/syscalls/Makefile
arch/sparc/kernel/syscalls/syscallhdr.sh [deleted file]
arch/sparc/kernel/syscalls/syscalltbl.sh [deleted file]
arch/sparc/kernel/systbls_32.S
arch/sparc/kernel/systbls_64.S
arch/sparc/mm/hugetlbpage.c
arch/um/drivers/cow.h
arch/x86/Kconfig
arch/x86/events/amd/iommu.c
arch/x86/include/asm/bug.h
arch/x86/include/asm/idtentry.h
arch/x86/include/asm/msr.h
arch/x86/include/asm/page_64.h
arch/x86/include/asm/page_64_types.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/resctrl/monitor.c
arch/x86/kernel/nmi.c
arch/x86/kernel/process.c
arch/x86/kernel/smpboot.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/mm/pat/set_memory.c
arch/xtensa/Makefile
arch/xtensa/configs/xip_kc705_defconfig
block/bio.c
block/blk-settings.c
certs/.gitignore
drivers/acpi/acpi_memhotplug.c
drivers/acpi/arm64/gtdt.c
drivers/acpi/custom_method.c
drivers/acpi/internal.h
drivers/acpi/irq.c
drivers/acpi/power.c
drivers/acpi/scan.c
drivers/acpi/sleep.h
drivers/atm/firestream.c
drivers/auxdisplay/panel.c
drivers/base/firmware_loader/main.c
drivers/base/memory.c
drivers/block/brd.c
drivers/block/loop.c
drivers/block/rnbd/rnbd-clt.c
drivers/block/rnbd/rnbd-clt.h
drivers/block/rnbd/rnbd-srv.c
drivers/char/Kconfig
drivers/char/mem.c
drivers/firmware/psci/psci.c
drivers/gpu/drm/i915/display/intel_dp.c
drivers/gpu/drm/qxl/qxl_drv.c
drivers/i3c/master.c
drivers/i3c/master/svc-i3c-master.c
drivers/infiniband/ulp/rtrs/rtrs-clt.c
drivers/input/Makefile
drivers/input/joystick/xpad.c
drivers/input/keyboard/gpio_keys.c
drivers/input/keyboard/imx_keypad.c
drivers/input/keyboard/tca6416-keypad.c
drivers/input/keyboard/tegra-kbc.c
drivers/input/misc/Kconfig
drivers/input/misc/Makefile
drivers/input/misc/ims-pcu.c
drivers/input/misc/iqs626a.c [new file with mode: 0644]
drivers/input/misc/max8997_haptic.c
drivers/input/mouse/elan_i2c.h
drivers/input/mouse/elan_i2c_core.c
drivers/input/serio/apbps2.c
drivers/input/touchscreen.c [moved from drivers/input/touchscreen/of_touchscreen.c with 93% similarity]
drivers/input/touchscreen/Kconfig
drivers/input/touchscreen/Makefile
drivers/input/touchscreen/ar1021_i2c.c
drivers/input/touchscreen/atmel_mxt_ts.c
drivers/input/touchscreen/bu21029_ts.c
drivers/input/touchscreen/cyttsp_core.c
drivers/input/touchscreen/cyttsp_core.h
drivers/input/touchscreen/elants_i2c.c
drivers/input/touchscreen/exc3000.c
drivers/input/touchscreen/hycon-hy46xx.c [new file with mode: 0644]
drivers/input/touchscreen/ili210x.c
drivers/input/touchscreen/ilitek_ts_i2c.c [new file with mode: 0644]
drivers/input/touchscreen/iqs5xx.c
drivers/input/touchscreen/lpc32xx_ts.c
drivers/input/touchscreen/melfas_mip4.c
drivers/input/touchscreen/mms114.c
drivers/input/touchscreen/msg2638.c [new file with mode: 0644]
drivers/input/touchscreen/silead.c
drivers/input/touchscreen/stmfts.c
drivers/input/touchscreen/tsc2007.h
drivers/input/touchscreen/tsc2007_core.c
drivers/input/touchscreen/wacom_i2c.c
drivers/input/touchscreen/wm831x-ts.c
drivers/input/touchscreen/zinitix.c
drivers/isdn/capi/kcapi_proc.c
drivers/md/bcache/super.c
drivers/media/usb/pwc/pwc-uncompress.c
drivers/memory/.gitignore
drivers/net/can/m_can/m_can.c
drivers/net/can/spi/mcp251x.c
drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
drivers/net/dsa/microchip/ksz8795_spi.c
drivers/net/dsa/microchip/ksz8863_smi.c
drivers/net/ethernet/adaptec/starfire.c
drivers/net/ethernet/amd/atarilance.c
drivers/net/ethernet/amd/pcnet32.c
drivers/net/ethernet/atheros/alx/main.c
drivers/net/ethernet/atheros/atl1c/atl1c_main.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
drivers/net/ethernet/brocade/bna/bnad.c
drivers/net/ethernet/cadence/macb_main.c
drivers/net/ethernet/chelsio/cxgb4/sge.c
drivers/net/ethernet/cisco/enic/enic_main.c
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
drivers/net/ethernet/intel/i40e/i40e.h
drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
drivers/net/ethernet/intel/i40e/i40e_client.c
drivers/net/ethernet/intel/i40e/i40e_common.c
drivers/net/ethernet/intel/i40e/i40e_ethtool.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/i40e/i40e_type.h
drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
drivers/net/ethernet/stmicro/stmmac/hwif.h
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/ipa/gsi.c
drivers/net/ipa/gsi_reg.h
drivers/net/phy/marvell.c
drivers/net/wan/hdlc_fr.c
drivers/net/wireless/intersil/hostap/hostap_proc.c
drivers/net/wireless/intersil/orinoco/orinoco_nortel.c
drivers/net/wireless/intersil/orinoco/orinoco_pci.c
drivers/net/wireless/intersil/orinoco/orinoco_plx.c
drivers/net/wireless/intersil/orinoco/orinoco_tmd.c
drivers/nvdimm/btt.c
drivers/nvdimm/pmem.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fabrics.h
drivers/nvme/host/fc.c
drivers/nvme/host/ioctl.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/loop.c
drivers/of/overlay.c
drivers/parport/parport_ip32.c
drivers/pci/hotplug/s390_pci_hpc.c
drivers/pcmcia/cistpl.c
drivers/pcmcia/ds.c
drivers/pcmcia/pcmcia_cis.c
drivers/pcmcia/pcmcia_resource.c
drivers/pcmcia/rsrc_nonstatic.c
drivers/pinctrl/pinctrl-ingenic.c
drivers/platform/chrome/cros_ec_lpc_mec.c
drivers/platform/chrome/cros_ec_typec.c
drivers/platform/chrome/cros_usbpd_notify.c
drivers/platform/chrome/wilco_ec/telemetry.c
drivers/platform/x86/dell/dell_rbu.c
drivers/s390/block/dasd_eckd.h
drivers/s390/cio/device.c
drivers/scsi/53c700.c
drivers/scsi/53c700.h
drivers/scsi/ch.c
drivers/scsi/esas2r/esas2r_main.c
drivers/scsi/fnic/fnic_scsi.c
drivers/scsi/ips.c
drivers/scsi/ips.h
drivers/scsi/lasi700.c
drivers/scsi/lpfc/lpfc_bsg.c
drivers/scsi/lpfc/lpfc_init.c
drivers/scsi/lpfc/lpfc_sli.c
drivers/scsi/megaraid/mbox_defs.h
drivers/scsi/megaraid/mega_common.h
drivers/scsi/megaraid/megaraid_mbox.c
drivers/scsi/megaraid/megaraid_mbox.h
drivers/scsi/qla1280.c
drivers/scsi/qla2xxx/qla_init.c
drivers/scsi/qla2xxx/qla_os.c
drivers/scsi/scsi_debug.c
drivers/scsi/scsicam.c
drivers/scsi/sni_53c710.c
drivers/scsi/ufs/ufs-sysfs.c
drivers/scsi/ufs/ufshcd.c
drivers/target/target_core_user.c
drivers/tty/vt/.gitignore
drivers/video/fbdev/matrox/matroxfb_base.c
drivers/video/fbdev/vga16fb.c
fs/9p/v9fs.c
fs/9p/vfs_file.c
fs/Kconfig
fs/block_dev.c
fs/btrfs/compression.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/btrfs/reflink.c
fs/btrfs/zlib.c
fs/btrfs/zstd.c
fs/buffer.c
fs/ceph/Kconfig
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/cache.h
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/io.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/metric.c
fs/ceph/metric.h
fs/ceph/snap.c
fs/ceph/super.h
fs/ceph/xattr.c
fs/configfs/configfs_internal.h
fs/configfs/dir.c
fs/configfs/file.c
fs/configfs/inode.c
fs/configfs/item.c
fs/configfs/mount.c
fs/configfs/symlink.c
fs/dax.c
fs/ecryptfs/crypto.c
fs/ecryptfs/debug.c
fs/ecryptfs/dentry.c
fs/ecryptfs/ecryptfs_kernel.h
fs/ecryptfs/file.c
fs/ecryptfs/inode.c
fs/ecryptfs/keystore.c
fs/ecryptfs/kthread.c
fs/ecryptfs/main.c
fs/ecryptfs/messaging.c
fs/ecryptfs/miscdev.c
fs/ecryptfs/mmap.c
fs/ecryptfs/read_write.c
fs/ecryptfs/super.c
fs/eventpoll.c
fs/fat/fatent.c
fs/gfs2/glock.c
fs/hpfs/hpfs.h
fs/hugetlbfs/inode.c
fs/inode.c
fs/io_uring.c
fs/iomap/buffered-io.c
fs/isofs/rock.c
fs/locks.c
fs/nfs/callback_proc.c
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/dir.c
fs/nfs/export.c
fs/nfs/file.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/fs_context.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/io.c
fs/nfs/mount_clnt.c
fs/nfs/nfs3acl.c
fs/nfs/nfs3xdr.c
fs/nfs/nfs42proc.c
fs/nfs/nfs42xattr.c
fs/nfs/nfs4file.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4renewd.c
fs/nfs/nfs4state.c
fs/nfs/nfs4trace.h
fs/nfs/nfs4xdr.c
fs/nfs/nfstrace.c
fs/nfs/nfstrace.h
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/proc.c
fs/nfs/super.c
fs/nfs/write.c
fs/nfsd/Kconfig
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfsctl.c
fs/nfsd/nfssvc.c
fs/nfsd/state.h
fs/nfsd/xdr4.h
fs/nilfs2/cpfile.c
fs/nilfs2/ioctl.c
fs/nilfs2/segment.c
fs/nilfs2/the_nilfs.c
fs/ocfs2/acl.c
fs/ocfs2/acl.h
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/blockcheck.c
fs/ocfs2/blockcheck.h
fs/ocfs2/buffer_head_io.c
fs/ocfs2/buffer_head_io.h
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/cluster/heartbeat.h
fs/ocfs2/cluster/masklog.c
fs/ocfs2/cluster/masklog.h
fs/ocfs2/cluster/netdebug.c
fs/ocfs2/cluster/nodemanager.c
fs/ocfs2/cluster/nodemanager.h
fs/ocfs2/cluster/ocfs2_heartbeat.h
fs/ocfs2/cluster/ocfs2_nodemanager.h
fs/ocfs2/cluster/quorum.c
fs/ocfs2/cluster/quorum.h
fs/ocfs2/cluster/sys.c
fs/ocfs2/cluster/sys.h
fs/ocfs2/cluster/tcp.c
fs/ocfs2/cluster/tcp.h
fs/ocfs2/cluster/tcp_internal.h
fs/ocfs2/dcache.c
fs/ocfs2/dcache.h
fs/ocfs2/dir.c
fs/ocfs2/dir.h
fs/ocfs2/dlm/dlmapi.h
fs/ocfs2/dlm/dlmast.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmconvert.c
fs/ocfs2/dlm/dlmconvert.h
fs/ocfs2/dlm/dlmdebug.c
fs/ocfs2/dlm/dlmdebug.h
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmdomain.h
fs/ocfs2/dlm/dlmlock.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlm/dlmthread.c
fs/ocfs2/dlm/dlmunlock.c
fs/ocfs2/dlmfs/dlmfs.c
fs/ocfs2/dlmfs/userdlm.c
fs/ocfs2/dlmfs/userdlm.h
fs/ocfs2/dlmglue.c
fs/ocfs2/dlmglue.h
fs/ocfs2/export.c
fs/ocfs2/export.h
fs/ocfs2/extent_map.c
fs/ocfs2/extent_map.h
fs/ocfs2/file.c
fs/ocfs2/file.h
fs/ocfs2/filecheck.c
fs/ocfs2/filecheck.h
fs/ocfs2/heartbeat.c
fs/ocfs2/heartbeat.h
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/localalloc.c
fs/ocfs2/localalloc.h
fs/ocfs2/locks.c
fs/ocfs2/locks.h
fs/ocfs2/mmap.c
fs/ocfs2/move_extents.c
fs/ocfs2/move_extents.h
fs/ocfs2/namei.c
fs/ocfs2/namei.h
fs/ocfs2/ocfs1_fs_compat.h
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/ocfs2_ioctl.h
fs/ocfs2/ocfs2_lockid.h
fs/ocfs2/ocfs2_lockingver.h
fs/ocfs2/refcounttree.c
fs/ocfs2/refcounttree.h
fs/ocfs2/reservations.c
fs/ocfs2/reservations.h
fs/ocfs2/resize.c
fs/ocfs2/resize.h
fs/ocfs2/slot_map.c
fs/ocfs2/slot_map.h
fs/ocfs2/stack_o2cb.c
fs/ocfs2/stack_user.c
fs/ocfs2/stackglue.c
fs/ocfs2/stackglue.h
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/ocfs2/super.h
fs/ocfs2/symlink.c
fs/ocfs2/symlink.h
fs/ocfs2/sysfile.c
fs/ocfs2/sysfile.h
fs/ocfs2/uptodate.c
fs/ocfs2/uptodate.h
fs/ocfs2/xattr.c
fs/ocfs2/xattr.h
fs/proc/generic.c
fs/proc/inode.c
fs/proc/proc_sysctl.c
fs/proc/task_mmu.c
fs/reiserfs/procfs.c
fs/unicode/.gitignore
fs/userfaultfd.c
fs/xfs/libxfs/xfs_ag_resv.c
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc_btree.c
fs/xfs/libxfs/xfs_log_format.h
fs/xfs/libxfs/xfs_rmap_btree.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/scrub/agheader.c
fs/xfs/scrub/fscounters.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_inode_item_recover.c
fs/xfs/xfs_log.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_ondisk.h
fs/xfs/xfs_reflink.c
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
include/asm-generic/bitops/find.h
include/asm-generic/bitops/le.h
include/asm-generic/bitsperlong.h
include/asm-generic/io.h
include/dt-bindings/input/atmel-maxtouch.h [new file with mode: 0644]
include/linux/align.h [new file with mode: 0644]
include/linux/async.h
include/linux/bio.h
include/linux/bitmap.h
include/linux/bitops.h
include/linux/blk-mq.h
include/linux/blkdev.h
include/linux/bpf_verifier.h
include/linux/buffer_head.h
include/linux/cma.h
include/linux/compaction.h
include/linux/compat.h
include/linux/configfs.h
include/linux/context_tracking.h
include/linux/crc8.h
include/linux/cred.h
include/linux/delayacct.h
include/linux/fs.h
include/linux/genl_magic_func.h
include/linux/genl_magic_struct.h
include/linux/gfp.h
include/linux/highmem.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/init_task.h
include/linux/initrd.h
include/linux/iomap.h
include/linux/kconfig.h
include/linux/kernel.h
include/linux/kvm_host.h
include/linux/memcontrol.h
include/linux/memory.h
include/linux/memory_hotplug.h
include/linux/memremap.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/netfilter_arp/arp_tables.h
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/pagemap.h
include/linux/pgtable.h
include/linux/platform_data/cros_ec_commands.h
include/linux/proc_fs.h
include/linux/profile.h
include/linux/sched.h
include/linux/sched/mm.h
include/linux/shrinker.h
include/linux/smp.h
include/linux/sunrpc/xprt.h
include/linux/swap.h
include/linux/userfaultfd_k.h
include/linux/vm_event_item.h
include/linux/vmalloc.h
include/linux/vtime.h
include/net/sctp/command.h
include/trace/events/cma.h
include/trace/events/migrate.h
include/trace/events/mmflags.h
include/trace/events/rpcrdma.h
include/trace/events/sunrpc.h
include/uapi/linux/if_bonding.h
include/uapi/linux/kexec.h
include/uapi/linux/mempolicy.h
include/uapi/linux/netfilter/xt_SECMARK.h
include/uapi/linux/nfs4.h
include/uapi/linux/seg6_local.h
include/uapi/linux/userfaultfd.h
include/uapi/linux/vfio.h
include/xen/interface/elfnote.h
include/xen/interface/hvm/hvm_vcpu.h
include/xen/interface/io/xenbus.h
init/Kconfig
init/initramfs.c
init/main.c
ipc/sem.c
kernel/.gitignore
kernel/Makefile
kernel/async.c
kernel/bpf/verifier.c
kernel/configs/android-base.config
kernel/cred.c
kernel/exit.c
kernel/fork.c
kernel/futex.c
kernel/gcov/Kconfig
kernel/gcov/base.c
kernel/gcov/clang.c
kernel/gcov/fs.c
kernel/gcov/gcc_4_7.c
kernel/gcov/gcov.h
kernel/kexec_core.c
kernel/kexec_file.c
kernel/kmod.c
kernel/locking/qrwlock.c
kernel/resource.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/psi.c
kernel/smp.c
kernel/sys.c
kernel/sysctl.c
kernel/trace/ftrace.c
kernel/umh.c
kernel/up.c
kernel/user_namespace.c
lib/.gitignore
lib/Kconfig.kfence
lib/bch.c
lib/crc8.c
lib/decompress_unlzma.c
lib/find_bit.c
lib/genalloc.c
lib/iov_iter.c
lib/list_sort.c
lib/nlattr.c
lib/parser.c
lib/percpu_counter.c
lib/stackdepot.c
mm/Kconfig
mm/Makefile
mm/balloon_compaction.c
mm/cma.c
mm/cma.h
mm/cma_debug.c
mm/cma_sysfs.c [new file with mode: 0644]
mm/compaction.c
mm/filemap.c
mm/frontswap.c
mm/gup.c
mm/gup_test.c
mm/gup_test.h
mm/highmem.c
mm/huge_memory.c
mm/hugetlb.c
mm/hugetlb_cgroup.c
mm/internal.h
mm/kasan/kasan.h
mm/kasan/quarantine.c
mm/kasan/shadow.c
mm/kfence/core.c
mm/kfence/report.c
mm/khugepaged.c
mm/ksm.c
mm/list_lru.c
mm/madvise.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/mempool.c
mm/migrate.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_owner.c
mm/page_vma_mapped.c
mm/percpu-internal.h
mm/percpu.c
mm/pgalloc-track.h
mm/process_vm_access.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/slub.c
mm/sparse.c
mm/swap.c
mm/swap_slots.c
mm/swap_state.c
mm/swapfile.c
mm/truncate.c
mm/userfaultfd.c
mm/util.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
mm/workingset.c
mm/z3fold.c
mm/zpool.c
mm/zsmalloc.c
mm/zswap.c
net/bridge/br_netlink.c
net/ceph/auth.c
net/ceph/auth_x.c
net/ceph/decode.c
net/ethtool/netlink.c
net/hsr/hsr_forward.c
net/ipv4/netfilter/arp_tables.c
net/ipv4/netfilter/arptable_filter.c
net/ipv4/tcp.c
net/ipv4/tcp_cong.c
net/ipv6/seg6.c
net/ipv6/seg6_local.c
net/mptcp/subflow.c
net/netfilter/nf_conntrack_ftp.c
net/netfilter/nf_conntrack_h323_main.c
net/netfilter/nf_conntrack_irc.c
net/netfilter/nf_conntrack_pptp.c
net/netfilter/nf_conntrack_proto_tcp.c
net/netfilter/nf_conntrack_sane.c
net/netfilter/nf_tables_api.c
net/netfilter/nfnetlink.c
net/netfilter/nfnetlink_osf.c
net/netfilter/nft_set_hash.c
net/netfilter/xt_SECMARK.c
net/nfc/llcp_sock.c
net/openvswitch/actions.c
net/sched/sch_frag.c
net/sctp/sm_make_chunk.c
net/sctp/sm_sideeffect.c
net/sctp/sm_statefuns.c
net/sctp/socket.c
net/smc/af_smc.c
net/sunrpc/clnt.c
net/sunrpc/rpcb_clnt.c
net/sunrpc/svc.c
net/sunrpc/svcsock.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c
net/vmw_vsock/vmci_transport.c
net/xdp/xsk_queue.h
samples/auxdisplay/.gitignore
samples/binderfs/.gitignore
samples/configfs/configfs_sample.c
samples/connector/.gitignore
samples/hidraw/.gitignore
samples/kprobes/kprobe_example.c
samples/mei/.gitignore
samples/nitro_enclaves/.gitignore
samples/pidfd/.gitignore
samples/seccomp/.gitignore
samples/timers/.gitignore
samples/vfio-mdev/mbochs.c
samples/vfio-mdev/mdpy.c
samples/vfs/.gitignore
samples/watch_queue/.gitignore
samples/watchdog/.gitignore
scripts/.gitignore
scripts/Makefile.build
scripts/Makefile.lib
scripts/basic/.gitignore
scripts/checkpatch.pl
scripts/dtc/.gitignore
scripts/gcc-plugins/.gitignore
scripts/gdb/linux/cpus.py
scripts/gdb/linux/symbols.py
scripts/genksyms/.gitignore
scripts/genksyms/Makefile
scripts/kernel-doc
scripts/link-vmlinux.sh
scripts/mod/.gitignore
scripts/nsdeps
scripts/package/buildtar
scripts/recordmcount.pl
scripts/remove-stale-files [new file with mode: 0755]
scripts/setlocalversion
scripts/spelling.txt
scripts/tags.sh
scripts/ver_linux
sound/pci/hda/hda_generic.c
sound/pci/hda/patch_realtek.c
sound/usb/mixer_maps.c
tools/include/asm-generic/bitops/find.h
tools/include/asm-generic/bitsperlong.h
tools/include/linux/bitmap.h
tools/lib/bitmap.c
tools/lib/bpf/ringbuf.c
tools/lib/find_bit.c
tools/scripts/Makefile.include
tools/testing/selftests/bpf/prog_tests/snprintf.c
tools/testing/selftests/bpf/progs/test_snprintf.c
tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
tools/testing/selftests/kvm/lib/sparsebit.c
tools/testing/selftests/mincore/mincore_selftest.c
tools/testing/selftests/net/mptcp/mptcp_connect.sh
tools/testing/selftests/powerpc/mm/tlbie_test.c
tools/testing/selftests/proc/Makefile
tools/testing/selftests/proc/proc-subset-pid.c [new file with mode: 0644]
tools/testing/selftests/proc/read.c
tools/testing/selftests/vm/.gitignore
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/gup_test.c
tools/testing/selftests/vm/split_huge_page_test.c [new file with mode: 0644]
tools/testing/selftests/vm/userfaultfd.c
tools/usb/hcd-tests.sh
usr/.gitignore
usr/gen_initramfs.sh
usr/include/.gitignore

diff --git a/.gitignore b/.gitignore
index df8d314..7afd412 100644 (file)
 *.xz
 *.zst
 Module.symvers
-modules.builtin
 modules.order
 
 #
 # Top-level generic files
 #
-/tags
-/TAGS
 /linux
 /modules-only.symvers
 /vmlinux
@@ -66,6 +63,7 @@ modules.order
 /vmlinuz
 /System.map
 /Module.markers
+/modules.builtin
 /modules.builtin.modinfo
 /modules.nsdeps
 
@@ -114,6 +112,10 @@ patches-*
 patches
 series
 
+# ctags files
+tags
+TAGS
+
 # cscope files
 cscope.*
 ncscope.*
diff --git a/CREDITS b/CREDITS
index b06760f..7ef7b13 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -1874,6 +1874,11 @@ S: Krosenska' 543
 S: 181 00 Praha 8
 S: Czech Republic
 
+N: Murali Karicheri
+E: m-karicheri2@ti.com
+D: Keystone NetCP driver
+D: Keystone PCIe host controller driver
+
 N: Jan "Yenya" Kasprzak
 E: kas@fi.muni.cz
 D: Author of the COSA/SRP sync serial board driver.
diff --git a/Documentation/ABI/testing/sysfs-class-net-qmi b/Documentation/ABI/testing/sysfs-class-net-qmi
index ed79f58..47e6b97 100644 (file)
@@ -58,3 +58,19 @@ Description:
 
                Indicates the mux id associated to the qmimux network interface
                during its creation.
+
+What:          /sys/class/net/<iface>/qmi/pass_through
+Date:          January 2021
+KernelVersion: 5.12
+Contact:       Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
+Description:
+               Boolean.  Default: 'N'
+
+               Set this to 'Y' to enable 'pass-through' mode, allowing packets
+               in MAP format to be passed on to the stack.
+
+               Normally the rmnet driver (CONFIG_RMNET) is then used to process
+               and demultiplex these packets.
+
+               'Pass-through' mode can be enabled when the device is in
+               'raw-ip' mode only.
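
A quick way to exercise the new attribute from userspace is a small sketch like the following (not part of the patch: the interface name "wwan0" is an assumption, and per the text above the device must already be in 'raw-ip' mode)::

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* "wwan0" is an illustrative interface name. */
            const char *attr = "/sys/class/net/wwan0/qmi/pass_through";
            int fd = open(attr, O_WRONLY);

            if (fd < 0) {
                    perror(attr);
                    return 1;
            }
            /* Writing 'Y' enables pass-through; 'N' restores the default. */
            if (write(fd, "Y", 1) != 1)
                    perror("write");
            close(fd);
            return 0;
    }
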
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 0eee30b..fe13baa 100644 (file)
@@ -285,7 +285,7 @@ Description:        Disable L3 cache indices
 
                All AMD processors with L3 caches provide this functionality.
                For details, see BKDGs at
-               http://developer.amd.com/documentation/guides/Pages/default.aspx
+                https://www.amd.com/en/support/tech-docs?keyword=bios+kernel
 
 
 What:          /sys/devices/system/cpu/cpufreq/boost
diff --git a/Documentation/ABI/testing/sysfs-driver-input-exc3000 b/Documentation/ABI/testing/sysfs-driver-input-exc3000
index cd7c578..704434b 100644 (file)
@@ -15,3 +15,12 @@ Description:    Reports the model identification provided by the touchscreen, fo
                Access: Read
 
                Valid values: Represented as string
+
+What:          /sys/bus/i2c/devices/xxx/type
+Date:          Jan 2021
+Contact:       linux-input@vger.kernel.org
+Description:   Reports the type identification provided by the touchscreen, for example "PCAP82H80 Series"
+
+               Access: Read
+
+               Valid values: Represented as string
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma
new file mode 100644 (file)
index 0000000..02b2bb6
--- /dev/null
@@ -0,0 +1,25 @@
+What:          /sys/kernel/mm/cma/
+Date:          Feb 2021
+Contact:       Minchan Kim <minchan@kernel.org>
+Description:
+               /sys/kernel/mm/cma/ contains a subdirectory for each CMA
+               heap name (also sometimes called CMA areas).
+
+               Each CMA heap subdirectory (that is, each
+               /sys/kernel/mm/cma/<cma-heap-name> directory) contains the
+               following items:
+
+                       alloc_pages_success
+                       alloc_pages_fail
+
+What:          /sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_success
+Date:          Feb 2021
+Contact:       Minchan Kim <minchan@kernel.org>
+Description:
+               the number of pages the CMA API succeeded in allocating
+
+What:          /sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_fail
+Date:          Feb 2021
+Contact:       Minchan Kim <minchan@kernel.org>
+Description:
+               the number of pages the CMA API failed to allocate
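
To make the counters concrete, a small userspace reader might look like the following sketch (the heap name "reserved" is an assumption; real directory names depend on the platform's CMA areas)::

    #include <stdio.h>

    /* Read one CMA counter; returns 0 if the file is missing. */
    static unsigned long read_counter(const char *path)
    {
            unsigned long val = 0;
            FILE *f = fopen(path, "r");

            if (f) {
                    if (fscanf(f, "%lu", &val) != 1)
                            val = 0;
                    fclose(f);
            }
            return val;
    }

    int main(void)
    {
            /* "reserved" is an assumed CMA heap name for illustration. */
            unsigned long ok = read_counter(
                    "/sys/kernel/mm/cma/reserved/alloc_pages_success");
            unsigned long fail = read_counter(
                    "/sys/kernel/mm/cma/reserved/alloc_pages_fail");

            printf("CMA 'reserved': %lu pages allocated, %lu failed\n",
                   ok, fail);
            return 0;
    }
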
diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index ef41f77..9c2be82 100644 (file)
@@ -4,7 +4,7 @@
 
    1 char      Memory devices
                  1 = /dev/mem          Physical memory access
-                 2 = /dev/kmem         Kernel virtual memory access
+                 2 = /dev/kmem         OBSOLETE - replaced by /proc/kcore
                  3 = /dev/null         Null device
                  4 = /dev/port         I/O port access
                  5 = /dev/zero         Null byte source
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0d48fbd..cb89dbd 100644 (file)
                        initcall functions.  Useful for debugging built-in
                        modules and initcalls.
 
+       initramfs_async= [KNL]
+                       Format: <bool>
+                       Default: 1
+                       This parameter controls whether the initramfs
+                       image is unpacked asynchronously, concurrently
+                       with devices being probed and
+                       initialized. This should normally just work,
+                       but as a debugging aid, one can get the
+                       historical behaviour of the initramfs
+                       unpacking being completed before device_ and
+                       late_ initcalls.
+
        initrd=         [BOOT] Specify the location of the initial ramdisk
 
        initrdmem=      [KNL] Specify a physical address and size from which to
                        seconds.  Use this parameter to check at some
                        other rate.  0 disables periodic checking.
 
-       memtest=        [KNL,X86,ARM,PPC] Enable memtest
+       memory_hotplug.memmap_on_memory
+                       [KNL,X86,ARM] Boolean flag to enable this feature.
+                       Format: {on | off (default)}
+                       When enabled, runtime hotplugged memory will
+                       allocate its internal metadata (struct pages)
+                       from the hotadded memory, which allows
+                       hotadding a lot of memory without requiring
+                       additional memory to do so.
+                       This feature is disabled by default because it
+                       has some implication on large (e.g. GB)
+                       allocations in some configurations (e.g. small
+                       memory blocks).
+                       The state of the flag can be read in
+                       /sys/module/memory_hotplug/parameters/memmap_on_memory.
+                       Note that even when enabled, there are a few cases where
+                       the feature is not effective.
+
+       memtest=        [KNL,X86,ARM,PPC,RISCV] Enable memtest
                        Format: <integer>
                        default : 0 <disable>
                        Specifies the number of memtest passes to be
 
        nohugeiomap     [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
 
+       nohugevmalloc   [PPC] Disable kernel huge vmalloc mappings.
+
        nosmt           [KNL,S390] Disable symmetric multithreading (SMT).
                        Equivalent to smt=1.
 
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 5307f90..05d51d2 100644 (file)
@@ -357,6 +357,15 @@ creates ZONE_MOVABLE as following.
    Unfortunately, there is no information to show which memory block belongs
    to ZONE_MOVABLE. This is TBD.
 
+.. note::
+   Techniques that rely on long-term pinnings of memory (especially, RDMA and
+   vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory
+   hot remove. Pinned pages cannot reside on ZONE_MOVABLE, to guarantee that
+   memory can still get hot removed - be aware that pinning can fail even if
+   there is plenty of free memory in ZONE_MOVABLE. In addition, using
+   ZONE_MOVABLE might make page pinning more expensive, because pages have to be
+   migrated off that zone first.
+
 .. _memory_hotplug_how_to_offline_memory:
 
 How to offline memory
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 65eefa6..3aa38e8 100644 (file)
@@ -63,36 +63,36 @@ the generic ioctl available.
 
 The ``uffdio_api.features`` bitmask returned by the ``UFFDIO_API`` ioctl
 defines what memory types are supported by the ``userfaultfd`` and what
-events, except page fault notifications, may be generated.
-
-If the kernel supports registering ``userfaultfd`` ranges on hugetlbfs
-virtual memory areas, ``UFFD_FEATURE_MISSING_HUGETLBFS`` will be set in
-``uffdio_api.features``. Similarly, ``UFFD_FEATURE_MISSING_SHMEM`` will be
-set if the kernel supports registering ``userfaultfd`` ranges on shared
-memory (covering all shmem APIs, i.e. tmpfs, ``IPCSHM``, ``/dev/zero``,
-``MAP_SHARED``, ``memfd_create``, etc).
-
-The userland application that wants to use ``userfaultfd`` with hugetlbfs
-or shared memory need to set the corresponding flag in
-``uffdio_api.features`` to enable those features.
-
-If the userland desires to receive notifications for events other than
-page faults, it has to verify that ``uffdio_api.features`` has appropriate
-``UFFD_FEATURE_EVENT_*`` bits set. These events are described in more
-detail below in `Non-cooperative userfaultfd`_ section.
-
-Once the ``userfaultfd`` has been enabled the ``UFFDIO_REGISTER`` ioctl should
-be invoked (if present in the returned ``uffdio_api.ioctls`` bitmask) to
-register a memory range in the ``userfaultfd`` by setting the
+events, except page fault notifications, may be generated:
+
+- The ``UFFD_FEATURE_EVENT_*`` flags indicate that various other events
+  other than page faults are supported. These events are described in more
+  detail below in the `Non-cooperative userfaultfd`_ section.
+
+- ``UFFD_FEATURE_MISSING_HUGETLBFS`` and ``UFFD_FEATURE_MISSING_SHMEM``
+  indicate that the kernel supports ``UFFDIO_REGISTER_MODE_MISSING``
+  registrations for hugetlbfs and shared memory (covering all shmem APIs,
+  i.e. tmpfs, ``IPCSHM``, ``/dev/zero``, ``MAP_SHARED``, ``memfd_create``,
+  etc) virtual memory areas, respectively.
+
+- ``UFFD_FEATURE_MINOR_HUGETLBFS`` indicates that the kernel supports
+  ``UFFDIO_REGISTER_MODE_MINOR`` registration for hugetlbfs virtual memory
+  areas.
+
+The userland application should set the feature flags it intends to use
+when invoking the ``UFFDIO_API`` ioctl, to request that those features be
+enabled if supported.
+
+Once the ``userfaultfd`` API has been enabled the ``UFFDIO_REGISTER``
+ioctl should be invoked (if present in the returned ``uffdio_api.ioctls``
+bitmask) to register a memory range in the ``userfaultfd`` by setting the
 uffdio_register structure accordingly. The ``uffdio_register.mode``
 bitmask will specify to the kernel which kind of faults to track for
-the range (``UFFDIO_REGISTER_MODE_MISSING`` would track missing
-pages). The ``UFFDIO_REGISTER`` ioctl will return the
+the range. The ``UFFDIO_REGISTER`` ioctl will return the
 ``uffdio_register.ioctls`` bitmask of ioctls that are suitable to resolve
 userfaults on the range registered. Not all ioctls will necessarily be
-supported for all memory types depending on the underlying virtual
-memory backend (anonymous memory vs tmpfs vs real filebacked
-mappings).
+supported for all memory types (e.g. anonymous memory vs. shmem vs.
+hugetlbfs), or all types of intercepted faults.
 
 Userland can use the ``uffdio_register.ioctls`` to manage the virtual
 address space in the background (to add or potentially also remove
@@ -100,21 +100,46 @@ memory from the ``userfaultfd`` registered range). This means a userfault
 could be triggering just before userland maps in the background the
 user-faulted page.
 
-The primary ioctl to resolve userfaults is ``UFFDIO_COPY``. That
-atomically copies a page into the userfault registered range and wakes
-up the blocked userfaults
-(unless ``uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE`` is set).
-Other ioctl works similarly to ``UFFDIO_COPY``. They're atomic as in
-guaranteeing that nothing can see an half copied page since it'll
-keep userfaulting until the copy has finished.
+Resolving Userfaults
+--------------------
+
+There are three basic ways to resolve userfaults:
+
+- ``UFFDIO_COPY`` atomically copies some existing page contents from
+  userspace.
+
+- ``UFFDIO_ZEROPAGE`` atomically zeros the new page.
+
+- ``UFFDIO_CONTINUE`` maps an existing, previously-populated page.
+
+These operations are atomic in the sense that they guarantee nothing can
+see a half-populated page, since readers will keep userfaulting until the
+operation has finished.
+
+By default, these wake up userfaults blocked on the range in question.
+They support a ``UFFDIO_*_MODE_DONTWAKE`` ``mode`` flag, which indicates
+that waking will be done separately at some later time.
+
+Which ioctl to choose depends on the kind of page fault, and what we'd
+like to do to resolve it:
+
+- For ``UFFDIO_REGISTER_MODE_MISSING`` faults, the fault needs to be
+  resolved by either providing a new page (``UFFDIO_COPY``), or mapping
+  the zero page (``UFFDIO_ZEROPAGE``). By default, the kernel would map
+  the zero page for a missing fault. With userfaultfd, userspace can
+  decide what content to provide before the faulting thread continues.
+
+- For ``UFFDIO_REGISTER_MODE_MINOR`` faults, there is an existing page (in
+  the page cache). Userspace has the option of modifying the page's
+  contents before resolving the fault. Once the contents are correct
+  (modified or not), userspace asks the kernel to map the page and let the
+  faulting thread continue with ``UFFDIO_CONTINUE``.
 
 Notes:
 
-- If you requested ``UFFDIO_REGISTER_MODE_MISSING`` when registering then
-  you must provide some kind of page in your thread after reading from
-  the uffd.  You must provide either ``UFFDIO_COPY`` or ``UFFDIO_ZEROPAGE``.
-  The normal behavior of the OS automatically providing a zero page on
-  an anonymous mmaping is not in place.
+- You can tell which kind of fault occurred by examining
+  ``pagefault.flags`` within the ``uffd_msg``, checking for the
+  ``UFFD_PAGEFAULT_FLAG_*`` flags.
 
 - None of the page-delivering ioctls default to the range that you
   registered with.  You must fill in all fields for the appropriate
@@ -122,9 +147,9 @@ Notes:
 
 - You get the address of the access that triggered the missing page
   event out of a struct uffd_msg that you read in the thread from the
-  uffd.  You can supply as many pages as you want with ``UFFDIO_COPY`` or
-  ``UFFDIO_ZEROPAGE``.  Keep in mind that unless you used DONTWAKE then
-  the first of any of those IOCTLs wakes up the faulting thread.
+  uffd.  You can supply as many pages as you want with these IOCTLs.
+  Keep in mind that unless you used DONTWAKE then the first of any of
+  those IOCTLs wakes up the faulting thread.
 
 - Be sure to test for all errors including
   (``pollfd[0].revents & POLLERR``).  This can happen, e.g. when ranges
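
The rewritten flow above (the ``UFFDIO_API`` handshake, ``UFFDIO_REGISTER``, then resolving a MISSING fault with ``UFFDIO_COPY``) condenses into a short userspace sketch. This is illustrative only: error handling is omitted, a single anonymous page is assumed, and the 0x42 fill pattern is arbitrary::

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            static char src[65536];     /* big enough for common page sizes */
            int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);

            /* Handshake: request the default API, no extra features. */
            struct uffdio_api api = { .api = UFFD_API, .features = 0 };
            ioctl(uffd, UFFDIO_API, &api);

            /* Register one anonymous page for MISSING faults. */
            char *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            struct uffdio_register reg = {
                    .range = { .start = (unsigned long)area, .len = page },
                    .mode  = UFFDIO_REGISTER_MODE_MISSING,
            };
            ioctl(uffd, UFFDIO_REGISTER, &reg);

            /* In a real program a separate thread touches 'area'; this
             * read blocks until that fault is reported. */
            struct uffd_msg msg;
            read(uffd, &msg, sizeof(msg));

            /* Resolve it by installing a page of content.  mode 0 also
             * wakes the faulting thread, as described above. */
            memset(src, 0x42, page);
            struct uffdio_copy copy = {
                    .dst  = msg.arg.pagefault.address & ~(page - 1),
                    .src  = (unsigned long)src,
                    .len  = page,
                    .mode = 0,
            };
            ioctl(uffd, UFFDIO_COPY, &copy);
            return 0;
    }
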
diff --git a/Documentation/admin-guide/reporting-issues.rst b/Documentation/admin-guide/reporting-issues.rst
index 48b4d0e..18d8e25 100644 (file)
@@ -24,7 +24,8 @@ longterm series? One still supported? Then search the `LKML
 you don't find any, install `the latest release from that series
 <https://kernel.org/>`_. If it still shows the issue, report it to the stable
 mailing list (stable@vger.kernel.org) and CC the regressions list
-(regressions@lists.linux.dev).
+(regressions@lists.linux.dev); ideally also CC the maintainer and the mailing
+list for the subsystem in question.
 
 In all other cases try your best guess which kernel part might be causing the
 issue. Check the :ref:`MAINTAINERS <maintainers>` file for how its developers
@@ -48,8 +49,9 @@ before the issue occurs.
 If you are facing multiple issues with the Linux kernel at once, report each
 separately. While writing your report, include all information relevant to the
 issue, like the kernel and the distro used. In case of a regression, CC the
-regressions mailing list (regressions@lists.linux.dev) to your report; also try
-to include the commit-id of the change causing it, which a bisection can find.
+regressions mailing list (regressions@lists.linux.dev) to your report. Also try
+to pin-point the culprit with a bisection; if you succeed, include its
+commit-id and CC everyone in the sign-off-by chain.
 
 Once the report is out, answer any questions that come up and help where you
 can. That includes keeping the ball rolling by occasionally retesting with newer
@@ -198,10 +200,11 @@ report them:
 
  * Send a short problem report to the Linux stable mailing list
    (stable@vger.kernel.org) and CC the Linux regressions mailing list
-   (regressions@lists.linux.dev). Roughly describe the issue and ideally
-   explain how to reproduce it. Mention the first version that shows the
-   problem and the last version that's working fine. Then wait for further
-   instructions.
+   (regressions@lists.linux.dev); if you suspect the cause in a particular
+   subsystem, CC its maintainer and its mailing list. Roughly describe the
+   issue and ideally explain how to reproduce it. Mention the first version
+   that shows the problem and the last version that's working fine. Then
+   wait for further instructions.
 
 The reference section below explains each of these steps in more detail.
 
@@ -768,7 +771,9 @@ regular internet search engine and add something like
 the results to the archives at that URL.
 
 It's also wise to check the internet, LKML and maybe bugzilla.kernel.org again
-at this point.
+at this point. If your report needs to be filed in a bug tracker, you may want
+to check the mailing list archives for the subsystem as well, as someone might
+have reported it only there.
 
 For details how to search and what to do if you find matching reports see
 "Search for existing reports, first run" above.
@@ -1249,9 +1254,10 @@ and the oldest where the issue occurs (say 5.8-rc1).
 
 When sending the report by mail, CC the Linux regressions mailing list
 (regressions@lists.linux.dev). In case the report needs to be filed to some web
-tracker, proceed to do so; once filed, forward the report by mail to the
-regressions list. Make sure to inline the forwarded report, hence do not attach
-it. Also add a short note at the top where you mention the URL to the ticket.
+tracker, proceed to do so. Once filed, forward the report by mail to the
+regressions list; CC the maintainer and the mailing list for the subsystem in
+question. Make sure to inline the forwarded report, hence do not attach it.
+Also add a short note at the top where you mention the URL to the ticket.
 
 When mailing or forwarding the report, in case of a successful bisection add the
 author of the culprit to the recipients; also CC everyone in the signed-off-by
@@ -1536,17 +1542,20 @@ Report the regression
 
     *Send a short problem report to the Linux stable mailing list
     (stable@vger.kernel.org) and CC the Linux regressions mailing list
-    (regressions@lists.linux.dev). Roughly describe the issue and ideally
-    explain how to reproduce it.  Mention the first version that shows the
-    problem and the last version that's working fine. Then wait for further
-    instructions.*
+    (regressions@lists.linux.dev); if you suspect the cause in a particular
+    subsystem, CC its maintainer and its mailing list. Roughly describe the
+    issue and ideally explain how to reproduce it. Mention the first version
+    that shows the problem and the last version that's working fine. Then
+    wait for further instructions.*
 
 When reporting a regression that happens within a stable or longterm kernel
 line (say when updating from 5.10.4 to 5.10.5) a brief report is enough for
-the start to get the issue reported quickly. Hence a rough description is all
-it takes.
+the start to get the issue reported quickly. Hence a rough description to the
+stable and regressions mailing list is all it takes; but in case you suspect
+the cause in a particular subsystem, CC its maintainers and its mailing list
+as well, because that will speed things up.
 
-But note, it helps developers a great deal if you can specify the exact version
+And note, it helps developers a great deal if you can specify the exact version
 that introduced the problem. Hence if possible within a reasonable time frame,
 try to find that version using vanilla kernels. Let's assume something broke when
 your distributor released an update from Linux kernel 5.10.5 to 5.10.8. Then as
@@ -1563,7 +1572,9 @@ pinpoint the exact change that causes the issue (which then can easily get
 reverted to fix the issue quickly). Hence consider to do a proper bisection
 right away if time permits. See the section 'Special care for regressions' and
 the document 'Documentation/admin-guide/bug-bisect.rst' for details how to
-perform one.
+perform one. In case of a successful bisection add the author of the culprit to
+the recipients; also CC everyone in the signed-off-by chain, which you find at
+the end of its commit message.
 
 
 Reference for "Reporting issues only occurring in older kernel version lines"
diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst
index 4fcc00a..18b8cc1 100644 (file)
@@ -277,9 +277,40 @@ Before jumping into the kernel, the following conditions must be met:
 
     - SCR_EL3.FGTEn (bit 27) must be initialised to 0b1.
 
+  For CPUs with Advanced SIMD and floating point support:
+
+  - If EL3 is present:
+
+    - CPTR_EL3.TFP (bit 10) must be initialised to 0b0.
+
+  - If EL2 is present and the kernel is entered at EL1:
+
+    - CPTR_EL2.TFP (bit 10) must be initialised to 0b0.
+
+  For CPUs with the Scalable Vector Extension (FEAT_SVE) present:
+
+  - if EL3 is present:
+
+    - CPTR_EL3.EZ (bit 8) must be initialised to 0b1.
+
+    - ZCR_EL3.LEN must be initialised to the same value for all CPUs the
+      kernel is executed on.
+
+  - If the kernel is entered at EL1 and EL2 is present:
+
+    - CPTR_EL2.TZ (bit 8) must be initialised to 0b0.
+
+    - CPTR_EL2.ZEN (bits 17:16) must be initialised to 0b11.
+
+    - ZCR_EL2.LEN must be initialised to the same value for all CPUs the
+      kernel will execute on.
+
 The requirements described above for CPU mode, caches, MMUs, architected
 timers, coherency and system registers apply to all CPUs.  All CPUs must
-enter the kernel in the same exception level.
+enter the kernel in the same exception level.  Where the values documented
+disable traps it is permissible for these traps to be enabled so long as
+those traps are handled transparently by higher exception levels as though
+the values documented were set.
 
 The boot loader is expected to enter the kernel on each CPU in the
 following manner:
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index 8782166..ec1a5a6 100644 (file)
@@ -74,7 +74,7 @@ HWCAP_ASIMD
 
 HWCAP_EVTSTRM
     The generic timer is configured to generate events at a frequency of
-    approximately 100KHz.
+    approximately 10KHz.
 
 HWCAP_AES
     Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001.
diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst
index cbc4d45..459e6b6 100644 (file)
@@ -113,6 +113,12 @@ ABI relaxation:
 
 - ``shmat()`` and ``shmdt()``.
 
+- ``brk()`` (since kernel v5.6).
+
+- ``mmap()`` (since kernel v5.6).
+
+- ``mremap()``, the ``new_address`` argument (since kernel v5.6).
+
 Any attempt to use non-zero tagged pointers may result in an error code
 being returned, a (fatal) signal being raised, or other modes of
 failure.
diff --git a/Documentation/core-api/symbol-namespaces.rst b/Documentation/core-api/symbol-namespaces.rst
index 9b76337..5ad9e0a 100644 (file)
@@ -43,14 +43,14 @@ exporting of kernel symbols to the kernel symbol table, variants of these are
 available to export symbols into a certain namespace: EXPORT_SYMBOL_NS() and
 EXPORT_SYMBOL_NS_GPL(). They take one additional argument: the namespace.
 Please note that due to macro expansion that argument needs to be a
-preprocessor symbol. E.g. to export the symbol `usb_stor_suspend` into the
-namespace `USB_STORAGE`, use::
+preprocessor symbol. E.g. to export the symbol ``usb_stor_suspend`` into the
+namespace ``USB_STORAGE``, use::
 
        EXPORT_SYMBOL_NS(usb_stor_suspend, USB_STORAGE);
 
-The corresponding ksymtab entry struct `kernel_symbol` will have the member
-`namespace` set accordingly. A symbol that is exported without a namespace will
-refer to `NULL`. There is no default namespace if none is defined. `modpost`
+The corresponding ksymtab entry struct ``kernel_symbol`` will have the member
+``namespace`` set accordingly. A symbol that is exported without a namespace will
+refer to ``NULL``. There is no default namespace if none is defined. ``modpost``
 and kernel/module.c make use of the namespace at build time or module load time,
 respectively.
 
@@ -64,7 +64,7 @@ and EXPORT_SYMBOL_GPL() macro expansions that do not specify a namespace.
 
 There are multiple ways of specifying this define and it depends on the
 subsystem and the maintainer's preference, which one to use. The first option
-is to define the default namespace in the `Makefile` of the subsystem. E.g. to
+is to define the default namespace in the ``Makefile`` of the subsystem. E.g. to
 export all symbols defined in usb-common into the namespace USB_COMMON, add a
 line like this to drivers/usb/common/Makefile::
 
@@ -96,7 +96,7 @@ using a statement like::
 
        MODULE_IMPORT_NS(USB_STORAGE);
 
-This will create a `modinfo` tag in the module for each imported namespace.
+This will create a ``modinfo`` tag in the module for each imported namespace.
 This has the side effect, that the imported namespaces of a module can be
 inspected with modinfo::
 
@@ -113,7 +113,7 @@ metadata definitions like MODULE_AUTHOR() or MODULE_LICENSE(). Refer to section
 4. Loading Modules that use namespaced Symbols
 ==============================================
 
-At module loading time (e.g. `insmod`), the kernel will check each symbol
+At module loading time (e.g. ``insmod``), the kernel will check each symbol
 referenced from the module for its availability and whether the namespace it
 might be exported to has been imported by the module. The default behaviour of
 the kernel is to reject loading modules that don't specify sufficient imports.
@@ -138,19 +138,19 @@ missing imports. Fixing missing imports can be done with::
 A typical scenario for module authors would be::
 
        - write code that depends on a symbol from a not imported namespace
-       - `make`
+       - ``make``
        - notice the warning of modpost telling about a missing import
-       - run `make nsdeps` to add the import to the correct code location
+       - run ``make nsdeps`` to add the import to the correct code location
 
 For subsystem maintainers introducing a namespace, the steps are very similar.
-Again, `make nsdeps` will eventually add the missing namespace imports for
+Again, ``make nsdeps`` will eventually add the missing namespace imports for
 in-tree modules::
 
        - move or add symbols to a namespace (e.g. with EXPORT_SYMBOL_NS())
-       - `make` (preferably with an allmodconfig to cover all in-kernel
+       - ``make`` (preferably with an allmodconfig to cover all in-kernel
          modules)
        - notice the warning of modpost telling about a missing import
-       - run `make nsdeps` to add the import to the correct code location
+       - run ``make nsdeps`` to add the import to the correct code location
 
 You can also run nsdeps for external module builds. A typical usage is::
 
diff --git a/Documentation/dev-tools/gdb-kernel-debugging.rst b/Documentation/dev-tools/gdb-kernel-debugging.rst
index 4756f6b..8e0f1fe 100644 (file)
@@ -114,7 +114,7 @@ Examples of using the Linux-provided gdb helpers
     [     0.000000] BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved
     ....
 
-- Examine fields of the current task struct::
+- Examine fields of the current task struct (supported by x86 and arm64 only)::
 
     (gdb) p $lx_current().pid
     $1 = 4998
diff --git a/Documentation/devicetree/bindings/.gitignore b/Documentation/devicetree/bindings/.gitignore
index 3a05b99..a777199 100644 (file)
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.example.dts
-processed-schema*.yaml
-processed-schema*.json
+/processed-schema*.yaml
+/processed-schema*.json
index 552a99c..121596f 100644 (file)
@@ -51,6 +51,9 @@ properties:
   resets: true
   reset-names: true
 
+  power-domains:
+    maxItems: 1
+
   ports:
     $ref: /schemas/graph.yaml#/properties/port
     description: |
index adb5165..62f3ca6 100644 (file)
@@ -49,7 +49,7 @@ additionalProperties: true
 examples:
   - |
     i3c-master@a0000000 {
-        compatible = "silvaco,i3c-master";
+        compatible = "silvaco,i3c-master-v1";
         clocks = <&zynqmp_clk 71>, <&fclk>, <&sclk>;
         clock-names = "pclk", "fast_clk", "slow_clk";
         interrupt-parent = <&gic>;
index 8c6418f..3ec579d 100644 (file)
@@ -39,6 +39,13 @@ properties:
       (active low). The line must be flagged with
       GPIO_ACTIVE_LOW.
 
+  wake-gpios:
+    maxItems: 1
+    description:
+      Optional GPIO specifier for the touchscreen's wake pin
+      (active low). The line must be flagged with
+      GPIO_ACTIVE_LOW.
+
   linux,gpio-keymap:
     $ref: /schemas/types.yaml#/definitions/uint32-array
     description: |
@@ -53,6 +60,29 @@ properties:
       or experiment to determine which bit corresponds to which input. Use
       KEY_RESERVED for unused padding values.
 
+  atmel,wakeup-method:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: |
+      The WAKE line is an active-low input that is used to wake up the touch
+      controller from deep-sleep mode before communication with the controller
+      can be started. This optional feature is used to minimize current
+      consumption when the controller is in deep-sleep mode. This feature is
+      relevant only to some controller families, like the mXT1386 controller,
+      for example.
+
+      The WAKE pin can be connected in one of the following ways:
+       1) left permanently low
+       2) connected to the I2C-compatible SCL pin
+       3) connected to a GPIO pin on the host
+    enum:
+      - 0 # ATMEL_MXT_WAKEUP_NONE
+      - 1 # ATMEL_MXT_WAKEUP_I2C_SCL
+      - 2 # ATMEL_MXT_WAKEUP_GPIO
+    default: 0
+
+  wakeup-source:
+    type: boolean
+
 required:
   - compatible
   - reg
@@ -63,6 +93,7 @@ additionalProperties: false
 examples:
   - |
     #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/input/atmel-maxtouch.h>
     #include <dt-bindings/gpio/gpio.h>
     i2c {
       #address-cells = <1>;
@@ -75,6 +106,7 @@ examples:
         reset-gpios = <&gpio 27 GPIO_ACTIVE_LOW>;
         vdda-supply = <&ab8500_ldo_aux2_reg>;
         vdd-supply = <&ab8500_ldo_aux5_reg>;
+        atmel,wakeup-method = <ATMEL_MXT_WAKEUP_I2C_SCL>;
       };
     };
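
For context, a consuming driver could retrieve the new property through the
generic device-property API together with the constants from the header
included above; a minimal sketch, not necessarily how the actual
atmel_mxt_ts driver implements it::

    #include <linux/property.h>
    #include <dt-bindings/input/atmel-maxtouch.h>

    /* Read "atmel,wakeup-method", falling back to NONE when absent. */
    static u32 mxt_wakeup_method(struct device *dev)
    {
            u32 method = ATMEL_MXT_WAKEUP_NONE;

            device_property_read_u32(dev, "atmel,wakeup-method", &method);
            return method;
    }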
 
diff --git a/Documentation/devicetree/bindings/input/iqs626a.yaml b/Documentation/devicetree/bindings/input/iqs626a.yaml
new file mode 100644 (file)
index 0000000..0cb736c
--- /dev/null
@@ -0,0 +1,843 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/iqs626a.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS626A Capacitive Touch Controller
+
+maintainers:
+  - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+  The Azoteq IQS626A is a 14-channel capacitive touch controller that features
+  additional Hall-effect and inductive sensing capabilities.
+
+  Link to datasheet: https://www.azoteq.com/
+
+allOf:
+  - $ref: touchscreen/touchscreen.yaml#
+
+properties:
+  compatible:
+    const: azoteq,iqs626a
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  "#address-cells":
+    const: 1
+
+  "#size-cells":
+    const: 0
+
+  azoteq,suspend-mode:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [0, 1, 2, 3]
+    default: 0
+    description: |
+      Specifies the power mode during suspend as follows:
+      0: Automatic (same as normal runtime, i.e. suspend/resume disabled)
+      1: Low power (all sensing at a reduced reporting rate)
+      2: Ultra-low power (ULP channel proximity sensing)
+      3: Halt (no sensing)
+
+  azoteq,clk-div:
+    type: boolean
+    description: Divides the device's core clock by a factor of 4.
+
+  azoteq,ulp-enable:
+    type: boolean
+    description:
+      Permits the device to automatically enter ultra-low-power mode from low-
+      power mode.
+
+  azoteq,ulp-update:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [0, 1, 2, 3, 4, 5, 6, 7]
+    default: 3
+    description: |
+      Specifies the rate at which the trackpad, generic and Hall channels are
+      updated during ultra-low-power mode as follows:
+      0: 8
+      1: 13
+      2: 28
+      3: 54
+      4: 89
+      5: 135
+      6: 190
+      7: 256
+
+  azoteq,ati-band-disable:
+    type: boolean
+    description: Disables the ATI band check.
+
+  azoteq,ati-lp-only:
+    type: boolean
+    description: Limits automatic ATI to low-power mode.
+
+  azoteq,gpio3-select:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [0, 1, 2, 3, 4, 5, 6, 7]
+    default: 1
+    description: |
+      Selects the channel or group of channels for which the GPIO3 pin
+      represents touch state as follows:
+      0: None
+      1: ULP channel
+      2: Trackpad
+      3: Trackpad
+      4: Generic channel 0
+      5: Generic channel 1
+      6: Generic channel 2
+      7: Hall channel
+
+  azoteq,reseed-select:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [0, 1, 2, 3]
+    default: 0
+    description: |
+      Specifies the event(s) that prompt the device to reseed (i.e. reset) the
+      long-term average of an associated channel as follows:
+      0: None
+      1: Proximity
+      2: Proximity or touch
+      3: Proximity, touch or deep touch
+
+  azoteq,thresh-extend:
+    type: boolean
+    description: Multiplies all touch and deep-touch thresholds by 4.
+
+  azoteq,tracking-enable:
+    type: boolean
+    description:
+      Enables all associated channels to track their respective reference
+      channels.
+
+  azoteq,reseed-offset:
+    type: boolean
+    description:
+      Applies an 8-count offset to all long-term averages upon either ATI or
+      reseed events.
+
+  azoteq,rate-np-ms:
+    minimum: 0
+    maximum: 255
+    default: 150
+    description: Specifies the report rate (in ms) during normal-power mode.
+
+  azoteq,rate-lp-ms:
+    minimum: 0
+    maximum: 255
+    default: 150
+    description: Specifies the report rate (in ms) during low-power mode.
+
+  azoteq,rate-ulp-ms:
+    multipleOf: 16
+    minimum: 0
+    maximum: 4080
+    default: 0
+    description: Specifies the report rate (in ms) during ultra-low-power mode.
+
+  azoteq,timeout-pwr-ms:
+    multipleOf: 512
+    minimum: 0
+    maximum: 130560
+    default: 2560
+    description:
+      Specifies the length of time (in ms) to wait for an event before moving
+      from normal-power mode to low-power mode, or (if 'azoteq,ulp-enable' is
+      present) from low-power mode to ultra-low-power mode.
+
+  azoteq,timeout-lta-ms:
+    multipleOf: 512
+    minimum: 0
+    maximum: 130560
+    default: 40960
+    description:
+      Specifies the length of time (in ms) to wait before resetting the long-
+      term average of all channels. Specify the maximum timeout to disable it
+      altogether.
+
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+
+patternProperties:
+  "^ulp-0|generic-[0-2]|hall$":
+    type: object
+    description:
+      Represents a single sensing channel. A channel is active if defined and
+      inactive otherwise.
+
+    properties:
+      azoteq,ati-exclude:
+        type: boolean
+        description:
+          Prevents the channel from participating in an ATI event that is
+          manually triggered during initialization.
+
+      azoteq,reseed-disable:
+        type: boolean
+        description:
+          Prevents the channel from being reseeded if the long-term average
+          timeout (defined in 'azoteq,timeout-lta') expires.
+
+      azoteq,meas-cap-decrease:
+        type: boolean
+        description:
+          Decreases the internal measurement capacitance from 60 pF to 15 pF.
+
+      azoteq,rx-inactive:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2]
+        default: 0
+        description: |
+          Specifies how inactive CRX pins are to be terminated as follows:
+          0: VSS
+          1: Floating
+          2: VREG (generic channels only)
+
+      azoteq,linearize:
+        type: boolean
+        description:
+          Enables linearization of the channel's counts (generic and Hall
+          channels) or inverts the polarity of the channel's proximity or
+          touch states (ULP channel).
+
+      azoteq,dual-direction:
+        type: boolean
+        description:
+          Specifies that the channel's long-term average is to freeze in the
+          presence of either increasing or decreasing counts, thereby
+          permitting events to be reported in either direction.
+
+      azoteq,filt-disable:
+        type: boolean
+        description: Disables raw count filtering for the channel.
+
+      azoteq,ati-mode:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        description: |
+          Specifies the channel's ATI mode as follows:
+          0: Disabled
+          1: Semi-partial
+          2: Partial
+          3: Full
+
+          The default value is a function of the channel and the device's reset
+          user interface (RUI); reference the datasheet for further information
+          about the available RUI options.
+
+      azoteq,ati-base:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [75, 100, 150, 200]
+        description:
+          Specifies the channel's ATI base. The default value is a function
+          of the channel and the device's RUI.
+
+      azoteq,ati-target:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        multipleOf: 32
+        minimum: 0
+        maximum: 2016
+        description:
+          Specifies the channel's ATI target. The default value is a function
+          of the channel and the device's RUI.
+
+      azoteq,cct-increase:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 16
+        default: 0
+        description:
+          Specifies the degree to which the channel's charge cycle time is to
+          be increased, with 0 representing no increase. The maximum value is
+          limited to 4 in the case of the ULP channel, and the property is
+          unavailable entirely in the case of the Hall channel.
+
+      azoteq,proj-bias:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the bias current applied during projected-capacitance
+          sensing as follows:
+          0: 2.5 uA
+          1: 5 uA
+          2: 10 uA
+          3: 20 uA
+
+          This property is unavailable in the case of the Hall channel.
+
+      azoteq,sense-freq:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        description: |
+          Specifies the channel's sensing frequency as follows (parenthesized
+          numbers represent the frequency if 'azoteq,clk-div' is present):
+          0: 4 MHz (1 MHz)
+          1: 2 MHz (500 kHz)
+          2: 1 MHz (250 kHz)
+          3: 500 kHz (125 kHz)
+
+          This property is unavailable in the case of the Hall channel. The
+          default value is a function of the channel and the device's RUI.
+
+      azoteq,ati-band-tighten:
+        type: boolean
+        description:
+          Tightens the ATI band from 1/8 to 1/16 of the desired target (ULP and
+          generic channels only).
+
+      azoteq,proj-enable:
+        type: boolean
+        description: Enables projected-capacitance sensing (ULP channel only).
+
+      azoteq,filt-str-np-cnt:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the raw count filter strength during normal-power mode (ULP
+          and generic channels only).
+
+      azoteq,filt-str-lp-cnt:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the raw count filter strength during low-power mode (ULP and
+          generic channels only).
+
+      azoteq,filt-str-np-lta:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the long-term average filter strength during normal-power
+          mode (ULP and generic channels only).
+
+      azoteq,filt-str-lp-lta:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the long-term average filter strength during low-power mode
+          (ULP and generic channels only).
+
+      azoteq,rx-enable:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 1
+        maxItems: 8
+        items:
+          minimum: 0
+          maximum: 7
+        description:
+          Specifies the CRX pin(s) associated with the channel.
+
+          This property is unavailable in the case of the Hall channel. The
+          default value is a function of the channel and the device's RUI.
+
+      azoteq,tx-enable:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 1
+        maxItems: 8
+        items:
+          minimum: 0
+          maximum: 7
+        description:
+          Specifies the TX pin(s) associated with the channel.
+
+          This property is unavailable in the case of the Hall channel. The
+          default value is a function of the channel and the device's RUI.
+
+      azoteq,local-cap-size:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3, 4]
+        default: 0
+        description: |
+          Specifies the capacitance to be added to the channel as follows:
+          0: 0 pF
+          1: 0.5 pF
+          2: 1.0 pF
+          3: 1.5 pF
+          4: 2.0 pF
+
+          This property is unavailable in the case of the ULP or Hall channels.
+
+      azoteq,sense-mode:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 8, 9, 12, 14, 15]
+        description: |
+          Specifies the channel's sensing mode as follows:
+          0:  Self capacitance
+          1:  Projected capacitance
+          8:  Self inductance
+          9:  Mutual inductance
+          12: External
+          14: Hall effect
+          15: Temperature
+
+          This property is unavailable in the case of the ULP or Hall channels.
+          The default value is a function of the channel and the device's RUI.
+
+      azoteq,tx-freq:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the inductive sensing excitation frequency as follows
+          (parenthesized numbers represent the frequency if 'azoteq,clk-div'
+          is present):
+          0: 16 MHz (4 MHz)
+          1: 8 MHz (2 MHz)
+          2: 4 MHz (1 MHz)
+          3: 2 MHz (500 kHz)
+
+          This property is unavailable in the case of the ULP or Hall channels.
+
+      azoteq,invert-enable:
+        type: boolean
+        description:
+          Inverts the polarity of the states reported for proximity, touch and
+          deep-touch events relative to their respective thresholds (generic
+          channels only).
+
+      azoteq,comp-disable:
+        type: boolean
+        description:
+          Disables compensation for the channel (generic channels only).
+
+      azoteq,static-enable:
+        type: boolean
+        description:
+          Enables the static front-end for the channel (generic channels only).
+
+      azoteq,assoc-select:
+        $ref: /schemas/types.yaml#/definitions/string-array
+        minItems: 1
+        maxItems: 6
+        items:
+          enum:
+            - ulp-0
+            - trackpad-3x2
+            - trackpad-3x3
+            - generic-0
+            - generic-1
+            - generic-2
+            - hall
+        description:
+          Specifies the associated channels for which the channel serves as a
+          reference channel. By default, no channels are selected. This
+          property is only available for the generic channels.
+
+      azoteq,assoc-weight:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 255
+        default: 0
+        description:
+          Specifies the channel's impact weight if it acts as an associated
+          channel (0 = 0% impact, 255 = 200% impact). This property is only
+          available for the generic channels.
+
+    patternProperties:
+      "^event-(prox|touch|deep)(-alt)?$":
+        type: object
+        description:
+          Represents a proximity, touch or deep-touch event reported by the
+          channel in response to a decrease in counts. Node names suffixed with
+          '-alt' instead correspond to an increase in counts.
+
+          By default, the long-term average tracks an increase in counts such
+          that only events corresponding to a decrease in counts are reported
+          (refer to the datasheet for more information).
+
+          Specify 'azoteq,dual-direction' to freeze the long-term average when
+          the counts increase or decrease such that events of either direction
+          can be reported. Alternatively, specify 'azoteq,invert-enable' to
+          invert the polarity of the states reported by the channel.
+
+          Complementary events (e.g. event-touch and event-touch-alt) can both
+          be present and specify different key or switch codes, but not
+          different thresholds or hysteresis (if applicable).
+
+          Proximity events are unavailable in the case of the Hall channel, and
+          deep-touch events are only available for the generic channels. Unless
+          otherwise specified, default values are a function of the channel and
+          the device's RUI.
+
+        properties:
+          azoteq,thresh:
+            $ref: /schemas/types.yaml#/definitions/uint32
+            minimum: 0
+            maximum: 255
+            description: Specifies the threshold for the event.
+
+          azoteq,hyst:
+            $ref: /schemas/types.yaml#/definitions/uint32
+            minimum: 0
+            maximum: 15
+            description:
+              Specifies the hysteresis for the event (touch and deep-touch
+              events only).
+
+          linux,code:
+            $ref: /schemas/types.yaml#/definitions/uint32
+            description: Numeric key or switch code associated with the event.
+
+          linux,input-type:
+            $ref: /schemas/types.yaml#/definitions/uint32
+            enum: [1, 5]
+            description:
+              Specifies whether the event is to be interpreted as a key (1) or
+              a switch (5). By default, Hall-channel events are interpreted as
+              switches and all others are interpreted as keys.
+
+        dependencies:
+          linux,input-type: ["linux,code"]
+
+        additionalProperties: false
+
+    dependencies:
+      azoteq,assoc-weight: ["azoteq,assoc-select"]
+
+    additionalProperties: false
+
+  "^trackpad-3x[2-3]$":
+    type: object
+    description:
+      Represents all channels associated with the trackpad. The channels are
+      collectively active if the trackpad is defined and inactive otherwise.
+
+    properties:
+      azoteq,ati-exclude:
+        type: boolean
+        description:
+          Prevents the trackpad channels from participating in an ATI event
+          that is manually triggered during initialization.
+
+      azoteq,reseed-disable:
+        type: boolean
+        description:
+          Prevents the trackpad channels from being reseeded if the long-term
+          average timeout (defined in 'azoteq,timeout-lta') expires.
+
+      azoteq,meas-cap-decrease:
+        type: boolean
+        description:
+          Decreases the internal measurement capacitance from 60 pF to 15 pF.
+
+      azoteq,rx-inactive:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1]
+        default: 0
+        description: |
+          Specifies how inactive CRX pins are to be terminated as follows:
+          0: VSS
+          1: Floating
+
+      azoteq,linearize:
+        type: boolean
+        description: Inverts the polarity of the trackpad's touch state.
+
+      azoteq,dual-direction:
+        type: boolean
+        description:
+          Specifies that the trackpad's long-term averages are to freeze in
+          the presence of either increasing or decreasing counts, thereby
+          permitting events to be reported in either direction.
+
+      azoteq,filt-disable:
+        type: boolean
+        description: Disables raw count filtering for the trackpad channels.
+
+      azoteq,ati-mode:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the trackpad's ATI mode as follows:
+          0: Disabled
+          1: Semi-partial
+          2: Partial
+          3: Full
+
+      azoteq,ati-base:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 6
+        maxItems: 9
+        items:
+          minimum: 45
+          maximum: 300
+        default: [45, 45, 45, 45, 45, 45, 45, 45, 45]
+        description: Specifies each individual trackpad channel's ATI base.
+
+      azoteq,ati-target:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        multipleOf: 32
+        minimum: 0
+        maximum: 2016
+        default: 0
+        description: Specifies the trackpad's ATI target.
+
+      azoteq,cct-increase:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 4
+        default: 0
+        description:
+          Specifies the degree to which the trackpad's charge cycle time is to
+          be increased, with 0 representing no increase.
+
+      azoteq,proj-bias:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the bias current applied during projected-capacitance
+          sensing as follows:
+          0: 2.5 uA
+          1: 5 uA
+          2: 10 uA
+          3: 20 uA
+
+      azoteq,sense-freq:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the trackpad's sensing frequency as follows (parenthesized
+          numbers represent the frequency if 'azoteq,clk-div' is present):
+          0: 4 MHz (1 MHz)
+          1: 2 MHz (500 kHz)
+          2: 1 MHz (250 kHz)
+          3: 500 kHz (125 kHz)
+
+      azoteq,ati-band-tighten:
+        type: boolean
+        description:
+          Tightens the ATI band from 1/8 to 1/16 of the desired target.
+
+      azoteq,thresh:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 6
+        maxItems: 9
+        items:
+          minimum: 0
+          maximum: 255
+        default: [0, 0, 0, 0, 0, 0, 0, 0, 0]
+        description:
+          Specifies each individual trackpad channel's touch threshold.
+
+      azoteq,hyst:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 15
+        default: 0
+        description: Specifies the trackpad's touch hysteresis.
+
+      azoteq,lta-update:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3, 4, 5, 6, 7]
+        default: 0
+        description: |
+          Specifies the update rate of the trackpad's long-term average during
+          ultra-low-power mode as follows:
+          0: 2
+          1: 4
+          2: 8
+          3: 16
+          4: 32
+          5: 64
+          6: 128
+          7: 255
+
+      azoteq,filt-str-trackpad:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: Specifies the trackpad coordinate filter strength.
+
+      azoteq,filt-str-np-cnt:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the raw count filter strength during normal-power mode.
+
+      azoteq,filt-str-lp-cnt:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the raw count filter strength during low-power mode.
+
+      linux,keycodes:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 1
+        maxItems: 6
+        description: |
+          Specifies the numeric keycodes associated with each available gesture
+          in the following order (enter 0 for unused gestures):
+          0: Positive flick or swipe in X direction
+          1: Negative flick or swipe in X direction
+          2: Positive flick or swipe in Y direction
+          3: Negative flick or swipe in Y direction
+          4: Tap
+          5: Hold
+
+      azoteq,gesture-swipe:
+        type: boolean
+        description:
+          Directs the device to interpret axial gestures as a swipe (finger
+          remains on trackpad) instead of a flick (finger leaves trackpad).
+
+      azoteq,timeout-tap-ms:
+        multipleOf: 16
+        minimum: 0
+        maximum: 4080
+        default: 0
+        description:
+          Specifies the length of time (in ms) within which a trackpad touch
+          must be released in order to be interpreted as a tap.
+
+      azoteq,timeout-swipe-ms:
+        multipleOf: 16
+        minimum: 0
+        maximum: 4080
+        default: 0
+        description:
+          Specifies the length of time (in ms) within which an axial gesture
+          must be completed in order to be interpreted as a flick or swipe.
+
+      azoteq,thresh-swipe:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 255
+        default: 0
+        description:
+          Specifies the number of points across which an axial gesture must
+          travel in order to be interpreted as a flick or swipe.
+
+    dependencies:
+      azoteq,gesture-swipe: ["linux,keycodes"]
+      azoteq,timeout-tap-ms: ["linux,keycodes"]
+      azoteq,timeout-swipe-ms: ["linux,keycodes"]
+      azoteq,thresh-swipe: ["linux,keycodes"]
+
+    additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - "#address-cells"
+  - "#size-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/input/input.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            iqs626a@44 {
+                    #address-cells = <1>;
+                    #size-cells = <0>;
+
+                    compatible = "azoteq,iqs626a";
+                    reg = <0x44>;
+                    interrupt-parent = <&gpio>;
+                    interrupts = <17 IRQ_TYPE_LEVEL_LOW>;
+
+                    azoteq,rate-np-ms = <16>;
+                    azoteq,rate-lp-ms = <160>;
+
+                    azoteq,timeout-pwr-ms = <2560>;
+                    azoteq,timeout-lta-ms = <32768>;
+
+                    ulp-0 {
+                            azoteq,meas-cap-decrease;
+
+                            azoteq,ati-base = <75>;
+                            azoteq,ati-target = <1024>;
+
+                            azoteq,rx-enable = <2>, <3>, <4>,
+                                               <5>, <6>, <7>;
+
+                            event-prox {
+                                    linux,code = <KEY_POWER>;
+                            };
+                    };
+
+                    trackpad-3x3 {
+                            azoteq,filt-str-np-cnt = <1>;
+                            azoteq,filt-str-lp-cnt = <1>;
+
+                            azoteq,hyst = <4>;
+                            azoteq,thresh = <35>, <40>, <40>,
+                                            <38>, <33>, <38>,
+                                            <35>, <35>, <35>;
+
+                            azoteq,ati-mode = <3>;
+                            azoteq,ati-base = <195>, <195>, <195>,
+                                              <195>, <195>, <195>,
+                                              <195>, <195>, <195>;
+                            azoteq,ati-target = <512>;
+
+                            azoteq,proj-bias = <1>;
+                            azoteq,sense-freq = <2>;
+
+                            linux,keycodes = <KEY_VOLUMEUP>,
+                                             <KEY_VOLUMEDOWN>,
+                                             <KEY_NEXTSONG>,
+                                             <KEY_PREVIOUSSONG>,
+                                             <KEY_PLAYPAUSE>,
+                                             <KEY_STOPCD>;
+
+                            azoteq,gesture-swipe;
+                            azoteq,timeout-swipe-ms = <800>;
+                            azoteq,timeout-tap-ms = <400>;
+                            azoteq,thresh-swipe = <40>;
+                    };
+
+                    /*
+                     * Preserve the default register settings for
+                     * the temperature-tracking channel leveraged
+                     * by reset user interface (RUI) 1.
+                     *
+                     * Scalar properties (e.g. ATI mode) are left
+                     * untouched by simply omitting them; boolean
+                     * properties must be specified explicitly as
+                     * needed.
+                     */
+                    generic-2 {
+                            azoteq,reseed-disable;
+                            azoteq,meas-cap-decrease;
+                            azoteq,dual-direction;
+                            azoteq,comp-disable;
+                            azoteq,static-enable;
+                    };
+
+                    hall {
+                            azoteq,reseed-disable;
+                            azoteq,meas-cap-decrease;
+
+                            event-touch {
+                                    linux,code = <SW_LID>;
+                            };
+                    };
+            };
+    };
+
+...
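
To show how the per-event child nodes above are meant to be consumed, a
driver would typically translate each node's 'linux,code' and
'linux,input-type' values into an input capability. A rough sketch under
that assumption (not the actual iqs626a driver code)::

    #include <linux/input.h>
    #include <linux/property.h>

    /*
     * Register one event child node; the type defaults as the schema
     * describes (EV_SW == 5 for the Hall channel, EV_KEY == 1 otherwise).
     */
    static int iqs626_setup_event(struct input_dev *input,
                                  struct fwnode_handle *event, bool hall)
    {
            u32 code, type = hall ? EV_SW : EV_KEY;
            int error;

            error = fwnode_property_read_u32(event, "linux,code", &code);
            if (error)
                    return error;

            fwnode_property_read_u32(event, "linux,input-type", &type);
            input_set_capability(input, type, code);
            return 0;
    }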
diff --git a/Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml b/Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml
new file mode 100644 (file)
index 0000000..b5f3772
--- /dev/null
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/azoteq,iqs5xx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS550/572/525 Trackpad/Touchscreen Controller
+
+maintainers:
+  - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+  The Azoteq IQS550, IQS572 and IQS525 trackpad and touchscreen controllers
+  employ projected-capacitance sensing and can track up to five independent
+  contacts.
+
+  Link to datasheet: https://www.azoteq.com/
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    enum:
+      - azoteq,iqs550
+      - azoteq,iqs572
+      - azoteq,iqs525
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  wakeup-source: true
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            touchscreen@74 {
+                    compatible = "azoteq,iqs550";
+                    reg = <0x74>;
+                    interrupt-parent = <&gpio>;
+                    interrupts = <27 IRQ_TYPE_LEVEL_HIGH>;
+                    reset-gpios = <&gpio 22 (GPIO_ACTIVE_LOW |
+                                             GPIO_PUSH_PULL)>;
+
+                    touchscreen-size-x = <800>;
+                    touchscreen-size-y = <480>;
+            };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml b/Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml
new file mode 100644 (file)
index 0000000..942562f
--- /dev/null
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/hycon,hy46xx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Hycon HY46XX series touchscreen controller bindings
+
+description: |
+  There are 6 variants of the chip for various touch panel sizes and cover lens materials:
+   Glass: 0.3mm--4.0mm
+    PET/PMMA: 0.2mm--2.0mm
+    HY4613(B)-N048  < 6"
+    HY4614(B)-N068  7" .. 10.1"
+    HY4621-NS32  < 5"
+    HY4623-NS48  5.1" .. 7"
+   Glass: 0.3mm--8.0mm
+    PET/PMMA: 0.2mm--4.0mm
+    HY4633(B)-N048  < 6"
+    HY4635(B)-N048  < 7" .. 10.1"
+
+maintainers:
+  - Giulio Benetti <giulio.benetti@benettiengineering.com>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    enum:
+      - hycon,hy4613
+      - hycon,hy4614
+      - hycon,hy4621
+      - hycon,hy4623
+      - hycon,hy4633
+      - hycon,hy4635
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  vcc-supply: true
+
+  hycon,threshold:
+    description: Allows setting the sensitivity in the range from 0 to 255.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    minimum: 0
+    maximum: 255
+
+  hycon,glove-enable:
+    type: boolean
+    description: Allows enabling glove setting.
+
+  hycon,report-speed-hz:
+    description: Allows setting the report speed in Hertz.
+    minimum: 1
+    maximum: 255
+
+  hycon,noise-filter-enable:
+    type: boolean
+    description: Allows enabling power noise filter.
+
+  hycon,filter-data:
+    description: Allows setting how many samples to discard before reporting
+                 touch, in the range from 0 to 5.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    minimum: 0
+    maximum: 5
+
+  hycon,gain:
+    description: Allows setting the sensitivity distance in the range from 0 to 5.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    minimum: 0
+    maximum: 5
+
+  hycon,edge-offset:
+    description: Allows setting the edge compensation in the range from 0 to 16.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    minimum: 0
+    maximum: 16
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-fuzz-x: true
+  touchscreen-fuzz-y: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+  interrupt-controller: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    i2c {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      touchscreen@1c {
+        compatible = "hycon,hy4633";
+        reg = <0x1c>;
+        interrupt-parent = <&gpio2>;
+        interrupts = <5 IRQ_TYPE_EDGE_FALLING>;
+        reset-gpios = <&gpio2 6 GPIO_ACTIVE_LOW>;
+      };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml b/Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml
new file mode 100644 (file)
index 0000000..a190e7b
--- /dev/null
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/ilitek_ts_i2c.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ilitek I2C Touchscreen Controller
+
+maintainers:
+  - Dmitry Torokhov <dmitry.torokhov@gmail.com>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    enum:
+      - ilitek,ili2130
+      - ilitek,ili2131
+      - ilitek,ili2132
+      - ilitek,ili2316
+      - ilitek,ili2322
+      - ilitek,ili2323
+      - ilitek,ili2326
+      - ilitek,ili2520
+      - ilitek,ili2521
+
+  reg:
+    const: 0x41
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  wakeup-source:
+    type: boolean
+    description: Touchscreen can be used as a wakeup source.
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - reset-gpios
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/gpio/gpio.h>
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        touchscreen@41 {
+            compatible = "ilitek,ili2520";
+            reg = <0x41>;
+
+            interrupt-parent = <&gpio1>;
+            interrupts = <7 IRQ_TYPE_LEVEL_LOW>;
+            reset-gpios = <&gpio1 8 GPIO_ACTIVE_LOW>;
+            touchscreen-inverted-y;
+            wakeup-source;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt b/Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt
deleted file mode 100644 (file)
index efa0820..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-Azoteq IQS550/572/525 Trackpad/Touchscreen Controller
-
-Required properties:
-
-- compatible                   : Must be equal to one of the following:
-                                 "azoteq,iqs550"
-                                 "azoteq,iqs572"
-                                 "azoteq,iqs525"
-
-- reg                          : I2C slave address for the device.
-
-- interrupts                   : GPIO to which the device's active-high RDY
-                                 output is connected (see [0]).
-
-- reset-gpios                  : GPIO to which the device's active-low NRST
-                                 input is connected (see [1]).
-
-Optional properties:
-
-- touchscreen-min-x            : See [2].
-
-- touchscreen-min-y            : See [2].
-
-- touchscreen-size-x           : See [2]. If this property is omitted, the
-                                 maximum x-coordinate is specified by the
-                                 device's "X Resolution" register.
-
-- touchscreen-size-y           : See [2]. If this property is omitted, the
-                                 maximum y-coordinate is specified by the
-                                 device's "Y Resolution" register.
-
-- touchscreen-max-pressure     : See [2]. Pressure is expressed as the sum of
-                                 the deltas across all channels impacted by a
-                                 touch event. A channel's delta is calculated
-                                 as its count value minus a reference, where
-                                 the count value is inversely proportional to
-                                 the channel's capacitance.
-
-- touchscreen-fuzz-x           : See [2].
-
-- touchscreen-fuzz-y           : See [2].
-
-- touchscreen-fuzz-pressure    : See [2].
-
-- touchscreen-inverted-x       : See [2]. Inversion is applied relative to that
-                                 which may already be specified by the device's
-                                 FLIP_X and FLIP_Y register fields.
-
-- touchscreen-inverted-y       : See [2]. Inversion is applied relative to that
-                                 which may already be specified by the device's
-                                 FLIP_X and FLIP_Y register fields.
-
-- touchscreen-swapped-x-y      : See [2]. Swapping is applied relative to that
-                                 which may already be specified by the device's
-                                 SWITCH_XY_AXIS register field.
-
-[0]: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-[1]: Documentation/devicetree/bindings/gpio/gpio.txt
-[2]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt
-
-Example:
-
-       &i2c1 {
-               /* ... */
-
-               touchscreen@74 {
-                       compatible = "azoteq,iqs550";
-                       reg = <0x74>;
-                       interrupt-parent = <&gpio>;
-                       interrupts = <17 4>;
-                       reset-gpios = <&gpio 27 1>;
-
-                       touchscreen-size-x = <640>;
-                       touchscreen-size-y = <480>;
-
-                       touchscreen-max-pressure = <16000>;
-               };
-
-               /* ... */
-       };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml b/Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml
new file mode 100644 (file)
index 0000000..6236688
--- /dev/null
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/melfas,mms114.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Melfas MMS114 family touchscreen controller bindings
+
+maintainers:
+  - Linus Walleij <linus.walleij@linaro.org>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  $nodename:
+    pattern: "^touchscreen(@.*)?$"
+
+  compatible:
+    items:
+      - enum:
+          - melfas,mms114
+          - melfas,mms134s
+          - melfas,mms136
+          - melfas,mms152
+          - melfas,mms345l
+
+  reg:
+    description: I2C address
+
+  clock-frequency:
+    description: I2C client clock frequency, defined for the host
+    minimum: 100000
+    maximum: 400000
+
+  interrupts:
+    maxItems: 1
+
+  avdd-supply:
+    description: Analog power supply regulator on AVDD pin
+
+  vdd-supply:
+    description: Digital power supply regulator on VDD pin
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-fuzz-x: true
+  touchscreen-fuzz-y: true
+  touchscreen-fuzz-pressure: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+  touchscreen-max-pressure: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - touchscreen-size-x
+  - touchscreen-size-y
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    i2c {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      touchscreen@48 {
+        compatible = "melfas,mms114";
+        reg = <0x48>;
+        interrupt-parent = <&gpio>;
+        interrupts = <39 IRQ_TYPE_EDGE_FALLING>;
+        avdd-supply = <&ldo1_reg>;
+        vdd-supply = <&ldo2_reg>;
+        touchscreen-size-x = <720>;
+        touchscreen-size-y = <1280>;
+        touchscreen-fuzz-x = <10>;
+        touchscreen-fuzz-y = <10>;
+        touchscreen-fuzz-pressure = <10>;
+        touchscreen-inverted-x;
+        touchscreen-inverted-y;
+      };
+    };
+
+...
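
The generic touchscreen properties accepted above (sizes, fuzz, inversion
and axis swap) are typically applied in a driver through the shared helper
from linux/input/touchscreen.h; a hedged sketch (the real mms114 driver may
structure this differently)::

    #include <linux/input/touchscreen.h>

    static void mms114_apply_dt(struct input_dev *input,
                                struct touchscreen_properties *props)
    {
            /* Applies touchscreen-size-*, -fuzz-*, -inverted-* and
             * -swapped-x-y from the node backing the input device. */
            touchscreen_parse_properties(input, true, props);
    }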
diff --git a/Documentation/devicetree/bindings/input/touchscreen/mms114.txt b/Documentation/devicetree/bindings/input/touchscreen/mms114.txt
deleted file mode 100644 (file)
index 707234c..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-* MELFAS MMS114/MMS152/MMS345L touchscreen controller
-
-Required properties:
-- compatible: should be one of:
-       - "melfas,mms114"
-       - "melfas,mms152"
-       - "melfas,mms345l"
-- reg: I2C address of the chip
-- interrupts: interrupt to which the chip is connected
-- touchscreen-size-x: See [1]
-- touchscreen-size-y: See [1]
-
-Optional properties:
-- touchscreen-fuzz-x: See [1]
-- touchscreen-fuzz-y: See [1]
-- touchscreen-fuzz-pressure: See [1]
-- touchscreen-inverted-x: See [1]
-- touchscreen-inverted-y: See [1]
-- touchscreen-swapped-x-y: See [1]
-
-[1]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt
-
-Example:
-
-       i2c@00000000 {
-               /* ... */
-
-               touchscreen@48 {
-                       compatible = "melfas,mms114";
-                       reg = <0x48>;
-                       interrupts = <39 0>;
-                       touchscreen-size-x = <720>;
-                       touchscreen-size-y = <1280>;
-                       touchscreen-fuzz-x = <10>;
-                       touchscreen-fuzz-y = <10>;
-                       touchscreen-fuzz-pressure = <10>;
-                       touchscreen-inverted-x;
-                       touchscreen-inverted-y;
-               };
-
-               /* ... */
-       };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml b/Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml
new file mode 100644 (file)
index 0000000..3a42c23
--- /dev/null
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/mstar,msg2638.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MStar msg2638 touchscreen controller bindings
+
+maintainers:
+  - Vincent Knecht <vincent.knecht@mailoo.org>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    const: mstar,msg2638
+
+  reg:
+    const: 0x26
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  vdd-supply:
+    description: Power supply regulator for the chip
+
+  vddio-supply:
+    description: Power supply regulator for the I2C bus
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - reset-gpios
+  - touchscreen-size-x
+  - touchscreen-size-y
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+    i2c {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      touchscreen@26 {
+        compatible = "mstar,msg2638";
+        reg = <0x26>;
+        interrupt-parent = <&msmgpio>;
+        interrupts = <13 IRQ_TYPE_EDGE_FALLING>;
+        reset-gpios = <&msmgpio 100 GPIO_ACTIVE_LOW>;
+        pinctrl-names = "default";
+        pinctrl-0 = <&ts_int_reset_default>;
+        vdd-supply = <&pm8916_l17>;
+        vddio-supply = <&pm8916_l5>;
+        touchscreen-size-x = <2048>;
+        touchscreen-size-y = <2048>;
+      };
+    };
+
+...
index df5d8d1..160ff4b 100644 (file)
@@ -22,6 +22,9 @@ properties:
   reg:
     maxItems: 1
 
+  interrupts:
+    maxItems: 1
+
   interrupt-controller: true
 
 required:
@@ -29,6 +32,7 @@ required:
   - compatible
   - reg
   - interrupt-controller
+  - interrupts
 
 additionalProperties: false
 
index fe7c4cb..dd1a5ce 100644 (file)
@@ -193,23 +193,35 @@ required:
   - interrupts
   - clocks
   - power-domains
-  - resets
-
-if:
-  properties:
-    compatible:
-      contains:
-        enum:
-          - renesas,vin-r8a7778
-          - renesas,vin-r8a7779
-          - renesas,rcar-gen2-vin
-then:
-  required:
-    - port
-else:
-  required:
-    - renesas,id
-    - ports
+
+allOf:
+  - if:
+      not:
+        properties:
+          compatible:
+            contains:
+              enum:
+                - renesas,vin-r8a7778
+                - renesas,vin-r8a7779
+    then:
+      required:
+        - resets
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - renesas,vin-r8a7778
+              - renesas,vin-r8a7779
+              - renesas,rcar-gen2-vin
+    then:
+      required:
+        - port
+    else:
+      required:
+        - renesas,id
+        - ports
 
 additionalProperties: false
 
diff --git a/Documentation/devicetree/bindings/mtd/tango-nand.txt b/Documentation/devicetree/bindings/mtd/tango-nand.txt
deleted file mode 100644 (file)
index 91c8420..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-Sigma Designs Tango4 NAND Flash Controller (NFC)
-
-Required properties:
-
-- compatible: "sigma,smp8758-nand"
-- reg: address/size of nfc_reg, nfc_mem, and pbus_reg
-- dmas: reference to the DMA channel used by the controller
-- dma-names: "rxtx"
-- clocks: reference to the system clock
-- #address-cells: <1>
-- #size-cells: <0>
-
-Children nodes represent the available NAND chips.
-See Documentation/devicetree/bindings/mtd/nand-controller.yaml for generic bindings.
-
-Example:
-
-       nandc: nand-controller@2c000 {
-               compatible = "sigma,smp8758-nand";
-               reg = <0x2c000 0x30>, <0x2d000 0x800>, <0x20000 0x1000>;
-               dmas = <&dma0 3>;
-               dma-names = "rxtx";
-               clocks = <&clkgen SYS_CLK>;
-               #address-cells = <1>;
-               #size-cells = <0>;
-
-               nand@0 {
-                       reg = <0>; /* CS0 */
-                       nand-ecc-strength = <14>;
-                       nand-ecc-step-size = <1024>;
-               };
-
-               nand@1 {
-                       reg = <1>; /* CS1 */
-                       nand-ecc-strength = <14>;
-                       nand-ecc-step-size = <1024>;
-               };
-       };
index fe72a55..005868f 100644 (file)
@@ -51,12 +51,12 @@ properties:
 
   clocks:
     minItems: 1
-    maxItems: 2
     items:
       - description: AVB functional clock
       - description: Optional TXC reference clock
 
   clock-names:
+    minItems: 1
     items:
       - const: fck
       - const: refclk
index 4a2bcc0..8fdfbc7 100644 (file)
@@ -17,6 +17,7 @@ allOf:
 properties:
   compatible:
     oneOf:
+      - const: renesas,pcie-r8a7779       # R-Car H1
       - items:
           - enum:
               - renesas,pcie-r8a7742      # RZ/G1H
@@ -74,7 +75,16 @@ required:
   - clocks
   - clock-names
   - power-domains
-  - resets
+
+if:
+  not:
+    properties:
+      compatible:
+        contains:
+          const: renesas,pcie-r8a7779
+then:
+  required:
+    - resets
 
 unevaluatedProperties: false
 
diff --git a/Documentation/devicetree/bindings/pci/tango-pcie.txt b/Documentation/devicetree/bindings/pci/tango-pcie.txt
deleted file mode 100644 (file)
index 2446838..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-Sigma Designs Tango PCIe controller
-
-Required properties:
-
-- compatible: "sigma,smp8759-pcie"
-- reg: address/size of PCI configuration space, address/size of register area
-- bus-range: defined by size of PCI configuration space
-- device_type: "pci"
-- #size-cells: <2>
-- #address-cells: <3>
-- msi-controller
-- ranges: translation from system to bus addresses
-- interrupts: spec for misc interrupts, spec for MSI
-
-Example:
-
-       pcie@2e000 {
-               compatible = "sigma,smp8759-pcie";
-               reg = <0x50000000 0x400000>, <0x2e000 0x100>;
-               bus-range = <0 3>;
-               device_type = "pci";
-               #size-cells = <2>;
-               #address-cells = <3>;
-               msi-controller;
-               ranges = <0x02000000 0x0 0x00400000  0x50400000  0x0 0x3c00000>;
-               interrupts =
-                       <54 IRQ_TYPE_LEVEL_HIGH>, /* misc interrupts */
-                       <55 IRQ_TYPE_LEVEL_HIGH>; /* MSI */
-       };
diff --git a/Documentation/devicetree/bindings/riscv/microchip.yaml b/Documentation/devicetree/bindings/riscv/microchip.yaml
new file mode 100644 (file)
index 0000000..3f981e8
--- /dev/null
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/riscv/microchip.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip PolarFire SoC-based boards device tree bindings
+
+maintainers:
+  - Cyril Jean <Cyril.Jean@microchip.com>
+  - Lewis Hanly <lewis.hanly@microchip.com>
+
+description:
+  Microchip PolarFire SoC-based boards
+
+properties:
+  $nodename:
+    const: '/'
+  compatible:
+    items:
+      - enum:
+          - microchip,mpfs-icicle-kit
+      - const: microchip,mpfs
+
+additionalProperties: true
+
+...
index f0506a9..41f57c4 100644 (file)
@@ -100,11 +100,6 @@ properties:
               - mediatek,mt7623-btif
           - const: mediatek,mtk-btif
       - items:
-          - enum:
-              - mediatek,mt7622-btif
-              - mediatek,mt7623-btif
-          - const: mediatek,mtk-btif
-      - items:
           - const: mrvl,mmp-uart
           - const: intel,xscale-uart
       - items:
index 99a72a4..b868cef 100644 (file)
@@ -495,6 +495,8 @@ patternProperties:
     description: Shenzhen Hugsun Technology Co. Ltd.
   "^hwacom,.*":
     description: HwaCom Systems Inc.
+  "^hycon,.*":
+    description: Hycon Technology Corp.
   "^hydis,.*":
     description: Hydis Technologies
   "^hyundai,.*":
index decc68c..606eed8 100644 (file)
@@ -2,7 +2,7 @@
 VFIO - "Virtual Function I/O" [1]_
 ==================================
 
-Many modern system now provide DMA and interrupt remapping facilities
+Many modern systems now provide DMA and interrupt remapping facilities
 to help ensure I/O devices behave within the boundaries they've been
 allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d,
 POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC
index 4e264c1..df4b711 100644 (file)
@@ -99,6 +99,12 @@ native::
       }
   }
 
+Note that historically ACPI has no means of expressing GPIO polarity and
+thus the SPISerialBus() resource defines it on a per-chip basis. In order
+to avoid a chain of negations, the GPIO polarity is considered to be
+Active High. Even in cases where _DSD() is involved (see the example
+above), the GPIO CS polarity must be defined as Active High to avoid
+ambiguity.
+
 Other supported properties
 ==========================
 
index 810ae02..5865748 100644 (file)
@@ -107,13 +107,17 @@ example below:
                },
        };
 
-       static const struct property_entry rotary_encoder_properties[] __initconst = {
+       static const struct property_entry rotary_encoder_properties[] = {
                PROPERTY_ENTRY_U32("rotary-encoder,steps-per-period", 24),
                PROPERTY_ENTRY_U32("linux,axis",                      ABS_X),
                PROPERTY_ENTRY_U32("rotary-encoder,relative_axis",    0),
                { },
        };
 
+       static const struct software_node rotary_encoder_node = {
+               .properties = rotary_encoder_properties,
+       };
+
        static struct platform_device rotary_encoder_device = {
                .name           = "rotary-encoder",
                .id             = 0,
@@ -122,7 +126,7 @@ example below:
        ...
 
        gpiod_add_lookup_table(&rotary_encoder_gpios);
-       device_add_properties(&rotary_encoder_device, rotary_encoder_properties);
+       device_add_software_node(&rotary_encoder_device.dev, &rotary_encoder_node);
        platform_device_register(&rotary_encoder_device);
 
        ...
index 95803e2..af5934c 100644 (file)
@@ -71,7 +71,7 @@ The possible values of ``type`` are::
        #define JS_EVENT_INIT           0x80    /* initial state of device */
 
 As mentioned above, the driver will issue synthetic JS_EVENT_INIT ORed
-events on open. That is, if it's issuing a INIT BUTTON event, the
+events on open. That is, if it's issuing an INIT BUTTON event, the
 current type value will be::
 
        int type = JS_EVENT_BUTTON | JS_EVENT_INIT;     /* 0x81 */
@@ -100,8 +100,8 @@ is, you have both an axis 0 and a button 0). Generally,
         =============== =======
 
 Hats vary from one joystick type to another. Some can be moved in 8
-directions, some only in 4, The driver, however, always reports a hat as two
-independent axis, even if the hardware doesn't allow independent movement.
+directions, some only in 4. The driver, however, always reports a hat as two
+independent axes, even if the hardware doesn't allow independent movement.
 
 
 js_event.value
@@ -188,10 +188,10 @@ One reason for emptying the queue is that if it gets full you'll start
 missing events since the queue is finite, and older events will get
 overwritten.
 
-The other reason is that you want to know all what happened, and not
+The other reason is that you want to know all that happened, and not
 delay the processing till later.
 
-Why can get the queue full? Because you don't empty the queue as
+Why can the queue get full? Because you don't empty the queue as
 mentioned, or because too much time elapses from one read to another
 and more events than the queue can store get generated. Note that
 high system load may space those reads even further apart.
@@ -277,7 +277,7 @@ to be in the stable part of the API, and therefore may change without
 warning in following releases of the driver.
 
 Both JSIOCSCORR and JSIOCGCORR expect &js_corr to be able to hold
-information for all axis. That is, struct js_corr corr[MAX_AXIS];
+information for all axes. That is, struct js_corr corr[MAX_AXIS];
 
 struct js_corr is defined as::
 
@@ -328,7 +328,7 @@ To test the state of the buttons,
        second_button_state = js.buttons & 2;
 
 The axis values do not have a defined range in the original 0.x driver,
-except for that the values are non-negative. The 1.2.8+ drivers use a
+except that the values are non-negative. The 1.2.8+ drivers use a
 fixed range for reporting the values, 1 being the minimum, 128 the
 center, and 255 the maximum value.
 
index 9746fd7..f615906 100644 (file)
@@ -133,15 +133,15 @@ And add a line to your rc script executing that file::
 This way, after the next reboot your joystick will remain calibrated. You
 can also add the ``jscal -p`` line to your shutdown script.
 
-HW specific driver information
-==============================
+Hardware-specific driver information
+====================================
 
 In this section each of the separate hardware-specific drivers is described.
 
 Analog joysticks
 ----------------
 
-The analog.c uses the standard analog inputs of the gameport, and thus
+The analog.c driver uses the standard analog inputs of the gameport, and thus
 supports all standard joysticks and gamepads. It uses a very advanced
 routine for this, allowing for data precision that can't be found on any
 other system.
@@ -266,7 +266,7 @@ to:
 * Logitech WingMan Extreme Digital 3D
 
 ADI devices are autodetected, and the driver supports up to two (any
-combination of) devices on a single gameport, using an Y-cable or chained
+combination of) devices on a single gameport, using a Y-cable or chained
 together.
 
 Logitech WingMan Joystick, Logitech WingMan Attack, Logitech WingMan
@@ -288,7 +288,7 @@ supports:
 * Gravis Xterminator DualControl
 
 All these devices are autodetected, and you can even use any combination
-of up to two of these pads either chained together or using an Y-cable on a
+of up to two of these pads either chained together or using a Y-cable on a
 single gameport.
 
 GrIP MultiPort isn't supported yet. Gravis Stinger is a serial device and is
@@ -311,7 +311,7 @@ allow connecting analog joysticks to them, you'll need to load the analog
 driver as well to handle the attached joysticks.
 
 The trackball should work with USB mousedev module as a normal mouse. See
-the USB documentation for how to setup an USB mouse.
+the USB documentation for how to set up a USB mouse.
 
 ThrustMaster DirectConnect (BSP)
 --------------------------------
@@ -332,7 +332,7 @@ If you have one of these, contact me.
 
 TMDC devices are autodetected, and thus no parameters to the module
 are needed. Up to two TMDC devices can be connected to one gameport, using
-an Y-cable.
+a Y-cable.
 
 Creative Labs Blaster
 ---------------------
@@ -342,7 +342,7 @@ the:
 
 * Creative Blaster GamePad Cobra
 
-Up to two of these can be used on a single gameport, using an Y-cable.
+Up to two of these can be used on a single gameport, using a Y-cable.
 
 Genius Digital joysticks
 ------------------------
@@ -381,7 +381,7 @@ card, 16 in case you have two in your system.
 Trident 4DWave / Aureal Vortex
 ------------------------------
 
-Soundcards with a Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipsets
+Soundcards with a Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipset
 provide an "Enhanced Game Port" mode where the soundcard handles polling the
 joystick.  This mode is supported by the pcigame.c module. Once loaded, the
 analog driver can use the enhanced features of these gameports.
@@ -454,7 +454,7 @@ Devices currently supported by spaceball.c are:
 * SpaceTec SpaceBall 4000 FLX
 
 In addition to having the spaceorb/spaceball and serport modules in the
-kernel, you also need to attach a serial port to it. to do that, run the
+kernel, you also need to attach a serial port to it. To do that, run the
 inputattach program::
 
        inputattach --spaceorb /dev/tts/x &
@@ -466,7 +466,7 @@ or::
 where /dev/tts/x is the serial port which the device is connected to. After
 doing this, the device will be reported and will start working.
 
-There is one caveat with the SpaceOrb. The button #6, the on the bottom
+There is one caveat with the SpaceOrb. The button #6, the one on the bottom
 side of the orb, although reported as an ordinary button, causes internal
 recentering of the spaceorb, moving the zero point to the position in which
 the ball is at the moment of pressing the button. So, think first before
@@ -500,7 +500,7 @@ joy-magellan module. It currently supports only the:
 * Magellan 3D
 * Space Mouse
 
-models, the additional buttons on the 'Plus' versions are not supported yet.
+models; the additional buttons on the 'Plus' versions are not supported yet.
 
 To use it, you need to attach the serial port to the driver using the::
 
@@ -575,7 +575,7 @@ FAQ
 :A: The device files don't exist. Create them (see section 2.2).
 
 :Q: Is it possible to connect my old Atari/Commodore/Amiga/console joystick
-    or pad that uses a 9-pin D-type cannon connector to the serial port of my
+    or pad that uses a 9-pin D-type Cannon connector to the serial port of my
     PC?
 :A: Yes, it is possible, but it'll burn your serial port or the pad. It
     won't work, of course.
index dac1771..d3a8557 100644 (file)
@@ -48,7 +48,6 @@ quota-tools            3.09             quota -V
 PPP                    2.4.0            pppd --version
 nfs-utils              1.0.5            showmount --version
 procps                 3.2.0            ps --version
-oprofile               0.9              oprofiled --version
 udev                   081              udevd --version
 grub                   0.93             grub --version || grub-install --version
 mcelog                 0.6              mcelog --version
index 6e6e394..ea915c1 100644 (file)
@@ -6,6 +6,7 @@ RISC-V architecture
     :maxdepth: 1
 
     boot-image-header
+    vm-layout
     pmu
     patch-acceptance
 
diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst
new file mode 100644 (file)
index 0000000..329d320
--- /dev/null
@@ -0,0 +1,63 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================
+Virtual Memory Layout on RISC-V Linux
+=====================================
+
+:Author: Alexandre Ghiti <alex@ghiti.fr>
+:Date: 12 February 2021
+
+This document describes the virtual memory layout used by the RISC-V Linux
+Kernel.
+
+RISC-V Linux Kernel 32bit
+=========================
+
+RISC-V Linux Kernel SV32
+------------------------
+
+TODO
+
+RISC-V Linux Kernel 64bit
+=========================
+
+The RISC-V privileged architecture document states that 64bit addresses
+"must have bits 63–48 all equal to bit 47, or else a page-fault exception will
+occur." This splits the virtual address space into two halves separated by a
+very big hole: the lower half is where userspace resides, and the upper half
+is where the RISC-V Linux Kernel resides.
+
+RISC-V Linux Kernel SV39
+------------------------
+
+::
+
+  ========================================================================================================================
+      Start addr    |   Offset   |     End addr     |  Size   | VM area description
+  ========================================================================================================================
+                    |            |                  |         |
+   0000000000000000 |    0       | 0000003fffffffff |  256 GB | user-space virtual memory, different per mm
+  __________________|____________|__________________|_________|___________________________________________________________
+                    |            |                  |         |
+   0000004000000000 | +256    GB | ffffffbfffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
+                    |            |                  |         |     virtual memory addresses up to the -256 GB
+                    |            |                  |         |     starting offset of kernel mappings.
+  __________________|____________|__________________|_________|___________________________________________________________
+                                                              |
+                                                              | Kernel-space virtual memory, shared between all processes:
+  ____________________________________________________________|___________________________________________________________
+                    |            |                  |         |
+   ffffffc000000000 | -256    GB | ffffffc7ffffffff |   32 GB | kasan
+   ffffffcefee00000 | -196    GB | ffffffcefeffffff |    2 MB | fixmap
+   ffffffceff000000 | -196    GB | ffffffceffffffff |   16 MB | PCI io
+   ffffffcf00000000 | -196    GB | ffffffcfffffffff |    4 GB | vmemmap
+   ffffffd000000000 | -192    GB | ffffffdfffffffff |   64 GB | vmalloc/ioremap space
+   ffffffe000000000 | -128    GB | ffffffff7fffffff |  124 GB | direct mapping of all physical memory
+  __________________|____________|__________________|_________|____________________________________________________________
+                                                              |
+                                                              |
+  ____________________________________________________________|____________________________________________________________
+                    |            |                  |         |
+   ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | modules
+   ffffffff80000000 |   -2    GB | ffffffffffffffff |    2 GB | kernel, BPF
+  __________________|____________|__________________|_________|____________________________________________________________
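
The "canonical" rule quoted above amounts to sign-extending from bit 47; a
small sketch of the check (the helper name is illustrative)::

    #include <stdbool.h>
    #include <stdint.h>

    /* True iff bits 63-48 all equal bit 47, as required above. */
    static bool va_is_canonical(uint64_t va)
    {
            /* shift out the top 16 bits, then sign-extend from bit 47 */
            return ((int64_t)(va << 16) >> 16) == (int64_t)va;
    }
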
index 14ea2f2..84dcdcd 100644 (file)
@@ -74,7 +74,7 @@ for a given topology level by creating a sched_domain_topology_level array and
 calling set_sched_topology() with this array as the parameter.
 
 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
+CONFIG_SCHED_DEBUG and adding 'sched_verbose' to your cmdline. If you
 forgot to tweak your cmdline, you can also flip the
 /sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
 the sched domains which should catch most possible errors (described above). It
index cc883f8..87d0818 100644 (file)
@@ -51,7 +51,6 @@ quota-tools            3.09               quota -V
 PPP                    2.4.0              pppd --version
 nfs-utils              1.0.5              showmount --version
 procps                 3.2.0              ps --version
-oprofile               0.9                oprofiled --version
 udev                   081                udevd --version
 grub                   0.93               grub --version || grub-install --version
 mcelog                 0.6                mcelog --version
index ee6b20c..d56d6b7 100644 (file)
+.. SPDX-License-Identifier: GPL-2.0
+
 .. raw:: latex
 
        \renewcommand\thesection*
        \renewcommand\thesubsection*
 
+.. _linux_doc_zh:
+
 中文翻译
 ========
 
-这些手册包含有关如何开发内核的整体信息。内核社区非常庞大,一年下来有数千名开发
-人员做出贡献。 与任何大型社区一样,知道如何完成任务将使得更改合并的过程变得更
-加容易。
 
-翻译计划:
-内核中文文档欢迎任何翻译投稿,特别是关于内核用户和管理员指南部分。
+.. note::
+
+   **翻译计划:**
+   内核中文文档欢迎任何翻译投稿,特别是关于内核用户和管理员指南部分。
+
+许可证文档
+----------
+
+下面的文档介绍了Linux内核源代码的许可证(GPLv2)、如何在源代码树中正确标记
+单个文件的许可证、以及指向完整许可证文本的链接。
+
+* Documentation/translations/zh_CN/process/license-rules.rst
+
+用户文档
+--------
+
+下面的手册是为内核用户编写的——即那些试图让它在给定系统上以最佳方式工作的
+用户。
 
 .. toctree::
    :maxdepth: 2
 
    admin-guide/index
+
+TODOList:
+
+* kbuild/index
+
+固件相关文档
+------------
+
+下列文档描述了内核需要的平台固件相关信息。
+
+TODOList:
+
+* firmware-guide/index
+* devicetree/index
+
+应用程序开发人员文档
+--------------------
+
+用户空间API手册涵盖了描述应用程序开发人员可见内核接口方面的文档。
+
+TODOlist:
+
+* userspace-api/index
+
+内核开发简介
+------------
+
+这些手册包含有关如何开发内核的整体信息。内核社区非常庞大,一年下来有数千名
+开发人员做出贡献。与任何大型社区一样,知道如何完成任务将使得更改合并的过程
+变得更加容易。
+
+.. toctree::
+   :maxdepth: 2
+
    process/index
    dev-tools/index
    doc-guide/index
    kernel-hacking/index
-   filesystems/index
-   arm64/index
-   sound/index
+
+TODOList:
+
+* trace/index
+* maintainer/index
+* fault-injection/index
+* livepatch/index
+* rust/index
+
+内核API文档
+-----------
+
+以下手册从内核开发人员的角度详细介绍了特定的内核子系统是如何工作的。这里的
+大部分信息都是直接从内核源代码获取的,并根据需要添加补充材料(或者至少是在
+我们设法添加的时候——可能不是所有的都是有需要的)。
+
+.. toctree::
+   :maxdepth: 2
+
+   core-api/index
    cpu-freq/index
-   mips/index
    iio/index
+   sound/index
+   filesystems/index
+
+TODOList:
+
+* driver-api/index
+* locking/index
+* accounting/index
+* block/index
+* cdrom/index
+* ide/index
+* fb/index
+* fpga/index
+* hid/index
+* i2c/index
+* isdn/index
+* infiniband/index
+* leds/index
+* netlabel/index
+* networking/index
+* pcmcia/index
+* power/index
+* target/index
+* timers/index
+* spi/index
+* w1/index
+* watchdog/index
+* virt/index
+* input/index
+* hwmon/index
+* gpu/index
+* security/index
+* crypto/index
+* vm/index
+* bpf/index
+* usb/index
+* PCI/index
+* scsi/index
+* misc-devices/index
+* scheduler/index
+* mhi/index
+
+体系结构无关文档
+----------------
+
+TODOList:
+
+* asm-annotations
+
+特定体系结构文档
+----------------
+
+.. toctree::
+   :maxdepth: 2
+
+   mips/index
+   arm64/index
    riscv/index
-   core-api/index
    openrisc/index
 
+TODOList:
+
+* arm/index
+* ia64/index
+* m68k/index
+* nios2/index
+* parisc/index
+* powerpc/index
+* s390/index
+* sh/index
+* sparc/index
+* x86/index
+* xtensa/index
+
+其他文档
+--------
+
+有几份未排序的文档似乎不适合放在文档的其他部分,或者可能需要进行一些调整和/或
+转换为reStructureText格式,也有可能太旧。
+
+TODOList:
+
+* staging/index
+* watch_queue
+
 目录和表格
 ----------
 
index 4485641..b792bbd 100644 (file)
@@ -6,9 +6,9 @@
 
 Overview
 ========
-Original x86-64 was limited by 4-level paing to 256 TiB of virtual address
+Original x86-64 was limited by 4-level paging to 256 TiB of virtual address
 space and 64 TiB of physical address space. We are already bumping into
-this limit: some vendors offers servers with 64 TiB of memory today.
+this limit: some vendors offer servers with 64 TiB of memory today.
 
 To overcome the limitation, upcoming hardware will introduce support for
 5-level paging. It is a straightforward extension of the current page
index 65d200e..bd7aff0 100644 (file)
@@ -624,6 +624,7 @@ F:  fs/affs/
 
 AFS FILESYSTEM
 M:     David Howells <dhowells@redhat.com>
+M:     Marc Dionne <marc.dionne@auristor.com>
 L:     linux-afs@lists.infradead.org
 S:     Supported
 W:     https://www.infradead.org/~dhowells/kafs/
@@ -3207,6 +3208,22 @@ F:       Documentation/filesystems/bfs.rst
 F:     fs/bfs/
 F:     include/uapi/linux/bfs_fs.h
 
+BITMAP API
+M:     Yury Norov <yury.norov@gmail.com>
+R:     Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+R:     Rasmus Villemoes <linux@rasmusvillemoes.dk>
+S:     Maintained
+F:     include/asm-generic/bitops/find.h
+F:     include/linux/bitmap.h
+F:     lib/bitmap.c
+F:     lib/find_bit.c
+F:     lib/find_bit_benchmark.c
+F:     lib/test_bitmap.c
+F:     tools/include/asm-generic/bitops/find.h
+F:     tools/include/linux/bitmap.h
+F:     tools/lib/bitmap.c
+F:     tools/lib/find_bit.c
+
 BLINKM RGB LED DRIVER
 M:     Jan-Simon Moeller <jansimon.moeller@gmx.de>
 S:     Maintained
@@ -8388,6 +8405,13 @@ S:       Maintained
 F:     mm/hwpoison-inject.c
 F:     mm/memory-failure.c
 
+HYCON HY46XX TOUCHSCREEN SUPPORT
+M:     Giulio Benetti <giulio.benetti@benettiengineering.com>
+L:     linux-input@vger.kernel.org
+S:     Maintained
+F:     Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml
+F:     drivers/input/touchscreen/hycon-hy46xx.c
+
 HYGON PROCESSOR SUPPORT
 M:     Pu Wen <puwen@hygon.cn>
 L:     linux-kernel@vger.kernel.org
@@ -9529,6 +9553,7 @@ F:        fs/io-wq.h
 F:     fs/io_uring.c
 F:     include/linux/io_uring.h
 F:     include/uapi/linux/io_uring.h
+F:     tools/io_uring/
 
 IPMI SUBSYSTEM
 M:     Corey Minyard <minyard@acm.org>
@@ -14075,13 +14100,6 @@ F:     Documentation/devicetree/bindings/pci/ti-pci.txt
 F:     drivers/pci/controller/cadence/pci-j721e.c
 F:     drivers/pci/controller/dwc/pci-dra7xx.c
 
-PCI DRIVER FOR TI KEYSTONE
-M:     Murali Karicheri <m-karicheri2@ti.com>
-L:     linux-pci@vger.kernel.org
-L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-S:     Maintained
-F:     drivers/pci/controller/dwc/pci-keystone.c
-
 PCI DRIVER FOR V3 SEMICONDUCTOR V360EPC
 M:     Linus Walleij <linus.walleij@linaro.org>
 L:     linux-pci@vger.kernel.org
@@ -15867,6 +15885,7 @@ F:      drivers/infiniband/ulp/rtrs/
 
 RXRPC SOCKETS (AF_RXRPC)
 M:     David Howells <dhowells@redhat.com>
+M:     Marc Dionne <marc.dionne@auristor.com>
 L:     linux-afs@lists.infradead.org
 S:     Supported
 W:     https://www.infradead.org/~dhowells/kafs/
@@ -18283,13 +18302,6 @@ S:     Maintained
 F:     sound/soc/codecs/isabelle*
 F:     sound/soc/codecs/lm49453*
 
-TI NETCP ETHERNET DRIVER
-M:     Wingman Kwok <w-kwok2@ti.com>
-M:     Murali Karicheri <m-karicheri2@ti.com>
-L:     netdev@vger.kernel.org
-S:     Maintained
-F:     drivers/net/ethernet/ti/netcp*
-
 TI PCM3060 ASoC CODEC DRIVER
 M:     Kirill Marinushkin <kmarinushkin@birdec.com>
 L:     alsa-devel@alsa-project.org (moderated for non-subscribers)
index 72af8e4..15b6476 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -399,11 +399,6 @@ ifeq ($(ARCH),sparc64)
        SRCARCH := sparc
 endif
 
-# Additional ARCH settings for sh
-ifeq ($(ARCH),sh64)
-       SRCARCH := sh
-endif
-
 export cross_compiling :=
 ifneq ($(SRCARCH),$(SUBARCH))
 cross_compiling := 1
@@ -792,16 +787,16 @@ KBUILD_CFLAGS += -Wno-gnu
 KBUILD_CFLAGS += -mno-global-merge
 else
 
-# These warnings generated too much noise in a regular build.
-# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
-KBUILD_CFLAGS += -Wno-unused-but-set-variable
-
 # Warn about unmarked fall-throughs in switch statement.
 # Disabled for clang while comment to attribute conversion happens and
 # https://github.com/ClangBuiltLinux/linux/issues/636 is discussed.
 KBUILD_CFLAGS += $(call cc-option,-Wimplicit-fallthrough,)
 endif
 
+# These warnings generated too much noise in a regular build.
+# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
+
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
 ifdef CONFIG_FRAME_POINTER
 KBUILD_CFLAGS  += -fno-omit-frame-pointer -fno-optimize-sibling-calls
@@ -1225,7 +1220,7 @@ PHONY += prepare archprepare
 
 archprepare: outputmakefile archheaders archscripts scripts include/config/kernel.release \
        asm-generic $(version_h) $(autoksyms_h) include/generated/utsrelease.h \
-       include/generated/autoconf.h
+       include/generated/autoconf.h remove-stale-files
 
 prepare0: archprepare
        $(Q)$(MAKE) $(build)=scripts/mod
@@ -1234,6 +1229,10 @@ prepare0: archprepare
 # All the preparing..
 prepare: prepare0 prepare-objtool prepare-resolve_btfids
 
+PHONY += remove-stale-files
+remove-stale-files:
+       $(Q)$(srctree)/scripts/remove-stale-files
+
 # Support for using generic headers in asm-generic
 asm-generic := -f $(srctree)/scripts/Makefile.asm-generic obj
 
@@ -1512,9 +1511,6 @@ MRPROPER_FILES += include/config include/generated          \
                  vmlinux-gdb.py \
                  *.spec
 
-# Directories & files removed with 'make distclean'
-DISTCLEAN_FILES += tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS
-
 # clean - Delete most, but leave enough to build external modules
 #
 clean: rm-files := $(CLEAN_FILES)
@@ -1541,16 +1537,14 @@ mrproper: clean $(mrproper-dirs)
 
 # distclean
 #
-distclean: rm-files := $(wildcard $(DISTCLEAN_FILES))
-
 PHONY += distclean
 
 distclean: mrproper
-       $(call cmd,rmfiles)
-       @find $(srctree) $(RCS_FIND_IGNORE) \
+       @find . $(RCS_FIND_IGNORE) \
                \( -name '*.orig' -o -name '*.rej' -o -name '*~' \
                -o -name '*.bak' -o -name '#*#' -o -name '*%' \
-               -o -name 'core' \) \
+               -o -name 'core' -o -name tags -o -name TAGS -o -name 'cscope*' \
+               -o -name GPATH -o -name GRTAGS -o -name GSYMS -o -name GTAGS \) \
                -type f -print | xargs rm -f
 
 
@@ -1717,17 +1711,7 @@ else # KBUILD_EXTMOD
 # When building external modules the kernel used as basis is considered
 # read-only, and no consistency checks are made and the make
 # system is not used on the basis kernel. If updates are required
-# in the basis kernel ordinary make commands (without M=...) must
-# be used.
-#
-# The following are the only valid targets when building external
-# modules.
-# make M=dir clean     Delete all automatically generated files
-# make M=dir modules   Make all modules in specified dir
-# make M=dir          Same as 'make M=dir modules'
-# make M=dir modules_install
-#                      Install the modules built in the module directory
-#                      Assumes install directory is already created
+# in the basis kernel ordinary make commands (without M=...) must be used.
 
 # We are always building only modules.
 KBUILD_BUILTIN :=
index 4191da4..756c19c 100644 (file)
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-i386
-x86_64
+/i386/
+/x86_64/
index 1f6a909..0fab5ac 100644 (file)
@@ -602,11 +602,6 @@ extern void outsl (unsigned long port, const void *src, unsigned long count);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #endif /* __KERNEL__ */
 
 #endif /* __ALPHA_IO_H */
index 63aee5d..82b19c9 100644 (file)
@@ -13,12 +13,12 @@ static char *pc873xx_names[] = {
 static unsigned int base, model;
 
 
-unsigned int __init pc873xx_get_base()
+unsigned int __init pc873xx_get_base(void)
 {
        return base;
 }
 
-char *__init pc873xx_get_model()
+char *__init pc873xx_get_model(void)
 {
        return pc873xx_names[model];
 }
index dc68efb..1931a04 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <net/checksum.h>
 
 
 #define ldq_u(x,y) \
index bc8d6ae..2d98501 100644 (file)
@@ -6,6 +6,7 @@
 config ARC
        def_bool y
        select ARC_TIMERS
+       select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DMA_PREP_COHERENT
        select ARCH_HAS_PTE_SPECIAL
@@ -28,6 +29,7 @@ config ARC
        select GENERIC_SMP_IDLE_THREAD
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
+       select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4
        select HAVE_DEBUG_STACKOVERFLOW
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_FUTEX_CMPXCHG if FUTEX
@@ -48,9 +50,6 @@ config ARC
        select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
        select SET_FS
 
-config ARCH_HAS_CACHE_LINE_SIZE
-       def_bool y
-
 config TRACE_IRQFLAGS_SUPPORT
        def_bool y
 
@@ -86,10 +85,6 @@ config STACKTRACE_SUPPORT
        def_bool y
        select STACKTRACE
 
-config HAVE_ARCH_TRANSPARENT_HUGEPAGE
-       def_bool y
-       depends on ARC_MMU_V4
-
 menu "ARC Architecture Configuration"
 
 menu "ARC Platform/SoC/Board"
index 085c830..24804f1 100644 (file)
@@ -31,6 +31,7 @@ config ARM
        select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
        select ARCH_SUPPORTS_ATOMIC_RMW
+       select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
        select ARCH_USE_BUILTIN_BSWAP
        select ARCH_USE_CMPXCHG_LOCKREF
        select ARCH_USE_MEMTEST
@@ -77,6 +78,7 @@ config ARM
        select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT
        select HAVE_ARCH_THREAD_STRUCT_WHITELIST
        select HAVE_ARCH_TRACEHOOK
+       select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE
        select HAVE_ARM_SMCCC if CPU_V7
        select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
        select HAVE_CONTEXT_TRACKING
@@ -1511,14 +1513,6 @@ config HW_PERF_EVENTS
        def_bool y
        depends on ARM_PMU
 
-config SYS_SUPPORTS_HUGETLBFS
-       def_bool y
-       depends on ARM_LPAE
-
-config HAVE_ARCH_TRANSPARENT_HUGEPAGE
-       def_bool y
-       depends on ARM_LPAE
-
 config ARCH_WANT_GENERAL_HUGETLB
        def_bool y
 
index fd94e27..8eb70c1 100644 (file)
@@ -96,13 +96,6 @@ endif
 $(foreach o, $(libfdt_objs) atags_to_fdt.o fdt_check_mem_start.o, \
        $(eval CFLAGS_$(o) := -I $(srctree)/scripts/dtc/libfdt -fno-stack-protector))
 
-# These were previously generated C files. When you are building the kernel
-# with O=, make sure to remove the stale files in the output tree. Otherwise,
-# the build system wrongly compiles the stale ones.
-ifdef building_out_of_srctree
-$(shell rm -f $(addprefix $(obj)/, fdt_rw.c fdt_ro.c fdt_wip.c fdt.c))
-endif
-
 targets       := vmlinux vmlinux.lds piggy_data piggy.o \
                 lib1funcs.o ashldi3.o bswapsdi2.o \
                 head.o $(OBJS)
@@ -118,8 +111,8 @@ asflags-y := -DZIMAGE
 
 # Supply kernel BSS size to the decompressor via a linker symbol.
 KBSS_SZ = $(shell echo $$(($$($(NM) $(obj)/../../../../vmlinux | \
-               sed -n -e 's/^\([^ ]*\) [AB] __bss_start$$/-0x\1/p' \
-                      -e 's/^\([^ ]*\) [AB] __bss_stop$$/+0x\1/p') )) )
+               sed -n -e 's/^\([^ ]*\) [ABD] __bss_start$$/-0x\1/p' \
+                      -e 's/^\([^ ]*\) [ABD] __bss_stop$$/+0x\1/p') )) )
 LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ)
 # Supply ZRELADDR to the decompressor via a linker symbol.
 ifneq ($(CONFIG_AUTO_ZRELADDR),y)
index e70c997..b935162 100644 (file)
@@ -63,7 +63,6 @@ CONFIG_INPUT_EVDEV=y
 # CONFIG_MOUSE_PS2 is not set
 # CONFIG_SERIO is not set
 CONFIG_LEGACY_PTY_COUNT=16
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_RUNTIME_UARTS=2
index 3a7938f..2aa3ebe 100644 (file)
@@ -7,7 +7,6 @@ CONFIG_EXPERT=y
 CONFIG_MODULES=y
 CONFIG_ARCH_FOOTBRIDGE=y
 CONFIG_ARCH_CATS=y
-CONFIG_ARCH_PERSONAL_SERVER=y
 CONFIG_ARCH_EBSA285_HOST=y
 CONFIG_ARCH_NETWINDER=y
 CONFIG_LEDS=y
index b4670d4..abde1fb 100644 (file)
@@ -72,7 +72,6 @@ CONFIG_INPUT_TOUCHSCREEN=y
 CONFIG_INPUT_MISC=y
 CONFIG_INPUT_UINPUT=m
 # CONFIG_SERIO is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_PXA=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
index 6834e97..eacc089 100644 (file)
@@ -79,7 +79,6 @@ CONFIG_INPUT_EVBUG=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_NR_UARTS=1
index 1d923db..89f4a6f 100644 (file)
@@ -69,7 +69,6 @@ CONFIG_SMSC911X=y
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_NONSTANDARD=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_MPS2_UART_CONSOLE=y
 CONFIG_SERIAL_MPS2_UART=y
 # CONFIG_HW_RANDOM is not set
index 4f16716..d57ff30 100644 (file)
@@ -100,7 +100,6 @@ CONFIG_INPUT_EVDEV=y
 CONFIG_KEYBOARD_GPIO=y
 # CONFIG_INPUT_MOUSE is not set
 CONFIG_LEGACY_PTY_COUNT=16
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_RUNTIME_UARTS=2
index f1fbdfc..4d8e7f2 100644 (file)
@@ -53,7 +53,6 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_PXA=y
 CONFIG_SERIAL_PXA_CONSOLE=y
 # CONFIG_LEGACY_PTYS is not set
index 673c7dd..ba8d9d7 100644 (file)
@@ -88,5 +88,6 @@ extern asmlinkage void c_backtrace(unsigned long fp, int pmode,
 struct mm_struct;
 void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr);
 extern void __show_regs(struct pt_regs *);
+extern void __show_regs_alloc_free(struct pt_regs *regs);
 
 #endif
index fc74812..f74944c 100644 (file)
@@ -430,11 +430,6 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #include <asm-generic/io.h>
 
 #ifdef CONFIG_MMU
index 22751b5..e62832d 100644 (file)
@@ -56,9 +56,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
        }
 }
 
-/* Function pointer to optional machine-specific reinitialization */
-extern void (*kexec_reinit)(void);
-
 static inline unsigned long phys_to_boot_phys(phys_addr_t phys)
 {
        return phys_to_idmap(phys);
index 2f841cb..a711322 100644 (file)
@@ -150,21 +150,6 @@ extern unsigned long vectors_base;
  */
 #define PLAT_PHYS_OFFSET       UL(CONFIG_PHYS_OFFSET)
 
-#ifdef CONFIG_XIP_KERNEL
-/*
- * When referencing data in RAM from the XIP region in a relative manner
- * with the MMU off, we need the relative offset between the two physical
- * addresses.  The macro below achieves this, which is:
- *    __pa(v_data) - __xip_pa(v_text)
- */
-#define PHYS_RELATIVE(v_data, v_text) \
-       (((v_data) - PAGE_OFFSET + PLAT_PHYS_OFFSET) - \
-        ((v_text) - XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR) + \
-          CONFIG_XIP_PHYS_ADDR))
-#else
-#define PHYS_RELATIVE(v_data, v_text) ((v_data) - (v_text))
-#endif
-
 #ifndef __ASSEMBLY__
 
 /*
index a1ceff4..ec17fc0 100644 (file)
@@ -18,12 +18,4 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
 static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
 #endif
 
-#ifdef CONFIG_STRICT_KERNEL_RWX
-void set_kernel_text_rw(void);
-void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
-
 #endif
index ce85731..63748af 100644 (file)
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
-generated-y += unistd-common.h
 generated-y += unistd-oabi.h
 generated-y += unistd-eabi.h
 generic-y += kvm_para.h
index 93ecf8a..ae7749e 100644 (file)
@@ -24,7 +24,6 @@
 #include <asm/unistd-oabi.h>
 #endif
 
-#include <asm/unistd-common.h>
 #define __NR_sync_file_range2          __NR_arm_sync_file_range
 
 /*
index be8050b..70993af 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/vdso_datapage.h>
 #include <asm/hardware/cache-l2x0.h>
 #include <linux/kbuild.h>
+#include <linux/arm-smccc.h>
 #include "signal.h"
 
 /*
@@ -148,6 +149,8 @@ int main(void)
   DEFINE(SLEEP_SAVE_SP_PHYS,   offsetof(struct sleep_save_sp, save_ptr_stash_phys));
   DEFINE(SLEEP_SAVE_SP_VIRT,   offsetof(struct sleep_save_sp, save_ptr_stash));
 #endif
+  DEFINE(ARM_SMCCC_QUIRK_ID_OFFS,      offsetof(struct arm_smccc_quirk, id));
+  DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS,   offsetof(struct arm_smccc_quirk, state));
   BLANK();
   DEFINE(DMA_BIDIRECTIONAL,    DMA_BIDIRECTIONAL);
   DEFINE(DMA_TO_DEVICE,                DMA_TO_DEVICE);
index e0d7833..7f0b7ab 100644 (file)
@@ -344,20 +344,19 @@ ENTRY(\sym)
        .size   \sym, . - \sym
        .endm
 
-#define NATIVE(nr, func) syscall nr, func
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, native)
+#define __SYSCALL(nr, func) syscall nr, func
 
 /*
  * This is the syscall table declaration for native ABI syscalls.
  * With EABI a couple syscalls are obsolete and defined as sys_ni_syscall.
  */
        syscall_table_start sys_call_table
-#define COMPAT(nr, native, compat) syscall nr, native
 #ifdef CONFIG_AEABI
 #include <calls-eabi.S>
 #else
 #include <calls-oabi.S>
 #endif
-#undef COMPAT
        syscall_table_end sys_call_table
 
 /*============================================================================
@@ -455,7 +454,8 @@ ENDPROC(sys_oabi_readahead)
  * using the compatibility syscall entries.
  */
        syscall_table_start sys_oabi_call_table
-#define COMPAT(nr, native, compat) syscall nr, compat
+#undef __SYSCALL_WITH_COMPAT
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, compat)
 #include <calls-oabi.S>
        syscall_table_end sys_oabi_call_table
 
index 08660ae..b1423fb 100644 (file)
@@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
                        info->trigger = addr;
                        pr_debug("breakpoint fired: address = 0x%x\n", addr);
                        perf_bp_event(bp, regs);
-                       if (!bp->overflow_handler)
+                       if (is_default_overflow_handler(bp))
                                enable_single_step(bp, addr);
                        goto unlock;
                }
index 2b09dad..f567032 100644 (file)
@@ -147,11 +147,6 @@ void machine_crash_shutdown(struct pt_regs *regs)
        pr_info("Loading crashdump kernel...\n");
 }
 
-/*
- * Function pointer to optional machine-specific reinitialization
- */
-void (*kexec_reinit)(void);
-
 void machine_kexec(struct kimage *image)
 {
        unsigned long page_list, reboot_entry_phys;
@@ -187,9 +182,6 @@ void machine_kexec(struct kimage *image)
 
        pr_info("Bye!\n");
 
-       if (kexec_reinit)
-               kexec_reinit();
-
        soft_restart(reboot_entry_phys);
 }
 
index 5199a2b..6324f4d 100644 (file)
@@ -92,6 +92,17 @@ void arch_cpu_idle_exit(void)
        ledtrig_cpu(CPU_LED_IDLE_END);
 }
 
+void __show_regs_alloc_free(struct pt_regs *regs)
+{
+       int i;
+
+       /* check for r0 - r12 only */
+       for (i = 0; i < 13; i++) {
+               pr_alert("Register r%d information:", i);
+               mem_dump_obj((void *)regs->uregs[i]);
+       }
+}
+
 void __show_regs(struct pt_regs *regs)
 {
        unsigned long flags;
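
For reference, mem_dump_obj() prints provenance information (slab cache,
allocation site where available) for a pointer into a slab, vmalloc or
static object; a minimal sketch of a standalone use (hypothetical code,
not part of this patch)::

    #include <linux/mm.h>
    #include <linux/slab.h>

    static void dump_example(void)
    {
            void *p = kmalloc(64, GFP_KERNEL);

            if (!p)
                    return;
            mem_dump_obj(p);        /* reports where the object came from */
            kfree(p);
    }
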
index 00664c7..931df62 100644 (file)
@@ -3,7 +3,9 @@
  * Copyright (c) 2015, Linaro Limited
  */
 #include <linux/linkage.h>
+#include <linux/arm-smccc.h>
 
+#include <asm/asm-offsets.h>
 #include <asm/opcodes-sec.h>
 #include <asm/opcodes-virt.h>
 #include <asm/unwind.h>
@@ -27,7 +29,14 @@ UNWIND(      .fnstart)
 UNWIND(        .save   {r4-r7})
        ldm     r12, {r4-r7}
        \instr
-       pop     {r4-r7}
+       ldr     r4, [sp, #36]
+       cmp     r4, #0
+       beq     1f                      // No quirk structure
+       ldr     r5, [r4, #ARM_SMCCC_QUIRK_ID_OFFS]
+       cmp     r5, #ARM_SMCCC_QUIRK_QCOM_A6
+       bne     1f                      // No quirk present
+       str     r6, [r4, #ARM_SMCCC_QUIRK_STATE_OFFS]
+1:     pop     {r4-r7}
        ldr     r12, [sp, #(4 * 4)]
        stm     r12, {r0-r3}
        bx      lr
index 24bd205..43f0a3e 100644 (file)
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/mm_types.h>
@@ -26,12 +27,22 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
                return -EINVAL;
 
        /*
+        * Function graph tracer state gets inconsistent when the kernel
+        * calls functions that never return (aka suspend finishers), hence
+        * disable graph tracing during their execution.
+        */
+       pause_graph_tracing();
+
+       /*
         * Provide a temporary page table with an identity mapping for
         * the MMU-enable code, required for resuming.  On successful
         * resume (indicated by a zero return code), we need to switch
         * back to the correct page tables.
         */
        ret = __cpu_suspend(arg, fn, __mpidr);
+
+       unpause_graph_tracing();
+
        if (ret == 0) {
                cpu_switch_mm(mm->pgd, mm);
                local_flush_bp_all();
@@ -45,7 +56,13 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
        u32 __mpidr = cpu_logical_map(smp_processor_id());
-       return __cpu_suspend(arg, fn, __mpidr);
+       int ret;
+
+       pause_graph_tracing();
+       ret = __cpu_suspend(arg, fn, __mpidr);
+       unpause_graph_tracing();
+
+       return ret;
 }
 #define        idmap_pgd       NULL
 #endif
index 17d5a78..64308e3 100644 (file)
@@ -287,6 +287,7 @@ static int __die(const char *str, int err, struct pt_regs *regs)
 
        print_modules();
        __show_regs(regs);
+       __show_regs_alloc_free(regs);
        pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n",
                 TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), end_of_stack(tsk));
 
index 844aa58..728aff9 100644 (file)
@@ -16,27 +16,6 @@ config ARCH_CATS
 
          Saying N will reduce the size of the Footbridge kernel.
 
-config ARCH_PERSONAL_SERVER
-       bool "Compaq Personal Server"
-       select FOOTBRIDGE_HOST
-       select ISA
-       select ISA_DMA
-       select FORCE_PCI
-       help
-         Say Y here if you intend to run this kernel on the Compaq
-         Personal Server.
-
-         Saying N will reduce the size of the Footbridge kernel.
-
-         The Compaq Personal Server is not available for purchase.
-         There are no product plans beyond the current research
-         prototypes at this time.  Information is available at:
-
-         <http://www.crl.hpl.hp.com/projects/personalserver/>
-
-         If you have any questions or comments about the  Compaq Personal
-         Server, send e-mail to <skiff@crl.dec.com>.
-
 config ARCH_EBSA285_ADDIN
        bool "EBSA285 (addin mode)"
        select ARCH_EBSA285
index a09f104..6262993 100644 (file)
@@ -11,12 +11,10 @@ pci-y                       += dc21285.o
 pci-$(CONFIG_ARCH_CATS) += cats-pci.o
 pci-$(CONFIG_ARCH_EBSA285_HOST) += ebsa285-pci.o
 pci-$(CONFIG_ARCH_NETWINDER) += netwinder-pci.o
-pci-$(CONFIG_ARCH_PERSONAL_SERVER) += personal-pci.o
 
 obj-$(CONFIG_ARCH_CATS) += cats-hw.o isa-timer.o
 obj-$(CONFIG_ARCH_EBSA285) += ebsa285.o dc21285-timer.o
 obj-$(CONFIG_ARCH_NETWINDER) += netwinder-hw.o isa-timer.o
-obj-$(CONFIG_ARCH_PERSONAL_SERVER) += personal.o dc21285-timer.o
 
 obj-$(CONFIG_PCI)      +=$(pci-y)
 
diff --git a/arch/arm/mach-footbridge/personal-pci.c b/arch/arm/mach-footbridge/personal-pci.c
deleted file mode 100644 (file)
index 9d19aa9..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/arch/arm/mach-footbridge/personal-pci.c
- *
- * PCI bios-type initialisation for PCI machines
- *
- * Bits taken from various places.
- */
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-
-#include <asm/irq.h>
-#include <asm/mach/pci.h>
-#include <asm/mach-types.h>
-
-static int irqmap_personal_server[] = {
-       IRQ_IN0, IRQ_IN1, IRQ_IN2, IRQ_IN3, 0, 0, 0,
-       IRQ_DOORBELLHOST, IRQ_DMA1, IRQ_DMA2, IRQ_PCI
-};
-
-static int personal_server_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
-{
-       unsigned char line;
-
-       pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
-
-       if (line > 0x40 && line <= 0x5f) {
-               /* line corresponds to the bit controlling this interrupt
-                * in the footbridge.  Ignore the first 8 interrupt bits,
-                * look up the rest in the map.  IN0 is bit number 8
-                */
-               return irqmap_personal_server[(line & 0x1f) - 8];
-       } else if (line == 0) {
-               /* no interrupt */
-               return 0;
-       } else
-               return irqmap_personal_server[(line - 1) & 3];
-}
-
-static struct hw_pci personal_server_pci __initdata = {
-       .map_irq                = personal_server_map_irq,
-       .nr_controllers         = 1,
-       .ops                    = &dc21285_ops,
-       .setup                  = dc21285_setup,
-       .preinit                = dc21285_preinit,
-       .postinit               = dc21285_postinit,
-};
-
-static int __init personal_pci_init(void)
-{
-       if (machine_is_personal_server())
-               pci_common_init(&personal_server_pci);
-       return 0;
-}
-
-subsys_initcall(personal_pci_init);
diff --git a/arch/arm/mach-footbridge/personal.c b/arch/arm/mach-footbridge/personal.c
deleted file mode 100644 (file)
index ca71575..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/arch/arm/mach-footbridge/personal.c
- *
- * Personal server (Skiff) machine fixup
- */
-#include <linux/init.h>
-#include <linux/spinlock.h>
-
-#include <asm/hardware/dec21285.h>
-#include <asm/mach-types.h>
-
-#include <asm/mach/arch.h>
-
-#include "common.h"
-
-MACHINE_START(PERSONAL_SERVER, "Compaq-PersonalServer")
-       /* Maintainer: Jamey Hicks / George France */
-       .atag_offset    = 0x100,
-       .map_io         = footbridge_map_io,
-       .init_irq       = footbridge_init_irq,
-       .init_time      = footbridge_timer_init,
-       .restart        = footbridge_restart,
-MACHINE_END
-
index dc8f152..830bbfb 100644 (file)
@@ -33,41 +33,41 @@ icache_size:
  * processor.  We fix this by performing an invalidate, rather than a
  * clean + invalidate, before jumping into the kernel.
  *
- * This function is cloned from arch/arm/mach-tegra/headsmp.S, and needs
- * to be called for both secondary cores startup and primary core resume
- * procedures.
+ * This function needs to be called for both secondary cores startup and
+ * primary core resume procedures.
  */
 ENTRY(v7_invalidate_l1)
-       mov     r0, #0
-       mcr     p15, 2, r0, c0, c0, 0
-       mrc     p15, 1, r0, c0, c0, 0
-
-       movw    r1, #0x7fff
-       and     r2, r1, r0, lsr #13
+       mov     r0, #0
+       mcr     p15, 2, r0, c0, c0, 0   @ select L1 data cache in CSSELR
+       isb
+       mrc     p15, 1, r0, c0, c0, 0   @ read cache geometry from CCSIDR
 
-       movw    r1, #0x3ff
+       movw    r3, #0x3ff
+       and     r3, r3, r0, lsr #3      @ 'Associativity' in CCSIDR[12:3]
+       clz     r1, r3                  @ WayShift
+       mov     r2, #1
+       mov     r3, r3, lsl r1          @ NumWays-1 shifted into bits [31:...]
+       movs    r1, r2, lsl r1          @ #1 shifted left by same amount
+       moveq   r1, #1                  @ r1 needs value > 0 even if only 1 way
 
-       and     r3, r1, r0, lsr #3      @ NumWays - 1
-       add     r2, r2, #1              @ NumSets
+       and     r2, r0, #0x7
+       add     r2, r2, #4              @ SetShift
 
-       and     r0, r0, #0x7
-       add     r0, r0, #4      @ SetShift
+1:     movw    ip, #0x7fff
+       and     r0, ip, r0, lsr #13     @ 'NumSets' in CCSIDR[27:13]
 
-       clz     r1, r3          @ WayShift
-       add     r4, r3, #1      @ NumWays
-1:     sub     r2, r2, #1      @ NumSets--
-       mov     r3, r4          @ Temp = NumWays
-2:     subs    r3, r3, #1      @ Temp--
-       mov     r5, r3, lsl r1
-       mov     r6, r2, lsl r0
-       orr     r5, r5, r6      @ Reg = (Temp<<WayShift)|(NumSets<<SetShift)
-       mcr     p15, 0, r5, c7, c6, 2
-       bgt     2b
-       cmp     r2, #0
-       bgt     1b
-       dsb     st
-       isb
-       ret     lr
+2:     mov     ip, r0, lsl r2          @ NumSet << SetShift
+       orr     ip, ip, r3              @ Reg = (Temp<<WayShift)|(NumSets<<SetShift)
+       mcr     p15, 0, ip, c7, c6, 2
+       subs    r0, r0, #1              @ Set--
+       bpl     2b
+       subs    r3, r3, r1              @ Way--
+       bcc     3f
+       mrc     p15, 1, r0, c0, c0, 0   @ re-read cache geometry from CCSIDR
+       b       1b
+3:     dsb     st
+       isb
+       ret     lr
 ENDPROC(v7_invalidate_l1)
 
 /*
index 93ff009..fb68800 100644 (file)
@@ -420,7 +420,7 @@ void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info)
        note_page(&st, 0, 0, 0, NULL);
 }
 
-static void ptdump_initialize(void)
+static void __init ptdump_initialize(void)
 {
        unsigned i, j;
 
@@ -466,7 +466,7 @@ void ptdump_check_wx(void)
                pr_info("Checked W+X mappings: passed, no W+X pages found\n");
 }
 
-static int ptdump_init(void)
+static int __init ptdump_init(void)
 {
        ptdump_initialize();
        ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables");
index 1ba9f9f..9d4744a 100644 (file)
@@ -489,33 +489,12 @@ static int __mark_rodata_ro(void *unused)
        return 0;
 }
 
-static int kernel_set_to_readonly __read_mostly;
-
 void mark_rodata_ro(void)
 {
-       kernel_set_to_readonly = 1;
        stop_machine(__mark_rodata_ro, NULL, NULL);
        debug_checkwx();
 }
 
-void set_kernel_text_rw(void)
-{
-       if (!kernel_set_to_readonly)
-               return;
-
-       set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), false,
-                               current->active_mm);
-}
-
-void set_kernel_text_ro(void)
-{
-       if (!kernel_set_to_readonly)
-               return;
-
-       set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), true,
-                               current->active_mm);
-}
-
 #else
 static inline void fix_kernmem_perms(void) { }
 #endif /* CONFIG_STRICT_KERNEL_RWX */
index 28c9d32..26d726a 100644 (file)
@@ -256,6 +256,20 @@ ENDPROC(cpu_pj4b_do_resume)
 
 #endif
 
+       @
+       @ Invoke the v7_invalidate_l1() function, which adheres to the AAPCS
+       @ rules, and so it may corrupt registers that we need to preserve.
+       @
+       .macro  do_invalidate_l1
+       mov     r6, r1
+       mov     r7, r2
+       mov     r10, lr
+       bl      v7_invalidate_l1                @ corrupts {r0-r3, ip, lr}
+       mov     r1, r6
+       mov     r2, r7
+       mov     lr, r10
+       .endm
+
 /*
  *     __v7_setup
  *
@@ -277,6 +291,7 @@ __v7_ca5mp_setup:
 __v7_ca9mp_setup:
 __v7_cr7mp_setup:
 __v7_cr8mp_setup:
+       do_invalidate_l1
        mov     r10, #(1 << 0)                  @ Cache/TLB ops broadcasting
        b       1f
 __v7_ca7mp_setup:
@@ -284,13 +299,9 @@ __v7_ca12mp_setup:
 __v7_ca15mp_setup:
 __v7_b15mp_setup:
 __v7_ca17mp_setup:
+       do_invalidate_l1
        mov     r10, #0
-1:     adr     r0, __v7_setup_stack_ptr
-       ldr     r12, [r0]
-       add     r12, r12, r0                    @ the local stack
-       stmia   r12, {r1-r6, lr}                @ v7_invalidate_l1 touches r0-r6
-       bl      v7_invalidate_l1
-       ldmia   r12, {r1-r6, lr}
+1:
 #ifdef CONFIG_SMP
        orr     r10, r10, #(1 << 6)             @ Enable SMP/nAMP mode
        ALT_SMP(mrc     p15, 0, r0, c1, c0, 1)
@@ -471,12 +482,7 @@ __v7_pj4b_setup:
 #endif /* CONFIG_CPU_PJ4B */
 
 __v7_setup:
-       adr     r0, __v7_setup_stack_ptr
-       ldr     r12, [r0]
-       add     r12, r12, r0                    @ the local stack
-       stmia   r12, {r1-r6, lr}                @ v7_invalidate_l1 touches r0-r6
-       bl      v7_invalidate_l1
-       ldmia   r12, {r1-r6, lr}
+       do_invalidate_l1
 
 __v7_setup_cont:
        and     r0, r9, #0xff000000             @ ARM?
@@ -548,17 +554,8 @@ __errata_finish:
        orr     r0, r0, r6                      @ set them
  THUMB(        orr     r0, r0, #1 << 30        )       @ Thumb exceptions
        ret     lr                              @ return to head.S:__ret
-
-       .align  2
-__v7_setup_stack_ptr:
-       .word   PHYS_RELATIVE(__v7_setup_stack, .)
 ENDPROC(__v7_setup)
 
-       .bss
-       .align  2
-__v7_setup_stack:
-       .space  4 * 7                           @ 7 registers
-
        __INITDATA
 
        .weak cpu_v7_bugs_init
index 598b636..318de96 100644 (file)
@@ -11,20 +11,9 @@ static int ptdump_show(struct seq_file *m, void *v)
        ptdump_walk_pgd(m, info);
        return 0;
 }
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, ptdump_show, inode->i_private);
-}
-
-static const struct file_operations ptdump_fops = {
-       .open           = ptdump_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-void ptdump_debugfs_register(struct ptdump_info *info, const char *name)
+void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name)
 {
        debugfs_create_file(name, 0400, NULL, info, &ptdump_fops);
 }
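
DEFINE_SHOW_ATTRIBUTE(name) expects a name##_show(struct seq_file *, void *)
function and generates the single_open() boilerplate plus a name##_fops; a
minimal sketch with a hypothetical attribute name::

    #include <linux/debugfs.h>
    #include <linux/seq_file.h>

    static int foo_show(struct seq_file *m, void *v)
    {
            seq_puts(m, "example\n");
            return 0;
    }
    DEFINE_SHOW_ATTRIBUTE(foo);     /* defines foo_open() and foo_fops */

    /* later: debugfs_create_file("foo", 0400, NULL, NULL, &foo_fops); */
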
index 977369f..a0dae35 100644 (file)
@@ -55,25 +55,25 @@ void kprobe_arm_test_cases(void)
        TEST_GROUP("Data-processing (register), (register-shifted register), (immediate)")
 
 #define _DATA_PROCESSING_DNM(op,s,val)                                         \
-       TEST_RR(  op "eq" s "   r0,  r",1, VAL1,", r",2, val, "")               \
-       TEST_RR(  op "ne" s "   r1,  r",1, VAL1,", r",2, val, ", lsl #3")       \
-       TEST_RR(  op "cs" s "   r2,  r",3, VAL1,", r",2, val, ", lsr #4")       \
-       TEST_RR(  op "cc" s "   r3,  r",3, VAL1,", r",2, val, ", asr #5")       \
-       TEST_RR(  op "mi" s "   r4,  r",5, VAL1,", r",2, N(val),", asr #6")     \
-       TEST_RR(  op "pl" s "   r5,  r",5, VAL1,", r",2, val, ", ror #7")       \
-       TEST_RR(  op "vs" s "   r6,  r",7, VAL1,", r",2, val, ", rrx")          \
-       TEST_R(   op "vc" s "   r6,  r",7, VAL1,", pc, lsl #3")                 \
-       TEST_R(   op "vc" s "   r6,  r",7, VAL1,", sp, lsr #4")                 \
-       TEST_R(   op "vc" s "   r6,  pc, r",7, VAL1,", asr #5")                 \
-       TEST_R(   op "vc" s "   r6,  sp, r",7, VAL1,", ror #6")                 \
-       TEST_RRR( op "hi" s "   r8,  r",9, VAL1,", r",14,val, ", lsl r",0, 3,"")\
-       TEST_RRR( op "ls" s "   r9,  r",9, VAL1,", r",14,val, ", lsr r",7, 4,"")\
-       TEST_RRR( op "ge" s "   r10, r",11,VAL1,", r",14,val, ", asr r",7, 5,"")\
-       TEST_RRR( op "lt" s "   r11, r",11,VAL1,", r",14,N(val),", asr r",7, 6,"")\
-       TEST_RR(  op "gt" s "   r12, r13"       ", r",14,val, ", ror r",14,7,"")\
-       TEST_RR(  op "le" s "   r14, r",0, val, ", r13"       ", lsl r",14,8,"")\
-       TEST_R(   op "eq" s "   r0,  r",11,VAL1,", #0xf5")                      \
-       TEST_R(   op "ne" s "   r11, r",0, VAL1,", #0xf5000000")                \
+       TEST_RR(  op s "eq      r0,  r",1, VAL1,", r",2, val, "")               \
+       TEST_RR(  op s "ne      r1,  r",1, VAL1,", r",2, val, ", lsl #3")       \
+       TEST_RR(  op s "cs      r2,  r",3, VAL1,", r",2, val, ", lsr #4")       \
+       TEST_RR(  op s "cc      r3,  r",3, VAL1,", r",2, val, ", asr #5")       \
+       TEST_RR(  op s "mi      r4,  r",5, VAL1,", r",2, N(val),", asr #6")     \
+       TEST_RR(  op s "pl      r5,  r",5, VAL1,", r",2, val, ", ror #7")       \
+       TEST_RR(  op s "vs      r6,  r",7, VAL1,", r",2, val, ", rrx")          \
+       TEST_R(   op s "vc      r6,  r",7, VAL1,", pc, lsl #3")                 \
+       TEST_R(   op s "vc      r6,  r",7, VAL1,", sp, lsr #4")                 \
+       TEST_R(   op s "vc      r6,  pc, r",7, VAL1,", asr #5")                 \
+       TEST_R(   op s "vc      r6,  sp, r",7, VAL1,", ror #6")                 \
+       TEST_RRR( op s "hi      r8,  r",9, VAL1,", r",14,val, ", lsl r",0, 3,"")\
+       TEST_RRR( op s "ls      r9,  r",9, VAL1,", r",14,val, ", lsr r",7, 4,"")\
+       TEST_RRR( op s "ge      r10, r",11,VAL1,", r",14,val, ", asr r",7, 5,"")\
+       TEST_RRR( op s "lt      r11, r",11,VAL1,", r",14,N(val),", asr r",7, 6,"")\
+       TEST_RR(  op s "gt      r12, r13"       ", r",14,val, ", ror r",14,7,"")\
+       TEST_RR(  op s "le      r14, r",0, val, ", r13"       ", lsl r",14,8,"")\
+       TEST_R(   op s "eq      r0,  r",11,VAL1,", #0xf5")                      \
+       TEST_R(   op s "ne      r11, r",0, VAL1,", #0xf5000000")                \
        TEST_R(   op s "        r7,  r",8, VAL2,", #0x000af000")                \
        TEST(     op s "        r4,  pc"        ", #0x00005a00")
 
@@ -104,23 +104,23 @@ void kprobe_arm_test_cases(void)
        TEST_R(   op "  r",8, VAL2,", #0x000af000")
 
 #define _DATA_PROCESSING_DM(op,s,val)                                  \
-       TEST_R(   op "eq" s "   r0,  r",1, val, "")                     \
-       TEST_R(   op "ne" s "   r1,  r",1, val, ", lsl #3")             \
-       TEST_R(   op "cs" s "   r2,  r",3, val, ", lsr #4")             \
-       TEST_R(   op "cc" s "   r3,  r",3, val, ", asr #5")             \
-       TEST_R(   op "mi" s "   r4,  r",5, N(val),", asr #6")           \
-       TEST_R(   op "pl" s "   r5,  r",5, val, ", ror #7")             \
-       TEST_R(   op "vs" s "   r6,  r",10,val, ", rrx")                \
-       TEST(     op "vs" s "   r7,  pc, lsl #3")                       \
-       TEST(     op "vs" s "   r7,  sp, lsr #4")                       \
-       TEST_RR(  op "vc" s "   r8,  r",7, val, ", lsl r",0, 3,"")      \
-       TEST_RR(  op "hi" s "   r9,  r",9, val, ", lsr r",7, 4,"")      \
-       TEST_RR(  op "ls" s "   r10, r",9, val, ", asr r",7, 5,"")      \
-       TEST_RR(  op "ge" s "   r11, r",11,N(val),", asr r",7, 6,"")    \
-       TEST_RR(  op "lt" s "   r12, r",11,val, ", ror r",14,7,"")      \
-       TEST_R(   op "gt" s "   r14, r13"       ", lsl r",14,8,"")      \
-       TEST(     op "eq" s "   r0,  #0xf5")                            \
-       TEST(     op "ne" s "   r11, #0xf5000000")                      \
+       TEST_R(   op s "eq      r0,  r",1, val, "")                     \
+       TEST_R(   op s "ne      r1,  r",1, val, ", lsl #3")             \
+       TEST_R(   op s "cs      r2,  r",3, val, ", lsr #4")             \
+       TEST_R(   op s "cc      r3,  r",3, val, ", asr #5")             \
+       TEST_R(   op s "mi      r4,  r",5, N(val),", asr #6")           \
+       TEST_R(   op s "pl      r5,  r",5, val, ", ror #7")             \
+       TEST_R(   op s "vs      r6,  r",10,val, ", rrx")                \
+       TEST(     op s "vs      r7,  pc, lsl #3")                       \
+       TEST(     op s "vs      r7,  sp, lsr #4")                       \
+       TEST_RR(  op s "vc      r8,  r",7, val, ", lsl r",0, 3,"")      \
+       TEST_RR(  op s "hi      r9,  r",9, val, ", lsr r",7, 4,"")      \
+       TEST_RR(  op s "ls      r10, r",9, val, ", asr r",7, 5,"")      \
+       TEST_RR(  op s "ge      r11, r",11,N(val),", asr r",7, 6,"")    \
+       TEST_RR(  op s "lt      r12, r",11,val, ", ror r",14,7,"")      \
+       TEST_R(   op s "gt      r14, r13"       ", lsl r",14,8,"")      \
+       TEST(     op s "eq      r0,  #0xf5")                            \
+       TEST(     op s "ne      r11, #0xf5000000")                      \
        TEST(     op s "        r7,  #0x000af000")                      \
        TEST(     op s "        r4,  #0x00005a00")
 
@@ -166,10 +166,10 @@ void kprobe_arm_test_cases(void)
 
        /* Data-processing with PC as a target and status registers updated */
        TEST_UNSUPPORTED("movs  pc, r1")
-       TEST_UNSUPPORTED("movs  pc, r1, lsl r2")
+       TEST_UNSUPPORTED(__inst_arm(0xe1b0f211) "       @movs   pc, r1, lsl r2")
        TEST_UNSUPPORTED("movs  pc, #0x10000")
        TEST_UNSUPPORTED("adds  pc, lr, r1")
-       TEST_UNSUPPORTED("adds  pc, lr, r1, lsl r2")
+       TEST_UNSUPPORTED(__inst_arm(0xe09ef211) "       @adds   pc, lr, r1, lsl r2")
        TEST_UNSUPPORTED("adds  pc, lr, #4")
 
        /* Data-processing with SP as target */
@@ -352,7 +352,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe000029f) " @ mul r0, pc, r2")
        TEST_UNSUPPORTED(__inst_arm(0xe0000f91) " @ mul r0, r1, pc")
        TEST_RR(    "muls       r0, r",1, VAL1,", r",2, VAL2,"")
-       TEST_RR(    "mullss     r7, r",8, VAL2,", r",9, VAL2,"")
+       TEST_RR(    "mulsls     r7, r",8, VAL2,", r",9, VAL2,"")
        TEST_R(     "muls       lr, r",4, VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe01f0291) " @ muls pc, r1, r2")
 
@@ -361,7 +361,7 @@ void kprobe_arm_test_cases(void)
        TEST_RR(     "mla       lr, r",1, VAL2,", r",2, VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe02f3291) " @ mla pc, r1, r2, r3")
        TEST_RRR(    "mlas      r0, r",1, VAL1,", r",2, VAL2,", r",3,  VAL3,"")
-       TEST_RRR(    "mlahis    r7, r",8, VAL3,", r",9, VAL1,", r",10, VAL2,"")
+       TEST_RRR(    "mlashi    r7, r",8, VAL3,", r",9, VAL1,", r",10, VAL2,"")
        TEST_RR(     "mlas      lr, r",1, VAL2,", r",2, VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe03f3291) " @ mlas pc, r1, r2, r3")
 
@@ -394,7 +394,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe081f392) " @ umull pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe08f1392) " @ umull r1, pc, r2, r3")
        TEST_RR(  "umulls       r0, r1, r",2, VAL1,", r",3, VAL2,"")
-       TEST_RR(  "umulllss     r7, r8, r",9, VAL2,", r",10, VAL1,"")
+       TEST_RR(  "umullsls     r7, r8, r",9, VAL2,", r",10, VAL1,"")
        TEST_R(   "umulls       lr, r12, r",11,VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe091f392) " @ umulls pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe09f1392) " @ umulls r1, pc, r2, r3")
@@ -405,7 +405,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0af1392) " @ umlal pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0a1f392) " @ umlal r1, pc, r2, r3")
        TEST_RRRR(  "umlals     r",0, VAL1,", r",1, VAL2,", r",2, VAL3,", r",3, VAL4)
-       TEST_RRRR(  "umlalles   r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3)
+       TEST_RRRR(  "umlalsle   r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3)
        TEST_RRR(   "umlals     r",14,VAL3,", r",7, VAL4,", r",5, VAL1,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe0bf1392) " @ umlals pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0b1f392) " @ umlals r1, pc, r2, r3")
@@ -416,7 +416,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0c1f392) " @ smull pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0cf1392) " @ smull r1, pc, r2, r3")
        TEST_RR(  "smulls       r0, r1, r",2, VAL1,", r",3, VAL2,"")
-       TEST_RR(  "smulllss     r7, r8, r",9, VAL2,", r",10, VAL1,"")
+       TEST_RR(  "smullsls     r7, r8, r",9, VAL2,", r",10, VAL1,"")
        TEST_R(   "smulls       lr, r12, r",11,VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe0d1f392) " @ smulls pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0df1392) " @ smulls r1, pc, r2, r3")
@@ -427,7 +427,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0ef1392) " @ smlal pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0e1f392) " @ smlal r1, pc, r2, r3")
        TEST_RRRR(  "smlals     r",0, VAL1,", r",1, VAL2,", r",2, VAL3,", r",3, VAL4)
-       TEST_RRRR(  "smlalles   r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3)
+       TEST_RRRR(  "smlalsle   r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3)
        TEST_RRR(   "smlals     r",14,VAL3,", r",7, VAL4,", r",5, VAL1,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe0ff1392) " @ smlals pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0f0f392) " @ smlals r0, pc, r2, r3")
@@ -450,7 +450,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe10f0091) " @ swp r0, r1, [pc]")
 #if __LINUX_ARM_ARCH__ < 6
        TEST_RP("swpb   lr, r",7,VAL2,", [r",8,0,"]")
-       TEST_R( "swpvsb r0, r",1,VAL1,", [sp]")
+       TEST_R( "swpbvs r0, r",1,VAL1,", [sp]")
 #else
        TEST_UNSUPPORTED(__inst_arm(0xe148e097) " @ swpb        lr, r7, [r8]")
        TEST_UNSUPPORTED(__inst_arm(0x614d0091) " @ swpvsb      r0, r1, [sp]")
@@ -477,11 +477,11 @@ void kprobe_arm_test_cases(void)
        TEST_GROUP("Extra load/store instructions")
 
        TEST_RPR(  "strh        r",0, VAL1,", [r",1, 48,", -r",2, 24,"]")
-       TEST_RPR(  "streqh      r",14,VAL2,", [r",11,0, ", r",12, 48,"]")
-       TEST_UNSUPPORTED(  "streqh      r14, [r13, r12]")
-       TEST_UNSUPPORTED(  "streqh      r14, [r12, r13]")
+       TEST_RPR(  "strheq      r",14,VAL2,", [r",11,0, ", r",12, 48,"]")
+       TEST_UNSUPPORTED(  "strheq      r14, [r13, r12]")
+       TEST_UNSUPPORTED(  "strheq      r14, [r12, r13]")
        TEST_RPR(  "strh        r",1, VAL1,", [r",2, 24,", r",3,  48,"]!")
-       TEST_RPR(  "strneh      r",12,VAL2,", [r",11,48,", -r",10,24,"]!")
+       TEST_RPR(  "strhne      r",12,VAL2,", [r",11,48,", -r",10,24,"]!")
        TEST_RPR(  "strh        r",2, VAL1,", [r",3, 24,"], r",4, 48,"")
        TEST_RPR(  "strh        r",10,VAL2,", [r",9, 48,"], -r",11,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1afc0ba) "       @ strh r12, [pc, r10]!")
@@ -489,9 +489,9 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe089a0bf) "       @ strh r10, [r9], pc")
 
        TEST_PR(   "ldrh        r0, [r",0,  48,", -r",2, 24,"]")
-       TEST_PR(   "ldrcsh      r14, [r",13,0, ", r",12, 48,"]")
+       TEST_PR(   "ldrhcs      r14, [r",13,0, ", r",12, 48,"]")
        TEST_PR(   "ldrh        r1, [r",2,  24,", r",3,  48,"]!")
-       TEST_PR(   "ldrcch      r12, [r",11,48,", -r",10,24,"]!")
+       TEST_PR(   "ldrhcc      r12, [r",11,48,", -r",10,24,"]!")
        TEST_PR(   "ldrh        r2, [r",3,  24,"], r",4, 48,"")
        TEST_PR(   "ldrh        r10, [r",9, 48,"], -r",11,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1bfc0ba) "       @ ldrh r12, [pc, r10]!")
@@ -499,9 +499,9 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe099a0bf) "       @ ldrh r10, [r9], pc")
 
        TEST_RP(   "strh        r",0, VAL1,", [r",1, 24,", #-2]")
-       TEST_RP(   "strmih      r",14,VAL2,", [r",13,0, ", #2]")
+       TEST_RP(   "strhmi      r",14,VAL2,", [r",13,0, ", #2]")
        TEST_RP(   "strh        r",1, VAL1,", [r",2, 24,", #4]!")
-       TEST_RP(   "strplh      r",12,VAL2,", [r",11,24,", #-4]!")
+       TEST_RP(   "strhpl      r",12,VAL2,", [r",11,24,", #-4]!")
        TEST_RP(   "strh        r",2, VAL1,", [r",3, 24,"], #48")
        TEST_RP(   "strh        r",10,VAL2,", [r",9, 64,"], #-48")
        TEST_RP(   "strh        r",3, VAL1,", [r",13,TEST_MEMORY_SIZE,", #-"__stringify(MAX_STACK_SIZE)"]!")
@@ -511,9 +511,9 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0c9f3b0) "       @ strh pc, [r9], #48")
 
        TEST_P(    "ldrh        r0, [r",0,  24,", #-2]")
-       TEST_P(    "ldrvsh      r14, [r",13,0, ", #2]")
+       TEST_P(    "ldrhvs      r14, [r",13,0, ", #2]")
        TEST_P(    "ldrh        r1, [r",2,  24,", #4]!")
-       TEST_P(    "ldrvch      r12, [r",11,24,", #-4]!")
+       TEST_P(    "ldrhvc      r12, [r",11,24,", #-4]!")
        TEST_P(    "ldrh        r2, [r",3,  24,"], #48")
        TEST_P(    "ldrh        r10, [r",9, 64,"], #-48")
        TEST(      "ldrh        r0, [pc, #0]")
@@ -521,18 +521,18 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0d9f3b0) "       @ ldrh pc, [r9], #48")
 
        TEST_PR(   "ldrsb       r0, [r",0,  48,", -r",2, 24,"]")
-       TEST_PR(   "ldrhisb     r14, [r",13,0,", r",12,  48,"]")
+       TEST_PR(   "ldrsbhi     r14, [r",13,0,", r",12,  48,"]")
        TEST_PR(   "ldrsb       r1, [r",2,  24,", r",3,  48,"]!")
-       TEST_PR(   "ldrlssb     r12, [r",11,48,", -r",10,24,"]!")
+       TEST_PR(   "ldrsbls     r12, [r",11,48,", -r",10,24,"]!")
        TEST_PR(   "ldrsb       r2, [r",3,  24,"], r",4, 48,"")
        TEST_PR(   "ldrsb       r10, [r",9, 48,"], -r",11,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1bfc0da) "       @ ldrsb r12, [pc, r10]!")
        TEST_UNSUPPORTED(__inst_arm(0xe099f0db) "       @ ldrsb pc, [r9], r11")
 
        TEST_P(    "ldrsb       r0, [r",0,  24,", #-1]")
-       TEST_P(    "ldrgesb     r14, [r",13,0, ", #1]")
+       TEST_P(    "ldrsbge     r14, [r",13,0, ", #1]")
        TEST_P(    "ldrsb       r1, [r",2,  24,", #4]!")
-       TEST_P(    "ldrltsb     r12, [r",11,24,", #-4]!")
+       TEST_P(    "ldrsblt     r12, [r",11,24,", #-4]!")
        TEST_P(    "ldrsb       r2, [r",3,  24,"], #48")
        TEST_P(    "ldrsb       r10, [r",9, 64,"], #-48")
        TEST(      "ldrsb       r0, [pc, #0]")
@@ -540,18 +540,18 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0d9f3d0) "       @ ldrsb pc, [r9], #48")
 
        TEST_PR(   "ldrsh       r0, [r",0,  48,", -r",2, 24,"]")
-       TEST_PR(   "ldrgtsh     r14, [r",13,0, ", r",12, 48,"]")
+       TEST_PR(   "ldrshgt     r14, [r",13,0, ", r",12, 48,"]")
        TEST_PR(   "ldrsh       r1, [r",2,  24,", r",3,  48,"]!")
-       TEST_PR(   "ldrlesh     r12, [r",11,48,", -r",10,24,"]!")
+       TEST_PR(   "ldrshle     r12, [r",11,48,", -r",10,24,"]!")
        TEST_PR(   "ldrsh       r2, [r",3,  24,"], r",4, 48,"")
        TEST_PR(   "ldrsh       r10, [r",9, 48,"], -r",11,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1bfc0fa) "       @ ldrsh r12, [pc, r10]!")
        TEST_UNSUPPORTED(__inst_arm(0xe099f0fb) "       @ ldrsh pc, [r9], r11")
 
        TEST_P(    "ldrsh       r0, [r",0,  24,", #-1]")
-       TEST_P(    "ldreqsh     r14, [r",13,0 ,", #1]")
+       TEST_P(    "ldrsheq     r14, [r",13,0 ,", #1]")
        TEST_P(    "ldrsh       r1, [r",2,  24,", #4]!")
-       TEST_P(    "ldrnesh     r12, [r",11,24,", #-4]!")
+       TEST_P(    "ldrshne     r12, [r",11,24,", #-4]!")
        TEST_P(    "ldrsh       r2, [r",3,  24,"], #48")
        TEST_P(    "ldrsh       r10, [r",9, 64,"], #-48")
        TEST(      "ldrsh       r0, [pc, #0]")
@@ -571,30 +571,30 @@ void kprobe_arm_test_cases(void)
 
 #if __LINUX_ARM_ARCH__ >= 5
        TEST_RPR(  "strd        r",0, VAL1,", [r",1, 48,", -r",2,24,"]")
-       TEST_RPR(  "strccd      r",8, VAL2,", [r",11,0, ", r",12,48,"]")
-       TEST_UNSUPPORTED(  "strccd r8, [r13, r12]")
-       TEST_UNSUPPORTED(  "strccd r8, [r12, r13]")
+       TEST_RPR(  "strdcc      r",8, VAL2,", [r",11,0, ", r",12,48,"]")
+       TEST_UNSUPPORTED(  "strdcc r8, [r13, r12]")
+       TEST_UNSUPPORTED(  "strdcc r8, [r12, r13]")
        TEST_RPR(  "strd        r",4, VAL1,", [r",2, 24,", r",3, 48,"]!")
-       TEST_RPR(  "strcsd      r",12,VAL2,", [r",11,48,", -r",10,24,"]!")
-       TEST_RPR(  "strd        r",2, VAL1,", [r",5, 24,"], r",4,48,"")
-       TEST_RPR(  "strd        r",10,VAL2,", [r",9, 48,"], -r",7,24,"")
+       TEST_RPR(  "strdcs      r",12,VAL2,", r13, [r",11,48,", -r",10,24,"]!")
+       TEST_RPR(  "strd        r",2, VAL1,", r3, [r",5, 24,"], r",4,48,"")
+       TEST_RPR(  "strd        r",10,VAL2,", r11, [r",9, 48,"], -r",7,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1afc0fa) "       @ strd r12, [pc, r10]!")
 
        TEST_PR(   "ldrd        r0, [r",0, 48,", -r",2,24,"]")
-       TEST_PR(   "ldrmid      r8, [r",13,0, ", r",12,48,"]")
+       TEST_PR(   "ldrdmi      r8, [r",13,0, ", r",12,48,"]")
        TEST_PR(   "ldrd        r4, [r",2, 24,", r",3, 48,"]!")
-       TEST_PR(   "ldrpld      r6, [r",11,48,", -r",10,24,"]!")
-       TEST_PR(   "ldrd        r2, [r",5, 24,"], r",4,48,"")
-       TEST_PR(   "ldrd        r10, [r",9,48,"], -r",7,24,"")
+       TEST_PR(   "ldrdpl      r6, [r",11,48,", -r",10,24,"]!")
+       TEST_PR(   "ldrd        r2, r3, [r",5, 24,"], r",4,48,"")
+       TEST_PR(   "ldrd        r10, r11, [r",9,48,"], -r",7,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1afc0da) "       @ ldrd r12, [pc, r10]!")
        TEST_UNSUPPORTED(__inst_arm(0xe089f0db) "       @ ldrd pc, [r9], r11")
        TEST_UNSUPPORTED(__inst_arm(0xe089e0db) "       @ ldrd lr, [r9], r11")
        TEST_UNSUPPORTED(__inst_arm(0xe089c0df) "       @ ldrd r12, [r9], pc")
 
        TEST_RP(   "strd        r",0, VAL1,", [r",1, 24,", #-8]")
-       TEST_RP(   "strvsd      r",8, VAL2,", [r",13,0, ", #8]")
+       TEST_RP(   "strdvs      r",8, VAL2,", [r",13,0, ", #8]")
        TEST_RP(   "strd        r",4, VAL1,", [r",2, 24,", #16]!")
-       TEST_RP(   "strvcd      r",12,VAL2,", [r",11,24,", #-16]!")
+       TEST_RP(   "strdvc      r",12,VAL2,", r13, [r",11,24,", #-16]!")
        TEST_RP(   "strd        r",2, VAL1,", [r",4, 24,"], #48")
        TEST_RP(   "strd        r",10,VAL2,", [r",9, 64,"], #-48")
        TEST_RP(   "strd        r",6, VAL1,", [r",13,TEST_MEMORY_SIZE,", #-"__stringify(MAX_STACK_SIZE)"]!")
@@ -603,9 +603,9 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe1efc3f0) "       @ strd r12, [pc, #48]!")
 
        TEST_P(    "ldrd        r0, [r",0, 24,", #-8]")
-       TEST_P(    "ldrhid      r8, [r",13,0, ", #8]")
+       TEST_P(    "ldrdhi      r8, [r",13,0, ", #8]")
        TEST_P(    "ldrd        r4, [r",2, 24,", #16]!")
-       TEST_P(    "ldrlsd      r6, [r",11,24,", #-16]!")
+       TEST_P(    "ldrdls      r6, [r",11,24,", #-16]!")
        TEST_P(    "ldrd        r2, [r",5, 24,"], #48")
        TEST_P(    "ldrd        r10, [r",9,6,"], #-48")
        TEST_UNSUPPORTED(__inst_arm(0xe1efc3d0) "       @ ldrd r12, [pc, #48]!")
@@ -1084,63 +1084,63 @@ void kprobe_arm_test_cases(void)
        TEST_GROUP("Branch, branch with link, and block data transfer")
 
        TEST_P(   "stmda        r",0, 16*4,", {r0}")
-       TEST_P(   "stmeqda      r",4, 16*4,", {r0-r15}")
-       TEST_P(   "stmneda      r",8, 16*4,"!, {r8-r15}")
+       TEST_P(   "stmdaeq      r",4, 16*4,", {r0-r15}")
+       TEST_P(   "stmdane      r",8, 16*4,"!, {r8-r15}")
        TEST_P(   "stmda        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_P(   "stmda        r",13,0,   "!, {pc}")
 
        TEST_P(   "ldmda        r",0, 16*4,", {r0}")
-       TEST_BF_P("ldmcsda      r",4, 15*4,", {r0-r15}")
-       TEST_BF_P("ldmccda      r",7, 15*4,"!, {r8-r15}")
+       TEST_BF_P("ldmdacs      r",4, 15*4,", {r0-r15}")
+       TEST_BF_P("ldmdacc      r",7, 15*4,"!, {r8-r15}")
        TEST_P(   "ldmda        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_BF_P("ldmda        r",14,15*4,"!, {pc}")
 
        TEST_P(   "stmia        r",0, 16*4,", {r0}")
-       TEST_P(   "stmmiia      r",4, 16*4,", {r0-r15}")
-       TEST_P(   "stmplia      r",8, 16*4,"!, {r8-r15}")
+       TEST_P(   "stmiami      r",4, 16*4,", {r0-r15}")
+       TEST_P(   "stmiapl      r",8, 16*4,"!, {r8-r15}")
        TEST_P(   "stmia        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_P(   "stmia        r",14,0,   "!, {pc}")
 
        TEST_P(   "ldmia        r",0, 16*4,", {r0}")
-       TEST_BF_P("ldmvsia      r",4, 0,   ", {r0-r15}")
-       TEST_BF_P("ldmvcia      r",7, 8*4, "!, {r8-r15}")
+       TEST_BF_P("ldmiavs      r",4, 0,   ", {r0-r15}")
+       TEST_BF_P("ldmiavc      r",7, 8*4, "!, {r8-r15}")
        TEST_P(   "ldmia        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_BF_P("ldmia        r",14,15*4,"!, {pc}")
 
        TEST_P(   "stmdb        r",0, 16*4,", {r0}")
-       TEST_P(   "stmhidb      r",4, 16*4,", {r0-r15}")
-       TEST_P(   "stmlsdb      r",8, 16*4,"!, {r8-r15}")
+       TEST_P(   "stmdbhi      r",4, 16*4,", {r0-r15}")
+       TEST_P(   "stmdbls      r",8, 16*4,"!, {r8-r15}")
        TEST_P(   "stmdb        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_P(   "stmdb        r",13,4,   "!, {pc}")
 
        TEST_P(   "ldmdb        r",0, 16*4,", {r0}")
-       TEST_BF_P("ldmgedb      r",4, 16*4,", {r0-r15}")
-       TEST_BF_P("ldmltdb      r",7, 16*4,"!, {r8-r15}")
+       TEST_BF_P("ldmdbge      r",4, 16*4,", {r0-r15}")
+       TEST_BF_P("ldmdblt      r",7, 16*4,"!, {r8-r15}")
        TEST_P(   "ldmdb        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_BF_P("ldmdb        r",14,16*4,"!, {pc}")
 
        TEST_P(   "stmib        r",0, 16*4,", {r0}")
-       TEST_P(   "stmgtib      r",4, 16*4,", {r0-r15}")
-       TEST_P(   "stmleib      r",8, 16*4,"!, {r8-r15}")
+       TEST_P(   "stmibgt      r",4, 16*4,", {r0-r15}")
+       TEST_P(   "stmible      r",8, 16*4,"!, {r8-r15}")
        TEST_P(   "stmib        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_P(   "stmib        r",13,-4,  "!, {pc}")
 
        TEST_P(   "ldmib        r",0, 16*4,", {r0}")
-       TEST_BF_P("ldmeqib      r",4, -4,", {r0-r15}")
-       TEST_BF_P("ldmneib      r",7, 7*4,"!, {r8-r15}")
+       TEST_BF_P("ldmibeq      r",4, -4,", {r0-r15}")
+       TEST_BF_P("ldmibne      r",7, 7*4,"!, {r8-r15}")
        TEST_P(   "ldmib        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_BF_P("ldmib        r",14,14*4,"!, {pc}")
 
        TEST_P(   "stmdb        r",13,16*4,"!, {r3-r12,lr}")
-       TEST_P(   "stmeqdb      r",13,16*4,"!, {r3-r12}")
-       TEST_P(   "stmnedb      r",2, 16*4,", {r3-r12,lr}")
+       TEST_P(   "stmdbeq      r",13,16*4,"!, {r3-r12}")
+       TEST_P(   "stmdbne      r",2, 16*4,", {r3-r12,lr}")
        TEST_P(   "stmdb        r",13,16*4,"!, {r2-r12,lr}")
        TEST_P(   "stmdb        r",0, 16*4,", {r0-r12}")
        TEST_P(   "stmdb        r",0, 16*4,", {r0-r12,lr}")
 
        TEST_BF_P("ldmia        r",13,5*4, "!, {r3-r12,pc}")
-       TEST_P(   "ldmccia      r",13,5*4, "!, {r3-r12}")
-       TEST_BF_P("ldmcsia      r",2, 5*4, "!, {r3-r12,pc}")
+       TEST_P(   "ldmiacc      r",13,5*4, "!, {r3-r12}")
+       TEST_BF_P("ldmiacs      r",2, 5*4, "!, {r3-r12,pc}")
        TEST_BF_P("ldmia        r",13,4*4, "!, {r2-r12,pc}")
        TEST_P(   "ldmia        r",0, 16*4,", {r0-r12}")
        TEST_P(   "ldmia        r",0, 16*4,", {r0-r12,lr}")
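[Editor's note] The same unified-syntax rule drives the block-transfer renames in this hunk: the addressing-mode suffix (ia/ib/da/db) stays glued to the mnemonic and the condition code moves to the end, hence "stmeqdb" becomes "stmdbeq". A compile-only sketch, assuming an ARM toolchain (names illustrative):

    void __attribute__((naked)) ual_block_transfer_demo(void)
    {
            __asm__ volatile(
                    ".syntax unified\n\t"
                    "cmp     r0, #0\n\t"
                    "stmdbeq sp!, {r4-r7}\n\t"   /* pre-UAL: stmeqdb */
                    "ldmiaeq sp!, {r4-r7}\n\t"   /* pre-UAL: ldmeqia */
                    "bx      lr\n\t");
    }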
@@ -1174,80 +1174,80 @@ void kprobe_arm_test_cases(void)
 #define TEST_COPROCESSOR(code) TEST_UNSUPPORTED(code)
 
 #define COPROCESSOR_INSTRUCTIONS_ST_LD(two,cc)                                 \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13, #4]")                     \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13, #-4]")                    \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13, #4]!")                    \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13, #-4]!")                   \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13], #4")                     \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13], #-4")                    \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13], {1}")                    \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13, #4]")                     \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13, #-4]")                    \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13, #4]!")                    \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13, #-4]!")                   \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13], #4")                     \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13], #-4")                    \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13], {1}")                    \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13, #4]")                     \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13, #-4]")                    \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13, #4]!")                    \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13, #-4]!")                   \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13], #4")                     \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13], #-4")                    \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13], {1}")                    \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13, #4]")                     \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13, #-4]")                    \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13, #4]!")                    \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13, #-4]!")                   \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13], #4")                     \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13], #-4")                    \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13], {1}")                    \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13, #4]")                    \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13, #-4]")                   \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13, #4]!")                   \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13, #-4]!")                  \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13], #4")                    \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13], #-4")                   \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13], {1}")                   \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13, #4]")                    \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13, #-4]")                   \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13, #4]!")                   \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13, #-4]!")                  \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13], #4")                    \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13], #-4")                   \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13], {1}")                   \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13, #4]")                    \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13, #-4]")                   \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13, #4]!")                   \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13, #-4]!")                  \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13], #4")                    \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13], #-4")                   \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13], {1}")                   \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13, #4]")                    \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13, #-4]")                   \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13, #4]!")                   \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13, #-4]!")                  \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13], #4")                    \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13], #-4")                   \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13], {1}")                   \
                                                                                \
-       TEST_COPROCESSOR( "stc"two"     0, cr0, [r15, #4]")                     \
-       TEST_COPROCESSOR( "stc"two"     0, cr0, [r15, #-4]")                    \
+       TEST_COPROCESSOR( "stc"two"     p0, cr0, [r15, #4]")                    \
+       TEST_COPROCESSOR( "stc"two"     p0, cr0, [r15, #-4]")                   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##daf0001) "  @ stc"two"      0, cr0, [r15, #4]!")    \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##d2f0001) "  @ stc"two"      0, cr0, [r15, #-4]!")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##caf0001) "  @ stc"two"      0, cr0, [r15], #4")     \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c2f0001) "  @ stc"two"      0, cr0, [r15], #-4")    \
-       TEST_COPROCESSOR( "stc"two"     0, cr0, [r15], {1}")                    \
-       TEST_COPROCESSOR( "stc"two"l    0, cr0, [r15, #4]")                     \
-       TEST_COPROCESSOR( "stc"two"l    0, cr0, [r15, #-4]")                    \
+       TEST_COPROCESSOR( "stc"two"     p0, cr0, [r15], {1}")                   \
+       TEST_COPROCESSOR( "stc"two"l    p0, cr0, [r15, #4]")                    \
+       TEST_COPROCESSOR( "stc"two"l    p0, cr0, [r15, #-4]")                   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##def0001) "  @ stc"two"l     0, cr0, [r15, #4]!")    \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##d6f0001) "  @ stc"two"l     0, cr0, [r15, #-4]!")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##cef0001) "  @ stc"two"l     0, cr0, [r15], #4")     \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c6f0001) "  @ stc"two"l     0, cr0, [r15], #-4")    \
-       TEST_COPROCESSOR( "stc"two"l    0, cr0, [r15], {1}")                    \
-       TEST_COPROCESSOR( "ldc"two"     0, cr0, [r15, #4]")                     \
-       TEST_COPROCESSOR( "ldc"two"     0, cr0, [r15, #-4]")                    \
+       TEST_COPROCESSOR( "stc"two"l    p0, cr0, [r15], {1}")                   \
+       TEST_COPROCESSOR( "ldc"two"     p0, cr0, [r15, #4]")                    \
+       TEST_COPROCESSOR( "ldc"two"     p0, cr0, [r15, #-4]")                   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##dbf0001) "  @ ldc"two"      0, cr0, [r15, #4]!")    \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##d3f0001) "  @ ldc"two"      0, cr0, [r15, #-4]!")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##cbf0001) "  @ ldc"two"      0, cr0, [r15], #4")     \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c3f0001) "  @ ldc"two"      0, cr0, [r15], #-4")    \
-       TEST_COPROCESSOR( "ldc"two"     0, cr0, [r15], {1}")                    \
-       TEST_COPROCESSOR( "ldc"two"l    0, cr0, [r15, #4]")                     \
-       TEST_COPROCESSOR( "ldc"two"l    0, cr0, [r15, #-4]")                    \
+       TEST_COPROCESSOR( "ldc"two"     p0, cr0, [r15], {1}")                   \
+       TEST_COPROCESSOR( "ldc"two"l    p0, cr0, [r15, #4]")                    \
+       TEST_COPROCESSOR( "ldc"two"l    p0, cr0, [r15, #-4]")                   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##dff0001) "  @ ldc"two"l     0, cr0, [r15, #4]!")    \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##d7f0001) "  @ ldc"two"l     0, cr0, [r15, #-4]!")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##cff0001) "  @ ldc"two"l     0, cr0, [r15], #4")     \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c7f0001) "  @ ldc"two"l     0, cr0, [r15], #-4")    \
-       TEST_COPROCESSOR( "ldc"two"l    0, cr0, [r15], {1}")
+       TEST_COPROCESSOR( "ldc"two"l    p0, cr0, [r15], {1}")
 
 #define COPROCESSOR_INSTRUCTIONS_MC_MR(two,cc)                                 \
                                                                                \
-       TEST_COPROCESSOR( "mcrr"two"    0, 15, r0, r14, cr0")                   \
-       TEST_COPROCESSOR( "mcrr"two"    15, 0, r14, r0, cr15")                  \
+       TEST_COPROCESSOR( "mcrr"two"    p0, 15, r0, r14, cr0")                  \
+       TEST_COPROCESSOR( "mcrr"two"    p15, 0, r14, r0, cr15")                 \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c4f00f0) "  @ mcrr"two"     0, 15, r0, r15, cr0")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c40ff0f) "  @ mcrr"two"     15, 0, r15, r0, cr15")  \
-       TEST_COPROCESSOR( "mrrc"two"    0, 15, r0, r14, cr0")                   \
-       TEST_COPROCESSOR( "mrrc"two"    15, 0, r14, r0, cr15")                  \
+       TEST_COPROCESSOR( "mrrc"two"    p0, 15, r0, r14, cr0")                  \
+       TEST_COPROCESSOR( "mrrc"two"    p15, 0, r14, r0, cr15")                 \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c5f00f0) "  @ mrrc"two"     0, 15, r0, r15, cr0")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c50ff0f) "  @ mrrc"two"     15, 0, r15, r0, cr15")  \
-       TEST_COPROCESSOR( "cdp"two"     15, 15, cr15, cr15, cr15, 7")           \
-       TEST_COPROCESSOR( "cdp"two"     0, 0, cr0, cr0, cr0, 0")                \
-       TEST_COPROCESSOR( "mcr"two"     15, 7, r15, cr15, cr15, 7")             \
-       TEST_COPROCESSOR( "mcr"two"     0, 0, r0, cr0, cr0, 0")                 \
-       TEST_COPROCESSOR( "mrc"two"     15, 7, r15, cr15, cr15, 7")             \
-       TEST_COPROCESSOR( "mrc"two"     0, 0, r0, cr0, cr0, 0")
+       TEST_COPROCESSOR( "cdp"two"     p15, 15, cr15, cr15, cr15, 7")          \
+       TEST_COPROCESSOR( "cdp"two"     p0, 0, cr0, cr0, cr0, 0")               \
+       TEST_COPROCESSOR( "mcr"two"     p15, 7, r15, cr15, cr15, 7")            \
+       TEST_COPROCESSOR( "mcr"two"     p0, 0, r0, cr0, cr0, 0")                \
+       TEST_COPROCESSOR( "mrc"two"     p15, 7, r14, cr15, cr15, 7")            \
+       TEST_COPROCESSOR( "mrc"two"     p0, 0, r0, cr0, cr0, 0")
 
        COPROCESSOR_INSTRUCTIONS_ST_LD("",e)
 #if __LINUX_ARM_ARCH__ >= 5
index 19a5b2a..f1d5583 100644
@@ -108,6 +108,7 @@ struct test_arg_end {
 
 #define TESTCASE_START(title)                                  \
        __asm__ __volatile__ (                                  \
+       ".syntax unified                                \n\t"   \
        "bl     __kprobes_test_case_start               \n\t"   \
        ".pushsection .rodata                           \n\t"   \
        "10:                                            \n\t"   \
index 3654f97..87de1f6 100644
@@ -8,16 +8,15 @@
 gen := arch/$(ARCH)/include/generated
 kapi := $(gen)/asm
 uapi := $(gen)/uapi/asm
-syshdr := $(srctree)/$(src)/syscallhdr.sh
+syshdr := $(srctree)/scripts/syscallhdr.sh
 sysnr := $(srctree)/$(src)/syscallnr.sh
-systbl := $(srctree)/$(src)/syscalltbl.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
 syscall := $(src)/syscall.tbl
 
 gen-y := $(gen)/calls-oabi.S
 gen-y += $(gen)/calls-eabi.S
 kapi-hdrs-y := $(kapi)/unistd-nr.h
 kapi-hdrs-y += $(kapi)/mach-types.h
-uapi-hdrs-y := $(uapi)/unistd-common.h
 uapi-hdrs-y += $(uapi)/unistd-oabi.h
 uapi-hdrs-y += $(uapi)/unistd-eabi.h
 
@@ -41,28 +40,21 @@ $(kapi)/mach-types.h: $(src)/gen-mach-types $(src)/mach-types FORCE
        $(call if_changed,gen_mach)
 
 quiet_cmd_syshdr = SYSHDR  $@
-      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \
-                  '$(syshdr_abi_$(basetarget))' \
-                  '$(syshdr_pfx_$(basetarget))' \
-                  '__NR_SYSCALL_BASE'
+      cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) \
+                  --offset __NR_SYSCALL_BASE $< $@
 
 quiet_cmd_systbl = SYSTBL  $@
-      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \
-                  '$(systbl_abi_$(basetarget))'
+      cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@
 
 quiet_cmd_sysnr  = SYSNR   $@
       cmd_sysnr  = $(CONFIG_SHELL) '$(sysnr)' '$<' '$@' \
                   '$(syshdr_abi_$(basetarget))'
 
-syshdr_abi_unistd-common := common
-$(uapi)/unistd-common.h: $(syscall) $(syshdr) FORCE
-       $(call if_changed,syshdr)
-
-syshdr_abi_unistd-oabi := oabi
+$(uapi)/unistd-oabi.h: abis := common,oabi
 $(uapi)/unistd-oabi.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
 
-syshdr_abi_unistd-eabi := eabi
+$(uapi)/unistd-eabi.h: abis := common,eabi
 $(uapi)/unistd-eabi.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
 
@@ -70,10 +62,10 @@ sysnr_abi_unistd-nr := common,oabi,eabi,compat
 $(kapi)/unistd-nr.h: $(syscall) $(sysnr) FORCE
        $(call if_changed,sysnr)
 
-systbl_abi_calls-oabi := common,oabi
+$(gen)/calls-oabi.S: abis := common,oabi
 $(gen)/calls-oabi.S: $(syscall) $(systbl) FORCE
        $(call if_changed,systbl)
 
-systbl_abi_calls-eabi := common,eabi
+$(gen)/calls-eabi.S: abis := common,eabi
 $(gen)/calls-eabi.S: $(syscall) $(systbl) FORCE
        $(call if_changed,systbl)
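[Editor's note] The net effect of this Makefile hunk is that arch/arm now feeds its syscall.tbl through the generic scripts/syscallhdr.sh and scripts/syscalltbl.sh, selecting rows per target with "--abis" instead of the per-target syshdr_abi_*/systbl_abi_* variables, and the now-redundant unistd-common.h goes away. For a table row such as "0 common restart_syscall sys_restart_syscall", the generated header entry comes out in roughly this shape (a sketch; the guard name is assumed, and --offset supplies __NR_SYSCALL_BASE):

    #ifndef _UAPI_ASM_ARM_UNISTD_OABI_H
    #define _UAPI_ASM_ARM_UNISTD_OABI_H

    #define __NR_restart_syscall    (__NR_SYSCALL_BASE + 0)
    #define __NR_exit               (__NR_SYSCALL_BASE + 1)

    #endif /* _UAPI_ASM_ARM_UNISTD_OABI_H */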
diff --git a/arch/arm/tools/syscallhdr.sh b/arch/arm/tools/syscallhdr.sh
deleted file mode 100644
index 6b2f25c..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-prefix="$4"
-offset="$5"
-
-fileguard=_ASM_ARM_`basename "$out" | sed \
-    -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
-    -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-if echo $out | grep -q uapi; then
-    fileguard="_UAPI$fileguard"
-fi
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-    echo "#ifndef ${fileguard}"
-    echo "#define ${fileguard} 1"
-    echo ""
-
-    while read nr abi name entry ; do
-       if [ -z "$offset" ]; then
-           echo "#define __NR_${prefix}${name} $nr"
-       else
-           echo "#define __NR_${prefix}${name} ($offset + $nr)"
-        fi
-    done
-
-    echo ""
-    echo "#endif /* ${fileguard} */"
-) > "$out"
diff --git a/arch/arm/tools/syscalltbl.sh b/arch/arm/tools/syscalltbl.sh
deleted file mode 100644
index ae7e93c..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-    while read nr abi name entry compat; do
-        if [ "$abi" = "eabi" -a -n "$compat" ]; then
-            echo "$in: error: a compat entry for an EABI syscall ($name) makes no sense" >&2
-            exit 1
-        fi
-
-       if [ -n "$entry" ]; then
-            if [ -z "$compat" ]; then
-                echo "NATIVE($nr, $entry)"
-            else
-                echo "COMPAT($nr, $entry, $compat)"
-            fi
-        fi
-    done
-) > "$out"
index 7f2a800..9f1d856 100644
@@ -11,6 +11,12 @@ config ARM64
        select ACPI_PPTT if ACPI
        select ARCH_HAS_DEBUG_WX
        select ARCH_BINFMT_ELF_STATE
+       select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
+       select ARCH_ENABLE_MEMORY_HOTPLUG
+       select ARCH_ENABLE_MEMORY_HOTREMOVE
+       select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
+       select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
+       select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DMA_PREP_COHERENT
@@ -72,6 +78,7 @@ config ARM64
        select ARCH_USE_QUEUED_SPINLOCKS
        select ARCH_USE_SYM_ANNOTATIONS
        select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+       select ARCH_SUPPORTS_HUGETLBFS
        select ARCH_SUPPORTS_MEMORY_FAILURE
        select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK
        select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN
@@ -163,7 +170,6 @@ config ARM64
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
        select HAVE_CONTEXT_TRACKING
-       select HAVE_DEBUG_BUGVERBOSE
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DMA_CONTIGUOUS
        select HAVE_DYNAMIC_FTRACE
@@ -213,6 +219,7 @@ config ARM64
        select SWIOTLB
        select SYSCTL_EXCEPTION_TRACE
        select THREAD_INFO_IN_TASK
+       select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
        help
          ARM 64-bit (AArch64) Linux support.
 
@@ -308,10 +315,7 @@ config ZONE_DMA32
        bool "Support DMA32 zone" if EXPERT
        default y
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
        def_bool y
 
 config SMP
@@ -1056,32 +1060,15 @@ source "kernel/Kconfig.hz"
 config ARCH_SPARSEMEM_ENABLE
        def_bool y
        select SPARSEMEM_VMEMMAP_ENABLE
-
-config ARCH_SPARSEMEM_DEFAULT
-       def_bool ARCH_SPARSEMEM_ENABLE
-
-config ARCH_SELECT_MEMORY_MODEL
-       def_bool ARCH_SPARSEMEM_ENABLE
-
-config ARCH_FLATMEM_ENABLE
-       def_bool !NUMA
+       select SPARSEMEM_VMEMMAP
 
 config HW_PERF_EVENTS
        def_bool y
        depends on ARM_PMU
 
-config SYS_SUPPORTS_HUGETLBFS
-       def_bool y
-
-config ARCH_HAS_CACHE_LINE_SIZE
-       def_bool y
-
 config ARCH_HAS_FILTER_PGPROT
        def_bool y
 
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-       def_bool y if PGTABLE_LEVELS > 2
-
 # Supported by clang >= 7.0
 config CC_HAVE_SHADOW_CALL_STACK
        def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
@@ -1923,14 +1910,6 @@ config SYSVIPC_COMPAT
        def_bool y
        depends on COMPAT && SYSVIPC
 
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
-       def_bool y
-       depends on HUGETLB_PAGE && MIGRATION
-
-config ARCH_ENABLE_THP_MIGRATION
-       def_bool y
-       depends on TRANSPARENT_HUGEPAGE
-
 menu "Power management options"
 
 source "kernel/power/Kconfig"
index 5eb7af9..55f57df 100644
@@ -131,6 +131,9 @@ static inline void local_daif_inherit(struct pt_regs *regs)
        if (interrupts_enabled(regs))
                trace_hardirqs_on();
 
+       if (system_uses_irq_prio_masking())
+               gic_write_pmr(regs->pmr_save);
+
        /*
         * We can't use local_daif_restore(regs->pstate) here as
         * system_has_prio_mask_debugging() won't restore the I bit if it can
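[Editor's note] Together with the entry.S and entry-common.c hunks below, this centralises pseudo-NMI priority-mask handling: kernel_entry now saves ICC_PMR_EL1 and masks IRQs, and local_daif_inherit() restores the saved value, so the per-handler gic_write_pmr() calls and the gic_prio_irq_setup macro removed later in this series become unnecessary. A conceptual C sketch of the pairing (names and values illustrative, not the kernel's exact code):

    #define GIC_PRIO_IRQON          0xe0    /* illustrative values */
    #define GIC_PRIO_PSR_I_SET      (1 << 4)

    struct fake_regs { unsigned long pmr_save; };
    static unsigned long fake_pmr;          /* stands in for ICC_PMR_EL1 */

    static void entry_save_and_mask(struct fake_regs *regs)
    {
            regs->pmr_save = fake_pmr;      /* str x20, [sp, #S_PMR_SAVE] */
            fake_pmr = GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET; /* msr ICC_PMR_EL1 */
    }

    static void daif_inherit(struct fake_regs *regs)
    {
            fake_pmr = regs->pmr_save;      /* gic_write_pmr(regs->pmr_save) */
    }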
index 587c504..d44df9d 100644
  * has a direct correspondence, and needs to appear sufficiently aligned
  * in the virtual address space.
  */
-#if defined(CONFIG_SPARSEMEM_VMEMMAP) && ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
+#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
 #define ARM64_MEMSTART_ALIGN   (1UL << SECTION_SIZE_BITS)
 #else
 #define ARM64_MEMSTART_ALIGN   (1UL << ARM64_MEMSTART_SHIFT)
index 6d9915d..87b90dc 100644
@@ -345,7 +345,7 @@ static inline void *phys_to_virt(phys_addr_t x)
  */
 #define ARCH_PFN_OFFSET                ((unsigned long)PHYS_PFN_OFFSET)
 
-#if !defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_DEBUG_VIRTUAL)
+#if defined(CONFIG_DEBUG_VIRTUAL)
 #define page_to_virt(x)        ({                                              \
        __typeof__(x) __page = x;                                       \
        void *__addr = __va(page_to_phys(__page));                      \
@@ -365,7 +365,7 @@ static inline void *phys_to_virt(phys_addr_t x)
        u64 __addr = VMEMMAP_START + (__idx * sizeof(struct page));     \
        (struct page *)__addr;                                          \
 })
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP || CONFIG_DEBUG_VIRTUAL */
+#endif /* CONFIG_DEBUG_VIRTUAL */
 
 #define virt_addr_valid(addr)  ({                                      \
        __typeof__(addr) __addr = __tag_reset(addr);                    \
index eb4a75d..4b73463 100644
@@ -5,7 +5,6 @@
 #ifndef __ASM_SPARSEMEM_H
 #define __ASM_SPARSEMEM_H
 
-#ifdef CONFIG_SPARSEMEM
 #define MAX_PHYSMEM_BITS       CONFIG_ARM64_PA_BITS
 
 /*
@@ -27,6 +26,4 @@
 #define SECTION_SIZE_BITS 27
 #endif /* CONFIG_ARM64_64K_PAGES */
 
-#endif /* CONFIG_SPARSEMEM*/
-
 #endif
index abc8463..c906d20 100644
@@ -133,11 +133,10 @@ static void clean_dcache_range_nopatch(u64 start, u64 end)
        } while (cur += d_size, cur < end);
 }
 
-static void __nocfi __apply_alternatives(void *alt_region,  bool is_module,
-                                        unsigned long *feature_mask)
+static void __nocfi __apply_alternatives(struct alt_region *region, bool is_module,
+                                unsigned long *feature_mask)
 {
        struct alt_instr *alt;
-       struct alt_region *region = alt_region;
        __le32 *origptr, *updptr;
        alternative_cb_t alt_cb;
 
index 30c82d3..efed283 100644
@@ -68,6 +68,7 @@
 #include <linux/sort.h>
 #include <linux/stop_machine.h>
 #include <linux/types.h>
+#include <linux/minmax.h>
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/kasan.h>
@@ -694,14 +695,14 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new,
                ret = ftrp->safe_val;
                break;
        case FTR_LOWER_SAFE:
-               ret = new < cur ? new : cur;
+               ret = min(new, cur);
                break;
        case FTR_HIGHER_OR_ZERO_SAFE:
                if (!cur || !new)
                        break;
                fallthrough;
        case FTR_HIGHER_SAFE:
-               ret = new > cur ? new : cur;
+               ret = max(new, cur);
                break;
        default:
                BUG();
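[Editor's note] min()/max() from <linux/minmax.h> replace the open-coded ternaries here; behaviour is identical for the s64 values involved, and the kernel macros add compile-time type checking. Equivalent standalone C for the two arbitration cases (a sketch, not the kernel macros themselves):

    #include <stdint.h>

    /* FTR_LOWER_SAFE: the smaller value is the safe one */
    static int64_t lower_safe(int64_t new, int64_t cur)
    {
            return new < cur ? new : cur;   /* == min(new, cur) */
    }

    /* FTR_HIGHER_SAFE: the larger value is the safe one */
    static int64_t higher_safe(int64_t new, int64_t cur)
    {
            return new > cur ? new : cur;   /* == max(new, cur) */
    }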
index b512b55..03991ee 100644
@@ -29,7 +29,7 @@ int arm_cpuidle_init(unsigned int cpu)
 
 /**
  * arm_cpuidle_suspend() - function to enter a low-power idle state
- * @arg: argument to pass to CPU suspend operations
+ * @index: argument to pass to CPU suspend operations
  *
  * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU
  * operations back-end error code otherwise.
index a1ec351..340d04e 100644
@@ -230,14 +230,6 @@ static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr)
 {
        unsigned long far = read_sysreg(far_el1);
 
-       /*
-        * The CPU masked interrupts, and we are leaving them masked during
-        * do_debug_exception(). Update PMR as if we had called
-        * local_daif_mask().
-        */
-       if (system_uses_irq_prio_masking())
-               gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
        arm64_enter_el1_dbg(regs);
        if (!cortex_a76_erratum_1463225_debug_handler(regs))
                do_debug_exception(far, esr, regs);
@@ -404,9 +396,6 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr)
        /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */
        unsigned long far = read_sysreg(far_el1);
 
-       if (system_uses_irq_prio_masking())
-               gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
        enter_from_user_mode();
        do_debug_exception(far, esr, regs);
        local_daif_restore(DAIF_PROCCTX_NOIRQ);
@@ -414,9 +403,6 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr)
 
 static void noinstr el0_svc(struct pt_regs *regs)
 {
-       if (system_uses_irq_prio_masking())
-               gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
        enter_from_user_mode();
        cortex_a76_erratum_1463225_svc_handler();
        do_el0_svc(regs);
@@ -492,9 +478,6 @@ static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr)
 
 static void noinstr el0_svc_compat(struct pt_regs *regs)
 {
-       if (system_uses_irq_prio_masking())
-               gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
        enter_from_user_mode();
        cortex_a76_erratum_1463225_svc_handler();
        do_el0_svc_compat(regs);
index 4ac5455..3513984 100644
@@ -285,16 +285,16 @@ alternative_else_nop_endif
        stp     lr, x21, [sp, #S_LR]
 
        /*
-        * For exceptions from EL0, terminate the callchain here.
+        * For exceptions from EL0, create a terminal frame record.
         * For exceptions from EL1, create a synthetic frame record so the
         * interrupted code shows up in the backtrace.
         */
        .if \el == 0
-       mov     x29, xzr
+       stp     xzr, xzr, [sp, #S_STACKFRAME]
        .else
        stp     x29, x22, [sp, #S_STACKFRAME]
-       add     x29, sp, #S_STACKFRAME
        .endif
+       add     x29, sp, #S_STACKFRAME
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 alternative_if_not ARM64_HAS_PAN
@@ -314,6 +314,8 @@ alternative_else_nop_endif
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
        mrs_s   x20, SYS_ICC_PMR_EL1
        str     x20, [sp, #S_PMR_SAVE]
+       mov     x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
+       msr_s   SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
 
        /* Re-enable tag checking (TCO set on exception entry) */
@@ -550,17 +552,7 @@ tsk        .req    x28             // current thread_info
 #endif
        .endm
 
-       .macro  gic_prio_irq_setup, pmr:req, tmp:req
-#ifdef CONFIG_ARM64_PSEUDO_NMI
-       alternative_if ARM64_HAS_IRQ_PRIO_MASKING
-       orr     \tmp, \pmr, #GIC_PRIO_PSR_I_SET
-       msr_s   SYS_ICC_PMR_EL1, \tmp
-       alternative_else_nop_endif
-#endif
-       .endm
-
        .macro el1_interrupt_handler, handler:req
-       gic_prio_irq_setup pmr=x20, tmp=x1
        enable_da
 
        mov     x0, sp
@@ -590,7 +582,6 @@ alternative_else_nop_endif
        .endm
 
        .macro el0_interrupt_handler, handler:req
-       gic_prio_irq_setup pmr=x20, tmp=x0
        user_exit_irqoff
        enable_da
 
@@ -788,7 +779,6 @@ SYM_CODE_END(el0_fiq)
 SYM_CODE_START_LOCAL(el1_error)
        kernel_entry 1
        mrs     x1, esr_el1
-       gic_prio_kentry_setup tmp=x2
        enable_dbg
        mov     x0, sp
        bl      do_serror
@@ -799,7 +789,6 @@ SYM_CODE_START_LOCAL(el0_error)
        kernel_entry 0
 el0_error_naked:
        mrs     x25, esr_el1
-       gic_prio_kentry_setup tmp=x2
        user_exit_irqoff
        enable_dbg
        mov     x0, sp
index cbf5210..b4bb67f 100644
@@ -294,13 +294,10 @@ void __show_regs(struct pt_regs *regs)
        i = top_reg;
 
        while (i >= 0) {
-               printk("x%-2d: %016llx ", i, regs->regs[i]);
-               i--;
+               printk("x%-2d: %016llx", i, regs->regs[i]);
 
-               if (i % 2 == 0) {
-                       pr_cont("x%-2d: %016llx ", i, regs->regs[i]);
-                       i--;
-               }
+               while (i-- % 3)
+                       pr_cont(" x%-2d: %016llx", i, regs->regs[i]);
 
                pr_cont("\n");
        }
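[Editor's note] The rewritten loop prints up to three registers per line instead of two: printk() opens each line and the inner "while (i-- % 3)" emits up to two pr_cont() continuations. A standalone demo of the control flow (userspace stand-ins for printk/pr_cont; 31 registers assumed, as on arm64):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long regs[31] = { 0 };
            int i = 30;                     /* top_reg */

            while (i >= 0) {
                    printf("x%-2d: %016llx", i, regs[i]);      /* printk() */
                    while (i-- % 3)
                            printf(" x%-2d: %016llx", i, regs[i]); /* pr_cont() */
                    printf("\n");
            }
            return 0;
    }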
index 84b676b..de07147 100644
@@ -68,10 +68,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
        unsigned long fp = frame->fp;
        struct stack_info info;
 
-       /* Terminal record; nothing to unwind */
-       if (!fp)
-               return -ENOENT;
-
        if (fp & 0xf)
                return -EINVAL;
 
@@ -132,6 +128,12 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 
        frame->pc = ptrauth_strip_insn_pac(frame->pc);
 
+       /*
+        * This is a terminal record, so we have finished unwinding.
+        */
+       if (!frame->fp && !frame->pc)
+               return -ENOENT;
+
        return 0;
 }
 NOKPROBE_SYMBOL(unwind_frame);
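[Editor's note] With EL0 entries now writing a real all-zero frame record (see the entry.S hunk above), the unwinder can treat fp == 0 && pc == 0 as the terminator after consuming the final frame, instead of bailing out before reporting it. A simplified sketch of the new contract (raw loads stand in for the kernel's checked stack accessors):

    #include <errno.h>

    struct sframe { unsigned long fp, pc; };

    /* Sketch: assumes fp points at a valid {fp, lr} record on the stack. */
    static int unwind_frame_sketch(struct sframe *frame)
    {
            unsigned long fp = frame->fp;

            if (fp & 0xf)
                    return -EINVAL;         /* misaligned record */

            frame->fp = ((unsigned long *)fp)[0];   /* saved frame pointer */
            frame->pc = ((unsigned long *)fp)[1];   /* saved return address */

            /* terminal record: reported only after the last frame was used */
            if (!frame->fp && !frame->pc)
                    return -ENOENT;
            return 0;
    }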
index 61dbb4c..a5e61e0 100644
@@ -31,6 +31,13 @@ SECTIONS
        .gnu.version_d  : { *(.gnu.version_d) }
        .gnu.version_r  : { *(.gnu.version_r) }
 
+       /*
+        * Discard .note.gnu.property sections which are unused and have
+        * different alignment requirement from vDSO note sections.
+        */
+       /DISCARD/       : {
+               *(.note.GNU-stack .note.gnu.property)
+       }
        .note           : { *(.note.*) }                :text   :note
 
        . = ALIGN(16);
@@ -48,7 +55,6 @@ SECTIONS
        PROVIDE(end = .);
 
        /DISCARD/       : {
-               *(.note.GNU-stack)
                *(.data .data.* .gnu.linkonce.d.* .sdata*)
                *(.bss .sbss .dynbss .dynsbss)
                *(.eh_frame .eh_frame_hdr)
index 789ad42..3dba0c4 100644
@@ -10,15 +10,7 @@ include $(srctree)/lib/vdso/Makefile
 
 # Same as cc-*option, but using CC_COMPAT instead of CC
 ifeq ($(CONFIG_CC_IS_CLANG), y)
-COMPAT_GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE_COMPAT)elfedit))
-COMPAT_GCC_TOOLCHAIN := $(realpath $(COMPAT_GCC_TOOLCHAIN_DIR)/..)
-
 CC_COMPAT_CLANG_FLAGS := --target=$(notdir $(CROSS_COMPILE_COMPAT:%-=%))
-CC_COMPAT_CLANG_FLAGS += --prefix=$(COMPAT_GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE_COMPAT))
-CC_COMPAT_CLANG_FLAGS += -no-integrated-as -Qunused-arguments
-ifneq ($(COMPAT_GCC_TOOLCHAIN),)
-CC_COMPAT_CLANG_FLAGS += --gcc-toolchain=$(COMPAT_GCC_TOOLCHAIN)
-endif
 
 CC_COMPAT ?= $(CC)
 CC_COMPAT += $(CC_COMPAT_CLANG_FLAGS)
index 55ecf6d..58987a9 100644
@@ -252,7 +252,7 @@ void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
                set_pte(ptep, pte);
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, unsigned long sz)
 {
        pgd_t *pgdp;
@@ -284,9 +284,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                 */
                ptep = pte_alloc_map(mm, pmdp, addr);
        } else if (sz == PMD_SIZE) {
-               if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
-                   pud_none(READ_ONCE(*pudp)))
-                       ptep = huge_pmd_share(mm, addr, pudp);
+               if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
+                       ptep = huge_pmd_share(mm, vma, addr, pudp);
                else
                        ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
        } else if (sz == (CONT_PMD_SIZE)) {
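[Editor's note] huge_pte_alloc() gains the VMA argument because huge-PMD sharing became a per-VMA decision (for example, it must be refused on userfaultfd-registered ranges), not just a build-time one. Roughly the shape of the new gate, as a sketch; the real want_pmd_share() also takes the address and lives in mm/hugetlb.c, and the helpers below are hypothetical stand-ins:

    struct vm_area_struct;

    extern _Bool config_want_huge_pmd_share; /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
    extern _Bool vma_pmd_shareable(struct vm_area_struct *vma);

    static _Bool want_pmd_share_sketch(struct vm_area_struct *vma)
    {
            if (!config_want_huge_pmd_share)
                    return 0;
            /* per-VMA veto, e.g. uffd write-protected ranges must not share */
            return vma_pmd_shareable(vma);
    }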
index 0696a45..16a2b2b 100644
@@ -221,6 +221,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
 int pfn_valid(unsigned long pfn)
 {
        phys_addr_t addr = PFN_PHYS(pfn);
+       struct mem_section *ms;
 
        /*
         * Ensure the upper PAGE_SHIFT bits are clear in the
@@ -231,10 +232,6 @@ int pfn_valid(unsigned long pfn)
        if (PHYS_PFN(addr) != pfn)
                return 0;
 
-#ifdef CONFIG_SPARSEMEM
-{
-       struct mem_section *ms;
-
        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;
 
@@ -253,8 +250,7 @@ int pfn_valid(unsigned long pfn)
         */
        if (!early_section(ms))
                return pfn_section_valid(ms, pfn);
-}
-#endif
+
        return memblock_is_map_memory(addr);
 }
 EXPORT_SYMBOL(pfn_valid);
index 70fa3cd..6dd9369 100644
@@ -1113,7 +1113,6 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
 }
 #endif
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap)
@@ -1177,7 +1176,6 @@ void vmemmap_free(unsigned long start, unsigned long end,
        free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
 #endif
 }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static inline pud_t *fixmap_pud(unsigned long addr)
 {
index a50e92e..a1937df 100644
@@ -51,10 +51,8 @@ static struct addr_marker address_markers[] = {
        { FIXADDR_TOP,                  "Fixmap end" },
        { PCI_IO_START,                 "PCI I/O start" },
        { PCI_IO_END,                   "PCI I/O end" },
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
        { VMEMMAP_START,                "vmemmap start" },
        { VMEMMAP_START + VMEMMAP_SIZE, "vmemmap end" },
-#endif
        { -1,                           NULL },
 };
 
index 7aa16c7..c867a80 100644
@@ -9,6 +9,10 @@
 
 #include <linux/compiler.h>
 
+#include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/__fls.h>
+#include <asm-generic/bitops/fls64.h>
+
 #ifdef __KERNEL__
 
 #ifndef _LINUX_BITOPS_H
@@ -173,8 +177,4 @@ static inline unsigned long __ffs(unsigned long word)
 
 #endif /* __KERNEL__ */
 
-#include <asm-generic/bitops/fls.h>
-#include <asm-generic/bitops/__fls.h>
-#include <asm-generic/bitops/fls64.h>
-
 #endif /* _H8300_BITOPS_H */
index c168c69..74b644e 100644
@@ -10,6 +10,9 @@ LDFLAGS_vmlinux += -G0
 # Do not use single-byte enums; these will overflow.
 KBUILD_CFLAGS += -fno-short-enums
 
+# We must use long-calls:
+KBUILD_CFLAGS += -mlong-calls
+
 # Modules must use either long-calls, or use pic/plt.
 # Use long-calls for now, it's easier.  And faster.
 # KBUILD_CFLAGS_MODULE += -fPIC
@@ -30,9 +33,6 @@ TIR_NAME := r19
 KBUILD_CFLAGS += -ffixed-$(TIR_NAME) -DTHREADINFO_REG=$(TIR_NAME) -D__linux__
 KBUILD_AFLAGS += -DTHREADINFO_REG=$(TIR_NAME)
 
-LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name 2>/dev/null)
-libs-y += $(LIBGCC)
-
 head-y := arch/hexagon/kernel/head.o
 
 core-y += arch/hexagon/kernel/ \
index f19ae2a..9b2b1cc 100644
@@ -34,7 +34,6 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_SERIO is not set
 # CONFIG_CONSOLE_TRANSLATIONS is not set
 CONFIG_LEGACY_PTY_COUNT=64
-# CONFIG_DEVKMEM is not set
 # CONFIG_HW_RANDOM is not set
 CONFIG_SPI=y
 CONFIG_SPI_DEBUG=y
@@ -81,4 +80,3 @@ CONFIG_FRAME_WARN=0
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_FS=y
 # CONFIG_SCHED_DEBUG is not set
-CONFIG_DEBUG_INFO=y
index 6b9c554..9fb00a0 100644
@@ -21,7 +21,7 @@
        "3:\n" \
        ".section .fixup,\"ax\"\n" \
        "4: %1 = #%5;\n" \
-       "   jump 3b\n" \
+       "   jump ##3b\n" \
        ".previous\n" \
        ".section __ex_table,\"a\"\n" \
        ".long 1b,4b,2b,4b\n" \
@@ -90,7 +90,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
        "3:\n"
        ".section .fixup,\"ax\"\n"
        "4: %0 = #%6\n"
-       "   jump 3b\n"
+       "   jump ##3b\n"
        ".previous\n"
        ".section __ex_table,\"a\"\n"
        ".long 1b,4b,2b,4b\n"
index bda2a9c..c332414 100644
@@ -64,7 +64,6 @@ static inline void *phys_to_virt(unsigned long address)
  * convert a physical pointer to a virtual kernel pointer for
  * /dev/mem access.
  */
-#define xlate_dev_kmem_ptr(p)    __va(p)
 #define xlate_dev_mem_ptr(p)    __va(p)
 
 /*
index 78338d8..8d4ec76 100644
@@ -8,6 +8,7 @@
 
 #include <asm-generic/timex.h>
 #include <asm/timer-regs.h>
+#include <asm/hexagon_vm.h>
 
 /* Using TCX0 as our clock.  CLOCK_TICK_RATE scheduled to be removed. */
 #define CLOCK_TICK_RATE              TCX0_CLK_RATE
@@ -16,7 +17,7 @@
 
 static inline int read_current_timer(unsigned long *timer_val)
 {
-       *timer_val = (unsigned long) __vmgettime();
+       *timer_val = __vmgettime();
        return 0;
 }
 
index 6fb1aaa..35545a7 100644
@@ -35,8 +35,8 @@ EXPORT_SYMBOL(_dflt_cache_att);
 DECLARE_EXPORT(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes);
 
 /* Additional functions */
-DECLARE_EXPORT(__divsi3);
-DECLARE_EXPORT(__modsi3);
-DECLARE_EXPORT(__udivsi3);
-DECLARE_EXPORT(__umodsi3);
+DECLARE_EXPORT(__hexagon_divsi3);
+DECLARE_EXPORT(__hexagon_modsi3);
+DECLARE_EXPORT(__hexagon_udivsi3);
+DECLARE_EXPORT(__hexagon_umodsi3);
 DECLARE_EXPORT(csum_tcpudp_magic);
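[Editor's note] These exports change name because the kernel now carries its own copies of the Hexagon runtime division helpers (the new lib/*.S files below) instead of linking against libgcc, and the compiler emits calls using the __hexagon_ prefix on this target. Illustratively, plain C division lowers to such calls, which is why the symbols must be exported for modules (function names below are illustrative):

    /*
     * Sketch: Hexagon has no 32-bit integer divide instruction, so the
     * compiler expands '/' and '%' into runtime helper calls.
     */
    int scale(int numerator, int denominator)
    {
            return numerator / denominator; /* -> call __hexagon_divsi3 */
    }

    unsigned int rem(unsigned int a, unsigned int b)
    {
            return a % b;                   /* -> call __hexagon_umodsi3 */
    }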
index a5a89e9..8975f9b 100644
@@ -35,7 +35,7 @@ void user_disable_single_step(struct task_struct *child)
 
 static int genregs_get(struct task_struct *target,
                   const struct user_regset *regset,
-                  srtuct membuf to)
+                  struct membuf to)
 {
        struct pt_regs *regs = task_pt_regs(target);
 
@@ -54,7 +54,7 @@ static int genregs_get(struct task_struct *target,
        membuf_store(&to, regs->m0);
        membuf_store(&to, regs->m1);
        membuf_store(&to, regs->usr);
-       membuf_store(&to, regs->p3_0);
+       membuf_store(&to, regs->preds);
        membuf_store(&to, regs->gp);
        membuf_store(&to, regs->ugp);
        membuf_store(&to, pt_elr(regs)); // pc
index 54be529..a64641e 100644
@@ -2,4 +2,5 @@
 #
 # Makefile for hexagon-specific library files.
 #
-obj-y = checksum.o io.o memcpy.o memset.o
+obj-y = checksum.o io.o memcpy.o memset.o memcpy_likely_aligned.o \
+         divsi3.o modsi3.o udivsi3.o  umodsi3.o
diff --git a/arch/hexagon/lib/divsi3.S b/arch/hexagon/lib/divsi3.S
new file mode 100644
index 0000000..783e094
--- /dev/null
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_divsi3)
+        {
+                p0 = cmp.gt(r0,#-1)
+                p1 = cmp.gt(r1,#-1)
+                r3:2 = vabsw(r1:0)
+        }
+        {
+                p3 = xor(p0,p1)
+                r4 = sub(r2,r3)
+                r6 = cl0(r2)
+                p0 = cmp.gtu(r3,r2)
+        }
+        {
+                r0 = mux(p3,#-1,#1)
+                r7 = cl0(r3)
+                p1 = cmp.gtu(r3,r4)
+        }
+        {
+                r0 = mux(p0,#0,r0)
+                p0 = or(p0,p1)
+                if (p0.new) jumpr:nt r31
+                r6 = sub(r7,r6)
+        }
+        {
+                r7 = r6
+                r5:4 = combine(#1,r3)
+                r6 = add(#1,lsr(r6,#1))
+                p0 = cmp.gtu(r6,#4)
+        }
+        {
+                r5:4 = vaslw(r5:4,r7)
+                if (!p0) r6 = #3
+        }
+        {
+                loop0(1f,r6)
+                r7:6 = vlsrw(r5:4,#1)
+                r1:0 = #0
+        }
+        .falign
+1:
+        {
+                r5:4 = vlsrw(r5:4,#2)
+                if (!p0.new) r0 = add(r0,r5)
+                if (!p0.new) r2 = sub(r2,r4)
+                p0 = cmp.gtu(r4,r2)
+        }
+        {
+                r7:6 = vlsrw(r7:6,#2)
+                if (!p0.new) r0 = add(r0,r7)
+                if (!p0.new) r2 = sub(r2,r6)
+                p0 = cmp.gtu(r6,r2)
+        }:endloop0
+        {
+                if (!p0) r0 = add(r0,r7)
+        }
+        {
+                if (p3) r0 = sub(r1,r0)
+                jumpr r31
+        }
+SYM_FUNC_END(__hexagon_divsi3)
diff --git a/arch/hexagon/lib/memcpy_likely_aligned.S b/arch/hexagon/lib/memcpy_likely_aligned.S
new file mode 100644
index 0000000..6a541fb
--- /dev/null
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes)
+        {
+                p0 = bitsclr(r1,#7)
+                p0 = bitsclr(r0,#7)
+                if (p0.new) r5:4 = memd(r1)
+                if (p0.new) r7:6 = memd(r1+#8)
+        }
+        {
+                if (!p0) jump:nt .Lmemcpy_call
+                if (p0) r9:8 = memd(r1+#16)
+                if (p0) r11:10 = memd(r1+#24)
+                p0 = cmp.gtu(r2,#64)
+        }
+        {
+                if (p0) jump:nt .Lmemcpy_call
+                if (!p0) memd(r0) = r5:4
+                if (!p0) memd(r0+#8) = r7:6
+                p0 = cmp.gtu(r2,#32)
+        }
+        {
+                p1 = cmp.gtu(r2,#40)
+                p2 = cmp.gtu(r2,#48)
+                if (p0) r13:12 = memd(r1+#32)
+                if (p1.new) r15:14 = memd(r1+#40)
+        }
+        {
+                memd(r0+#16) = r9:8
+                memd(r0+#24) = r11:10
+        }
+        {
+                if (p0) memd(r0+#32) = r13:12
+                if (p1) memd(r0+#40) = r15:14
+                if (!p2) jumpr:t r31
+        }
+        {
+                p0 = cmp.gtu(r2,#56)
+                r5:4 = memd(r1+#48)
+                if (p0.new) r7:6 = memd(r1+#56)
+        }
+        {
+                memd(r0+#48) = r5:4
+                if (p0) memd(r0+#56) = r7:6
+                jumpr r31
+        }
+
+.Lmemcpy_call:
+        jump memcpy
+
+SYM_FUNC_END(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes)
diff --git a/arch/hexagon/lib/modsi3.S b/arch/hexagon/lib/modsi3.S
new file mode 100644
index 0000000..9ea1c86
--- /dev/null
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_modsi3)
+        {
+                p2 = cmp.ge(r0,#0)
+                r2 = abs(r0)
+                r1 = abs(r1)
+        }
+        {
+                r3 = cl0(r2)
+                r4 = cl0(r1)
+                p0 = cmp.gtu(r1,r2)
+        }
+        {
+                r3 = sub(r4,r3)
+                if (p0) jumpr r31
+        }
+        {
+                p1 = cmp.eq(r3,#0)
+                loop0(1f,r3)
+                r0 = r2
+                r2 = lsl(r1,r3)
+        }
+        .falign
+1:
+        {
+                p0 = cmp.gtu(r2,r0)
+                if (!p0.new) r0 = sub(r0,r2)
+                r2 = lsr(r2,#1)
+                if (p1) r1 = #0
+        }:endloop0
+        {
+                p0 = cmp.gtu(r2,r0)
+                if (!p0.new) r0 = sub(r0,r1)
+                if (p2) jumpr r31
+        }
+        {
+                r0 = neg(r0)
+                jumpr r31
+        }
+SYM_FUNC_END(__hexagon_modsi3)
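__hexagon_modsi3 reduces signed modulo to the unsigned shift-subtract loop: it operates on |r0| and |r1|, then negates the remainder when the dividend was negative (tracked in p2), matching C's rule that % takes the sign of the dividend. In C, roughly, reusing the illustrative div_sketch() helper above:

int modsi3_sketch(int a, int b)
{
	unsigned int ua = a < 0 ? -(unsigned int)a : (unsigned int)a;
	unsigned int ub = b < 0 ? -(unsigned int)b : (unsigned int)b;
	unsigned int r;

	div_sketch(ua, ub, &r);			/* illustrative helper */
	return a < 0 ? -(int)r : (int)r;
}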
diff --git a/arch/hexagon/lib/udivsi3.S b/arch/hexagon/lib/udivsi3.S
new file mode 100644 (file)
index 0000000..477f27b
--- /dev/null
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_udivsi3)
+        {
+                r2 = cl0(r0)
+                r3 = cl0(r1)
+                r5:4 = combine(#1,#0)
+                p0 = cmp.gtu(r1,r0)
+        }
+        {
+                r6 = sub(r3,r2)
+                r4 = r1
+                r1:0 = combine(r0,r4)
+                if (p0) jumpr r31
+        }
+        {
+                r3:2 = vlslw(r5:4,r6)
+                loop0(1f,r6)
+        }
+        .falign
+1:
+        {
+                p0 = cmp.gtu(r2,r1)
+                if (!p0.new) r1 = sub(r1,r2)
+                if (!p0.new) r0 = add(r0,r3)
+                r3:2 = vlsrw(r3:2,#1)
+        }:endloop0
+        {
+                p0 = cmp.gtu(r2,r1)
+                if (!p0.new) r0 = add(r0,r3)
+                jumpr r31
+        }
+SYM_FUNC_END(__hexagon_udivsi3)
diff --git a/arch/hexagon/lib/umodsi3.S b/arch/hexagon/lib/umodsi3.S
new file mode 100644 (file)
index 0000000..280bf06
--- /dev/null
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_umodsi3)
+        {
+                r2 = cl0(r0)
+                r3 = cl0(r1)
+                p0 = cmp.gtu(r1,r0)
+        }
+        {
+                r2 = sub(r3,r2)
+                if (p0) jumpr r31
+        }
+        {
+                loop0(1f,r2)
+                p1 = cmp.eq(r2,#0)
+                r2 = lsl(r1,r2)
+        }
+        .falign
+1:
+        {
+                p0 = cmp.gtu(r2,r0)
+                if (!p0.new) r0 = sub(r0,r2)
+                r2 = lsr(r2,#1)
+                if (p1) r1 = #0
+        }:endloop0
+        {
+                p0 = cmp.gtu(r2,r0)
+                if (!p0.new) r0 = sub(r0,r1)
+                jumpr r31
+        }
+SYM_FUNC_END(__hexagon_umodsi3)
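Hexagon has no integer divide instruction, so the compiler lowers 32-bit / and % into calls to these libgcc-style helpers, and the kernel must supply its own copies. Ordinary code like the following would compile down to a __hexagon_udivsi3 call on such a build (the function below is a made-up example):

unsigned int pages_needed(unsigned int bytes, unsigned int page_size)
{
	return (bytes + page_size - 1) / page_size;	/* -> __hexagon_udivsi3 */
}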
index 81e2b89..279252e 100644 (file)
@@ -13,6 +13,8 @@ config IA64
        select ARCH_MIGHT_HAVE_PC_SERIO
        select ACPI
        select ACPI_NUMA if NUMA
+       select ARCH_ENABLE_MEMORY_HOTPLUG
+       select ARCH_ENABLE_MEMORY_HOTREMOVE
        select ARCH_SUPPORTS_ACPI
        select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
        select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
@@ -32,6 +34,7 @@ config IA64
        select TTY
        select HAVE_ARCH_TRACEHOOK
        select HAVE_VIRT_CPU_ACCOUNTING
+       select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE
        select VIRT_TO_BUS
        select GENERIC_IRQ_PROBE
        select GENERIC_PENDING_IRQ if SMP
@@ -82,11 +85,6 @@ config STACKTRACE_SUPPORT
 config GENERIC_LOCKBREAK
        def_bool n
 
-config HUGETLB_PAGE_SIZE_VARIABLE
-       bool
-       depends on HUGETLB_PAGE
-       default y
-
 config GENERIC_CALIBRATE_DELAY
        bool
        default y
@@ -250,12 +248,6 @@ config HOTPLUG_CPU
          can be controlled through /sys/devices/system/cpu/cpu#.
          Say N if you want to disable CPU hotplug.
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-       def_bool y
-
 config SCHED_SMT
        bool "SMT scheduler support"
        depends on SMP
index 3d666a1..6d93b92 100644 (file)
@@ -277,7 +277,6 @@ extern void memset_io(volatile void __iomem *s, int c, long n);
 #define memcpy_fromio memcpy_fromio
 #define memcpy_toio memcpy_toio
 #define memset_io memset_io
-#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr
 #define xlate_dev_mem_ptr xlate_dev_mem_ptr
 #include <asm-generic/io.h>
 #undef PCI_IOBASE
index 179243c..e19d2dc 100644 (file)
@@ -272,22 +272,4 @@ xlate_dev_mem_ptr(phys_addr_t p)
        return ptr;
 }
 
-/*
- * Convert a virtual cached kernel memory pointer to an uncached pointer
- */
-static __inline__ void *
-xlate_dev_kmem_ptr(void *p)
-{
-       struct page *page;
-       void *ptr;
-
-       page = virt_to_page((unsigned long)p);
-       if (PageUncached(page))
-               ptr = (void *)__pa(p) + __IA64_UNCACHED_OFFSET;
-       else
-               ptr = p;
-
-       return ptr;
-}
-
 #endif /* _ASM_IA64_UACCESS_H */
index b331f94..f993cb3 100644 (file)
@@ -25,7 +25,8 @@ unsigned int hpage_shift = HPAGE_SHIFT_DEFAULT;
 EXPORT_SYMBOL(hpage_shift);
 
 pte_t *
-huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+              unsigned long addr, unsigned long sz)
 {
        unsigned long taddr = htlbpage_to_page(addr);
        pgd_t *pgd;
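This hunk is one instance of a tree-wide prototype change: huge_pte_alloc() now receives the VMA so the hugetlb core can make per-mapping decisions (such as whether to share PMDs) rather than seeing only the mm. A hypothetical caller updated for the new signature, sketch only:

/* All names other than huge_pte_alloc() are hypothetical. */
static pte_t *alloc_huge_mapping(struct vm_area_struct *vma,
				 unsigned long addr, unsigned long sz)
{
	return huge_pte_alloc(vma->vm_mm, vma, addr, sz);
}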
index ea14f20..82620f1 100644 (file)
@@ -16,7 +16,7 @@
 
 KBUILD_DEFCONFIG := multi_defconfig
 
-ifneq ($(SUBARCH),$(ARCH))
+ifdef cross_compiling
        ifeq ($(CROSS_COMPILE),)
                CROSS_COMPILE := $(call cc-cross-prefix, \
                        m68k-linux-gnu- m68k-linux- m68k-unknown-linux-gnu-)
index 1068670..7e44d0e 100644 (file)
@@ -317,10 +317,3 @@ int atari_tt_hwclk( int op, struct rtc_time *t )
 
     return( 0 );
 }
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
index 3a84f24..6d9ed21 100644 (file)
@@ -60,7 +60,6 @@ CONFIG_DM9000=y
 # CONFIG_VT is not set
 # CONFIG_UNIX98_PTYS is not set
 # CONFIG_DEVMEM is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_MCF=y
 CONFIG_SERIAL_MCF_BAUDRATE=115200
 CONFIG_SERIAL_MCF_CONSOLE=y
index 10133a9..7b41409 100644 (file)
@@ -440,8 +440,6 @@ static inline unsigned long ffz(unsigned long word)
 
 #endif
 
-#include <asm-generic/bitops/find.h>
-
 #ifdef __KERNEL__
 
 #if defined(CONFIG_CPU_HAS_NO_BITFIELDS)
@@ -525,10 +523,12 @@ static inline int __fls(int x)
 #define __clear_bit_unlock     clear_bit_unlock
 
 #include <asm-generic/bitops/ext2-atomic.h>
-#include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/le.h>
 #endif /* __KERNEL__ */
 
+#include <asm-generic/bitops/find.h>
+
 #endif /* _M68K_BITOPS_H */
index 819f611..d41fa48 100644 (file)
@@ -397,11 +397,6 @@ static inline void isa_delay(void)
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #define readb_relaxed(addr)    readb(addr)
 #define readw_relaxed(addr)    readw(addr)
 #define readl_relaxed(addr)    readl(addr)
index 49a3c9c..ed51970 100644 (file)
@@ -19,6 +19,7 @@ config MIPS
        select ARCH_USE_MEMTEST
        select ARCH_USE_QUEUED_RWLOCKS
        select ARCH_USE_QUEUED_SPINLOCKS
+       select ARCH_SUPPORTS_HUGETLBFS if CPU_SUPPORTS_HUGEPAGES
        select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
        select ARCH_WANT_IPC_PARSE_VERSION
        select ARCH_WANT_LD_ORPHAN_WARN
@@ -1287,11 +1288,6 @@ config SYS_SUPPORTS_BIG_ENDIAN
 config SYS_SUPPORTS_LITTLE_ENDIAN
        bool
 
-config SYS_SUPPORTS_HUGETLBFS
-       bool
-       depends on CPU_SUPPORTS_HUGEPAGES
-       default y
-
 config MIPS_HUGE_TLB_SUPPORT
        def_bool HUGETLB_PAGE || TRANSPARENT_HUGEPAGE
 
index e71d587..258234c 100644 (file)
@@ -50,7 +50,7 @@ tool-archpref         = $(64bit-tool-archpref)
 UTS_MACHINE            := mips64
 endif
 
-ifneq ($(SUBARCH),$(ARCH))
+ifdef cross_compiling
   ifeq ($(CROSS_COMPILE),)
     CROSS_COMPILE := $(call cc-cross-prefix, $(tool-archpref)-linux-  $(tool-archpref)-linux-gnu-  $(tool-archpref)-unknown-linux-gnu-)
   endif
index 2c13845..6f5c86d 100644 (file)
@@ -564,11 +564,6 @@ extern void (*_dma_cache_inv)(unsigned long start, unsigned long size);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 void __ioread64_copy(void *to, const void __iomem *from, size_t count);
 
 #endif /* _ASM_IO_H */
index b9f76f4..7eaff5b 100644 (file)
@@ -21,8 +21,8 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
-                     unsigned long sz)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
        p4d_t *p4d;
index 75f2da3..6e1e004 100644 (file)
@@ -43,7 +43,6 @@ CONFIG_MICREL_PHY=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_OF_PLATFORM=y
index afc3b8d..bde9907 100644 (file)
@@ -12,6 +12,7 @@ config PARISC
        select ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_HAS_UBSAN_SANITIZE_ALL
        select ARCH_NO_SG_CHAIN
+       select ARCH_SUPPORTS_HUGETLBFS if PA20
        select ARCH_SUPPORTS_MEMORY_FAILURE
        select DMA_OPS
        select RTC_CLASS
@@ -138,10 +139,6 @@ config PGTABLE_LEVELS
        default 3 if 64BIT && PARISC_PAGE_SIZE_4KB
        default 2
 
-config SYS_SUPPORTS_HUGETLBFS
-       def_bool y if PA20
-
-
 menu "Processor type and features"
 
 choice
index 7d9f71a..aed8ea2 100644 (file)
@@ -41,7 +41,7 @@ endif
 
 export LD_BFD
 
-ifneq ($(SUBARCH),$(UTS_MACHINE))
+ifdef cross_compiling
        ifeq ($(CROSS_COMPILE),)
                CC_SUFFIXES = linux linux-gnu unknown-linux-gnu
                CROSS_COMPILE := $(call cc-cross-prefix, \
index 8a11b8c..0b52591 100644 (file)
@@ -316,11 +316,6 @@ extern void iowrite64be(u64 val, void __iomem *addr);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 extern int devmem_is_allowed(unsigned long pfn);
 
 #endif
index ae3e108..d6d82f5 100644 (file)
@@ -365,4 +365,3 @@ void parisc_pdc_chassis_init(void);
                                         PDC_CHASSIS_EOM_SET            )
 
 #endif /* _PARISC_PDC_CHASSIS_H */
-/* vim: set ts=8 */
index 43652de..d1d3990 100644 (file)
@@ -44,7 +44,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 }
 
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
index 1e6230b..088dd2a 100644 (file)
@@ -118,28 +118,31 @@ config PPC
        # Please keep this list sorted alphabetically.
        #
        select ARCH_32BIT_OFF_T if PPC32
+       select ARCH_ENABLE_MEMORY_HOTPLUG
+       select ARCH_ENABLE_MEMORY_HOTREMOVE
+       select ARCH_HAS_COPY_MC                 if PPC64
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
+       select ARCH_HAS_DMA_MAP_DIRECT          if PPC_PSERIES
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
-       select ARCH_HAS_KCOV
        select ARCH_HAS_HUGEPD                  if HUGETLB_PAGE
+       select ARCH_HAS_KCOV
+       select ARCH_HAS_MEMBARRIER_CALLBACKS
+       select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_MEMREMAP_COMPAT_ALIGN
        select ARCH_HAS_MMIOWB                  if PPC64
+       select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        select ARCH_HAS_PHYS_TO_DMA
        select ARCH_HAS_PMEM_API
-       select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        select ARCH_HAS_PTE_DEVMAP              if PPC_BOOK3S_64
        select ARCH_HAS_PTE_SPECIAL
-       select ARCH_HAS_MEMBARRIER_CALLBACKS
-       select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_SCALED_CPUTIME          if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
        select ARCH_HAS_STRICT_KERNEL_RWX       if ((PPC_BOOK3S_64 || PPC32) && !HIBERNATION)
        select ARCH_HAS_TICK_BROADCAST          if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_HAS_UACCESS_FLUSHCACHE
-       select ARCH_HAS_COPY_MC                 if PPC64
        select ARCH_HAS_UBSAN_SANITIZE_ALL
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
        select ARCH_KEEP_MEMBLOCK
@@ -162,9 +165,8 @@ config PPC
        select BUILDTIME_TABLE_SORT
        select CLONE_BACKWARDS
        select DCACHE_WORD_ACCESS               if PPC64 && CPU_LITTLE_ENDIAN
-       select DMA_OPS                          if PPC64
        select DMA_OPS_BYPASS                   if PPC64
-       select ARCH_HAS_DMA_MAP_DIRECT          if PPC64 && PPC_PSERIES
+       select DMA_OPS                          if PPC64
        select DYNAMIC_FTRACE                   if FUNCTION_TRACER
        select EDAC_ATOMIC_SCRUB
        select EDAC_SUPPORT
@@ -184,23 +186,22 @@ config PPC
        select GENERIC_TIME_VSYSCALL
        select GENERIC_VDSO_TIME_NS
        select HAVE_ARCH_AUDITSYSCALL
+       select HAVE_ARCH_HUGE_VMALLOC           if HAVE_ARCH_HUGE_VMAP
        select HAVE_ARCH_HUGE_VMAP              if PPC_BOOK3S_64 && PPC_RADIX_MMU
        select HAVE_ARCH_JUMP_LABEL
        select HAVE_ARCH_JUMP_LABEL_RELATIVE
        select HAVE_ARCH_KASAN                  if PPC32 && PPC_PAGE_SHIFT <= 14
        select HAVE_ARCH_KASAN_VMALLOC          if PPC32 && PPC_PAGE_SHIFT <= 14
-       select HAVE_ARCH_KGDB
        select HAVE_ARCH_KFENCE                 if PPC32
+       select HAVE_ARCH_KGDB
        select HAVE_ARCH_MMAP_RND_BITS
        select HAVE_ARCH_MMAP_RND_COMPAT_BITS   if COMPAT
        select HAVE_ARCH_NVRAM_OPS
        select HAVE_ARCH_SECCOMP_FILTER
        select HAVE_ARCH_TRACEHOOK
        select HAVE_ASM_MODVERSIONS
-       select HAVE_C_RECORDMCOUNT
-       select HAVE_STACKPROTECTOR              if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
-       select HAVE_STACKPROTECTOR              if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
        select HAVE_CONTEXT_TRACKING            if PPC64
+       select HAVE_C_RECORDMCOUNT
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DEBUG_STACKOVERFLOW
        select HAVE_DYNAMIC_FTRACE
@@ -214,10 +215,13 @@ config PPC
        select HAVE_FUNCTION_TRACER
        select HAVE_GCC_PLUGINS                 if GCC_VERSION >= 50200   # plugin support on gcc <= 5.1 is buggy on PPC
        select HAVE_GENERIC_VDSO
+       select HAVE_HARDLOCKUP_DETECTOR_ARCH    if PPC_BOOK3S_64 && SMP
+       select HAVE_HARDLOCKUP_DETECTOR_PERF    if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
        select HAVE_HW_BREAKPOINT               if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
        select HAVE_IDE
        select HAVE_IOREMAP_PROT
        select HAVE_IRQ_EXIT_ON_IRQ_STACK
+       select HAVE_IRQ_TIME_ACCOUNTING
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_LZMA                 if DEFAULT_UIMAGE
        select HAVE_KERNEL_LZO                  if DEFAULT_UIMAGE
@@ -229,25 +233,25 @@ config PPC
        select HAVE_LIVEPATCH                   if HAVE_DYNAMIC_FTRACE_WITH_REGS
        select HAVE_MOD_ARCH_SPECIFIC
        select HAVE_NMI                         if PERF_EVENTS || (PPC64 && PPC_BOOK3S)
-       select HAVE_HARDLOCKUP_DETECTOR_ARCH    if PPC64 && PPC_BOOK3S && SMP
        select HAVE_OPTPROBES
        select HAVE_PERF_EVENTS
        select HAVE_PERF_EVENTS_NMI             if PPC64
-       select HAVE_HARDLOCKUP_DETECTOR_PERF    if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
        select HAVE_PERF_REGS
        select HAVE_PERF_USER_STACK_DUMP
-       select MMU_GATHER_RCU_TABLE_FREE
-       select MMU_GATHER_PAGE_SIZE
        select HAVE_REGS_AND_STACK_ACCESS_API
        select HAVE_RELIABLE_STACKTRACE
+       select HAVE_RSEQ
        select HAVE_SOFTIRQ_ON_OWN_STACK
+       select HAVE_STACKPROTECTOR              if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
+       select HAVE_STACKPROTECTOR              if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_VIRT_CPU_ACCOUNTING
-       select HAVE_IRQ_TIME_ACCOUNTING
-       select HAVE_RSEQ
+       select HUGETLB_PAGE_SIZE_VARIABLE       if PPC_BOOK3S_64 && HUGETLB_PAGE
        select IOMMU_HELPER                     if PPC64
        select IRQ_DOMAIN
        select IRQ_FORCED_THREADING
+       select MMU_GATHER_PAGE_SIZE
+       select MMU_GATHER_RCU_TABLE_FREE
        select MODULES_USE_ELF_RELA
        select NEED_DMA_MAP_STATE               if PPC64 || NOT_COHERENT_CACHE
        select NEED_SG_DMA_LENGTH
@@ -420,11 +424,6 @@ config HIGHMEM
 
 source "kernel/Kconfig.hz"
 
-config HUGETLB_PAGE_SIZE_VARIABLE
-       bool
-       depends on HUGETLB_PAGE && PPC_BOOK3S_64
-       default y
-
 config MATH_EMULATION
        bool "Math emulation"
        depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE
@@ -520,12 +519,6 @@ config ARCH_CPU_PROBE_RELEASE
        def_bool y
        depends on HOTPLUG_CPU
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-       def_bool y
-
 config PPC64_SUPPORTS_MEMORY_FAILURE
        bool "Add support for memory hwpoison"
        depends on PPC_BOOK3S_64
@@ -705,9 +698,6 @@ config ARCH_SPARSEMEM_DEFAULT
        def_bool y
        depends on PPC_BOOK3S_64
 
-config SYS_SUPPORTS_HUGETLBFS
-       bool
-
 config ILLEGAL_POINTER_VALUE
        hex
        # This is roughly half way between the top of user space and the bottom
index 41fa0a8..cdb796b 100755 (executable)
@@ -191,7 +191,7 @@ if [ -z "$kernel" ]; then
     kernel=vmlinux
 fi
 
-LANG=C elfformat="`${CROSS}objdump -p "$kernel" | grep 'file format' | awk '{print $4}'`"
+LC_ALL=C elfformat="`${CROSS}objdump -p "$kernel" | grep 'file format' | awk '{print $4}'`"
 case "$elfformat" in
     elf64-powerpcle)   format=elf64lppc        ;;
     elf64-powerpc)     format=elf32ppc ;;
index 273edd2..f130783 100644 (file)
@@ -663,11 +663,6 @@ static inline void name at                                 \
 #define xlate_dev_mem_ptr(p)   __va(p)
 
 /*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
-/*
  * We don't do relaxed operations yet, at least not with this semantic
  */
 #define readb_relaxed(addr)    readb(addr)
index a6e9a55..e6b53c6 100644 (file)
@@ -210,7 +210,7 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
                                      unsigned int lpid);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
-extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+extern void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                            unsigned long gfn);
 extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                          unsigned long gfn);
index c761572..6ea9001 100644 (file)
@@ -28,9 +28,6 @@ extern struct device_node *opal_node;
 
 /* API functions */
 int64_t opal_invalid_call(void);
-int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf);
-int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr,
-                       uint64_t bdf);
 int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
                        uint64_t lpcr);
 int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t bdfn,
index d2a2a14..74424c1 100644 (file)
@@ -126,7 +126,6 @@ struct pci_controller {
 #endif /* CONFIG_PPC64 */
 
        void *private_data;
-       struct npu *npu;
 };
 
 /* These are used for config access before all the PCI probing
index 6436f0b..d1f5326 100644 (file)
@@ -119,11 +119,4 @@ extern void pcibios_scan_phb(struct pci_controller *hose);
 
 #endif /* __KERNEL__ */
 
-extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev);
-extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
-extern int pnv_npu2_init(struct pci_controller *hose);
-extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
-               unsigned long msr);
-extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
-
 #endif /* __ASM_POWERPC_PCI_H */
index fab8402..3f35c8d 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/moduleloader.h>
 #include <linux/err.h>
 #include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include <linux/bug.h>
 #include <asm/module.h>
 #include <linux/uaccess.h>
@@ -88,17 +89,22 @@ int module_finalize(const Elf_Ehdr *hdr,
        return 0;
 }
 
-#ifdef MODULES_VADDR
 static __always_inline void *
 __module_alloc(unsigned long size, unsigned long start, unsigned long end)
 {
-       return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
-                                   PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
-                                   __builtin_return_address(0));
+       /*
+        * Don't do huge page allocations for modules yet until more testing
+        * is done. STRICT_MODULE_RWX may require extra work to support this
+        * too.
+        */
+       return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL, PAGE_KERNEL_EXEC,
+                                   VM_FLUSH_RESET_PERMS | VM_NO_HUGE_VMAP,
+                                   NUMA_NO_NODE, __builtin_return_address(0));
 }
 
 void *module_alloc(unsigned long size)
 {
+#ifdef MODULES_VADDR
        unsigned long limit = (unsigned long)_etext - SZ_32M;
        void *ptr = NULL;
 
@@ -112,5 +118,7 @@ void *module_alloc(unsigned long size)
                ptr = __module_alloc(size, MODULES_VADDR, MODULES_END);
 
        return ptr;
-}
+#else
+       return __module_alloc(size, VMALLOC_START, VMALLOC_END);
 #endif
+}
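With HAVE_ARCH_HUGE_VMALLOC now selectable (see the Kconfig hunk earlier in this series), vmalloc may back large allocations with huge pages; module text opts out via VM_NO_HUGE_VMAP until the STRICT_MODULE_RWX interaction is resolved, per the comment. A minimal sketch of an allocation that stays on base pages under the same assumptions:

static void *alloc_exec_nohuge(unsigned long size)
{
	/* VM_NO_HUGE_VMAP forces base-page mappings for this range. */
	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
				    GFP_KERNEL, PAGE_KERNEL_EXEC,
				    VM_FLUSH_RESET_PERMS | VM_NO_HUGE_VMAP,
				    NUMA_NO_NODE,
				    __builtin_return_address(0));
}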
index f9eb49e..5056e17 100644 (file)
@@ -951,6 +951,93 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image)
 }
 
 /**
+ * add_node_props - Read properties from a device node structure and add
+ *                  them to the fdt.
+ * @fdt:            Flattened device tree of the kernel
+ * @node_offset:    offset of the node to add a property at
+ * @dn:             device node pointer
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int add_node_props(void *fdt, int node_offset, const struct device_node *dn)
+{
+       int ret = 0;
+       struct property *pp;
+
+       if (!dn)
+               return -EINVAL;
+
+       for_each_property_of_node(dn, pp) {
+               ret = fdt_setprop(fdt, node_offset, pp->name, pp->value, pp->length);
+               if (ret < 0) {
+                       pr_err("Unable to add %s property: %s\n", pp->name, fdt_strerror(ret));
+                       return ret;
+               }
+       }
+       return ret;
+}
+
+/**
+ * update_cpus_node - Update the cpus node of the flattened device tree
+ *                    using the of_root device node.
+ * @fdt:              Flattened device tree of the kernel.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int update_cpus_node(void *fdt)
+{
+       struct device_node *cpus_node, *dn;
+       int cpus_offset, cpus_subnode_offset, ret = 0;
+
+       cpus_offset = fdt_path_offset(fdt, "/cpus");
+       if (cpus_offset < 0 && cpus_offset != -FDT_ERR_NOTFOUND) {
+               pr_err("Malformed device tree: error reading /cpus node: %s\n",
+                      fdt_strerror(cpus_offset));
+               return cpus_offset;
+       }
+
+       if (cpus_offset > 0) {
+               ret = fdt_del_node(fdt, cpus_offset);
+               if (ret < 0) {
+                       pr_err("Error deleting /cpus node: %s\n", fdt_strerror(ret));
+                       return -EINVAL;
+               }
+       }
+
+       /* Add cpus node to fdt */
+       cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), "cpus");
+       if (cpus_offset < 0) {
+               pr_err("Error creating /cpus node: %s\n", fdt_strerror(cpus_offset));
+               return -EINVAL;
+       }
+
+       /* Add cpus node properties */
+       cpus_node = of_find_node_by_path("/cpus");
+       ret = add_node_props(fdt, cpus_offset, cpus_node);
+       of_node_put(cpus_node);
+       if (ret < 0)
+               return ret;
+
+       /* Loop through all subnodes of cpus and add them to fdt */
+       for_each_node_by_type(dn, "cpu") {
+               cpus_subnode_offset = fdt_add_subnode(fdt, cpus_offset, dn->full_name);
+               if (cpus_subnode_offset < 0) {
+                       pr_err("Unable to add %s subnode: %s\n", dn->full_name,
+                              fdt_strerror(cpus_subnode_offset));
+                       ret = cpus_subnode_offset;
+                       goto out;
+               }
+
+               ret = add_node_props(fdt, cpus_subnode_offset, dn);
+               if (ret < 0)
+                       goto out;
+       }
+out:
+       of_node_put(dn);
+       return ret;
+}
+
+/**
  * setup_new_fdt_ppc64 - Update the flattened device-tree of the kernel
  *                       being loaded.
  * @image:               kexec image being loaded.
@@ -1006,6 +1093,11 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
                }
        }
 
+       /* Update the cpus node to account for hotplug CPUs. */
+       ret = update_cpus_node(fdt);
+       if (ret < 0)
+               goto out;
+
        /* Update memory reserve map */
        ret = get_reserved_memory_ranges(&rmem);
        if (ret)
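One subtlety in update_cpus_node() above: for_each_node_by_type() hands out each node with a reference held and leaves dn NULL when the loop runs to completion, so the of_node_put(dn) at the common exit only matters on the early-goto paths (of_node_put(NULL) is a harmless no-op). The pattern in isolation, with a hypothetical per-node helper:

struct device_node *dn;
int err = 0;

for_each_node_by_type(dn, "cpu") {
	err = handle_cpu_node(dn);	/* hypothetical */
	if (err < 0)
		break;			/* dn still holds a reference */
}
of_node_put(dn);			/* no-op if the loop completed */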
index b7bd9ca..2d9193c 100644 (file)
@@ -795,7 +795,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
        }
 }
 
-static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
                            unsigned long gfn)
 {
        unsigned long i;
@@ -829,15 +829,21 @@ static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
                unlock_rmap(rmapp);
                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        }
-       return false;
 }
 
 bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       if (kvm_is_radix(kvm))
-               return kvm_unmap_radix(kvm, range->slot, range->start);
+       gfn_t gfn;
+
+       if (kvm_is_radix(kvm)) {
+               for (gfn = range->start; gfn < range->end; gfn++)
+                       kvm_unmap_radix(kvm, range->slot, gfn);
+       } else {
+               for (gfn = range->start; gfn < range->end; gfn++)
+                       kvm_unmap_rmapp(kvm, range->slot, gfn);
+       }
 
-       return kvm_unmap_rmapp(kvm, range->slot, range->start);
+       return false;
 }
 
 void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
@@ -924,10 +930,18 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       if (kvm_is_radix(kvm))
-               kvm_age_radix(kvm, range->slot, range->start);
+       gfn_t gfn;
+       bool ret = false;
 
-       return kvm_age_rmapp(kvm, range->slot, range->start);
+       if (kvm_is_radix(kvm)) {
+               for (gfn = range->start; gfn < range->end; gfn++)
+                       ret |= kvm_age_radix(kvm, range->slot, gfn);
+       } else {
+               for (gfn = range->start; gfn < range->end; gfn++)
+                       ret |= kvm_age_rmapp(kvm, range->slot, gfn);
+       }
+
+       return ret;
 }
 
 static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
@@ -965,18 +979,24 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       if (kvm_is_radix(kvm))
-               kvm_test_age_radix(kvm, range->slot, range->start);
+       WARN_ON(range->start + 1 != range->end);
 
-       return kvm_test_age_rmapp(kvm, range->slot, range->start);
+       if (kvm_is_radix(kvm))
+               return kvm_test_age_radix(kvm, range->slot, range->start);
+       else
+               return kvm_test_age_rmapp(kvm, range->slot, range->start);
 }
 
 bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
+       WARN_ON(range->start + 1 != range->end);
+
        if (kvm_is_radix(kvm))
-               return kvm_unmap_radix(kvm, range->slot, range->start);
+               kvm_unmap_radix(kvm, range->slot, range->start);
+       else
+               kvm_unmap_rmapp(kvm, range->slot, range->start);
 
-       return kvm_unmap_rmapp(kvm, range->slot, range->start);
+       return false;
 }
 
 static int vcpus_running(struct kvm *kvm)
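These handlers were just converted to the gfn-range MMU notifier interface: each hook receives a half-open [start, end) gfn range within a single memslot and returns whether the caller must flush TLBs. The general shape, sketched with a hypothetical helper:

bool sketch_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	gfn_t gfn;
	bool flush = false;

	for (gfn = range->start; gfn < range->end; gfn++)
		flush |= unmap_one_gfn(kvm, range->slot, gfn);	/* hypothetical */

	return flush;
}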
index ec4f58f..d909c06 100644 (file)
@@ -993,7 +993,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
 }
 
 /* Called with kvm->mmu_lock held */
-bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                     unsigned long gfn)
 {
        pte_t *ptep;
@@ -1002,14 +1002,13 @@ bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
                uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
-               return false;
+               return;
        }
 
        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep))
                kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
                                 kvm->arch.lpid);
-       return false;
 }
 
 /* Called with kvm->mmu_lock held */
index f2c690e..cc1a8a0 100644 (file)
@@ -5,6 +5,9 @@
 
 ccflags-$(CONFIG_PPC64)        := $(NO_MINIMAL_TOC)
 
+CFLAGS_code-patching.o += -fno-stack-protector
+CFLAGS_feature-fixups.o += -fno-stack-protector
+
 CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
 
index d142b76..9a75ba0 100644 (file)
@@ -106,7 +106,8 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
  * At this point we do the placement change only for BOOK3S 64. This would
  * possibly work on other subarchs.
  */
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, unsigned long sz)
 {
        pgd_t *pg;
        p4d_t *p4;
index e4b0566..f998e65 100644 (file)
@@ -40,8 +40,8 @@ config PPC_85xx
 
 config PPC_8xx
        bool "Freescale 8xx"
+       select ARCH_SUPPORTS_HUGETLBFS
        select FSL_SOC
-       select SYS_SUPPORTS_HUGETLBFS
        select PPC_HAVE_KUEP
        select PPC_HAVE_KUAP
        select HAVE_ARCH_VMAP_STACK
@@ -95,9 +95,11 @@ config PPC_BOOK3S_64
        bool "Server processors"
        select PPC_FPU
        select PPC_HAVE_PMU_SUPPORT
-       select SYS_SUPPORTS_HUGETLBFS
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+       select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
+       select ARCH_ENABLE_PMD_SPLIT_PTLOCK
        select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
+       select ARCH_SUPPORTS_HUGETLBFS
        select ARCH_SUPPORTS_NUMA_BALANCING
        select IRQ_WORK
        select PPC_MM_SLICES
@@ -280,9 +282,9 @@ config FSL_BOOKE
 # this is for common code between PPC32 & PPC64 FSL BOOKE
 config PPC_FSL_BOOK3E
        bool
+       select ARCH_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64
        select FSL_EMB_PERFMON
        select PPC_SMP_MUXED_IPI
-       select SYS_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64
        select PPC_DOORBELL
        default y if FSL_BOOKE
 
@@ -358,10 +360,6 @@ config SPE
 
          If in doubt, say Y here.
 
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-       def_bool y
-       depends on PPC_BOOK3S_64
-
 config PPC_RADIX_MMU
        bool "Radix MMU Support"
        depends on PPC_BOOK3S_64
@@ -421,10 +419,6 @@ config PPC_PKEY
        depends on PPC_BOOK3S_64
        depends on PPC_MEM_KEYS || PPC_KUAP || PPC_KUEP
 
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
-       def_bool y
-       depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION
-
 
 config PPC_MMU_NOHASH
        def_bool y
index 2eb6ae1..be2546b 100644 (file)
@@ -10,7 +10,7 @@ obj-$(CONFIG_SMP)     += smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_FA_DUMP)  += opal-fadump.o
 obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o
 obj-$(CONFIG_OPAL_CORE)        += opal-core.o
-obj-$(CONFIG_PCI)      += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
+obj-$(CONFIG_PCI)      += pci.o pci-ioda.o pci-ioda-tce.o
 obj-$(CONFIG_PCI_IOV)   += pci-sriov.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH)      += eeh-powernv.o
index 71c1262..537a4da 100644 (file)
@@ -104,8 +104,8 @@ static void memtrace_clear_range(unsigned long start_pfn,
         * Before we go ahead and use this range as cache inhibited range
         * flush the cache.
         */
-       flush_dcache_range_chunked(PFN_PHYS(start_pfn),
-                                  PFN_PHYS(start_pfn + nr_pages),
+       flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
+                                  (unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
                                   FLUSH_CHUNK_SIZE);
 }
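The memtrace fix above matters because flush_dcache_range_chunked() takes kernel virtual addresses, whereas PFN_PHYS() produces physical ones. pfn_to_kaddr() is essentially the linear-map conversion; roughly:

static inline void *pfn_to_kaddr_sketch(unsigned long pfn)
{
	return __va(pfn << PAGE_SHIFT);	/* physical -> linear-map virtual */
}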
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
deleted file mode 100644 (file)
index b711dc3..0000000
+++ /dev/null
@@ -1,705 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * This file implements the DMA operations for NVLink devices. The NPU
- * devices all point to the same iommu table as the parent PCI device.
- *
- * Copyright Alistair Popple, IBM Corporation 2015.
- */
-
-#include <linux/mmu_notifier.h>
-#include <linux/mmu_context.h>
-#include <linux/of.h>
-#include <linux/pci.h>
-#include <linux/memblock.h>
-#include <linux/sizes.h>
-
-#include <asm/debugfs.h>
-#include <asm/powernv.h>
-#include <asm/ppc-pci.h>
-#include <asm/opal.h>
-
-#include "pci.h"
-
-static struct pci_dev *get_pci_dev(struct device_node *dn)
-{
-       struct pci_dn *pdn = PCI_DN(dn);
-       struct pci_dev *pdev;
-
-       pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
-                                          pdn->busno, pdn->devfn);
-
-       /*
-        * pci_get_domain_bus_and_slot() increased the reference count of
-        * the PCI device, but callers don't need that actually as the PE
-        * already holds a reference to the device. Since callers aren't
-        * aware of the reference count change, call pci_dev_put() now to
-        * avoid leaks.
-        */
-       if (pdev)
-               pci_dev_put(pdev);
-
-       return pdev;
-}
-
-/* Given a NPU device get the associated PCI device. */
-struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
-{
-       struct device_node *dn;
-       struct pci_dev *gpdev;
-
-       if (WARN_ON(!npdev))
-               return NULL;
-
-       if (WARN_ON(!npdev->dev.of_node))
-               return NULL;
-
-       /* Get assoicated PCI device */
-       dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
-       if (!dn)
-               return NULL;
-
-       gpdev = get_pci_dev(dn);
-       of_node_put(dn);
-
-       return gpdev;
-}
-EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
-
-/* Given the real PCI device get a linked NPU device. */
-struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
-{
-       struct device_node *dn;
-       struct pci_dev *npdev;
-
-       if (WARN_ON(!gpdev))
-               return NULL;
-
-       /* Not all PCI devices have device-tree nodes */
-       if (!gpdev->dev.of_node)
-               return NULL;
-
-       /* Get assoicated PCI device */
-       dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
-       if (!dn)
-               return NULL;
-
-       npdev = get_pci_dev(dn);
-       of_node_put(dn);
-
-       return npdev;
-}
-EXPORT_SYMBOL(pnv_pci_get_npu_dev);
-
-#ifdef CONFIG_IOMMU_API
-/*
- * Returns the PE assoicated with the PCI device of the given
- * NPU. Returns the linked pci device if pci_dev != NULL.
- */
-static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
-                                                 struct pci_dev **gpdev)
-{
-       struct pnv_phb *phb;
-       struct pci_controller *hose;
-       struct pci_dev *pdev;
-       struct pnv_ioda_pe *pe;
-       struct pci_dn *pdn;
-
-       pdev = pnv_pci_get_gpu_dev(npe->pdev);
-       if (!pdev)
-               return NULL;
-
-       pdn = pci_get_pdn(pdev);
-       if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-               return NULL;
-
-       hose = pci_bus_to_host(pdev->bus);
-       phb = hose->private_data;
-       pe = &phb->ioda.pe_array[pdn->pe_number];
-
-       if (gpdev)
-               *gpdev = pdev;
-
-       return pe;
-}
-
-static long pnv_npu_unset_window(struct iommu_table_group *table_group,
-               int num);
-
-static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
-               struct iommu_table *tbl)
-{
-       struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
-                       table_group);
-       struct pnv_phb *phb = npe->phb;
-       int64_t rc;
-       const unsigned long size = tbl->it_indirect_levels ?
-               tbl->it_level_size : tbl->it_size;
-       const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
-       const __u64 win_size = tbl->it_size << tbl->it_page_shift;
-       int num2 = (num == 0) ? 1 : 0;
-
-       /* NPU has just one TVE so if there is another table, remove it first */
-       if (npe->table_group.tables[num2])
-               pnv_npu_unset_window(&npe->table_group, num2);
-
-       pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
-                       start_addr, start_addr + win_size - 1,
-                       IOMMU_PAGE_SIZE(tbl));
-
-       rc = opal_pci_map_pe_dma_window(phb->opal_id,
-                       npe->pe_number,
-                       npe->pe_number,
-                       tbl->it_indirect_levels + 1,
-                       __pa(tbl->it_base),
-                       size << 3,
-                       IOMMU_PAGE_SIZE(tbl));
-       if (rc) {
-               pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
-               return rc;
-       }
-       pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
-       /* Add the table to the list so its TCE cache will get invalidated */
-       pnv_pci_link_table_and_group(phb->hose->node, num,
-                       tbl, &npe->table_group);
-
-       return 0;
-}
-
-static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
-{
-       struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
-                       table_group);
-       struct pnv_phb *phb = npe->phb;
-       int64_t rc;
-
-       if (!npe->table_group.tables[num])
-               return 0;
-
-       pe_info(npe, "Removing DMA window\n");
-
-       rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
-                       npe->pe_number,
-                       0/* levels */, 0/* table address */,
-                       0/* table size */, 0/* page size */);
-       if (rc) {
-               pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
-               return rc;
-       }
-       pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
-       pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
-                       &npe->table_group);
-
-       return 0;
-}
-
-/* Switch ownership from platform code to external user (e.g. VFIO) */
-static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
-{
-       struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
-                       table_group);
-       struct pnv_phb *phb = npe->phb;
-       int64_t rc;
-       struct pci_dev *gpdev = NULL;
-
-       /*
-        * Note: NPU has just a single TVE in the hardware which means that
-        * while used by the kernel, it can have either 32bit window or
-        * DMA bypass but never both. So we deconfigure 32bit window only
-        * if it was enabled at the moment of ownership change.
-        */
-       if (npe->table_group.tables[0]) {
-               pnv_npu_unset_window(&npe->table_group, 0);
-               return;
-       }
-
-       /* Disable bypass */
-       rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
-                       npe->pe_number, npe->pe_number,
-                       0 /* bypass base */, 0);
-       if (rc) {
-               pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
-               return;
-       }
-       pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
-
-       get_gpu_pci_dev_and_pe(npe, &gpdev);
-       if (gpdev)
-               pnv_npu2_unmap_lpar_dev(gpdev);
-}
-
-static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
-{
-       struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
-                       table_group);
-       struct pci_dev *gpdev = NULL;
-
-       get_gpu_pci_dev_and_pe(npe, &gpdev);
-       if (gpdev)
-               pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
-}
-
-static struct iommu_table_group_ops pnv_pci_npu_ops = {
-       .set_window = pnv_npu_set_window,
-       .unset_window = pnv_npu_unset_window,
-       .take_ownership = pnv_npu_take_ownership,
-       .release_ownership = pnv_npu_release_ownership,
-};
-#endif /* !CONFIG_IOMMU_API */
-
-/*
- * NPU2 ATS
- */
-/* Maximum possible number of ATSD MMIO registers per NPU */
-#define NV_NMMU_ATSD_REGS 8
-#define NV_NPU_MAX_PE_NUM      16
-
-/*
- * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
- * up to 3 x (GPU + 2xNPUs) (POWER9).
- */
-struct npu_comp {
-       struct iommu_table_group table_group;
-       int pe_num;
-       struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
-};
-
-/* An NPU descriptor, valid for POWER9 only */
-struct npu {
-       int index;
-       struct npu_comp npucomp;
-};
-
-#ifdef CONFIG_IOMMU_API
-static long pnv_npu_peers_create_table_userspace(
-               struct iommu_table_group *table_group,
-               int num, __u32 page_shift, __u64 window_size, __u32 levels,
-               struct iommu_table **ptbl)
-{
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       if (!npucomp->pe_num || !npucomp->pe[0] ||
-                       !npucomp->pe[0]->table_group.ops ||
-                       !npucomp->pe[0]->table_group.ops->create_table)
-               return -EFAULT;
-
-       return npucomp->pe[0]->table_group.ops->create_table(
-                       &npucomp->pe[0]->table_group, num, page_shift,
-                       window_size, levels, ptbl);
-}
-
-static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
-               int num, struct iommu_table *tbl)
-{
-       int i, j;
-       long ret = 0;
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       for (i = 0; i < npucomp->pe_num; ++i) {
-               struct pnv_ioda_pe *pe = npucomp->pe[i];
-
-               if (!pe->table_group.ops->set_window)
-                       continue;
-
-               ret = pe->table_group.ops->set_window(&pe->table_group,
-                               num, tbl);
-               if (ret)
-                       break;
-       }
-
-       if (ret) {
-               for (j = 0; j < i; ++j) {
-                       struct pnv_ioda_pe *pe = npucomp->pe[j];
-
-                       if (!pe->table_group.ops->unset_window)
-                               continue;
-
-                       ret = pe->table_group.ops->unset_window(
-                                       &pe->table_group, num);
-                       if (ret)
-                               break;
-               }
-       } else {
-               table_group->tables[num] = iommu_tce_table_get(tbl);
-       }
-
-       return ret;
-}
-
-static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
-               int num)
-{
-       int i, j;
-       long ret = 0;
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       for (i = 0; i < npucomp->pe_num; ++i) {
-               struct pnv_ioda_pe *pe = npucomp->pe[i];
-
-               WARN_ON(npucomp->table_group.tables[num] !=
-                               table_group->tables[num]);
-               if (!npucomp->table_group.tables[num])
-                       continue;
-
-               if (!pe->table_group.ops->unset_window)
-                       continue;
-
-               ret = pe->table_group.ops->unset_window(&pe->table_group, num);
-               if (ret)
-                       break;
-       }
-
-       if (ret) {
-               for (j = 0; j < i; ++j) {
-                       struct pnv_ioda_pe *pe = npucomp->pe[j];
-
-                       if (!npucomp->table_group.tables[num])
-                               continue;
-
-                       if (!pe->table_group.ops->set_window)
-                               continue;
-
-                       ret = pe->table_group.ops->set_window(&pe->table_group,
-                                       num, table_group->tables[num]);
-                       if (ret)
-                               break;
-               }
-       } else if (table_group->tables[num]) {
-               iommu_tce_table_put(table_group->tables[num]);
-               table_group->tables[num] = NULL;
-       }
-
-       return ret;
-}
-
-static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
-{
-       int i;
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       for (i = 0; i < npucomp->pe_num; ++i) {
-               struct pnv_ioda_pe *pe = npucomp->pe[i];
-
-               if (!pe->table_group.ops ||
-                   !pe->table_group.ops->take_ownership)
-                       continue;
-               pe->table_group.ops->take_ownership(&pe->table_group);
-       }
-}
-
-static void pnv_npu_peers_release_ownership(
-               struct iommu_table_group *table_group)
-{
-       int i;
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       for (i = 0; i < npucomp->pe_num; ++i) {
-               struct pnv_ioda_pe *pe = npucomp->pe[i];
-
-               if (!pe->table_group.ops ||
-                   !pe->table_group.ops->release_ownership)
-                       continue;
-               pe->table_group.ops->release_ownership(&pe->table_group);
-       }
-}
-
-static struct iommu_table_group_ops pnv_npu_peers_ops = {
-       .get_table_size = pnv_pci_ioda2_get_table_size,
-       .create_table = pnv_npu_peers_create_table_userspace,
-       .set_window = pnv_npu_peers_set_window,
-       .unset_window = pnv_npu_peers_unset_window,
-       .take_ownership = pnv_npu_peers_take_ownership,
-       .release_ownership = pnv_npu_peers_release_ownership,
-};
-
-static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
-               struct pnv_ioda_pe *pe)
-{
-       if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
-               return;
-
-       npucomp->pe[npucomp->pe_num] = pe;
-       ++npucomp->pe_num;
-}
-
-static struct iommu_table_group *
-       pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
-{
-       struct iommu_table_group *compound_group;
-       struct npu_comp *npucomp;
-       struct pci_dev *gpdev = NULL;
-       struct pci_controller *hose;
-       struct pci_dev *npdev = NULL;
-
-       list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
-               npdev = pnv_pci_get_npu_dev(gpdev, 0);
-               if (npdev)
-                       break;
-       }
-
-       if (!npdev)
-               /* It is not an NPU attached device, skip */
-               return NULL;
-
-       hose = pci_bus_to_host(npdev->bus);
-
-       if (hose->npu) {
-               /* P9 case: compound group is per-NPU (all gpus, all links) */
-               npucomp = &hose->npu->npucomp;
-       } else {
-               /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */
-               npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
-       }
-
-       compound_group = &npucomp->table_group;
-       if (!compound_group->group) {
-               compound_group->ops = &pnv_npu_peers_ops;
-               iommu_register_group(compound_group, hose->global_number,
-                               pe->pe_number);
-
-               /* Steal capabilities from a GPU PE */
-               compound_group->max_dynamic_windows_supported =
-                       pe->table_group.max_dynamic_windows_supported;
-               compound_group->tce32_start = pe->table_group.tce32_start;
-               compound_group->tce32_size = pe->table_group.tce32_size;
-               compound_group->max_levels = pe->table_group.max_levels;
-               if (!compound_group->pgsizes)
-                       compound_group->pgsizes = pe->table_group.pgsizes;
-       }
-
-       /*
-        * The gpu would have been added to the iommu group that's created
-        * for the PE. Pull it out now.
-        */
-       iommu_del_device(&gpdev->dev);
-
-       /*
-       * I'm not sure this is strictly required, but it's probably a good idea
-       * since the table_group for the PE is going to be attached to the
-       * compound table group. If we leave the PE's iommu group active then
-       * we might have the same table_group being modifiable via two sepeate
-       * iommu groups.
-       */
-       iommu_group_put(pe->table_group.group);
-
-       /* now put the GPU into the compound group */
-       pnv_comp_attach_table_group(npucomp, pe);
-       iommu_add_device(compound_group, &gpdev->dev);
-
-       return compound_group;
-}
-
-static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
-{
-       struct iommu_table_group *table_group;
-       struct npu_comp *npucomp;
-       struct pci_dev *gpdev = NULL;
-       struct pci_dev *npdev;
-       struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
-
-       WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
-       if (!gpe)
-               return NULL;
-
-       /*
-        * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
-        * but NPU bridges do not have this hook defined so we do it here.
-        * We do not setup other table group parameters as they won't be used
-        * anyway - NVLink bridges are subordinate PEs.
-        */
-       pe->table_group.ops = &pnv_pci_npu_ops;
-
-       table_group = iommu_group_get_iommudata(
-                       iommu_group_get(&gpdev->dev));
-
-       /*
-        * On P9 NPU PHB and PCI PHB support different page sizes,
-        * keep only matching. We expect here that NVLink bridge PE pgsizes is
-        * initialized by the caller.
-        */
-       table_group->pgsizes &= pe->table_group.pgsizes;
-       npucomp = container_of(table_group, struct npu_comp, table_group);
-       pnv_comp_attach_table_group(npucomp, pe);
-
-       list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
-               struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
-
-               if (gpdevtmp != gpdev)
-                       continue;
-
-               iommu_add_device(table_group, &npdev->dev);
-       }
-
-       return table_group;
-}
-
-void pnv_pci_npu_setup_iommu_groups(void)
-{
-       struct pci_controller *hose;
-       struct pnv_phb *phb;
-       struct pnv_ioda_pe *pe;
-
-       /*
-        * For non-nvlink devices the IOMMU group is registered when the PE is
-        * configured and devices are added to the group when the per-device
-        * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
-        * only initialise for "normal" IODA PHBs.
-        *
-        * For NVLink devices we need to ensure the NVLinks and the GPU end up
-        * in the same IOMMU group, so that's handled here.
-        */
-       list_for_each_entry(hose, &hose_list, list_node) {
-               phb = hose->private_data;
-
-               if (phb->type == PNV_PHB_IODA2)
-                       list_for_each_entry(pe, &phb->ioda.pe_list, list)
-                               pnv_try_setup_npu_table_group(pe);
-       }
-
-       /*
-        * Now we have all PHBs discovered, time to add NPU devices to
-        * the corresponding IOMMU groups.
-        */
-       list_for_each_entry(hose, &hose_list, list_node) {
-               unsigned long  pgsizes;
-
-               phb = hose->private_data;
-
-               if (phb->type != PNV_PHB_NPU_NVLINK)
-                       continue;
-
-               pgsizes = pnv_ioda_parse_tce_sizes(phb);
-               list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-                       /*
-                        * IODA2 bridges get this set up from
-                        * pci_controller_ops::setup_bridge but NPU bridges
-                        * do not have this hook defined so we do it here.
-                        */
-                       pe->table_group.pgsizes = pgsizes;
-                       pnv_npu_compound_attach(pe);
-               }
-       }
-}
-#endif /* CONFIG_IOMMU_API */
-
-int pnv_npu2_init(struct pci_controller *hose)
-{
-       static int npu_index;
-       struct npu *npu;
-       int ret;
-
-       npu = kzalloc(sizeof(*npu), GFP_KERNEL);
-       if (!npu)
-               return -ENOMEM;
-
-       npu_index++;
-       if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
-               ret = -ENOSPC;
-               goto fail_exit;
-       }
-       npu->index = npu_index;
-       hose->npu = npu;
-
-       return 0;
-
-fail_exit:
-       kfree(npu);
-       return ret;
-}
-
-int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
-               unsigned long msr)
-{
-       int ret;
-       struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
-       struct pci_controller *hose;
-       struct pnv_phb *nphb;
-
-       if (!npdev)
-               return -ENODEV;
-
-       hose = pci_bus_to_host(npdev->bus);
-       if (hose->npu == NULL) {
-               dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
-               return 0;
-       }
-
-       nphb = hose->private_data;
-
-       dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
-                       nphb->opal_id, lparid);
-       /*
-        * Currently we only support radix and non-zero LPCR only makes sense
-        * for hash tables so skiboot expects the LPCR parameter to be a zero.
-        */
-       ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid,
-                               0 /* LPCR bits */);
-       if (ret) {
-               dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
-               return ret;
-       }
-
-       dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
-                       nphb->opal_id, msr);
-       ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
-                                   pci_dev_id(gpdev));
-       if (ret < 0)
-               dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
-       else
-               ret = 0;
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
-
-void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
-{
-       struct pci_dev *gpdev;
-
-       list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
-               pnv_npu2_map_lpar_dev(gpdev, 0, msr);
-}
-
-int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
-{
-       int ret;
-       struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
-       struct pci_controller *hose;
-       struct pnv_phb *nphb;
-
-       if (!npdev)
-               return -ENODEV;
-
-       hose = pci_bus_to_host(npdev->bus);
-       if (hose->npu == NULL) {
-               dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
-               return 0;
-       }
-
-       nphb = hose->private_data;
-
-       dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
-                       nphb->opal_id);
-       ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
-                                      pci_dev_id(gpdev));
-       if (ret < 0) {
-               dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
-               return ret;
-       }
-
-       /* Set LPID to 0 anyway, just to be safe */
-       dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
-       ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/,
-                               0 /* LPCR bits */);
-       if (ret)
-               dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
index 5cd0f52..01401e3 100644 (file)
@@ -267,8 +267,6 @@ OPAL_CALL(opal_xive_get_queue_state,                OPAL_XIVE_GET_QUEUE_STATE);
 OPAL_CALL(opal_xive_set_queue_state,           OPAL_XIVE_SET_QUEUE_STATE);
 OPAL_CALL(opal_xive_get_vp_state,              OPAL_XIVE_GET_VP_STATE);
 OPAL_CALL(opal_signal_system_reset,            OPAL_SIGNAL_SYSTEM_RESET);
-OPAL_CALL(opal_npu_init_context,               OPAL_NPU_INIT_CONTEXT);
-OPAL_CALL(opal_npu_destroy_context,            OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,                   OPAL_NPU_MAP_LPAR);
 OPAL_CALL(opal_imc_counters_init,              OPAL_IMC_COUNTERS_INIT);
 OPAL_CALL(opal_imc_counters_start,             OPAL_IMC_COUNTERS_START);
index 66c3c33..7de4646 100644 (file)
@@ -47,8 +47,7 @@
 #define PNV_IODA1_M64_SEGS     8       /* Segments per M64 BAR */
 #define PNV_IODA1_DMA32_SEGSIZE        0x10000000
 
-static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
-                                             "NPU_OCAPI" };
+static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_OCAPI" };
 
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
 static void pnv_pci_configure_bus(struct pci_bus *bus);
@@ -192,8 +191,6 @@ void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
        unsigned int pe_num = pe->pe_number;
 
        WARN_ON(pe->pdev);
-       WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
-       kfree(pe->npucomp);
        memset(pe, 0, sizeof(struct pnv_ioda_pe));
 
        mutex_lock(&phb->ioda.pe_alloc_mutex);
@@ -875,7 +872,7 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
         * Release from all parents PELT-V. NPUs don't have a PELTV
         * table
         */
-       if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+       if (phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_unset_peltv(phb, pe, parent);
 
        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
@@ -946,7 +943,7 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
         * Configure PELTV. NPUs don't have a PELTV table so skip
         * configuration on them.
         */
-       if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+       if (phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_set_peltv(phb, pe, true);
 
        /* Setup reverse map */
@@ -1002,8 +999,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 
        /* NOTE: We don't get a reference for the pointer in the PE
         * data structure, both the device and PE structures should be
-        * destroyed at the same time. However, removing nvlink
-        * devices will need some work.
+        * destroyed at the same time.
         *
         * At some point we want to remove the PDN completely anyways
         */
@@ -1099,113 +1095,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
        return pe;
 }
 
-static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
-{
-       int pe_num, found_pe = false, rc;
-       long rid;
-       struct pnv_ioda_pe *pe;
-       struct pci_dev *gpu_pdev;
-       struct pci_dn *npu_pdn;
-       struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);
-
-       /*
-        * Intentionally leak a reference on the npu device (for
-        * nvlink only; this is not an opencapi path) to make sure it
-        * never goes away, as it's been the case all along and some
-        * work is needed otherwise.
-        */
-       pci_dev_get(npu_pdev);
-
-       /*
-        * Due to a hardware errata PE#0 on the NPU is reserved for
-        * error handling. This means we only have three PEs remaining
-        * which need to be assigned to four links, implying some
-        * links must share PEs.
-        *
-        * To achieve this we assign PEs such that NPUs linking the
-        * same GPU get assigned the same PE.
-        */
-       gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
-       for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
-               pe = &phb->ioda.pe_array[pe_num];
-               if (!pe->pdev)
-                       continue;
-
-               if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
-                       /*
-                        * This device has the same peer GPU so should
-                        * be assigned the same PE as the existing
-                        * peer NPU.
-                        */
-                       dev_info(&npu_pdev->dev,
-                               "Associating to existing PE %x\n", pe_num);
-                       npu_pdn = pci_get_pdn(npu_pdev);
-                       rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
-                       npu_pdn->pe_number = pe_num;
-                       phb->ioda.pe_rmap[rid] = pe->pe_number;
-                       pe->device_count++;
-
-                       /* Map the PE to this link */
-                       rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
-                                       OpalPciBusAll,
-                                       OPAL_COMPARE_RID_DEVICE_NUMBER,
-                                       OPAL_COMPARE_RID_FUNCTION_NUMBER,
-                                       OPAL_MAP_PE);
-                       WARN_ON(rc != OPAL_SUCCESS);
-                       found_pe = true;
-                       break;
-               }
-       }
-
-       if (!found_pe)
-               /*
-                * Could not find an existing PE so allocate a new
-                * one.
-                */
-               return pnv_ioda_setup_dev_PE(npu_pdev);
-       else
-               return pe;
-}
-
-static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
-{
-       struct pci_dev *pdev;
-
-       list_for_each_entry(pdev, &bus->devices, bus_list)
-               pnv_ioda_setup_npu_PE(pdev);
-}
-
-static void pnv_pci_ioda_setup_nvlink(void)
-{
-       struct pci_controller *hose;
-       struct pnv_phb *phb;
-       struct pnv_ioda_pe *pe;
-
-       list_for_each_entry(hose, &hose_list, list_node) {
-               phb = hose->private_data;
-               if (phb->type == PNV_PHB_NPU_NVLINK) {
-                       /* PE#0 is needed for error reporting */
-                       pnv_ioda_reserve_pe(phb, 0);
-                       pnv_ioda_setup_npu_PEs(hose->bus);
-                       if (phb->model == PNV_PHB_MODEL_NPU2)
-                               WARN_ON_ONCE(pnv_npu2_init(hose));
-               }
-       }
-       list_for_each_entry(hose, &hose_list, list_node) {
-               phb = hose->private_data;
-               if (phb->type != PNV_PHB_IODA2)
-                       continue;
-
-               list_for_each_entry(pe, &phb->ioda.pe_list, list)
-                       pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
-       }
-
-#ifdef CONFIG_IOMMU_API
-       /* setup iommu groups so we can do nvlink pass-thru */
-       pnv_pci_npu_setup_iommu_groups();
-#endif
-}
-
 static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
                                       struct pnv_ioda_pe *pe);
 
@@ -1468,18 +1357,6 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 #define PHB3_TCE_KILL_INVAL_PE         PPC_BIT(1)
 #define PHB3_TCE_KILL_INVAL_ONE                PPC_BIT(2)
 
-static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
-{
-       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
-       const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
-
-       mb(); /* Ensure previous TCE table stores are visible */
-       if (rm)
-               __raw_rm_writeq_be(val, invalidate);
-       else
-               __raw_writeq_be(val, invalidate);
-}
-
 static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
 {
        /* 01xb - invalidate TCEs that match the specified PE# */
@@ -1539,20 +1416,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
                struct pnv_phb *phb = pe->phb;
                unsigned int shift = tbl->it_page_shift;
 
-               /*
-                * NVLink1 can use the TCE kill register directly as
-                * it's the same as PHB3. NVLink2 is different and
-                * should go via the OPAL call.
-                */
-               if (phb->model == PNV_PHB_MODEL_NPU) {
-                       /*
-                        * The NVLink hardware does not support TCE kill
-                        * per TCE entry so we have to invalidate
-                        * the entire cache for it.
-                        */
-                       pnv_pci_phb3_tce_invalidate_entire(phb, rm);
-                       continue;
-               }
                if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
                        pnv_pci_phb3_tce_invalidate(pe, rm, shift,
                                                    index, npages);
@@ -1564,14 +1427,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
        }
 }
 
-void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
-{
-       if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
-               pnv_pci_phb3_tce_invalidate_entire(phb, rm);
-       else
-               opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
-}
-
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
                long npages, unsigned long uaddr,
                enum dma_data_direction direction,
@@ -2451,7 +2306,6 @@ static void pnv_pci_enable_bridges(void)
 
 static void pnv_pci_ioda_fixup(void)
 {
-       pnv_pci_ioda_setup_nvlink();
        pnv_pci_ioda_create_dbgfs();
 
        pnv_pci_enable_bridges();
@@ -2824,15 +2678,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
                pnv_ioda_release_pe(pe);
 }
 
-static void pnv_npu_disable_device(struct pci_dev *pdev)
-{
-       struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
-       struct eeh_pe *eehpe = edev ? edev->pe : NULL;
-
-       if (eehpe && eeh_ops && eeh_ops->reset)
-               eeh_ops->reset(eehpe, EEH_RESET_HOT);
-}
-
 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 {
        struct pnv_phb *phb = hose->private_data;
@@ -2874,16 +2719,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .shutdown               = pnv_pci_ioda_shutdown,
 };
 
-static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
-       .setup_msi_irqs         = pnv_setup_msi_irqs,
-       .teardown_msi_irqs      = pnv_teardown_msi_irqs,
-       .enable_device_hook     = pnv_pci_enable_device_hook,
-       .window_alignment       = pnv_pci_window_alignment,
-       .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
-       .shutdown               = pnv_pci_ioda_shutdown,
-       .disable_device         = pnv_npu_disable_device,
-};
-
 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
        .enable_device_hook     = pnv_ocapi_enable_device_hook,
        .release_device         = pnv_pci_release_device,
@@ -2957,10 +2792,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
                phb->model = PNV_PHB_MODEL_P7IOC;
        else if (of_device_is_compatible(np, "ibm,power8-pciex"))
                phb->model = PNV_PHB_MODEL_PHB3;
-       else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
-               phb->model = PNV_PHB_MODEL_NPU;
-       else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
-               phb->model = PNV_PHB_MODEL_NPU2;
        else
                phb->model = PNV_PHB_MODEL_UNKNOWN;
 
@@ -3118,9 +2949,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
 
        switch (phb->type) {
-       case PNV_PHB_NPU_NVLINK:
-               hose->controller_ops = pnv_npu_ioda_controller_ops;
-               break;
        case PNV_PHB_NPU_OCAPI:
                hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
                break;
@@ -3173,11 +3001,6 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np)
        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
 }
 
-void __init pnv_pci_init_npu_phb(struct device_node *np)
-{
-       pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
-}
-
 void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
 {
        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
index 9b9bca1..b18468d 100644 (file)
@@ -926,17 +926,6 @@ void __init pnv_pci_init(void)
        for_each_compatible_node(np, NULL, "ibm,ioda3-phb")
                pnv_pci_init_ioda2_phb(np);
 
-       /* Look for NPU PHBs */
-       for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb")
-               pnv_pci_init_npu_phb(np);
-
-       /*
-        * Look for NPU2 PHBs which we treat mostly as NPU PHBs with
-        * the exception of TCE kill which requires an OPAL call.
-        */
-       for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb")
-               pnv_pci_init_npu_phb(np);
-
        /* Look for NPU2 OpenCAPI PHBs */
        for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb")
                pnv_pci_init_npu2_opencapi_phb(np);
index 36d2292..c8d4f22 100644 (file)
 struct pci_dn;
 
 enum pnv_phb_type {
-       PNV_PHB_IODA1           = 0,
-       PNV_PHB_IODA2           = 1,
-       PNV_PHB_NPU_NVLINK      = 2,
-       PNV_PHB_NPU_OCAPI       = 3,
+       PNV_PHB_IODA1,
+       PNV_PHB_IODA2,
+       PNV_PHB_NPU_OCAPI,
 };
 
 /* Precise PHB model for error management */
@@ -21,8 +20,6 @@ enum pnv_phb_model {
        PNV_PHB_MODEL_UNKNOWN,
        PNV_PHB_MODEL_P7IOC,
        PNV_PHB_MODEL_PHB3,
-       PNV_PHB_MODEL_NPU,
-       PNV_PHB_MODEL_NPU2,
 };
 
 #define PNV_PCI_DIAG_BUF_SIZE  8192
@@ -81,7 +78,6 @@ struct pnv_ioda_pe {
 
        /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
        struct iommu_table_group table_group;
-       struct npu_comp         *npucomp;
 
        /* 64-bit TCE bypass region */
        bool                    tce_bypass_enabled;
@@ -289,9 +285,7 @@ extern struct iommu_table *pnv_pci_table_alloc(int nid);
 
 extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
-extern void pnv_pci_init_npu_phb(struct device_node *np);
 extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
-extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
@@ -314,11 +308,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 #define pe_info(pe, fmt, ...)                                  \
        pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
 
-/* Nvlink functions */
-extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
-extern void pnv_pci_npu_setup_iommu_groups(void);
-
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS   2
 #define POWERNV_IOMMU_MAX_LEVELS       5
index 1bffbd1..3b6800f 100644 (file)
@@ -224,8 +224,6 @@ static void __init pSeries_request_regions(void)
 
 void __init pSeries_final_fixup(void)
 {
-       struct pci_controller *hose;
-
        pSeries_request_regions();
 
        eeh_show_enabled();
@@ -234,27 +232,6 @@ void __init pSeries_final_fixup(void)
        ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
        ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable;
 #endif
-       list_for_each_entry(hose, &hose_list, list_node) {
-               struct device_node *dn = hose->dn, *nvdn;
-
-               while (1) {
-                       dn = of_find_all_nodes(dn);
-                       if (!dn)
-                               break;
-                       nvdn = of_parse_phandle(dn, "ibm,nvlink", 0);
-                       if (!nvdn)
-                               continue;
-                       if (!of_device_is_compatible(nvdn, "ibm,npu-link"))
-                               continue;
-                       if (!of_device_is_compatible(nvdn->parent,
-                                               "ibm,power9-npu"))
-                               continue;
-#ifdef CONFIG_PPC_POWERNV
-                       WARN_ON_ONCE(pnv_npu2_init(hose));
-#endif
-                       break;
-               }
-       }
 }
 
 /*
index add528e..a8ad8eb 100644 (file)
@@ -20,6 +20,7 @@ config RISCV
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DEBUG_VIRTUAL if MMU
        select ARCH_HAS_DEBUG_WX
+       select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_HAS_KCOV
@@ -27,9 +28,12 @@ config RISCV
        select ARCH_HAS_PTE_SPECIAL
        select ARCH_HAS_SET_DIRECT_MAP
        select ARCH_HAS_SET_MEMORY
-       select ARCH_HAS_STRICT_KERNEL_RWX if MMU
+       select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
+       select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
+       select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
+       select ARCH_SUPPORTS_HUGETLBFS if MMU
        select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
        select ARCH_WANT_FRAME_POINTERS
        select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
@@ -40,6 +44,7 @@ config RISCV
        select EDAC_SUPPORT
        select GENERIC_ARCH_TOPOLOGY if SMP
        select GENERIC_ATOMIC64 if !64BIT
+       select GENERIC_CLOCKEVENTS_BROADCAST if SMP
        select GENERIC_EARLY_IOREMAP
        select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO
        select GENERIC_IOREMAP
@@ -166,10 +171,6 @@ config ARCH_WANT_GENERAL_HUGETLB
 config ARCH_SUPPORTS_UPROBES
        def_bool y
 
-config SYS_SUPPORTS_HUGETLBFS
-       depends on MMU
-       def_bool y
-
 config STACKTRACE_SUPPORT
        def_bool y
 
@@ -205,6 +206,7 @@ config LOCKDEP_SUPPORT
        def_bool y
 
 source "arch/riscv/Kconfig.socs"
+source "arch/riscv/Kconfig.erratas"
 
 menu "Platform type"
 
@@ -228,7 +230,7 @@ config ARCH_RV64I
        bool "RV64I"
        select 64BIT
        select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && GCC_VERSION >= 50000
-       select HAVE_DYNAMIC_FTRACE if MMU
+       select HAVE_DYNAMIC_FTRACE if MMU && $(cc-option,-fpatchable-function-entry=8)
        select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_FUNCTION_GRAPH_TRACER
@@ -387,6 +389,31 @@ config RISCV_SBI_V01
        help
          This config allows kernel to use SBI v0.1 APIs. This will be
          deprecated in future once legacy M-mode software are no longer in use.
+
+config KEXEC
+       bool "Kexec system call"
+       select KEXEC_CORE
+       select HOTPLUG_CPU if SMP
+       depends on MMU
+       help
+         kexec is a system call that implements the ability to shut down your
+         current kernel, and to start another kernel. It is like a reboot
+         but it is independent of the system firmware. And like a reboot
+         you can start any kernel with it, not just Linux.
+
+         The name comes from the similarity to the exec system call.
+
+config CRASH_DUMP
+       bool "Build kdump crash kernel"
+       help
+         Generate crash dump after being started by kexec. This should
+         normally only be set in special crash dump kernels which are
+         loaded in the main kernel with kexec-tools into a specially
+         reserved region and then later executed after a crash by
+         kdump/kexec.
+
+         For more details see Documentation/admin-guide/kdump/kdump.rst
+
 endmenu
 
 menu "Boot options"
@@ -439,7 +466,7 @@ config EFI_STUB
 
 config EFI
        bool "UEFI runtime support"
-       depends on OF
+       depends on OF && !XIP_KERNEL
        select LIBFDT
        select UCS2_STRING
        select EFI_PARAMS_FROM_FDT
@@ -463,11 +490,63 @@ config STACKPROTECTOR_PER_TASK
        def_bool y
        depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_TLS
 
+config PHYS_RAM_BASE_FIXED
+       bool "Explicitly specified physical RAM address"
+       default n
+
+config PHYS_RAM_BASE
+       hex "Platform Physical RAM address"
+       depends on PHYS_RAM_BASE_FIXED
+       default "0x80000000"
+       help
+         This is the physical address of RAM in the system. It has to be
+         explicitly specified to run early relocations of read-write data
+         from flash to RAM.
+
+config XIP_KERNEL
+       bool "Kernel Execute-In-Place from ROM"
+       depends on MMU && SPARSEMEM
+       # This prevents XIP from being enabled by all{yes,mod}config, which
+       # fail to build since XIP doesn't support large kernels.
+       depends on !COMPILE_TEST
+       select PHYS_RAM_BASE_FIXED
+       help
+         Execute-In-Place allows the kernel to run from non-volatile storage
+         directly addressable by the CPU, such as NOR flash. This saves RAM
+         space since the text section of the kernel is not loaded from flash
+         to RAM.  Read-write sections, such as the data section and stack,
+         are still copied to RAM.  The XIP kernel is not compressed since
+         it has to run directly from flash, so it will take more space to
+         store it.  The flash address used to link the kernel object files,
+         and for storing it, is configuration dependent. Therefore, if you
+         say Y here, you must know the proper physical address where to
+         store the kernel image depending on your own flash memory usage.
+
+         Also note that the make target becomes "make xipImage" rather than
+         "make zImage" or "make Image".  The final kernel binary to put in
+         ROM memory will be arch/riscv/boot/xipImage.
+
+         SPARSEMEM is required because the flash-resident kernel text and
+         rodata are not backed by memmap, so any attempt to get a
+         struct page for those regions would trigger a fault.
+
+         If unsure, say N.
+
+config XIP_PHYS_ADDR
+       hex "XIP Kernel Physical Location"
+       depends on XIP_KERNEL
+       default "0x21000000"
+       help
+         This is the physical address in flash memory at which the kernel
+         will be linked and stored.  This address depends on your own
+         flash usage.
+
 endmenu
 
 config BUILTIN_DTB
-       def_bool n
+       bool
        depends on OF
+       default y if XIP_KERNEL
 
 menu "Power management options"
 
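Taken together, an XIP build combines the new symbols roughly as follows
(values are the defaults shown above; a real board will differ):

	CONFIG_XIP_KERNEL=y
	CONFIG_XIP_PHYS_ADDR=0x21000000
	# selected/implied by XIP_KERNEL:
	CONFIG_PHYS_RAM_BASE_FIXED=y
	CONFIG_PHYS_RAM_BASE=0x80000000
	CONFIG_BUILTIN_DTB=y
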
diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas
new file mode 100644 (file)
index 0000000..b44d6ec
--- /dev/null
@@ -0,0 +1,44 @@
+menu "CPU errata selection"
+
+config RISCV_ERRATA_ALTERNATIVE
+       bool "RISC-V alternative scheme"
+       default y
+       help
+         This Kconfig allows the kernel to automatically patch the
+         errata required by the execution platform at run time. The
+         code patching is performed once, early during boot, so the
+         overhead of this mechanism is incurred only once.
+
+config ERRATA_SIFIVE
+       bool "SiFive errata"
+       depends on RISCV_ERRATA_ALTERNATIVE
+       help
+         All SiFive errata Kconfig options depend on this one;
+         disabling it disables all SiFive errata. Please say "Y"
+         here if your platform uses SiFive CPU cores.
+
+         Otherwise, please say "N" here to avoid unnecessary overhead.
+
+config ERRATA_SIFIVE_CIP_453
+       bool "Apply SiFive errata CIP-453"
+       depends on ERRATA_SIFIVE && 64BIT
+       default y
+       help
+         This will apply the SiFive CIP-453 erratum fix, which adds sign
+         extension to $badaddr when the exception type is instruction
+         page fault or instruction access fault.
+
+         If you don't know what to do here, say "Y".
+
+config ERRATA_SIFIVE_CIP_1200
+       bool "Apply SiFive errata CIP-1200"
+       depends on ERRATA_SIFIVE && 64BIT
+       default y
+       help
+         This will apply the SiFive CIP-1200 erratum fix, which replaces
+         every "sfence.vma addr" with a full "sfence.vma" to ensure that
+         addr has really been flushed from the TLB.
+
+         If you don't know what to do here, say "Y".
+
+endmenu
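The CIP-1200 rewrite described in the help text is mechanical: wherever
generic code emits a per-address fence, the patched kernel falls back to a
full fence. Schematically (the actual patching goes through the alternative
entries introduced later in this series):

	sfence.vma a0        /* generic: flush one address */
	sfence.vma           /* patched on affected parts: flush everything */
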
index e1b2690..ed96376 100644 (file)
@@ -1,5 +1,12 @@
 menu "SoC selection"
 
+config SOC_MICROCHIP_POLARFIRE
+       bool "Microchip PolarFire SoCs"
+       select MCHP_CLK_MPFS
+       select SIFIVE_PLIC
+       help
+         This enables support for Microchip PolarFire SoC platforms.
+
 config SOC_SIFIVE
        bool "SiFive SoCs"
        select SERIAL_SIFIVE if TTY
@@ -7,6 +14,7 @@ config SOC_SIFIVE
        select CLK_SIFIVE
        select CLK_SIFIVE_PRCI
        select SIFIVE_PLIC
+       select ERRATA_SIFIVE
        help
          This enables support for SiFive SoC platform hardware.
 
index 1368d94..3eb9590 100644 (file)
@@ -82,11 +82,16 @@ CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS)
 
 # Default target when executing plain make
 boot           := arch/riscv/boot
+ifeq ($(CONFIG_XIP_KERNEL),y)
+KBUILD_IMAGE := $(boot)/xipImage
+else
 KBUILD_IMAGE   := $(boot)/Image.gz
+endif
 
 head-y := arch/riscv/kernel/head.o
 
 core-y += arch/riscv/
+core-$(CONFIG_RISCV_ERRATA_ALTERNATIVE) += arch/riscv/errata/
 
 libs-y += arch/riscv/lib/
 libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
@@ -95,12 +100,14 @@ PHONY += vdso_install
 vdso_install:
        $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@
 
+ifneq ($(CONFIG_XIP_KERNEL),y)
 ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN),yy)
 KBUILD_IMAGE := $(boot)/loader.bin
 else
 KBUILD_IMAGE := $(boot)/Image.gz
 endif
-BOOT_TARGETS := Image Image.gz loader loader.bin
+endif
+BOOT_TARGETS := Image Image.gz loader loader.bin xipImage
 
 all:   $(notdir $(KBUILD_IMAGE))
 
index 03404c8..6bf299f 100644 (file)
 KCOV_INSTRUMENT := n
 
 OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
+OBJCOPYFLAGS_xipImage :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
 
-targets := Image Image.* loader loader.o loader.lds loader.bin
+targets := Image Image.* loader loader.o loader.lds loader.bin xipImage
+
+ifeq ($(CONFIG_XIP_KERNEL),y)
+
+quiet_cmd_mkxip = $(quiet_cmd_objcopy)
+cmd_mkxip = $(cmd_objcopy)
+
+$(obj)/xipImage: vmlinux FORCE
+       $(call if_changed,mkxip)
+       @$(kecho) '  Physical Address of xipImage: $(CONFIG_XIP_PHYS_ADDR)'
+
+endif
 
 $(obj)/Image: vmlinux FORCE
        $(call if_changed,objcopy)
index 7ffd502..fe996b8 100644 (file)
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 subdir-y += sifive
 subdir-$(CONFIG_SOC_CANAAN_K210_DTB_BUILTIN) += canaan
+subdir-y += microchip
 
 obj-$(CONFIG_BUILTIN_DTB) := $(addsuffix /, $(subdir-y))
diff --git a/arch/riscv/boot/dts/microchip/Makefile b/arch/riscv/boot/dts/microchip/Makefile
new file mode 100644 (file)
index 0000000..622b127
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += microchip-mpfs-icicle-kit.dtb
diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
new file mode 100644 (file)
index 0000000..ec79944
--- /dev/null
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/* Copyright (c) 2020 Microchip Technology Inc */
+
+/dts-v1/;
+
+#include "microchip-mpfs.dtsi"
+
+/* Clock frequency (in Hz) of the rtcclk */
+#define RTCCLK_FREQ            1000000
+
+/ {
+       #address-cells = <2>;
+       #size-cells = <2>;
+       model = "Microchip PolarFire-SoC Icicle Kit";
+       compatible = "microchip,mpfs-icicle-kit";
+
+       chosen {
+               stdout-path = &serial0;
+       };
+
+       cpus {
+               timebase-frequency = <RTCCLK_FREQ>;
+       };
+
+       memory@80000000 {
+               device_type = "memory";
+               reg = <0x0 0x80000000 0x0 0x40000000>;
+               clocks = <&clkcfg 26>;
+       };
+
+       soc {
+       };
+};
+
+&serial0 {
+       status = "okay";
+};
+
+&serial1 {
+       status = "okay";
+};
+
+&serial2 {
+       status = "okay";
+};
+
+&serial3 {
+       status = "okay";
+};
+
+&sdcard {
+       status = "okay";
+};
+
+&emac0 {
+       phy-mode = "sgmii";
+       phy-handle = <&phy0>;
+       phy0: ethernet-phy@8 {
+               reg = <8>;
+               ti,fifo-depth = <0x01>;
+       };
+};
+
+&emac1 {
+       status = "okay";
+       phy-mode = "sgmii";
+       phy-handle = <&phy1>;
+       phy1: ethernet-phy@9 {
+               reg = <9>;
+               ti,fifo-depth = <0x01>;
+       };
+};
diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
new file mode 100644 (file)
index 0000000..b981957
--- /dev/null
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/* Copyright (c) 2020 Microchip Technology Inc */
+
+/dts-v1/;
+
+/ {
+       #address-cells = <2>;
+       #size-cells = <2>;
+       model = "Microchip MPFS Icicle Kit";
+       compatible = "microchip,mpfs-icicle-kit";
+
+       chosen {
+       };
+
+       cpus {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               cpu@0 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,e51", "sifive,rocket0", "riscv";
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <128>;
+                       i-cache-size = <16384>;
+                       reg = <0>;
+                       riscv,isa = "rv64imac";
+                       status = "disabled";
+
+                       cpu0_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+
+               cpu@1 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
+                       d-cache-block-size = <64>;
+                       d-cache-sets = <64>;
+                       d-cache-size = <32768>;
+                       d-tlb-sets = <1>;
+                       d-tlb-size = <32>;
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <64>;
+                       i-cache-size = <32768>;
+                       i-tlb-sets = <1>;
+                       i-tlb-size = <32>;
+                       mmu-type = "riscv,sv39";
+                       reg = <1>;
+                       riscv,isa = "rv64imafdc";
+                       tlb-split;
+                       status = "okay";
+
+                       cpu1_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+
+               cpu@2 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
+                       d-cache-block-size = <64>;
+                       d-cache-sets = <64>;
+                       d-cache-size = <32768>;
+                       d-tlb-sets = <1>;
+                       d-tlb-size = <32>;
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <64>;
+                       i-cache-size = <32768>;
+                       i-tlb-sets = <1>;
+                       i-tlb-size = <32>;
+                       mmu-type = "riscv,sv39";
+                       reg = <2>;
+                       riscv,isa = "rv64imafdc";
+                       tlb-split;
+                       status = "okay";
+
+                       cpu2_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+
+               cpu@3 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
+                       d-cache-block-size = <64>;
+                       d-cache-sets = <64>;
+                       d-cache-size = <32768>;
+                       d-tlb-sets = <1>;
+                       d-tlb-size = <32>;
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <64>;
+                       i-cache-size = <32768>;
+                       i-tlb-sets = <1>;
+                       i-tlb-size = <32>;
+                       mmu-type = "riscv,sv39";
+                       reg = <3>;
+                       riscv,isa = "rv64imafdc";
+                       tlb-split;
+                       status = "okay";
+
+                       cpu3_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+
+               cpu@4 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
+                       d-cache-block-size = <64>;
+                       d-cache-sets = <64>;
+                       d-cache-size = <32768>;
+                       d-tlb-sets = <1>;
+                       d-tlb-size = <32>;
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <64>;
+                       i-cache-size = <32768>;
+                       i-tlb-sets = <1>;
+                       i-tlb-size = <32>;
+                       mmu-type = "riscv,sv39";
+                       reg = <4>;
+                       riscv,isa = "rv64imafdc";
+                       tlb-split;
+                       status = "okay";
+                       cpu4_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+       };
+
+       soc {
+               #address-cells = <2>;
+               #size-cells = <2>;
+               compatible = "simple-bus";
+               ranges;
+
+               cache-controller@2010000 {
+                       compatible = "sifive,fu540-c000-ccache", "cache";
+                       cache-block-size = <64>;
+                       cache-level = <2>;
+                       cache-sets = <1024>;
+                       cache-size = <2097152>;
+                       cache-unified;
+                       interrupt-parent = <&plic>;
+                       interrupts = <1 2 3>;
+                       reg = <0x0 0x2010000 0x0 0x1000>;
+               };
+
+               clint@2000000 {
+                       compatible = "sifive,clint0";
+                       reg = <0x0 0x2000000 0x0 0xC000>;
+                       interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7
+                                               &cpu1_intc 3 &cpu1_intc 7
+                                               &cpu2_intc 3 &cpu2_intc 7
+                                               &cpu3_intc 3 &cpu3_intc 7
+                                               &cpu4_intc 3 &cpu4_intc 7>;
+               };
+
+               plic: interrupt-controller@c000000 {
+                       #interrupt-cells = <1>;
+                       compatible = "sifive,plic-1.0.0";
+                       reg = <0x0 0xc000000 0x0 0x4000000>;
+                       riscv,ndev = <186>;
+                       interrupt-controller;
+                       interrupts-extended = <&cpu0_intc 11
+                                       &cpu1_intc 11 &cpu1_intc 9
+                                       &cpu2_intc 11 &cpu2_intc 9
+                                       &cpu3_intc 11 &cpu3_intc 9
+                                       &cpu4_intc 11 &cpu4_intc 9>;
+               };
+
+               dma@3000000 {
+                       compatible = "sifive,fu540-c000-pdma";
+                       reg = <0x0 0x3000000 0x0 0x8000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <23 24 25 26 27 28 29 30>;
+                       #dma-cells = <1>;
+               };
+
+               refclk: refclk {
+                       compatible = "fixed-clock";
+                       #clock-cells = <0>;
+                       clock-frequency = <600000000>;
+                       clock-output-names = "msspllclk";
+               };
+
+               clkcfg: clkcfg@20002000 {
+                       compatible = "microchip,mpfs-clkcfg";
+                       reg = <0x0 0x20002000 0x0 0x1000>;
+                       reg-names = "mss_sysreg";
+                       clocks = <&refclk>;
+                       #clock-cells = <1>;
+                       clock-output-names = "cpu", "axi", "ahb", "envm",       /* 0-3   */
+                                "mac0", "mac1", "mmc", "timer",                /* 4-7   */
+                               "mmuart0", "mmuart1", "mmuart2", "mmuart3",     /* 8-11  */
+                               "mmuart4", "spi0", "spi1", "i2c0",              /* 12-15 */
+                               "i2c1", "can0", "can1", "usb",                  /* 16-19 */
+                               "rsvd", "rtc", "qspi", "gpio0",                 /* 20-23 */
+                               "gpio1", "gpio2", "ddrc", "fic0",               /* 24-27 */
+                               "fic1", "fic2", "fic3", "athena", "cfm";        /* 28-32 */
+               };
+
+               serial0: serial@20000000 {
+                       compatible = "ns16550a";
+                       reg = <0x0 0x20000000 0x0 0x400>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <90>;
+                       current-speed = <115200>;
+                       clocks = <&clkcfg 8>;
+                       status = "disabled";
+               };
+
+               serial1: serial@20100000 {
+                       compatible = "ns16550a";
+                       reg = <0x0 0x20100000 0x0 0x400>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <91>;
+                       current-speed = <115200>;
+                       clocks = <&clkcfg 9>;
+                       status = "disabled";
+               };
+
+               serial2: serial@20102000 {
+                       compatible = "ns16550a";
+                       reg = <0x0 0x20102000 0x0 0x400>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <92>;
+                       current-speed = <115200>;
+                       clocks = <&clkcfg 10>;
+                       status = "disabled";
+               };
+
+               serial3: serial@20104000 {
+                       compatible = "ns16550a";
+                       reg = <0x0 0x20104000 0x0 0x400>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <93>;
+                       current-speed = <115200>;
+                       clocks = <&clkcfg 11>;
+                       status = "disabled";
+               };
+
+               emmc: mmc@20008000 {
+                       compatible = "cdns,sd4hc";
+                       reg = <0x0 0x20008000 0x0 0x1000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <88 89>;
+                       pinctrl-names = "default";
+                       clocks = <&clkcfg 6>;
+                       bus-width = <4>;
+                       cap-mmc-highspeed;
+                       mmc-ddr-3_3v;
+                       max-frequency = <200000000>;
+                       non-removable;
+                       no-sd;
+                       no-sdio;
+                       voltage-ranges = <3300 3300>;
+                       status = "disabled";
+               };
+
+               sdcard: sdhc@20008000 {
+                       compatible = "cdns,sd4hc";
+                       reg = <0x0 0x20008000 0x0 0x1000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <88>;
+                       pinctrl-names = "default";
+                       clocks = <&clkcfg 6>;
+                       bus-width = <4>;
+                       disable-wp;
+                       cap-sd-highspeed;
+                       card-detect-delay = <200>;
+                       sd-uhs-sdr12;
+                       sd-uhs-sdr25;
+                       sd-uhs-sdr50;
+                       sd-uhs-sdr104;
+                       max-frequency = <200000000>;
+                       status = "disabled";
+               };
+
+               emac0: ethernet@20110000 {
+                       compatible = "cdns,macb";
+                       reg = <0x0 0x20110000 0x0 0x2000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <64 65 66 67>;
+                       local-mac-address = [00 00 00 00 00 00];
+                       clocks = <&clkcfg 4>, <&clkcfg 2>;
+                       clock-names = "pclk", "hclk";
+                       status = "disabled";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+               };
+
+               emac1: ethernet@20112000 {
+                       compatible = "cdns,macb";
+                       reg = <0x0 0x20112000 0x0 0x2000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <70 71 72 73>;
+                       mac-address = [00 00 00 00 00 00];
+                       clocks = <&clkcfg 5>, <&clkcfg 2>;
+                       status = "disabled";
+                       clock-names = "pclk", "hclk";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+               };
+
+       };
+};
index 47a5003..62d9469 100644 (file)
@@ -1,13 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 #include <asm/page.h>
+#include <asm/pgtable.h>
 
 OUTPUT_ARCH(riscv)
 ENTRY(_start)
 
 SECTIONS
 {
-       . = PAGE_OFFSET;
+       . = KERNEL_LINK_ADDR;
 
        .payload : {
                *(.payload)
index 6c0625a..1f2be23 100644 (file)
@@ -16,6 +16,7 @@ CONFIG_EXPERT=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_SOC_SIFIVE=y
 CONFIG_SOC_VIRT=y
+CONFIG_SOC_MICROCHIP_POLARFIRE=y
 CONFIG_SMP=y
 CONFIG_HOTPLUG_CPU=y
 CONFIG_JUMP_LABEL=y
@@ -82,6 +83,9 @@ CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_OHCI_HCD_PLATFORM=y
 CONFIG_USB_STORAGE=y
 CONFIG_USB_UAS=y
+CONFIG_MMC_SDHCI=y
+CONFIG_MMC_SDHCI_PLTFM=y
+CONFIG_MMC_SDHCI_CADENCE=y
 CONFIG_MMC=y
 CONFIG_MMC_SPI=y
 CONFIG_RTC_CLASS=y
diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile
new file mode 100644 (file)
index 0000000..b8f8740
--- /dev/null
@@ -0,0 +1,2 @@
+obj-y  += alternative.o
+obj-$(CONFIG_ERRATA_SIFIVE) += sifive/
diff --git a/arch/riscv/errata/alternative.c b/arch/riscv/errata/alternative.c
new file mode 100644 (file)
index 0000000..3b15885
--- /dev/null
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * alternative runtime patching
+ * inspired by the ARM64 and x86 version
+ *
+ * Copyright (C) 2021 Sifive.
+ */
+
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/sections.h>
+#include <asm/vendorid_list.h>
+#include <asm/sbi.h>
+#include <asm/csr.h>
+
+static struct cpu_manufacturer_info_t {
+       unsigned long vendor_id;
+       unsigned long arch_id;
+       unsigned long imp_id;
+} cpu_mfr_info;
+
+static void (*vendor_patch_func)(struct alt_entry *begin, struct alt_entry *end,
+                                unsigned long archid, unsigned long impid);
+
+static inline void __init riscv_fill_cpu_mfr_info(void)
+{
+#ifdef CONFIG_RISCV_M_MODE
+       cpu_mfr_info.vendor_id = csr_read(CSR_MVENDORID);
+       cpu_mfr_info.arch_id = csr_read(CSR_MARCHID);
+       cpu_mfr_info.imp_id = csr_read(CSR_MIMPID);
+#else
+       cpu_mfr_info.vendor_id = sbi_get_mvendorid();
+       cpu_mfr_info.arch_id = sbi_get_marchid();
+       cpu_mfr_info.imp_id = sbi_get_mimpid();
+#endif
+}
+
+static void __init init_alternative(void)
+{
+       riscv_fill_cpu_mfr_info();
+
+       switch (cpu_mfr_info.vendor_id) {
+#ifdef CONFIG_ERRATA_SIFIVE
+       case SIFIVE_VENDOR_ID:
+               vendor_patch_func = sifive_errata_patch_func;
+               break;
+#endif
+       default:
+               vendor_patch_func = NULL;
+       }
+}
+
+/*
+ * This is called very early in the boot process (directly after we run
+ * a feature detect on the boot CPU). No need to worry about other CPUs
+ * here.
+ */
+void __init apply_boot_alternatives(void)
+{
+       /* If called on non-boot cpu things could go wrong */
+       WARN_ON(smp_processor_id() != 0);
+
+       init_alternative();
+
+       if (!vendor_patch_func)
+               return;
+
+       vendor_patch_func((struct alt_entry *)__alt_start,
+                         (struct alt_entry *)__alt_end,
+                         cpu_mfr_info.arch_id, cpu_mfr_info.imp_id);
+}
+
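apply_boot_alternatives() is expected to run exactly once on the boot CPU
before secondary CPUs come up. A sketch of the assumed call site (the hookup
in setup_arch() is not part of this excerpt):

	/* sketch: assumed placement in arch/riscv/kernel/setup.c */
	void __init setup_arch(char **cmdline_p)
	{
		/* ... early setup ... */
		riscv_fill_hwcap();		/* the boot-CPU feature detect */
		apply_boot_alternatives();	/* patch vendor errata once */
		/* ... */
	}
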
diff --git a/arch/riscv/errata/sifive/Makefile b/arch/riscv/errata/sifive/Makefile
new file mode 100644 (file)
index 0000000..bdd5fc8
--- /dev/null
@@ -0,0 +1,2 @@
+obj-y += errata_cip_453.o
+obj-y += errata.o
diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c
new file mode 100644 (file)
index 0000000..f5e5ae7
--- /dev/null
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Sifive.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/bug.h>
+#include <asm/patch.h>
+#include <asm/alternative.h>
+#include <asm/vendorid_list.h>
+#include <asm/errata_list.h>
+
+struct errata_info_t {
+       char name[ERRATA_STRING_LENGTH_MAX];
+       bool (*check_func)(unsigned long arch_id, unsigned long impid);
+};
+
+static bool errata_cip_453_check_func(unsigned long arch_id, unsigned long impid)
+{
+       /*
+        * Affected cores:
+        * Architecture ID: 0x8000000000000007
+        * Implementation ID: 0x20181004 <= impid <= 0x20191105
+        */
+       if (arch_id != 0x8000000000000007 ||
+           (impid < 0x20181004 || impid > 0x20191105))
+               return false;
+       return true;
+}
+
+static bool errata_cip_1200_check_func(unsigned long arch_id, unsigned long impid)
+{
+       /*
+        * Affected cores:
+        * Architecture ID: 0x8000000000000007 or 0x1
+        * Implementation ID: mimpid[23:0] <= 0x200630 and mimpid != 0x01200626
+        */
+       if (arch_id != 0x8000000000000007 && arch_id != 0x1)
+               return false;
+       if ((impid & 0xffffff) > 0x200630 || impid == 0x1200626)
+               return false;
+       return true;
+}
+
+static struct errata_info_t errata_list[ERRATA_SIFIVE_NUMBER] = {
+       {
+               .name = "cip-453",
+               .check_func = errata_cip_453_check_func
+       },
+       {
+               .name = "cip-1200",
+               .check_func = errata_cip_1200_check_func
+       },
+};
+
+static u32 __init sifive_errata_probe(unsigned long archid, unsigned long impid)
+{
+       int idx;
+       u32 cpu_req_errata = 0;
+
+       for (idx = 0; idx < ERRATA_SIFIVE_NUMBER; idx++)
+               if (errata_list[idx].check_func(archid, impid))
+                       cpu_req_errata |= (1U << idx);
+
+       return cpu_req_errata;
+}
+
+static void __init warn_miss_errata(u32 miss_errata)
+{
+       int i;
+
+       pr_warn("----------------------------------------------------------------\n");
+       pr_warn("WARNING: Missing the following errata may cause potential issues\n");
+       for (i = 0; i < ERRATA_SIFIVE_NUMBER; i++)
+               if (miss_errata & 0x1 << i)
+                       pr_warn("\tSiFive Errata[%d]:%s\n", i, errata_list[i].name);
+       pr_warn("Please enable the corresponding Kconfig to apply them\n");
+       pr_warn("----------------------------------------------------------------\n");
+}
+
+void __init sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
+                                    unsigned long archid, unsigned long impid)
+{
+       struct alt_entry *alt;
+       u32 cpu_req_errata = sifive_errata_probe(archid, impid);
+       u32 cpu_apply_errata = 0;
+       u32 tmp;
+
+       for (alt = begin; alt < end; alt++) {
+               if (alt->vendor_id != SIFIVE_VENDOR_ID)
+                       continue;
+               if (alt->errata_id >= ERRATA_SIFIVE_NUMBER) {
+                       WARN(1, "This errata id:%d is not in kernel errata list", alt->errata_id);
+                       continue;
+               }
+
+               tmp = (1U << alt->errata_id);
+               if (cpu_req_errata & tmp) {
+                       patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len);
+                       cpu_apply_errata |= tmp;
+               }
+       }
+       if (cpu_apply_errata != cpu_req_errata)
+               warn_miss_errata(cpu_req_errata - cpu_apply_errata);
+}
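sifive_errata_probe() returns one bit per erratum, with bit positions
matching errata_list[] (bit 0 = CIP-453, bit 1 = CIP-1200). A worked example
with an assumed implementation ID:

	/* illustration only: this impid value is made up */
	u32 req = sifive_errata_probe(0x8000000000000007UL, 0x20190531UL);
	/*
	 * CIP-453:  0x20181004 <= 0x20190531 <= 0x20191105          -> bit 0
	 * CIP-1200: (0x20190531 & 0xffffff) == 0x190531 <= 0x200630
	 *           and impid != 0x1200626                          -> bit 1
	 * so req == 0x3 and both alternatives get patched in.
	 */
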
diff --git a/arch/riscv/errata/sifive/errata_cip_453.S b/arch/riscv/errata/sifive/errata_cip_453.S
new file mode 100644 (file)
index 0000000..f1b9623
--- /dev/null
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/alternative.h>
+
+.macro ADD_SIGN_EXT pt_reg badaddr tmp_reg
+       REG_L \badaddr, PT_BADADDR(\pt_reg)
+       li \tmp_reg,1
+       slli \tmp_reg,\tmp_reg,0x26
+       and \tmp_reg,\tmp_reg,\badaddr
+       beqz \tmp_reg, 1f
+       li \tmp_reg,-1
+       slli \tmp_reg,\tmp_reg,0x27
+       or \badaddr,\tmp_reg,\badaddr
+       REG_S \badaddr, PT_BADADDR(\pt_reg)
+1:
+.endm
+
+ENTRY(sifive_cip_453_page_fault_trp)
+       ADD_SIGN_EXT a0, t0, t1
+#ifdef CONFIG_MMU
+       la t0, do_page_fault
+#else
+       la t0, do_trap_unknown
+#endif
+       jr t0
+END(sifive_cip_453_page_fault_trp)
+
+ENTRY(sifive_cip_453_insn_fault_trp)
+       ADD_SIGN_EXT a0, t0, t1
+       la t0, do_trap_insn_fault
+       jr t0
+END(sifive_cip_453_insn_fault_trp)
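ADD_SIGN_EXT sign-extends $badaddr from bit 38, restoring a canonical Sv39
virtual address that the erratum truncates. An equivalent C sketch (the
function name is invented here for illustration):

	/* C equivalent of the ADD_SIGN_EXT assembler macro */
	static unsigned long cip_453_sext(unsigned long badaddr)
	{
		if (badaddr & (1UL << 38))	/* slli by 0x26: test bit 38 */
			badaddr |= ~0UL << 39;	/* slli by 0x27: set bits 63..39 */
		return badaddr;
	}
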
diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h
new file mode 100644 (file)
index 0000000..88c0870
--- /dev/null
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_ALTERNATIVE_MACROS_H
+#define __ASM_ALTERNATIVE_MACROS_H
+
+#ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE
+
+#ifdef __ASSEMBLY__
+
+.macro ALT_ENTRY oldptr newptr vendor_id errata_id new_len
+       RISCV_PTR \oldptr
+       RISCV_PTR \newptr
+       REG_ASM \vendor_id
+       REG_ASM \new_len
+       .word   \errata_id
+.endm
+
+.macro ALT_NEW_CONTENT vendor_id, errata_id, enable = 1, new_c : vararg
+       .if \enable
+       .pushsection .alternative, "a"
+       ALT_ENTRY 886b, 888f, \vendor_id, \errata_id, 889f - 888f
+       .popsection
+       .subsection 1
+888 :
+       \new_c
+889 :
+       .previous
+       .org    . - (889b - 888b) + (887b - 886b)
+       .org    . - (887b - 886b) + (889b - 888b)
+       .endif
+.endm
+
+.macro __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, enable
+886 :
+       \old_c
+887 :
+       ALT_NEW_CONTENT \vendor_id, \errata_id, \enable, \new_c
+.endm
+
+#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \
+       __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k)
+
+#else /* !__ASSEMBLY__ */
+
+#include <asm/asm.h>
+#include <linux/stringify.h>
+
+#define ALT_ENTRY(oldptr, newptr, vendor_id, errata_id, newlen) \
+       RISCV_PTR " " oldptr "\n" \
+       RISCV_PTR " " newptr "\n" \
+       REG_ASM " " vendor_id "\n" \
+       REG_ASM " " newlen "\n" \
+       ".word " errata_id "\n"
+
+#define ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) \
+       ".if " __stringify(enable) " == 1\n"                            \
+       ".pushsection .alternative, \"a\"\n"                            \
+       ALT_ENTRY("886b", "888f", __stringify(vendor_id), __stringify(errata_id), "889f - 888f") \
+       ".popsection\n"                                                 \
+       ".subsection 1\n"                                               \
+       "888 :\n"                                                       \
+       new_c "\n"                                                      \
+       "889 :\n"                                                       \
+       ".previous\n"                                                   \
+       ".org   . - (887b - 886b) + (889b - 888b)\n"                    \
+       ".org   . - (889b - 888b) + (887b - 886b)\n"                    \
+       ".endif\n"
+
+#define __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, enable) \
+       "886 :\n"       \
+       old_c "\n"      \
+       "887 :\n"       \
+       ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c)
+
+#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \
+       __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k))
+
+#endif /* __ASSEMBLY__ */
+
+#else /* !CONFIG_RISCV_ERRATA_ALTERNATIVE*/
+#ifdef __ASSEMBLY__
+
+.macro __ALTERNATIVE_CFG old_c
+       \old_c
+.endm
+
+#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \
+       __ALTERNATIVE_CFG old_c
+
+#else /* !__ASSEMBLY__ */
+
+#define __ALTERNATIVE_CFG(old_c)  \
+       old_c "\n"
+
+#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \
+       __ALTERNATIVE_CFG(old_c)
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_RISCV_ERRATA_ALTERNATIVE */
+/*
+ * Usage:
+ *   ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k)
+ * in the assembly code. Otherwise,
+ *   asm(ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k));
+ *
+ * old_content: The old content, which may be replaced with new content.
+ * new_content: The new content.
+ * vendor_id: The CPU vendor ID.
+ * errata_id: The errata ID.
+ * CONFIG_k: The Kconfig of this erratum. When the Kconfig is disabled, the
+ *          old content will always be executed.
+ */
+#define ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k) \
+       _ALTERNATIVE_CFG(old_content, new_content, vendor_id, errata_id, CONFIG_k)
+
+/*
+ * A vendor wants to replace an old_content, but another vendor has used
+ * ALTERNATIVE() to patch its customized content at the same location. In
+ * this case, this vendor can create a new macro ALTERNATIVE_2() based
+ * on the following sample code and then replace ALTERNATIVE() with
+ * ALTERNATIVE_2() to append its customized content.
+ *
+ * .macro __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \
+ *                                   new_c_2, vendor_id_2, errata_id_2, enable_2
+ * 886 :
+ *      \old_c
+ * 887 :
+ *      ALT_NEW_CONTENT \vendor_id_1, \errata_id_1, \enable_1, \new_c_1
+ *      ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2
+ * .endm
+ *
+ * #define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, CONFIG_k_1, \
+ *                                   new_c_2, vendor_id_2, errata_id_2, CONFIG_k_2) \
+ *        __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, IS_ENABLED(CONFIG_k_1), \
+ *                                   new_c_2, vendor_id_2, errata_id_2, IS_ENABLED(CONFIG_k_2) \
+ *
+ * #define ALTERNATIVE_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \
+ *                                    new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) \
+ *         _ALTERNATIVE_CFG_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \
+ *                                         new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2)
+ *
+ */
+#endif
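
For reference while reading the macros above, a hedged sketch of what
ALTERNATIVE(old, new, vendor_id, errata_id, CONFIG_k) expands to when CONFIG_k
is enabled; this is reconstructed from the macro definitions themselves, not
taken from assembler output, and ALT_ENTRY's record is shown as a comment:

	886 :
		old			# runs unless patched at boot
	887 :
		.pushsection .alternative, "a"
		# one struct alt_entry: { 886b, 888f, vendor_id, errata_id, 889f - 888f }
		.popsection
		.subsection 1
	888 :
		new			# copied over 886..887 when vendor/errata match
	889 :
		.previous
		.org . - (887b - 886b) + (889b - 888b)	# paired build-time checks:
		.org . - (889b - 888b) + (887b - 886b)	# old and new must be the same size
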
diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h
new file mode 100644 (file)
index 0000000..e625d3c
--- /dev/null
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Sifive.
+ */
+
+#ifndef __ASM_ALTERNATIVE_H
+#define __ASM_ALTERNATIVE_H
+
+#define ERRATA_STRING_LENGTH_MAX 32
+
+#include <asm/alternative-macros.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <asm/hwcap.h>
+
+void __init apply_boot_alternatives(void);
+
+struct alt_entry {
+       void *old_ptr;           /* address of original instruction or data */
+       void *alt_ptr;           /* address of replacement instruction or data */
+       unsigned long vendor_id; /* cpu vendor id */
+       unsigned long alt_len;   /* size of the replacement */
+       unsigned int errata_id;  /* errata id */
+} __packed;
+
+struct errata_checkfunc_id {
+       unsigned long vendor_id;
+       bool (*func)(struct alt_entry *alt);
+};
+
+void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
+                             unsigned long archid, unsigned long impid);
+
+#endif /* !__ASSEMBLY__ */
+#endif /* __ASM_ALTERNATIVE_H */
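
The patch function declared above walks the alt_entry records collected in the
.alternative section. The real SiFive implementation lives under
arch/riscv/errata/ elsewhere in this merge; the following is only a hedged
sketch of the shape these types imply (errata_cip_453_check is a hypothetical
helper, and the plain memcpy() stands in for safe instruction patching, which
the real code would do via patch_text_nosync() after consulting archid/impid):

	#include <linux/string.h>
	#include <asm/alternative.h>
	#include <asm/errata_list.h>
	#include <asm/vendorid_list.h>

	/* Hedged sketch only; not the in-tree implementation. */
	static bool errata_cip_453_check(struct alt_entry *alt)
	{
		return alt->errata_id == ERRATA_SIFIVE_CIP_453;
	}

	static const struct errata_checkfunc_id sifive_checkfuncs[] = {
		{ .vendor_id = SIFIVE_VENDOR_ID, .func = errata_cip_453_check },
	};

	void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
				      unsigned long archid, unsigned long impid)
	{
		struct alt_entry *alt;

		for (alt = begin; alt < end; alt++) {
			if (alt->vendor_id != SIFIVE_VENDOR_ID)
				continue;
			if (sifive_checkfuncs[0].func(alt))
				memcpy(alt->old_ptr, alt->alt_ptr, alt->alt_len);
		}
	}
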
index 9c992a8..618d7c5 100644 (file)
@@ -23,6 +23,7 @@
 #define REG_L          __REG_SEL(ld, lw)
 #define REG_S          __REG_SEL(sd, sw)
 #define REG_SC         __REG_SEL(sc.d, sc.w)
+#define REG_ASM                __REG_SEL(.dword, .word)
 #define SZREG          __REG_SEL(8, 4)
 #define LGREG          __REG_SEL(3, 2)
 
index caadfc1..87ac656 100644 (file)
 #define CSR_MIP                        0x344
 #define CSR_PMPCFG0            0x3a0
 #define CSR_PMPADDR0           0x3b0
+#define CSR_MVENDORID          0xf11
+#define CSR_MARCHID            0xf12
+#define CSR_MIMPID             0xf13
 #define CSR_MHARTID            0xf14
 
 #ifdef CONFIG_RISCV_M_MODE
index 5c725e1..f4b490c 100644 (file)
@@ -81,4 +81,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
        int uses_interp);
 #endif /* CONFIG_MMU */
 
+#define ELF_CORE_COPY_REGS(dest, regs)                 \
+do {                                                   \
+       *(struct user_regs_struct *)&(dest) =           \
+               *(struct user_regs_struct *)regs;       \
+} while (0)
+
 #endif /* _ASM_RISCV_ELF_H */
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
new file mode 100644 (file)
index 0000000..5f1046e
--- /dev/null
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Sifive.
+ */
+#ifndef ASM_ERRATA_LIST_H
+#define ASM_ERRATA_LIST_H
+
+#include <asm/alternative.h>
+#include <asm/vendorid_list.h>
+
+#ifdef CONFIG_ERRATA_SIFIVE
+#define        ERRATA_SIFIVE_CIP_453 0
+#define        ERRATA_SIFIVE_CIP_1200 1
+#define        ERRATA_SIFIVE_NUMBER 2
+#endif
+
+#ifdef __ASSEMBLY__
+
+#define ALT_INSN_FAULT(x)                                              \
+ALTERNATIVE(__stringify(RISCV_PTR do_trap_insn_fault),                 \
+           __stringify(RISCV_PTR sifive_cip_453_insn_fault_trp),       \
+           SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453,                    \
+           CONFIG_ERRATA_SIFIVE_CIP_453)
+
+#define ALT_PAGE_FAULT(x)                                              \
+ALTERNATIVE(__stringify(RISCV_PTR do_page_fault),                      \
+           __stringify(RISCV_PTR sifive_cip_453_page_fault_trp),       \
+           SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453,                    \
+           CONFIG_ERRATA_SIFIVE_CIP_453)
+#else /* !__ASSEMBLY__ */
+
+#define ALT_FLUSH_TLB_PAGE(x)                                          \
+asm(ALTERNATIVE("sfence.vma %0", "sfence.vma", SIFIVE_VENDOR_ID,       \
+               ERRATA_SIFIVE_CIP_1200, CONFIG_ERRATA_SIFIVE_CIP_1200)  \
+               : : "r" (addr) : "memory")
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* ASM_ERRATA_LIST_H */
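
For CIP-1200 the workaround simply drops the address operand, turning the
per-page TLB flush into a global one. A hedged view of the two code paths a
caller sees after boot-time patching (the consuming site,
local_flush_tlb_page(), appears further down in this diff):

	/* Unaffected parts keep the old content: flush one page. */
	asm volatile("sfence.vma %0" : : "r" (addr) : "memory");

	/* SiFive parts with CIP-1200 get the new content patched in: an
	 * operand-less sfence.vma, which flushes all translations. */
	asm volatile("sfence.vma" : : "r" (addr) : "memory");
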
index 845002c..04dad33 100644 (file)
 #endif
 #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
 
+/*
+ * Clang prior to 13 had "mcount" instead of "_mcount":
+ * https://reviews.llvm.org/D98881
+ */
+#if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000
+#define MCOUNT_NAME _mcount
+#else
+#define MCOUNT_NAME mcount
+#endif
+
 #define ARCH_SUPPORTS_FTRACE_OPS 1
 #ifndef __ASSEMBLY__
-void _mcount(void);
+void MCOUNT_NAME(void);
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
        return addr;
@@ -36,7 +46,7 @@ struct dyn_arch_ftrace {
  * both auipc and jalr at the same time.
  */
 
-#define MCOUNT_ADDR            ((unsigned long)_mcount)
+#define MCOUNT_ADDR            ((unsigned long)MCOUNT_NAME)
 #define JALR_SIGN_MASK         (0x00000800)
 #define JALR_OFFSET_MASK       (0x00000fff)
 #define AUIPC_OFFSET_MASK      (0xfffff000)
diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
new file mode 100644 (file)
index 0000000..1e95410
--- /dev/null
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#ifndef _RISCV_KEXEC_H
+#define _RISCV_KEXEC_H
+
+#include <asm/page.h>    /* For PAGE_SIZE */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
+
+/* Reserve a page for the control code buffer */
+#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE
+
+#define KEXEC_ARCH KEXEC_ARCH_RISCV
+
+extern void riscv_crash_save_regs(struct pt_regs *newregs);
+
+static inline void
+crash_setup_regs(struct pt_regs *newregs,
+                struct pt_regs *oldregs)
+{
+       if (oldregs)
+               memcpy(newregs, oldregs, sizeof(struct pt_regs));
+       else
+               riscv_crash_save_regs(newregs);
+}
+
+
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+       unsigned long fdt_addr;
+};
+
+extern const unsigned char riscv_kexec_relocate[];
+extern const unsigned int riscv_kexec_relocate_size;
+
+typedef void (*riscv_kexec_method)(unsigned long first_ind_entry,
+                                  unsigned long jump_addr,
+                                  unsigned long fdt_addr,
+                                  unsigned long hartid,
+                                  unsigned long va_pa_off);
+
+extern riscv_kexec_method riscv_kexec_norelocate;
+
+#endif /* _RISCV_KEXEC_H */
index adc9d26..6a7761c 100644 (file)
@@ -90,15 +90,58 @@ typedef struct page *pgtable_t;
 
 #ifdef CONFIG_MMU
 extern unsigned long va_pa_offset;
+#ifdef CONFIG_64BIT
+extern unsigned long va_kernel_pa_offset;
+#endif
+#ifdef CONFIG_XIP_KERNEL
+extern unsigned long va_kernel_xip_pa_offset;
+#endif
 extern unsigned long pfn_base;
 #define ARCH_PFN_OFFSET                (pfn_base)
 #else
 #define va_pa_offset           0
+#ifdef CONFIG_64BIT
+#define va_kernel_pa_offset    0
+#endif
 #define ARCH_PFN_OFFSET                (PAGE_OFFSET >> PAGE_SHIFT)
 #endif /* CONFIG_MMU */
 
-#define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
-#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
+extern unsigned long kernel_virt_addr;
+
+#ifdef CONFIG_64BIT
+#define linear_mapping_pa_to_va(x)     ((void *)((unsigned long)(x) + va_pa_offset))
+#ifdef CONFIG_XIP_KERNEL
+#define kernel_mapping_pa_to_va(y)     ({                                              \
+       unsigned long _y = y;                                                           \
+       (_y >= CONFIG_PHYS_RAM_BASE) ?                                                  \
+               (void *)((unsigned long)(_y) + va_kernel_pa_offset + XIP_OFFSET) :      \
+               (void *)((unsigned long)(_y) + va_kernel_xip_pa_offset);                \
+       })
+#else
+#define kernel_mapping_pa_to_va(x)     ((void *)((unsigned long)(x) + va_kernel_pa_offset))
+#endif
+#define __pa_to_va_nodebug(x)          linear_mapping_pa_to_va(x)
+
+#define linear_mapping_va_to_pa(x)     ((unsigned long)(x) - va_pa_offset)
+#ifdef CONFIG_XIP_KERNEL
+#define kernel_mapping_va_to_pa(y) ({                                          \
+       unsigned long _y = y;                                                   \
+       (_y < kernel_virt_addr + XIP_OFFSET) ?                                  \
+               ((unsigned long)(_y) - va_kernel_xip_pa_offset) :               \
+               ((unsigned long)(_y) - va_kernel_pa_offset - XIP_OFFSET);       \
+       })
+#else
+#define kernel_mapping_va_to_pa(x)     ((unsigned long)(x) - va_kernel_pa_offset)
+#endif
+#define __va_to_pa_nodebug(x)  ({                                              \
+       unsigned long _x = x;                                                   \
+       (_x < kernel_virt_addr) ?                                               \
+               linear_mapping_va_to_pa(_x) : kernel_mapping_va_to_pa(_x);      \
+       })
+#else
+#define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
+#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
+#endif
 
 #ifdef CONFIG_DEBUG_VIRTUAL
 extern phys_addr_t __virt_to_phys(unsigned long x);
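
Since the 64-bit kernel image is no longer part of the linear mapping, a
virtual address must be classified before it can be translated back to a
physical one. A hedged, illustrative walk through the macros above (via the
usual __va()/__pa() wrappers) with CONFIG_XIP_KERNEL disabled; the concrete
physical address is made up for the example:

	unsigned long some_pa = 0x80200000UL;	/* hypothetical RAM address */

	void *lin_va = __va(some_pa);		/* linear_mapping_pa_to_va:
						   pa + va_pa_offset */

	unsigned long pa1 = __pa((unsigned long)lin_va);
						/* < kernel_virt_addr, so the
						   linear_mapping_va_to_pa path */

	unsigned long pa2 = __pa((unsigned long)_start);
						/* >= kernel_virt_addr, so the
						   kernel_mapping_va_to_pa path,
						   via va_kernel_pa_offset */
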
index ebf817c..9469f46 100644 (file)
 
 #include <asm/pgtable-bits.h>
 
-#ifndef __ASSEMBLY__
+#ifndef CONFIG_MMU
+#define KERNEL_LINK_ADDR       PAGE_OFFSET
+#else
 
-/* Page Upper Directory not used in RISC-V */
-#include <asm-generic/pgtable-nopud.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-#include <linux/mm_types.h>
+#define ADDRESS_SPACE_END      (UL(-1))
 
-#ifdef CONFIG_MMU
+#ifdef CONFIG_64BIT
+/* Leave 2GB for kernel and BPF at the end of the address space */
+#define KERNEL_LINK_ADDR       (ADDRESS_SPACE_END - SZ_2G + 1)
+#else
+#define KERNEL_LINK_ADDR       PAGE_OFFSET
+#endif
 
 #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
 #define VMALLOC_END      (PAGE_OFFSET - 1)
 #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
 
 #define BPF_JIT_REGION_SIZE    (SZ_128M)
+#ifdef CONFIG_64BIT
+/* KASLR should leave at least 128MB for BPF after the kernel */
+#define BPF_JIT_REGION_START   PFN_ALIGN((unsigned long)&_end)
+#define BPF_JIT_REGION_END     (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
+#else
 #define BPF_JIT_REGION_START   (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
 #define BPF_JIT_REGION_END     (VMALLOC_END)
+#endif
+
+/* Modules always live before the kernel */
+#ifdef CONFIG_64BIT
+#define MODULES_VADDR  (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
+#define MODULES_END    (PFN_ALIGN((unsigned long)&_start))
+#endif
 
 /*
  * Roughly size the vmemmap space to be large enough to fit enough
 
 #endif
 
+#ifdef CONFIG_XIP_KERNEL
+#define XIP_OFFSET             SZ_8M
+#endif
+
+#ifndef __ASSEMBLY__
+
+/* Page Upper Directory not used in RISC-V */
+#include <asm-generic/pgtable-nopud.h>
+#include <asm/page.h>
+#include <asm/tlbflush.h>
+#include <linux/mm_types.h>
+
 #ifdef CONFIG_64BIT
 #include <asm/pgtable-64.h>
 #else
 #include <asm/pgtable-32.h>
 #endif /* CONFIG_64BIT */
 
+#ifdef CONFIG_XIP_KERNEL
+#define XIP_FIXUP(addr) ({                                                     \
+       uintptr_t __a = (uintptr_t)(addr);                                      \
+       (__a >= CONFIG_XIP_PHYS_ADDR && __a < CONFIG_XIP_PHYS_ADDR + SZ_16M) ?  \
+               __a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\
+               __a;                                                            \
+       })
+#else
+#define XIP_FIXUP(addr)                (addr)
+#endif /* CONFIG_XIP_KERNEL */
+
 #ifdef CONFIG_MMU
 /* Number of entries in the page global directory */
 #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
@@ -484,8 +522,17 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 
 #define kern_addr_valid(addr)   (1) /* FIXME */
 
-extern void *dtb_early_va;
-extern uintptr_t dtb_early_pa;
+extern char _start[];
+extern void *_dtb_early_va;
+extern uintptr_t _dtb_early_pa;
+#if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_MMU)
+#define dtb_early_va   (*(void **)XIP_FIXUP(&_dtb_early_va))
+#define dtb_early_pa   (*(uintptr_t *)XIP_FIXUP(&_dtb_early_pa))
+#else
+#define dtb_early_va   _dtb_early_va
+#define dtb_early_pa   _dtb_early_pa
+#endif /* CONFIG_XIP_KERNEL */
+
 void setup_bootmem(void);
 void paging_init(void);
 void misc_mem_init(void);
index d702741..0d42693 100644 (file)
@@ -97,6 +97,9 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
 
 void sbi_console_putchar(int ch);
 int sbi_console_getchar(void);
+long sbi_get_mvendorid(void);
+long sbi_get_marchid(void);
+long sbi_get_mimpid(void);
 void sbi_set_timer(uint64_t stime_value);
 void sbi_shutdown(void);
 void sbi_clear_ipi(void);
index 1595c5b..8a303fb 100644 (file)
@@ -11,5 +11,6 @@ extern char _start[];
 extern char _start_kernel[];
 extern char __init_data_begin[], __init_data_end[];
 extern char __init_text_begin[], __init_text_end[];
+extern char __alt_start[], __alt_end[];
 
 #endif /* __ASM_SECTIONS_H */
index 6887b3d..086f757 100644 (file)
@@ -26,6 +26,12 @@ static inline void protect_kernel_text_data(void) {}
 static inline int set_memory_rw_nx(unsigned long addr, int numpages) { return 0; }
 #endif
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX)
+void protect_kernel_linear_mapping_text_rodata(void);
+#else
+static inline void protect_kernel_linear_mapping_text_rodata(void) {}
+#endif
+
 int set_direct_map_invalid_noflush(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
 bool kernel_page_present(struct page *page);
index df1f7c4..a7d2811 100644 (file)
@@ -46,7 +46,7 @@ int riscv_hartid_to_cpuid(int hartid);
 void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out);
 
 /* Set custom IPI operations */
-void riscv_set_ipi_ops(struct riscv_ipi_ops *ops);
+void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops);
 
 /* Clear IPI for current CPU */
 void riscv_clear_ipi(void);
@@ -92,7 +92,7 @@ static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in,
        cpumask_set_cpu(boot_cpu_hartid, out);
 }
 
-static inline void riscv_set_ipi_ops(struct riscv_ipi_ops *ops)
+static inline void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops)
 {
 }
 
index 5477e7e..9090493 100644 (file)
@@ -23,5 +23,10 @@ extern asmlinkage void *__memmove(void *, const void *, size_t);
 #define memcpy(dst, src, len) __memcpy(dst, src, len)
 #define memset(s, c, n) __memset(s, c, n)
 #define memmove(dst, src, len) __memmove(dst, src, len)
+
+#ifndef __NO_FORTIFY
+#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
+#endif
+
 #endif
 #endif /* _ASM_RISCV_STRING_H */
index 49350c8..b933b15 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/err.h>
 
 /* The array of function pointers for syscalls. */
-extern void *sys_call_table[];
+extern void * const sys_call_table[];
 
 /*
  * Only the low 32 bits of orig_r0 are meaningful, so we return int.
index 394cfbc..c84218a 100644 (file)
@@ -9,6 +9,7 @@
 
 #include <linux/mm_types.h>
 #include <asm/smp.h>
+#include <asm/errata_list.h>
 
 #ifdef CONFIG_MMU
 static inline void local_flush_tlb_all(void)
@@ -19,7 +20,7 @@ static inline void local_flush_tlb_all(void)
 /* Flush one page from local TLB */
 static inline void local_flush_tlb_page(unsigned long addr)
 {
-       __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
+       ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory"));
 }
 #else /* CONFIG_MMU */
 #define local_flush_tlb_all()                  do { } while (0)
diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h
new file mode 100644 (file)
index 0000000..9d93421
--- /dev/null
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+#ifndef ASM_VENDOR_LIST_H
+#define ASM_VENDOR_LIST_H
+
+#define SIFIVE_VENDOR_ID       0x489
+
+#endif /* ASM_VENDOR_LIST_H */
index 647a47f..d3081e4 100644 (file)
@@ -10,6 +10,10 @@ CFLAGS_REMOVE_sbi.o  = $(CC_FLAGS_FTRACE)
 endif
 CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,)
 
+ifdef CONFIG_KEXEC
+AFLAGS_kexec_relocate.o := -mcmodel=medany -mno-relax
+endif
+
 extra-y += head.o
 extra-y += vmlinux.lds
 
@@ -55,6 +59,8 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o
 endif
 obj-$(CONFIG_HOTPLUG_CPU)      += cpu-hotplug.o
 obj-$(CONFIG_KGDB)             += kgdb.o
+obj-$(CONFIG_KEXEC)            += kexec_relocate.o crash_save_regs.o machine_kexec.o
+obj-$(CONFIG_CRASH_DUMP)       += crash_dump.o
 
 obj-$(CONFIG_JUMP_LABEL)       += jump_label.o
 
diff --git a/arch/riscv/kernel/crash_dump.c b/arch/riscv/kernel/crash_dump.c
new file mode 100644 (file)
index 0000000..86cc0ad
--- /dev/null
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code comes from arch/arm64/kernel/crash_dump.c
+ * Created by: AKASHI Takahiro <takahiro.akashi@linaro.org>
+ * Copyright (C) 2017 Linaro Limited
+ */
+
+#include <linux/crash_dump.h>
+#include <linux/io.h>
+
+/**
+ * copy_oldmem_page() - copy one page from old kernel memory
+ * @pfn: page frame number to be copied
+ * @buf: buffer where the copied page is placed
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page
+ * @userbuf: if set, @buf is in a user address space
+ *
+ * This function copies one page from the old kernel's memory into the buffer
+ * pointed to by @buf. If @buf is in user space, set @userbuf to %1. Returns
+ * the number of bytes copied, or a negative error code on failure.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+                        size_t csize, unsigned long offset,
+                        int userbuf)
+{
+       void *vaddr;
+
+       if (!csize)
+               return 0;
+
+       vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB);
+       if (!vaddr)
+               return -ENOMEM;
+
+       if (userbuf) {
+               if (copy_to_user((char __user *)buf, vaddr + offset, csize)) {
+                       memunmap(vaddr);
+                       return -EFAULT;
+               }
+       } else
+               memcpy(buf, vaddr + offset, csize);
+
+       memunmap(vaddr);
+       return csize;
+}
diff --git a/arch/riscv/kernel/crash_save_regs.S b/arch/riscv/kernel/crash_save_regs.S
new file mode 100644 (file)
index 0000000..7832fb7
--- /dev/null
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#include <asm/asm.h>           /* For RISCV_* and REG_* macros */
+#include <asm/csr.h>           /* For CSR_* macros */
+#include <asm/asm-offsets.h>   /* For offsets on pt_regs */
+#include <linux/linkage.h>     /* For SYM_* macros */
+
+.section ".text"
+SYM_CODE_START(riscv_crash_save_regs)
+       REG_S ra,  PT_RA(a0)    /* x1 */
+       REG_S sp,  PT_SP(a0)    /* x2 */
+       REG_S gp,  PT_GP(a0)    /* x3 */
+       REG_S tp,  PT_TP(a0)    /* x4 */
+       REG_S t0,  PT_T0(a0)    /* x5 */
+       REG_S t1,  PT_T1(a0)    /* x6 */
+       REG_S t2,  PT_T2(a0)    /* x7 */
+       REG_S s0,  PT_S0(a0)    /* x8/fp */
+       REG_S s1,  PT_S1(a0)    /* x9 */
+       REG_S a0,  PT_A0(a0)    /* x10 */
+       REG_S a1,  PT_A1(a0)    /* x11 */
+       REG_S a2,  PT_A2(a0)    /* x12 */
+       REG_S a3,  PT_A3(a0)    /* x13 */
+       REG_S a4,  PT_A4(a0)    /* x14 */
+       REG_S a5,  PT_A5(a0)    /* x15 */
+       REG_S a6,  PT_A6(a0)    /* x16 */
+       REG_S a7,  PT_A7(a0)    /* x17 */
+       REG_S s2,  PT_S2(a0)    /* x18 */
+       REG_S s3,  PT_S3(a0)    /* x19 */
+       REG_S s4,  PT_S4(a0)    /* x20 */
+       REG_S s5,  PT_S5(a0)    /* x21 */
+       REG_S s6,  PT_S6(a0)    /* x22 */
+       REG_S s7,  PT_S7(a0)    /* x23 */
+       REG_S s8,  PT_S8(a0)    /* x24 */
+       REG_S s9,  PT_S9(a0)    /* x25 */
+       REG_S s10, PT_S10(a0)   /* x26 */
+       REG_S s11, PT_S11(a0)   /* x27 */
+       REG_S t3,  PT_T3(a0)    /* x28 */
+       REG_S t4,  PT_T4(a0)    /* x29 */
+       REG_S t5,  PT_T5(a0)    /* x30 */
+       REG_S t6,  PT_T6(a0)    /* x31 */
+
+       csrr t1, CSR_STATUS
+       csrr t2, CSR_EPC
+       csrr t3, CSR_TVAL
+       csrr t4, CSR_CAUSE
+
+       REG_S t1, PT_STATUS(a0)
+       REG_S t2, PT_EPC(a0)
+       REG_S t3, PT_BADADDR(a0)
+       REG_S t4, PT_CAUSE(a0)
+       ret
+SYM_CODE_END(riscv_crash_save_regs)
index 83095fa..80d5a9e 100644 (file)
@@ -12,6 +12,7 @@
 #include <asm/unistd.h>
 #include <asm/thread_info.h>
 #include <asm/asm-offsets.h>
+#include <asm/errata_list.h>
 
 #if !IS_ENABLED(CONFIG_PREEMPTION)
 .set resume_kernel, restore_all
@@ -454,7 +455,7 @@ ENDPROC(__switch_to)
        /* Exception vector table */
 ENTRY(excp_vect_table)
        RISCV_PTR do_trap_insn_misaligned
-       RISCV_PTR do_trap_insn_fault
+       ALT_INSN_FAULT(RISCV_PTR do_trap_insn_fault)
        RISCV_PTR do_trap_insn_illegal
        RISCV_PTR do_trap_break
        RISCV_PTR do_trap_load_misaligned
@@ -465,7 +466,8 @@ ENTRY(excp_vect_table)
        RISCV_PTR do_trap_ecall_s
        RISCV_PTR do_trap_unknown
        RISCV_PTR do_trap_ecall_m
-       RISCV_PTR do_page_fault   /* instruction page fault */
+       /* instruction page fault */
+       ALT_PAGE_FAULT(RISCV_PTR do_page_fault)
        RISCV_PTR do_page_fault   /* load page fault */
        RISCV_PTR do_trap_unknown
        RISCV_PTR do_page_fault   /* store page fault */
index f5a9bad..89cc58a 100644 (file)
@@ -9,11 +9,23 @@
 #include <linux/linkage.h>
 #include <asm/thread_info.h>
 #include <asm/page.h>
+#include <asm/pgtable.h>
 #include <asm/csr.h>
 #include <asm/hwcap.h>
 #include <asm/image.h>
 #include "efi-header.S"
 
+#ifdef CONFIG_XIP_KERNEL
+.macro XIP_FIXUP_OFFSET reg
+       REG_L t0, _xip_fixup
+       add \reg, \reg, t0
+.endm
+_xip_fixup: .dword CONFIG_PHYS_RAM_BASE - CONFIG_XIP_PHYS_ADDR - XIP_OFFSET
+#else
+.macro XIP_FIXUP_OFFSET reg
+.endm
+#endif /* CONFIG_XIP_KERNEL */
+
 __HEAD
 ENTRY(_start)
        /*
@@ -69,7 +81,9 @@ pe_head_start:
 #ifdef CONFIG_MMU
 relocate:
        /* Relocate return address */
-       li a1, PAGE_OFFSET
+       la a1, kernel_virt_addr
+       XIP_FIXUP_OFFSET a1
+       REG_L a1, 0(a1)
        la a2, _start
        sub a1, a1, a2
        add ra, ra, a1
@@ -91,6 +105,7 @@ relocate:
         * to ensure the new translations are in use.
         */
        la a0, trampoline_pg_dir
+       XIP_FIXUP_OFFSET a0
        srl a0, a0, PAGE_SHIFT
        or a0, a0, a1
        sfence.vma
@@ -144,7 +159,9 @@ secondary_start_sbi:
 
        slli a3, a0, LGREG
        la a4, __cpu_up_stack_pointer
+       XIP_FIXUP_OFFSET a4
        la a5, __cpu_up_task_pointer
+       XIP_FIXUP_OFFSET a5
        add a4, a3, a4
        add a5, a3, a5
        REG_L sp, (a4)
@@ -156,6 +173,7 @@ secondary_start_common:
 #ifdef CONFIG_MMU
        /* Enable virtual memory and relocate to virtual address */
        la a0, swapper_pg_dir
+       XIP_FIXUP_OFFSET a0
        call relocate
 #endif
        call setup_trap_vector
@@ -236,12 +254,33 @@ pmp_done:
 .Lgood_cores:
 #endif
 
+#ifndef CONFIG_XIP_KERNEL
        /* Pick one hart to run the main boot sequence */
        la a3, hart_lottery
        li a2, 1
        amoadd.w a3, a2, (a3)
        bnez a3, .Lsecondary_start
 
+#else
+       /* hart_lottery in flash contains a magic number */
+       la a3, hart_lottery
+       mv a2, a3
+       XIP_FIXUP_OFFSET a2
+       lw t1, (a3)
+       amoswap.w t0, t1, (a2)
+       /* first time here if hart_lottery in RAM is not set */
+       beq t0, t1, .Lsecondary_start
+
+       la sp, _end + THREAD_SIZE
+       XIP_FIXUP_OFFSET sp
+       mv s0, a0
+       call __copy_data
+
+       /* Restore a0 copy */
+       mv a0, s0
+#endif
+
+#ifndef CONFIG_XIP_KERNEL
        /* Clear BSS for flat non-ELF images */
        la a3, __bss_start
        la a4, __bss_stop
@@ -251,15 +290,18 @@ clear_bss:
        add a3, a3, RISCV_SZPTR
        blt a3, a4, clear_bss
 clear_bss_done:
-
+#endif
        /* Save hart ID and DTB physical address */
        mv s0, a0
        mv s1, a1
+
        la a2, boot_cpu_hartid
+       XIP_FIXUP_OFFSET a2
        REG_S a0, (a2)
 
        /* Initialize page tables and relocate to virtual addresses */
        la sp, init_thread_union + THREAD_SIZE
+       XIP_FIXUP_OFFSET sp
 #ifdef CONFIG_BUILTIN_DTB
        la a0, __dtb_start
 #else
@@ -268,6 +310,7 @@ clear_bss_done:
        call setup_vm
 #ifdef CONFIG_MMU
        la a0, early_pg_dir
+       XIP_FIXUP_OFFSET a0
        call relocate
 #endif /* CONFIG_MMU */
 
@@ -292,7 +335,9 @@ clear_bss_done:
 
        slli a3, a0, LGREG
        la a1, __cpu_up_stack_pointer
+       XIP_FIXUP_OFFSET a1
        la a2, __cpu_up_task_pointer
+       XIP_FIXUP_OFFSET a2
        add a1, a3, a1
        add a2, a3, a2
 
index b48dda3..aabbc3a 100644 (file)
@@ -12,6 +12,9 @@ extern atomic_t hart_lottery;
 
 asmlinkage void do_page_fault(struct pt_regs *regs);
 asmlinkage void __init setup_vm(uintptr_t dtb_pa);
+#ifdef CONFIG_XIP_KERNEL
+asmlinkage void __init __copy_data(void);
+#endif
 
 extern void *__cpu_up_stack_pointer[];
 extern void *__cpu_up_task_pointer[];
diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S
new file mode 100644 (file)
index 0000000..88c3bea
--- /dev/null
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#include <asm/asm.h>   /* For RISCV_* and REG_* macros */
+#include <asm/csr.h>   /* For CSR_* macros */
+#include <asm/page.h>  /* For PAGE_SIZE */
+#include <linux/linkage.h> /* For SYM_* macros */
+
+.section ".rodata"
+SYM_CODE_START(riscv_kexec_relocate)
+
+       /*
+        * s0: Pointer to the current entry
+        * s1: (const) Phys address to jump to after relocation
+        * s2: (const) Phys address of the FDT image
+        * s3: (const) The hartid of the current hart
+        * s4: Pointer to the destination address for the relocation
+        * s5: (const) Number of words per page
+        * s6: (const) 1, used for subtraction
+        * s7: (const) va_pa_offset, used when switching MMU off
+        * s8: (const) Physical address of the main loop
+        * s9: (debug) indirection page counter
+        * s10: (debug) entry counter
+        * s11: (debug) copied words counter
+        */
+       mv      s0, a0
+       mv      s1, a1
+       mv      s2, a2
+       mv      s3, a3
+       mv      s4, zero
+       li      s5, (PAGE_SIZE / RISCV_SZPTR)
+       li      s6, 1
+       mv      s7, a4
+       mv      s8, zero
+       mv      s9, zero
+       mv      s10, zero
+       mv      s11, zero
+
+       /* Disable / cleanup interrupts */
+       csrw    CSR_SIE, zero
+       csrw    CSR_SIP, zero
+
+       /*
+        * When we switch SATP.MODE to "Bare" we'll only
+        * play with physical addresses. However, the first time
+        * we try to jump somewhere, the offset of the jump
+        * will be relative to pc, which will still be a VA. To
+        * deal with this we set stvec to the physical address at
+        * the start of the loop below so that we jump there in
+        * any case.
+        */
+       la      s8, 1f
+       sub     s8, s8, s7
+       csrw    CSR_STVEC, s8
+
+       /* Process entries in a loop */
+.align 2
+1:
+       addi    s10, s10, 1
+       REG_L   t0, 0(s0)               /* t0 = *image->entry */
+       addi    s0, s0, RISCV_SZPTR     /* image->entry++ */
+
+       /* IND_DESTINATION entry ? -> save destination address */
+       andi    t1, t0, 0x1
+       beqz    t1, 2f
+       andi    s4, t0, ~0x1
+       j       1b
+
+2:
+       /* IND_INDIRECTION entry ? -> update next entry ptr (PA) */
+       andi    t1, t0, 0x2
+       beqz    t1, 2f
+       andi    s0, t0, ~0x2
+       addi    s9, s9, 1
+       csrw    CSR_SATP, zero
+       jalr    zero, s8, 0
+
+2:
+       /* IND_DONE entry ? -> jump to done label */
+       andi    t1, t0, 0x4
+       beqz    t1, 2f
+       j       4f
+
+2:
+       /*
+        * IND_SOURCE entry ? -> copy page word by word to the
+        * destination address we got from IND_DESTINATION
+        */
+       andi    t1, t0, 0x8
+       beqz    t1, 1b          /* Unknown entry type, ignore it */
+       andi    t0, t0, ~0x8
+       mv      t3, s5          /* i = num words per page */
+3:     /* copy loop */
+       REG_L   t1, (t0)        /* t1 = *src_ptr */
+       REG_S   t1, (s4)        /* *dst_ptr = *src_ptr */
+       addi    t0, t0, RISCV_SZPTR /* src_ptr++ */
+       addi    s4, s4, RISCV_SZPTR /* dst_ptr++ */
+       sub     t3, t3, s6      /* i-- */
+       addi    s11, s11, 1     /* c++ */
+       beqz    t3, 1b          /* copy done ? */
+       j       3b
+
+4:
+       /* Pass the arguments to the next kernel / Cleanup */
+       mv      a0, s3
+       mv      a1, s2
+       mv      a2, s1
+
+       /* Cleanup */
+       mv      a3, zero
+       mv      a4, zero
+       mv      a5, zero
+       mv      a6, zero
+       mv      a7, zero
+
+       mv      s0, zero
+       mv      s1, zero
+       mv      s2, zero
+       mv      s3, zero
+       mv      s4, zero
+       mv      s5, zero
+       mv      s6, zero
+       mv      s7, zero
+       mv      s8, zero
+       mv      s9, zero
+       mv      s10, zero
+       mv      s11, zero
+
+       mv      t0, zero
+       mv      t1, zero
+       mv      t2, zero
+       mv      t3, zero
+       mv      t4, zero
+       mv      t5, zero
+       mv      t6, zero
+       csrw    CSR_SEPC, zero
+       csrw    CSR_SCAUSE, zero
+       csrw    CSR_SSCRATCH, zero
+
+       /*
+        * Make sure the relocated code is visible
+        * and jump to the new kernel
+        */
+       fence.i
+
+       jalr    zero, a2, 0
+
+SYM_CODE_END(riscv_kexec_relocate)
+riscv_kexec_relocate_end:
+
+
+/* Used for jumping to crashkernel */
+.section ".text"
+SYM_CODE_START(riscv_kexec_norelocate)
+       /*
+        * s0: (const) Phys address to jump to
+        * s1: (const) Phys address of the FDT image
+        * s2: (const) The hartid of the current hart
+        * s3: (const) va_pa_offset, used when switching MMU off
+        */
+       mv      s0, a1
+       mv      s1, a2
+       mv      s2, a3
+       mv      s3, a4
+
+       /* Disable / cleanup interrupts */
+       csrw    CSR_SIE, zero
+       csrw    CSR_SIP, zero
+
+       /* Switch to physical addressing */
+       la      s4, 1f
+       sub     s4, s4, s3
+       csrw    CSR_STVEC, s4
+       csrw    CSR_SATP, zero
+
+.align 2
+1:
+       /* Pass the arguments to the next kernel / Cleanup */
+       mv      a0, s2
+       mv      a1, s1
+       mv      a2, s0
+
+       /* Cleanup */
+       mv      a3, zero
+       mv      a4, zero
+       mv      a5, zero
+       mv      a6, zero
+       mv      a7, zero
+
+       mv      s0, zero
+       mv      s1, zero
+       mv      s2, zero
+       mv      s3, zero
+       mv      s4, zero
+       mv      s5, zero
+       mv      s6, zero
+       mv      s7, zero
+       mv      s8, zero
+       mv      s9, zero
+       mv      s10, zero
+       mv      s11, zero
+
+       mv      t0, zero
+       mv      t1, zero
+       mv      t2, zero
+       mv      t3, zero
+       mv      t4, zero
+       mv      t5, zero
+       mv      t6, zero
+       csrw    CSR_SEPC, zero
+       csrw    CSR_SCAUSE, zero
+       csrw    CSR_SSCRATCH, zero
+
+       jalr    zero, a2, 0
+SYM_CODE_END(riscv_kexec_norelocate)
+
+.section ".rodata"
+SYM_DATA(riscv_kexec_relocate_size,
+       .long riscv_kexec_relocate_end - riscv_kexec_relocate)
+
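
The flag tests in the relocation loop above (0x1/0x2/0x4/0x8) decode the
generic kimage entry list. For reference, a hedged C-level restatement of the
same walk; the IND_* values are the real ones from include/linux/kexec.h
(which is why the assembly can test them with plain andi), but the function
itself is illustrative, not in-tree code:

	#include <linux/kexec.h>	/* IND_DESTINATION, IND_INDIRECTION, ... */

	static void walk_kimage_entries(unsigned long *entry)
	{
		unsigned long dest = 0;

		for (;;) {
			unsigned long e = *entry++;

			if (e & IND_DESTINATION) {		/* 0x1 */
				dest = e & PAGE_MASK;		/* next copies land here */
			} else if (e & IND_INDIRECTION) {	/* 0x2 */
				entry = (unsigned long *)(e & PAGE_MASK);
			} else if (e & IND_DONE) {		/* 0x4 */
				break;				/* hand off to the new kernel */
			} else if (e & IND_SOURCE) {		/* 0x8 */
				copy_page((void *)dest, (void *)(e & PAGE_MASK));
				dest += PAGE_SIZE;
			}
			/* unknown entry types are skipped, as in the assembly */
		}
	}
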
diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c
new file mode 100644 (file)
index 0000000..cc04814
--- /dev/null
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#include <linux/kexec.h>
+#include <asm/kexec.h>         /* For riscv_kexec_* symbol defines */
+#include <linux/smp.h>         /* For smp_send_stop () */
+#include <asm/cacheflush.h>    /* For local_flush_icache_all() */
+#include <asm/barrier.h>       /* For smp_wmb() */
+#include <asm/page.h>          /* For PAGE_MASK */
+#include <linux/libfdt.h>      /* For fdt_check_header() */
+#include <asm/set_memory.h>    /* For set_memory_x() */
+#include <linux/compiler.h>    /* For unreachable() */
+#include <linux/cpu.h>         /* For cpu_down() */
+
+/**
+ * kexec_image_info - Print received image details
+ * @image: the kexec image whose details are printed
+ */
+static void
+kexec_image_info(const struct kimage *image)
+{
+       unsigned long i;
+
+       pr_debug("Kexec image info:\n");
+       pr_debug("\ttype:        %d\n", image->type);
+       pr_debug("\tstart:       %lx\n", image->start);
+       pr_debug("\thead:        %lx\n", image->head);
+       pr_debug("\tnr_segments: %lu\n", image->nr_segments);
+
+       for (i = 0; i < image->nr_segments; i++) {
+               pr_debug("\t    segment[%lu]: %016lx - %016lx", i,
+                       image->segment[i].mem,
+                       image->segment[i].mem + image->segment[i].memsz);
+               pr_debug("\t\t0x%lx bytes, %lu pages\n",
+                       (unsigned long) image->segment[i].memsz,
+                       (unsigned long) image->segment[i].memsz /  PAGE_SIZE);
+       }
+}
+
+/**
+ * machine_kexec_prepare - Initialize kexec
+ *
+ * This function is called from do_kexec_load, when the user has
+ * provided us with an image to be loaded. Its goal is to validate
+ * the image and prepare the control code buffer as needed.
+ * Note that kimage_alloc_init has already been called and the
+ * control buffer has already been allocated.
+ */
+int
+machine_kexec_prepare(struct kimage *image)
+{
+       struct kimage_arch *internal = &image->arch;
+       struct fdt_header fdt = {0};
+       void *control_code_buffer = NULL;
+       unsigned int control_code_buffer_sz = 0;
+       int i = 0;
+
+       kexec_image_info(image);
+
+       /* Find the Flattened Device Tree and save its physical address */
+       for (i = 0; i < image->nr_segments; i++) {
+               if (image->segment[i].memsz <= sizeof(fdt))
+                       continue;
+
+               if (copy_from_user(&fdt, image->segment[i].buf, sizeof(fdt)))
+                       continue;
+
+               if (fdt_check_header(&fdt))
+                       continue;
+
+               internal->fdt_addr = (unsigned long) image->segment[i].mem;
+               break;
+       }
+
+       if (!internal->fdt_addr) {
+               pr_err("Device tree not included in the provided image\n");
+               return -EINVAL;
+       }
+
+       /* Copy the assembler code for relocation to the control page */
+       if (image->type != KEXEC_TYPE_CRASH) {
+               control_code_buffer = page_address(image->control_code_page);
+               control_code_buffer_sz = page_size(image->control_code_page);
+
+               if (unlikely(riscv_kexec_relocate_size > control_code_buffer_sz)) {
+                       pr_err("Relocation code doesn't fit within a control page\n");
+                       return -EINVAL;
+               }
+
+               memcpy(control_code_buffer, riscv_kexec_relocate,
+                       riscv_kexec_relocate_size);
+
+               /* Mark the control page executable */
+               set_memory_x((unsigned long) control_code_buffer, 1);
+       }
+
+       return 0;
+}
+
+
+/**
+ * machine_kexec_cleanup - Cleanup any leftovers from
+ *                        machine_kexec_prepare
+ *
+ * This function is called by kimage_free to handle any arch-specific
+ * allocations done in machine_kexec_prepare. Since we didn't do any
+ * allocations there, this is just an empty function. Note that the
+ * control buffer is freed by kimage_free.
+ */
+void
+machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+
+/*
+ * machine_shutdown - Prepare for a kexec reboot
+ *
+ * This function is called by kernel_kexec just before machine_kexec
+ * below. Its goal is to prepare the rest of the system (the other
+ * harts and possibly devices, etc.) for a kexec reboot.
+ */
+void machine_shutdown(void)
+{
+       /*
+        * No more interrupts on this hart
+        * until we are back up.
+        */
+       local_irq_disable();
+
+#if defined(CONFIG_HOTPLUG_CPU)
+       smp_shutdown_nonboot_cpus(smp_processor_id());
+#endif
+}
+
+/**
+ * machine_crash_shutdown - Prepare to kexec after a kernel crash
+ *
+ * This function is called by crash_kexec just before machine_kexec
+ * below; its goal is similar to machine_shutdown, but in case of a
+ * kernel crash. It saves the registers of the crashing context and
+ * then performs the same shutdown sequence as machine_shutdown.
+ */
+void
+machine_crash_shutdown(struct pt_regs *regs)
+{
+       crash_save_cpu(regs, smp_processor_id());
+       machine_shutdown();
+       pr_info("Starting crashdump kernel...\n");
+}
+
+/**
+ * machine_kexec - Jump to the loaded kimage
+ *
+ * This function is called by kernel_kexec which is called by the
+ * reboot system call when the reboot cmd is LINUX_REBOOT_CMD_KEXEC,
+ * or by crash_kexec which is called by the kernel's arch-specific
+ * trap handler in case of a kernel panic. It's the final stage of
+ * the kexec process where the pre-loaded kimage is ready to be
+ * executed. We assume at this point that all other harts are
+ * suspended and this hart will be the new boot hart.
+ */
+void __noreturn
+machine_kexec(struct kimage *image)
+{
+       struct kimage_arch *internal = &image->arch;
+       unsigned long jump_addr = (unsigned long) image->start;
+       unsigned long first_ind_entry = (unsigned long) &image->head;
+       unsigned long this_hart_id = raw_smp_processor_id();
+       unsigned long fdt_addr = internal->fdt_addr;
+       void *control_code_buffer = page_address(image->control_code_page);
+       riscv_kexec_method kexec_method = NULL;
+
+       if (image->type != KEXEC_TYPE_CRASH)
+               kexec_method = control_code_buffer;
+       else
+               kexec_method = (riscv_kexec_method) &riscv_kexec_norelocate;
+
+       pr_notice("Will call new kernel at %08lx from hart id %lx\n",
+                 jump_addr, this_hart_id);
+       pr_notice("FDT image at %08lx\n", fdt_addr);
+
+       /* Make sure the relocation code is visible to the hart */
+       local_flush_icache_all();
+
+       /* Jump to the relocation code */
+       pr_notice("Bye...\n");
+       kexec_method(first_ind_entry, jump_addr, fdt_addr,
+                    this_hart_id, va_pa_offset);
+       unreachable();
+}
index 8a5593f..6d46268 100644 (file)
@@ -47,8 +47,8 @@
 
 ENTRY(ftrace_stub)
 #ifdef CONFIG_DYNAMIC_FTRACE
-       .global _mcount
-       .set    _mcount, ftrace_stub
+       .global MCOUNT_NAME
+       .set    MCOUNT_NAME, ftrace_stub
 #endif
        ret
 ENDPROC(ftrace_stub)
@@ -78,7 +78,7 @@ ENDPROC(return_to_handler)
 #endif
 
 #ifndef CONFIG_DYNAMIC_FTRACE
-ENTRY(_mcount)
+ENTRY(MCOUNT_NAME)
        la      t4, ftrace_stub
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        la      t0, ftrace_graph_return
@@ -124,6 +124,6 @@ do_trace:
        jalr    t5
        RESTORE_ABI_STATE
        ret
-ENDPROC(_mcount)
+ENDPROC(MCOUNT_NAME)
 #endif
-EXPORT_SYMBOL(_mcount)
+EXPORT_SYMBOL(MCOUNT_NAME)
index 104fba8..68a9e3d 100644 (file)
@@ -408,13 +408,11 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 }
 
 #if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-#define VMALLOC_MODULE_START \
-        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
 void *module_alloc(unsigned long size)
 {
-       return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
-                                   VMALLOC_END, GFP_KERNEL,
-                                   PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+       return __vmalloc_node_range(size, 1, MODULES_VADDR,
+                                   MODULES_END, GFP_KERNEL,
+                                   PAGE_KERNEL, 0, NUMA_NO_NODE,
                                    __builtin_return_address(0));
 }
 #endif
index 7e2c78e..10b965c 100644 (file)
@@ -84,6 +84,14 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
        return 0;
 }
 
+void *alloc_insn_page(void)
+{
+       return  __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
+                                    GFP_KERNEL, PAGE_KERNEL_READ_EXEC,
+                                    VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+                                    __builtin_return_address(0));
+}
+
 /* install breakpoint in text */
 void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
@@ -260,8 +268,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr)
 
                if (kcb->kprobe_status == KPROBE_REENTER)
                        restore_previous_kprobe(kcb);
-               else
+               else {
+                       kprobes_restore_local_irqflag(kcb, regs);
                        reset_current_kprobe();
+               }
 
                break;
        case KPROBE_HIT_ACTIVE:
index d3bf756..7402a41 100644 (file)
 #include <asm/smp.h>
 
 /* default SBI version is 0.1 */
-unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT;
+unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT;
 EXPORT_SYMBOL(sbi_spec_version);
 
-static void (*__sbi_set_timer)(uint64_t stime);
-static int (*__sbi_send_ipi)(const unsigned long *hart_mask);
+static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init;
+static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init;
 static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask,
                           unsigned long start, unsigned long size,
-                          unsigned long arg4, unsigned long arg5);
+                          unsigned long arg4, unsigned long arg5) __ro_after_init;
 
 struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
                        unsigned long arg1, unsigned long arg2,
@@ -547,6 +547,21 @@ static inline long sbi_get_firmware_version(void)
        return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_VERSION);
 }
 
+long sbi_get_mvendorid(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_MVENDORID);
+}
+
+long sbi_get_marchid(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_MARCHID);
+}
+
+long sbi_get_mimpid(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_MIMPID);
+}
+
 static void sbi_send_cpumask_ipi(const struct cpumask *target)
 {
        struct cpumask hartid_mask;
@@ -556,7 +571,7 @@ static void sbi_send_cpumask_ipi(const struct cpumask *target)
        sbi_send_ipi(cpumask_bits(&hartid_mask));
 }
 
-static struct riscv_ipi_ops sbi_ipi_ops = {
+static const struct riscv_ipi_ops sbi_ipi_ops = {
        .ipi_inject = sbi_send_cpumask_ipi
 };
 
@@ -577,19 +592,19 @@ void __init sbi_init(void)
                        sbi_get_firmware_id(), sbi_get_firmware_version());
                if (sbi_probe_extension(SBI_EXT_TIME) > 0) {
                        __sbi_set_timer = __sbi_set_timer_v02;
-                       pr_info("SBI v0.2 TIME extension detected\n");
+                       pr_info("SBI TIME extension detected\n");
                } else {
                        __sbi_set_timer = __sbi_set_timer_v01;
                }
                if (sbi_probe_extension(SBI_EXT_IPI) > 0) {
                        __sbi_send_ipi  = __sbi_send_ipi_v02;
-                       pr_info("SBI v0.2 IPI extension detected\n");
+                       pr_info("SBI IPI extension detected\n");
                } else {
                        __sbi_send_ipi  = __sbi_send_ipi_v01;
                }
                if (sbi_probe_extension(SBI_EXT_RFENCE) > 0) {
                        __sbi_rfence    = __sbi_rfence_v02;
-                       pr_info("SBI v0.2 RFENCE extension detected\n");
+                       pr_info("SBI RFENCE extension detected\n");
                } else {
                        __sbi_rfence    = __sbi_rfence_v01;
                }
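
These three accessors expose the machine-level ID CSRs to S-mode through SBI.
A hedged sketch of how boot code can use them to gate vendor-specific errata
handling (boot_cpu_is_sifive is a hypothetical helper; the in-tree consumer is
the alternatives patching elsewhere in this merge):

	#include <linux/init.h>
	#include <asm/sbi.h>
	#include <asm/vendorid_list.h>

	/* Illustrative only: identify a SiFive part without touching
	 * the M-mode mvendorid CSR directly. */
	static bool __init boot_cpu_is_sifive(void)
	{
		return sbi_get_mvendorid() == SIFIVE_VENDOR_ID;
	}
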
index f8f1533..03901d3 100644 (file)
 #include <linux/swiotlb.h>
 #include <linux/smp.h>
 #include <linux/efi.h>
+#include <linux/crash_dump.h>
 
 #include <asm/cpu_ops.h>
 #include <asm/early_ioremap.h>
+#include <asm/pgtable.h>
 #include <asm/setup.h>
 #include <asm/set_memory.h>
 #include <asm/sections.h>
@@ -50,7 +52,11 @@ struct screen_info screen_info __section(".data") = {
  * This is used before the kernel initializes the BSS so it can't be in the
  * BSS.
  */
-atomic_t hart_lottery __section(".sdata");
+atomic_t hart_lottery __section(".sdata")
+#ifdef CONFIG_XIP_KERNEL
+= ATOMIC_INIT(0xC001BEEF)
+#endif
+;
 unsigned long boot_cpu_hartid;
 static DEFINE_PER_CPU(struct cpu, cpu_devices);
 
@@ -60,10 +66,14 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
  * also add "System RAM" regions for compatibility with other
  * archs, and the rest of the known regions for completeness.
  */
+static struct resource kimage_res = { .name = "Kernel image", };
 static struct resource code_res = { .name = "Kernel code", };
 static struct resource data_res = { .name = "Kernel data", };
 static struct resource rodata_res = { .name = "Kernel rodata", };
 static struct resource bss_res = { .name = "Kernel bss", };
+#ifdef CONFIG_CRASH_DUMP
+static struct resource elfcorehdr_res = { .name = "ELF Core hdr", };
+#endif
 
 static int __init add_resource(struct resource *parent,
                                struct resource *res)
@@ -80,45 +90,54 @@ static int __init add_resource(struct resource *parent,
        return 1;
 }
 
-static int __init add_kernel_resources(struct resource *res)
+static int __init add_kernel_resources(void)
 {
        int ret = 0;
 
        /*
         * The memory region of the kernel image is continuous and
-        * was reserved on setup_bootmem, find it here and register
-        * it as a resource, then register the various segments of
-        * the image as child nodes
+        * was reserved in setup_bootmem, register it here as a
+        * resource, with the various segments of the image as
+        * child nodes.
         */
-       if (!(res->start <= code_res.start && res->end >= data_res.end))
-               return 0;
 
-       res->name = "Kernel image";
-       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+       code_res.start = __pa_symbol(_text);
+       code_res.end = __pa_symbol(_etext) - 1;
+       code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-       /*
-        * We removed a part of this region on setup_bootmem so
-        * we need to expand the resource for the bss to fit in.
-        */
-       res->end = bss_res.end;
+       rodata_res.start = __pa_symbol(__start_rodata);
+       rodata_res.end = __pa_symbol(__end_rodata) - 1;
+       rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-       ret = add_resource(&iomem_resource, res);
+       data_res.start = __pa_symbol(_data);
+       data_res.end = __pa_symbol(_edata) - 1;
+       data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+       bss_res.start = __pa_symbol(__bss_start);
+       bss_res.end = __pa_symbol(__bss_stop) - 1;
+       bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+       kimage_res.start = code_res.start;
+       kimage_res.end = bss_res.end;
+       kimage_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+       ret = add_resource(&iomem_resource, &kimage_res);
        if (ret < 0)
                return ret;
 
-       ret = add_resource(res, &code_res);
+       ret = add_resource(&kimage_res, &code_res);
        if (ret < 0)
                return ret;
 
-       ret = add_resource(res, &rodata_res);
+       ret = add_resource(&kimage_res, &rodata_res);
        if (ret < 0)
                return ret;
 
-       ret = add_resource(res, &data_res);
+       ret = add_resource(&kimage_res, &data_res);
        if (ret < 0)
                return ret;
 
-       ret = add_resource(res, &bss_res);
+       ret = add_resource(&kimage_res, &bss_res);
 
        return ret;
 }
@@ -129,54 +148,59 @@ static void __init init_resources(void)
        struct resource *res = NULL;
        struct resource *mem_res = NULL;
        size_t mem_res_sz = 0;
-       int ret = 0, i = 0;
-
-       code_res.start = __pa_symbol(_text);
-       code_res.end = __pa_symbol(_etext) - 1;
-       code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-       rodata_res.start = __pa_symbol(__start_rodata);
-       rodata_res.end = __pa_symbol(__end_rodata) - 1;
-       rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-       data_res.start = __pa_symbol(_data);
-       data_res.end = __pa_symbol(_edata) - 1;
-       data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-       bss_res.start = __pa_symbol(__bss_start);
-       bss_res.end = __pa_symbol(__bss_stop) - 1;
-       bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+       int num_resources = 0, res_idx = 0;
+       int ret = 0;
 
        /* + 1 as memblock_alloc() might increase memblock.reserved.cnt */
-       mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt + 1) * sizeof(*mem_res);
+       num_resources = memblock.memory.cnt + memblock.reserved.cnt + 1;
+       res_idx = num_resources - 1;
+
+       mem_res_sz = num_resources * sizeof(*mem_res);
        mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES);
        if (!mem_res)
                panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz);
+
        /*
         * Start by adding the reserved regions, if they overlap
         * with /memory regions, insert_resource later on will take
         * care of it.
         */
+       ret = add_kernel_resources();
+       if (ret < 0)
+               goto error;
+
+#ifdef CONFIG_KEXEC_CORE
+       if (crashk_res.start != crashk_res.end) {
+               ret = add_resource(&iomem_resource, &crashk_res);
+               if (ret < 0)
+                       goto error;
+       }
+#endif
+
+#ifdef CONFIG_CRASH_DUMP
+       if (elfcorehdr_size > 0) {
+               elfcorehdr_res.start = elfcorehdr_addr;
+               elfcorehdr_res.end = elfcorehdr_addr + elfcorehdr_size - 1;
+               elfcorehdr_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+               add_resource(&iomem_resource, &elfcorehdr_res);
+       }
+#endif
+
        for_each_reserved_mem_region(region) {
-               res = &mem_res[i++];
+               res = &mem_res[res_idx--];
 
                res->name = "Reserved";
                res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region));
                res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1;
 
-               ret = add_kernel_resources(res);
-               if (ret < 0)
-                       goto error;
-               else if (ret)
-                       continue;
-
                /*
                 * Ignore any other reserved regions within
                 * system memory.
                 */
                if (memblock_is_memory(res->start)) {
-                       memblock_free((phys_addr_t) res, sizeof(struct resource));
+                       /* Re-use this pre-allocated resource */
+                       res_idx++;
                        continue;
                }
 
@@ -187,7 +211,7 @@ static void __init init_resources(void)
 
        /* Add /memory regions to the resource tree */
        for_each_mem_region(region) {
-               res = &mem_res[i++];
+               res = &mem_res[res_idx--];
 
                if (unlikely(memblock_is_nomap(region))) {
                        res->name = "Reserved";
@@ -205,6 +229,9 @@ static void __init init_resources(void)
                        goto error;
        }
 
+       /* Clean-up any unused pre-allocated resources */
+       mem_res_sz = (num_resources - res_idx + 1) * sizeof(*mem_res);
+       memblock_free((phys_addr_t) mem_res, mem_res_sz);
        return;
 
  error:
@@ -251,21 +278,24 @@ void __init setup_arch(char **cmdline_p)
        efi_init();
        setup_bootmem();
        paging_init();
-       init_resources();
 #if IS_ENABLED(CONFIG_BUILTIN_DTB)
        unflatten_and_copy_device_tree();
 #else
-       if (early_init_dt_verify(__va(dtb_early_pa)))
+       if (early_init_dt_verify(__va(XIP_FIXUP(dtb_early_pa))))
                unflatten_device_tree();
        else
                pr_err("No DTB found in kernel mappings\n");
 #endif
        misc_mem_init();
 
+       init_resources();
        sbi_init();
 
-       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) {
                protect_kernel_text_data();
+               protect_kernel_linear_mapping_text_rodata();
+       }
+
 #ifdef CONFIG_SWIOTLB
        swiotlb_init(1);
 #endif
index ea028d9..921d9d7 100644 (file)
@@ -9,6 +9,7 @@
  */
 
 #include <linux/cpu.h>
+#include <linux/clockchips.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/profile.h>
@@ -27,10 +28,11 @@ enum ipi_message_type {
        IPI_CALL_FUNC,
        IPI_CPU_STOP,
        IPI_IRQ_WORK,
+       IPI_TIMER,
        IPI_MAX
 };
 
-unsigned long __cpuid_to_hartid_map[NR_CPUS] = {
+unsigned long __cpuid_to_hartid_map[NR_CPUS] __ro_after_init = {
        [0 ... NR_CPUS-1] = INVALID_HARTID
 };
 
@@ -54,7 +56,7 @@ int riscv_hartid_to_cpuid(int hartid)
                        return i;
 
        pr_err("Couldn't find cpu id for hartid [%d]\n", hartid);
-       return i;
+       return -ENOENT;
 }
 
 void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out)
@@ -85,9 +87,9 @@ static void ipi_stop(void)
                wait_for_interrupt();
 }
 
-static struct riscv_ipi_ops *ipi_ops;
+static const struct riscv_ipi_ops *ipi_ops __ro_after_init;
 
-void riscv_set_ipi_ops(struct riscv_ipi_ops *ops)
+void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops)
 {
        ipi_ops = ops;
 }
@@ -176,6 +178,12 @@ void handle_IPI(struct pt_regs *regs)
                        irq_work_run();
                }
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+               if (ops & (1 << IPI_TIMER)) {
+                       stats[IPI_TIMER]++;
+                       tick_receive_broadcast();
+               }
+#endif
                BUG_ON((ops >> IPI_MAX) != 0);
 
                /* Order data access and bit testing. */
@@ -192,6 +200,7 @@ static const char * const ipi_names[] = {
        [IPI_CALL_FUNC]         = "Function call interrupts",
        [IPI_CPU_STOP]          = "CPU stop interrupts",
        [IPI_IRQ_WORK]          = "IRQ work interrupts",
+       [IPI_TIMER]             = "Timer broadcast interrupts",
 };
 
 void show_ipi_stats(struct seq_file *p, int prec)
@@ -217,6 +226,13 @@ void arch_send_call_function_single_ipi(int cpu)
        send_ipi_single(cpu, IPI_CALL_FUNC);
 }
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+void tick_broadcast(const struct cpumask *mask)
+{
+       send_ipi_mask(mask, IPI_TIMER);
+}
+#endif
+
 void smp_send_stop(void)
 {
        unsigned long timeout;
index 5e276c2..9a408e2 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/sections.h>
 #include <asm/sbi.h>
 #include <asm/smp.h>
+#include <asm/alternative.h>
 
 #include "head.h"
 
@@ -40,6 +41,9 @@ static DECLARE_COMPLETION(cpu_running);
 void __init smp_prepare_boot_cpu(void)
 {
        init_cpu_topology();
+#ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE
+       apply_boot_alternatives();
+#endif
 }
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
index f1ead9d..a63c667 100644 (file)
@@ -13,7 +13,7 @@
 #undef __SYSCALL
 #define __SYSCALL(nr, call)    [nr] = (call),
 
-void *sys_call_table[__NR_syscalls] = {
+void * const sys_call_table[__NR_syscalls] = {
        [0 ... __NR_syscalls - 1] = sys_ni_syscall,
 #include <asm/unistd.h>
 };
index 1b43226..8217b0f 100644 (file)
@@ -11,7 +11,7 @@
 #include <asm/processor.h>
 #include <asm/timex.h>
 
-unsigned long riscv_timebase;
+unsigned long riscv_timebase __ro_after_init;
 EXPORT_SYMBOL_GPL(riscv_timebase);
 
 void __init time_init(void)
index 1357abf..0721b97 100644 (file)
@@ -25,8 +25,6 @@
 
 int show_unhandled_signals = 1;
 
-extern asmlinkage void handle_exception(void);
-
 static DEFINE_SPINLOCK(die_lock);
 
 void die(struct pt_regs *regs, const char *str)
@@ -197,6 +195,6 @@ int is_valid_bugaddr(unsigned long pc)
 #endif /* CONFIG_GENERIC_BUG */
 
 /* stvec & scratch is already set from head.S */
-void trap_init(void)
+void __init trap_init(void)
 {
 }
index 3f1d35e..25a3b88 100644 (file)
@@ -20,8 +20,8 @@
 
 extern char vdso_start[], vdso_end[];
 
-static unsigned int vdso_pages;
-static struct page **vdso_pagelist;
+static unsigned int vdso_pages __ro_after_init;
+static struct page **vdso_pagelist __ro_after_init;
 
 /*
  * The vDSO data page.
index 71a315e..24d936c 100644 (file)
@@ -23,7 +23,7 @@ ifneq ($(c-gettimeofday-y),)
 endif
 
 # Build rules
-targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-dummy.o
+targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-syms.S
 obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
 
 obj-y += vdso.o vdso-syms.o
@@ -41,11 +41,10 @@ KASAN_SANITIZE := n
 $(obj)/vdso.o: $(obj)/vdso.so
 
 # link rule for the .so file, .lds has to be first
-SYSCFLAGS_vdso.so.dbg = $(c_flags)
-$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE
+$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE
        $(call if_changed,vdsold)
-SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \
-       -Wl,--build-id=sha1 -Wl,--hash-style=both
+LDFLAGS_vdso.so.dbg = -shared -s -soname=linux-vdso.so.1 \
+       --build-id=sha1 --hash-style=both --eh-frame-hdr
 
 # We also create a special relocatable object that should mirror the symbol
 # table and layout of the linked DSO. With ld --just-symbols we can then
@@ -60,13 +59,10 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
 
 # actual build commands
 # The DSO images are built using a special linker script
-# Add -lgcc so rv32 gets static muldi3 and lshrdi3 definitions.
 # Make sure only to export the intended __vdso_xxx symbol offsets.
 quiet_cmd_vdsold = VDSOLD  $@
-      cmd_vdsold = $(CC) $(KBUILD_CFLAGS) $(call cc-option, -no-pie) -nostdlib -nostartfiles $(SYSCFLAGS_$(@F)) \
-                           -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp && \
-                   $(CROSS_COMPILE)objcopy \
-                           $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \
+      cmd_vdsold = $(LD) $(ld_flags) -T $(filter-out FORCE,$^) -o $@.tmp && \
+                   $(OBJCOPY) $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \
                    rm $@.tmp
 
 # Extracts symbol offsets from the VDSO, converting them into an assembly file
diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S
new file mode 100644 (file)
index 0000000..4b29b99
--- /dev/null
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2012 Regents of the University of California
+ * Copyright (C) 2017 SiFive
+ * Copyright (C) 2020 Vitaly Wool, Konsulko AB
+ */
+
+#include <asm/pgtable.h>
+#define LOAD_OFFSET KERNEL_LINK_ADDR
+/* No __ro_after_init data in the .rodata section - which will always be ro */
+#define RO_AFTER_INIT_DATA
+
+#include <asm/vmlinux.lds.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+
+OUTPUT_ARCH(riscv)
+ENTRY(_start)
+
+jiffies = jiffies_64;
+
+SECTIONS
+{
+       /* Beginning of code and text segment */
+       . = LOAD_OFFSET;
+       _xiprom = .;
+       _start = .;
+       HEAD_TEXT_SECTION
+       INIT_TEXT_SECTION(PAGE_SIZE)
+       /* we have to discard exit text and such at runtime, not link time */
+       .exit.text :
+       {
+               EXIT_TEXT
+       }
+
+       .text : {
+               _text = .;
+               _stext = .;
+               TEXT_TEXT
+               SCHED_TEXT
+               CPUIDLE_TEXT
+               LOCK_TEXT
+               KPROBES_TEXT
+               ENTRY_TEXT
+               IRQENTRY_TEXT
+               SOFTIRQENTRY_TEXT
+               *(.fixup)
+               _etext = .;
+       }
+       RO_DATA(L1_CACHE_BYTES)
+       .srodata : {
+               *(.srodata*)
+       }
+       .init.rodata : {
+               INIT_SETUP(16)
+               INIT_CALLS
+               CON_INITCALL
+               INIT_RAM_FS
+       }
+       _exiprom = .;                   /* End of XIP ROM area */
+
+
+/*
+ * From this point, stuff is considered writable and will be copied to RAM
+ */
+       __data_loc = ALIGN(16);         /* location in file */
+       . = LOAD_OFFSET + XIP_OFFSET;   /* location in memory */
+
+       _sdata = .;                     /* Start of data section */
+       _data = .;
+       RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
+       _edata = .;
+       __start_ro_after_init = .;
+       .data.ro_after_init : AT(ADDR(.data.ro_after_init) - LOAD_OFFSET) {
+               *(.data..ro_after_init)
+       }
+       __end_ro_after_init = .;
+
+       . = ALIGN(PAGE_SIZE);
+       __init_begin = .;
+       .init.data : {
+               INIT_DATA
+       }
+       .exit.data : {
+               EXIT_DATA
+       }
+       . = ALIGN(8);
+       __soc_early_init_table : {
+               __soc_early_init_table_start = .;
+               KEEP(*(__soc_early_init_table))
+               __soc_early_init_table_end = .;
+       }
+       __soc_builtin_dtb_table : {
+               __soc_builtin_dtb_table_start = .;
+               KEEP(*(__soc_builtin_dtb_table))
+               __soc_builtin_dtb_table_end = .;
+       }
+       PERCPU_SECTION(L1_CACHE_BYTES)
+
+       . = ALIGN(PAGE_SIZE);
+       __init_end = .;
+
+       .sdata : {
+               __global_pointer$ = . + 0x800;
+               *(.sdata*)
+               *(.sbss*)
+       }
+
+       BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
+       EXCEPTION_TABLE(0x10)
+
+       .rel.dyn : AT(ADDR(.rel.dyn) - LOAD_OFFSET) {
+               *(.rel.dyn*)
+       }
+
+       /*
+        * End of copied data. We need a dummy section to get its LMA.
+        * Also located before final ALIGN() as trailing padding is not stored
+        * in the resulting binary file and useless to copy.
+        */
+       .data.endmark : AT(ADDR(.data.endmark) - LOAD_OFFSET) { }
+       _edata_loc = LOADADDR(.data.endmark);
+
+       . = ALIGN(PAGE_SIZE);
+       _end = .;
+
+       STABS_DEBUG
+       DWARF_DEBUG
+
+       DISCARDS
+}
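
The new XIP script splits the image in two: _xiprom.._exiprom stays in
flash and executes in place, while everything from __data_loc onward is
linked at a RAM address (VMA) but stored in flash (LMA) and must be copied
out at boot. A rough sketch of that copy, assuming the conventional XIP
pattern (the real copy runs pre-MMU from head.S via __copy_data):

    #include <linux/string.h>

    extern char __data_loc[], _sdata[], _edata_loc[];

    /* copy the writable image from its flash LMA to its RAM VMA */
    static void __init xip_copy_writable_image(void)
    {
            memcpy(_sdata, __data_loc, _edata_loc - __data_loc);
    }
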
index de03cb2..891742f 100644 (file)
@@ -4,7 +4,13 @@
  * Copyright (C) 2017 SiFive
  */
 
-#define LOAD_OFFSET PAGE_OFFSET
+#ifdef CONFIG_XIP_KERNEL
+#include "vmlinux-xip.lds.S"
+#else
+
+#include <asm/pgtable.h>
+#define LOAD_OFFSET KERNEL_LINK_ADDR
+
 #include <asm/vmlinux.lds.h>
 #include <asm/page.h>
 #include <asm/cache.h>
@@ -90,6 +96,13 @@ SECTIONS
        }
 
        __init_data_end = .;
+
+       . = ALIGN(8);
+       .alternative : {
+               __alt_start = .;
+               *(.alternative)
+               __alt_end = .;
+       }
        __init_end = .;
 
        /* Start of data section */
@@ -132,3 +145,4 @@ SECTIONS
 
        DISCARDS
 }
+#endif /* CONFIG_XIP_KERNEL */
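
The .alternative section introduced here collects errata patch descriptors
between __alt_start and __alt_end, which apply_boot_alternatives() (wired
into smp_prepare_boot_cpu() earlier in this merge) walks to patch code for
the detected vendor. A sketch of that consumption; the entry layout below
is an assumption based on this series:

    struct alt_entry {
            void *old_ptr;                  /* code to patch            */
            void *alt_ptr;                  /* replacement instructions */
            unsigned long vendor_id;
            unsigned long alt_len;
            unsigned int errata_id;
    };

    extern struct alt_entry __alt_start[], __alt_end[];

    static void __init apply_alternatives_sketch(unsigned long vendor_id)
    {
            struct alt_entry *alt;

            for (alt = __alt_start; alt < __alt_end; alt++)
                    if (alt->vendor_id == vendor_id)
                            patch_text_nosync(alt->old_ptr, alt->alt_ptr,
                                              alt->alt_len);
    }
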
index c5dbd55..096463c 100644 (file)
@@ -231,6 +231,19 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
                return;
        }
 
+#ifdef CONFIG_64BIT
+       /*
+        * Modules in 64bit kernels lie in their own virtual region which is not
+        * in the vmalloc region, but dealing with page faults in this region
+        * or the vmalloc region amounts to doing the same thing: checking that
+        * the mapping exists in init_mm.pgd and updating user page table, so
+        * just use vmalloc_fault.
+        */
+       if (unlikely(addr >= MODULES_VADDR && addr < MODULES_END)) {
+               vmalloc_fault(regs, code, addr);
+               return;
+       }
+#endif
        /* Enable interrupts if they were enabled in the parent context. */
        if (likely(regs->status & SR_PIE))
                local_irq_enable();
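
vmalloc_fault() here is the usual lazy synchronization: the faulting CPU
copies the relevant top-level entry from init_mm.pgd into the active page
table and lets the access retry, which works for the modules region exactly
as it does for vmalloc. Reduced to its core (a simplified sketch of the
existing handler, not new behavior):

    static void vmalloc_fault_sketch(unsigned long addr)
    {
            int index = pgd_index(addr);
            pgd_t *pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP) & SATP_PPN)
                         + index;
            pgd_t *pgd_k = init_mm.pgd + index;

            if (!pgd_present(*pgd_k))
                    return;                 /* genuinely bad access */
            set_pgd(pgd, *pgd_k);           /* sync, then retry the access */
    }
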
index 92e39cf..4faf8bd 100644 (file)
@@ -2,6 +2,8 @@
 /*
  * Copyright (C) 2012 Regents of the University of California
  * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2020 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
  */
 
 #include <linux/init.h>
 #include <linux/swap.h>
 #include <linux/sizes.h>
 #include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
 #include <linux/libfdt.h>
 #include <linux/set_memory.h>
 #include <linux/dma-map-ops.h>
+#include <linux/crash_dump.h>
 
 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
 
 #include "../kernel/head.h"
 
+unsigned long kernel_virt_addr = KERNEL_LINK_ADDR;
+EXPORT_SYMBOL(kernel_virt_addr);
+#ifdef CONFIG_XIP_KERNEL
+#define kernel_virt_addr       (*((unsigned long *)XIP_FIXUP(&kernel_virt_addr)))
+#endif
+
 unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
                                                        __page_aligned_bss;
 EXPORT_SYMBOL(empty_zero_page);
 
 extern char _start[];
 #define DTB_EARLY_BASE_VA      PGDIR_SIZE
-void *dtb_early_va __initdata;
-uintptr_t dtb_early_pa __initdata;
+void *_dtb_early_va __initdata;
+uintptr_t _dtb_early_pa __initdata;
 
 struct pt_alloc_ops {
        pte_t *(*get_pte_virt)(phys_addr_t pa);
@@ -57,7 +67,7 @@ static void __init zone_sizes_init(void)
        free_area_init(max_zone_pfns);
 }
 
-static void setup_zero_page(void)
+static void __init setup_zero_page(void)
 {
        memset((void *)empty_zero_page, 0, PAGE_SIZE);
 }
@@ -75,7 +85,7 @@ static inline void print_mlm(char *name, unsigned long b, unsigned long t)
                  (((t) - (b)) >> 20));
 }
 
-static void print_vm_layout(void)
+static void __init print_vm_layout(void)
 {
        pr_notice("Virtual kernel memory layout:\n");
        print_mlk("fixmap", (unsigned long)FIXADDR_START,
@@ -88,6 +98,10 @@ static void print_vm_layout(void)
                  (unsigned long)VMALLOC_END);
        print_mlm("lowmem", (unsigned long)PAGE_OFFSET,
                  (unsigned long)high_memory);
+#ifdef CONFIG_64BIT
+       print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR,
+                 (unsigned long)ADDRESS_SPACE_END);
+#endif
 }
 #else
 static void print_vm_layout(void) { }
@@ -112,10 +126,24 @@ void __init setup_bootmem(void)
        phys_addr_t dram_end = memblock_end_of_DRAM();
        phys_addr_t max_mapped_addr = __pa(~(ulong)0);
 
+#ifdef CONFIG_XIP_KERNEL
+       vmlinux_start = __pa_symbol(&_sdata);
+#endif
+
        /* The maximal physical memory size is -PAGE_OFFSET. */
        memblock_enforce_memory_limit(-PAGE_OFFSET);
 
-       /* Reserve from the start of the kernel to the end of the kernel */
+       /*
+        * Reserve from the start of the kernel to the end of the kernel
+        */
+#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX)
+       /*
+        * Make sure we align the reservation on PMD_SIZE since we will
+        * map the kernel in the linear mapping as read-only: we do not want
+        * any allocation to happen between _end and the next pmd aligned page.
+        */
+       vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK;
+#endif
        memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
 
        /*
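
A quick worked example of the PMD round-up above, with an illustrative
vmlinux_end and PMD_SIZE = 2 MiB:

    /* vmlinux_end = 0x80421000:
     * (0x80421000 + 0x1fffff) & ~0x1fffff = 0x80600000
     * so the reservation extends to the next 2 MiB boundary, and no
     * allocation can land between _end and that boundary. */
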
@@ -127,8 +155,9 @@ void __init setup_bootmem(void)
        if (max_mapped_addr == (dram_end - 1))
                memblock_set_current_limit(max_mapped_addr - 4096);
 
-       max_pfn = PFN_DOWN(dram_end);
-       max_low_pfn = max_pfn;
+       min_low_pfn = PFN_UP(memblock_start_of_DRAM());
+       max_low_pfn = max_pfn = PFN_DOWN(dram_end);
+
        dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn));
        set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET);
 
@@ -147,12 +176,42 @@ void __init setup_bootmem(void)
        memblock_allow_resize();
 }
 
+#ifdef CONFIG_XIP_KERNEL
+
+extern char _xiprom[], _exiprom[];
+extern char _sdata[], _edata[];
+
+#endif /* CONFIG_XIP_KERNEL */
+
 #ifdef CONFIG_MMU
-static struct pt_alloc_ops pt_ops;
+static struct pt_alloc_ops _pt_ops __ro_after_init;
 
-unsigned long va_pa_offset;
+#ifdef CONFIG_XIP_KERNEL
+#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops))
+#else
+#define pt_ops _pt_ops
+#endif
+
+/* Offset between linear mapping virtual address and kernel load address */
+unsigned long va_pa_offset __ro_after_init;
 EXPORT_SYMBOL(va_pa_offset);
-unsigned long pfn_base;
+#ifdef CONFIG_XIP_KERNEL
+#define va_pa_offset   (*((unsigned long *)XIP_FIXUP(&va_pa_offset)))
+#endif
+/* Offset between kernel mapping virtual address and kernel load address */
+#ifdef CONFIG_64BIT
+unsigned long va_kernel_pa_offset;
+EXPORT_SYMBOL(va_kernel_pa_offset);
+#endif
+#ifdef CONFIG_XIP_KERNEL
+#define va_kernel_pa_offset    (*((unsigned long *)XIP_FIXUP(&va_kernel_pa_offset)))
+#endif
+unsigned long va_kernel_xip_pa_offset;
+EXPORT_SYMBOL(va_kernel_xip_pa_offset);
+#ifdef CONFIG_XIP_KERNEL
+#define va_kernel_xip_pa_offset        (*((unsigned long *)XIP_FIXUP(&va_kernel_xip_pa_offset)))
+#endif
+unsigned long pfn_base __ro_after_init;
 EXPORT_SYMBOL(pfn_base);
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
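
Taken together, these offsets give the two physical-to-virtual translations
that now coexist on 64-bit; kernel_mapping_pa_to_va() is in fact used later
in this same patch for the early DTB. Schematically (macro bodies are an
assumption derived from the offset definitions above):

    /* linear mapping: all of RAM, starting at PAGE_OFFSET */
    #define linear_mapping_pa_to_va(x) \
            ((void *)((unsigned long)(x) + va_pa_offset))

    /* kernel mapping: the kernel image, at KERNEL_LINK_ADDR */
    #define kernel_mapping_pa_to_va(y) \
            ((void *)((unsigned long)(y) + va_kernel_pa_offset))
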
@@ -161,6 +220,12 @@ pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
 
 pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
 
+#ifdef CONFIG_XIP_KERNEL
+#define trampoline_pg_dir      ((pgd_t *)XIP_FIXUP(trampoline_pg_dir))
+#define fixmap_pte             ((pte_t *)XIP_FIXUP(fixmap_pte))
+#define early_pg_dir           ((pgd_t *)XIP_FIXUP(early_pg_dir))
+#endif /* CONFIG_XIP_KERNEL */
+
 void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
 {
        unsigned long addr = __fix_to_virt(idx);
@@ -212,8 +277,8 @@ static phys_addr_t alloc_pte_late(uintptr_t va)
        unsigned long vaddr;
 
        vaddr = __get_free_page(GFP_KERNEL);
-       if (!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr)))
-               BUG();
+       BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr)));
+
        return __pa(vaddr);
 }
 
@@ -236,6 +301,12 @@ pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
 pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
 pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
 
+#ifdef CONFIG_XIP_KERNEL
+#define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd))
+#define fixmap_pmd     ((pmd_t *)XIP_FIXUP(fixmap_pmd))
+#define early_pmd      ((pmd_t *)XIP_FIXUP(early_pmd))
+#endif /* CONFIG_XIP_KERNEL */
+
 static pmd_t *__init get_pmd_virt_early(phys_addr_t pa)
 {
        /* Before MMU is enabled */
@@ -255,7 +326,7 @@ static pmd_t *get_pmd_virt_late(phys_addr_t pa)
 
 static phys_addr_t __init alloc_pmd_early(uintptr_t va)
 {
-       BUG_ON((va - PAGE_OFFSET) >> PGDIR_SHIFT);
+       BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
 
        return (uintptr_t)early_pmd;
 }
@@ -352,6 +423,19 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
        return PMD_SIZE;
 }
 
+#ifdef CONFIG_XIP_KERNEL
+/* called from head.S with MMU off */
+asmlinkage void __init __copy_data(void)
+{
+       void *from = (void *)(&_sdata);
+       void *end = (void *)(&_end);
+       void *to = (void *)CONFIG_PHYS_RAM_BASE;
+       size_t sz = (size_t)(end - from + 1);
+
+       memcpy(to, from, sz);
+}
+#endif
+
 /*
  * setup_vm() is called from head.S with MMU-off.
  *
@@ -370,17 +454,74 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
 #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
 #endif
 
+uintptr_t load_pa, load_sz;
+#ifdef CONFIG_XIP_KERNEL
+#define load_pa        (*((uintptr_t *)XIP_FIXUP(&load_pa)))
+#define load_sz        (*((uintptr_t *)XIP_FIXUP(&load_sz)))
+#endif
+
+#ifdef CONFIG_XIP_KERNEL
+uintptr_t xiprom, xiprom_sz;
+#define xiprom_sz      (*((uintptr_t *)XIP_FIXUP(&xiprom_sz)))
+#define xiprom         (*((uintptr_t *)XIP_FIXUP(&xiprom)))
+
+static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
+{
+       uintptr_t va, end_va;
+
+       /* Map the flash resident part */
+       end_va = kernel_virt_addr + xiprom_sz;
+       for (va = kernel_virt_addr; va < end_va; va += map_size)
+               create_pgd_mapping(pgdir, va,
+                                  xiprom + (va - kernel_virt_addr),
+                                  map_size, PAGE_KERNEL_EXEC);
+
+       /* Map the data in RAM */
+       end_va = kernel_virt_addr + XIP_OFFSET + load_sz;
+       for (va = kernel_virt_addr + XIP_OFFSET; va < end_va; va += map_size)
+               create_pgd_mapping(pgdir, va,
+                                  load_pa + (va - (kernel_virt_addr + XIP_OFFSET)),
+                                  map_size, PAGE_KERNEL);
+}
+#else
+static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
+{
+       uintptr_t va, end_va;
+
+       end_va = kernel_virt_addr + load_sz;
+       for (va = kernel_virt_addr; va < end_va; va += map_size)
+               create_pgd_mapping(pgdir, va,
+                                  load_pa + (va - kernel_virt_addr),
+                                  map_size, PAGE_KERNEL_EXEC);
+}
+#endif
+
 asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 {
-       uintptr_t va, pa, end_va;
-       uintptr_t load_pa = (uintptr_t)(&_start);
-       uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
+       uintptr_t __maybe_unused pa;
        uintptr_t map_size;
 #ifndef __PAGETABLE_PMD_FOLDED
        pmd_t fix_bmap_spmd, fix_bmap_epmd;
 #endif
 
+#ifdef CONFIG_XIP_KERNEL
+       xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
+       xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom);
+
+       load_pa = (uintptr_t)CONFIG_PHYS_RAM_BASE;
+       load_sz = (uintptr_t)(&_end) - (uintptr_t)(&_sdata);
+
+       va_kernel_xip_pa_offset = kernel_virt_addr - xiprom;
+#else
+       load_pa = (uintptr_t)(&_start);
+       load_sz = (uintptr_t)(&_end) - load_pa;
+#endif
+
        va_pa_offset = PAGE_OFFSET - load_pa;
+#ifdef CONFIG_64BIT
+       va_kernel_pa_offset = kernel_virt_addr - load_pa;
+#endif
+
        pfn_base = PFN_DOWN(load_pa);
 
        /*
@@ -408,26 +549,27 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
        create_pmd_mapping(fixmap_pmd, FIXADDR_START,
                           (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
        /* Setup trampoline PGD and PMD */
-       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
                           (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
-       create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
+#ifdef CONFIG_XIP_KERNEL
+       create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
+                          xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
+#else
+       create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
                           load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
+#endif
 #else
        /* Setup trampoline PGD */
-       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
                           load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
 #endif
 
        /*
-        * Setup early PGD covering entire kernel which will allows
+        * Setup early PGD covering entire kernel which will allow
         * us to reach paging_init(). We map all memory banks later
         * in setup_vm_final() below.
         */
-       end_va = PAGE_OFFSET + load_sz;
-       for (va = PAGE_OFFSET; va < end_va; va += map_size)
-               create_pgd_mapping(early_pg_dir, va,
-                                  load_pa + (va - PAGE_OFFSET),
-                                  map_size, PAGE_KERNEL_EXEC);
+       create_kernel_page_table(early_pg_dir, map_size);
 
 #ifndef __PAGETABLE_PMD_FOLDED
        /* Setup early PMD for DTB */
@@ -442,7 +584,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
                           pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL);
        dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1));
 #else /* CONFIG_BUILTIN_DTB */
+#ifdef CONFIG_64BIT
+       /*
+        * __va can't be used since it would return a linear mapping address
+        * whereas dtb_early_va will be used before setup_vm_final installs
+        * the linear mapping.
+        */
+       dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa));
+#else
        dtb_early_va = __va(dtb_pa);
+#endif /* CONFIG_64BIT */
 #endif /* CONFIG_BUILTIN_DTB */
 #else
 #ifndef CONFIG_BUILTIN_DTB
@@ -454,7 +605,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
                           pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL);
        dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1));
 #else /* CONFIG_BUILTIN_DTB */
+#ifdef CONFIG_64BIT
+       dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa));
+#else
        dtb_early_va = __va(dtb_pa);
+#endif /* CONFIG_64BIT */
 #endif /* CONFIG_BUILTIN_DTB */
 #endif
        dtb_early_pa = dtb_pa;
@@ -490,6 +645,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 #endif
 }
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX)
+void protect_kernel_linear_mapping_text_rodata(void)
+{
+       unsigned long text_start = (unsigned long)lm_alias(_start);
+       unsigned long init_text_start = (unsigned long)lm_alias(__init_text_begin);
+       unsigned long rodata_start = (unsigned long)lm_alias(__start_rodata);
+       unsigned long data_start = (unsigned long)lm_alias(_data);
+
+       set_memory_ro(text_start, (init_text_start - text_start) >> PAGE_SHIFT);
+       set_memory_nx(text_start, (init_text_start - text_start) >> PAGE_SHIFT);
+
+       set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+       set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+}
+#endif
+
 static void __init setup_vm_final(void)
 {
        uintptr_t va, map_size;
@@ -511,7 +682,7 @@ static void __init setup_vm_final(void)
                           __pa_symbol(fixmap_pgd_next),
                           PGDIR_SIZE, PAGE_TABLE);
 
-       /* Map all memory banks */
+       /* Map all memory banks in the linear mapping */
        for_each_mem_range(i, &start, &end) {
                if (start >= end)
                        break;
@@ -523,10 +694,22 @@ static void __init setup_vm_final(void)
                for (pa = start; pa < end; pa += map_size) {
                        va = (uintptr_t)__va(pa);
                        create_pgd_mapping(swapper_pg_dir, va, pa,
-                                          map_size, PAGE_KERNEL_EXEC);
+                                          map_size,
+#ifdef CONFIG_64BIT
+                                          PAGE_KERNEL
+#else
+                                          PAGE_KERNEL_EXEC
+#endif
+                                       );
+
                }
        }
 
+#ifdef CONFIG_64BIT
+       /* Map the kernel */
+       create_kernel_page_table(swapper_pg_dir, PMD_SIZE);
+#endif
+
        /* Clear fixmap PTE and PMD mappings */
        clear_fixmap(FIX_PTE);
        clear_fixmap(FIX_PMD);
@@ -556,7 +739,7 @@ static inline void setup_vm_final(void)
 #endif /* CONFIG_MMU */
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
-void protect_kernel_text_data(void)
+void __init protect_kernel_text_data(void)
 {
        unsigned long text_start = (unsigned long)_start;
        unsigned long init_text_start = (unsigned long)__init_text_begin;
@@ -584,6 +767,103 @@ void mark_rodata_ro(void)
 }
 #endif
 
+#ifdef CONFIG_KEXEC_CORE
+/*
+ * reserve_crashkernel() - reserves memory for crash kernel
+ *
+ * This function reserves memory area given in "crashkernel=" kernel command
+ * line parameter. The memory reserved is used by dump capture kernel when
+ * primary kernel is crashing.
+ */
+static void __init reserve_crashkernel(void)
+{
+       unsigned long long crash_base = 0;
+       unsigned long long crash_size = 0;
+       unsigned long search_start = memblock_start_of_DRAM();
+       unsigned long search_end = memblock_end_of_DRAM();
+
+       int ret = 0;
+
+       /*
+        * Don't reserve a crash kernel region when already running as the
+        * crash (kdump) kernel: it serves no purpose there and memory is
+        * limited.
+        */
+#ifdef CONFIG_CRASH_DUMP
+       if (is_kdump_kernel()) {
+               pr_info("crashkernel: ignoring reservation request\n");
+               return;
+       }
+#endif
+
+       ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+                               &crash_size, &crash_base);
+       if (ret || !crash_size)
+               return;
+
+       crash_size = PAGE_ALIGN(crash_size);
+
+       if (crash_base == 0) {
+               /*
+                * Current riscv boot protocol requires 2MB alignment for
+                * RV64 and 4MB alignment for RV32 (hugepage size)
+                */
+               crash_base = memblock_find_in_range(search_start, search_end,
+                                                   crash_size, PMD_SIZE);
+
+               if (crash_base == 0) {
+                       pr_warn("crashkernel: couldn't allocate %lldKB\n",
+                               crash_size >> 10);
+                       return;
+               }
+       } else {
+               /* User specifies base address explicitly. */
+               if (!memblock_is_region_memory(crash_base, crash_size)) {
+                       pr_warn("crashkernel: requested region is not memory\n");
+                       return;
+               }
+
+               if (memblock_is_region_reserved(crash_base, crash_size)) {
+                       pr_warn("crashkernel: requested region is reserved\n");
+                       return;
+               }
+
+               if (!IS_ALIGNED(crash_base, PMD_SIZE)) {
+                       pr_warn("crashkernel: requested region is misaligned\n");
+                       return;
+               }
+       }
+       memblock_reserve(crash_base, crash_size);
+
+       pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n",
+               crash_base, crash_base + crash_size, crash_size >> 20);
+
+       crashk_res.start = crash_base;
+       crashk_res.end = crash_base + crash_size - 1;
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_DUMP
+/*
+ * We keep track of the ELF core header of the crashed
+ * kernel with a reserved-memory region with compatible
+ * string "linux,elfcorehdr". Here we register a callback
+ * to populate elfcorehdr_addr/size when this region is
+ * present. Note that this region will be marked as
+ * reserved once we call early_init_fdt_scan_reserved_mem()
+ * later on.
+ */
+static int elfcore_hdr_setup(struct reserved_mem *rmem)
+{
+       elfcorehdr_addr = rmem->base;
+       elfcorehdr_size = rmem->size;
+       return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(elfcorehdr, "linux,elfcorehdr", elfcore_hdr_setup);
+#endif
+
 void __init paging_init(void)
 {
        setup_vm_final();
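
reserve_crashkernel() is driven entirely by the crashkernel= command line
parameter; the two accepted shapes, with illustrative values:

    crashkernel=256M                # kernel picks a PMD-aligned base itself
    crashkernel=256M@0x8a000000     # explicit base: must be memory, not
                                    # already reserved, and PMD-aligned
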
@@ -592,9 +872,13 @@ void __init paging_init(void)
 
 void __init misc_mem_init(void)
 {
+       early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT);
        arch_numa_init();
        sparse_init();
        zone_sizes_init();
+#ifdef CONFIG_KEXEC_CORE
+       reserve_crashkernel();
+#endif
        memblock_dump_all();
 }
 
index 937d13c..9daacae 100644 (file)
 #include <asm/fixmap.h>
 #include <asm/pgalloc.h>
 
-static __init void *early_alloc(size_t size, int node)
-{
-       void *ptr = memblock_alloc_try_nid(size, size,
-               __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, node);
-
-       if (!ptr)
-               panic("%pS: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n",
-                       __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS));
-
-       return ptr;
-}
-
 extern pgd_t early_pg_dir[PTRS_PER_PGD];
 asmlinkage void __init kasan_early_init(void)
 {
@@ -60,7 +48,7 @@ asmlinkage void __init kasan_early_init(void)
        local_flush_tlb_all();
 }
 
-static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
 {
        phys_addr_t phys_addr;
        pte_t *ptep, *base_pte;
@@ -82,7 +70,7 @@ static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long en
        set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
 }
 
-static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
 {
        phys_addr_t phys_addr;
        pmd_t *pmdp, *base_pmd;
@@ -117,7 +105,7 @@ static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long en
        set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
 }
 
-static void kasan_populate_pgd(unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end)
 {
        phys_addr_t phys_addr;
        pgd_t *pgdp = pgd_offset_k(vaddr);
@@ -155,39 +143,27 @@ static void __init kasan_populate(void *start, void *end)
        memset(start, KASAN_SHADOW_INIT, end - start);
 }
 
+static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end)
+{
+       unsigned long next;
+       void *p;
+       pgd_t *pgd_k = pgd_offset_k(vaddr);
+
+       do {
+               next = pgd_addr_end(vaddr, end);
+               if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) {
+                       p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+                       set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
+               }
+       } while (pgd_k++, vaddr = next, vaddr != end);
+}
+
 static void __init kasan_shallow_populate(void *start, void *end)
 {
        unsigned long vaddr = (unsigned long)start & PAGE_MASK;
        unsigned long vend = PAGE_ALIGN((unsigned long)end);
-       unsigned long pfn;
-       int index;
-       void *p;
-       pud_t *pud_dir, *pud_k;
-       pgd_t *pgd_dir, *pgd_k;
-       p4d_t *p4d_dir, *p4d_k;
-
-       while (vaddr < vend) {
-               index = pgd_index(vaddr);
-               pfn = csr_read(CSR_SATP) & SATP_PPN;
-               pgd_dir = (pgd_t *)pfn_to_virt(pfn) + index;
-               pgd_k = init_mm.pgd + index;
-               pgd_dir = pgd_offset_k(vaddr);
-               set_pgd(pgd_dir, *pgd_k);
-
-               p4d_dir = p4d_offset(pgd_dir, vaddr);
-               p4d_k  = p4d_offset(pgd_k, vaddr);
-
-               vaddr = (vaddr + PUD_SIZE) & PUD_MASK;
-               pud_dir = pud_offset(p4d_dir, vaddr);
-               pud_k = pud_offset(p4d_k, vaddr);
-
-               if (pud_present(*pud_dir)) {
-                       p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
-                       pud_populate(&init_mm, pud_dir, p);
-               }
-               vaddr += PAGE_SIZE;
-       }
 
+       kasan_shallow_populate_pgd(vaddr, vend);
        local_flush_tlb_all();
 }
 
@@ -196,6 +172,10 @@ void __init kasan_init(void)
        phys_addr_t _start, _end;
        u64 i;
 
+       /*
+        * Populate all kernel virtual address space with kasan_early_shadow_page
+        * except for the linear mapping and the modules/kernel/BPF mapping.
+        */
        kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
                                    (void *)kasan_mem_to_shadow((void *)
                                                                VMEMMAP_END));
@@ -208,6 +188,7 @@ void __init kasan_init(void)
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_START),
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_END));
 
+       /* Populate the linear mapping */
        for_each_mem_range(i, &_start, &_end) {
                void *start = (void *)__va(_start);
                void *end = (void *)__va(_end);
@@ -218,6 +199,10 @@ void __init kasan_init(void)
                kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end));
        }
 
+       /* Populate kernel, BPF, modules mapping */
+       kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
+                      kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
+
        for (i = 0; i < PTRS_PER_PTE; i++)
                set_pte(&kasan_early_shadow_pte[i],
                        mk_pte(virt_to_page(kasan_early_shadow_page),
index e8e4dcd..35703d5 100644 (file)
@@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys);
 
 phys_addr_t __phys_addr_symbol(unsigned long x)
 {
-       unsigned long kernel_start = (unsigned long)PAGE_OFFSET;
+       unsigned long kernel_start = (unsigned long)kernel_virt_addr;
        unsigned long kernel_end = (unsigned long)_end;
 
        /*
index ace74de..0536ac8 100644 (file)
@@ -58,29 +58,56 @@ struct ptd_mm_info {
        unsigned long end;
 };
 
+enum address_markers_idx {
+#ifdef CONFIG_KASAN
+       KASAN_SHADOW_START_NR,
+       KASAN_SHADOW_END_NR,
+#endif
+       FIXMAP_START_NR,
+       FIXMAP_END_NR,
+       PCI_IO_START_NR,
+       PCI_IO_END_NR,
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       VMEMMAP_START_NR,
+       VMEMMAP_END_NR,
+#endif
+       VMALLOC_START_NR,
+       VMALLOC_END_NR,
+       PAGE_OFFSET_NR,
+#ifdef CONFIG_64BIT
+       MODULES_MAPPING_NR,
+       KERNEL_MAPPING_NR,
+#endif
+       END_OF_SPACE_NR
+};
+
 static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
-       {KASAN_SHADOW_START,    "Kasan shadow start"},
-       {KASAN_SHADOW_END,      "Kasan shadow end"},
+       {0, "Kasan shadow start"},
+       {0, "Kasan shadow end"},
 #endif
-       {FIXADDR_START,         "Fixmap start"},
-       {FIXADDR_TOP,           "Fixmap end"},
-       {PCI_IO_START,          "PCI I/O start"},
-       {PCI_IO_END,            "PCI I/O end"},
+       {0, "Fixmap start"},
+       {0, "Fixmap end"},
+       {0, "PCI I/O start"},
+       {0, "PCI I/O end"},
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-       {VMEMMAP_START,         "vmemmap start"},
-       {VMEMMAP_END,           "vmemmap end"},
+       {0, "vmemmap start"},
+       {0, "vmemmap end"},
+#endif
+       {0, "vmalloc() area"},
+       {0, "vmalloc() end"},
+       {0, "Linear mapping"},
+#ifdef CONFIG_64BIT
+       {0, "Modules mapping"},
+       {0, "Kernel mapping (kernel, BPF)"},
 #endif
-       {VMALLOC_START,         "vmalloc() area"},
-       {VMALLOC_END,           "vmalloc() end"},
-       {PAGE_OFFSET,           "Linear mapping"},
        {-1, NULL},
 };
 
 static struct ptd_mm_info kernel_ptd_info = {
        .mm             = &init_mm,
        .markers        = address_markers,
-       .base_addr      = KERN_VIRT_START,
+       .base_addr      = 0,
        .end            = ULONG_MAX,
 };
 
@@ -331,10 +358,32 @@ static int ptdump_show(struct seq_file *m, void *v)
 
 DEFINE_SHOW_ATTRIBUTE(ptdump);
 
-static int ptdump_init(void)
+static int __init ptdump_init(void)
 {
        unsigned int i, j;
 
+#ifdef CONFIG_KASAN
+       address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
+       address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
+#endif
+       address_markers[FIXMAP_START_NR].start_address = FIXADDR_START;
+       address_markers[FIXMAP_END_NR].start_address = FIXADDR_TOP;
+       address_markers[PCI_IO_START_NR].start_address = PCI_IO_START;
+       address_markers[PCI_IO_END_NR].start_address = PCI_IO_END;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+       address_markers[VMEMMAP_END_NR].start_address = VMEMMAP_END;
+#endif
+       address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+       address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+       address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET;
+#ifdef CONFIG_64BIT
+       address_markers[MODULES_MAPPING_NR].start_address = MODULES_VADDR;
+       address_markers[KERNEL_MAPPING_NR].start_address = kernel_virt_addr;
+#endif
+
+       kernel_ptd_info.base_addr = KERN_VIRT_START;
+
        for (i = 0; i < ARRAY_SIZE(pg_level); i++)
                for (j = 0; j < ARRAY_SIZE(pte_bits); j++)
                        pg_level[i].mask |= pte_bits[j].mask;
index b44ff52..87e3bf5 100644 (file)
@@ -1148,16 +1148,3 @@ void bpf_jit_build_epilogue(struct rv_jit_context *ctx)
 {
        __build_epilogue(false, ctx);
 }
-
-void *bpf_jit_alloc_exec(unsigned long size)
-{
-       return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
-                                   BPF_JIT_REGION_END, GFP_KERNEL,
-                                   PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-                                   __builtin_return_address(0));
-}
-
-void bpf_jit_free_exec(void *addr)
-{
-       return vfree(addr);
-}
index 3630d44..fed86f4 100644 (file)
@@ -152,6 +152,7 @@ skip_init_ctx:
        bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
 
        if (!prog->is_func || extra_pass) {
+               bpf_jit_binary_lock_ro(jit_data->header);
 out_offset:
                kfree(ctx->offset);
                kfree(jit_data);
@@ -164,3 +165,16 @@ out:
                                           tmp : orig_prog);
        return prog;
 }
+
+void *bpf_jit_alloc_exec(unsigned long size)
+{
+       return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
+                                   BPF_JIT_REGION_END, GFP_KERNEL,
+                                   PAGE_KERNEL, 0, NUMA_NO_NODE,
+                                   __builtin_return_address(0));
+}
+
+void bpf_jit_free_exec(void *addr)
+{
+       return vfree(addr);
+}
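
Allocating the JIT region with PAGE_KERNEL (writable, non-executable)
pairs with the bpf_jit_binary_lock_ro() call added above: the image is
emitted while the pages are W^X-safe and only flipped to read-only and
executable once finalized. The lifecycle, schematically:

    /* hdr = bpf_jit_binary_alloc(...)  -> pages are RW + NX (PAGE_KERNEL)
     * ... emit instructions, apply fixups, flush the icache ...
     * bpf_jit_binary_lock_ro(hdr)      -> pages become RO + X
     */
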
index c1ff874..b4c7c34 100644 (file)
@@ -60,6 +60,9 @@ config S390
        imply IMA_SECURE_AND_OR_TRUSTED_BOOT
        select ARCH_32BIT_USTAT_F_TINODE
        select ARCH_BINFMT_ELF_STATE
+       select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
+       select ARCH_ENABLE_MEMORY_HOTREMOVE
+       select ARCH_ENABLE_SPLIT_PMD_PTLOCK
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DEBUG_WX
        select ARCH_HAS_DEVMEM_IS_ALLOWED
@@ -137,6 +140,7 @@ config S390
        select HAVE_ARCH_JUMP_LABEL_RELATIVE
        select HAVE_ARCH_KASAN
        select HAVE_ARCH_KASAN_VMALLOC
+       select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
        select HAVE_ARCH_SECCOMP_FILTER
        select HAVE_ARCH_SOFT_DIRTY
        select HAVE_ARCH_TRACEHOOK
@@ -626,15 +630,6 @@ config ARCH_SPARSEMEM_ENABLE
 config ARCH_SPARSEMEM_DEFAULT
        def_bool y
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y if SPARSEMEM
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-       def_bool y
-
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-       def_bool y
-
 config MAX_PHYSMEM_BITS
        int "Maximum size of supported physical memory in bits (42-53)"
        range 42 53
index 6422618..86afcc6 100644 (file)
@@ -387,6 +387,7 @@ CONFIG_CGROUP_NET_PRIO=y
 CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
 CONFIG_PCI=y
+CONFIG_PCI_IOV=y
 # CONFIG_PCIEASPM is not set
 CONFIG_PCI_DEBUG=y
 CONFIG_HOTPLUG_PCI=y
@@ -548,7 +549,7 @@ CONFIG_INPUT_EVDEV=y
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
 CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=m
 CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_RAW_DRIVER=m
 CONFIG_HANGCHECK_TIMER=m
index 371a529..71b49ea 100644 (file)
@@ -377,6 +377,7 @@ CONFIG_CGROUP_NET_PRIO=y
 CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
 CONFIG_PCI=y
+CONFIG_PCI_IOV=y
 # CONFIG_PCIEASPM is not set
 CONFIG_HOTPLUG_PCI=y
 CONFIG_HOTPLUG_PCI_S390=y
@@ -540,7 +541,7 @@ CONFIG_INPUT_EVDEV=y
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
 CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=m
 CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_RAW_DRIVER=m
 CONFIG_HANGCHECK_TIMER=m
index 649b9fc..3e4cbcb 100644 (file)
@@ -123,4 +123,6 @@ static inline int stccm_avail(void)
        return test_facility(142);
 }
 
+size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
+                          struct cpumf_ctr_info *info);
 #endif /* _ASM_S390_CPU_MCF_H */
index 9cceb26..baa8005 100644 (file)
@@ -4,9 +4,11 @@
 
 #include <linux/sched.h>
 #include <linux/audit.h>
+#include <linux/randomize_kstack.h>
 #include <linux/tracehook.h>
 #include <linux/processor.h>
 #include <linux/uaccess.h>
+#include <asm/timex.h>
 #include <asm/fpu/api.h>
 
 #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP)
@@ -48,6 +50,14 @@ static __always_inline void arch_exit_to_user_mode(void)
 
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+                                                 unsigned long ti_work)
+{
+       choose_random_kstack_offset(get_tod_clock_fast() & 0xff);
+}
+
+#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
+
 static inline bool on_thread_stack(void)
 {
        return !(((unsigned long)(current->stack) ^ current_stack_pointer()) & ~(THREAD_SIZE - 1));
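
choose_random_kstack_offset() only banks entropy for later use, here the
low eight bits of the TOD clock, which is cheap to read on the exit path;
the offset is consumed on the next kernel entry by
add_random_kstack_offset(), which the syscall and program-check paths gain
below. The generic helper boils down to roughly this (a simplified
approximation, not the exact implementation):

    u32 offset = raw_cpu_read(kstack_offset);
    u8 *sp = __builtin_alloca(offset & 0x3ff);  /* bounded random offset */
    asm volatile("" :: "r"(sp));                /* keep the alloca alive */
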
index 28664ee..e3882b0 100644 (file)
@@ -20,11 +20,6 @@ void *xlate_dev_mem_ptr(phys_addr_t phys);
 #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #define IO_SPACE_LIMIT 0
 
 void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot);
index 35c2af9..10b67f8 100644 (file)
@@ -204,7 +204,7 @@ extern unsigned int s390_pci_no_rid;
 struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state);
 int zpci_enable_device(struct zpci_dev *);
 int zpci_disable_device(struct zpci_dev *);
-int zpci_configure_device(struct zpci_dev *zdev, u32 fh);
+int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh);
 int zpci_deconfigure_device(struct zpci_dev *zdev);
 
 int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64);
index b3beef6..31a605b 100644 (file)
@@ -230,9 +230,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
                /* No support for kernel space counters only */
                } else if (!attr->exclude_kernel && attr->exclude_user) {
                        return -EOPNOTSUPP;
-
-               /* Count user and kernel space */
-               } else {
+               } else {        /* Count user and kernel space */
                        if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
                                return -EOPNOTSUPP;
                        ev = cpumf_generic_events_basic[ev];
@@ -402,12 +400,12 @@ static void cpumf_pmu_stop(struct perf_event *event, int flags)
                 */
                if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
                        ctr_set_stop(&cpuhw->state, hwc->config_base);
-               event->hw.state |= PERF_HES_STOPPED;
+               hwc->state |= PERF_HES_STOPPED;
        }
 
        if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
                hw_perf_event_update(event);
-               event->hw.state |= PERF_HES_UPTODATE;
+               hwc->state |= PERF_HES_UPTODATE;
        }
 }
 
@@ -430,8 +428,6 @@ static int cpumf_pmu_add(struct perf_event *event, int flags)
        if (flags & PERF_EF_START)
                cpumf_pmu_start(event, PERF_EF_RELOAD);
 
-       perf_event_update_userpage(event);
-
        return 0;
 }
 
@@ -451,8 +447,6 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
         */
        if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
                ctr_set_disable(&cpuhw->state, event->hw.config_base);
-
-       perf_event_update_userpage(event);
 }
 
 /*
index 3bced89..6d53215 100644 (file)
@@ -170,6 +170,52 @@ static int cpum_cf_offline_cpu(unsigned int cpu)
        return cpum_cf_setup(cpu, PMC_RELEASE);
 }
 
+/* Return the maximum possible counter set size (in number of 8 byte counters)
+ * depending on type and model number.
+ */
+size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
+                          struct cpumf_ctr_info *info)
+{
+       size_t ctrset_size = 0;
+
+       switch (ctrset) {
+       case CPUMF_CTR_SET_BASIC:
+               if (info->cfvn >= 1)
+                       ctrset_size = 6;
+               break;
+       case CPUMF_CTR_SET_USER:
+               if (info->cfvn == 1)
+                       ctrset_size = 6;
+               else if (info->cfvn >= 3)
+                       ctrset_size = 2;
+               break;
+       case CPUMF_CTR_SET_CRYPTO:
+               if (info->csvn >= 1 && info->csvn <= 5)
+                       ctrset_size = 16;
+               else if (info->csvn == 6)
+                       ctrset_size = 20;
+               break;
+       case CPUMF_CTR_SET_EXT:
+               if (info->csvn == 1)
+                       ctrset_size = 32;
+               else if (info->csvn == 2)
+                       ctrset_size = 48;
+               else if (info->csvn >= 3 && info->csvn <= 5)
+                       ctrset_size = 128;
+               else if (info->csvn == 6)
+                       ctrset_size = 160;
+               break;
+       case CPUMF_CTR_SET_MT_DIAG:
+               if (info->csvn > 3)
+                       ctrset_size = 48;
+               break;
+       case CPUMF_CTR_SET_MAX:
+               break;
+       }
+
+       return ctrset_size;
+}
+
 static int __init cpum_cf_init(void)
 {
        int rc;
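
A worked example against the table above: on a machine reporting csvn == 6,
the crypto set holds 20 counters and the extended set 160, so the raw
per-CPU counter data (before the per-set header that cf_diag adds) comes to:

    size_t crypto_bytes = 20 * sizeof(u64);     /* 160 bytes  */
    size_t ext_bytes    = 160 * sizeof(u64);    /* 1280 bytes */
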
index 2e3e7ed..08c985c 100644 (file)
@@ -316,52 +316,6 @@ static void cf_diag_read(struct perf_event *event)
        debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
 }
 
-/* Return the maximum possible counter set size (in number of 8 byte counters)
- * depending on type and model number.
- */
-static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset,
-                                struct cpumf_ctr_info *info)
-{
-       size_t ctrset_size = 0;
-
-       switch (ctrset) {
-       case CPUMF_CTR_SET_BASIC:
-               if (info->cfvn >= 1)
-                       ctrset_size = 6;
-               break;
-       case CPUMF_CTR_SET_USER:
-               if (info->cfvn == 1)
-                       ctrset_size = 6;
-               else if (info->cfvn >= 3)
-                       ctrset_size = 2;
-               break;
-       case CPUMF_CTR_SET_CRYPTO:
-               if (info->csvn >= 1 && info->csvn <= 5)
-                       ctrset_size = 16;
-               else if (info->csvn == 6)
-                       ctrset_size = 20;
-               break;
-       case CPUMF_CTR_SET_EXT:
-               if (info->csvn == 1)
-                       ctrset_size = 32;
-               else if (info->csvn == 2)
-                       ctrset_size = 48;
-               else if (info->csvn >= 3 && info->csvn <= 5)
-                       ctrset_size = 128;
-               else if (info->csvn == 6)
-                       ctrset_size = 160;
-               break;
-       case CPUMF_CTR_SET_MT_DIAG:
-               if (info->csvn > 3)
-                       ctrset_size = 48;
-               break;
-       case CPUMF_CTR_SET_MAX:
-               break;
-       }
-
-       return ctrset_size;
-}
-
 /* Calculate memory needed to store all counter sets together with header and
  * trailer data. This is independent of the counter set authorization which
  * can vary depending on the configuration.
@@ -372,7 +326,7 @@ static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
        enum cpumf_ctr_set i;
 
        for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-               size_t size = cf_diag_ctrset_size(i, info);
+               size_t size = cpum_cf_ctrset_size(i, info);
 
                if (size)
                        max_size += size * sizeof(u64) +
@@ -405,7 +359,7 @@ static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
        ctrdata->def = CF_DIAG_CTRSET_DEF;
        ctrdata->set = ctrset;
        ctrdata->res1 = 0;
-       ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info);
+       ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
 
        if (ctrset_size) {                      /* Save data */
                need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
@@ -845,7 +799,7 @@ static void cf_diag_cpu_read(void *parm)
 
                if (!(p->sets & cpumf_ctr_ctl[set]))
                        continue;       /* Counter set not in list */
-               set_size = cf_diag_ctrset_size(set, &cpuhw->info);
+               set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
                space = sizeof(csd->data) - csd->used;
                space = cf_diag_cpuset_read(sp, set, set_size, space);
                if (space) {
@@ -975,7 +929,7 @@ static size_t cf_diag_needspace(unsigned int sets)
        for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
                if (!(sets & cpumf_ctr_ctl[i]))
                        continue;
-               bytes += cf_diag_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
+               bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
                         sizeof(((struct s390_ctrset_setdata *)0)->set) +
                         sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
        }
index 72134f9..5aab59a 100644 (file)
@@ -937,9 +937,9 @@ static int __init setup_hwcaps(void)
        if (MACHINE_HAS_VX) {
                elf_hwcap |= HWCAP_S390_VXRS;
                if (test_facility(134))
-                       elf_hwcap |= HWCAP_S390_VXRS_EXT;
-               if (test_facility(135))
                        elf_hwcap |= HWCAP_S390_VXRS_BCD;
+               if (test_facility(135))
+                       elf_hwcap |= HWCAP_S390_VXRS_EXT;
                if (test_facility(148))
                        elf_hwcap |= HWCAP_S390_VXRS_EXT2;
                if (test_facility(152))
index bc8e650..4e5cc7d 100644 (file)
@@ -142,6 +142,7 @@ void do_syscall(struct pt_regs *regs)
 
 void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
 {
+       add_random_kstack_offset();
        enter_from_user_mode(regs);
 
        memcpy(&regs->gprs[8], S390_lowcore.save_area_sync, 8 * sizeof(unsigned long));
index 63021d4..8dd23c7 100644 (file)
@@ -17,6 +17,7 @@
 #include "asm/ptrace.h"
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/randomize_kstack.h>
 #include <linux/extable.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
@@ -301,6 +302,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
        unsigned int trapnr, syscall_redirect = 0;
        irqentry_state_t state;
 
+       add_random_kstack_offset();
        regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc;
        regs->int_parm_long = S390_lowcore.trans_exc_code;
 
index 3b5a4d2..da36d13 100644 (file)
@@ -189,7 +189,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
        return pte;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgdp;
index c01b6db..b0993e0 100644 (file)
@@ -738,17 +738,19 @@ error:
 }
 
 /**
- * zpci_configure_device() - Configure a zpci_dev
+ * zpci_scan_configured_device() - Scan a freshly configured zpci_dev
  * @zdev: The zpci_dev to be configured
  * @fh: The general function handle supplied by the platform
  *
  * Given a device in the configuration state Configured, enables, scans and
- * adds it to the common code PCI subsystem. If any failure occurs, the
- * zpci_dev is left disabled.
+ * adds it to the common code PCI subsystem if possible. If the PCI device is
+ * parked because a PCI bus cannot be created yet (function 0 has not been
+ * seen), it is ignored for now but will be scanned once function 0 appears.
+ * If any failure occurs, the zpci_dev is left disabled.
  *
  * Return: 0 on success, or an error code otherwise
  */
-int zpci_configure_device(struct zpci_dev *zdev, u32 fh)
+int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh)
 {
        int rc;
 
index 1178b48..cd447b9 100644 (file)
@@ -76,8 +76,6 @@ void zpci_event_error(void *data)
 
 static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh)
 {
-       enum zpci_state state;
-
        zdev->fh = fh;
        /* Give the driver a hint that the function is
         * already unusable.
@@ -88,15 +86,12 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh)
         */
        zpci_disable_device(zdev);
        zdev->state = ZPCI_FN_STATE_STANDBY;
-       if (!clp_get_state(zdev->fid, &state) &&
-           state == ZPCI_FN_STATE_RESERVED) {
-               zpci_zdev_put(zdev);
-       }
 }
 
 static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 {
        struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
+       enum zpci_state state;
 
        zpci_err("avail CCDF:\n");
        zpci_err_hex(ccdf, sizeof(*ccdf));
@@ -113,7 +108,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
                                break;
                        zdev->state = ZPCI_FN_STATE_CONFIGURED;
                }
-               zpci_configure_device(zdev, ccdf->fh);
+               zpci_scan_configured_device(zdev, ccdf->fh);
                break;
        case 0x0302: /* Reserved -> Standby */
                if (!zdev)
@@ -123,13 +118,28 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
                break;
        case 0x0303: /* Deconfiguration requested */
                if (zdev) {
+                       /* The event may have been queued before we configured
+                        * the device.
+                        */
+                       if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
+                               break;
                        zdev->fh = ccdf->fh;
                        zpci_deconfigure_device(zdev);
                }
                break;
        case 0x0304: /* Configured -> Standby|Reserved */
-               if (zdev)
-                       zpci_event_hard_deconfigured(zdev, ccdf->fh);
+               if (zdev) {
+                       /* The event may have been queued before we configured
+                        * the device.
+                        */
+                       if (zdev->state == ZPCI_FN_STATE_CONFIGURED)
+                               zpci_event_hard_deconfigured(zdev, ccdf->fh);
+                       /* The 0x0304 event may immediately reserve the device */
+                       if (!clp_get_state(zdev->fid, &state) &&
+                           state == ZPCI_FN_STATE_RESERVED) {
+                               zpci_zdev_put(zdev);
+                       }
+               }
                break;
        case 0x0306: /* 0x308 or 0x302 for multiple devices */
                zpci_remove_reserved_devices();
index e798e55..6812953 100644 (file)
@@ -2,6 +2,8 @@
 config SUPERH
        def_bool y
        select ARCH_32BIT_OFF_T
+       select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU
+       select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU
        select ARCH_HAVE_CUSTOM_GPIO_H
        select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
        select ARCH_HAS_BINFMT_FLAT if !MMU
@@ -101,9 +103,6 @@ config SYS_SUPPORTS_APM_EMULATION
        bool
        select ARCH_SUSPEND_POSSIBLE
 
-config SYS_SUPPORTS_HUGETLBFS
-       bool
-
 config SYS_SUPPORTS_SMP
        bool
 
@@ -175,12 +174,12 @@ config CPU_SH3
 
 config CPU_SH4
        bool
+       select ARCH_SUPPORTS_HUGETLBFS if MMU
        select CPU_HAS_INTEVT
        select CPU_HAS_SR_RB
        select CPU_HAS_FPU if !CPU_SH4AL_DSP
        select SH_INTC
        select SYS_SUPPORTS_SH_TMU
-       select SYS_SUPPORTS_HUGETLBFS if MMU
 
 config CPU_SH4A
        bool
index 3bcbf52..44bcb80 100644 (file)
@@ -9,7 +9,7 @@
 # License.  See the file "COPYING" in the main directory of this archive
 # for more details.
 #
-ifneq ($(SUBARCH),$(ARCH))
+ifdef cross_compiling
   ifeq ($(CROSS_COMPILE),)
     CROSS_COMPILE := $(call cc-cross-prefix, sh-linux- sh-linux-gnu- sh-unknown-linux-gnu-)
   endif
index ef7cc31..9ee3526 100644 (file)
@@ -23,7 +23,6 @@ CONFIG_SH_PCLK_FREQ=31250000
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 # CONFIG_UNIX98_PTYS is not set
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
index 315b04a..601d062 100644 (file)
@@ -71,7 +71,6 @@ CONFIG_SMC91X=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=4
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
index 99975db..79f02f1 100644 (file)
@@ -75,7 +75,6 @@ CONFIG_INPUT_FF_MEMLESS=y
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
 CONFIG_VT_HW_CONSOLE_BINDING=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
 # CONFIG_LEGACY_PTYS is not set
index 2c46c00..cbc9389 100644 (file)
@@ -18,7 +18,6 @@ CONFIG_CPU_IDLE=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
index 8819315..ee2357d 100644 (file)
@@ -20,7 +20,6 @@ CONFIG_CPU_IDLE=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
index 9b885c1..5c725c7 100644 (file)
@@ -66,7 +66,6 @@ CONFIG_INPUT_FF_MEMLESS=m
 CONFIG_INPUT_EVDEV=y
 CONFIG_INPUT_EVBUG=m
 CONFIG_VT_HW_CONSOLE_BINDING=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
index 450b585..3b6c7b5 100644 (file)
@@ -58,15 +58,16 @@ static inline unsigned long __ffs(unsigned long word)
        return result;
 }
 
-#include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/hweight.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
-#include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic.h>
 #include <asm-generic/bitops/fls.h>
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 
+#include <asm-generic/bitops/le.h>
+#include <asm-generic/bitops/find.h>
+
 #endif /* __ASM_SH_BITOPS_H */
index 6d5c646..cf9a3ec 100644 (file)
@@ -283,11 +283,6 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #define ARCH_HAS_VALID_PHYS_ADDR_RANGE
 int valid_phys_addr_range(phys_addr_t addr, size_t size);
 int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
index 285aaba..6713c65 100644 (file)
@@ -6,20 +6,14 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')     \
          $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
 
 syscall := $(src)/syscall.tbl
-syshdr := $(srctree)/$(src)/syscallhdr.sh
-systbl := $(srctree)/$(src)/syscalltbl.sh
+syshdr := $(srctree)/scripts/syscallhdr.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
 
 quiet_cmd_syshdr = SYSHDR  $@
-      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@'       \
-                  '$(syshdr_abis_$(basetarget))'               \
-                  '$(syshdr_pfx_$(basetarget))'                \
-                  '$(syshdr_offset_$(basetarget))'
+      cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr $< $@
 
 quiet_cmd_systbl = SYSTBL  $@
-      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@'       \
-                  '$(systbl_abis_$(basetarget))'               \
-                  '$(systbl_abi_$(basetarget))'                \
-                  '$(systbl_offset_$(basetarget))'
+      cmd_systbl = $(CONFIG_SHELL) $(systbl) $< $@
 
 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
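
    For orientation: the shared scripts/syscallhdr.sh keeps the output shape of the
    deleted per-arch script below. Given hypothetical table entries, the generated
    uapi header would look roughly like this (guard name and numbers illustrative):

    /* syscall.tbl (hypothetical):
     *   0   common  restart_syscall   sys_restart_syscall
     *   1   common  exit              sys_exit
     */
    #ifndef _UAPI_ASM_SH_UNISTD_32_H
    #define _UAPI_ASM_SH_UNISTD_32_H

    #define __NR_restart_syscall	0
    #define __NR_exit	1

    #ifdef __KERNEL__
    #define __NR_syscalls	2	/* emitted because of --emit-nr */
    #endif

    #endif /* _UAPI_ASM_SH_UNISTD_32_H */
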
diff --git a/arch/sh/kernel/syscalls/syscallhdr.sh b/arch/sh/kernel/syscalls/syscallhdr.sh
deleted file mode 100644 (file)
index 4c05198..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-prefix="$4"
-offset="$5"
-
-fileguard=_UAPI_ASM_SH_`basename "$out" | sed \
-       -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
-       -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-       printf "#ifndef %s\n" "${fileguard}"
-       printf "#define %s\n" "${fileguard}"
-       printf "\n"
-
-       nxt=0
-       while read nr abi name entry ; do
-               if [ -z "$offset" ]; then
-                       printf "#define __NR_%s%s\t%s\n" \
-                               "${prefix}" "${name}" "${nr}"
-               else
-                       printf "#define __NR_%s%s\t(%s + %s)\n" \
-                               "${prefix}" "${name}" "${offset}" "${nr}"
-               fi
-               nxt=$((nr+1))
-       done
-
-       printf "\n"
-       printf "#ifdef __KERNEL__\n"
-       printf "#define __NR_syscalls\t%s\n" "${nxt}"
-       printf "#endif\n"
-       printf "\n"
-       printf "#endif /* %s */\n" "${fileguard}"
-) > "$out"
diff --git a/arch/sh/kernel/syscalls/syscalltbl.sh b/arch/sh/kernel/syscalls/syscalltbl.sh
deleted file mode 100644 (file)
index 904b8e6..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-my_abi="$4"
-offset="$5"
-
-emit() {
-       t_nxt="$1"
-       t_nr="$2"
-       t_entry="$3"
-
-       while [ $t_nxt -lt $t_nr ]; do
-               printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
-               t_nxt=$((t_nxt+1))
-       done
-       printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
-}
-
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-       nxt=0
-       if [ -z "$offset" ]; then
-               offset=0
-       fi
-
-       while read nr abi name entry ; do
-               emit $((nxt+offset)) $((nr+offset)) $entry
-               nxt=$((nr+1))
-       done
-) > "$out"
index 77aa2f8..d551a9c 100644 (file)
@@ -136,14 +136,6 @@ config ARCH_SPARSEMEM_DEFAULT
 config ARCH_SELECT_MEMORY_MODEL
        def_bool y
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-       depends on SPARSEMEM && MMU
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-       def_bool y
-       depends on SPARSEMEM && MMU
-
 config ARCH_MEMORY_PROBE
        def_bool y
        depends on MEMORY_HOTPLUG
index 220d7bc..999ab59 100644 (file)
@@ -21,7 +21,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
index 12a4fb0..1809909 100644 (file)
@@ -122,7 +122,6 @@ CONFIG_INPUT_SPARCSPKR=y
 # CONFIG_SERIO_SERPORT is not set
 CONFIG_SERIO_PCIPS2=m
 CONFIG_SERIO_RAW=m
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SUNSU=y
 CONFIG_SERIAL_SUNSU_CONSOLE=y
 CONFIG_SERIAL_SUNSAB=y
index aec2040..0b9d98c 100644 (file)
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 generated-y += syscall_table_32.h
 generated-y += syscall_table_64.h
-generated-y += syscall_table_c32.h
 generic-y += export.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
index 9fbfc95..5ffa820 100644 (file)
@@ -454,11 +454,6 @@ void sbus_set_sbus64(struct device *, int);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #endif
 
 #endif /* !(__SPARC64_IO_H) */
index 283f644..0f2ea5b 100644 (file)
@@ -6,46 +6,34 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')     \
          $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
 
 syscall := $(src)/syscall.tbl
-syshdr := $(srctree)/$(src)/syscallhdr.sh
-systbl := $(srctree)/$(src)/syscalltbl.sh
+syshdr := $(srctree)/scripts/syscallhdr.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
 
 quiet_cmd_syshdr = SYSHDR  $@
-      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@'       \
-                  '$(syshdr_abis_$(basetarget))'               \
-                  '$(syshdr_pfx_$(basetarget))'                \
-                  '$(syshdr_offset_$(basetarget))'
+      cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis $(abis) $< $@
 
 quiet_cmd_systbl = SYSTBL  $@
-      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@'       \
-                  '$(systbl_abis_$(basetarget))'               \
-                  '$(systbl_abi_$(basetarget))'                \
-                  '$(systbl_offset_$(basetarget))'
+      cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@
 
-syshdr_abis_unistd_32 := common,32
+$(uapi)/unistd_32.h: abis := common,32
 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
 
-syshdr_abis_unistd_64 := common,64
+$(uapi)/unistd_64.h: abis := common,64
 $(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
 
-systbl_abis_syscall_table_32 := common,32
+$(kapi)/syscall_table_32.h: abis := common,32
 $(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE
        $(call if_changed,systbl)
 
-systbl_abis_syscall_table_64 := common,64
+$(kapi)/syscall_table_64.h: abis := common,64
 $(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE
        $(call if_changed,systbl)
 
-systbl_abis_syscall_table_c32 := common,32
-systbl_abi_syscall_table_c32 := c32
-$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE
-       $(call if_changed,systbl)
-
 uapisyshdr-y           += unistd_32.h unistd_64.h
 kapisyshdr-y           += syscall_table_32.h           \
-                          syscall_table_64.h           \
-                          syscall_table_c32.h
+                          syscall_table_64.h
 
 uapisyshdr-y   := $(addprefix $(uapi)/, $(uapisyshdr-y))
 kapisyshdr-y   := $(addprefix $(kapi)/, $(kapisyshdr-y))
diff --git a/arch/sparc/kernel/syscalls/syscallhdr.sh b/arch/sparc/kernel/syscalls/syscallhdr.sh
deleted file mode 100644 (file)
index cf50a75..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-prefix="$4"
-offset="$5"
-
-fileguard=_UAPI_ASM_SPARC_`basename "$out" | sed \
-       -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
-       -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-       printf "#ifndef %s\n" "${fileguard}"
-       printf "#define %s\n" "${fileguard}"
-       printf "\n"
-
-       nxt=0
-       while read nr abi name entry compat ; do
-               if [ -z "$offset" ]; then
-                       printf "#define __NR_%s%s\t%s\n" \
-                               "${prefix}" "${name}" "${nr}"
-               else
-                       printf "#define __NR_%s%s\t(%s + %s)\n" \
-                               "${prefix}" "${name}" "${offset}" "${nr}"
-               fi
-               nxt=$((nr+1))
-       done
-
-       printf "\n"
-       printf "#ifdef __KERNEL__\n"
-       printf "#define __NR_syscalls\t%s\n" "${nxt}"
-       printf "#endif\n"
-       printf "\n"
-       printf "#endif /* %s */\n" "${fileguard}"
-) > "$out"
diff --git a/arch/sparc/kernel/syscalls/syscalltbl.sh b/arch/sparc/kernel/syscalls/syscalltbl.sh
deleted file mode 100644 (file)
index 77cf014..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-my_abi="$4"
-offset="$5"
-
-emit() {
-       t_nxt="$1"
-       t_nr="$2"
-       t_entry="$3"
-
-       while [ $t_nxt -lt $t_nr ]; do
-               printf "__SYSCALL(%s, sys_nis_syscall, )\n" "${t_nxt}"
-               t_nxt=$((t_nxt+1))
-       done
-       printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}"
-}
-
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-       nxt=0
-       if [ -z "$offset" ]; then
-               offset=0
-       fi
-
-       while read nr abi name entry compat ; do
-               if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then
-                       emit $((nxt+offset)) $((nr+offset)) $compat
-               else
-                       emit $((nxt+offset)) $((nr+offset)) $entry
-               fi
-               nxt=$((nr+1))
-       done
-) > "$out"
index ab9e4d5..3aaffa0 100644 (file)
@@ -9,10 +9,10 @@
  * Copyright (C) 1995 Adrian M. Rodriguez (adrian@remus.rutgers.edu)
  */
 
-#define __SYSCALL(nr, entry, nargs) .long entry
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, native)
+#define __SYSCALL(nr, entry) .long entry
        .data
        .align 4
        .globl sys_call_table
 sys_call_table:
 #include <asm/syscall_table_32.h>      /* 32-bit native syscalls */
-#undef __SYSCALL
index a27394b..398fe44 100644 (file)
  * Copyright (C) 1995 Adrian M. Rodriguez (adrian@remus.rutgers.edu)
  */
 
-#define __SYSCALL(nr, entry, nargs) .word entry
+#define __SYSCALL(nr, entry) .word entry
        .text
        .align  4
 #ifdef CONFIG_COMPAT
        .globl sys_call_table32
 sys_call_table32:
-#include <asm/syscall_table_c32.h>     /* Compat syscalls */
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, compat)
+#include <asm/syscall_table_32.h>      /* Compat syscalls */
+#undef __SYSCALL_WITH_COMPAT
 #endif /* CONFIG_COMPAT */
 
        .align  4
        .globl sys_call_table64, sys_call_table
 sys_call_table64:
 sys_call_table:
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, native)
 #include <asm/syscall_table_64.h>      /* 64-bit native syscalls */
-#undef __SYSCALL
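
    The __SYSCALL_WITH_COMPAT trick lets one generated header be expanded twice with
    different macro bindings, replacing the separate c32 table. A minimal userspace
    sketch of the idiom (all names hypothetical):

    #include <stdio.h>

    /* One shared table, expanded twice with different macro bindings --
     * the same idiom the generated syscall_table_32.h relies on. */
    #define SYSCALL_TABLE \
    	__SYSCALL_WITH_COMPAT(0, native_read, compat_read) \
    	__SYSCALL(1, native_write)

    typedef void (*syscall_fn)(void);
    static void native_read(void)  { puts("native_read"); }
    static void native_write(void) { puts("native_write"); }
    static void compat_read(void)  { puts("compat_read"); }

    /* Native table: compat entries fall back to the native handler. */
    #define __SYSCALL(nr, entry) entry,
    #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
    static syscall_fn table64[] = { SYSCALL_TABLE };
    #undef __SYSCALL_WITH_COMPAT

    /* Compat table: same list, but compat entries pick the compat handler. */
    #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat)
    static syscall_fn table32[] = { SYSCALL_TABLE };
    #undef __SYSCALL_WITH_COMPAT
    #undef __SYSCALL

    int main(void)
    {
    	table64[0]();	/* native_read */
    	table32[0]();	/* compat_read */
    	return 0;
    }
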
index ad4b42f..04d8790 100644 (file)
@@ -279,7 +279,7 @@ unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&p
 unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); }
 unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
index 103adac..9a67c01 100644 (file)
@@ -24,10 +24,3 @@ extern void cow_sizes(int version, __u64 size, int sectorsize, int align,
                      int *data_offset_out);
 
 #endif
-
-/*
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
index dac15f6..0045e1b 100644 (file)
@@ -60,7 +60,13 @@ config X86
        select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
        select ARCH_32BIT_OFF_T                 if X86_32
        select ARCH_CLOCKSOURCE_INIT
+       select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
+       select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
+       select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
+       select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE
+       select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
        select ARCH_HAS_ACPI_TABLE_UPGRADE      if ACPI
+       select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE        if !X86_PAE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
@@ -165,6 +171,7 @@ config X86
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
        select HAVE_ARCH_USERFAULTFD_WP         if X86_64 && USERFAULTFD
+       select HAVE_ARCH_USERFAULTFD_MINOR      if X86_64 && USERFAULTFD
        select HAVE_ARCH_VMAP_STACK             if X86_64
        select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
        select HAVE_ARCH_WITHIN_STACK_FRAMES
@@ -315,9 +322,6 @@ config GENERIC_CALIBRATE_DELAY
 config ARCH_HAS_CPU_RELAX
        def_bool y
 
-config ARCH_HAS_CACHE_LINE_SIZE
-       def_bool y
-
 config ARCH_HAS_FILTER_PGPROT
        def_bool y
 
@@ -2428,30 +2432,13 @@ config ARCH_HAS_ADD_PAGES
        def_bool y
        depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-       depends on X86_64 || (X86_32 && HIGHMEM)
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
        def_bool y
-       depends on MEMORY_HOTPLUG
 
 config USE_PERCPU_NUMA_NODE_ID
        def_bool y
        depends on NUMA
 
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-       def_bool y
-       depends on X86_64 || X86_PAE
-
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
-       def_bool y
-       depends on X86_64 && HUGETLB_PAGE && MIGRATION
-
-config ARCH_ENABLE_THP_MIGRATION
-       def_bool y
-       depends on X86_64 && TRANSPARENT_HUGEPAGE
-
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
index 1c1a7e4..913745f 100644 (file)
@@ -19,8 +19,6 @@
 #include "../perf_event.h"
 #include "iommu.h"
 
-#define COUNTER_SHIFT          16
-
 /* iommu pmu conf masks */
 #define GET_CSOURCE(x)     ((x)->conf & 0xFFULL)
 #define GET_DEVID(x)       (((x)->conf >> 8)  & 0xFFFFULL)
@@ -286,22 +284,31 @@ static void perf_iommu_start(struct perf_event *event, int flags)
        WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
        hwc->state = 0;
 
+       /*
+        * To account for power-gating, which prevents writes to
+        * the counter, we need to enable the counter
+        * before setting up the counter register.
+        */
+       perf_iommu_enable_event(event);
+
        if (flags & PERF_EF_RELOAD) {
-               u64 prev_raw_count = local64_read(&hwc->prev_count);
+               u64 count = 0;
                struct amd_iommu *iommu = perf_event_2_iommu(event);
 
+               /*
+                * Since the IOMMU PMU only supports counting mode,
+                * the counter always starts with value zero.
+                */
                amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
-                                    IOMMU_PC_COUNTER_REG, &prev_raw_count);
+                                    IOMMU_PC_COUNTER_REG, &count);
        }
 
-       perf_iommu_enable_event(event);
        perf_event_update_userpage(event);
-
 }
 
 static void perf_iommu_read(struct perf_event *event)
 {
-       u64 count, prev, delta;
+       u64 count;
        struct hw_perf_event *hwc = &event->hw;
        struct amd_iommu *iommu = perf_event_2_iommu(event);
 
@@ -312,14 +319,11 @@ static void perf_iommu_read(struct perf_event *event)
        /* IOMMU pc counter register is only 48 bits */
        count &= GENMASK_ULL(47, 0);
 
-       prev = local64_read(&hwc->prev_count);
-       if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev)
-               return;
-
-       /* Handle 48-bit counter overflow */
-       delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
-       delta >>= COUNTER_SHIFT;
-       local64_add(delta, &event->count);
+       /*
+        * Since the counter always starts with value zero,
+        * simply accumulate the count for the event.
+        */
+       local64_add(count, &event->count);
 }
 
 static void perf_iommu_stop(struct perf_event *event, int flags)
@@ -329,15 +333,16 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
        if (hwc->state & PERF_HES_UPTODATE)
                return;
 
+       /*
+        * To account for power-gating, in which reading the counter would
+        * return zero, we need to read the register before disabling.
+        */
+       perf_iommu_read(event);
+       hwc->state |= PERF_HES_UPTODATE;
+
        perf_iommu_disable_event(event);
        WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
        hwc->state |= PERF_HES_STOPPED;
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       perf_iommu_read(event);
-       hwc->state |= PERF_HES_UPTODATE;
 }
 
 static int perf_iommu_add(struct perf_event *event, int flags)
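
    Since the counter is now always programmed to zero on start, the read path
    reduces to mask-and-add, with no prev_count/delta bookkeeping. A rough
    userspace model of that path (register value illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define GENMASK_ULL(h, l) \
    	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

    int main(void)
    {
    	uint64_t event_count = 0;
    	/* Pretend the register read returned garbage in bits 63:48. */
    	uint64_t raw = 0xdead000000001234ULL;

    	uint64_t count = raw & GENMASK_ULL(47, 0);	/* 48-bit counter */
    	event_count += count;	/* counter started at zero, so just add */

    	printf("count=0x%llx total=%llu\n",
    	       (unsigned long long)count, (unsigned long long)event_count);
    	return 0;
    }
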
index 297fa12..84b8753 100644 (file)
@@ -7,18 +7,9 @@
 
 /*
  * Despite that some emulators terminate on UD2, we use it for WARN().
- *
- * Since various instruction decoders/specs disagree on the encoding of
- * UD0/UD1.
  */
-
-#define ASM_UD0                ".byte 0x0f, 0xff" /* + ModRM (for Intel) */
-#define ASM_UD1                ".byte 0x0f, 0xb9" /* + ModRM */
 #define ASM_UD2                ".byte 0x0f, 0x0b"
-
-#define INSN_UD0       0xff0f
 #define INSN_UD2       0x0b0f
-
 #define LEN_UD2                2
 
 #ifdef CONFIG_GENERIC_BUG
index e35e342..73d45b0 100644 (file)
@@ -588,6 +588,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC,  xenpv_exc_machine_check);
 #endif
 
 /* NMI */
+
+#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * Special NOIST entry point for VMX which invokes this on the kernel
+ * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
+ * 'executing' marker.
+ *
+ * On 32-bit this just uses the regular NMI entry point because 32-bit does
+ * not have ISTs.
+ */
+DECLARE_IDTENTRY(X86_TRAP_NMI,         exc_nmi_noist);
+#else
+#define asm_exc_nmi_noist              asm_exc_nmi
+#endif
+
 DECLARE_IDTENTRY_NMI(X86_TRAP_NMI,     exc_nmi);
 #ifdef CONFIG_XEN_PV
 DECLARE_IDTENTRY_RAW(X86_TRAP_NMI,     xenpv_exc_nmi);
index e16cccd..a3f87f1 100644 (file)
@@ -324,10 +324,6 @@ static inline int wrmsrl_safe(u32 msr, u64 val)
        return wrmsr_safe(msr, (u32)val,  (u32)(val >> 32));
 }
 
-#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
-
-#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
-
 struct msr *msrs_alloc(void);
 void msrs_free(struct msr *msrs);
 int msr_set_bit(u32 msr, u8 bit);
index 939b1cf..ca840fe 100644 (file)
@@ -56,6 +56,39 @@ static inline void clear_page(void *page)
 
 void copy_page(void *to, void *from);
 
+#ifdef CONFIG_X86_5LEVEL
+/*
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
+ */
+static inline unsigned long task_size_max(void)
+{
+       unsigned long ret;
+
+       alternative_io("movq %[small],%0","movq %[large],%0",
+                       X86_FEATURE_LA57,
+                       "=r" (ret),
+                       [small] "i" ((1ul << 47)-PAGE_SIZE),
+                       [large] "i" ((1ul << 56)-PAGE_SIZE));
+
+       return ret;
+}
+#endif /* CONFIG_X86_5LEVEL */
+
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
index 64297ea..a8d4ad8 100644 (file)
 
 #ifdef CONFIG_X86_5LEVEL
 #define __VIRTUAL_MASK_SHIFT   (pgtable_l5_enabled() ? 56 : 47)
+/* See task_size_max() in <asm/page_64.h> */
 #else
 #define __VIRTUAL_MASK_SHIFT   47
+#define task_size_max()                ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 #endif
 
-/*
- * User space process size.  This is the first address outside the user range.
- * There are a few constraints that determine this:
- *
- * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
- * address, then that syscall will enter the kernel with a
- * non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything
- * from being mapped at the maximum canonical address.
- *
- * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
- * CPUs malfunction if they execute code from the highest canonical page.
- * They'll speculate right off the end of the canonical space, and
- * bad things happen.  This is worked around in the same way as the
- * Intel problem.
- *
- * With page table isolation enabled, we map the LDT in ... [stay tuned]
- */
-#define TASK_SIZE_MAX  ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
-
+#define TASK_SIZE_MAX          task_size_max()
 #define DEFAULT_MAP_WINDOW     ((1UL << 47) - PAGE_SIZE)
 
 /* This decides where the kernel will search for a free chunk of vm
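
    For a quick sanity check of the two limits task_size_max() chooses between,
    assuming 4 KiB pages:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long page_size = 4096UL;	/* assumed PAGE_SIZE */
    	unsigned long la48 = (1UL << 47) - page_size;	/* 4-level paging */
    	unsigned long la57 = (1UL << 56) - page_size;	/* 5-level paging */

    	/* One page below the highest user-canonical address, to dodge the
    	 * SYSCALL/SYSRET and AMD last-canonical-page problems described
    	 * in the comment above. */
    	printf("LA48 TASK_SIZE_MAX = %#lx\n", la48);	/* 0x7ffffffff000 */
    	printf("LA57 TASK_SIZE_MAX = %#lx\n", la57);
    	return 0;
    }
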
index 6bdb69a..a1b756c 100644 (file)
@@ -1851,8 +1851,8 @@ static inline void setup_getcpu(int cpu)
        unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
        struct desc_struct d = { };
 
-       if (boot_cpu_has(X86_FEATURE_RDTSCP))
-               write_rdtscp_aux(cpudata);
+       if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
+               wrmsr(MSR_TSC_AUX, cpudata, 0);
 
        /* Store CPU and node number in limit. */
        d.limit0 = cpudata;
index dbeaa84..f07c10b 100644 (file)
@@ -84,7 +84,7 @@ unsigned int resctrl_cqm_threshold;
 static const struct mbm_correction_factor_table {
        u32 rmidthreshold;
        u64 cf;
-} mbm_cf_table[] __initdata = {
+} mbm_cf_table[] __initconst = {
        {7,     CF(1.000000)},
        {15,    CF(1.000000)},
        {15,    CF(0.969650)},
index bf250a3..2ef961c 100644 (file)
@@ -524,6 +524,16 @@ nmi_restart:
                mds_user_clear_cpu_buffers();
 }
 
+#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_noist)
+{
+       exc_nmi(regs);
+}
+#endif
+#if IS_MODULE(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
+#endif
+
 void stop_nmi(void)
 {
        ignore_nmis++;
index 43cbfc8..5e1f381 100644 (file)
@@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
 #endif
 
        /* Kernel thread ? */
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (unlikely(p->flags & PF_KTHREAD)) {
                memset(childregs, 0, sizeof(struct pt_regs));
                kthread_frame_init(frame, sp, arg);
                return 0;
@@ -172,6 +172,23 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
        task_user_gs(p) = get_user_gs(current_pt_regs());
 #endif
 
+       if (unlikely(p->flags & PF_IO_WORKER)) {
+               /*
+                * An IO thread is a user space thread, but it doesn't
+                * return to ret_after_fork().
+                *
+                * In order to indicate that to tools like gdb,
+                * we reset the stack and instruction pointers.
+                *
+                * It does the same kernel frame setup to return to a kernel
+                * function that a kernel thread does.
+                */
+               childregs->sp = 0;
+               childregs->ip = 0;
+               kthread_frame_init(frame, sp, arg);
+               return 0;
+       }
+
        /* Set a new TLS for the child thread? */
        if (clone_flags & CLONE_SETTLS)
                ret = set_new_tls(p, tls);
index 7ffb0cf..0ad5214 100644 (file)
@@ -1865,9 +1865,6 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
        return true;
 }
 
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-
 #define X86_MATCH(model)                                       \
        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
                INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
index 9790c73..b649f92 100644 (file)
@@ -3710,25 +3710,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long vmcb_pa = svm->current_vmcb->pa;
 
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
-
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
+       kvm_guest_enter_irqoff();
 
        if (sev_es_guest(vcpu->kvm)) {
                __svm_sev_es_vcpu_run(vmcb_pa);
@@ -3748,24 +3730,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
                vmload(__sme_page_pa(sd->save_area));
        }
 
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * guest_exit_irqoff() restores host context and reinstates RCU if
-        * enabled and required.
-        *
-        * This needs to be done before the below as native_read_msr()
-        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-        * into world and some more.
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       guest_exit_irqoff();
-
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
+       kvm_guest_exit_irqoff();
 }
 
 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
index cbe0cda..d000cdd 100644 (file)
@@ -36,6 +36,7 @@
 #include <asm/debugreg.h>
 #include <asm/desc.h>
 #include <asm/fpu/internal.h>
+#include <asm/idtentry.h>
 #include <asm/io.h>
 #include <asm/irq_remapping.h>
 #include <asm/kexec.h>
@@ -6415,18 +6416,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 
 void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
 
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+                                       unsigned long entry)
 {
-       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
-       gate_desc *desc = (gate_desc *)host_idt_base + vector;
-
        kvm_before_interrupt(vcpu);
-       vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+       vmx_do_interrupt_nmi_irqoff(entry);
        kvm_after_interrupt(vcpu);
 }
 
 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
+       const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
        u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
 
        /* if exit due to PF check for async PF */
@@ -6437,18 +6437,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
                kvm_machine_check();
        /* We need to handle NMIs before interrupts are enabled */
        else if (is_nmi(intr_info))
-               handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+               handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
 }
 
 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
        u32 intr_info = vmx_get_intr_info(vcpu);
+       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+       gate_desc *desc = (gate_desc *)host_idt_base + vector;
 
        if (WARN_ONCE(!is_external_intr(intr_info),
            "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                return;
 
-       handle_interrupt_nmi_irqoff(vcpu, intr_info);
+       handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
 }
 
 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6662,25 +6664,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                        struct vcpu_vmx *vmx)
 {
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
-
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
+       kvm_guest_enter_irqoff();
 
        /* L1D Flush includes CPU buffer clear to mitigate MDS */
        if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6696,24 +6680,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
        vcpu->arch.cr2 = native_read_cr2();
 
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * guest_exit_irqoff() restores host context and reinstates RCU if
-        * enabled and required.
-        *
-        * This needs to be done before the below as native_read_msr()
-        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-        * into world and some more.
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       guest_exit_irqoff();
-
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
+       kvm_guest_exit_irqoff();
 }
 
 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
index cebdaa1..6eda283 100644 (file)
@@ -9315,6 +9315,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        local_irq_disable();
        kvm_after_interrupt(vcpu);
 
+       /*
+        * Wait until after servicing IRQs to account guest time so that any
+        * ticks that occurred while running the guest are properly accounted
+        * to the guest.  Waiting until IRQs are enabled degrades the accuracy
+        * of accounting via context tracking, but the loss of accuracy is
+        * acceptable for all known use cases.
+        */
+       vtime_account_guest_exit();
+
        if (lapic_in_kernel(vcpu)) {
                s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
                if (delta != S64_MIN) {
index 8ddd381..521f74e 100644 (file)
@@ -8,6 +8,51 @@
 #include "kvm_cache_regs.h"
 #include "kvm_emulate.h"
 
+static __always_inline void kvm_guest_enter_irqoff(void)
+{
+       /*
+        * VMENTER enables interrupts (host state), but the kernel state is
+        * interrupts disabled when this is invoked. Also tell RCU about
+        * it. This is the same logic as for exit_to_user_mode().
+        *
+        * This ensures that e.g. latency analysis on the host observes
+        * guest mode as interrupt enabled.
+        *
+        * guest_enter_irqoff() informs context tracking about the
+        * transition to guest mode and if enabled adjusts RCU state
+        * accordingly.
+        */
+       instrumentation_begin();
+       trace_hardirqs_on_prepare();
+       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+       instrumentation_end();
+
+       guest_enter_irqoff();
+       lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+static __always_inline void kvm_guest_exit_irqoff(void)
+{
+       /*
+        * VMEXIT disables interrupts (host state), but tracing and lockdep
+        * have them in state 'on' as recorded before entering guest mode.
+        * Same as enter_from_user_mode().
+        *
+        * context_tracking_guest_exit() restores host context and reinstates
+        * RCU if enabled and required.
+        *
+        * This needs to be done immediately after VM-Exit, before any code
+        * that might contain tracepoints or call out to the greater world,
+        * e.g. before x86_spec_ctrl_restore_host().
+        */
+       lockdep_hardirqs_off(CALLER_ADDR0);
+       context_tracking_guest_exit();
+
+       instrumentation_begin();
+       trace_hardirqs_off_finish();
+       instrumentation_end();
+}
+
 #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)                \
 ({                                                                     \
        bool failed = (consistency_check);                              \
index 4279806..156cd23 100644 (file)
@@ -16,6 +16,8 @@
 #include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/libnvdimm.h>
+#include <linux/vmstat.h>
+#include <linux/kernel.h>
 
 #include <asm/e820/api.h>
 #include <asm/processor.h>
@@ -91,6 +93,12 @@ static void split_page_count(int level)
                return;
 
        direct_pages_count[level]--;
+       if (system_state == SYSTEM_RUNNING) {
+               if (level == PG_LEVEL_2M)
+                       count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
+               else if (level == PG_LEVEL_1G)
+                       count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
+       }
        direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
index ba9fee7..e9c8f06 100644 (file)
@@ -19,12 +19,8 @@ variant-y := $(patsubst "%",%,$(CONFIG_XTENSA_VARIANT_NAME))
 VARIANT = $(variant-y)
 export VARIANT
 
-# Test for cross compiling
-
 ifneq ($(VARIANT),)
-  COMPILE_ARCH = $(shell uname -m)
-
-  ifneq ($(COMPILE_ARCH), xtensa)
+  ifdef cross_compiling
     ifndef CROSS_COMPILE
       CROSS_COMPILE = xtensa_$(VARIANT)-
     endif
index 4f1ff95..062148e 100644 (file)
@@ -72,7 +72,6 @@ CONFIG_MARVELL_PHY=y
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
-CONFIG_DEVKMEM=y
 CONFIG_SERIAL_8250=y
 # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
 CONFIG_SERIAL_8250_CONSOLE=y
index 44205df..221dc56 100644 (file)
@@ -255,6 +255,13 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 }
 EXPORT_SYMBOL(bio_init);
 
+unsigned int bio_max_size(struct bio *bio)
+{
+       struct block_device *bdev = bio->bi_bdev;
+
+       return bdev ? bdev->bd_disk->queue->limits.bio_max_bytes : UINT_MAX;
+}
+
 /**
  * bio_reset - reinitialize a bio
  * @bio:       bio to reset
@@ -866,7 +873,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
                struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
                if (page_is_mergeable(bv, page, len, off, same_page)) {
-                       if (bio->bi_iter.bi_size > UINT_MAX - len) {
+                       if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
                                *same_page = false;
                                return false;
                        }
@@ -995,6 +1002,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
        unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
+       unsigned int bytes_left = bio_max_size(bio) - bio->bi_iter.bi_size;
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        struct page **pages = (struct page **)bv;
        bool same_page = false;
@@ -1010,7 +1018,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
        BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
        pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-       size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+       size = iov_iter_get_pages(iter, pages, bytes_left, nr_pages,
+                                 &offset);
        if (unlikely(size <= 0))
                return size ? size : -EFAULT;
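
    Note the merge check is written as size > limit - len rather than
    size + len > limit, so the comparison itself cannot overflow. A tiny
    demonstration (assumes len <= limit, as the kernel code does):

    #include <stdio.h>
    #include <limits.h>

    /* Returns 1 if adding 'len' bytes would push 'size' past 'limit'.
     * Written as size > limit - len so the comparison cannot itself
     * overflow, unlike the naive size + len > limit. */
    static int would_exceed(unsigned int size, unsigned int len,
    			unsigned int limit)
    {
    	return size > limit - len;
    }

    int main(void)
    {
    	unsigned int limit = UINT_MAX;	/* the default bio_max_size() */
    	printf("%d\n", would_exceed(UINT_MAX - 512, 4096, limit));	/* 1 */
    	printf("%d\n", would_exceed(4096, 4096, limit));		/* 0 */
    	return 0;
    }
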
 
index 9c00909..c6f80e3 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/gcd.h>
 #include <linux/lcm.h>
 #include <linux/jiffies.h>
@@ -31,6 +32,7 @@ EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
  */
 void blk_set_default_limits(struct queue_limits *lim)
 {
+       lim->bio_max_bytes = UINT_MAX;
        lim->max_segments = BLK_MAX_SEGMENTS;
        lim->max_discard_segments = 1;
        lim->max_integrity_segments = 0;
@@ -139,6 +141,10 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
                                 limits->logical_block_size >> SECTOR_SHIFT);
        limits->max_sectors = max_sectors;
 
+       if (check_shl_overflow(max_sectors, SECTOR_SHIFT,
+                               &limits->bio_max_bytes))
+               limits->bio_max_bytes = UINT_MAX;
+
        q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
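
    check_shl_overflow() clamps bio_max_bytes to UINT_MAX whenever
    max_sectors << SECTOR_SHIFT does not fit in 32 bits; a rough userspace
    equivalent of that clamp:

    #include <stdint.h>
    #include <stdio.h>

    #define SECTOR_SHIFT 9

    int main(void)
    {
    	uint32_t max_sectors = 16 * 1024 * 1024;	/* 8 GiB of sectors */
    	uint32_t bio_max_bytes;

    	/* Detect overflow of max_sectors << SECTOR_SHIFT in 32 bits by
    	 * doing the shift in 64 bits first, mirroring what
    	 * check_shl_overflow() reports. */
    	uint64_t bytes = (uint64_t)max_sectors << SECTOR_SHIFT;
    	bio_max_bytes = (bytes > UINT32_MAX) ? UINT32_MAX : (uint32_t)bytes;

    	printf("bio_max_bytes = %u\n", (unsigned)bio_max_bytes);
    	return 0;
    }
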
index 6cbd1f1..8c3763f 100644 (file)
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-x509_certificate_list
-x509_revocation_list
+/x509_certificate_list
+/x509_revocation_list
index b02fd51..8cc195c 100644 (file)
@@ -171,6 +171,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
        acpi_handle handle = mem_device->device->handle;
        int result, num_enabled = 0;
        struct acpi_memory_info *info;
+       mhp_t mhp_flags = MHP_NONE;
        int node;
 
        node = acpi_get_node(handle);
@@ -194,8 +195,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
                if (node < 0)
                        node = memory_add_physaddr_to_nid(info->start_addr);
 
+               if (mhp_supports_memmap_on_memory(info->length))
+                       mhp_flags |= MHP_MEMMAP_ON_MEMORY;
                result = __add_memory(node, info->start_addr, info->length,
-                                     MHP_NONE);
+                                     mhp_flags);
 
                /*
                 * If the memory block has been used by the kernel, add_memory()
index f2d0e59..0a0a982 100644 (file)
@@ -329,7 +329,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
                                        int index)
 {
        struct platform_device *pdev;
-       int irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags);
+       int irq;
 
        /*
         * According to SBSA specification the size of refresh and control
@@ -338,7 +338,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
        struct resource res[] = {
                DEFINE_RES_MEM(wd->control_frame_address, SZ_4K),
                DEFINE_RES_MEM(wd->refresh_frame_address, SZ_4K),
-               DEFINE_RES_IRQ(irq),
+               {},
        };
        int nr_res = ARRAY_SIZE(res);
 
@@ -348,10 +348,11 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
 
        if (!(wd->refresh_frame_address && wd->control_frame_address)) {
                pr_err(FW_BUG "failed to get the Watchdog base address.\n");
-               acpi_unregister_gsi(wd->timer_interrupt);
                return -EINVAL;
        }
 
+       irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags);
+       res[2] = (struct resource)DEFINE_RES_IRQ(irq);
        if (irq <= 0) {
                pr_warn("failed to map the Watchdog interrupt.\n");
                nr_res--;
@@ -364,7 +365,8 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
         */
        pdev = platform_device_register_simple("sbsa-gwdt", index, res, nr_res);
        if (IS_ERR(pdev)) {
-               acpi_unregister_gsi(wd->timer_interrupt);
+               if (irq > 0)
+                       acpi_unregister_gsi(wd->timer_interrupt);
                return PTR_ERR(pdev);
        }
 
index 443fdf6..d39a9b4 100644 (file)
@@ -42,6 +42,8 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf,
                                   sizeof(struct acpi_table_header)))
                        return -EFAULT;
                uncopied_bytes = max_size = table.length;
+               /* free any buffer left over from a previous write */
+               kfree(buf);
                buf = kzalloc(max_size, GFP_KERNEL);
                if (!buf)
                        return -ENOMEM;
@@ -55,6 +57,7 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf,
            (*ppos + count < count) ||
            (count > uncopied_bytes)) {
                kfree(buf);
+               buf = NULL;
                return -EINVAL;
        }
 
@@ -76,7 +79,6 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf,
                add_taint(TAINT_OVERRIDDEN_ACPI_TABLE, LOCKDEP_NOW_UNRELIABLE);
        }
 
-       kfree(buf);
        return count;
 }
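
    The change fixes a stale-pointer bug across write() calls on a static buffer:
    free any prior allocation before reallocating, and NULL the pointer on the
    error path so a later call cannot free it again. A boiled-down sketch of the
    pattern (names and sizes hypothetical):

    #include <stdlib.h>
    #include <string.h>

    static char *buf;	/* persists across calls, like the driver's buffer */

    static int cm_write_sketch(const char *data, size_t len, int start_new)
    {
    	if (start_new) {
    		free(buf);	/* drop a previous, incomplete write */
    		buf = calloc(1, 4096);
    		if (!buf)
    			return -1;
    	}
    	if (!buf)
    		return -1;
    	if (len > 4096) {
    		free(buf);
    		buf = NULL;	/* otherwise the next call double-frees */
    		return -1;
    	}
    	memcpy(buf, data, len);
    	return (int)len;
    }

    int main(void)
    {
    	cm_write_sketch("abc", 3, 1);
    	cm_write_sketch("def", 3, 1);	/* would leak or double-free without the fix */
    	return 0;
    }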
 
index f973bbe..b852cff 100644 (file)
@@ -142,7 +142,6 @@ int acpi_device_sleep_wake(struct acpi_device *dev,
 int acpi_power_get_inferred_state(struct acpi_device *device, int *state);
 int acpi_power_on_resources(struct acpi_device *device, int state);
 int acpi_power_transition(struct acpi_device *device, int state);
-void acpi_turn_off_unused_power_resources(void);
 
 /* --------------------------------------------------------------------------
                               Device Power Management
index e209081..c68e694 100644 (file)
@@ -75,8 +75,12 @@ void acpi_unregister_gsi(u32 gsi)
 {
        struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
                                                        DOMAIN_BUS_ANY);
-       int irq = irq_find_mapping(d, gsi);
+       int irq;
 
+       if (WARN_ON(acpi_irq_model == ACPI_IRQ_MODEL_GIC && gsi < 16))
+               return;
+
+       irq = irq_find_mapping(d, gsi);
        irq_dispose_mapping(irq);
 }
 EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
index 56102ea..32974b5 100644 (file)
@@ -995,7 +995,6 @@ void acpi_resume_power_resources(void)
 
        mutex_unlock(&power_resource_list_lock);
 }
-#endif
 
 void acpi_turn_off_unused_power_resources(void)
 {
@@ -1016,3 +1015,4 @@ void acpi_turn_off_unused_power_resources(void)
 
        mutex_unlock(&power_resource_list_lock);
 }
+#endif
index bc973fb..a22778e 100644 (file)
@@ -2359,8 +2359,6 @@ int __init acpi_scan_init(void)
                }
        }
 
-       acpi_turn_off_unused_power_resources();
-
        acpi_scan_initialized = true;
 
  out:
index 7fe41ee..1856f76 100644 (file)
@@ -8,6 +8,7 @@ extern struct list_head acpi_wakeup_device_list;
 extern struct mutex acpi_device_lock;
 
 extern void acpi_resume_power_resources(void);
+extern void acpi_turn_off_unused_power_resources(void);
 
 static inline acpi_status acpi_set_waking_vector(u32 wakeup_address)
 {
index 0ddd611..3bc3c31 100644 (file)
@@ -795,6 +795,7 @@ static void process_incoming (struct fs_dev *dev, struct queue *q)
                switch (STATUS_CODE (qe)) {
                case 0x1:
                        /* Fall through for streaming mode */
+                       fallthrough;
                case 0x2:/* Packet received OK.... */
                        if (atm_vcc) {
                                skb = pe->skb;
index ff5755e..eba04c0 100644 (file)
@@ -1737,10 +1737,3 @@ module_init(panel_init_module);
 module_exit(panel_cleanup_module);
 MODULE_AUTHOR("Willy Tarreau");
 MODULE_LICENSE("GPL");
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
index 7835509..4fdb821 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/kernel_read_file.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/initrd.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
 #include <linux/interrupt.h>
@@ -504,6 +505,7 @@ fw_get_filesystem_firmware(struct device *device, struct fw_priv *fw_priv,
        if (!path)
                return -ENOMEM;
 
+       wait_for_initramfs();
        for (i = 0; i < ARRAY_SIZE(fw_path); i++) {
                size_t file_size = 0;
                size_t *file_size_ptr = NULL;
index f352984..b31b3af 100644 (file)
@@ -169,30 +169,98 @@ int memory_notify(unsigned long val, void *v)
        return blocking_notifier_call_chain(&memory_chain, val, v);
 }
 
+static int memory_block_online(struct memory_block *mem)
+{
+       unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+       unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+       unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+       struct zone *zone;
+       int ret;
+
+       zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);
+
+       /*
+        * Although vmemmap pages have a different lifecycle than the pages
+        * they describe (they remain until the memory is unplugged), doing
+        * their initialization and accounting at memory onlining/offlining
+        * stage keeps the accounting easier to follow - e.g. vmemmap pages
+        * belong to the same zone as the memory they describe.
+        */
+       if (nr_vmemmap_pages) {
+               ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+               if (ret)
+                       return ret;
+       }
+
+       ret = online_pages(start_pfn + nr_vmemmap_pages,
+                          nr_pages - nr_vmemmap_pages, zone);
+       if (ret) {
+               if (nr_vmemmap_pages)
+                       mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+               return ret;
+       }
+
+       /*
+        * Account once onlining succeeded. If the zone was unpopulated, it is
+        * now already properly populated.
+        */
+       if (nr_vmemmap_pages)
+               adjust_present_page_count(zone, nr_vmemmap_pages);
+
+       return ret;
+}
+
+static int memory_block_offline(struct memory_block *mem)
+{
+       unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+       unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+       unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+       struct zone *zone;
+       int ret;
+
+       zone = page_zone(pfn_to_page(start_pfn));
+
+       /*
+        * Unaccount before offlining, such that unpopulated zone and kthreads
+        * can properly be torn down in offline_pages().
+        */
+       if (nr_vmemmap_pages)
+               adjust_present_page_count(zone, -nr_vmemmap_pages);
+
+       ret = offline_pages(start_pfn + nr_vmemmap_pages,
+                           nr_pages - nr_vmemmap_pages);
+       if (ret) {
+               /* offline_pages() failed. Account back. */
+               if (nr_vmemmap_pages)
+                       adjust_present_page_count(zone, nr_vmemmap_pages);
+               return ret;
+       }
+
+       if (nr_vmemmap_pages)
+               mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+
+       return ret;
+}
+
 /*
  * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(unsigned long start_section_nr, unsigned long action,
-                   int online_type, int nid)
+memory_block_action(struct memory_block *mem, unsigned long action)
 {
-       unsigned long start_pfn;
-       unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        int ret;
 
-       start_pfn = section_nr_to_pfn(start_section_nr);
-
        switch (action) {
        case MEM_ONLINE:
-               ret = online_pages(start_pfn, nr_pages, online_type, nid);
+               ret = memory_block_online(mem);
                break;
        case MEM_OFFLINE:
-               ret = offline_pages(start_pfn, nr_pages);
+               ret = memory_block_offline(mem);
                break;
        default:
                WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
-                    "%ld\n", __func__, start_section_nr, action, action);
+                    "%ld\n", __func__, mem->start_section_nr, action, action);
                ret = -EINVAL;
        }
 
@@ -210,9 +278,7 @@ static int memory_block_change_state(struct memory_block *mem,
        if (to_state == MEM_OFFLINE)
                mem->state = MEM_GOING_OFFLINE;
 
-       ret = memory_block_action(mem->start_section_nr, to_state,
-                                 mem->online_type, mem->nid);
-
+       ret = memory_block_action(mem, to_state);
        mem->state = ret ? from_state_req : to_state;
 
        return ret;
@@ -567,7 +633,8 @@ int register_memory(struct memory_block *memory)
        return ret;
 }
 
-static int init_memory_block(unsigned long block_id, unsigned long state)
+static int init_memory_block(unsigned long block_id, unsigned long state,
+                            unsigned long nr_vmemmap_pages)
 {
        struct memory_block *mem;
        int ret = 0;
@@ -584,6 +651,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state)
        mem->start_section_nr = block_id * sections_per_block;
        mem->state = state;
        mem->nid = NUMA_NO_NODE;
+       mem->nr_vmemmap_pages = nr_vmemmap_pages;
 
        ret = register_memory(mem);
 
@@ -603,7 +671,7 @@ static int add_memory_block(unsigned long base_section_nr)
        if (section_count == 0)
                return 0;
        return init_memory_block(memory_block_id(base_section_nr),
-                                MEM_ONLINE);
+                                MEM_ONLINE, 0);
 }
 
 static void unregister_memory(struct memory_block *memory)
@@ -625,7 +693,8 @@ static void unregister_memory(struct memory_block *memory)
  *
  * Called under device_hotplug_lock.
  */
-int create_memory_block_devices(unsigned long start, unsigned long size)
+int create_memory_block_devices(unsigned long start, unsigned long size,
+                               unsigned long vmemmap_pages)
 {
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
@@ -638,7 +707,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
                return -EINVAL;
 
        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-               ret = init_memory_block(block_id, MEM_OFFLINE);
+               ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
                if (ret)
                        break;
        }
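
    With MHP_MEMMAP_ON_MEMORY, the first nr_vmemmap_pages of a block hold the
    block's own memmap and only the remainder is onlined as usable memory. A toy
    calculation, assuming a 128 MiB block, 4 KiB pages and a 64-byte struct page:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long block_bytes = 128UL << 20;	/* one memory block */
    	unsigned long page_size = 4096UL;
    	unsigned long struct_page = 64UL;	/* assumed sizeof(struct page) */

    	unsigned long nr_pages = block_bytes / page_size;		/* 32768 */
    	unsigned long memmap_bytes = nr_pages * struct_page;		/* 2 MiB */
    	unsigned long nr_vmemmap_pages = memmap_bytes / page_size;	/* 512 */

    	printf("online %lu of %lu pages (%lu reserved for vmemmap)\n",
    	       nr_pages - nr_vmemmap_pages, nr_pages, nr_vmemmap_pages);
    	return 0;
    }
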
index 6e622c1..7562cf3 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/highmem.h>
 #include <linux/mutex.h>
+#include <linux/pagemap.h>
 #include <linux/radix-tree.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
index a370cde..d58d68f 100644 (file)
@@ -53,6 +53,7 @@
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
index c01786a..c604a40 100644 (file)
@@ -88,7 +88,7 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
        dev->discard_alignment      = le32_to_cpu(rsp->discard_alignment);
        dev->secure_discard         = le16_to_cpu(rsp->secure_discard);
        dev->rotational             = rsp->rotational;
-       dev->wc                     = !!(rsp->cache_policy & RNBD_WRITEBACK);
+       dev->wc                     = !!(rsp->cache_policy & RNBD_WRITEBACK);
        dev->fua                    = !!(rsp->cache_policy & RNBD_FUA);
 
        dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
@@ -241,7 +241,7 @@ static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess)
             cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) {
                if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags))
                        continue;
-               if (unlikely(!test_bit(cpu_q->cpu, sess->cpu_queues_bm)))
+               if (!test_bit(cpu_q->cpu, sess->cpu_queues_bm))
                        goto unlock;
                q = list_first_entry_or_null(&cpu_q->requeue_list,
                                             typeof(*q), requeue_list);
@@ -320,7 +320,7 @@ static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess,
        struct rtrs_permit *permit;
 
        permit = rtrs_clt_get_permit(sess->rtrs, con_type, wait);
-       if (likely(permit))
+       if (permit)
                /* We have a subtle rare case here, when all permits can be
                 * consumed before busy counter increased.  This is safe,
                 * because loser will get NULL as a permit, observe 0 busy
@@ -351,12 +351,11 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
        struct rtrs_permit *permit;
 
        iu = kzalloc(sizeof(*iu), GFP_KERNEL);
-       if (!iu) {
+       if (!iu)
                return NULL;
-       }
 
        permit = rnbd_get_permit(sess, con_type, wait);
-       if (unlikely(!permit)) {
+       if (!permit) {
                kfree(iu);
                return NULL;
        }
@@ -692,7 +691,11 @@ static void remap_devs(struct rnbd_clt_session *sess)
                return;
        }
 
-       rtrs_clt_query(sess->rtrs, &attrs);
+       err = rtrs_clt_query(sess->rtrs, &attrs);
+       if (err) {
+               pr_err("rtrs_clt_query(\"%s\"): %d\n", sess->sessname, err);
+               return;
+       }
        mutex_lock(&sess->lock);
        sess->max_io_size = attrs.max_io_size;
 
@@ -805,7 +808,7 @@ static struct rnbd_clt_session *alloc_sess(const char *sessname)
        mutex_init(&sess->lock);
        INIT_LIST_HEAD(&sess->devs_list);
        INIT_LIST_HEAD(&sess->list);
-       bitmap_zero(sess->cpu_queues_bm, NR_CPUS);
+       bitmap_zero(sess->cpu_queues_bm, num_possible_cpus());
        init_waitqueue_head(&sess->rtrs_waitq);
        refcount_set(&sess->refcount, 1);
 
@@ -1047,7 +1050,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
        };
        err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit,
                               &vec, 1, size, iu->sgt.sgl, sg_cnt);
-       if (unlikely(err)) {
+       if (err) {
                rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n",
                                 err);
                return err;
@@ -1078,7 +1081,7 @@ static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev,
        cpu_q = get_cpu_ptr(sess->cpu_queues);
        spin_lock_irqsave(&cpu_q->requeue_lock, flags);
 
-       if (likely(!test_and_set_bit_lock(0, &q->in_list))) {
+       if (!test_and_set_bit_lock(0, &q->in_list)) {
                if (WARN_ON(!list_empty(&q->requeue_list)))
                        goto unlock;
 
@@ -1090,7 +1093,7 @@ static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev,
                         */
                        smp_mb__before_atomic();
                }
-               if (likely(atomic_read(&sess->busy))) {
+               if (atomic_read(&sess->busy)) {
                        list_add_tail(&q->requeue_list, &cpu_q->requeue_list);
                } else {
                        /* Very unlikely, but possible: busy counter was
@@ -1118,7 +1121,7 @@ static void rnbd_clt_dev_kick_mq_queue(struct rnbd_clt_dev *dev,
 
        if (delay != RNBD_DELAY_IFBUSY)
                blk_mq_delay_run_hw_queue(hctx, delay);
-       else if (unlikely(!rnbd_clt_dev_add_to_requeue(dev, q)))
+       else if (!rnbd_clt_dev_add_to_requeue(dev, q))
                /*
                 * If session is not busy we have to restart
                 * the queue ourselves.
@@ -1135,12 +1138,12 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        int err;
        blk_status_t ret = BLK_STS_IOERR;
 
-       if (unlikely(dev->dev_state != DEV_STATE_MAPPED))
+       if (dev->dev_state != DEV_STATE_MAPPED)
                return BLK_STS_IOERR;
 
        iu->permit = rnbd_get_permit(dev->sess, RTRS_IO_CON,
                                      RTRS_PERMIT_NOWAIT);
-       if (unlikely(!iu->permit)) {
+       if (!iu->permit) {
                rnbd_clt_dev_kick_mq_queue(dev, hctx, RNBD_DELAY_IFBUSY);
                return BLK_STS_RESOURCE;
        }
@@ -1148,7 +1151,8 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        iu->sgt.sgl = iu->first_sgl;
        err = sg_alloc_table_chained(&iu->sgt,
                                     /* Even if the request has no segment,
-                                     * sglist must have one entry at least */
+                                     * the sglist must have at least one entry.
+                                     */
                                     blk_rq_nr_phys_segments(rq) ? : 1,
                                     iu->sgt.sgl,
                                     RNBD_INLINE_SG_CNT);
@@ -1161,9 +1165,9 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 
        blk_mq_start_request(rq);
        err = rnbd_client_xfer_request(dev, rq, iu);
-       if (likely(err == 0))
+       if (err == 0)
                return BLK_STS_OK;
-       if (unlikely(err == -EAGAIN || err == -ENOMEM)) {
+       if (err == -EAGAIN || err == -ENOMEM) {
                rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
                ret = BLK_STS_RESOURCE;
        }
@@ -1294,7 +1298,11 @@ find_and_get_or_create_sess(const char *sessname,
                err = PTR_ERR(sess->rtrs);
                goto wake_up_and_put;
        }
-       rtrs_clt_query(sess->rtrs, &attrs);
+
+       err = rtrs_clt_query(sess->rtrs, &attrs);
+       if (err)
+               goto close_rtrs;
+
        sess->max_io_size = attrs.max_io_size;
        sess->queue_depth = attrs.queue_depth;
        sess->nr_poll_queues = nr_poll_queues;
@@ -1576,7 +1584,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
        struct rnbd_clt_dev *dev;
        int ret;
 
-       if (unlikely(exists_devpath(pathname, sessname)))
+       if (exists_devpath(pathname, sessname))
                return ERR_PTR(-EEXIST);
 
        sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr, nr_poll_queues);
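
The two rnbd_clt.c hunks above stop ignoring the return value of rtrs_clt_query(). As a minimal sketch of the resulting pattern, assuming the int-returning signature these hunks rely on (the caller name here is illustrative):

/* Sketch only: mirrors the error handling added above. */
static int demo_refresh_sess_attrs(struct rnbd_clt_session *sess)
{
	struct rtrs_attrs attrs;
	int err;

	err = rtrs_clt_query(sess->rtrs, &attrs);
	if (err) {
		pr_err("rtrs_clt_query(\"%s\"): %d\n", sess->sessname, err);
		return err;	/* do not consume stale attrs */
	}

	sess->max_io_size = attrs.max_io_size;
	sess->queue_depth = attrs.queue_depth;
	return 0;
}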
index 451e738..b5322c5 100644 (file)
@@ -87,7 +87,7 @@ struct rnbd_clt_session {
        DECLARE_BITMAP(cpu_queues_bm, NR_CPUS);
        int     __percpu        *cpu_rr; /* per-cpu var for CPU round-robin */
        atomic_t                busy;
-       int                     queue_depth;
+       size_t                  queue_depth;
        u32                     max_io_size;
        struct blk_mq_tag_set   tag_set;
        u32                     nr_poll_queues;
index 899dd9d..aafecfe 100644 (file)
@@ -104,7 +104,7 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess)
 
        rcu_read_lock();
        sess_dev = xa_load(&srv_sess->index_idr, dev_id);
-       if (likely(sess_dev))
+       if (sess_dev)
                ret = kref_get_unless_zero(&sess_dev->kref);
        rcu_read_unlock();
 
index d229a2d..b151e0f 100644 (file)
@@ -334,16 +334,6 @@ config DEVMEM
          memory.
          When in doubt, say "Y".
 
-config DEVKMEM
-       bool "/dev/kmem virtual device support"
-       # On arm64, VMALLOC_START < PAGE_OFFSET, which confuses kmem read/write
-       depends on !ARM64
-       help
-         Say Y here if you want to support the /dev/kmem device. The
-         /dev/kmem device is rarely used, but can be used for certain
-         kind of kernel debugging operations.
-         When in doubt, say "N".
-
 config NVRAM
        tristate "/dev/nvram support"
        depends on X86 || HAVE_ARCH_NVRAM_OPS
index 869b9f5..15dc54f 100644 (file)
@@ -403,221 +403,6 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
 
-static int mmap_kmem(struct file *file, struct vm_area_struct *vma)
-{
-       unsigned long pfn;
-
-       /* Turn a kernel-virtual address into a physical page frame */
-       pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT;
-
-       /*
-        * RED-PEN: on some architectures there is more mapped memory than
-        * available in mem_map which pfn_valid checks for. Perhaps should add a
-        * new macro here.
-        *
-        * RED-PEN: vmalloc is not supported right now.
-        */
-       if (!pfn_valid(pfn))
-               return -EIO;
-
-       vma->vm_pgoff = pfn;
-       return mmap_mem(file, vma);
-}
-
-/*
- * This function reads the *virtual* memory as seen by the kernel.
- */
-static ssize_t read_kmem(struct file *file, char __user *buf,
-                        size_t count, loff_t *ppos)
-{
-       unsigned long p = *ppos;
-       ssize_t low_count, read, sz;
-       char *kbuf; /* k-addr because vread() takes vmlist_lock rwlock */
-       int err = 0;
-
-       read = 0;
-       if (p < (unsigned long) high_memory) {
-               low_count = count;
-               if (count > (unsigned long)high_memory - p)
-                       low_count = (unsigned long)high_memory - p;
-
-#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED
-               /* we don't have page 0 mapped on sparc and m68k.. */
-               if (p < PAGE_SIZE && low_count > 0) {
-                       sz = size_inside_page(p, low_count);
-                       if (clear_user(buf, sz))
-                               return -EFAULT;
-                       buf += sz;
-                       p += sz;
-                       read += sz;
-                       low_count -= sz;
-                       count -= sz;
-               }
-#endif
-               while (low_count > 0) {
-                       sz = size_inside_page(p, low_count);
-
-                       /*
-                        * On ia64 if a page has been mapped somewhere as
-                        * uncached, then it must also be accessed uncached
-                        * by the kernel or data corruption may occur
-                        */
-                       kbuf = xlate_dev_kmem_ptr((void *)p);
-                       if (!virt_addr_valid(kbuf))
-                               return -ENXIO;
-
-                       if (copy_to_user(buf, kbuf, sz))
-                               return -EFAULT;
-                       buf += sz;
-                       p += sz;
-                       read += sz;
-                       low_count -= sz;
-                       count -= sz;
-                       if (should_stop_iteration()) {
-                               count = 0;
-                               break;
-                       }
-               }
-       }
-
-       if (count > 0) {
-               kbuf = (char *)__get_free_page(GFP_KERNEL);
-               if (!kbuf)
-                       return -ENOMEM;
-               while (count > 0) {
-                       sz = size_inside_page(p, count);
-                       if (!is_vmalloc_or_module_addr((void *)p)) {
-                               err = -ENXIO;
-                               break;
-                       }
-                       sz = vread(kbuf, (char *)p, sz);
-                       if (!sz)
-                               break;
-                       if (copy_to_user(buf, kbuf, sz)) {
-                               err = -EFAULT;
-                               break;
-                       }
-                       count -= sz;
-                       buf += sz;
-                       read += sz;
-                       p += sz;
-                       if (should_stop_iteration())
-                               break;
-               }
-               free_page((unsigned long)kbuf);
-       }
-       *ppos = p;
-       return read ? read : err;
-}
-
-
-static ssize_t do_write_kmem(unsigned long p, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       ssize_t written, sz;
-       unsigned long copied;
-
-       written = 0;
-#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED
-       /* we don't have page 0 mapped on sparc and m68k.. */
-       if (p < PAGE_SIZE) {
-               sz = size_inside_page(p, count);
-               /* Hmm. Do something? */
-               buf += sz;
-               p += sz;
-               count -= sz;
-               written += sz;
-       }
-#endif
-
-       while (count > 0) {
-               void *ptr;
-
-               sz = size_inside_page(p, count);
-
-               /*
-                * On ia64 if a page has been mapped somewhere as uncached, then
-                * it must also be accessed uncached by the kernel or data
-                * corruption may occur.
-                */
-               ptr = xlate_dev_kmem_ptr((void *)p);
-               if (!virt_addr_valid(ptr))
-                       return -ENXIO;
-
-               copied = copy_from_user(ptr, buf, sz);
-               if (copied) {
-                       written += sz - copied;
-                       if (written)
-                               break;
-                       return -EFAULT;
-               }
-               buf += sz;
-               p += sz;
-               count -= sz;
-               written += sz;
-               if (should_stop_iteration())
-                       break;
-       }
-
-       *ppos += written;
-       return written;
-}
-
-/*
- * This function writes to the *virtual* memory as seen by the kernel.
- */
-static ssize_t write_kmem(struct file *file, const char __user *buf,
-                         size_t count, loff_t *ppos)
-{
-       unsigned long p = *ppos;
-       ssize_t wrote = 0;
-       ssize_t virtr = 0;
-       char *kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */
-       int err = 0;
-
-       if (p < (unsigned long) high_memory) {
-               unsigned long to_write = min_t(unsigned long, count,
-                                              (unsigned long)high_memory - p);
-               wrote = do_write_kmem(p, buf, to_write, ppos);
-               if (wrote != to_write)
-                       return wrote;
-               p += wrote;
-               buf += wrote;
-               count -= wrote;
-       }
-
-       if (count > 0) {
-               kbuf = (char *)__get_free_page(GFP_KERNEL);
-               if (!kbuf)
-                       return wrote ? wrote : -ENOMEM;
-               while (count > 0) {
-                       unsigned long sz = size_inside_page(p, count);
-                       unsigned long n;
-
-                       if (!is_vmalloc_or_module_addr((void *)p)) {
-                               err = -ENXIO;
-                               break;
-                       }
-                       n = copy_from_user(kbuf, buf, sz);
-                       if (n) {
-                               err = -EFAULT;
-                               break;
-                       }
-                       vwrite(kbuf, (char *)p, sz);
-                       count -= sz;
-                       buf += sz;
-                       virtr += sz;
-                       p += sz;
-                       if (should_stop_iteration())
-                               break;
-               }
-               free_page((unsigned long)kbuf);
-       }
-
-       *ppos = p;
-       return virtr + wrote ? : err;
-}
-
 static ssize_t read_port(struct file *file, char __user *buf,
                         size_t count, loff_t *ppos)
 {
@@ -855,7 +640,6 @@ static int open_port(struct inode *inode, struct file *filp)
 #define write_zero     write_null
 #define write_iter_zero        write_iter_null
 #define open_mem       open_port
-#define open_kmem      open_mem
 
 static const struct file_operations __maybe_unused mem_fops = {
        .llseek         = memory_lseek,
@@ -869,18 +653,6 @@ static const struct file_operations __maybe_unused mem_fops = {
 #endif
 };
 
-static const struct file_operations __maybe_unused kmem_fops = {
-       .llseek         = memory_lseek,
-       .read           = read_kmem,
-       .write          = write_kmem,
-       .mmap           = mmap_kmem,
-       .open           = open_kmem,
-#ifndef CONFIG_MMU
-       .get_unmapped_area = get_unmapped_area_mem,
-       .mmap_capabilities = memory_mmap_capabilities,
-#endif
-};
-
 static const struct file_operations null_fops = {
        .llseek         = null_lseek,
        .read           = read_null,
@@ -925,9 +697,6 @@ static const struct memdev {
 #ifdef CONFIG_DEVMEM
         [DEVMEM_MINOR] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET },
 #endif
-#ifdef CONFIG_DEVKMEM
-        [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET },
-#endif
         [3] = { "null", 0666, &null_fops, 0 },
 #ifdef CONFIG_DEVPORT
         [4] = { "port", 0, &port_fops, 0 },
index f5bd0dc..3c1c5da 100644 (file)
@@ -139,7 +139,7 @@ static int psci_to_linux_errno(int errno)
                return -EINVAL;
        case PSCI_RET_DENIED:
                return -EPERM;
-       };
+       }
 
        return -EINVAL;
 }
index a560468..6a2dee8 100644 (file)
@@ -3474,7 +3474,18 @@ intel_dp_check_mst_status(struct intel_dp *intel_dp)
        drm_WARN_ON_ONCE(&i915->drm, intel_dp->active_mst_links < 0);
 
        for (;;) {
-               u8 esi[DP_DPRX_ESI_LEN] = {};
+               /*
+                * The +2 is because DP_DPRX_ESI_LEN is 14, but we then
+                * pass in "esi+10" to drm_dp_channel_eq_ok(), which
+                * takes a 6-byte array. So we actually need 16 bytes
+                * here.
+                *
+                * Somebody who knows what the limits actually are
+                * should check this, but for now this is at least
+                * harmless and avoids a valid compiler warning about
+                * using more of the array than we have allocated.
+                */
+               u8 esi[DP_DPRX_ESI_LEN+2] = {};
                bool handled;
                int retry;
 
index 1864467..6754f57 100644 (file)
@@ -1,4 +1,3 @@
-/* vim: set ts=8 sw=8 tw=78 ai noexpandtab */
 /* qxl_drv.c -- QXL driver -*- linux-c -*-
  *
  * Copyright 2011 Red Hat, Inc.
index f8e9b73..e2e12a5 100644 (file)
@@ -2535,7 +2535,7 @@ int i3c_master_register(struct i3c_master_controller *master,
 
        ret = i3c_master_bus_init(master);
        if (ret)
-               goto err_destroy_wq;
+               goto err_put_dev;
 
        ret = device_add(&master->dev);
        if (ret)
@@ -2566,9 +2566,6 @@ err_del_dev:
 err_cleanup_bus:
        i3c_master_bus_cleanup(master);
 
-err_destroy_wq:
-       destroy_workqueue(master->wq);
-
 err_put_dev:
        put_device(&master->dev);
 
index 8d99069..1f6ba42 100644 (file)
@@ -1124,7 +1124,6 @@ static int svc_i3c_master_send_direct_ccc_cmd(struct svc_i3c_master *master,
        cmd->in = NULL;
        cmd->out = &ccc->id;
        cmd->len = 1;
-       cmd->read_len = xfer_len;
        cmd->read_len = 0;
        cmd->continued = true;
 
index 40f4383..0a794d7 100644 (file)
@@ -2976,7 +2976,8 @@ EXPORT_SYMBOL(rtrs_clt_request);
 
 int rtrs_clt_rdma_cq_direct(struct rtrs_clt *clt, unsigned int index)
 {
-       int cnt;
+       /* If there is no path, return -1 so the block layer does not retry */
+       int cnt = -1;
        struct rtrs_con *con;
        struct rtrs_clt_sess *sess;
        struct path_it it;
index d8f5310..037cc59 100644 (file)
@@ -7,6 +7,7 @@
 
 obj-$(CONFIG_INPUT)            += input-core.o
 input-core-y := input.o input-compat.o input-mt.o input-poller.o ff-core.o
+input-core-y += touchscreen.o
 
 obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o
 obj-$(CONFIG_INPUT_SPARSEKMAP) += sparse-keymap.o
index 9f0d07d..d69d765 100644 (file)
@@ -268,6 +268,7 @@ static const struct xpad_device {
        { 0x1689, 0xfd00, "Razer Onza Tournament Edition", 0, XTYPE_XBOX360 },
        { 0x1689, 0xfd01, "Razer Onza Classic Edition", 0, XTYPE_XBOX360 },
        { 0x1689, 0xfe00, "Razer Sabertooth", 0, XTYPE_XBOX360 },
+       { 0x1949, 0x041a, "Amazon Game Controller", 0, XTYPE_XBOX360 },
        { 0x1bad, 0x0002, "Harmonix Rock Band Guitar", 0, XTYPE_XBOX360 },
        { 0x1bad, 0x0003, "Harmonix Rock Band Drumkit", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 },
        { 0x1bad, 0x0130, "Ion Drum Rocker", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 },
@@ -440,6 +441,7 @@ static const struct usb_device_id xpad_table[] = {
        XPAD_XBOX360_VENDOR(0x15e4),            /* Numark X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x162e),            /* Joytech X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x1689),            /* Razer Onza */
+       XPAD_XBOX360_VENDOR(0x1949),            /* Amazon controllers */
        XPAD_XBOX360_VENDOR(0x1bad),            /* Harmonix Rock Band Guitar and Drums */
        XPAD_XBOX360_VENDOR(0x20d6),            /* PowerA Controllers */
        XPAD_XBOXONE_VENDOR(0x20d6),            /* PowerA Controllers */
index 77bac4d..8dbf1e6 100644 (file)
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 
+#include <linux/hrtimer.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/interrupt.h>
@@ -36,10 +37,11 @@ struct gpio_button_data {
 
        unsigned short *code;
 
-       struct timer_list release_timer;
+       struct hrtimer release_timer;
        unsigned int release_delay;     /* in msecs, for IRQ-only buttons */
 
        struct delayed_work work;
+       struct hrtimer debounce_timer;
        unsigned int software_debounce; /* in msecs, for GPIO-driven buttons */
 
        unsigned int irq;
@@ -48,6 +50,7 @@ struct gpio_button_data {
        bool disabled;
        bool key_pressed;
        bool suspended;
+       bool debounce_use_hrtimer;
 };
 
 struct gpio_keys_drvdata {
@@ -122,6 +125,18 @@ static const unsigned long *get_bm_events_by_type(struct input_dev *dev,
        return (type == EV_KEY) ? dev->keybit : dev->swbit;
 }
 
+static void gpio_keys_quiesce_key(void *data)
+{
+       struct gpio_button_data *bdata = data;
+
+       if (!bdata->gpiod)
+               hrtimer_cancel(&bdata->release_timer);
+       if (bdata->debounce_use_hrtimer)
+               hrtimer_cancel(&bdata->debounce_timer);
+       else
+               cancel_delayed_work_sync(&bdata->work);
+}
+
 /**
  * gpio_keys_disable_button() - disables given GPIO button
  * @bdata: button data for button to be disabled
@@ -142,12 +157,7 @@ static void gpio_keys_disable_button(struct gpio_button_data *bdata)
                 * Disable IRQ and associated timer/work structure.
                 */
                disable_irq(bdata->irq);
-
-               if (bdata->gpiod)
-                       cancel_delayed_work_sync(&bdata->work);
-               else
-                       del_timer_sync(&bdata->release_timer);
-
+               gpio_keys_quiesce_key(bdata);
                bdata->disabled = true;
        }
 }
@@ -360,7 +370,9 @@ static void gpio_keys_gpio_report_event(struct gpio_button_data *bdata)
        unsigned int type = button->type ?: EV_KEY;
        int state;
 
-       state = gpiod_get_value_cansleep(bdata->gpiod);
+       state = bdata->debounce_use_hrtimer ?
+                       gpiod_get_value(bdata->gpiod) :
+                       gpiod_get_value_cansleep(bdata->gpiod);
        if (state < 0) {
                dev_err(input->dev.parent,
                        "failed to get gpio state: %d\n", state);
@@ -373,7 +385,15 @@ static void gpio_keys_gpio_report_event(struct gpio_button_data *bdata)
        } else {
                input_event(input, type, *bdata->code, state);
        }
-       input_sync(input);
+}
+
+static void gpio_keys_debounce_event(struct gpio_button_data *bdata)
+{
+       gpio_keys_gpio_report_event(bdata);
+       input_sync(bdata->input);
+
+       if (bdata->button->wakeup)
+               pm_relax(bdata->input->dev.parent);
 }
 
 static void gpio_keys_gpio_work_func(struct work_struct *work)
@@ -381,10 +401,17 @@ static void gpio_keys_gpio_work_func(struct work_struct *work)
        struct gpio_button_data *bdata =
                container_of(work, struct gpio_button_data, work.work);
 
-       gpio_keys_gpio_report_event(bdata);
+       gpio_keys_debounce_event(bdata);
+}
 
-       if (bdata->button->wakeup)
-               pm_relax(bdata->input->dev.parent);
+static enum hrtimer_restart gpio_keys_debounce_timer(struct hrtimer *t)
+{
+       struct gpio_button_data *bdata =
+               container_of(t, struct gpio_button_data, debounce_timer);
+
+       gpio_keys_debounce_event(bdata);
+
+       return HRTIMER_NORESTART;
 }
 
 static irqreturn_t gpio_keys_gpio_isr(int irq, void *dev_id)
@@ -408,26 +435,33 @@ static irqreturn_t gpio_keys_gpio_isr(int irq, void *dev_id)
                }
        }
 
-       mod_delayed_work(system_wq,
-                        &bdata->work,
-                        msecs_to_jiffies(bdata->software_debounce));
+       if (bdata->debounce_use_hrtimer) {
+               hrtimer_start(&bdata->debounce_timer,
+                             ms_to_ktime(bdata->software_debounce),
+                             HRTIMER_MODE_REL);
+       } else {
+               mod_delayed_work(system_wq,
+                                &bdata->work,
+                                msecs_to_jiffies(bdata->software_debounce));
+       }
 
        return IRQ_HANDLED;
 }
 
-static void gpio_keys_irq_timer(struct timer_list *t)
+static enum hrtimer_restart gpio_keys_irq_timer(struct hrtimer *t)
 {
-       struct gpio_button_data *bdata = from_timer(bdata, t, release_timer);
+       struct gpio_button_data *bdata = container_of(t,
+                                                     struct gpio_button_data,
+                                                     release_timer);
        struct input_dev *input = bdata->input;
-       unsigned long flags;
 
-       spin_lock_irqsave(&bdata->lock, flags);
        if (bdata->key_pressed) {
                input_event(input, EV_KEY, *bdata->code, 0);
                input_sync(input);
                bdata->key_pressed = false;
        }
-       spin_unlock_irqrestore(&bdata->lock, flags);
+
+       return HRTIMER_NORESTART;
 }
 
 static irqreturn_t gpio_keys_irq_isr(int irq, void *dev_id)
@@ -457,23 +491,14 @@ static irqreturn_t gpio_keys_irq_isr(int irq, void *dev_id)
        }
 
        if (bdata->release_delay)
-               mod_timer(&bdata->release_timer,
-                       jiffies + msecs_to_jiffies(bdata->release_delay));
+               hrtimer_start(&bdata->release_timer,
+                             ms_to_ktime(bdata->release_delay),
+                             HRTIMER_MODE_REL_HARD);
 out:
        spin_unlock_irqrestore(&bdata->lock, flags);
        return IRQ_HANDLED;
 }
 
-static void gpio_keys_quiesce_key(void *data)
-{
-       struct gpio_button_data *bdata = data;
-
-       if (bdata->gpiod)
-               cancel_delayed_work_sync(&bdata->work);
-       else
-               del_timer_sync(&bdata->release_timer);
-}
-
 static int gpio_keys_setup_key(struct platform_device *pdev,
                                struct input_dev *input,
                                struct gpio_keys_drvdata *ddata,
@@ -543,6 +568,14 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
                        if (error < 0)
                                bdata->software_debounce =
                                                button->debounce_interval;
+
+                       /*
+                        * If reading the GPIO won't sleep, we can use a
+                        * hrtimer instead of a standard timer for the software
+                        * debounce, to reduce the latency as much as possible.
+                        */
+                       bdata->debounce_use_hrtimer =
+                                       !gpiod_cansleep(bdata->gpiod);
                }
 
                if (button->irq) {
@@ -561,6 +594,10 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 
                INIT_DELAYED_WORK(&bdata->work, gpio_keys_gpio_work_func);
 
+               hrtimer_init(&bdata->debounce_timer,
+                            CLOCK_REALTIME, HRTIMER_MODE_REL);
+               bdata->debounce_timer.function = gpio_keys_debounce_timer;
+
                isr = gpio_keys_gpio_isr;
                irqflags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING;
 
@@ -595,7 +632,9 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
                }
 
                bdata->release_delay = button->debounce_interval;
-               timer_setup(&bdata->release_timer, gpio_keys_irq_timer, 0);
+               hrtimer_init(&bdata->release_timer,
+                            CLOCK_REALTIME, HRTIMER_MODE_REL_HARD);
+               bdata->release_timer.function = gpio_keys_irq_timer;
 
                isr = gpio_keys_irq_isr;
                irqflags = 0;
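
The comment above motivates the gpio-keys conversion: when the GPIO can be read without sleeping, an hrtimer gives lower debounce latency than a workqueue. A minimal self-contained sketch of the idiom (names are illustrative, not from the driver; the sketch uses CLOCK_MONOTONIC, which is typical for relative timeouts, while the hunks above use CLOCK_REALTIME):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

struct demo_button {
	struct hrtimer debounce_timer;
	unsigned int debounce_ms;
};

/* Fires in timer context once the debounce interval elapses. */
static enum hrtimer_restart demo_debounce_fn(struct hrtimer *t)
{
	struct demo_button *b = container_of(t, struct demo_button,
					     debounce_timer);
	/* read the (non-sleeping) GPIO and report the event here */
	return HRTIMER_NORESTART;
}

static void demo_button_init(struct demo_button *b)
{
	hrtimer_init(&b->debounce_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	b->debounce_timer.function = demo_debounce_fn;
}

/* Called from the button ISR: (re)arm the debounce interval,
 * the hrtimer equivalent of mod_delayed_work() above.
 */
static void demo_button_isr_kick(struct demo_button *b)
{
	hrtimer_start(&b->debounce_timer, ms_to_ktime(b->debounce_ms),
		      HRTIMER_MODE_REL);
}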
index 1f5c9ea..ae93038 100644 (file)
@@ -408,27 +408,18 @@ open_err:
        return -EIO;
 }
 
-#ifdef CONFIG_OF
 static const struct of_device_id imx_keypad_of_match[] = {
        { .compatible = "fsl,imx21-kpp", },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, imx_keypad_of_match);
-#endif
 
 static int imx_keypad_probe(struct platform_device *pdev)
 {
-       const struct matrix_keymap_data *keymap_data =
-                       dev_get_platdata(&pdev->dev);
        struct imx_keypad *keypad;
        struct input_dev *input_dev;
        int irq, error, i, row, col;
 
-       if (!keymap_data && !pdev->dev.of_node) {
-               dev_err(&pdev->dev, "no keymap defined\n");
-               return -EINVAL;
-       }
-
        irq = platform_get_irq(pdev, 0);
        if (irq < 0)
                return irq;
@@ -469,7 +460,7 @@ static int imx_keypad_probe(struct platform_device *pdev)
        input_dev->open = imx_keypad_open;
        input_dev->close = imx_keypad_close;
 
-       error = matrix_keypad_build_keymap(keymap_data, NULL,
+       error = matrix_keypad_build_keymap(NULL, NULL,
                                           MAX_MATRIX_KEY_ROWS,
                                           MAX_MATRIX_KEY_COLS,
                                           keypad->keycodes, input_dev);
@@ -582,7 +573,7 @@ static struct platform_driver imx_keypad_driver = {
        .driver         = {
                .name   = "imx-keypad",
                .pm     = &imx_kbd_pm_ops,
-               .of_match_table = of_match_ptr(imx_keypad_of_match),
+               .of_match_table = imx_keypad_of_match,
        },
        .probe          = imx_keypad_probe,
 };
index 9b0f966..2a97559 100644 (file)
@@ -274,7 +274,7 @@ static int tca6416_keypad_probe(struct i2c_client *client,
                error = request_threaded_irq(chip->irqnum, NULL,
                                             tca6416_keys_isr,
                                             IRQF_TRIGGER_FALLING |
-                                               IRQF_ONESHOT,
+                                            IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                             "tca6416-keypad", chip);
                if (error) {
                        dev_dbg(&client->dev,
@@ -282,7 +282,6 @@ static int tca6416_keypad_probe(struct i2c_client *client,
                                chip->irqnum, error);
                        goto fail1;
                }
-               disable_irq(chip->irqnum);
        }
 
        error = input_register_device(input);
index 9671842..570fe18 100644 (file)
@@ -694,14 +694,13 @@ static int tegra_kbc_probe(struct platform_device *pdev)
        input_set_drvdata(kbc->idev, kbc);
 
        err = devm_request_irq(&pdev->dev, kbc->irq, tegra_kbc_isr,
-                              IRQF_TRIGGER_HIGH, pdev->name, kbc);
+                              IRQF_TRIGGER_HIGH | IRQF_NO_AUTOEN,
+                              pdev->name, kbc);
        if (err) {
                dev_err(&pdev->dev, "failed to request keyboard IRQ\n");
                return err;
        }
 
-       disable_irq(kbc->irq);
-
        err = input_register_device(kbc->idev);
        if (err) {
                dev_err(&pdev->dev, "failed to register input device\n");
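
The tca6416-keypad and tegra-kbc hunks above make the same conversion: instead of requesting the IRQ and then calling disable_irq(), which leaves a window in which the handler can run, they pass IRQF_NO_AUTOEN so the line stays disabled from the start. A hedged sketch of the two variants (demo_isr, irq and chip are illustrative names):

static irqreturn_t demo_isr(int irq, void *dev_id);

static int demo_request_irq(unsigned int irq, void *chip)
{
	/*
	 * Old, racy pattern:
	 *
	 *	err = request_threaded_irq(irq, NULL, demo_isr,
	 *				   IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
	 *				   "demo", chip);
	 *	if (!err)
	 *		disable_irq(irq);
	 */

	/* New pattern: IRQF_NO_AUTOEN keeps the line disabled until an
	 * explicit enable_irq(), typically from the input open() path.
	 */
	return request_threaded_irq(irq, NULL, demo_isr,
				    IRQF_TRIGGER_FALLING | IRQF_ONESHOT |
				    IRQF_NO_AUTOEN,
				    "demo", chip);
}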
index 7237dc4..498cde3 100644 (file)
@@ -763,6 +763,17 @@ config INPUT_IQS269A
          To compile this driver as a module, choose M here: the
          module will be called iqs269a.
 
+config INPUT_IQS626A
+       tristate "Azoteq IQS626A capacitive touch controller"
+       depends on I2C
+       select REGMAP_I2C
+       help
+         Say Y to enable support for the Azoteq IQS626A capacitive
+         touch controller.
+
+         To compile this driver as a module, choose M here: the
+         module will be called iqs626a.
+
 config INPUT_CMA3000
        tristate "VTI CMA3000 Tri-axis accelerometer"
        help
index 46db664..f593bee 100644 (file)
@@ -43,6 +43,7 @@ obj-$(CONFIG_INPUT_HISI_POWERKEY)     += hisi_powerkey.o
 obj-$(CONFIG_HP_SDC_RTC)               += hp_sdc_rtc.o
 obj-$(CONFIG_INPUT_IMS_PCU)            += ims-pcu.o
 obj-$(CONFIG_INPUT_IQS269A)            += iqs269a.o
+obj-$(CONFIG_INPUT_IQS626A)            += iqs626a.o
 obj-$(CONFIG_INPUT_IXP4XX_BEEPER)      += ixp4xx-beeper.o
 obj-$(CONFIG_INPUT_KEYSPAN_REMOTE)     += keyspan_remote.o
 obj-$(CONFIG_INPUT_KXTJ9)              += kxtj9.o
index 08b9b5c..81de8c4 100644 (file)
@@ -2018,7 +2018,6 @@ static int ims_pcu_probe(struct usb_interface *intf,
        }
 
        usb_set_intfdata(pcu->ctrl_intf, pcu);
-       usb_set_intfdata(pcu->data_intf, pcu);
 
        error = ims_pcu_buffers_alloc(pcu);
        if (error)
diff --git a/drivers/input/misc/iqs626a.c b/drivers/input/misc/iqs626a.c
new file mode 100644 (file)
index 0000000..d57e996
--- /dev/null
@@ -0,0 +1,1838 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS626A Capacitive Touch Controller
+ *
+ * Copyright (C) 2020 Jeff LaBundy <jeff@labundy.com>
+ *
+ * This driver registers up to 2 input devices: one representing capacitive or
+ * inductive keys as well as Hall-effect switches, and one for a trackpad that
+ * can express various gestures.
+ */
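
As a hedged sketch of the two-input-device shape this header comment describes, using the standard devm input API (all names here are illustrative, not taken from the driver below):

/* Sketch only: allocate and register a keypad and a trackpad. */
static int demo_register_inputs(struct device *dev)
{
	struct input_dev *keypad, *trackpad;
	int error;

	keypad = devm_input_allocate_device(dev);
	trackpad = devm_input_allocate_device(dev);
	if (!keypad || !trackpad)
		return -ENOMEM;

	keypad->name = "demo_keypad";
	trackpad->name = "demo_trackpad";

	error = input_register_device(keypad);
	if (error)
		return error;

	return input_register_device(trackpad);
}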
+
+#include <linux/bits.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/input.h>
+#include <linux/input/touchscreen.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+#define IQS626_VER_INFO                                0x00
+#define IQS626_VER_INFO_PROD_NUM               0x51
+
+#define IQS626_SYS_FLAGS                       0x02
+#define IQS626_SYS_FLAGS_SHOW_RESET            BIT(15)
+#define IQS626_SYS_FLAGS_IN_ATI                        BIT(12)
+#define IQS626_SYS_FLAGS_PWR_MODE_MASK         GENMASK(9, 8)
+#define IQS626_SYS_FLAGS_PWR_MODE_SHIFT                8
+
+#define IQS626_HALL_OUTPUT                     0x23
+
+#define IQS626_SYS_SETTINGS                    0x80
+#define IQS626_SYS_SETTINGS_CLK_DIV            BIT(15)
+#define IQS626_SYS_SETTINGS_ULP_AUTO           BIT(14)
+#define IQS626_SYS_SETTINGS_DIS_AUTO           BIT(13)
+#define IQS626_SYS_SETTINGS_PWR_MODE_MASK      GENMASK(12, 11)
+#define IQS626_SYS_SETTINGS_PWR_MODE_SHIFT     11
+#define IQS626_SYS_SETTINGS_PWR_MODE_MAX       3
+#define IQS626_SYS_SETTINGS_ULP_UPDATE_MASK    GENMASK(10, 8)
+#define IQS626_SYS_SETTINGS_ULP_UPDATE_SHIFT   8
+#define IQS626_SYS_SETTINGS_ULP_UPDATE_MAX     7
+#define IQS626_SYS_SETTINGS_EVENT_MODE         BIT(5)
+#define IQS626_SYS_SETTINGS_EVENT_MODE_LP      BIT(4)
+#define IQS626_SYS_SETTINGS_REDO_ATI           BIT(2)
+#define IQS626_SYS_SETTINGS_ACK_RESET          BIT(0)
+
+#define IQS626_MISC_A_ATI_BAND_DISABLE         BIT(7)
+#define IQS626_MISC_A_TPx_LTA_UPDATE_MASK      GENMASK(6, 4)
+#define IQS626_MISC_A_TPx_LTA_UPDATE_SHIFT     4
+#define IQS626_MISC_A_TPx_LTA_UPDATE_MAX       7
+#define IQS626_MISC_A_ATI_LP_ONLY              BIT(3)
+#define IQS626_MISC_A_GPIO3_SELECT_MASK                GENMASK(2, 0)
+#define IQS626_MISC_A_GPIO3_SELECT_MAX         7
+
+#define IQS626_EVENT_MASK_SYS                  BIT(6)
+#define IQS626_EVENT_MASK_GESTURE              BIT(3)
+#define IQS626_EVENT_MASK_DEEP                 BIT(2)
+#define IQS626_EVENT_MASK_TOUCH                        BIT(1)
+#define IQS626_EVENT_MASK_PROX                 BIT(0)
+
+#define IQS626_RATE_NP_MS_MAX                  255
+#define IQS626_RATE_LP_MS_MAX                  255
+#define IQS626_RATE_ULP_MS_MAX                 4080
+#define IQS626_TIMEOUT_PWR_MS_MAX              130560
+#define IQS626_TIMEOUT_LTA_MS_MAX              130560
+
+#define IQS626_MISC_B_RESEED_UI_SEL_MASK       GENMASK(7, 6)
+#define IQS626_MISC_B_RESEED_UI_SEL_SHIFT      6
+#define IQS626_MISC_B_RESEED_UI_SEL_MAX                3
+#define IQS626_MISC_B_THRESH_EXTEND            BIT(5)
+#define IQS626_MISC_B_TRACKING_UI_ENABLE       BIT(4)
+#define IQS626_MISC_B_TPx_SWIPE                        BIT(3)
+#define IQS626_MISC_B_RESEED_OFFSET            BIT(2)
+#define IQS626_MISC_B_FILT_STR_TPx             GENMASK(1, 0)
+
+#define IQS626_THRESH_SWIPE_MAX                        255
+#define IQS626_TIMEOUT_TAP_MS_MAX              4080
+#define IQS626_TIMEOUT_SWIPE_MS_MAX            4080
+
+#define IQS626_CHx_ENG_0_MEAS_CAP_SIZE         BIT(7)
+#define IQS626_CHx_ENG_0_RX_TERM_VSS           BIT(5)
+#define IQS626_CHx_ENG_0_LINEARIZE             BIT(4)
+#define IQS626_CHx_ENG_0_DUAL_DIR              BIT(3)
+#define IQS626_CHx_ENG_0_FILT_DISABLE          BIT(2)
+#define IQS626_CHx_ENG_0_ATI_MODE_MASK         GENMASK(1, 0)
+#define IQS626_CHx_ENG_0_ATI_MODE_MAX          3
+
+#define IQS626_CHx_ENG_1_CCT_HIGH_1            BIT(7)
+#define IQS626_CHx_ENG_1_CCT_HIGH_0            BIT(6)
+#define IQS626_CHx_ENG_1_PROJ_BIAS_MASK                GENMASK(5, 4)
+#define IQS626_CHx_ENG_1_PROJ_BIAS_SHIFT       4
+#define IQS626_CHx_ENG_1_PROJ_BIAS_MAX         3
+#define IQS626_CHx_ENG_1_CCT_ENABLE            BIT(3)
+#define IQS626_CHx_ENG_1_SENSE_FREQ_MASK       GENMASK(2, 1)
+#define IQS626_CHx_ENG_1_SENSE_FREQ_SHIFT      1
+#define IQS626_CHx_ENG_1_SENSE_FREQ_MAX                3
+#define IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN      BIT(0)
+
+#define IQS626_CHx_ENG_2_LOCAL_CAP_MASK                GENMASK(7, 6)
+#define IQS626_CHx_ENG_2_LOCAL_CAP_SHIFT       6
+#define IQS626_CHx_ENG_2_LOCAL_CAP_MAX         3
+#define IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE      BIT(5)
+#define IQS626_CHx_ENG_2_SENSE_MODE_MASK       GENMASK(3, 0)
+#define IQS626_CHx_ENG_2_SENSE_MODE_MAX                15
+
+#define IQS626_CHx_ENG_3_TX_FREQ_MASK          GENMASK(5, 4)
+#define IQS626_CHx_ENG_3_TX_FREQ_SHIFT         4
+#define IQS626_CHx_ENG_3_TX_FREQ_MAX           3
+#define IQS626_CHx_ENG_3_INV_LOGIC             BIT(0)
+
+#define IQS626_CHx_ENG_4_RX_TERM_VREG          BIT(6)
+#define IQS626_CHx_ENG_4_CCT_LOW_1             BIT(5)
+#define IQS626_CHx_ENG_4_CCT_LOW_0             BIT(4)
+#define IQS626_CHx_ENG_4_COMP_DISABLE          BIT(1)
+#define IQS626_CHx_ENG_4_STATIC_ENABLE         BIT(0)
+
+#define IQS626_TPx_ATI_BASE_MIN                        45
+#define IQS626_TPx_ATI_BASE_MAX                        300
+#define IQS626_CHx_ATI_BASE_MASK               GENMASK(7, 6)
+#define IQS626_CHx_ATI_BASE_75                 0x00
+#define IQS626_CHx_ATI_BASE_100                        0x40
+#define IQS626_CHx_ATI_BASE_150                        0x80
+#define IQS626_CHx_ATI_BASE_200                        0xC0
+#define IQS626_CHx_ATI_TARGET_MASK             GENMASK(5, 0)
+#define IQS626_CHx_ATI_TARGET_MAX              2016
+
+#define IQS626_CHx_THRESH_MAX                  255
+#define IQS626_CHx_HYST_DEEP_MASK              GENMASK(7, 4)
+#define IQS626_CHx_HYST_DEEP_SHIFT             4
+#define IQS626_CHx_HYST_TOUCH_MASK             GENMASK(3, 0)
+#define IQS626_CHx_HYST_MAX                    15
+
+#define IQS626_FILT_STR_NP_TPx_MASK            GENMASK(7, 6)
+#define IQS626_FILT_STR_NP_TPx_SHIFT           6
+#define IQS626_FILT_STR_LP_TPx_MASK            GENMASK(5, 4)
+#define IQS626_FILT_STR_LP_TPx_SHIFT           4
+
+#define IQS626_FILT_STR_NP_CNT_MASK            GENMASK(7, 6)
+#define IQS626_FILT_STR_NP_CNT_SHIFT           6
+#define IQS626_FILT_STR_LP_CNT_MASK            GENMASK(5, 4)
+#define IQS626_FILT_STR_LP_CNT_SHIFT           4
+#define IQS626_FILT_STR_NP_LTA_MASK            GENMASK(3, 2)
+#define IQS626_FILT_STR_NP_LTA_SHIFT           2
+#define IQS626_FILT_STR_LP_LTA_MASK            GENMASK(1, 0)
+#define IQS626_FILT_STR_MAX                    3
+
+#define IQS626_ULP_PROJ_ENABLE                 BIT(4)
+#define IQS626_GEN_WEIGHT_MAX                  255
+
+#define IQS626_MAX_REG                         0xFF
+
+#define IQS626_NUM_CH_TP_3                     9
+#define IQS626_NUM_CH_TP_2                     6
+#define IQS626_NUM_CH_GEN                      3
+#define IQS626_NUM_CRx_TX                      8
+
+#define IQS626_PWR_MODE_POLL_SLEEP_US          50000
+#define IQS626_PWR_MODE_POLL_TIMEOUT_US                500000
+
+#define iqs626_irq_wait()                      usleep_range(350, 400)
+
+enum iqs626_ch_id {
+       IQS626_CH_ULP_0,
+       IQS626_CH_TP_2,
+       IQS626_CH_TP_3,
+       IQS626_CH_GEN_0,
+       IQS626_CH_GEN_1,
+       IQS626_CH_GEN_2,
+       IQS626_CH_HALL,
+};
+
+enum iqs626_rx_inactive {
+       IQS626_RX_INACTIVE_VSS,
+       IQS626_RX_INACTIVE_FLOAT,
+       IQS626_RX_INACTIVE_VREG,
+};
+
+enum iqs626_st_offs {
+       IQS626_ST_OFFS_PROX,
+       IQS626_ST_OFFS_DIR,
+       IQS626_ST_OFFS_TOUCH,
+       IQS626_ST_OFFS_DEEP,
+};
+
+enum iqs626_th_offs {
+       IQS626_TH_OFFS_PROX,
+       IQS626_TH_OFFS_TOUCH,
+       IQS626_TH_OFFS_DEEP,
+};
+
+enum iqs626_event_id {
+       IQS626_EVENT_PROX_DN,
+       IQS626_EVENT_PROX_UP,
+       IQS626_EVENT_TOUCH_DN,
+       IQS626_EVENT_TOUCH_UP,
+       IQS626_EVENT_DEEP_DN,
+       IQS626_EVENT_DEEP_UP,
+};
+
+enum iqs626_gesture_id {
+       IQS626_GESTURE_FLICK_X_POS,
+       IQS626_GESTURE_FLICK_X_NEG,
+       IQS626_GESTURE_FLICK_Y_POS,
+       IQS626_GESTURE_FLICK_Y_NEG,
+       IQS626_GESTURE_TAP,
+       IQS626_GESTURE_HOLD,
+       IQS626_NUM_GESTURES,
+};
+
+struct iqs626_event_desc {
+       const char *name;
+       enum iqs626_st_offs st_offs;
+       enum iqs626_th_offs th_offs;
+       bool dir_up;
+       u8 mask;
+};
+
+static const struct iqs626_event_desc iqs626_events[] = {
+       [IQS626_EVENT_PROX_DN] = {
+               .name = "event-prox",
+               .st_offs = IQS626_ST_OFFS_PROX,
+               .th_offs = IQS626_TH_OFFS_PROX,
+               .mask = IQS626_EVENT_MASK_PROX,
+       },
+       [IQS626_EVENT_PROX_UP] = {
+               .name = "event-prox-alt",
+               .st_offs = IQS626_ST_OFFS_PROX,
+               .th_offs = IQS626_TH_OFFS_PROX,
+               .dir_up = true,
+               .mask = IQS626_EVENT_MASK_PROX,
+       },
+       [IQS626_EVENT_TOUCH_DN] = {
+               .name = "event-touch",
+               .st_offs = IQS626_ST_OFFS_TOUCH,
+               .th_offs = IQS626_TH_OFFS_TOUCH,
+               .mask = IQS626_EVENT_MASK_TOUCH,
+       },
+       [IQS626_EVENT_TOUCH_UP] = {
+               .name = "event-touch-alt",
+               .st_offs = IQS626_ST_OFFS_TOUCH,
+               .th_offs = IQS626_TH_OFFS_TOUCH,
+               .dir_up = true,
+               .mask = IQS626_EVENT_MASK_TOUCH,
+       },
+       [IQS626_EVENT_DEEP_DN] = {
+               .name = "event-deep",
+               .st_offs = IQS626_ST_OFFS_DEEP,
+               .th_offs = IQS626_TH_OFFS_DEEP,
+               .mask = IQS626_EVENT_MASK_DEEP,
+       },
+       [IQS626_EVENT_DEEP_UP] = {
+               .name = "event-deep-alt",
+               .st_offs = IQS626_ST_OFFS_DEEP,
+               .th_offs = IQS626_TH_OFFS_DEEP,
+               .dir_up = true,
+               .mask = IQS626_EVENT_MASK_DEEP,
+       },
+};
+
+struct iqs626_ver_info {
+       u8 prod_num;
+       u8 sw_num;
+       u8 hw_num;
+       u8 padding;
+} __packed;
+
+struct iqs626_flags {
+       __be16 system;
+       u8 gesture;
+       u8 padding_a;
+       u8 states[4];
+       u8 ref_active;
+       u8 padding_b;
+       u8 comp_min;
+       u8 comp_max;
+       u8 trackpad_x;
+       u8 trackpad_y;
+} __packed;
+
+struct iqs626_ch_reg_ulp {
+       u8 thresh[2];
+       u8 hyst;
+       u8 filter;
+       u8 engine[2];
+       u8 ati_target;
+       u8 padding;
+       __be16 ati_comp;
+       u8 rx_enable;
+       u8 tx_enable;
+} __packed;
+
+struct iqs626_ch_reg_tp {
+       u8 thresh;
+       u8 ati_base;
+       __be16 ati_comp;
+} __packed;
+
+struct iqs626_tp_grp_reg {
+       u8 hyst;
+       u8 ati_target;
+       u8 engine[2];
+       struct iqs626_ch_reg_tp ch_reg_tp[IQS626_NUM_CH_TP_3];
+} __packed;
+
+struct iqs626_ch_reg_gen {
+       u8 thresh[3];
+       u8 padding;
+       u8 hyst;
+       u8 ati_target;
+       __be16 ati_comp;
+       u8 engine[5];
+       u8 filter;
+       u8 rx_enable;
+       u8 tx_enable;
+       u8 assoc_select;
+       u8 assoc_weight;
+} __packed;
+
+struct iqs626_ch_reg_hall {
+       u8 engine;
+       u8 thresh;
+       u8 hyst;
+       u8 ati_target;
+       __be16 ati_comp;
+} __packed;
+
+struct iqs626_sys_reg {
+       __be16 general;
+       u8 misc_a;
+       u8 event_mask;
+       u8 active;
+       u8 reseed;
+       u8 rate_np;
+       u8 rate_lp;
+       u8 rate_ulp;
+       u8 timeout_pwr;
+       u8 timeout_rdy;
+       u8 timeout_lta;
+       u8 misc_b;
+       u8 thresh_swipe;
+       u8 timeout_tap;
+       u8 timeout_swipe;
+       u8 redo_ati;
+       u8 padding;
+       struct iqs626_ch_reg_ulp ch_reg_ulp;
+       struct iqs626_tp_grp_reg tp_grp_reg;
+       struct iqs626_ch_reg_gen ch_reg_gen[IQS626_NUM_CH_GEN];
+       struct iqs626_ch_reg_hall ch_reg_hall;
+} __packed;
+
+struct iqs626_channel_desc {
+       const char *name;
+       int num_ch;
+       u8 active;
+       bool events[ARRAY_SIZE(iqs626_events)];
+};
+
+static const struct iqs626_channel_desc iqs626_channels[] = {
+       [IQS626_CH_ULP_0] = {
+               .name = "ulp-0",
+               .num_ch = 1,
+               .active = BIT(0),
+               .events = {
+                       [IQS626_EVENT_PROX_DN] = true,
+                       [IQS626_EVENT_PROX_UP] = true,
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+               },
+       },
+       [IQS626_CH_TP_2] = {
+               .name = "trackpad-3x2",
+               .num_ch = IQS626_NUM_CH_TP_2,
+               .active = BIT(1),
+               .events = {
+                       [IQS626_EVENT_TOUCH_DN] = true,
+               },
+       },
+       [IQS626_CH_TP_3] = {
+               .name = "trackpad-3x3",
+               .num_ch = IQS626_NUM_CH_TP_3,
+               .active = BIT(2) | BIT(1),
+               .events = {
+                       [IQS626_EVENT_TOUCH_DN] = true,
+               },
+       },
+       [IQS626_CH_GEN_0] = {
+               .name = "generic-0",
+               .num_ch = 1,
+               .active = BIT(4),
+               .events = {
+                       [IQS626_EVENT_PROX_DN] = true,
+                       [IQS626_EVENT_PROX_UP] = true,
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+                       [IQS626_EVENT_DEEP_DN] = true,
+                       [IQS626_EVENT_DEEP_UP] = true,
+               },
+       },
+       [IQS626_CH_GEN_1] = {
+               .name = "generic-1",
+               .num_ch = 1,
+               .active = BIT(5),
+               .events = {
+                       [IQS626_EVENT_PROX_DN] = true,
+                       [IQS626_EVENT_PROX_UP] = true,
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+                       [IQS626_EVENT_DEEP_DN] = true,
+                       [IQS626_EVENT_DEEP_UP] = true,
+               },
+       },
+       [IQS626_CH_GEN_2] = {
+               .name = "generic-2",
+               .num_ch = 1,
+               .active = BIT(6),
+               .events = {
+                       [IQS626_EVENT_PROX_DN] = true,
+                       [IQS626_EVENT_PROX_UP] = true,
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+                       [IQS626_EVENT_DEEP_DN] = true,
+                       [IQS626_EVENT_DEEP_UP] = true,
+               },
+       },
+       [IQS626_CH_HALL] = {
+               .name = "hall",
+               .num_ch = 1,
+               .active = BIT(7),
+               .events = {
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+               },
+       },
+};
+
+struct iqs626_private {
+       struct i2c_client *client;
+       struct regmap *regmap;
+       struct iqs626_sys_reg sys_reg;
+       struct completion ati_done;
+       struct input_dev *keypad;
+       struct input_dev *trackpad;
+       struct touchscreen_properties prop;
+       unsigned int kp_type[ARRAY_SIZE(iqs626_channels)]
+                           [ARRAY_SIZE(iqs626_events)];
+       unsigned int kp_code[ARRAY_SIZE(iqs626_channels)]
+                           [ARRAY_SIZE(iqs626_events)];
+       unsigned int tp_code[IQS626_NUM_GESTURES];
+       unsigned int suspend_mode;
+};
+
+static int iqs626_parse_events(struct iqs626_private *iqs626,
+                              const struct fwnode_handle *ch_node,
+                              enum iqs626_ch_id ch_id)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       const struct fwnode_handle *ev_node;
+       const char *ev_name;
+       u8 *thresh, *hyst;
+       unsigned int thresh_tp[IQS626_NUM_CH_TP_3];
+       unsigned int val;
+       int num_ch = iqs626_channels[ch_id].num_ch;
+       int error, i, j;
+
+       switch (ch_id) {
+       case IQS626_CH_ULP_0:
+               thresh = sys_reg->ch_reg_ulp.thresh;
+               hyst = &sys_reg->ch_reg_ulp.hyst;
+               break;
+
+       case IQS626_CH_TP_2:
+       case IQS626_CH_TP_3:
+               thresh = &sys_reg->tp_grp_reg.ch_reg_tp[0].thresh;
+               hyst = &sys_reg->tp_grp_reg.hyst;
+               break;
+
+       case IQS626_CH_GEN_0:
+       case IQS626_CH_GEN_1:
+       case IQS626_CH_GEN_2:
+               i = ch_id - IQS626_CH_GEN_0;
+               thresh = sys_reg->ch_reg_gen[i].thresh;
+               hyst = &sys_reg->ch_reg_gen[i].hyst;
+               break;
+
+       case IQS626_CH_HALL:
+               thresh = &sys_reg->ch_reg_hall.thresh;
+               hyst = &sys_reg->ch_reg_hall.hyst;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(iqs626_events); i++) {
+               if (!iqs626_channels[ch_id].events[i])
+                       continue;
+
+               if (ch_id == IQS626_CH_TP_2 || ch_id == IQS626_CH_TP_3) {
+                       /*
+                        * Trackpad touch events are simply described under the
+                        * trackpad child node.
+                        */
+                       ev_node = ch_node;
+               } else {
+                       ev_name = iqs626_events[i].name;
+                       ev_node = fwnode_get_named_child_node(ch_node, ev_name);
+                       if (!ev_node)
+                               continue;
+
+                       if (!fwnode_property_read_u32(ev_node, "linux,code",
+                                                     &val)) {
+                               iqs626->kp_code[ch_id][i] = val;
+
+                               if (fwnode_property_read_u32(ev_node,
+                                                            "linux,input-type",
+                                                            &val)) {
+                                       if (ch_id == IQS626_CH_HALL)
+                                               val = EV_SW;
+                                       else
+                                               val = EV_KEY;
+                               }
+
+                               if (val != EV_KEY && val != EV_SW) {
+                                       dev_err(&client->dev,
+                                               "Invalid input type: %u\n",
+                                               val);
+                                       return -EINVAL;
+                               }
+
+                               iqs626->kp_type[ch_id][i] = val;
+
+                               sys_reg->event_mask &= ~iqs626_events[i].mask;
+                       }
+               }
+
+               if (!fwnode_property_read_u32(ev_node, "azoteq,hyst", &val)) {
+                       if (val > IQS626_CHx_HYST_MAX) {
+                               dev_err(&client->dev,
+                                       "Invalid %s channel hysteresis: %u\n",
+                                       fwnode_get_name(ch_node), val);
+                               return -EINVAL;
+                       }
+
+                       if (i == IQS626_EVENT_DEEP_DN ||
+                           i == IQS626_EVENT_DEEP_UP) {
+                               *hyst &= ~IQS626_CHx_HYST_DEEP_MASK;
+                               *hyst |= (val << IQS626_CHx_HYST_DEEP_SHIFT);
+                       } else if (i == IQS626_EVENT_TOUCH_DN ||
+                                  i == IQS626_EVENT_TOUCH_UP) {
+                               *hyst &= ~IQS626_CHx_HYST_TOUCH_MASK;
+                               *hyst |= val;
+                       }
+               }
+
+               if (ch_id != IQS626_CH_TP_2 && ch_id != IQS626_CH_TP_3 &&
+                   !fwnode_property_read_u32(ev_node, "azoteq,thresh", &val)) {
+                       if (val > IQS626_CHx_THRESH_MAX) {
+                               dev_err(&client->dev,
+                                       "Invalid %s channel threshold: %u\n",
+                                       fwnode_get_name(ch_node), val);
+                               return -EINVAL;
+                       }
+
+                       if (ch_id == IQS626_CH_HALL)
+                               *thresh = val;
+                       else
+                               *(thresh + iqs626_events[i].th_offs) = val;
+
+                       continue;
+               }
+
+               if (!fwnode_property_present(ev_node, "azoteq,thresh"))
+                       continue;
+
+               error = fwnode_property_read_u32_array(ev_node, "azoteq,thresh",
+                                                      thresh_tp, num_ch);
+               if (error) {
+                       dev_err(&client->dev,
+                               "Failed to read %s channel thresholds: %d\n",
+                               fwnode_get_name(ch_node), error);
+                       return error;
+               }
+
+               for (j = 0; j < num_ch; j++) {
+                       if (thresh_tp[j] > IQS626_CHx_THRESH_MAX) {
+                               dev_err(&client->dev,
+                                       "Invalid %s channel threshold: %u\n",
+                                       fwnode_get_name(ch_node), thresh_tp[j]);
+                               return -EINVAL;
+                       }
+
+                       sys_reg->tp_grp_reg.ch_reg_tp[j].thresh = thresh_tp[j];
+               }
+       }
+
+       return 0;
+}
+
+static int iqs626_parse_ati_target(struct iqs626_private *iqs626,
+                                  const struct fwnode_handle *ch_node,
+                                  enum iqs626_ch_id ch_id)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       unsigned int ati_base[IQS626_NUM_CH_TP_3];
+       unsigned int val;
+       u8 *ati_target;
+       int num_ch = iqs626_channels[ch_id].num_ch;
+       int error, i;
+
+       switch (ch_id) {
+       case IQS626_CH_ULP_0:
+               ati_target = &sys_reg->ch_reg_ulp.ati_target;
+               break;
+
+       case IQS626_CH_TP_2:
+       case IQS626_CH_TP_3:
+               ati_target = &sys_reg->tp_grp_reg.ati_target;
+               break;
+
+       case IQS626_CH_GEN_0:
+       case IQS626_CH_GEN_1:
+       case IQS626_CH_GEN_2:
+               i = ch_id - IQS626_CH_GEN_0;
+               ati_target = &sys_reg->ch_reg_gen[i].ati_target;
+               break;
+
+       case IQS626_CH_HALL:
+               ati_target = &sys_reg->ch_reg_hall.ati_target;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,ati-target", &val)) {
+               if (val > IQS626_CHx_ATI_TARGET_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel ATI target: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *ati_target &= ~IQS626_CHx_ATI_TARGET_MASK;
+               *ati_target |= (val / 32);
+       }
+
+       if (ch_id != IQS626_CH_TP_2 && ch_id != IQS626_CH_TP_3 &&
+           !fwnode_property_read_u32(ch_node, "azoteq,ati-base", &val)) {
+               switch (val) {
+               case 75:
+                       val = IQS626_CHx_ATI_BASE_75;
+                       break;
+
+               case 100:
+                       val = IQS626_CHx_ATI_BASE_100;
+                       break;
+
+               case 150:
+                       val = IQS626_CHx_ATI_BASE_150;
+                       break;
+
+               case 200:
+                       val = IQS626_CHx_ATI_BASE_200;
+                       break;
+
+               default:
+                       dev_err(&client->dev,
+                               "Invalid %s channel ATI base: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *ati_target &= ~IQS626_CHx_ATI_BASE_MASK;
+               *ati_target |= val;
+
+               return 0;
+       }
+
+       if (!fwnode_property_present(ch_node, "azoteq,ati-base"))
+               return 0;
+
+       error = fwnode_property_read_u32_array(ch_node, "azoteq,ati-base",
+                                              ati_base, num_ch);
+       if (error) {
+               dev_err(&client->dev,
+                       "Failed to read %s channel ATI bases: %d\n",
+                       fwnode_get_name(ch_node), error);
+               return error;
+       }
+
+       for (i = 0; i < num_ch; i++) {
+               if (ati_base[i] < IQS626_TPx_ATI_BASE_MIN ||
+                   ati_base[i] > IQS626_TPx_ATI_BASE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel ATI base: %u\n",
+                               fwnode_get_name(ch_node), ati_base[i]);
+                       return -EINVAL;
+               }
+
+               ati_base[i] -= IQS626_TPx_ATI_BASE_MIN;
+               sys_reg->tp_grp_reg.ch_reg_tp[i].ati_base = ati_base[i];
+       }
+
+       return 0;
+}
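
A quick arithmetic check on the encodings above: azoteq,ati-target is stored in units of 32 counts in the 6-bit IQS626_CHx_ATI_TARGET_MASK field, so the maximum of 2016 encodes to 2016 / 32 = 63 = GENMASK(5, 0), and trackpad ATI bases are stored as an offset from IQS626_TPx_ATI_BASE_MIN, so the 45..300 range maps onto 0..255 and fits a u8. A minimal sketch with worked examples:

/* Sketch: compile-time checks of the register encodings above. */
_Static_assert(2016 / 32 == 63, "ATI target must fit GENMASK(5, 0)");
_Static_assert(300 - 45 == 255, "TPx ATI base offset must fit a u8");
/* e.g. azoteq,ati-target = <576>  ->  576 / 32 = 18 (0x12)
 *      azoteq,ati-base   = <100>  ->  100 - 45 = 55 (0x37)
 */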
+
+static int iqs626_parse_pins(struct iqs626_private *iqs626,
+                            const struct fwnode_handle *ch_node,
+                            const char *propname, u8 *enable)
+{
+       struct i2c_client *client = iqs626->client;
+       unsigned int val[IQS626_NUM_CRx_TX];
+       int error, count, i;
+
+       if (!fwnode_property_present(ch_node, propname))
+               return 0;
+
+       count = fwnode_property_count_u32(ch_node, propname);
+       if (count > IQS626_NUM_CRx_TX) {
+               dev_err(&client->dev,
+                       "Too many %s channel CRX/TX pins present\n",
+                       fwnode_get_name(ch_node));
+               return -EINVAL;
+       } else if (count < 0) {
+               dev_err(&client->dev,
+                       "Failed to count %s channel CRX/TX pins: %d\n",
+                       fwnode_get_name(ch_node), count);
+               return count;
+       }
+
+       error = fwnode_property_read_u32_array(ch_node, propname, val, count);
+       if (error) {
+               dev_err(&client->dev,
+                       "Failed to read %s channel CRX/TX pins: %d\n",
+                       fwnode_get_name(ch_node), error);
+               return error;
+       }
+
+       *enable = 0;
+
+       for (i = 0; i < count; i++) {
+               if (val[i] >= IQS626_NUM_CRx_TX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel CRX/TX pin: %u\n",
+                               fwnode_get_name(ch_node), val[i]);
+                       return -EINVAL;
+               }
+
+               *enable |= BIT(val[i]);
+       }
+
+       return 0;
+}
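
iqs626_parse_pins() collapses a firmware-supplied list of CRX/TX pin indices
into a single enable bitmask. The same reduction as a standalone sketch, with
the pin count assumed and BIT() expanded by hand:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define NUM_CRX_TX 8    /* assumed pin count for this sketch */

    static int pins_to_mask(const unsigned int *val, size_t count,
                            uint8_t *enable)
    {
            size_t i;

            *enable = 0;
            for (i = 0; i < count; i++) {
                    if (val[i] >= NUM_CRX_TX)
                            return -1;              /* out-of-range index */
                    *enable |= 1u << val[i];        /* BIT(val[i]) */
            }
            return 0;
    }

    int main(void)
    {
            const unsigned int pins[] = { 0, 3, 7 };
            uint8_t enable;

            assert(pins_to_mask(pins, 3, &enable) == 0);
            assert(enable == 0x89);                 /* bits 0, 3 and 7 */
            return 0;
    }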
+
+static int iqs626_parse_trackpad(struct iqs626_private *iqs626,
+                                const struct fwnode_handle *ch_node)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       u8 *hyst = &sys_reg->tp_grp_reg.hyst;
+       unsigned int val;
+       int error, count;
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,lta-update", &val)) {
+               if (val > IQS626_MISC_A_TPx_LTA_UPDATE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel update rate: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               sys_reg->misc_a &= ~IQS626_MISC_A_TPx_LTA_UPDATE_MASK;
+               sys_reg->misc_a |= (val << IQS626_MISC_A_TPx_LTA_UPDATE_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-trackpad",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               sys_reg->misc_b &= ~IQS626_MISC_B_FILT_STR_TPx;
+               sys_reg->misc_b |= val;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-cnt",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *hyst &= ~IQS626_FILT_STR_NP_TPx_MASK;
+               *hyst |= (val << IQS626_FILT_STR_NP_TPx_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-cnt",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *hyst &= ~IQS626_FILT_STR_LP_TPx_MASK;
+               *hyst |= (val << IQS626_FILT_STR_LP_TPx_SHIFT);
+       }
+
+       if (!fwnode_property_present(ch_node, "linux,keycodes"))
+               return 0;
+
+       count = fwnode_property_count_u32(ch_node, "linux,keycodes");
+       if (count > IQS626_NUM_GESTURES) {
+               dev_err(&client->dev, "Too many keycodes present\n");
+               return -EINVAL;
+       } else if (count < 0) {
+               dev_err(&client->dev, "Failed to count keycodes: %d\n", count);
+               return count;
+       }
+
+       error = fwnode_property_read_u32_array(ch_node, "linux,keycodes",
+                                              iqs626->tp_code, count);
+       if (error) {
+               dev_err(&client->dev, "Failed to read keycodes: %d\n", error);
+               return error;
+       }
+
+       sys_reg->misc_b &= ~IQS626_MISC_B_TPx_SWIPE;
+       if (fwnode_property_present(ch_node, "azoteq,gesture-swipe"))
+               sys_reg->misc_b |= IQS626_MISC_B_TPx_SWIPE;
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,timeout-tap-ms",
+                                     &val)) {
+               if (val > IQS626_TIMEOUT_TAP_MS_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel timeout: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               sys_reg->timeout_tap = val / 16;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,timeout-swipe-ms",
+                                     &val)) {
+               if (val > IQS626_TIMEOUT_SWIPE_MS_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel timeout: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               sys_reg->timeout_swipe = val / 16;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,thresh-swipe",
+                                     &val)) {
+               if (val > IQS626_THRESH_SWIPE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel threshold: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               sys_reg->thresh_swipe = val;
+       }
+
+       sys_reg->event_mask &= ~IQS626_EVENT_MASK_GESTURE;
+
+       return 0;
+}
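
Several of the properties above are millisecond values stored in coarser
register units; the tap and swipe timeouts, for instance, are divided by 16
after the range check. A sketch of that check-then-scale conversion, with the
maximum assumed to be 255 register ticks:

    #include <assert.h>

    #define TIMEOUT_TAP_MS_MAX 4080     /* assumed: 255 * 16 ms */

    /* Returns the register value, or -1 if the property is out of range. */
    static int tap_timeout_to_reg(unsigned int ms)
    {
            if (ms > TIMEOUT_TAP_MS_MAX)
                    return -1;
            return ms / 16;             /* 16 ms ticks, truncating */
    }

    int main(void)
    {
            assert(tap_timeout_to_reg(400) == 25);
            assert(tap_timeout_to_reg(407) == 25); /* truncates, no rounding */
            assert(tap_timeout_to_reg(5000) == -1);
            return 0;
    }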
+
+static int iqs626_parse_channel(struct iqs626_private *iqs626,
+                               const struct fwnode_handle *ch_node,
+                               enum iqs626_ch_id ch_id)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       u8 *engine, *filter, *rx_enable, *tx_enable;
+       u8 *assoc_select, *assoc_weight;
+       unsigned int val;
+       int error, i;
+
+       switch (ch_id) {
+       case IQS626_CH_ULP_0:
+               engine = sys_reg->ch_reg_ulp.engine;
+               break;
+
+       case IQS626_CH_TP_2:
+       case IQS626_CH_TP_3:
+               engine = sys_reg->tp_grp_reg.engine;
+               break;
+
+       case IQS626_CH_GEN_0:
+       case IQS626_CH_GEN_1:
+       case IQS626_CH_GEN_2:
+               i = ch_id - IQS626_CH_GEN_0;
+               engine = sys_reg->ch_reg_gen[i].engine;
+               break;
+
+       case IQS626_CH_HALL:
+               engine = &sys_reg->ch_reg_hall.engine;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       *engine |= IQS626_CHx_ENG_0_MEAS_CAP_SIZE;
+       if (fwnode_property_present(ch_node, "azoteq,meas-cap-decrease"))
+               *engine &= ~IQS626_CHx_ENG_0_MEAS_CAP_SIZE;
+
+       *engine |= IQS626_CHx_ENG_0_RX_TERM_VSS;
+       if (!fwnode_property_read_u32(ch_node, "azoteq,rx-inactive", &val)) {
+               switch (val) {
+               case IQS626_RX_INACTIVE_VSS:
+                       break;
+
+               case IQS626_RX_INACTIVE_FLOAT:
+                       *engine &= ~IQS626_CHx_ENG_0_RX_TERM_VSS;
+                       if (ch_id == IQS626_CH_GEN_0 ||
+                           ch_id == IQS626_CH_GEN_1 ||
+                           ch_id == IQS626_CH_GEN_2)
+                               *(engine + 4) &= ~IQS626_CHx_ENG_4_RX_TERM_VREG;
+                       break;
+
+               case IQS626_RX_INACTIVE_VREG:
+                       if (ch_id == IQS626_CH_GEN_0 ||
+                           ch_id == IQS626_CH_GEN_1 ||
+                           ch_id == IQS626_CH_GEN_2) {
+                               *engine &= ~IQS626_CHx_ENG_0_RX_TERM_VSS;
+                               *(engine + 4) |= IQS626_CHx_ENG_4_RX_TERM_VREG;
+                               break;
+                       }
+                       fallthrough;
+
+               default:
+                       dev_err(&client->dev,
+                               "Invalid %s channel CRX pin termination: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+       }
+
+       *engine &= ~IQS626_CHx_ENG_0_LINEARIZE;
+       if (fwnode_property_present(ch_node, "azoteq,linearize"))
+               *engine |= IQS626_CHx_ENG_0_LINEARIZE;
+
+       *engine &= ~IQS626_CHx_ENG_0_DUAL_DIR;
+       if (fwnode_property_present(ch_node, "azoteq,dual-direction"))
+               *engine |= IQS626_CHx_ENG_0_DUAL_DIR;
+
+       *engine &= ~IQS626_CHx_ENG_0_FILT_DISABLE;
+       if (fwnode_property_present(ch_node, "azoteq,filt-disable"))
+               *engine |= IQS626_CHx_ENG_0_FILT_DISABLE;
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,ati-mode", &val)) {
+               if (val > IQS626_CHx_ENG_0_ATI_MODE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel ATI mode: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *engine &= ~IQS626_CHx_ENG_0_ATI_MODE_MASK;
+               *engine |= val;
+       }
+
+       if (ch_id == IQS626_CH_HALL)
+               return 0;
+
+       *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_ENABLE;
+       if (!fwnode_property_read_u32(ch_node, "azoteq,cct-increase",
+                                     &val) && val) {
+               unsigned int orig_val = val--;  /* keep original for error messages */
+
+               /*
+                * In the case of the generic channels, the charge cycle time
+                * field doubles in size and straddles two separate registers.
+                */
+               if (ch_id == IQS626_CH_GEN_0 ||
+                   ch_id == IQS626_CH_GEN_1 ||
+                   ch_id == IQS626_CH_GEN_2) {
+                       *(engine + 4) &= ~IQS626_CHx_ENG_4_CCT_LOW_1;
+                       if (val & BIT(1))
+                               *(engine + 4) |= IQS626_CHx_ENG_4_CCT_LOW_1;
+
+                       *(engine + 4) &= ~IQS626_CHx_ENG_4_CCT_LOW_0;
+                       if (val & BIT(0))
+                               *(engine + 4) |= IQS626_CHx_ENG_4_CCT_LOW_0;
+
+                       val >>= 2;
+               }
+
+               if (val & ~GENMASK(1, 0)) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel charge cycle time: %u\n",
+                               fwnode_get_name(ch_node), orig_val);
+                       return -EINVAL;
+               }
+
+               *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_HIGH_1;
+               if (val & BIT(1))
+                       *(engine + 1) |= IQS626_CHx_ENG_1_CCT_HIGH_1;
+
+               *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_HIGH_0;
+               if (val & BIT(0))
+                       *(engine + 1) |= IQS626_CHx_ENG_1_CCT_HIGH_0;
+
+               *(engine + 1) |= IQS626_CHx_ENG_1_CCT_ENABLE;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,proj-bias", &val)) {
+               if (val > IQS626_CHx_ENG_1_PROJ_BIAS_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel bias current: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *(engine + 1) &= ~IQS626_CHx_ENG_1_PROJ_BIAS_MASK;
+               *(engine + 1) |= (val << IQS626_CHx_ENG_1_PROJ_BIAS_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,sense-freq", &val)) {
+               if (val > IQS626_CHx_ENG_1_SENSE_FREQ_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel sensing frequency: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *(engine + 1) &= ~IQS626_CHx_ENG_1_SENSE_FREQ_MASK;
+               *(engine + 1) |= (val << IQS626_CHx_ENG_1_SENSE_FREQ_SHIFT);
+       }
+
+       *(engine + 1) &= ~IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN;
+       if (fwnode_property_present(ch_node, "azoteq,ati-band-tighten"))
+               *(engine + 1) |= IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN;
+
+       if (ch_id == IQS626_CH_TP_2 || ch_id == IQS626_CH_TP_3)
+               return iqs626_parse_trackpad(iqs626, ch_node);
+
+       if (ch_id == IQS626_CH_ULP_0) {
+               sys_reg->ch_reg_ulp.hyst &= ~IQS626_ULP_PROJ_ENABLE;
+               if (fwnode_property_present(ch_node, "azoteq,proj-enable"))
+                       sys_reg->ch_reg_ulp.hyst |= IQS626_ULP_PROJ_ENABLE;
+
+               filter = &sys_reg->ch_reg_ulp.filter;
+
+               rx_enable = &sys_reg->ch_reg_ulp.rx_enable;
+               tx_enable = &sys_reg->ch_reg_ulp.tx_enable;
+       } else {
+               i = ch_id - IQS626_CH_GEN_0;
+               filter = &sys_reg->ch_reg_gen[i].filter;
+
+               rx_enable = &sys_reg->ch_reg_gen[i].rx_enable;
+               tx_enable = &sys_reg->ch_reg_gen[i].tx_enable;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-cnt",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filter &= ~IQS626_FILT_STR_NP_CNT_MASK;
+               *filter |= (val << IQS626_FILT_STR_NP_CNT_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-cnt",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filter &= ~IQS626_FILT_STR_LP_CNT_MASK;
+               *filter |= (val << IQS626_FILT_STR_LP_CNT_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-lta",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filter &= ~IQS626_FILT_STR_NP_LTA_MASK;
+               *filter |= (val << IQS626_FILT_STR_NP_LTA_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-lta",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filter &= ~IQS626_FILT_STR_LP_LTA_MASK;
+               *filter |= val;
+       }
+
+       error = iqs626_parse_pins(iqs626, ch_node, "azoteq,rx-enable",
+                                 rx_enable);
+       if (error)
+               return error;
+
+       error = iqs626_parse_pins(iqs626, ch_node, "azoteq,tx-enable",
+                                 tx_enable);
+       if (error)
+               return error;
+
+       if (ch_id == IQS626_CH_ULP_0)
+               return 0;
+
+       *(engine + 2) &= ~IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE;
+       if (!fwnode_property_read_u32(ch_node, "azoteq,local-cap-size",
+                                     &val) && val) {
+               unsigned int orig_val = val--;  /* keep original for error messages */
+
+               if (val > IQS626_CHx_ENG_2_LOCAL_CAP_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel local cap. size: %u\n",
+                               fwnode_get_name(ch_node), orig_val);
+                       return -EINVAL;
+               }
+
+               *(engine + 2) &= ~IQS626_CHx_ENG_2_LOCAL_CAP_MASK;
+               *(engine + 2) |= (val << IQS626_CHx_ENG_2_LOCAL_CAP_SHIFT);
+
+               *(engine + 2) |= IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,sense-mode", &val)) {
+               if (val > IQS626_CHx_ENG_2_SENSE_MODE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel sensing mode: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *(engine + 2) &= ~IQS626_CHx_ENG_2_SENSE_MODE_MASK;
+               *(engine + 2) |= val;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,tx-freq", &val)) {
+               if (val > IQS626_CHx_ENG_3_TX_FREQ_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel excitation frequency: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *(engine + 3) &= ~IQS626_CHx_ENG_3_TX_FREQ_MASK;
+               *(engine + 3) |= (val << IQS626_CHx_ENG_3_TX_FREQ_SHIFT);
+       }
+
+       *(engine + 3) &= ~IQS626_CHx_ENG_3_INV_LOGIC;
+       if (fwnode_property_present(ch_node, "azoteq,invert-enable"))
+               *(engine + 3) |= IQS626_CHx_ENG_3_INV_LOGIC;
+
+       *(engine + 4) &= ~IQS626_CHx_ENG_4_COMP_DISABLE;
+       if (fwnode_property_present(ch_node, "azoteq,comp-disable"))
+               *(engine + 4) |= IQS626_CHx_ENG_4_COMP_DISABLE;
+
+       *(engine + 4) &= ~IQS626_CHx_ENG_4_STATIC_ENABLE;
+       if (fwnode_property_present(ch_node, "azoteq,static-enable"))
+               *(engine + 4) |= IQS626_CHx_ENG_4_STATIC_ENABLE;
+
+       i = ch_id - IQS626_CH_GEN_0;
+       assoc_select = &sys_reg->ch_reg_gen[i].assoc_select;
+       assoc_weight = &sys_reg->ch_reg_gen[i].assoc_weight;
+
+       *assoc_select = 0;
+       if (!fwnode_property_present(ch_node, "azoteq,assoc-select"))
+               return 0;
+
+       for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) {
+               if (fwnode_property_match_string(ch_node, "azoteq,assoc-select",
+                                                iqs626_channels[i].name) < 0)
+                       continue;
+
+               *assoc_select |= iqs626_channels[i].active;
+       }
+
+       if (fwnode_property_read_u32(ch_node, "azoteq,assoc-weight", &val))
+               return 0;
+
+       if (val > IQS626_GEN_WEIGHT_MAX) {
+               dev_err(&client->dev,
+                       "Invalid %s channel associated weight: %u\n",
+                       fwnode_get_name(ch_node), val);
+               return -EINVAL;
+       }
+
+       *assoc_weight = val;
+
+       return 0;
+}
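
The charge-cycle-time handling in this function is its subtlest step: for the
generic channels a four-bit value is scattered across two registers, the two
low bits into ENG_4 and the two remaining bits into ENG_1. A standalone sketch
of that split, with the bit positions invented for illustration:

    #include <assert.h>
    #include <stdint.h>

    #define ENG1_CCT_HIGH_1 0x02        /* positions assumed */
    #define ENG1_CCT_HIGH_0 0x01
    #define ENG4_CCT_LOW_1  0x02
    #define ENG4_CCT_LOW_0  0x01

    /* Scatter a 4-bit charge cycle time across the two engine registers. */
    static void set_cct(uint8_t *eng1, uint8_t *eng4, unsigned int val)
    {
            assert(val < 16);

            *eng4 &= ~(ENG4_CCT_LOW_1 | ENG4_CCT_LOW_0);
            if (val & 1)
                    *eng4 |= ENG4_CCT_LOW_0;
            if (val & 2)
                    *eng4 |= ENG4_CCT_LOW_1;

            val >>= 2;                  /* the rest lands in ENG_1 */

            *eng1 &= ~(ENG1_CCT_HIGH_1 | ENG1_CCT_HIGH_0);
            if (val & 1)
                    *eng1 |= ENG1_CCT_HIGH_0;
            if (val & 2)
                    *eng1 |= ENG1_CCT_HIGH_1;
    }

    int main(void)
    {
            uint8_t eng1 = 0, eng4 = 0;

            set_cct(&eng1, &eng4, 0xB);                     /* 0b1011 */
            assert(eng4 == (ENG4_CCT_LOW_1 | ENG4_CCT_LOW_0));
            assert(eng1 == ENG1_CCT_HIGH_1);
            return 0;
    }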
+
+static int iqs626_parse_prop(struct iqs626_private *iqs626)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       struct fwnode_handle *ch_node;
+       unsigned int val;
+       int error, i;
+       u16 general;
+
+       if (!device_property_read_u32(&client->dev, "azoteq,suspend-mode",
+                                     &val)) {
+               if (val > IQS626_SYS_SETTINGS_PWR_MODE_MAX) {
+                       dev_err(&client->dev, "Invalid suspend mode: %u\n",
+                               val);
+                       return -EINVAL;
+               }
+
+               iqs626->suspend_mode = val;
+       }
+
+       error = regmap_raw_read(iqs626->regmap, IQS626_SYS_SETTINGS, sys_reg,
+                               sizeof(*sys_reg));
+       if (error)
+               return error;
+
+       general = be16_to_cpu(sys_reg->general);
+       general &= IQS626_SYS_SETTINGS_ULP_UPDATE_MASK;
+
+       if (device_property_present(&client->dev, "azoteq,clk-div"))
+               general |= IQS626_SYS_SETTINGS_CLK_DIV;
+
+       if (device_property_present(&client->dev, "azoteq,ulp-enable"))
+               general |= IQS626_SYS_SETTINGS_ULP_AUTO;
+
+       if (!device_property_read_u32(&client->dev, "azoteq,ulp-update",
+                                     &val)) {
+               if (val > IQS626_SYS_SETTINGS_ULP_UPDATE_MAX) {
+                       dev_err(&client->dev, "Invalid update rate: %u\n", val);
+                       return -EINVAL;
+               }
+
+               general &= ~IQS626_SYS_SETTINGS_ULP_UPDATE_MASK;
+               general |= (val << IQS626_SYS_SETTINGS_ULP_UPDATE_SHIFT);
+       }
+
+       sys_reg->misc_a &= ~IQS626_MISC_A_ATI_BAND_DISABLE;
+       if (device_property_present(&client->dev, "azoteq,ati-band-disable"))
+               sys_reg->misc_a |= IQS626_MISC_A_ATI_BAND_DISABLE;
+
+       sys_reg->misc_a &= ~IQS626_MISC_A_ATI_LP_ONLY;
+       if (device_property_present(&client->dev, "azoteq,ati-lp-only"))
+               sys_reg->misc_a |= IQS626_MISC_A_ATI_LP_ONLY;
+
+       if (!device_property_read_u32(&client->dev, "azoteq,gpio3-select",
+                                     &val)) {
+               if (val > IQS626_MISC_A_GPIO3_SELECT_MAX) {
+                       dev_err(&client->dev, "Invalid GPIO3 selection: %u\n",
+                               val);
+                       return -EINVAL;
+               }
+
+               sys_reg->misc_a &= ~IQS626_MISC_A_GPIO3_SELECT_MASK;
+               sys_reg->misc_a |= val;
+       }
+
+       if (!device_property_read_u32(&client->dev, "azoteq,reseed-select",
+                                     &val)) {
+               if (val > IQS626_MISC_B_RESEED_UI_SEL_MAX) {
+                       dev_err(&client->dev, "Invalid reseed selection: %u\n",
+                               val);
+                       return -EINVAL;
+               }
+
+               sys_reg->misc_b &= ~IQS626_MISC_B_RESEED_UI_SEL_MASK;
+               sys_reg->misc_b |= (val << IQS626_MISC_B_RESEED_UI_SEL_SHIFT);
+       }
+
+       sys_reg->misc_b &= ~IQS626_MISC_B_THRESH_EXTEND;
+       if (device_property_present(&client->dev, "azoteq,thresh-extend"))
+               sys_reg->misc_b |= IQS626_MISC_B_THRESH_EXTEND;
+
+       sys_reg->misc_b &= ~IQS626_MISC_B_TRACKING_UI_ENABLE;
+       if (device_property_present(&client->dev, "azoteq,tracking-enable"))
+               sys_reg->misc_b |= IQS626_MISC_B_TRACKING_UI_ENABLE;
+
+       sys_reg->misc_b &= ~IQS626_MISC_B_RESEED_OFFSET;
+       if (device_property_present(&client->dev, "azoteq,reseed-offset"))
+               sys_reg->misc_b |= IQS626_MISC_B_RESEED_OFFSET;
+
+       if (!device_property_read_u32(&client->dev, "azoteq,rate-np-ms",
+                                     &val)) {
+               if (val > IQS626_RATE_NP_MS_MAX) {
+                       dev_err(&client->dev, "Invalid report rate: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->rate_np = val;
+       }
+
+       if (!device_property_read_u32(&client->dev, "azoteq,rate-lp-ms",
+                                     &val)) {
+               if (val > IQS626_RATE_LP_MS_MAX) {
+                       dev_err(&client->dev, "Invalid report rate: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->rate_lp = val;
+       }
+
+       if (!device_property_read_u32(&client->dev, "azoteq,rate-ulp-ms",
+                                     &val)) {
+               if (val > IQS626_RATE_ULP_MS_MAX) {
+                       dev_err(&client->dev, "Invalid report rate: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->rate_ulp = val / 16;
+       }
+
+       if (!device_property_read_u32(&client->dev, "azoteq,timeout-pwr-ms",
+                                     &val)) {
+               if (val > IQS626_TIMEOUT_PWR_MS_MAX) {
+                       dev_err(&client->dev, "Invalid timeout: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->timeout_pwr = val / 512;
+       }
+
+       if (!device_property_read_u32(&client->dev, "azoteq,timeout-lta-ms",
+                                     &val)) {
+               if (val > IQS626_TIMEOUT_LTA_MS_MAX) {
+                       dev_err(&client->dev, "Invalid timeout: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->timeout_lta = val / 512;
+       }
+
+       sys_reg->event_mask = ~((u8)IQS626_EVENT_MASK_SYS);
+       sys_reg->redo_ati = 0;
+
+       sys_reg->reseed = 0;
+       sys_reg->active = 0;
+
+       for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) {
+               ch_node = device_get_named_child_node(&client->dev,
+                                                     iqs626_channels[i].name);
+               if (!ch_node)
+                       continue;
+
+               error = iqs626_parse_channel(iqs626, ch_node, i);
+               if (error)
+                       return error;
+
+               error = iqs626_parse_ati_target(iqs626, ch_node, i);
+               if (error)
+                       return error;
+
+               error = iqs626_parse_events(iqs626, ch_node, i);
+               if (error)
+                       return error;
+
+               if (!fwnode_property_present(ch_node, "azoteq,ati-exclude"))
+                       sys_reg->redo_ati |= iqs626_channels[i].active;
+
+               if (!fwnode_property_present(ch_node, "azoteq,reseed-disable"))
+                       sys_reg->reseed |= iqs626_channels[i].active;
+
+               sys_reg->active |= iqs626_channels[i].active;
+       }
+
+       general |= IQS626_SYS_SETTINGS_EVENT_MODE;
+
+       /*
+        * Enable streaming during normal-power mode if the trackpad is used to
+        * report raw coordinates instead of gestures. In that case, the device
+        * returns to event mode during low-power mode.
+        */
+       if (sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active &&
+           sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE)
+               general |= IQS626_SYS_SETTINGS_EVENT_MODE_LP;
+
+       general |= IQS626_SYS_SETTINGS_REDO_ATI;
+       general |= IQS626_SYS_SETTINGS_ACK_RESET;
+
+       sys_reg->general = cpu_to_be16(general);
+
+       error = regmap_raw_write(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                &iqs626->sys_reg, sizeof(iqs626->sys_reg));
+       if (error)
+               return error;
+
+       iqs626_irq_wait();
+
+       return 0;
+}
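
Note that sys_reg->general is held big-endian in the cached register image, so
the function converts to CPU order, edits the bits, and converts back before
the raw write. A round-trip sketch using the glibc endian helpers, with the
bit value assumed:

    #include <assert.h>
    #include <endian.h>
    #include <stdint.h>

    #define SETTINGS_EVENT_MODE 0x0040  /* bit position assumed */

    int main(void)
    {
            uint16_t reg_be = htobe16(0x1200);  /* as stored in sys_reg */
            uint16_t general;

            general = be16toh(reg_be);          /* to CPU order to edit */
            general |= SETTINGS_EVENT_MODE;
            reg_be = htobe16(general);          /* back to wire order */

            assert(be16toh(reg_be) == 0x1240);
            return 0;
    }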
+
+static int iqs626_input_init(struct iqs626_private *iqs626)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       int error, i, j;
+
+       iqs626->keypad = devm_input_allocate_device(&client->dev);
+       if (!iqs626->keypad)
+               return -ENOMEM;
+
+       iqs626->keypad->keycodemax = ARRAY_SIZE(iqs626->kp_code);
+       iqs626->keypad->keycode = iqs626->kp_code;
+       iqs626->keypad->keycodesize = sizeof(**iqs626->kp_code);
+
+       iqs626->keypad->name = "iqs626a_keypad";
+       iqs626->keypad->id.bustype = BUS_I2C;
+
+       for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) {
+               if (!(sys_reg->active & iqs626_channels[i].active))
+                       continue;
+
+               for (j = 0; j < ARRAY_SIZE(iqs626_events); j++) {
+                       if (!iqs626->kp_type[i][j])
+                               continue;
+
+                       input_set_capability(iqs626->keypad,
+                                            iqs626->kp_type[i][j],
+                                            iqs626->kp_code[i][j]);
+               }
+       }
+
+       if (!(sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active))
+               return 0;
+
+       iqs626->trackpad = devm_input_allocate_device(&client->dev);
+       if (!iqs626->trackpad)
+               return -ENOMEM;
+
+       iqs626->trackpad->keycodemax = ARRAY_SIZE(iqs626->tp_code);
+       iqs626->trackpad->keycode = iqs626->tp_code;
+       iqs626->trackpad->keycodesize = sizeof(*iqs626->tp_code);
+
+       iqs626->trackpad->name = "iqs626a_trackpad";
+       iqs626->trackpad->id.bustype = BUS_I2C;
+
+       /*
+        * Present the trackpad as a traditional pointing device if no gestures
+        * have been mapped to a keycode.
+        */
+       if (sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE) {
+               u8 tp_mask = iqs626_channels[IQS626_CH_TP_3].active;
+
+               input_set_capability(iqs626->trackpad, EV_KEY, BTN_TOUCH);
+               input_set_abs_params(iqs626->trackpad, ABS_Y, 0, 255, 0, 0);
+
+               if ((sys_reg->active & tp_mask) == tp_mask)
+                       input_set_abs_params(iqs626->trackpad,
+                                            ABS_X, 0, 255, 0, 0);
+               else
+                       input_set_abs_params(iqs626->trackpad,
+                                            ABS_X, 0, 128, 0, 0);
+
+               touchscreen_parse_properties(iqs626->trackpad, false,
+                                            &iqs626->prop);
+       } else {
+               for (i = 0; i < IQS626_NUM_GESTURES; i++)
+                       if (iqs626->tp_code[i] != KEY_RESERVED)
+                               input_set_capability(iqs626->trackpad, EV_KEY,
+                                                    iqs626->tp_code[i]);
+       }
+
+       error = input_register_device(iqs626->trackpad);
+       if (error)
+               dev_err(&client->dev, "Failed to register trackpad: %d\n",
+                       error);
+
+       return error;
+}
+
+static int iqs626_report(struct iqs626_private *iqs626)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       struct iqs626_flags flags;
+       __le16 hall_output;
+       int error, i, j;
+       u8 state;
+       u8 *dir_mask = &flags.states[IQS626_ST_OFFS_DIR];
+
+       error = regmap_raw_read(iqs626->regmap, IQS626_SYS_FLAGS, &flags,
+                               sizeof(flags));
+       if (error) {
+               dev_err(&client->dev, "Failed to read device status: %d\n",
+                       error);
+               return error;
+       }
+
+       /*
+        * The device resets itself if its own watchdog bites, which can happen
+        * in the event of an I2C communication error. In this case, the device
+        * asserts a SHOW_RESET interrupt and all registers must be restored.
+        */
+       if (be16_to_cpu(flags.system) & IQS626_SYS_FLAGS_SHOW_RESET) {
+               dev_err(&client->dev, "Unexpected device reset\n");
+
+               error = regmap_raw_write(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                        sys_reg, sizeof(*sys_reg));
+               if (error)
+                       dev_err(&client->dev,
+                               "Failed to re-initialize device: %d\n", error);
+
+               return error;
+       }
+
+       if (be16_to_cpu(flags.system) & IQS626_SYS_FLAGS_IN_ATI)
+               return 0;
+
+       /*
+        * Unlike the ULP or generic channels, the Hall channel does not have a
+        * direction flag. Instead, the direction (i.e. magnet polarity) can be
+        * derived based on the sign of the 2's complement differential output.
+        */
+       if (sys_reg->active & iqs626_channels[IQS626_CH_HALL].active) {
+               error = regmap_raw_read(iqs626->regmap, IQS626_HALL_OUTPUT,
+                                       &hall_output, sizeof(hall_output));
+               if (error) {
+                       dev_err(&client->dev,
+                               "Failed to read Hall output: %d\n", error);
+                       return error;
+               }
+
+               *dir_mask &= ~iqs626_channels[IQS626_CH_HALL].active;
+               if (le16_to_cpu(hall_output) < 0x8000)
+                       *dir_mask |= iqs626_channels[IQS626_CH_HALL].active;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) {
+               if (!(sys_reg->active & iqs626_channels[i].active))
+                       continue;
+
+               for (j = 0; j < ARRAY_SIZE(iqs626_events); j++) {
+                       if (!iqs626->kp_type[i][j])
+                               continue;
+
+                       state = flags.states[iqs626_events[j].st_offs];
+                       state &= iqs626_events[j].dir_up ? *dir_mask
+                                                        : ~(*dir_mask);
+                       state &= iqs626_channels[i].active;
+
+                       input_event(iqs626->keypad, iqs626->kp_type[i][j],
+                                   iqs626->kp_code[i][j], !!state);
+               }
+       }
+
+       input_sync(iqs626->keypad);
+
+       /*
+        * The following completion signals that ATI has finished, any initial
+        * switch states have been reported and the keypad can be registered.
+        */
+       complete_all(&iqs626->ati_done);
+
+       if (!(sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active))
+               return 0;
+
+       if (sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE) {
+               state = flags.states[IQS626_ST_OFFS_TOUCH];
+               state &= iqs626_channels[IQS626_CH_TP_2].active;
+
+               input_report_key(iqs626->trackpad, BTN_TOUCH, state);
+
+               if (state)
+                       touchscreen_report_pos(iqs626->trackpad, &iqs626->prop,
+                                              flags.trackpad_x,
+                                              flags.trackpad_y, false);
+       } else {
+               for (i = 0; i < IQS626_NUM_GESTURES; i++)
+                       input_report_key(iqs626->trackpad, iqs626->tp_code[i],
+                                        flags.gesture & BIT(i));
+
+               if (flags.gesture & GENMASK(IQS626_GESTURE_TAP, 0)) {
+                       input_sync(iqs626->trackpad);
+
+                       /*
+                        * Momentary gestures are followed by a complementary
+                        * release cycle so as to emulate a full keystroke.
+                        */
+                       for (i = 0; i < IQS626_GESTURE_HOLD; i++)
+                               input_report_key(iqs626->trackpad,
+                                                iqs626->tp_code[i], 0);
+               }
+       }
+
+       input_sync(iqs626->trackpad);
+
+       return 0;
+}
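
As the comment above explains, the Hall channel's polarity is recovered from
the sign of its two's complement differential output: a raw value below 0x8000
has the sign bit clear, so the magnet faces one way; anything at or above it,
the other. The predicate in isolation:

    #include <assert.h>
    #include <stdint.h>

    /* Sign test on a 16-bit two's complement reading. */
    static int hall_is_positive(uint16_t raw)
    {
            return raw < 0x8000;        /* sign bit clear */
    }

    int main(void)
    {
            assert(hall_is_positive(0x0001));
            assert(hall_is_positive(0x7FFF));   /*  32767 */
            assert(!hall_is_positive(0x8000));  /* -32768 */
            assert(!hall_is_positive(0xFFFF));  /*     -1 */
            return 0;
    }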
+
+static irqreturn_t iqs626_irq(int irq, void *context)
+{
+       struct iqs626_private *iqs626 = context;
+
+       if (iqs626_report(iqs626))
+               return IRQ_NONE;
+
+       /*
+        * The device does not deassert its interrupt (RDY) pin until shortly
+        * after receiving an I2C stop condition; the following delay ensures
+        * the interrupt handler does not return before this time.
+        */
+       iqs626_irq_wait();
+
+       return IRQ_HANDLED;
+}
+
+static const struct regmap_config iqs626_regmap_config = {
+       .reg_bits = 8,
+       .val_bits = 16,
+       .max_register = IQS626_MAX_REG,
+};
+
+static int iqs626_probe(struct i2c_client *client)
+{
+       struct iqs626_ver_info ver_info;
+       struct iqs626_private *iqs626;
+       int error;
+
+       iqs626 = devm_kzalloc(&client->dev, sizeof(*iqs626), GFP_KERNEL);
+       if (!iqs626)
+               return -ENOMEM;
+
+       i2c_set_clientdata(client, iqs626);
+       iqs626->client = client;
+
+       iqs626->regmap = devm_regmap_init_i2c(client, &iqs626_regmap_config);
+       if (IS_ERR(iqs626->regmap)) {
+               error = PTR_ERR(iqs626->regmap);
+               dev_err(&client->dev, "Failed to initialize register map: %d\n",
+                       error);
+               return error;
+       }
+
+       init_completion(&iqs626->ati_done);
+
+       error = regmap_raw_read(iqs626->regmap, IQS626_VER_INFO, &ver_info,
+                               sizeof(ver_info));
+       if (error)
+               return error;
+
+       if (ver_info.prod_num != IQS626_VER_INFO_PROD_NUM) {
+               dev_err(&client->dev, "Unrecognized product number: 0x%02X\n",
+                       ver_info.prod_num);
+               return -EINVAL;
+       }
+
+       error = iqs626_parse_prop(iqs626);
+       if (error)
+               return error;
+
+       error = iqs626_input_init(iqs626);
+       if (error)
+               return error;
+
+       error = devm_request_threaded_irq(&client->dev, client->irq,
+                                         NULL, iqs626_irq, IRQF_ONESHOT,
+                                         client->name, iqs626);
+       if (error) {
+               dev_err(&client->dev, "Failed to request IRQ: %d\n", error);
+               return error;
+       }
+
+       if (!wait_for_completion_timeout(&iqs626->ati_done,
+                                        msecs_to_jiffies(2000))) {
+               dev_err(&client->dev, "Failed to complete ATI\n");
+               return -ETIMEDOUT;
+       }
+
+       /*
+        * The keypad may include one or more switches and is not registered
+        * until ATI is complete and the initial switch states are read.
+        */
+       error = input_register_device(iqs626->keypad);
+       if (error)
+               dev_err(&client->dev, "Failed to register keypad: %d\n", error);
+
+       return error;
+}
+
+static int __maybe_unused iqs626_suspend(struct device *dev)
+{
+       struct iqs626_private *iqs626 = dev_get_drvdata(dev);
+       struct i2c_client *client = iqs626->client;
+       unsigned int val;
+       int error;
+
+       if (!iqs626->suspend_mode)
+               return 0;
+
+       disable_irq(client->irq);
+
+       /*
+        * Automatic power mode switching must be disabled before the device is
+        * forced into any particular power mode. In this case, the device will
+        * transition into normal-power mode.
+        */
+       error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                  IQS626_SYS_SETTINGS_DIS_AUTO, ~0);
+       if (error)
+               goto err_irq;
+
+       /*
+        * The following check ensures the device has completed its transition
+        * into normal-power mode before a manual mode switch is performed.
+        */
+       error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val,
+                                       !(val & IQS626_SYS_FLAGS_PWR_MODE_MASK),
+                                        IQS626_PWR_MODE_POLL_SLEEP_US,
+                                        IQS626_PWR_MODE_POLL_TIMEOUT_US);
+       if (error)
+               goto err_irq;
+
+       error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                  IQS626_SYS_SETTINGS_PWR_MODE_MASK,
+                                  iqs626->suspend_mode <<
+                                  IQS626_SYS_SETTINGS_PWR_MODE_SHIFT);
+       if (error)
+               goto err_irq;
+
+       /*
+        * This last check ensures the device has completed its transition into
+        * the desired power mode to prevent any spurious interrupts from being
+        * triggered after iqs626_suspend has already returned.
+        */
+       error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val,
+                                        (val & IQS626_SYS_FLAGS_PWR_MODE_MASK)
+                                        == (iqs626->suspend_mode <<
+                                            IQS626_SYS_FLAGS_PWR_MODE_SHIFT),
+                                        IQS626_PWR_MODE_POLL_SLEEP_US,
+                                        IQS626_PWR_MODE_POLL_TIMEOUT_US);
+
+err_irq:
+       iqs626_irq_wait();
+       enable_irq(client->irq);
+
+       return error;
+}
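
Both poll sites above lean on regmap_read_poll_timeout(), which re-reads a
register until a condition holds or a deadline passes. A rough userspace
analogue of that shape; the flag layout, intervals and the simulated device
are all invented:

    #include <assert.h>
    #include <errno.h>
    #include <time.h>
    #include <unistd.h>

    #define PWR_MODE_MASK   0x18        /* field position assumed */
    #define POLL_SLEEP_US   1000
    #define POLL_TIMEOUT_US 100000

    static unsigned int fake_flags = 0x18;

    /* Stand-in for the regmap read; the "device" settles over a few reads. */
    static int read_sys_flags(unsigned int *val)
    {
            *val = fake_flags;
            if (fake_flags)
                    fake_flags -= 0x08;
            return 0;
    }

    static long elapsed_us(const struct timespec *a, const struct timespec *b)
    {
            return (b->tv_sec - a->tv_sec) * 1000000L +
                   (b->tv_nsec - a->tv_nsec) / 1000L;
    }

    static int wait_for_normal_power(void)
    {
            struct timespec start, now;
            unsigned int val;
            int err;

            clock_gettime(CLOCK_MONOTONIC, &start);
            for (;;) {
                    err = read_sys_flags(&val);
                    if (err)
                            return err;
                    if (!(val & PWR_MODE_MASK))
                            return 0;           /* condition met */

                    clock_gettime(CLOCK_MONOTONIC, &now);
                    if (elapsed_us(&start, &now) > POLL_TIMEOUT_US)
                            return -ETIMEDOUT;  /* give up, like the macro */

                    usleep(POLL_SLEEP_US);
            }
    }

    int main(void)
    {
            assert(wait_for_normal_power() == 0);
            return 0;
    }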
+
+static int __maybe_unused iqs626_resume(struct device *dev)
+{
+       struct iqs626_private *iqs626 = dev_get_drvdata(dev);
+       struct i2c_client *client = iqs626->client;
+       unsigned int val;
+       int error;
+
+       if (!iqs626->suspend_mode)
+               return 0;
+
+       disable_irq(client->irq);
+
+       error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                  IQS626_SYS_SETTINGS_PWR_MODE_MASK, 0);
+       if (error)
+               goto err_irq;
+
+       /*
+        * This check ensures the device has returned to normal-power mode
+        * before automatic power mode switching is re-enabled.
+        */
+       error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val,
+                                       !(val & IQS626_SYS_FLAGS_PWR_MODE_MASK),
+                                        IQS626_PWR_MODE_POLL_SLEEP_US,
+                                        IQS626_PWR_MODE_POLL_TIMEOUT_US);
+       if (error)
+               goto err_irq;
+
+       error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                  IQS626_SYS_SETTINGS_DIS_AUTO, 0);
+       if (error)
+               goto err_irq;
+
+       /*
+        * This step reports any events that may have been "swallowed" as a
+        * result of polling PWR_MODE (which automatically acknowledges any
+        * pending interrupts).
+        */
+       error = iqs626_report(iqs626);
+
+err_irq:
+       iqs626_irq_wait();
+       enable_irq(client->irq);
+
+       return error;
+}
+
+static SIMPLE_DEV_PM_OPS(iqs626_pm, iqs626_suspend, iqs626_resume);
+
+static const struct of_device_id iqs626_of_match[] = {
+       { .compatible = "azoteq,iqs626a" },
+       { }
+};
+MODULE_DEVICE_TABLE(of, iqs626_of_match);
+
+static struct i2c_driver iqs626_i2c_driver = {
+       .driver = {
+               .name = "iqs626a",
+               .of_match_table = iqs626_of_match,
+               .pm = &iqs626_pm,
+       },
+       .probe_new = iqs626_probe,
+};
+module_i2c_driver(iqs626_i2c_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS626A Capacitive Touch Controller");
+MODULE_LICENSE("GPL");
index 20ff087..cd5e99e 100644 (file)
@@ -61,15 +61,10 @@ static int max8997_haptic_set_duty_cycle(struct max8997_haptic *chip)
                unsigned int duty = chip->pwm_period * chip->level / 100;
                ret = pwm_config(chip->pwm, duty, chip->pwm_period);
        } else {
-               int i;
                u8 duty_index = 0;
 
-               for (i = 0; i <= 64; i++) {
-                       if (chip->level <= i * 100 / 64) {
-                               duty_index = i;
-                               break;
-                       }
-               }
+               duty_index = DIV_ROUND_UP(chip->level * 64, 100);
+
                switch (chip->internal_mode_pattern) {
                case 0:
                        max8997_write_reg(chip->client,
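
The DIV_ROUND_UP() replacement above is exact rather than approximate: the old
loop returned the smallest i with level <= i * 100 / 64, and because level is
an integer that is precisely ceil(level * 64 / 100). A quick exhaustive check
over the full 0-100 range:

    #include <assert.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            int level, i, loop_index;

            for (level = 0; level <= 100; level++) {
                    loop_index = 0;
                    for (i = 0; i <= 64; i++) {
                            if (level <= i * 100 / 64) {
                                    loop_index = i;
                                    break;
                            }
                    }
                    assert(loop_index == DIV_ROUND_UP(level * 64, 100));
            }
            return 0;
    }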
index e12da5b..dc4a240 100644 (file)
 #define ETP_FW_PAGE_SIZE_512   512
 #define ETP_FW_SIGNATURE_SIZE  6
 
+#define ETP_PRODUCT_ID_DELBIN  0x00C2
+#define ETP_PRODUCT_ID_VOXEL   0x00BF
+#define ETP_PRODUCT_ID_MAGPIE  0x0120
+#define ETP_PRODUCT_ID_BOBBA   0x0121
+
 struct i2c_client;
 struct completion;
 
@@ -73,7 +78,7 @@ struct elan_transport_ops {
        int (*calibrate_result)(struct i2c_client *client, u8 *val);
 
        int (*get_baseline_data)(struct i2c_client *client,
-                                bool max_baseliune, u8 *value);
+                                bool max_baseline, u8 *value);
 
        int (*get_version)(struct i2c_client *client, u8 pattern, bool iap,
                           u8 *version);
index bef7382..dad22c1 100644 (file)
@@ -46,6 +46,9 @@
 #define ETP_FINGER_WIDTH       15
 #define ETP_RETRY_COUNT                3
 
+/* quirks to control the device */
+#define ETP_QUIRK_QUICK_WAKEUP BIT(0)
+
 /* The main device structure */
 struct elan_tp_data {
        struct i2c_client       *client;
@@ -90,8 +93,38 @@ struct elan_tp_data {
        bool                    baseline_ready;
        u8                      clickpad;
        bool                    middle_button;
+
+       u32                     quirks;         /* Various quirks */
 };
 
+static u32 elan_i2c_lookup_quirks(u16 ic_type, u16 product_id)
+{
+       static const struct {
+               u16 ic_type;
+               u16 product_id;
+               u32 quirks;
+       } elan_i2c_quirks[] = {
+               { 0x0D, ETP_PRODUCT_ID_DELBIN, ETP_QUIRK_QUICK_WAKEUP },
+               { 0x10, ETP_PRODUCT_ID_VOXEL, ETP_QUIRK_QUICK_WAKEUP },
+               { 0x14, ETP_PRODUCT_ID_MAGPIE, ETP_QUIRK_QUICK_WAKEUP },
+               { 0x14, ETP_PRODUCT_ID_BOBBA, ETP_QUIRK_QUICK_WAKEUP },
+       };
+       u32 quirks = 0;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(elan_i2c_quirks); i++) {
+               if (elan_i2c_quirks[i].ic_type == ic_type &&
+                   elan_i2c_quirks[i].product_id == product_id) {
+                       quirks = elan_i2c_quirks[i].quirks;
+               }
+       }
+
+       if (ic_type >= 0x0D && product_id >= 0x123)
+               quirks |= ETP_QUIRK_QUICK_WAKEUP;
+
+       return quirks;
+}
+
 static int elan_get_fwinfo(u16 ic_type, u8 iap_version, u16 *validpage_count,
                           u32 *signature_address, u16 *page_size)
 {
@@ -258,16 +291,18 @@ static int elan_check_ASUS_special_fw(struct elan_tp_data *data)
        return false;
 }
 
-static int __elan_initialize(struct elan_tp_data *data)
+static int __elan_initialize(struct elan_tp_data *data, bool skip_reset)
 {
        struct i2c_client *client = data->client;
        bool woken_up = false;
        int error;
 
-       error = data->ops->initialize(client);
-       if (error) {
-               dev_err(&client->dev, "device initialize failed: %d\n", error);
-               return error;
+       if (!skip_reset) {
+               error = data->ops->initialize(client);
+               if (error) {
+                       dev_err(&client->dev, "device initialize failed: %d\n", error);
+                       return error;
+               }
        }
 
        error = elan_query_product(data);
@@ -311,16 +346,17 @@ static int __elan_initialize(struct elan_tp_data *data)
        return 0;
 }
 
-static int elan_initialize(struct elan_tp_data *data)
+static int elan_initialize(struct elan_tp_data *data, bool skip_reset)
 {
        int repeat = ETP_RETRY_COUNT;
        int error;
 
        do {
-               error = __elan_initialize(data);
+               error = __elan_initialize(data, skip_reset);
                if (!error)
                        return 0;
 
+               skip_reset = false;
                msleep(30);
        } while (--repeat > 0);
 
@@ -357,6 +393,8 @@ static int elan_query_device_info(struct elan_tp_data *data)
        if (error)
                return error;
 
+       data->quirks = elan_i2c_lookup_quirks(data->ic_type, data->product_id);
+
        error = elan_get_fwinfo(data->ic_type, data->iap_version,
                                &data->fw_validpage_count,
                                &data->fw_signature_address,
@@ -546,7 +584,7 @@ static int elan_update_firmware(struct elan_tp_data *data,
                data->ops->iap_reset(client);
        } else {
                /* Reinitialize TP after fw is updated */
-               elan_initialize(data);
+               elan_initialize(data, false);
                elan_query_device_info(data);
        }
 
@@ -1247,7 +1285,7 @@ static int elan_probe(struct i2c_client *client,
        }
 
        /* Initialize the touchpad. */
-       error = elan_initialize(data);
+       error = elan_initialize(data, false);
        if (error)
                return error;
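
The new skip_reset argument lets parts with the quick-wakeup quirk resume
without a full reset, but note that elan_initialize() clears the flag after
the first failed attempt, so every retry does reset the part. A condensed
sketch of that control flow; the helper names and the fails-once model are
invented:

    #include <assert.h>
    #include <stdbool.h>

    #define RETRY_COUNT 3

    static int attempts, resets;

    /* Fails once, to show that the retry path re-enables the reset. */
    static int try_init(bool skip_reset)
    {
            attempts++;
            if (!skip_reset)
                    resets++;
            return attempts == 1 ? -1 : 0;
    }

    static int initialize(bool skip_reset)
    {
            int repeat = RETRY_COUNT;
            int error;

            do {
                    error = try_init(skip_reset);
                    if (!error)
                            return 0;
                    skip_reset = false; /* retries always reset the part */
            } while (--repeat > 0);

            return error;
    }

    int main(void)
    {
            assert(initialize(true) == 0);
            assert(attempts == 2 && resets == 1);   /* reset only on retry */
            return 0;
    }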
 
@@ -1384,7 +1422,7 @@ static int __maybe_unused elan_resume(struct device *dev)
                goto err;
        }
 
-       error = elan_initialize(data);
+       error = elan_initialize(data, data->quirks & ETP_QUIRK_QUICK_WAKEUP);
        if (error)
                dev_err(dev, "initialize when resuming failed: %d\n", error);
 
index 594ac4e..974d7bf 100644 (file)
@@ -103,7 +103,6 @@ static int apbps2_open(struct serio *io)
 {
        struct apbps2_priv *priv = io->port_data;
        int limit;
-       unsigned long tmp;
 
        /* clear error flags */
        iowrite32be(0, &priv->regs->status);
@@ -111,7 +110,7 @@ static int apbps2_open(struct serio *io)
        /* Clear old data if available (unlikely) */
        limit = 1024;
        while ((ioread32be(&priv->regs->status) & APBPS2_STATUS_DR) && --limit)
-               tmp = ioread32be(&priv->regs->data);
+               ioread32be(&priv->regs->data);
 
        /* Enable receiver and its interrupt */
        iowrite32be(APBPS2_CTRL_RE | APBPS2_CTRL_RI, &priv->regs->ctrl);
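
The drain loop above discards stale bytes but caps itself at 1024 iterations,
so a stuck data-ready bit cannot hang apbps2_open(). The bounded-drain pattern
in isolation, with the status bit and queue depth assumed:

    #include <assert.h>

    #define STATUS_DR 0x01              /* data-ready flag, value assumed */

    static int stale = 5;               /* pretend five bytes are queued */

    static unsigned int read_status(void) { return stale ? STATUS_DR : 0; }
    static unsigned int read_data(void)   { stale--; return 0xAA; }

    int main(void)
    {
            int limit = 1024;

            /* Discard pending data, but never spin forever. */
            while ((read_status() & STATUS_DR) && --limit)
                    read_data();

            assert(stale == 0 && limit == 1019);
            return 0;
    }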
similarity index 93%
rename from drivers/input/touchscreen/of_touchscreen.c
rename to drivers/input/touchscreen.c
index 97342e1..dd18cb9 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- *  Generic DT helper functions for touchscreen devices
+ *  Generic helper functions for touchscreens and other two-dimensional
+ *  pointing devices
  *
  *  Copyright (c) 2014 Sebastian Reichel <sre@kernel.org>
  */
@@ -37,7 +38,7 @@ static void touchscreen_set_params(struct input_dev *dev,
 
        if (!test_bit(axis, dev->absbit)) {
                dev_warn(&dev->dev,
-                        "DT specifies parameters but the axis %lu is not set up\n",
+                        "Parameters are specified but the axis %lu is not set up\n",
                         axis);
                return;
        }
@@ -49,7 +50,7 @@ static void touchscreen_set_params(struct input_dev *dev,
 }
 
 /**
- * touchscreen_parse_properties - parse common touchscreen DT properties
+ * touchscreen_parse_properties - parse common touchscreen properties
  * @input: input device that should be parsed
  * @multitouch: specifies whether parsed properties should be applied to
  *     single-touch or multi-touch axes
@@ -57,9 +58,9 @@ static void touchscreen_set_params(struct input_dev *dev,
  *     axis swap and invert info for use with touchscreen_report_x_y();
  *     or %NULL
  *
- * This function parses common DT properties for touchscreens and setups the
+ * This function parses common properties for touchscreens and sets up the
  * input device accordingly. The function keeps previously set up default
- * values if no value is specified via DT.
+ * values if no value is specified.
  */
 void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
                                  struct touchscreen_properties *prop)
@@ -203,4 +204,4 @@ void touchscreen_report_pos(struct input_dev *input,
 EXPORT_SYMBOL(touchscreen_report_pos);
 
 MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Device-tree helpers functions for touchscreen devices");
+MODULE_DESCRIPTION("Helper functions for touchscreens and other devices");
index 529614d..ad454cd 100644 (file)
@@ -12,10 +12,6 @@ menuconfig INPUT_TOUCHSCREEN
 
 if INPUT_TOUCHSCREEN
 
-config TOUCHSCREEN_PROPERTIES
-       def_tristate INPUT
-       depends on INPUT
-
 config TOUCHSCREEN_88PM860X
        tristate "Marvell 88PM860x touchscreen"
        depends on MFD_88PM860X
@@ -415,6 +411,17 @@ config TOUCHSCREEN_HIDEEP
          To compile this driver as a module, choose M here : the
          module will be called hideep_ts.
 
+config TOUCHSCREEN_HYCON_HY46XX
+       tristate "Hycon hy46xx touchscreen support"
+       depends on I2C
+       help
+         Say Y here if you have a touchscreen using Hycon hy46xx.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called hycon-hy46xx.
+
 config TOUCHSCREEN_ILI210X
        tristate "Ilitek ILI210X based touchscreen"
        depends on I2C
@@ -430,6 +437,18 @@ config TOUCHSCREEN_ILI210X
          To compile this driver as a module, choose M here: the
          module will be called ili210x.
 
+config TOUCHSCREEN_ILITEK
+       tristate "Ilitek I2C 213X/23XX/25XX/Lego Series Touch ICs"
+       depends on I2C
+       help
+         Say Y here if you have a touchscreen with an ILITEK touch IC;
+         it supports the 213X/23XX/25XX and other Lego series.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called ilitek_ts_i2c.
+
 config TOUCHSCREEN_IPROC
        tristate "IPROC touch panel driver support"
        depends on ARCH_BCM_IPROC || COMPILE_TEST
@@ -594,6 +613,18 @@ config TOUCHSCREEN_MELFAS_MIP4
          To compile this driver as a module, choose M here:
          the module will be called melfas_mip4.
 
+config TOUCHSCREEN_MSG2638
+       tristate "MStar msg2638 touchscreen support"
+       depends on I2C
+       depends on GPIOLIB || COMPILE_TEST
+       help
+         Say Y here if you have an I2C touchscreen using MStar msg2638.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called msg2638.
+
 config TOUCHSCREEN_MTOUCH
        tristate "MicroTouch serial touchscreens"
        select SERIO
index 6233541..7d34100 100644 (file)
@@ -7,7 +7,6 @@
 
 wm97xx-ts-y := wm97xx-core.o
 
-obj-$(CONFIG_TOUCHSCREEN_PROPERTIES)   += of_touchscreen.o
 obj-$(CONFIG_TOUCHSCREEN_88PM860X)     += 88pm860x-ts.o
 obj-$(CONFIG_TOUCHSCREEN_AD7877)       += ad7877.o
 obj-$(CONFIG_TOUCHSCREEN_AD7879)       += ad7879.o
@@ -35,6 +34,7 @@ obj-$(CONFIG_TOUCHSCREEN_DA9052)      += da9052_tsi.o
 obj-$(CONFIG_TOUCHSCREEN_DYNAPRO)      += dynapro.o
 obj-$(CONFIG_TOUCHSCREEN_EDT_FT5X06)   += edt-ft5x06.o
 obj-$(CONFIG_TOUCHSCREEN_HAMPSHIRE)    += hampshire.o
+obj-$(CONFIG_TOUCHSCREEN_HYCON_HY46XX) += hycon-hy46xx.o
 obj-$(CONFIG_TOUCHSCREEN_GUNZE)                += gunze.o
 obj-$(CONFIG_TOUCHSCREEN_EETI)         += eeti_ts.o
 obj-$(CONFIG_TOUCHSCREEN_EKTF2127)     += ektf2127.o
@@ -47,6 +47,7 @@ obj-$(CONFIG_TOUCHSCREEN_FUJITSU)     += fujitsu_ts.o
 obj-$(CONFIG_TOUCHSCREEN_GOODIX)       += goodix.o
 obj-$(CONFIG_TOUCHSCREEN_HIDEEP)       += hideep.o
 obj-$(CONFIG_TOUCHSCREEN_ILI210X)      += ili210x.o
+obj-$(CONFIG_TOUCHSCREEN_ILITEK)       += ilitek_ts_i2c.o
 obj-$(CONFIG_TOUCHSCREEN_IMX6UL_TSC)   += imx6ul_tsc.o
 obj-$(CONFIG_TOUCHSCREEN_INEXIO)       += inexio.o
 obj-$(CONFIG_TOUCHSCREEN_IPROC)                += bcm_iproc_tsc.o
@@ -59,6 +60,7 @@ obj-$(CONFIG_TOUCHSCREEN_MCS5000)     += mcs5000_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MELFAS_MIP4)  += melfas_mip4.o
 obj-$(CONFIG_TOUCHSCREEN_MIGOR)                += migor_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MMS114)       += mms114.o
+obj-$(CONFIG_TOUCHSCREEN_MSG2638)      += msg2638.o
 obj-$(CONFIG_TOUCHSCREEN_MTOUCH)       += mtouch.o
 obj-$(CONFIG_TOUCHSCREEN_MK712)                += mk712.o
 obj-$(CONFIG_TOUCHSCREEN_HP600)                += hp680_ts_input.o
index c0d5c24..dc6a853 100644 (file)
@@ -125,7 +125,7 @@ static int ar1021_i2c_probe(struct i2c_client *client,
 
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, ar1021_i2c_irq,
-                                         IRQF_ONESHOT,
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                          "ar1021_i2c", ar1021);
        if (error) {
                dev_err(&client->dev,
@@ -133,9 +133,6 @@ static int ar1021_i2c_probe(struct i2c_client *client,
                return error;
        }
 
-       /* Disable the IRQ, we'll enable it in ar1021_i2c_open() */
-       disable_irq(client->irq);
-
        error = input_register_device(ar1021->input);
        if (error) {
                dev_err(&client->dev,
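
The ar1021 change above, like the bu21029 and cyttsp hunks further down, trades the
request-then-disable_irq() sequence for the IRQF_NO_AUTOEN flag, which keeps the
line masked from the moment it is requested and closes the window in which the
handler could fire before the driver is ready. A minimal sketch of the resulting
pattern, using a hypothetical "foo" driver (foo_irq and the surrounding names are
placeholders, not identifiers from this series):

#include <linux/i2c.h>
#include <linux/interrupt.h>

static irqreturn_t foo_irq(int irq, void *dev_id)
{
        /* ... read the controller and report input events ... */
        return IRQ_HANDLED;
}

static int foo_probe(struct i2c_client *client)
{
        int error;

        /* IRQF_NO_AUTOEN leaves the line masked after request ... */
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, foo_irq,
                                          IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                          "foo", client);
        if (error)
                return error;

        /* ... so no separate disable_irq() call is needed; the input
         * device's open() handler calls enable_irq() when it is ready.
         */
        return 0;
}
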
index 383a848..05de92c 100644 (file)
@@ -31,6 +31,7 @@
 #include <media/v4l2-ioctl.h>
 #include <media/videobuf2-v4l2.h>
 #include <media/videobuf2-vmalloc.h>
+#include <dt-bindings/input/atmel-maxtouch.h>
 
 /* Firmware files */
 #define MXT_FW_NAME            "maxtouch.fw"
@@ -199,6 +200,7 @@ enum t100_type {
 #define MXT_CRC_TIMEOUT                1000    /* msec */
 #define MXT_FW_RESET_TIME      3000    /* msec */
 #define MXT_FW_CHG_TIMEOUT     300     /* msec */
+#define MXT_WAKEUP_TIME                25      /* msec */
 
 /* Command to unlock bootloader */
 #define MXT_UNLOCK_CMD_MSB     0xaa
@@ -312,6 +314,7 @@ struct mxt_data {
        struct mxt_dbg dbg;
        struct regulator_bulk_data regulators[2];
        struct gpio_desc *reset_gpio;
+       struct gpio_desc *wake_gpio;
        bool use_retrigen_workaround;
 
        /* Cached parameters from object table */
@@ -342,6 +345,8 @@ struct mxt_data {
        unsigned int t19_num_keys;
 
        enum mxt_suspend_mode suspend_mode;
+
+       u32 wakeup_method;
 };
 
 struct mxt_vb2_buffer {
@@ -621,10 +626,42 @@ static int mxt_send_bootloader_cmd(struct mxt_data *data, bool unlock)
        return mxt_bootloader_write(data, buf, sizeof(buf));
 }
 
+static bool mxt_wakeup_toggle(struct i2c_client *client,
+                             bool wake_up, bool in_i2c)
+{
+       struct mxt_data *data = i2c_get_clientdata(client);
+
+       switch (data->wakeup_method) {
+       case ATMEL_MXT_WAKEUP_I2C_SCL:
+               if (!in_i2c)
+                       return false;
+               break;
+
+       case ATMEL_MXT_WAKEUP_GPIO:
+               if (in_i2c)
+                       return false;
+
+               gpiod_set_value(data->wake_gpio, wake_up);
+               break;
+
+       default:
+               return false;
+       }
+
+       if (wake_up) {
+               dev_dbg(&client->dev, "waking up controller\n");
+
+               msleep(MXT_WAKEUP_TIME);
+       }
+
+       return true;
+}
+
 static int __mxt_read_reg(struct i2c_client *client,
                               u16 reg, u16 len, void *val)
 {
        struct i2c_msg xfer[2];
+       bool retried = false;
        u8 buf[2];
        int ret;
 
@@ -643,9 +680,13 @@ static int __mxt_read_reg(struct i2c_client *client,
        xfer[1].len = len;
        xfer[1].buf = val;
 
+retry:
        ret = i2c_transfer(client->adapter, xfer, 2);
        if (ret == 2) {
                ret = 0;
+       } else if (!retried && mxt_wakeup_toggle(client, true, true)) {
+               retried = true;
+               goto retry;
        } else {
                if (ret >= 0)
                        ret = -EIO;
@@ -659,6 +700,7 @@ static int __mxt_read_reg(struct i2c_client *client,
 static int __mxt_write_reg(struct i2c_client *client, u16 reg, u16 len,
                           const void *val)
 {
+       bool retried = false;
        u8 *buf;
        size_t count;
        int ret;
@@ -672,9 +714,13 @@ static int __mxt_write_reg(struct i2c_client *client, u16 reg, u16 len,
        buf[1] = (reg >> 8) & 0xff;
        memcpy(&buf[2], val, len);
 
+retry:
        ret = i2c_master_send(client, buf, count);
        if (ret == count) {
                ret = 0;
+       } else if (!retried && mxt_wakeup_toggle(client, true, true)) {
+               retried = true;
+               goto retry;
        } else {
                if (ret >= 0)
                        ret = -EIO;
@@ -2975,6 +3021,8 @@ static const struct attribute_group mxt_attr_group = {
 
 static void mxt_start(struct mxt_data *data)
 {
+       mxt_wakeup_toggle(data->client, true, false);
+
        switch (data->suspend_mode) {
        case MXT_SUSPEND_T9_CTRL:
                mxt_soft_reset(data);
@@ -3009,6 +3057,8 @@ static void mxt_stop(struct mxt_data *data)
                mxt_set_t7_power_cfg(data, MXT_POWER_CFG_DEEPSLEEP);
                break;
        }
+
+       mxt_wakeup_toggle(data->client, false, false);
 }
 
 static int mxt_input_open(struct input_dev *dev)
@@ -3155,16 +3205,24 @@ static int mxt_probe(struct i2c_client *client, const struct i2c_device_id *id)
                return error;
        }
 
+       /* Request the WAKE line asserted so the controller comes out of sleep */
+       data->wake_gpio = devm_gpiod_get_optional(&client->dev,
+                                                 "wake", GPIOD_OUT_HIGH);
+       if (IS_ERR(data->wake_gpio)) {
+               error = PTR_ERR(data->wake_gpio);
+               dev_err(&client->dev, "Failed to get wake gpio: %d\n", error);
+               return error;
+       }
+
        error = devm_request_threaded_irq(&client->dev, client->irq,
-                                         NULL, mxt_interrupt, IRQF_ONESHOT,
+                                         NULL, mxt_interrupt,
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                          client->name, data);
        if (error) {
                dev_err(&client->dev, "Failed to register interrupt\n");
                return error;
        }
 
-       disable_irq(client->irq);
-
        error = regulator_bulk_enable(ARRAY_SIZE(data->regulators),
                                      data->regulators);
        if (error) {
@@ -3185,6 +3243,25 @@ static int mxt_probe(struct i2c_client *client, const struct i2c_device_id *id)
                msleep(MXT_RESET_INVALID_CHG);
        }
 
+       /*
+        * Controllers like the mXT1386 have a dedicated WAKE line that can be
+        * connected to a GPIO, to the I2C SCL pin, or permanently asserted
+        * low.
+        *
+        * The WAKE line is used to wake the controller from deep sleep: it
+        * must be asserted low for 25 milliseconds before the controller
+        * accepts I2C transfers again. The controller enters deep sleep
+        * automatically after 2 seconds of inactivity if the WAKE line is
+        * deasserted and deep sleep is activated.
+        *
+        * If the WAKE line is connected to the I2C SCL pin, the first I2C
+        * transfer gets an instant NAK and must be retried after 25 ms.
+        *
+        * If the WAKE line is connected to a GPIO, the line must be asserted
+        * 25 ms before the host attempts to communicate with the controller.
+        */
+       device_property_read_u32(&client->dev, "atmel,wakeup-method",
+                                &data->wakeup_method);
+
        error = mxt_initialize(data);
        if (error)
                goto err_disable_regulators;
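
For the ATMEL_MXT_WAKEUP_GPIO case the contract above boils down to "assert,
wait 25 ms, then transfer". A minimal sketch reusing the names introduced by
this patch; the helper itself (mxt_gpio_wake_then_read) is hypothetical and
not part of the series:

/* Hypothetical helper: sequence a register read after deep sleep when the
 * WAKE line is wired to a GPIO. */
static int mxt_gpio_wake_then_read(struct mxt_data *data, u16 reg,
                                   u16 len, void *val)
{
        gpiod_set_value(data->wake_gpio, 1);    /* assert WAKE */
        msleep(MXT_WAKEUP_TIME);                /* >= 25 ms per datasheet */

        /* The controller now accepts I2C transfers again. */
        return __mxt_read_reg(data->client, reg, len, val);
}
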
index 341925e..392950a 100644 (file)
@@ -401,10 +401,10 @@ static int bu21029_probe(struct i2c_client *client,
 
        input_set_drvdata(in_dev, bu21029);
 
-       irq_set_status_flags(client->irq, IRQ_NOAUTOEN);
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, bu21029_touch_soft_irq,
-                                         IRQF_ONESHOT, DRIVER_NAME, bu21029);
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
+                                         DRIVER_NAME, bu21029);
        if (error) {
                dev_err(&client->dev,
                        "unable to request touch irq: %d\n", error);
index 73c854f..106dd49 100644 (file)
@@ -229,16 +229,21 @@ static int cyttsp_set_sysinfo_regs(struct cyttsp *ts)
 static void cyttsp_hard_reset(struct cyttsp *ts)
 {
        if (ts->reset_gpio) {
+               /*
+                * According to the CY8CTMA340 datasheet page 21, the external
+                * reset pulse width should be >= 1 ms. The datasheet does not
+                * specify how long we have to wait after reset but a vendor
+                * tree specifies 5 ms here.
+                */
                gpiod_set_value_cansleep(ts->reset_gpio, 1);
-               msleep(CY_DELAY_DFLT);
+               usleep_range(1000, 2000);
                gpiod_set_value_cansleep(ts->reset_gpio, 0);
-               msleep(CY_DELAY_DFLT);
+               usleep_range(5000, 6000);
        }
 }
 
 static int cyttsp_soft_reset(struct cyttsp *ts)
 {
-       unsigned long timeout;
        int retval;
 
        /* wait for interrupt to set ready completion */
@@ -248,12 +253,16 @@ static int cyttsp_soft_reset(struct cyttsp *ts)
        enable_irq(ts->irq);
 
        retval = ttsp_send_command(ts, CY_SOFT_RESET_MODE);
-       if (retval)
+       if (retval) {
+               dev_err(ts->dev, "failed to send soft reset\n");
                goto out;
+       }
 
-       timeout = wait_for_completion_timeout(&ts->bl_ready,
-                       msecs_to_jiffies(CY_DELAY_DFLT * CY_DELAY_MAX));
-       retval = timeout ? 0 : -EIO;
+       if (!wait_for_completion_timeout(&ts->bl_ready,
+                       msecs_to_jiffies(CY_DELAY_DFLT * CY_DELAY_MAX))) {
+               dev_err(ts->dev, "timeout waiting for soft reset\n");
+               retval = -EIO;
+       }
 
 out:
        ts->state = CY_IDLE_STATE;
@@ -405,8 +414,10 @@ static int cyttsp_power_on(struct cyttsp *ts)
        if (GET_BOOTLOADERMODE(ts->bl_data.bl_status) &&
            IS_VALID_APP(ts->bl_data.bl_status)) {
                error = cyttsp_exit_bl_mode(ts);
-               if (error)
+               if (error) {
+                       dev_err(ts->dev, "failed to exit bootloader mode\n");
                        return error;
+               }
        }
 
        if (GET_HSTMODE(ts->bl_data.bl_file) != CY_OPERATE_MODE ||
@@ -629,10 +640,8 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops,
                return ERR_PTR(error);
 
        init_completion(&ts->bl_ready);
-       snprintf(ts->phys, sizeof(ts->phys), "%s/input0", dev_name(dev));
 
        input_dev->name = "Cypress TTSP TouchScreen";
-       input_dev->phys = ts->phys;
        input_dev->id.bustype = bus_ops->bustype;
        input_dev->dev.parent = ts->dev;
 
@@ -643,16 +652,20 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops,
 
        input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_X);
        input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_Y);
+       /* Width is reported as a single byte, 0..255, hence the limit */
+       input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, 255, 0, 0);
+
        touchscreen_parse_properties(input_dev, true, NULL);
 
-       error = input_mt_init_slots(input_dev, CY_MAX_ID, 0);
+       error = input_mt_init_slots(input_dev, CY_MAX_ID, INPUT_MT_DIRECT);
        if (error) {
                dev_err(dev, "Unable to init MT slots.\n");
                return ERR_PTR(error);
        }
 
        error = devm_request_threaded_irq(dev, ts->irq, NULL, cyttsp_irq,
-                                         IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+                                         IRQF_TRIGGER_FALLING | IRQF_ONESHOT |
+                                         IRQF_NO_AUTOEN,
                                          "cyttsp", ts);
        if (error) {
                dev_err(ts->dev, "failed to request IRQ %d, err: %d\n",
@@ -660,8 +673,6 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops,
                return ERR_PTR(error);
        }
 
-       disable_irq(ts->irq);
-
        cyttsp_hard_reset(ts);
 
        error = cyttsp_power_on(ts);
index 8c65133..9bc4fe7 100644 (file)
@@ -114,7 +114,6 @@ struct cyttsp {
        struct device *dev;
        int irq;
        struct input_dev *input;
-       char phys[32];
        const struct cyttsp_bus_ops *bus_ops;
        struct cyttsp_bootloader_data bl_data;
        struct cyttsp_sysinfo_data sysinfo_data;
index 5f7706f..17540bd 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/of.h>
 #include <linux/gpio/consumer.h>
 #include <linux/regulator/consumer.h>
+#include <linux/uuid.h>
 #include <asm/unaligned.h>
 
 /* Device, Driver information */
@@ -1334,6 +1335,40 @@ static void elants_i2c_power_off(void *_data)
        }
 }
 
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id i2c_hid_ids[] = {
+       {"ACPI0C50", 0 },
+       {"PNP0C50", 0 },
+       { },
+};
+
+static const guid_t i2c_hid_guid =
+       GUID_INIT(0x3CDFF6F7, 0x4267, 0x4555,
+                 0xAD, 0x05, 0xB3, 0x0A, 0x3D, 0x89, 0x38, 0xDE);
+
+static bool elants_acpi_is_hid_device(struct device *dev)
+{
+       acpi_handle handle = ACPI_HANDLE(dev);
+       union acpi_object *obj;
+
+       if (acpi_match_device_ids(ACPI_COMPANION(dev), i2c_hid_ids))
+               return false;
+
+       obj = acpi_evaluate_dsm_typed(handle, &i2c_hid_guid, 1, 1, NULL, ACPI_TYPE_INTEGER);
+       if (obj) {
+               ACPI_FREE(obj);
+               return true;
+       }
+
+       return false;
+}
+#else
+static bool elants_acpi_is_hid_device(struct device *dev)
+{
+       return false;
+}
+#endif
+
 static int elants_i2c_probe(struct i2c_client *client,
                            const struct i2c_device_id *id)
 {
@@ -1342,9 +1377,14 @@ static int elants_i2c_probe(struct i2c_client *client,
        unsigned long irqflags;
        int error;
 
+       /* Don't bind to i2c-hid compatible devices; these are handled by the i2c-hid driver. */
+       if (elants_acpi_is_hid_device(&client->dev)) {
+               dev_warn(&client->dev, "This device appears to be an I2C-HID device, not binding\n");
+               return -ENODEV;
+       }
+
        if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
-               dev_err(&client->dev,
-                       "%s: i2c check functionality error\n", DEVICE_NAME);
+               dev_err(&client->dev, "I2C check functionality error\n");
                return -ENXIO;
        }
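
The probe-time guard above relies on the standard i2c-hid _DSM: a device that
speaks HID over I2C answers revision 1, function 1 of the Device Specific
Method under the well-known GUID with its HID descriptor address. A condensed
sketch of the same check (it mirrors elants_acpi_is_hid_device() and reuses
the i2c_hid_guid defined above; the real helper additionally short-circuits on
the ACPI0C50/PNP0C50 IDs):

#include <linux/acpi.h>

static bool dev_is_acpi_i2c_hid(struct device *dev)
{
        union acpi_object *obj;

        /* Presence of an integer reply to _DSM(rev 1, func 1) marks the
         * device as an i2c-hid one. */
        obj = acpi_evaluate_dsm_typed(ACPI_HANDLE(dev), &i2c_hid_guid,
                                      1, 1, NULL, ACPI_TYPE_INTEGER);
        if (!obj)
                return false;

        ACPI_FREE(obj);
        return true;
}
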
 
index a6597f0..cbe0dd4 100644 (file)
 #define EXC3000_NUM_SLOTS              10
 #define EXC3000_SLOTS_PER_FRAME                5
 #define EXC3000_LEN_FRAME              66
+#define EXC3000_LEN_VENDOR_REQUEST     68
 #define EXC3000_LEN_POINT              10
 
 #define EXC3000_LEN_MODEL_NAME         16
 #define EXC3000_LEN_FW_VERSION         16
 
+#define EXC3000_VENDOR_EVENT           0x03
 #define EXC3000_MT1_EVENT              0x06
 #define EXC3000_MT2_EVENT              0x18
 
@@ -76,9 +78,6 @@ struct exc3000_data {
        u8 buf[2 * EXC3000_LEN_FRAME];
        struct completion wait_event;
        struct mutex query_lock;
-       int query_result;
-       char model[EXC3000_LEN_MODEL_NAME];
-       char fw_version[EXC3000_LEN_FW_VERSION];
 };
 
 static void exc3000_report_slots(struct input_dev *input,
@@ -105,15 +104,16 @@ static void exc3000_timer(struct timer_list *t)
        input_sync(data->input);
 }
 
+static inline void exc3000_schedule_timer(struct exc3000_data *data)
+{
+       mod_timer(&data->timer, jiffies + msecs_to_jiffies(EXC3000_TIMEOUT_MS));
+}
+
 static int exc3000_read_frame(struct exc3000_data *data, u8 *buf)
 {
        struct i2c_client *client = data->client;
-       u8 expected_event = EXC3000_MT1_EVENT;
        int ret;
 
-       if (data->info->max_xy == SZ_16K - 1)
-               expected_event = EXC3000_MT2_EVENT;
-
        ret = i2c_master_send(client, "'", 2);
        if (ret < 0)
                return ret;
@@ -131,175 +131,196 @@ static int exc3000_read_frame(struct exc3000_data *data, u8 *buf)
        if (get_unaligned_le16(buf) != EXC3000_LEN_FRAME)
                return -EINVAL;
 
-       if (buf[2] != expected_event)
-               return -EINVAL;
-
        return 0;
 }
 
-static int exc3000_read_data(struct exc3000_data *data,
-                            u8 *buf, int *n_slots)
+static int exc3000_handle_mt_event(struct exc3000_data *data)
 {
-       int error;
-
-       error = exc3000_read_frame(data, buf);
-       if (error)
-               return error;
+       struct input_dev *input = data->input;
+       int ret, total_slots;
+       u8 *buf = data->buf;
 
-       *n_slots = buf[3];
-       if (!*n_slots || *n_slots > EXC3000_NUM_SLOTS)
-               return -EINVAL;
+       total_slots = buf[3];
+       if (!total_slots || total_slots > EXC3000_NUM_SLOTS) {
+               ret = -EINVAL;
+               goto out_fail;
+       }
 
-       if (*n_slots > EXC3000_SLOTS_PER_FRAME) {
+       if (total_slots > EXC3000_SLOTS_PER_FRAME) {
                /* Read 2nd frame to get the rest of the contacts. */
-               error = exc3000_read_frame(data, buf + EXC3000_LEN_FRAME);
-               if (error)
-                       return error;
+               ret = exc3000_read_frame(data, buf + EXC3000_LEN_FRAME);
+               if (ret)
+                       goto out_fail;
 
                /* 2nd chunk must have number of contacts set to 0. */
-               if (buf[EXC3000_LEN_FRAME + 3] != 0)
-                       return -EINVAL;
+               if (buf[EXC3000_LEN_FRAME + 3] != 0) {
+                       ret = -EINVAL;
+                       goto out_fail;
+               }
        }
 
-       return 0;
-}
-
-static int exc3000_query_interrupt(struct exc3000_data *data)
-{
-       u8 *buf = data->buf;
-       int error;
+       /*
+        * We read the full state successfully; no contacts will be "stuck".
+        */
+       del_timer_sync(&data->timer);
 
-       error = i2c_master_recv(data->client, buf, EXC3000_LEN_FRAME);
-       if (error < 0)
-               return error;
+       while (total_slots > 0) {
+               int slots = min(total_slots, EXC3000_SLOTS_PER_FRAME);
 
-       if (buf[0] != 'B')
-               return -EPROTO;
+               exc3000_report_slots(input, &data->prop, buf + 4, slots);
+               total_slots -= slots;
+               buf += EXC3000_LEN_FRAME;
+       }
 
-       if (buf[4] == 'E')
-               strlcpy(data->model, buf + 5, sizeof(data->model));
-       else if (buf[4] == 'D')
-               strlcpy(data->fw_version, buf + 5, sizeof(data->fw_version));
-       else
-               return -EPROTO;
+       input_mt_sync_frame(input);
+       input_sync(input);
 
        return 0;
+
+out_fail:
+       /* Schedule a timer to release "stuck" contacts */
+       exc3000_schedule_timer(data);
+
+       return ret;
 }
 
 static irqreturn_t exc3000_interrupt(int irq, void *dev_id)
 {
        struct exc3000_data *data = dev_id;
-       struct input_dev *input = data->input;
        u8 *buf = data->buf;
-       int slots, total_slots;
-       int error;
-
-       if (mutex_is_locked(&data->query_lock)) {
-               data->query_result = exc3000_query_interrupt(data);
-               complete(&data->wait_event);
-               goto out;
-       }
+       int ret;
 
-       error = exc3000_read_data(data, buf, &total_slots);
-       if (error) {
+       ret = exc3000_read_frame(data, buf);
+       if (ret) {
                /* Schedule a timer to release "stuck" contacts */
-               mod_timer(&data->timer,
-                         jiffies + msecs_to_jiffies(EXC3000_TIMEOUT_MS));
+               exc3000_schedule_timer(data);
                goto out;
        }
 
-       /*
-        * We read full state successfully, no contacts will be "stuck".
-        */
-       del_timer_sync(&data->timer);
+       switch (buf[2]) {
+       case EXC3000_VENDOR_EVENT:
+               complete(&data->wait_event);
+               break;
 
-       while (total_slots > 0) {
-               slots = min(total_slots, EXC3000_SLOTS_PER_FRAME);
-               exc3000_report_slots(input, &data->prop, buf + 4, slots);
-               total_slots -= slots;
-               buf += EXC3000_LEN_FRAME;
-       }
+       case EXC3000_MT1_EVENT:
+       case EXC3000_MT2_EVENT:
+               exc3000_handle_mt_event(data);
+               break;
 
-       input_mt_sync_frame(input);
-       input_sync(input);
+       default:
+               break;
+       }
 
 out:
        return IRQ_HANDLED;
 }
 
-static ssize_t fw_version_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+static int exc3000_vendor_data_request(struct exc3000_data *data, u8 *request,
+                                      u8 request_len, u8 *response, int timeout)
 {
-       struct i2c_client *client = to_i2c_client(dev);
-       struct exc3000_data *data = i2c_get_clientdata(client);
-       static const u8 request[68] = {
-               0x67, 0x00, 0x42, 0x00, 0x03, 0x01, 'D', 0x00
-       };
-       int error;
+       u8 buf[EXC3000_LEN_VENDOR_REQUEST] = { 0x67, 0x00, 0x42, 0x00, 0x03 };
+       int ret;
 
        mutex_lock(&data->query_lock);
 
-       data->query_result = -ETIMEDOUT;
        reinit_completion(&data->wait_event);
 
-       error = i2c_master_send(client, request, sizeof(request));
-       if (error < 0) {
-               mutex_unlock(&data->query_lock);
-               return error;
+       buf[5] = request_len;
+       memcpy(&buf[6], request, request_len);
+
+       ret = i2c_master_send(data->client, buf, EXC3000_LEN_VENDOR_REQUEST);
+       if (ret < 0)
+               goto out_unlock;
+
+       if (response) {
+               ret = wait_for_completion_timeout(&data->wait_event,
+                                                 timeout * HZ);
+               if (ret <= 0) {
+                       ret = -ETIMEDOUT;
+                       goto out_unlock;
+               }
+
+               if (data->buf[3] >= EXC3000_LEN_FRAME) {
+                       ret = -ENOSPC;
+                       goto out_unlock;
+               }
+
+               memcpy(response, &data->buf[4], data->buf[3]);
+               ret = data->buf[3];
        }
 
-       wait_for_completion_interruptible_timeout(&data->wait_event, 1 * HZ);
+out_unlock:
        mutex_unlock(&data->query_lock);
 
-       if (data->query_result < 0)
-               return data->query_result;
-
-       return sprintf(buf, "%s\n", data->fw_version);
+       return ret;
 }
-static DEVICE_ATTR_RO(fw_version);
 
-static ssize_t exc3000_get_model(struct exc3000_data *data)
+static ssize_t fw_version_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
 {
-       static const u8 request[68] = {
-               0x67, 0x00, 0x42, 0x00, 0x03, 0x01, 'E', 0x00
-       };
-       struct i2c_client *client = data->client;
-       int error;
+       struct i2c_client *client = to_i2c_client(dev);
+       struct exc3000_data *data = i2c_get_clientdata(client);
+       u8 response[EXC3000_LEN_FRAME];
+       int ret;
 
-       mutex_lock(&data->query_lock);
-       data->query_result = -ETIMEDOUT;
-       reinit_completion(&data->wait_event);
+       /* query bootloader info */
+       ret = exc3000_vendor_data_request(data,
+                                         (u8[]){0x39, 0x02}, 2, response, 1);
+       if (ret < 0)
+               return ret;
 
-       error = i2c_master_send(client, request, sizeof(request));
-       if (error < 0) {
-               mutex_unlock(&data->query_lock);
-               return error;
-       }
+       /*
+        * If the bootloader version is non-zero then the device is in
+        * bootloader mode and won't answer a query for the application FW
+        * version, so we just use the bootloader version info.
+        */
+       if (response[2] || response[3])
+               return sprintf(buf, "%d.%d\n", response[2], response[3]);
 
-       wait_for_completion_interruptible_timeout(&data->wait_event, 1 * HZ);
-       mutex_unlock(&data->query_lock);
+       ret = exc3000_vendor_data_request(data, (u8[]){'D'}, 1, response, 1);
+       if (ret < 0)
+               return ret;
 
-       return data->query_result;
+       return sprintf(buf, "%s\n", &response[1]);
 }
+static DEVICE_ATTR_RO(fw_version);
 
 static ssize_t model_show(struct device *dev,
                          struct device_attribute *attr, char *buf)
 {
        struct i2c_client *client = to_i2c_client(dev);
        struct exc3000_data *data = i2c_get_clientdata(client);
-       int error;
+       u8 response[EXC3000_LEN_FRAME];
+       int ret;
 
-       error = exc3000_get_model(data);
-       if (error < 0)
-               return error;
+       ret = exc3000_vendor_data_request(data, (u8[]){'E'}, 1, response, 1);
+       if (ret < 0)
+               return ret;
 
-       return sprintf(buf, "%s\n", data->model);
+       return sprintf(buf, "%s\n", &response[1]);
 }
 static DEVICE_ATTR_RO(model);
 
+static ssize_t type_show(struct device *dev,
+                         struct device_attribute *attr, char *buf)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct exc3000_data *data = i2c_get_clientdata(client);
+       u8 response[EXC3000_LEN_FRAME];
+       int ret;
+
+       ret = exc3000_vendor_data_request(data, (u8[]){'F'}, 1, response, 1);
+       if (ret < 0)
+               return ret;
+
+       return sprintf(buf, "%s\n", &response[1]);
+}
+static DEVICE_ATTR_RO(type);
+
 static struct attribute *sysfs_attrs[] = {
        &dev_attr_fw_version.attr,
        &dev_attr_model.attr,
+       &dev_attr_type.attr,
        NULL
 };
 
@@ -379,9 +400,15 @@ static int exc3000_probe(struct i2c_client *client)
         * or two touch events anyways).
         */
        for (retry = 0; retry < 3; retry++) {
-               error = exc3000_get_model(data);
-               if (!error)
+               u8 response[EXC3000_LEN_FRAME];
+
+               error = exc3000_vendor_data_request(data, (u8[]){'E'}, 1,
+                                                   response, 1);
+               if (error > 0) {
+                       dev_dbg(&client->dev, "TS Model: %s", &response[1]);
+                       error = 0;
                        break;
+               }
                dev_warn(&client->dev, "Retry %d get EETI EXC3000 model: %d\n",
                         retry + 1, error);
        }
@@ -389,8 +416,6 @@ static int exc3000_probe(struct i2c_client *client)
        if (error)
                return error;
 
-       dev_dbg(&client->dev, "TS Model: %s", data->model);
-
        i2c_set_clientdata(client, data);
 
        error = devm_device_add_group(&client->dev, &exc3000_attribute_group);
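
For reference, the vendor channel introduced above is symmetric: requests are
fixed 68-byte packets with a {0x67, 0x00, 0x42, 0x00, 0x03} header, the payload
length in byte 5 and the payload from byte 6 on; replies arrive through the
normal IRQ path as EXC3000_VENDOR_EVENT frames, with the payload length in
buf[3] and the data starting at buf[4]. A usage sketch mirroring type_show()
above ("data" is the driver's exc3000_data; the dev_info() call is
illustrative, not from the patch):

u8 response[EXC3000_LEN_FRAME];
int len;

/* Ask for the controller type string; as in the sysfs handlers above, the
 * NUL-terminated reply starts at &response[1]. */
len = exc3000_vendor_data_request(data, (u8[]){ 'F' }, 1, response, 1);
if (len > 0)
        dev_info(&data->client->dev, "controller type: %s\n", &response[1]);
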
diff --git a/drivers/input/touchscreen/hycon-hy46xx.c b/drivers/input/touchscreen/hycon-hy46xx.c
new file mode 100644 (file)
index 0000000..891d043
--- /dev/null
+++ b/drivers/input/touchscreen/hycon-hy46xx.c
@@ -0,0 +1,591 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021
+ * Author(s): Giulio Benetti <giulio.benetti@benettiengineering.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/input/touchscreen.h>
+#include <linux/irq.h>
+#include <linux/regulator/consumer.h>
+#include <linux/regmap.h>
+
+#include <asm/unaligned.h>
+
+#define HY46XX_CHKSUM_CODE             0x1
+#define HY46XX_FINGER_NUM              0x2
+#define HY46XX_CHKSUM_LEN              0x7
+#define HY46XX_THRESHOLD               0x80
+#define HY46XX_GLOVE_EN                        0x84
+#define HY46XX_REPORT_SPEED            0x88
+#define HY46XX_PWR_NOISE_EN            0x89
+#define HY46XX_FILTER_DATA             0x8A
+#define HY46XX_GAIN                    0x92
+#define HY46XX_EDGE_OFFSET             0x93
+#define HY46XX_RX_NR_USED              0x94
+#define HY46XX_TX_NR_USED              0x95
+#define HY46XX_PWR_MODE                        0xA5
+#define HY46XX_FW_VERSION              0xA6
+#define HY46XX_LIB_VERSION             0xA7
+#define HY46XX_TP_INFO                 0xA8
+#define HY46XX_TP_CHIP_ID              0xA9
+#define HY46XX_BOOT_VER                        0xB0
+
+#define HY46XX_TPLEN                   0x6
+#define HY46XX_REPORT_PKT_LEN          0x44
+
+#define HY46XX_MAX_SUPPORTED_POINTS    11
+
+#define TOUCH_EVENT_DOWN               0x00
+#define TOUCH_EVENT_UP                 0x01
+#define TOUCH_EVENT_CONTACT            0x02
+#define TOUCH_EVENT_RESERVED           0x03
+
+struct hycon_hy46xx_data {
+       struct i2c_client *client;
+       struct input_dev *input;
+       struct touchscreen_properties prop;
+       struct regulator *vcc;
+
+       struct gpio_desc *reset_gpio;
+
+       struct mutex mutex;
+       struct regmap *regmap;
+
+       int threshold;
+       bool glove_enable;
+       int report_speed;
+       bool noise_filter_enable;
+       int filter_data;
+       int gain;
+       int edge_offset;
+       int rx_number_used;
+       int tx_number_used;
+       int power_mode;
+       int fw_version;
+       int lib_version;
+       int tp_information;
+       int tp_chip_id;
+       int bootloader_version;
+};
+
+static const struct regmap_config hycon_hy46xx_i2c_regmap_config = {
+       .reg_bits = 8,
+       .val_bits = 8,
+};
+
+static bool hycon_hy46xx_check_checksum(struct hycon_hy46xx_data *tsdata, u8 *buf)
+{
+       u8 chksum = 0;
+       int i;
+
+       for (i = 2; i < buf[HY46XX_CHKSUM_LEN]; i++)
+               chksum += buf[i];
+
+       if (chksum == buf[HY46XX_CHKSUM_CODE])
+               return true;
+
+       dev_err_ratelimited(&tsdata->client->dev,
+                           "checksum error: 0x%02x expected, got 0x%02x\n",
+                           chksum, buf[HY46XX_CHKSUM_CODE]);
+
+       return false;
+}
+
+static irqreturn_t hycon_hy46xx_isr(int irq, void *dev_id)
+{
+       struct hycon_hy46xx_data *tsdata = dev_id;
+       struct device *dev = &tsdata->client->dev;
+       u8 rdbuf[HY46XX_REPORT_PKT_LEN];
+       int i, x, y, id;
+       int error;
+
+       memset(rdbuf, 0, sizeof(rdbuf));
+
+       error = regmap_bulk_read(tsdata->regmap, 0, rdbuf, sizeof(rdbuf));
+       if (error) {
+               dev_err_ratelimited(dev, "Unable to fetch data, error: %d\n",
+                                   error);
+               goto out;
+       }
+
+       if (!hycon_hy46xx_check_checksum(tsdata, rdbuf))
+               goto out;
+
+       for (i = 0; i < HY46XX_MAX_SUPPORTED_POINTS; i++) {
+               u8 *buf = &rdbuf[3 + (HY46XX_TPLEN * i)];
+               int type = buf[0] >> 6;
+
+               if (type == TOUCH_EVENT_RESERVED)
+                       continue;
+
+               x = get_unaligned_be16(buf) & 0x0fff;
+               y = get_unaligned_be16(buf + 2) & 0x0fff;
+
+               id = buf[2] >> 4;
+
+               input_mt_slot(tsdata->input, id);
+               if (input_mt_report_slot_state(tsdata->input, MT_TOOL_FINGER,
+                                              type != TOUCH_EVENT_UP))
+                       touchscreen_report_pos(tsdata->input, &tsdata->prop,
+                                              x, y, true);
+       }
+
+       input_mt_report_pointer_emulation(tsdata->input, false);
+       input_sync(tsdata->input);
+
+out:
+       return IRQ_HANDLED;
+}
+
+struct hycon_hy46xx_attribute {
+       struct device_attribute dattr;
+       size_t field_offset;
+       u8 address;
+       u8 limit_low;
+       u8 limit_high;
+};
+
+#define HYCON_ATTR_U8(_field, _mode, _address, _limit_low, _limit_high)        \
+       struct hycon_hy46xx_attribute hycon_hy46xx_attr_##_field = {            \
+               .dattr = __ATTR(_field, _mode,                          \
+                               hycon_hy46xx_setting_show,                      \
+                               hycon_hy46xx_setting_store),                    \
+               .field_offset = offsetof(struct hycon_hy46xx_data, _field),     \
+               .address = _address,                                    \
+               .limit_low = _limit_low,                                \
+               .limit_high = _limit_high,                              \
+       }
+
+#define HYCON_ATTR_BOOL(_field, _mode, _address)                       \
+       struct hycon_hy46xx_attribute hycon_hy46xx_attr_##_field = {            \
+               .dattr = __ATTR(_field, _mode,                          \
+                               hycon_hy46xx_setting_show,                      \
+                               hycon_hy46xx_setting_store),                    \
+               .field_offset = offsetof(struct hycon_hy46xx_data, _field),     \
+               .address = _address,                                    \
+               .limit_low = false,                                     \
+               .limit_high = true,                                     \
+       }
+
+static ssize_t hycon_hy46xx_setting_show(struct device *dev,
+                                  struct device_attribute *dattr, char *buf)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct hycon_hy46xx_data *tsdata = i2c_get_clientdata(client);
+       struct hycon_hy46xx_attribute *attr =
+                       container_of(dattr, struct hycon_hy46xx_attribute, dattr);
+       u8 *field = (u8 *)tsdata + attr->field_offset;
+       size_t count = 0;
+       int error = 0;
+       int val;
+
+       mutex_lock(&tsdata->mutex);
+
+       error = regmap_read(tsdata->regmap, attr->address, &val);
+       if (error < 0) {
+               dev_err(&tsdata->client->dev,
+                       "Failed to fetch attribute %s, error %d\n",
+                       dattr->attr.name, error);
+               goto out;
+       }
+
+       if (val != *field) {
+               dev_warn(&tsdata->client->dev,
+                        "%s: read (%d) and stored value (%d) differ\n",
+                        dattr->attr.name, val, *field);
+               *field = val;
+       }
+
+       count = scnprintf(buf, PAGE_SIZE, "%d\n", val);
+
+out:
+       mutex_unlock(&tsdata->mutex);
+       return error ?: count;
+}
+
+static ssize_t hycon_hy46xx_setting_store(struct device *dev,
+                                       struct device_attribute *dattr,
+                                       const char *buf, size_t count)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct hycon_hy46xx_data *tsdata = i2c_get_clientdata(client);
+       struct hycon_hy46xx_attribute *attr =
+                       container_of(dattr, struct hycon_hy46xx_attribute, dattr);
+       u8 *field = (u8 *)tsdata + attr->field_offset;
+       unsigned int val;
+       int error;
+
+       mutex_lock(&tsdata->mutex);
+
+       error = kstrtouint(buf, 0, &val);
+       if (error)
+               goto out;
+
+       if (val < attr->limit_low || val > attr->limit_high) {
+               error = -ERANGE;
+               goto out;
+       }
+
+       error = regmap_write(tsdata->regmap, attr->address, val);
+       if (error < 0) {
+               dev_err(&tsdata->client->dev,
+                       "Failed to update attribute %s, error: %d\n",
+                       dattr->attr.name, error);
+               goto out;
+       }
+       *field = val;
+
+out:
+       mutex_unlock(&tsdata->mutex);
+       return error ?: count;
+}
+
+static HYCON_ATTR_U8(threshold, 0644, HY46XX_THRESHOLD, 0, 255);
+static HYCON_ATTR_BOOL(glove_enable, 0644, HY46XX_GLOVE_EN);
+static HYCON_ATTR_U8(report_speed, 0644, HY46XX_REPORT_SPEED, 0, 255);
+static HYCON_ATTR_BOOL(noise_filter_enable, 0644, HY46XX_PWR_NOISE_EN);
+static HYCON_ATTR_U8(filter_data, 0644, HY46XX_FILTER_DATA, 0, 5);
+static HYCON_ATTR_U8(gain, 0644, HY46XX_GAIN, 0, 5);
+static HYCON_ATTR_U8(edge_offset, 0644, HY46XX_EDGE_OFFSET, 0, 5);
+static HYCON_ATTR_U8(fw_version, 0444, HY46XX_FW_VERSION, 0, 255);
+static HYCON_ATTR_U8(lib_version, 0444, HY46XX_LIB_VERSION, 0, 255);
+static HYCON_ATTR_U8(tp_information, 0444, HY46XX_TP_INFO, 0, 255);
+static HYCON_ATTR_U8(tp_chip_id, 0444, HY46XX_TP_CHIP_ID, 0, 255);
+static HYCON_ATTR_U8(bootloader_version, 0444, HY46XX_BOOT_VER, 0, 255);
+
+static struct attribute *hycon_hy46xx_attrs[] = {
+       &hycon_hy46xx_attr_threshold.dattr.attr,
+       &hycon_hy46xx_attr_glove_enable.dattr.attr,
+       &hycon_hy46xx_attr_report_speed.dattr.attr,
+       &hycon_hy46xx_attr_noise_filter_enable.dattr.attr,
+       &hycon_hy46xx_attr_filter_data.dattr.attr,
+       &hycon_hy46xx_attr_gain.dattr.attr,
+       &hycon_hy46xx_attr_edge_offset.dattr.attr,
+       &hycon_hy46xx_attr_fw_version.dattr.attr,
+       &hycon_hy46xx_attr_lib_version.dattr.attr,
+       &hycon_hy46xx_attr_tp_information.dattr.attr,
+       &hycon_hy46xx_attr_tp_chip_id.dattr.attr,
+       &hycon_hy46xx_attr_bootloader_version.dattr.attr,
+       NULL
+};
+
+static const struct attribute_group hycon_hy46xx_attr_group = {
+       .attrs = hycon_hy46xx_attrs,
+};
+
+static void hycon_hy46xx_get_defaults(struct device *dev, struct hycon_hy46xx_data *tsdata)
+{
+       bool val_bool;
+       int error;
+       u32 val;
+
+       error = device_property_read_u32(dev, "hycon,threshold", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_THRESHOLD, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->threshold = val;
+       }
+
+       val_bool = device_property_read_bool(dev, "hycon,glove-enable");
+       error = regmap_write(tsdata->regmap, HY46XX_GLOVE_EN, val_bool);
+       if (error < 0)
+               goto out;
+       tsdata->glove_enable = val_bool;
+
+       error = device_property_read_u32(dev, "hycon,report-speed-hz", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_REPORT_SPEED, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->report_speed = val;
+       }
+
+       val_bool = device_property_read_bool(dev, "hycon,noise-filter-enable");
+       error = regmap_write(tsdata->regmap, HY46XX_PWR_NOISE_EN, val_bool);
+       if (error < 0)
+               goto out;
+       tsdata->noise_filter_enable = val_bool;
+
+       error = device_property_read_u32(dev, "hycon,filter-data", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_FILTER_DATA, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->filter_data = val;
+       }
+
+       error = device_property_read_u32(dev, "hycon,gain", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_GAIN, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->gain = val;
+       }
+
+       error = device_property_read_u32(dev, "hycon,edge-offset", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_EDGE_OFFSET, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->edge_offset = val;
+       }
+
+       return;
+out:
+       dev_err(&tsdata->client->dev, "Failed to set default settings");
+}
+
+static void hycon_hy46xx_get_parameters(struct hycon_hy46xx_data *tsdata)
+{
+       int error;
+       u32 val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_THRESHOLD, &val);
+       if (error < 0)
+               goto out;
+       tsdata->threshold = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_GLOVE_EN, &val);
+       if (error < 0)
+               goto out;
+       tsdata->glove_enable = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_REPORT_SPEED, &val);
+       if (error < 0)
+               goto out;
+       tsdata->report_speed = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_PWR_NOISE_EN, &val);
+       if (error < 0)
+               goto out;
+       tsdata->noise_filter_enable = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_FILTER_DATA, &val);
+       if (error < 0)
+               goto out;
+       tsdata->filter_data = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_GAIN, &val);
+       if (error < 0)
+               goto out;
+       tsdata->gain = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_EDGE_OFFSET, &val);
+       if (error < 0)
+               goto out;
+       tsdata->edge_offset = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_RX_NR_USED, &val);
+       if (error < 0)
+               goto out;
+       tsdata->rx_number_used = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_TX_NR_USED, &val);
+       if (error < 0)
+               goto out;
+       tsdata->tx_number_used = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_PWR_MODE, &val);
+       if (error < 0)
+               goto out;
+       tsdata->power_mode = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_FW_VERSION, &val);
+       if (error < 0)
+               goto out;
+       tsdata->fw_version = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_LIB_VERSION, &val);
+       if (error < 0)
+               goto out;
+       tsdata->lib_version = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_TP_INFO, &val);
+       if (error < 0)
+               goto out;
+       tsdata->tp_information = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_TP_CHIP_ID, &val);
+       if (error < 0)
+               goto out;
+       tsdata->tp_chip_id = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_BOOT_VER, &val);
+       if (error < 0)
+               goto out;
+       tsdata->bootloader_version = val;
+
+       return;
+out:
+       dev_err(&tsdata->client->dev, "Failed to read default settings");
+}
+
+static void hycon_hy46xx_disable_regulator(void *arg)
+{
+       struct hycon_hy46xx_data *data = arg;
+
+       regulator_disable(data->vcc);
+}
+
+static int hycon_hy46xx_probe(struct i2c_client *client,
+                                        const struct i2c_device_id *id)
+{
+       struct hycon_hy46xx_data *tsdata;
+       struct input_dev *input;
+       int error;
+
+       dev_dbg(&client->dev, "probing for HYCON HY46XX I2C\n");
+
+       tsdata = devm_kzalloc(&client->dev, sizeof(*tsdata), GFP_KERNEL);
+       if (!tsdata)
+               return -ENOMEM;
+
+       tsdata->vcc = devm_regulator_get(&client->dev, "vcc");
+       if (IS_ERR(tsdata->vcc)) {
+               error = PTR_ERR(tsdata->vcc);
+               if (error != -EPROBE_DEFER)
+                       dev_err(&client->dev,
+                               "failed to request regulator: %d\n", error);
+               return error;
+       }
+
+       error = regulator_enable(tsdata->vcc);
+       if (error < 0) {
+               dev_err(&client->dev, "failed to enable vcc: %d\n", error);
+               return error;
+       }
+
+       error = devm_add_action_or_reset(&client->dev,
+                                        hycon_hy46xx_disable_regulator,
+                                        tsdata);
+       if (error)
+               return error;
+
+       tsdata->reset_gpio = devm_gpiod_get_optional(&client->dev,
+                                                    "reset", GPIOD_OUT_LOW);
+       if (IS_ERR(tsdata->reset_gpio)) {
+               error = PTR_ERR(tsdata->reset_gpio);
+               dev_err(&client->dev,
+                       "Failed to request GPIO reset pin, error %d\n", error);
+               return error;
+       }
+
+       if (tsdata->reset_gpio) {
+               usleep_range(5000, 6000);
+               gpiod_set_value_cansleep(tsdata->reset_gpio, 1);
+               usleep_range(5000, 6000);
+               gpiod_set_value_cansleep(tsdata->reset_gpio, 0);
+               msleep(1000);
+       }
+
+       input = devm_input_allocate_device(&client->dev);
+       if (!input) {
+               dev_err(&client->dev, "failed to allocate input device.\n");
+               return -ENOMEM;
+       }
+
+       mutex_init(&tsdata->mutex);
+       tsdata->client = client;
+       tsdata->input = input;
+
+       tsdata->regmap = devm_regmap_init_i2c(client,
+                                             &hycon_hy46xx_i2c_regmap_config);
+       if (IS_ERR(tsdata->regmap)) {
+               dev_err(&client->dev, "regmap allocation failed\n");
+               return PTR_ERR(tsdata->regmap);
+       }
+
+       hycon_hy46xx_get_defaults(&client->dev, tsdata);
+       hycon_hy46xx_get_parameters(tsdata);
+
+       input->name = "Hycon Capacitive Touch";
+       input->id.bustype = BUS_I2C;
+       input->dev.parent = &client->dev;
+
+       input_set_abs_params(input, ABS_MT_POSITION_X, 0, -1, 0, 0);
+       input_set_abs_params(input, ABS_MT_POSITION_Y, 0, -1, 0, 0);
+
+       touchscreen_parse_properties(input, true, &tsdata->prop);
+
+       error = input_mt_init_slots(input, HY46XX_MAX_SUPPORTED_POINTS,
+                                   INPUT_MT_DIRECT);
+       if (error) {
+               dev_err(&client->dev, "Unable to init MT slots.\n");
+               return error;
+       }
+
+       i2c_set_clientdata(client, tsdata);
+
+       error = devm_request_threaded_irq(&client->dev, client->irq,
+                                         NULL, hycon_hy46xx_isr, IRQF_ONESHOT,
+                                         client->name, tsdata);
+       if (error) {
+               dev_err(&client->dev, "Unable to request touchscreen IRQ.\n");
+               return error;
+       }
+
+       error = devm_device_add_group(&client->dev, &hycon_hy46xx_attr_group);
+       if (error)
+               return error;
+
+       error = input_register_device(input);
+       if (error)
+               return error;
+
+       dev_dbg(&client->dev,
+               "HYCON HY46XX initialized: IRQ %d, Reset pin %d.\n",
+               client->irq,
+               tsdata->reset_gpio ? desc_to_gpio(tsdata->reset_gpio) : -1);
+
+       return 0;
+}
+
+static const struct i2c_device_id hycon_hy46xx_id[] = {
+       { .name = "hy4613" },
+       { .name = "hy4614" },
+       { .name = "hy4621" },
+       { .name = "hy4623" },
+       { .name = "hy4633" },
+       { .name = "hy4635" },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(i2c, hycon_hy46xx_id);
+
+static const struct of_device_id hycon_hy46xx_of_match[] = {
+       { .compatible = "hycon,hy4613" },
+       { .compatible = "hycon,hy4614" },
+       { .compatible = "hycon,hy4621" },
+       { .compatible = "hycon,hy4623" },
+       { .compatible = "hycon,hy4633" },
+       { .compatible = "hycon,hy4635" },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, hycon_hy46xx_of_match);
+
+static struct i2c_driver hycon_hy46xx_driver = {
+       .driver = {
+               .name = "hycon_hy46xx",
+               .of_match_table = hycon_hy46xx_of_match,
+               .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+       },
+       .id_table = hycon_hy46xx_id,
+       .probe    = hycon_hy46xx_probe,
+};
+
+module_i2c_driver(hycon_hy46xx_driver);
+
+MODULE_AUTHOR("Giulio Benetti <giulio.benetti@benettiengineering.com>");
+MODULE_DESCRIPTION("HYCON HY46XX I2C Touchscreen Driver");
+MODULE_LICENSE("GPL v2");
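
The packet validation in hycon_hy46xx_check_checksum() above is compact enough
to restate standalone: byte 7 of a report holds the end index of the
checksummed region and byte 1 the expected 8-bit sum of bytes 2 .. end-1. A
self-contained sketch with the register names inlined as their values:

#include <linux/types.h>

/* Mirror of hycon_hy46xx_check_checksum(): true when the 8-bit sum of
 * bytes 2 .. buf[7]-1 matches the checksum stored in byte 1. */
static bool hy46xx_sum_ok(const u8 *buf)
{
        u8 sum = 0;
        int i;

        for (i = 2; i < buf[7]; i++)    /* 7 == HY46XX_CHKSUM_LEN */
                sum += buf[i];

        return sum == buf[1];           /* 1 == HY46XX_CHKSUM_CODE */
}
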
index d8fccf0..30576a5 100644 (file)
@@ -87,7 +87,7 @@ static bool ili210x_touchdata_to_coords(const u8 *touchdata,
                                        unsigned int *x, unsigned int *y,
                                        unsigned int *z)
 {
-       if (touchdata[0] & BIT(finger))
+       if (!(touchdata[0] & BIT(finger)))
                return false;
 
        *x = get_unaligned_be16(touchdata + 1 + (finger * 4) + 0);
diff --git a/drivers/input/touchscreen/ilitek_ts_i2c.c b/drivers/input/touchscreen/ilitek_ts_i2c.c
new file mode 100644 (file)
index 0000000..c5d259c
--- /dev/null
+++ b/drivers/input/touchscreen/ilitek_ts_i2c.c
@@ -0,0 +1,690 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ILITEK Touch IC driver for 23XX, 25XX and Lego series
+ *
+ * Copyright (C) 2011 ILI Technology Corporation.
+ * Copyright (C) 2020 Luca Hsu <luca_hsu@ilitek.com>
+ * Copyright (C) 2021 Joe Hung <joe_hung@ilitek.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/i2c.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/errno.h>
+#include <linux/acpi.h>
+#include <linux/input/touchscreen.h>
+#include <asm/unaligned.h>
+
+
+#define ILITEK_TS_NAME                                 "ilitek_ts"
+#define BL_V1_8                                                0x108
+#define BL_V1_7                                                0x107
+#define BL_V1_6                                                0x106
+
+#define ILITEK_TP_CMD_GET_TP_RES                       0x20
+#define ILITEK_TP_CMD_GET_SCRN_RES                     0x21
+#define ILITEK_TP_CMD_SET_IC_SLEEP                     0x30
+#define ILITEK_TP_CMD_SET_IC_WAKE                      0x31
+#define ILITEK_TP_CMD_GET_FW_VER                       0x40
+#define ILITEK_TP_CMD_GET_PRL_VER                      0x42
+#define ILITEK_TP_CMD_GET_MCU_VER                      0x61
+#define ILITEK_TP_CMD_GET_IC_MODE                      0xC0
+
+#define REPORT_COUNT_ADDRESS                           61
+#define ILITEK_SUPPORT_MAX_POINT                       40
+
+struct ilitek_protocol_info {
+       u16 ver;
+       u8 ver_major;
+};
+
+struct ilitek_ts_data {
+       struct i2c_client               *client;
+       struct gpio_desc                *reset_gpio;
+       struct input_dev                *input_dev;
+       struct touchscreen_properties   prop;
+
+       const struct ilitek_protocol_map *ptl_cb_func;
+       struct ilitek_protocol_info     ptl;
+
+       char                            product_id[30];
+       u16                             mcu_ver;
+       u8                              ic_mode;
+       u8                              firmware_ver[8];
+
+       s32                             reset_time;
+       s32                             screen_max_x;
+       s32                             screen_max_y;
+       s32                             screen_min_x;
+       s32                             screen_min_y;
+       s32                             max_tp;
+};
+
+struct ilitek_protocol_map {
+       u16 cmd;
+       const char *name;
+       int (*func)(struct ilitek_ts_data *ts, u16 cmd, u8 *inbuf, u8 *outbuf);
+};
+
+enum ilitek_cmds {
+       /* common cmds */
+       GET_PTL_VER = 0,
+       GET_FW_VER,
+       GET_SCRN_RES,
+       GET_TP_RES,
+       GET_IC_MODE,
+       GET_MCU_VER,
+       SET_IC_SLEEP,
+       SET_IC_WAKE,
+
+       /* ALWAYS keep at the end */
+       MAX_CMD_CNT
+};
+
+/* ILITEK I2C R/W APIs */
+static int ilitek_i2c_write_and_read(struct ilitek_ts_data *ts,
+                                    u8 *cmd, int write_len, int delay,
+                                    u8 *data, int read_len)
+{
+       int error;
+       struct i2c_client *client = ts->client;
+       struct i2c_msg msgs[] = {
+               {
+                       .addr = client->addr,
+                       .flags = 0,
+                       .len = write_len,
+                       .buf = cmd,
+               },
+               {
+                       .addr = client->addr,
+                       .flags = I2C_M_RD,
+                       .len = read_len,
+                       .buf = data,
+               },
+       };
+
+       if (delay == 0 && write_len > 0 && read_len > 0) {
+               error = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
+               if (error < 0)
+                       return error;
+       } else {
+               if (write_len > 0) {
+                       error = i2c_transfer(client->adapter, msgs, 1);
+                       if (error < 0)
+                               return error;
+               }
+               if (delay > 0)
+                       mdelay(delay);
+
+               if (read_len > 0) {
+                       error = i2c_transfer(client->adapter, msgs + 1, 1);
+                       if (error < 0)
+                               return error;
+               }
+       }
+
+       return 0;
+}
+
+/* ILITEK ISR APIs */
+static void ilitek_touch_down(struct ilitek_ts_data *ts, unsigned int id,
+                             unsigned int x, unsigned int y)
+{
+       struct input_dev *input = ts->input_dev;
+
+       input_mt_slot(input, id);
+       input_mt_report_slot_state(input, MT_TOOL_FINGER, true);
+
+       touchscreen_report_pos(input, &ts->prop, x, y, true);
+}
+
+static int ilitek_process_and_report_v6(struct ilitek_ts_data *ts)
+{
+       int error = 0;
+       u8 buf[512];
+       int packet_len = 5;
+       int packet_max_point = 10;
+       int report_max_point;
+       int i, count;
+       struct input_dev *input = ts->input_dev;
+       struct device *dev = &ts->client->dev;
+       unsigned int x, y, status, id;
+
+       error = ilitek_i2c_write_and_read(ts, NULL, 0, 0, buf, 64);
+       if (error) {
+               dev_err(dev, "get touch info failed, err:%d\n", error);
+               goto err_sync_frame;
+       }
+
+       report_max_point = buf[REPORT_COUNT_ADDRESS];
+       if (report_max_point > ts->max_tp) {
+               dev_err(dev, "FW report max point:%d > panel info. max:%d\n",
+                       report_max_point, ts->max_tp);
+               error = -EINVAL;
+               goto err_sync_frame;
+       }
+
+       count = DIV_ROUND_UP(report_max_point, packet_max_point);
+       for (i = 1; i < count; i++) {
+               error = ilitek_i2c_write_and_read(ts, NULL, 0, 0,
+                                                 buf + i * 64, 64);
+               if (error) {
+                       dev_err(dev, "get touch info. failed, cnt:%d, err:%d\n",
+                               count, error);
+                       goto err_sync_frame;
+               }
+       }
+
+       for (i = 0; i < report_max_point; i++) {
+               status = buf[i * packet_len + 1] & 0x40;
+               if (!status)
+                       continue;
+
+               id = buf[i * packet_len + 1] & 0x3F;
+
+               x = get_unaligned_le16(buf + i * packet_len + 2);
+               y = get_unaligned_le16(buf + i * packet_len + 4);
+
+               if (x > ts->screen_max_x || x < ts->screen_min_x ||
+                   y > ts->screen_max_y || y < ts->screen_min_y) {
+                       dev_warn(dev, "invalid position, X[%d,%u,%d], Y[%d,%u,%d]\n",
+                                ts->screen_min_x, x, ts->screen_max_x,
+                                ts->screen_min_y, y, ts->screen_max_y);
+                       continue;
+               }
+
+               ilitek_touch_down(ts, id, x, y);
+       }
+
+err_sync_frame:
+       input_mt_sync_frame(input);
+       input_sync(input);
+       return error;
+}
+
+/* APIs of cmds for ILITEK Touch IC */
+static int api_protocol_set_cmd(struct ilitek_ts_data *ts,
+                               u16 idx, u8 *inbuf, u8 *outbuf)
+{
+       u16 cmd;
+       int error;
+
+       if (idx >= MAX_CMD_CNT)
+               return -EINVAL;
+
+       cmd = ts->ptl_cb_func[idx].cmd;
+       error = ts->ptl_cb_func[idx].func(ts, cmd, inbuf, outbuf);
+       if (error)
+               return error;
+
+       return 0;
+}
+
+static int api_protocol_get_ptl_ver(struct ilitek_ts_data *ts,
+                                   u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       int error;
+       u8 buf[64];
+
+       buf[0] = cmd;
+       error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 3);
+       if (error)
+               return error;
+
+       ts->ptl.ver = get_unaligned_be16(outbuf);
+       ts->ptl.ver_major = outbuf[0];
+
+       return 0;
+}
+
+static int api_protocol_get_mcu_ver(struct ilitek_ts_data *ts,
+                                   u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       int error;
+       u8 buf[64];
+
+       buf[0] = cmd;
+       error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 32);
+       if (error)
+               return error;
+
+       ts->mcu_ver = get_unaligned_le16(outbuf);
+       memset(ts->product_id, 0, sizeof(ts->product_id));
+       memcpy(ts->product_id, outbuf + 6, 26);
+
+       return 0;
+}
+
+static int api_protocol_get_fw_ver(struct ilitek_ts_data *ts,
+                                  u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       int error;
+       u8 buf[64];
+
+       buf[0] = cmd;
+       error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 8);
+       if (error)
+               return error;
+
+       memcpy(ts->firmware_ver, outbuf, 8);
+
+       return 0;
+}
+
+static int api_protocol_get_scrn_res(struct ilitek_ts_data *ts,
+                                    u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       int error;
+       u8 buf[64];
+
+       buf[0] = cmd;
+       error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 8);
+       if (error)
+               return error;
+
+       ts->screen_min_x = get_unaligned_le16(outbuf);
+       ts->screen_min_y = get_unaligned_le16(outbuf + 2);
+       ts->screen_max_x = get_unaligned_le16(outbuf + 4);
+       ts->screen_max_y = get_unaligned_le16(outbuf + 6);
+
+       return 0;
+}
+
+static int api_protocol_get_tp_res(struct ilitek_ts_data *ts,
+                                  u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       int error;
+       u8 buf[64];
+
+       buf[0] = cmd;
+       error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 15);
+       if (error)
+               return error;
+
+       ts->max_tp = outbuf[8];
+       if (ts->max_tp > ILITEK_SUPPORT_MAX_POINT) {
+               dev_err(&ts->client->dev, "Invalid MAX_TP:%d from FW\n",
+                       ts->max_tp);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int api_protocol_get_ic_mode(struct ilitek_ts_data *ts,
+                                   u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       int error;
+       u8 buf[64];
+
+       buf[0] = cmd;
+       error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 2);
+       if (error)
+               return error;
+
+       ts->ic_mode = outbuf[0];
+       return 0;
+}
+
+static int api_protocol_set_ic_sleep(struct ilitek_ts_data *ts,
+                                    u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 buf[64];
+
+       buf[0] = cmd;
+       return ilitek_i2c_write_and_read(ts, buf, 1, 0, NULL, 0);
+}
+
+static int api_protocol_set_ic_wake(struct ilitek_ts_data *ts,
+                                   u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 buf[64];
+
+       buf[0] = cmd;
+       return ilitek_i2c_write_and_read(ts, buf, 1, 0, NULL, 0);
+}
+
+static const struct ilitek_protocol_map ptl_func_map[] = {
+       /* common cmds */
+       [GET_PTL_VER] = {
+               ILITEK_TP_CMD_GET_PRL_VER, "GET_PTL_VER",
+               api_protocol_get_ptl_ver
+       },
+       [GET_FW_VER] = {
+               ILITEK_TP_CMD_GET_FW_VER, "GET_FW_VER",
+               api_protocol_get_fw_ver
+       },
+       [GET_SCRN_RES] = {
+               ILITEK_TP_CMD_GET_SCRN_RES, "GET_SCRN_RES",
+               api_protocol_get_scrn_res
+       },
+       [GET_TP_RES] = {
+               ILITEK_TP_CMD_GET_TP_RES, "GET_TP_RES",
+               api_protocol_get_tp_res
+       },
+       [GET_IC_MODE] = {
+               ILITEK_TP_CMD_GET_IC_MODE, "GET_IC_MODE",
+               api_protocol_get_ic_mode
+       },
+       [GET_MCU_VER] = {
+               ILITEK_TP_CMD_GET_MCU_VER, "GET_MCU_VER",
+               api_protocol_get_mcu_ver
+       },
+       [SET_IC_SLEEP] = {
+               ILITEK_TP_CMD_SET_IC_SLEEP, "SET_IC_SLEEP",
+               api_protocol_set_ic_sleep
+       },
+       [SET_IC_WAKE] = {
+               ILITEK_TP_CMD_SET_IC_WAKE, "SET_IC_WAKE",
+               api_protocol_set_ic_wake
+       },
+};
+
+/* Probe APIs */
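+/* Assert the optional reset line for 10 ms, then allow @delay ms for boot. */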
+static void ilitek_reset(struct ilitek_ts_data *ts, int delay)
+{
+       if (ts->reset_gpio) {
+               gpiod_set_value(ts->reset_gpio, 1);
+               mdelay(10);
+               gpiod_set_value(ts->reset_gpio, 0);
+               mdelay(delay);
+       }
+}
+
+static int ilitek_protocol_init(struct ilitek_ts_data *ts)
+{
+       int error;
+       u8 outbuf[64];
+
+       ts->ptl_cb_func = ptl_func_map;
+       ts->reset_time = 600;
+
+       error = api_protocol_set_cmd(ts, GET_PTL_VER, NULL, outbuf);
+       if (error)
+               return error;
+
+       /* Protocol v3 is not supported currently */
+       if (ts->ptl.ver_major == 0x3 ||
+           ts->ptl.ver == BL_V1_6 ||
+           ts->ptl.ver == BL_V1_7)
+               return -EINVAL;
+
+       return 0;
+}
+
+static int ilitek_read_tp_info(struct ilitek_ts_data *ts, bool boot)
+{
+       u8 outbuf[256];
+       int error;
+
+       error = api_protocol_set_cmd(ts, GET_PTL_VER, NULL, outbuf);
+       if (error)
+               return error;
+
+       error = api_protocol_set_cmd(ts, GET_MCU_VER, NULL, outbuf);
+       if (error)
+               return error;
+
+       error = api_protocol_set_cmd(ts, GET_FW_VER, NULL, outbuf);
+       if (error)
+               return error;
+
+       if (boot) {
+               error = api_protocol_set_cmd(ts, GET_SCRN_RES, NULL,
+                                            outbuf);
+               if (error)
+                       return error;
+       }
+
+       error = api_protocol_set_cmd(ts, GET_TP_RES, NULL, outbuf);
+       if (error)
+               return error;
+
+       error = api_protocol_set_cmd(ts, GET_IC_MODE, NULL, outbuf);
+       if (error)
+               return error;
+
+       return 0;
+}
+
+static int ilitek_input_dev_init(struct device *dev, struct ilitek_ts_data *ts)
+{
+       int error;
+       struct input_dev *input;
+
+       input = devm_input_allocate_device(dev);
+       if (!input)
+               return -ENOMEM;
+
+       ts->input_dev = input;
+       input->name = ILITEK_TS_NAME;
+       input->id.bustype = BUS_I2C;
+
+       __set_bit(INPUT_PROP_DIRECT, input->propbit);
+
+       input_set_abs_params(input, ABS_MT_POSITION_X,
+                            ts->screen_min_x, ts->screen_max_x, 0, 0);
+       input_set_abs_params(input, ABS_MT_POSITION_Y,
+                            ts->screen_min_y, ts->screen_max_y, 0, 0);
+
+       touchscreen_parse_properties(input, true, &ts->prop);
+
+       error = input_mt_init_slots(input, ts->max_tp,
+                                   INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED);
+       if (error) {
+               dev_err(dev, "initialize MT slots failed, err:%d\n", error);
+               return error;
+       }
+
+       error = input_register_device(input);
+       if (error) {
+               dev_err(dev, "register input device failed, err:%d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+static irqreturn_t ilitek_i2c_isr(int irq, void *dev_id)
+{
+       struct ilitek_ts_data *ts = dev_id;
+       int error;
+
+       error = ilitek_process_and_report_v6(ts);
+       if (error < 0) {
+               dev_err(&ts->client->dev, "[%s] err:%d\n", __func__, error);
+               return IRQ_NONE;
+       }
+
+       return IRQ_HANDLED;
+}
+
+static ssize_t firmware_version_show(struct device *dev,
+                                    struct device_attribute *attr, char *buf)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct ilitek_ts_data *ts = i2c_get_clientdata(client);
+
+       return scnprintf(buf, PAGE_SIZE,
+                        "fw version: [%02X%02X.%02X%02X.%02X%02X.%02X%02X]\n",
+                        ts->firmware_ver[0], ts->firmware_ver[1],
+                        ts->firmware_ver[2], ts->firmware_ver[3],
+                        ts->firmware_ver[4], ts->firmware_ver[5],
+                        ts->firmware_ver[6], ts->firmware_ver[7]);
+}
+static DEVICE_ATTR_RO(firmware_version);
+
+static ssize_t product_id_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct ilitek_ts_data *ts = i2c_get_clientdata(client);
+
+       return scnprintf(buf, PAGE_SIZE, "product id: [%04X], module: [%s]\n",
+                        ts->mcu_ver, ts->product_id);
+}
+static DEVICE_ATTR_RO(product_id);
+
+static struct attribute *ilitek_sysfs_attrs[] = {
+       &dev_attr_firmware_version.attr,
+       &dev_attr_product_id.attr,
+       NULL
+};
+
+static struct attribute_group ilitek_attrs_group = {
+       .attrs = ilitek_sysfs_attrs,
+};
+
+static int ilitek_ts_i2c_probe(struct i2c_client *client,
+                              const struct i2c_device_id *id)
+{
+       struct ilitek_ts_data *ts;
+       struct device *dev = &client->dev;
+       int error;
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+               dev_err(dev, "i2c check functionality failed\n");
+               return -ENXIO;
+       }
+
+       ts = devm_kzalloc(dev, sizeof(*ts), GFP_KERNEL);
+       if (!ts)
+               return -ENOMEM;
+
+       ts->client = client;
+       i2c_set_clientdata(client, ts);
+
+       ts->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
+       if (IS_ERR(ts->reset_gpio)) {
+               error = PTR_ERR(ts->reset_gpio);
+               dev_err(dev, "request gpiod failed: %d", error);
+               return error;
+       }
+
+       ilitek_reset(ts, 1000);
+
+       error = ilitek_protocol_init(ts);
+       if (error) {
+               dev_err(dev, "protocol init failed: %d", error);
+               return error;
+       }
+
+       error = ilitek_read_tp_info(ts, true);
+       if (error) {
+               dev_err(dev, "read tp info failed: %d", error);
+               return error;
+       }
+
+       error = ilitek_input_dev_init(dev, ts);
+       if (error) {
+               dev_err(dev, "input dev init failed: %d", error);
+               return error;
+       }
+
+       error = devm_request_threaded_irq(dev, ts->client->irq,
+                                         NULL, ilitek_i2c_isr, IRQF_ONESHOT,
+                                         "ilitek_touch_irq", ts);
+       if (error) {
+               dev_err(dev, "request threaded irq failed: %d\n", error);
+               return error;
+       }
+
+       error = devm_device_add_group(dev, &ilitek_attrs_group);
+       if (error) {
+               dev_err(dev, "sysfs create group failed: %d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+static int __maybe_unused ilitek_suspend(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct ilitek_ts_data *ts = i2c_get_clientdata(client);
+       int error;
+
+       disable_irq(client->irq);
+
+       if (!device_may_wakeup(dev)) {
+               error = api_protocol_set_cmd(ts, SET_IC_SLEEP, NULL, NULL);
+               if (error)
+                       return error;
+       }
+
+       return 0;
+}
+
+static int __maybe_unused ilitek_resume(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct ilitek_ts_data *ts = i2c_get_clientdata(client);
+       int error;
+
+       if (!device_may_wakeup(dev)) {
+               error = api_protocol_set_cmd(ts, SET_IC_WAKE, NULL, NULL);
+               if (error)
+                       return error;
+
+               ilitek_reset(ts, ts->reset_time);
+       }
+
+       enable_irq(client->irq);
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(ilitek_pm_ops, ilitek_suspend, ilitek_resume);
+
+static const struct i2c_device_id ilitek_ts_i2c_id[] = {
+       { ILITEK_TS_NAME, 0 },
+       { },
+};
+MODULE_DEVICE_TABLE(i2c, ilitek_ts_i2c_id);
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id ilitekts_acpi_id[] = {
+       { "ILTK0001", 0 },
+       { },
+};
+MODULE_DEVICE_TABLE(acpi, ilitekts_acpi_id);
+#endif
+
+#ifdef CONFIG_OF
+static const struct of_device_id ilitek_ts_i2c_match[] = {
+       {.compatible = "ilitek,ili2130",},
+       {.compatible = "ilitek,ili2131",},
+       {.compatible = "ilitek,ili2132",},
+       {.compatible = "ilitek,ili2316",},
+       {.compatible = "ilitek,ili2322",},
+       {.compatible = "ilitek,ili2323",},
+       {.compatible = "ilitek,ili2326",},
+       {.compatible = "ilitek,ili2520",},
+       {.compatible = "ilitek,ili2521",},
+       { },
+};
+MODULE_DEVICE_TABLE(of, ilitek_ts_i2c_match);
+#endif
+
+static struct i2c_driver ilitek_ts_i2c_driver = {
+       .driver = {
+               .name = ILITEK_TS_NAME,
+               .pm = &ilitek_pm_ops,
+               .of_match_table = of_match_ptr(ilitek_ts_i2c_match),
+               .acpi_match_table = ACPI_PTR(ilitekts_acpi_id),
+       },
+       .probe = ilitek_ts_i2c_probe,
+       .id_table = ilitek_ts_i2c_id,
+};
+module_i2c_driver(ilitek_ts_i2c_driver);
+
+MODULE_AUTHOR("ILITEK");
+MODULE_DESCRIPTION("ILITEK I2C Touchscreen Driver");
+MODULE_LICENSE("GPL");
index 54f3003..b3fa712 100644 (file)
@@ -8,7 +8,7 @@
  * made available by the vendor. Firmware files may be pushed to the device's
  * nonvolatile memory by writing the filename to the 'fw_file' sysfs control.
  *
- * Link to PC-based configuration tool and data sheet: http://www.azoteq.com/
+ * Link to PC-based configuration tool and datasheet: https://www.azoteq.com/
  */
 
 #include <linux/bits.h>
 #define IQS5XX_NUM_RETRIES     10
 #define IQS5XX_NUM_CONTACTS    5
 #define IQS5XX_WR_BYTES_MAX    2
-#define IQS5XX_XY_RES_MAX      0xFFFE
 
 #define IQS5XX_PROD_NUM_IQS550 40
 #define IQS5XX_PROD_NUM_IQS572 58
 #define IQS5XX_PROD_NUM_IQS525 52
-#define IQS5XX_PROJ_NUM_A000   0
-#define IQS5XX_PROJ_NUM_B000   15
-#define IQS5XX_MAJOR_VER_MIN   2
 
 #define IQS5XX_SHOW_RESET      BIT(7)
 #define IQS5XX_ACK_RESET       BIT(7)
@@ -64,6 +60,7 @@
 #define IQS5XX_SYS_CFG1                0x058F
 #define IQS5XX_X_RES           0x066E
 #define IQS5XX_Y_RES           0x0670
+#define IQS5XX_EXP_FILE                0x0677
 #define IQS5XX_CHKSM           0x83C0
 #define IQS5XX_APP             0x8400
 #define IQS5XX_CSTM            0xBE00
 #define IQS5XX_BL_CMD_CRC      0x03
 #define IQS5XX_BL_BLK_LEN_MAX  64
 #define IQS5XX_BL_ID           0x0200
-#define IQS5XX_BL_STATUS_RESET 0x00
-#define IQS5XX_BL_STATUS_AVAIL 0xA5
 #define IQS5XX_BL_STATUS_NONE  0xEE
 #define IQS5XX_BL_CRC_PASS     0x00
 #define IQS5XX_BL_CRC_FAIL     0x01
 #define IQS5XX_BL_ATTEMPTS     3
 
-struct iqs5xx_private {
-       struct i2c_client *client;
-       struct input_dev *input;
-       struct gpio_desc *reset_gpio;
-       struct touchscreen_properties prop;
-       struct mutex lock;
-       u8 bl_status;
-};
-
 struct iqs5xx_dev_id_info {
        __be16 prod_num;
        __be16 proj_num;
@@ -134,6 +120,16 @@ struct iqs5xx_status {
        struct iqs5xx_touch_data touch_data[IQS5XX_NUM_CONTACTS];
 } __packed;
 
+struct iqs5xx_private {
+       struct i2c_client *client;
+       struct input_dev *input;
+       struct gpio_desc *reset_gpio;
+       struct touchscreen_properties prop;
+       struct mutex lock;
+       struct iqs5xx_dev_id_info dev_id_info;
+       u8 exp_file[2];
+};
+
 static int iqs5xx_read_burst(struct i2c_client *client,
                             u16 reg, void *val, u16 len)
 {
@@ -446,7 +442,7 @@ static int iqs5xx_set_state(struct i2c_client *client, u8 state)
        struct iqs5xx_private *iqs5xx = i2c_get_clientdata(client);
        int error1, error2;
 
-       if (iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET)
+       if (!iqs5xx->dev_id_info.bl_status)
                return 0;
 
        mutex_lock(&iqs5xx->lock);
@@ -504,10 +500,6 @@ static int iqs5xx_axis_init(struct i2c_client *client)
                input->open = iqs5xx_open;
                input->close = iqs5xx_close;
 
-               input_set_capability(input, EV_ABS, ABS_MT_POSITION_X);
-               input_set_capability(input, EV_ABS, ABS_MT_POSITION_Y);
-               input_set_capability(input, EV_ABS, ABS_MT_PRESSURE);
-
                input_set_drvdata(input, iqs5xx);
                iqs5xx->input = input;
        }
@@ -520,26 +512,29 @@ static int iqs5xx_axis_init(struct i2c_client *client)
        if (error)
                return error;
 
-       input_abs_set_max(iqs5xx->input, ABS_MT_POSITION_X, max_x);
-       input_abs_set_max(iqs5xx->input, ABS_MT_POSITION_Y, max_y);
+       input_set_abs_params(iqs5xx->input, ABS_MT_POSITION_X, 0, max_x, 0, 0);
+       input_set_abs_params(iqs5xx->input, ABS_MT_POSITION_Y, 0, max_y, 0, 0);
+       input_set_abs_params(iqs5xx->input, ABS_MT_PRESSURE, 0, U16_MAX, 0, 0);
 
        touchscreen_parse_properties(iqs5xx->input, true, prop);
 
-       if (prop->max_x > IQS5XX_XY_RES_MAX) {
-               dev_err(&client->dev, "Invalid maximum x-coordinate: %u > %u\n",
-                       prop->max_x, IQS5XX_XY_RES_MAX);
+       /*
+        * The device reserves 0xFFFF for coordinates that correspond to slots
+        * which are not in a state of touch.
+        */
+       if (prop->max_x >= U16_MAX || prop->max_y >= U16_MAX) {
+               dev_err(&client->dev, "Invalid touchscreen size: %u*%u\n",
+                       prop->max_x, prop->max_y);
                return -EINVAL;
-       } else if (prop->max_x != max_x) {
+       }
+
+       if (prop->max_x != max_x) {
                error = iqs5xx_write_word(client, IQS5XX_X_RES, prop->max_x);
                if (error)
                        return error;
        }
 
-       if (prop->max_y > IQS5XX_XY_RES_MAX) {
-               dev_err(&client->dev, "Invalid maximum y-coordinate: %u > %u\n",
-                       prop->max_y, IQS5XX_XY_RES_MAX);
-               return -EINVAL;
-       } else if (prop->max_y != max_y) {
+       if (prop->max_y != max_y) {
                error = iqs5xx_write_word(client, IQS5XX_Y_RES, prop->max_y);
                if (error)
                        return error;
@@ -574,7 +569,7 @@ static int iqs5xx_dev_init(struct i2c_client *client)
         * the missing zero is prepended).
         */
        buf[0] = 0;
-       dev_id_info = (struct iqs5xx_dev_id_info *)&buf[(buf[1] > 0) ? 0 : 1];
+       dev_id_info = (struct iqs5xx_dev_id_info *)&buf[buf[1] ? 0 : 1];
 
        switch (be16_to_cpu(dev_id_info->prod_num)) {
        case IQS5XX_PROD_NUM_IQS550:
@@ -587,35 +582,20 @@ static int iqs5xx_dev_init(struct i2c_client *client)
                return -EINVAL;
        }
 
-       switch (be16_to_cpu(dev_id_info->proj_num)) {
-       case IQS5XX_PROJ_NUM_A000:
-               dev_err(&client->dev, "Unsupported project number: %u\n",
-                       be16_to_cpu(dev_id_info->proj_num));
-               return iqs5xx_bl_open(client);
-       case IQS5XX_PROJ_NUM_B000:
-               break;
-       default:
-               dev_err(&client->dev, "Unrecognized project number: %u\n",
-                       be16_to_cpu(dev_id_info->proj_num));
-               return -EINVAL;
-       }
-
-       if (dev_id_info->major_ver < IQS5XX_MAJOR_VER_MIN) {
-               dev_err(&client->dev, "Unsupported major version: %u\n",
-                       dev_id_info->major_ver);
+       /*
+        * With the product number recognized yet shifted by one byte, open the
+        * bootloader and wait for user space to convert the A000 device into a
+        * B000 device via new firmware.
+        */
+       if (buf[1]) {
+               dev_err(&client->dev, "Opening bootloader for A000 device\n");
                return iqs5xx_bl_open(client);
        }
 
-       switch (dev_id_info->bl_status) {
-       case IQS5XX_BL_STATUS_AVAIL:
-       case IQS5XX_BL_STATUS_NONE:
-               break;
-       default:
-               dev_err(&client->dev,
-                       "Unrecognized bootloader status: 0x%02X\n",
-                       dev_id_info->bl_status);
-               return -EINVAL;
-       }
+       error = iqs5xx_read_burst(client, IQS5XX_EXP_FILE,
+                                 iqs5xx->exp_file, sizeof(iqs5xx->exp_file));
+       if (error)
+               return error;
 
        error = iqs5xx_axis_init(client);
        if (error)
@@ -640,7 +620,7 @@ static int iqs5xx_dev_init(struct i2c_client *client)
        if (error)
                return error;
 
-       iqs5xx->bl_status = dev_id_info->bl_status;
+       iqs5xx->dev_id_info = *dev_id_info;
 
        /*
         * The following delay allows ATI to complete before the open and close
@@ -666,7 +646,7 @@ static irqreturn_t iqs5xx_irq(int irq, void *data)
         * RDY output during bootloader mode. If the device operates outside of
         * bootloader mode, the input device is guaranteed to be allocated.
         */
-       if (iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET)
+       if (!iqs5xx->dev_id_info.bl_status)
                return IRQ_NONE;
 
        error = iqs5xx_read_burst(client, IQS5XX_SYS_INFO0,
@@ -852,12 +832,9 @@ static int iqs5xx_fw_file_parse(struct i2c_client *client,
 static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file)
 {
        struct iqs5xx_private *iqs5xx = i2c_get_clientdata(client);
-       int error, error_bl = 0;
+       int error, error_init = 0;
        u8 *pmap;
 
-       if (iqs5xx->bl_status == IQS5XX_BL_STATUS_NONE)
-               return -EPERM;
-
        pmap = kzalloc(IQS5XX_PMAP_LEN, GFP_KERNEL);
        if (!pmap)
                return -ENOMEM;
@@ -875,7 +852,7 @@ static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file)
         */
        disable_irq(client->irq);
 
-       iqs5xx->bl_status = IQS5XX_BL_STATUS_RESET;
+       iqs5xx->dev_id_info.bl_status = 0;
 
        error = iqs5xx_bl_cmd(client, IQS5XX_BL_CMD_VER, 0);
        if (error) {
@@ -895,21 +872,14 @@ static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file)
        error = iqs5xx_bl_verify(client, IQS5XX_CSTM,
                                 pmap + IQS5XX_CHKSM_LEN + IQS5XX_APP_LEN,
                                 IQS5XX_CSTM_LEN);
-       if (error)
-               goto err_reset;
-
-       error = iqs5xx_bl_cmd(client, IQS5XX_BL_CMD_EXEC, 0);
 
 err_reset:
-       if (error) {
-               iqs5xx_reset(client);
-               usleep_range(10000, 10100);
-       }
+       iqs5xx_reset(client);
+       usleep_range(15000, 15100);
 
-       error_bl = error;
-       error = iqs5xx_dev_init(client);
-       if (!error && iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET)
-               error = -EINVAL;
+       error_init = iqs5xx_dev_init(client);
+       if (!iqs5xx->dev_id_info.bl_status)
+               error_init = error_init ? : -EINVAL;
 
        enable_irq(client->irq);
 
@@ -918,10 +888,7 @@ err_reset:
 err_kfree:
        kfree(pmap);
 
-       if (error_bl)
-               return error_bl;
-
-       return error;
+       return error ? : error_init;
 }
 
 static ssize_t fw_file_store(struct device *dev,
@@ -968,14 +935,47 @@ static ssize_t fw_file_store(struct device *dev,
        return count;
 }
 
+static ssize_t fw_info_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       struct iqs5xx_private *iqs5xx = dev_get_drvdata(dev);
+
+       if (!iqs5xx->dev_id_info.bl_status)
+               return -ENODATA;
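+       /* Report "prod.proj.major.minor:" plus the two exp-file bytes. */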
+
+       return scnprintf(buf, PAGE_SIZE, "%u.%u.%u.%u:%u.%u\n",
+                        be16_to_cpu(iqs5xx->dev_id_info.prod_num),
+                        be16_to_cpu(iqs5xx->dev_id_info.proj_num),
+                        iqs5xx->dev_id_info.major_ver,
+                        iqs5xx->dev_id_info.minor_ver,
+                        iqs5xx->exp_file[0], iqs5xx->exp_file[1]);
+}
+
 static DEVICE_ATTR_WO(fw_file);
+static DEVICE_ATTR_RO(fw_info);
 
 static struct attribute *iqs5xx_attrs[] = {
        &dev_attr_fw_file.attr,
+       &dev_attr_fw_info.attr,
        NULL,
 };
 
+static umode_t iqs5xx_attr_is_visible(struct kobject *kobj,
+                                     struct attribute *attr, int i)
+{
+       struct device *dev = kobj_to_dev(kobj);
+       struct iqs5xx_private *iqs5xx = dev_get_drvdata(dev);
+
+       if (attr == &dev_attr_fw_file.attr &&
+           (iqs5xx->dev_id_info.bl_status == IQS5XX_BL_STATUS_NONE ||
+           !iqs5xx->reset_gpio))
+               return 0;
+
+       return attr->mode;
+}
+
 static const struct attribute_group iqs5xx_attr_group = {
+       .is_visible = iqs5xx_attr_is_visible,
        .attrs = iqs5xx_attrs,
 };
 
@@ -1032,8 +1032,8 @@ static int iqs5xx_probe(struct i2c_client *client,
        i2c_set_clientdata(client, iqs5xx);
        iqs5xx->client = client;
 
-       iqs5xx->reset_gpio = devm_gpiod_get(&client->dev,
-                                           "reset", GPIOD_OUT_LOW);
+       iqs5xx->reset_gpio = devm_gpiod_get_optional(&client->dev,
+                                                    "reset", GPIOD_OUT_LOW);
        if (IS_ERR(iqs5xx->reset_gpio)) {
                error = PTR_ERR(iqs5xx->reset_gpio);
                dev_err(&client->dev, "Failed to request GPIO: %d\n", error);
@@ -1042,9 +1042,6 @@ static int iqs5xx_probe(struct i2c_client *client,
 
        mutex_init(&iqs5xx->lock);
 
-       iqs5xx_reset(client);
-       usleep_range(10000, 10100);
-
        error = iqs5xx_dev_init(client);
        if (error)
                return error;
index b51450b..15b5cb7 100644 (file)
 #define LPC32XX_TSC_AUX_MIN                    0x38
 #define LPC32XX_TSC_AUX_MAX                    0x3C
 
-#define LPC32XX_TSC_STAT_FIFO_OVRRN            (1 << 8)
-#define LPC32XX_TSC_STAT_FIFO_EMPTY            (1 << 7)
+#define LPC32XX_TSC_STAT_FIFO_OVRRN            BIT(8)
+#define LPC32XX_TSC_STAT_FIFO_EMPTY            BIT(7)
 
 #define LPC32XX_TSC_SEL_DEFVAL                 0x0284
 
 #define LPC32XX_TSC_ADCCON_IRQ_TO_FIFO_4       (0x1 << 11)
 #define LPC32XX_TSC_ADCCON_X_SAMPLE_SIZE(s)    ((10 - (s)) << 7)
 #define LPC32XX_TSC_ADCCON_Y_SAMPLE_SIZE(s)    ((10 - (s)) << 4)
-#define LPC32XX_TSC_ADCCON_POWER_UP            (1 << 2)
-#define LPC32XX_TSC_ADCCON_AUTO_EN             (1 << 0)
+#define LPC32XX_TSC_ADCCON_POWER_UP            BIT(2)
+#define LPC32XX_TSC_ADCCON_AUTO_EN             BIT(0)
 
-#define LPC32XX_TSC_FIFO_TS_P_LEVEL            (1 << 31)
+#define LPC32XX_TSC_FIFO_TS_P_LEVEL            BIT(31)
 #define LPC32XX_TSC_FIFO_NORMALIZE_X_VAL(x)    (((x) & 0x03FF0000) >> 16)
 #define LPC32XX_TSC_FIFO_NORMALIZE_Y_VAL(y)    ((y) & 0x000003FF)
 
index 225796a..2745bf1 100644 (file)
@@ -1502,7 +1502,8 @@ static int mip4_probe(struct i2c_client *client, const struct i2c_device_id *id)
 
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, mip4_interrupt,
-                                         IRQF_ONESHOT, MIP4_DEVICE_NAME, ts);
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
+                                         MIP4_DEVICE_NAME, ts);
        if (error) {
                dev_err(&client->dev,
                        "Failed to request interrupt %d: %d\n",
@@ -1510,8 +1511,6 @@ static int mip4_probe(struct i2c_client *client, const struct i2c_device_id *id)
                return error;
        }
 
-       disable_irq(client->irq);
-
        error = input_register_device(input);
        if (error) {
                dev_err(&client->dev,
index 16557f5..0efd1a1 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-// Melfas MMS114/MMS152 touchscreen device driver
+// Melfas MMS114/MMS136/MMS152 touchscreen device driver
 //
 // Copyright (c) 2012 Samsung Electronics Co., Ltd.
 // Author: Joonyoung Shim <jy0922.shim@samsung.com>
@@ -44,7 +44,8 @@
 #define MMS114_MAX_AREA                        0xff
 
 #define MMS114_MAX_TOUCH               10
-#define MMS114_PACKET_NUM              8
+#define MMS114_EVENT_SIZE              8
+#define MMS136_EVENT_SIZE              6
 
 /* Touch type */
 #define MMS114_TYPE_NONE               0
@@ -53,6 +54,7 @@
 
 enum mms_type {
        TYPE_MMS114     = 114,
+       TYPE_MMS136     = 136,
        TYPE_MMS152     = 152,
        TYPE_MMS345L    = 345,
 };
@@ -209,7 +211,11 @@ static irqreturn_t mms114_interrupt(int irq, void *dev_id)
        if (packet_size <= 0)
                goto out;
 
-       touch_size = packet_size / MMS114_PACKET_NUM;
+       /* MMS136 has slightly different event size */
+       if (data->type == TYPE_MMS136)
+               touch_size = packet_size / MMS136_EVENT_SIZE;
+       else
+               touch_size = packet_size / MMS114_EVENT_SIZE;
 
        error = __mms114_read_reg(data, MMS114_INFORMATION, packet_size,
                        (u8 *)touch);
@@ -275,6 +281,7 @@ static int mms114_get_version(struct mms114_data *data)
                break;
 
        case TYPE_MMS114:
+       case TYPE_MMS136:
                error = __mms114_read_reg(data, MMS114_TSP_REV, 6, buf);
                if (error)
                        return error;
@@ -297,8 +304,8 @@ static int mms114_setup_regs(struct mms114_data *data)
        if (error < 0)
                return error;
 
-       /* Only MMS114 has configuration and power on registers */
-       if (data->type != TYPE_MMS114)
+       /* Only MMS114 and MMS136 have configuration and power on registers */
+       if (data->type != TYPE_MMS114 && data->type != TYPE_MMS136)
                return 0;
 
        error = mms114_set_active(data, true);
@@ -480,7 +487,7 @@ static int mms114_probe(struct i2c_client *client,
                                     0, data->props.max_y, 0, 0);
        }
 
-       if (data->type == TYPE_MMS114) {
+       if (data->type == TYPE_MMS114 || data->type == TYPE_MMS136) {
                /*
                 * The firmware handles movement and pressure fuzz, so
                 * don't duplicate that in software.
@@ -530,13 +537,13 @@ static int mms114_probe(struct i2c_client *client,
        }
 
        error = devm_request_threaded_irq(&client->dev, client->irq,
-                                         NULL, mms114_interrupt, IRQF_ONESHOT,
+                                         NULL, mms114_interrupt,
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                          dev_name(&client->dev), data);
        if (error) {
                dev_err(&client->dev, "Failed to register interrupt\n");
                return error;
        }
-       disable_irq(client->irq);
 
        error = input_register_device(data->input_dev);
        if (error) {
@@ -605,6 +612,9 @@ static const struct of_device_id mms114_dt_match[] = {
                .compatible = "melfas,mms114",
                .data = (void *)TYPE_MMS114,
        }, {
+               .compatible = "melfas,mms136",
+               .data = (void *)TYPE_MMS136,
+       }, {
                .compatible = "melfas,mms152",
                .data = (void *)TYPE_MMS152,
        }, {
diff --git a/drivers/input/touchscreen/msg2638.c b/drivers/input/touchscreen/msg2638.c
new file mode 100644 (file)
index 0000000..75536bc
--- /dev/null
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Driver for MStar msg2638 touchscreens
+ *
+ * Copyright (c) 2021 Vincent Knecht <vincent.knecht@mailoo.org>
+ *
+ * Checksum and IRQ handler based on mstar_drv_common.c and
+ * mstar_drv_mutual_fw_control.c
+ * Copyright (c) 2006-2012 MStar Semiconductor, Inc.
+ *
+ * Driver structure based on zinitix.c by Michael Srba <Michael.Srba@seznam.cz>
+ */
+
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/i2c.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/input/touchscreen.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+
+#define MODE_DATA_RAW                  0x5A
+
+#define MAX_SUPPORTED_FINGER_NUM       5
+
+#define CHIP_ON_DELAY_MS               15
+#define FIRMWARE_ON_DELAY_MS           50
+#define RESET_DELAY_MIN_US             10000
+#define RESET_DELAY_MAX_US             11000
+
+struct packet {
+       u8      xy_hi; /* higher bits of x and y coordinates */
+       u8      x_low;
+       u8      y_low;
+       u8      pressure;
+};
+
+struct touch_event {
+       u8      mode;
+       struct  packet pkt[MAX_SUPPORTED_FINGER_NUM];
+       u8      proximity;
+       u8      checksum;
+};
+
+struct msg2638_ts_data {
+       struct i2c_client *client;
+       struct input_dev *input_dev;
+       struct touchscreen_properties prop;
+       struct regulator_bulk_data supplies[2];
+       struct gpio_desc *reset_gpiod;
+};
+
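+/*
+ * Frames carry a two's-complement checksum: the sum of every payload
+ * byte plus the checksum byte is 0 modulo 256.
+ */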
+static u8 msg2638_checksum(u8 *data, u32 length)
+{
+       s32 sum = 0;
+       u32 i;
+
+       for (i = 0; i < length; i++)
+               sum += data[i];
+
+       return (u8)((-sum) & 0xFF);
+}
+
+static irqreturn_t msg2638_ts_irq_handler(int irq, void *msg2638_handler)
+{
+       struct msg2638_ts_data *msg2638 = msg2638_handler;
+       struct i2c_client *client = msg2638->client;
+       struct input_dev *input = msg2638->input_dev;
+       struct touch_event touch_event;
+       u32 len = sizeof(touch_event);
+       struct i2c_msg msg[] = {
+               {
+                       .addr   = client->addr,
+                       .flags  = I2C_M_RD,
+                       .len    = sizeof(touch_event),
+                       .buf    = (u8 *)&touch_event,
+               },
+       };
+       struct packet *p;
+       u16 x, y;
+       int ret;
+       int i;
+
+       ret = i2c_transfer(client->adapter, msg, ARRAY_SIZE(msg));
+       if (ret != ARRAY_SIZE(msg)) {
+               dev_err(&client->dev,
+                       "Failed I2C transfer in irq handler: %d\n",
+                       ret < 0 ? ret : -EIO);
+               goto out;
+       }
+
+       if (touch_event.mode != MODE_DATA_RAW)
+               goto out;
+
+       if (msg2638_checksum((u8 *)&touch_event, len - 1) !=
+                                               touch_event.checksum) {
+               dev_err(&client->dev, "Failed checksum!\n");
+               goto out;
+       }
+
+       for (i = 0; i < MAX_SUPPORTED_FINGER_NUM; i++) {
+               p = &touch_event.pkt[i];
+
+               /* Ignore non-pressed finger data */
+               if (p->xy_hi == 0xFF && p->x_low == 0xFF && p->y_low == 0xFF)
+                       continue;
+
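+               /* 12-bit coordinates: xy_hi packs the high nibbles of X and Y. */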
+               x = (((p->xy_hi & 0xF0) << 4) | p->x_low);
+               y = (((p->xy_hi & 0x0F) << 8) | p->y_low);
+
+               input_mt_slot(input, i);
+               input_mt_report_slot_state(input, MT_TOOL_FINGER, true);
+               touchscreen_report_pos(input, &msg2638->prop, x, y, true);
+       }
+
+       input_mt_sync_frame(msg2638->input_dev);
+       input_sync(msg2638->input_dev);
+
+out:
+       return IRQ_HANDLED;
+}
+
+static void msg2638_reset(struct msg2638_ts_data *msg2638)
+{
+       gpiod_set_value_cansleep(msg2638->reset_gpiod, 1);
+       usleep_range(RESET_DELAY_MIN_US, RESET_DELAY_MAX_US);
+       gpiod_set_value_cansleep(msg2638->reset_gpiod, 0);
+       msleep(FIRMWARE_ON_DELAY_MS);
+}
+
+static int msg2638_start(struct msg2638_ts_data *msg2638)
+{
+       int error;
+
+       error = regulator_bulk_enable(ARRAY_SIZE(msg2638->supplies),
+                                     msg2638->supplies);
+       if (error) {
+               dev_err(&msg2638->client->dev,
+                       "Failed to enable regulators: %d\n", error);
+               return error;
+       }
+
+       msleep(CHIP_ON_DELAY_MS);
+
+       msg2638_reset(msg2638);
+
+       enable_irq(msg2638->client->irq);
+
+       return 0;
+}
+
+static int msg2638_stop(struct msg2638_ts_data *msg2638)
+{
+       int error;
+
+       disable_irq(msg2638->client->irq);
+
+       error = regulator_bulk_disable(ARRAY_SIZE(msg2638->supplies),
+                                      msg2638->supplies);
+       if (error) {
+               dev_err(&msg2638->client->dev,
+                       "Failed to disable regulators: %d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+static int msg2638_input_open(struct input_dev *dev)
+{
+       struct msg2638_ts_data *msg2638 = input_get_drvdata(dev);
+
+       return msg2638_start(msg2638);
+}
+
+static void msg2638_input_close(struct input_dev *dev)
+{
+       struct msg2638_ts_data *msg2638 = input_get_drvdata(dev);
+
+       msg2638_stop(msg2638);
+}
+
+static int msg2638_init_input_dev(struct msg2638_ts_data *msg2638)
+{
+       struct device *dev = &msg2638->client->dev;
+       struct input_dev *input_dev;
+       int error;
+
+       input_dev = devm_input_allocate_device(dev);
+       if (!input_dev) {
+               dev_err(dev, "Failed to allocate input device.\n");
+               return -ENOMEM;
+       }
+
+       input_set_drvdata(input_dev, msg2638);
+       msg2638->input_dev = input_dev;
+
+       input_dev->name = "MStar TouchScreen";
+       input_dev->phys = "input/ts";
+       input_dev->id.bustype = BUS_I2C;
+       input_dev->open = msg2638_input_open;
+       input_dev->close = msg2638_input_close;
+
+       input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_X);
+       input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_Y);
+
+       touchscreen_parse_properties(input_dev, true, &msg2638->prop);
+       if (!msg2638->prop.max_x || !msg2638->prop.max_y) {
+               dev_err(dev, "touchscreen-size-x and/or touchscreen-size-y not set in properties\n");
+               return -EINVAL;
+       }
+
+       error = input_mt_init_slots(input_dev, MAX_SUPPORTED_FINGER_NUM,
+                                   INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED);
+       if (error) {
+               dev_err(dev, "Failed to initialize MT slots: %d\n", error);
+               return error;
+       }
+
+       error = input_register_device(input_dev);
+       if (error) {
+               dev_err(dev, "Failed to register input device: %d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+static int msg2638_ts_probe(struct i2c_client *client)
+{
+       struct device *dev = &client->dev;
+       struct msg2638_ts_data *msg2638;
+       int error;
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+               dev_err(dev, "Failed to assert adapter's support for plain I2C.\n");
+               return -ENXIO;
+       }
+
+       msg2638 = devm_kzalloc(dev, sizeof(*msg2638), GFP_KERNEL);
+       if (!msg2638)
+               return -ENOMEM;
+
+       msg2638->client = client;
+       i2c_set_clientdata(client, msg2638);
+
+       msg2638->supplies[0].supply = "vdd";
+       msg2638->supplies[1].supply = "vddio";
+       error = devm_regulator_bulk_get(dev, ARRAY_SIZE(msg2638->supplies),
+                                       msg2638->supplies);
+       if (error) {
+               dev_err(dev, "Failed to get regulators: %d\n", error);
+               return error;
+       }
+
+       msg2638->reset_gpiod = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW);
+       if (IS_ERR(msg2638->reset_gpiod)) {
+               error = PTR_ERR(msg2638->reset_gpiod);
+               dev_err(dev, "Failed to request reset GPIO: %d\n", error);
+               return error;
+       }
+
+       error = msg2638_init_input_dev(msg2638);
+       if (error) {
+               dev_err(dev, "Failed to initialize input device: %d\n", error);
+               return error;
+       }
+
+       error = devm_request_threaded_irq(dev, client->irq,
+                                         NULL, msg2638_ts_irq_handler,
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
+                                         client->name, msg2638);
+       if (error) {
+               dev_err(dev, "Failed to request IRQ: %d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+static int __maybe_unused msg2638_suspend(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct msg2638_ts_data *msg2638 = i2c_get_clientdata(client);
+
+       mutex_lock(&msg2638->input_dev->mutex);
+
+       if (input_device_enabled(msg2638->input_dev))
+               msg2638_stop(msg2638);
+
+       mutex_unlock(&msg2638->input_dev->mutex);
+
+       return 0;
+}
+
+static int __maybe_unused msg2638_resume(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct msg2638_ts_data *msg2638 = i2c_get_clientdata(client);
+       int ret = 0;
+
+       mutex_lock(&msg2638->input_dev->mutex);
+
+       if (input_device_enabled(msg2638->input_dev))
+               ret = msg2638_start(msg2638);
+
+       mutex_unlock(&msg2638->input_dev->mutex);
+
+       return ret;
+}
+
+static SIMPLE_DEV_PM_OPS(msg2638_pm_ops, msg2638_suspend, msg2638_resume);
+
+static const struct of_device_id msg2638_of_match[] = {
+       { .compatible = "mstar,msg2638" },
+       { }
+};
+MODULE_DEVICE_TABLE(of, msg2638_of_match);
+
+static struct i2c_driver msg2638_ts_driver = {
+       .probe_new = msg2638_ts_probe,
+       .driver = {
+               .name = "MStar-TS",
+               .pm = &msg2638_pm_ops,
+               .of_match_table = msg2638_of_match,
+       },
+};
+module_i2c_driver(msg2638_ts_driver);
+
+MODULE_AUTHOR("Vincent Knecht <vincent.knecht@mailoo.org>");
+MODULE_DESCRIPTION("MStar MSG2638 touchscreen driver");
+MODULE_LICENSE("GPL v2");
index 8fa2f3b..1ee760b 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/input/mt.h>
 #include <linux/input/touchscreen.h>
 #include <linux/pm.h>
+#include <linux/pm_runtime.h>
 #include <linux/irq.h>
 #include <linux/regulator/consumer.h>
 
@@ -335,10 +336,8 @@ static int silead_ts_get_id(struct i2c_client *client)
 
        error = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_ID,
                                              sizeof(chip_id), (u8 *)&chip_id);
-       if (error < 0) {
-               dev_err(&client->dev, "Chip ID read error %d\n", error);
+       if (error < 0)
                return error;
-       }
 
        data->chip_id = le32_to_cpu(chip_id);
        dev_info(&client->dev, "Silead chip ID: 0x%8X", data->chip_id);
@@ -351,12 +350,49 @@ static int silead_ts_setup(struct i2c_client *client)
        int error;
        u32 status;
 
+       /*
+        * Some buggy BIOSes bring up the chip in a stuck state where it
+        * blocks the I2C bus. The following steps are necessary to
+        * un-stick the chip / bus:
+        * 1. Turn off the Silead chip.
+        * 2. Try to do an I2C transfer with the chip, this will fail in
+        *    response to which the I2C-bus-driver will call:
+        *    i2c_recover_bus(), which will un-stick the I2C bus. Note that
+        *    un-sticking the I2C bus only works if we first drop the
+        *    chip off the bus by turning it off.
+        * 3. Turn the chip back on.
+        *
+        * On the x86/ACPI systems where this problem is seen, steps 1 and
+        * 3 require making ACPI calls and dealing with ACPI Power
+        * Resources. The workaround below runtime-suspends the chip to
+        * turn it off, leaving it up to the ACPI subsystem to deal with
+        * this.
+        */
+
+       if (device_property_read_bool(&client->dev,
+                                     "silead,stuck-controller-bug")) {
+               pm_runtime_set_active(&client->dev);
+               pm_runtime_enable(&client->dev);
+               pm_runtime_allow(&client->dev);
+
+               pm_runtime_suspend(&client->dev);
+
+               dev_warn(&client->dev, FW_BUG "Stuck I2C bus: please ignore the next 'controller timed out' error\n");
+               silead_ts_get_id(client);
+
+               /* The forbid will also resume the device */
+               pm_runtime_forbid(&client->dev);
+               pm_runtime_disable(&client->dev);
+       }
+
        silead_ts_set_power(client, SILEAD_POWER_OFF);
        silead_ts_set_power(client, SILEAD_POWER_ON);
 
        error = silead_ts_get_id(client);
-       if (error)
+       if (error) {
+               dev_err(&client->dev, "Chip ID read error %d\n", error);
                return error;
+       }
 
        error = silead_ts_init(client);
        if (error)
@@ -486,7 +522,7 @@ static int silead_ts_probe(struct i2c_client *client,
 
        silead_ts_read_props(client);
 
-       /* We must have the IRQ provided by DT or ACPI subsytem */
+       /* We must have the IRQ provided by DT or ACPI subsystem */
        if (client->irq <= 0)
                return -ENODEV;
 
index 9a64e1d..bc11203 100644 (file)
@@ -691,10 +691,9 @@ static int stmfts_probe(struct i2c_client *client,
         * interrupts. To be on the safe side it's better to not enable
         * the interrupts during their request.
         */
-       irq_set_status_flags(client->irq, IRQ_NOAUTOEN);
        err = devm_request_threaded_irq(&client->dev, client->irq,
                                        NULL, stmfts_irq_handler,
-                                       IRQF_ONESHOT,
+                                       IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                        "stmfts_irq", sdata);
        if (err)
                return err;
index 91c60bf..69b08dd 100644 (file)
@@ -19,6 +19,8 @@
 #ifndef _TSC2007_H
 #define _TSC2007_H
 
+struct gpio_desc;
+
 #define TSC2007_MEASURE_TEMP0          (0x0 << 4)
 #define TSC2007_MEASURE_AUX            (0x2 << 4)
 #define TSC2007_MEASURE_TEMP1          (0x4 << 4)
@@ -69,7 +71,7 @@ struct tsc2007 {
        int                     fuzzy;
        int                     fuzzz;
 
-       unsigned int            gpio;
+       struct gpio_desc        *gpiod;
        int                     irq;
 
        wait_queue_head_t       wait;
index 3b80abf..3e871d1 100644 (file)
 
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/gpio/consumer.h>
 #include <linux/input.h>
 #include <linux/interrupt.h>
 #include <linux/i2c.h>
-#include <linux/of_device.h>
-#include <linux/of_gpio.h>
+#include <linux/mod_devicetable.h>
+#include <linux/property.h>
 #include <linux/platform_data/tsc2007.h>
 #include "tsc2007.h"
 
@@ -220,71 +221,58 @@ static void tsc2007_close(struct input_dev *input_dev)
        tsc2007_stop(ts);
 }
 
-#ifdef CONFIG_OF
 static int tsc2007_get_pendown_state_gpio(struct device *dev)
 {
        struct i2c_client *client = to_i2c_client(dev);
        struct tsc2007 *ts = i2c_get_clientdata(client);
 
-       return !gpio_get_value(ts->gpio);
+       return gpiod_get_value(ts->gpiod);
 }
 
-static int tsc2007_probe_dt(struct i2c_client *client, struct tsc2007 *ts)
+static int tsc2007_probe_properties(struct device *dev, struct tsc2007 *ts)
 {
-       struct device_node *np = client->dev.of_node;
        u32 val32;
        u64 val64;
 
-       if (!np) {
-               dev_err(&client->dev, "missing device tree data\n");
-               return -EINVAL;
-       }
-
-       if (!of_property_read_u32(np, "ti,max-rt", &val32))
+       if (!device_property_read_u32(dev, "ti,max-rt", &val32))
                ts->max_rt = val32;
        else
                ts->max_rt = MAX_12BIT;
 
-       if (!of_property_read_u32(np, "ti,fuzzx", &val32))
+       if (!device_property_read_u32(dev, "ti,fuzzx", &val32))
                ts->fuzzx = val32;
 
-       if (!of_property_read_u32(np, "ti,fuzzy", &val32))
+       if (!device_property_read_u32(dev, "ti,fuzzy", &val32))
                ts->fuzzy = val32;
 
-       if (!of_property_read_u32(np, "ti,fuzzz", &val32))
+       if (!device_property_read_u32(dev, "ti,fuzzz", &val32))
                ts->fuzzz = val32;
 
-       if (!of_property_read_u64(np, "ti,poll-period", &val64))
+       if (!device_property_read_u64(dev, "ti,poll-period", &val64))
                ts->poll_period = msecs_to_jiffies(val64);
        else
                ts->poll_period = msecs_to_jiffies(1);
 
-       if (!of_property_read_u32(np, "ti,x-plate-ohms", &val32)) {
+       if (!device_property_read_u32(dev, "ti,x-plate-ohms", &val32)) {
                ts->x_plate_ohms = val32;
        } else {
-               dev_err(&client->dev, "missing ti,x-plate-ohms devicetree property.");
+               dev_err(dev, "Missing ti,x-plate-ohms device property\n");
                return -EINVAL;
        }
 
-       ts->gpio = of_get_gpio(np, 0);
-       if (gpio_is_valid(ts->gpio))
+       ts->gpiod = devm_gpiod_get_optional(dev, NULL, GPIOD_IN);
+       if (IS_ERR(ts->gpiod))
+               return PTR_ERR(ts->gpiod);
+
+       if (ts->gpiod)
                ts->get_pendown_state = tsc2007_get_pendown_state_gpio;
        else
-               dev_warn(&client->dev,
-                        "GPIO not specified in DT (of_get_gpio returned %d)\n",
-                        ts->gpio);
+               dev_warn(dev, "Pen down GPIO is not specified in properties\n");
 
        return 0;
 }
-#else
-static int tsc2007_probe_dt(struct i2c_client *client, struct tsc2007 *ts)
-{
-       dev_err(&client->dev, "platform data is required!\n");
-       return -EINVAL;
-}
-#endif
 
-static int tsc2007_probe_pdev(struct i2c_client *client, struct tsc2007 *ts,
+static int tsc2007_probe_pdev(struct device *dev, struct tsc2007 *ts,
                              const struct tsc2007_platform_data *pdata,
                              const struct i2c_device_id *id)
 {
@@ -299,7 +287,7 @@ static int tsc2007_probe_pdev(struct i2c_client *client, struct tsc2007 *ts,
        ts->fuzzz             = pdata->fuzzz;
 
        if (pdata->x_plate_ohms == 0) {
-               dev_err(&client->dev, "x_plate_ohms is not set up in platform data");
+               dev_err(dev, "x_plate_ohms is not set up in platform data\n");
                return -EINVAL;
        }
 
@@ -332,9 +320,9 @@ static int tsc2007_probe(struct i2c_client *client,
                return -ENOMEM;
 
        if (pdata)
-               err = tsc2007_probe_pdev(client, ts, pdata, id);
+               err = tsc2007_probe_pdev(&client->dev, ts, pdata, id);
        else
-               err = tsc2007_probe_dt(client, ts);
+               err = tsc2007_probe_properties(&client->dev, ts);
        if (err)
                return err;
 
@@ -431,18 +419,16 @@ static const struct i2c_device_id tsc2007_idtable[] = {
 
 MODULE_DEVICE_TABLE(i2c, tsc2007_idtable);
 
-#ifdef CONFIG_OF
 static const struct of_device_id tsc2007_of_match[] = {
        { .compatible = "ti,tsc2007" },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, tsc2007_of_match);
-#endif
 
 static struct i2c_driver tsc2007_driver = {
        .driver = {
                .name   = "tsc2007",
-               .of_match_table = of_match_ptr(tsc2007_of_match),
+               .of_match_table = tsc2007_of_match,
        },
        .id_table       = tsc2007_idtable,
        .probe          = tsc2007_probe,
index 1afc6bd..22826c3 100644 (file)
@@ -145,15 +145,16 @@ static void wacom_i2c_close(struct input_dev *dev)
 }
 
 static int wacom_i2c_probe(struct i2c_client *client,
-                                    const struct i2c_device_id *id)
+                          const struct i2c_device_id *id)
 {
+       struct device *dev = &client->dev;
        struct wacom_i2c *wac_i2c;
        struct input_dev *input;
        struct wacom_features features = { 0 };
        int error;
 
        if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
-               dev_err(&client->dev, "i2c_check_functionality error\n");
+               dev_err(dev, "i2c_check_functionality error\n");
                return -EIO;
        }
 
@@ -161,21 +162,22 @@ static int wacom_i2c_probe(struct i2c_client *client,
        if (error)
                return error;
 
-       wac_i2c = kzalloc(sizeof(*wac_i2c), GFP_KERNEL);
-       input = input_allocate_device();
-       if (!wac_i2c || !input) {
-               error = -ENOMEM;
-               goto err_free_mem;
-       }
+       wac_i2c = devm_kzalloc(dev, sizeof(*wac_i2c), GFP_KERNEL);
+       if (!wac_i2c)
+               return -ENOMEM;
 
        wac_i2c->client = client;
+
+       input = devm_input_allocate_device(dev);
+       if (!input)
+               return -ENOMEM;
+
        wac_i2c->input = input;
 
        input->name = "Wacom I2C Digitizer";
        input->id.bustype = BUS_I2C;
        input->id.vendor = 0x56a;
        input->id.version = features.fw_version;
-       input->dev.parent = &client->dev;
        input->open = wacom_i2c_open;
        input->close = wacom_i2c_close;
 
@@ -194,13 +196,11 @@ static int wacom_i2c_probe(struct i2c_client *client,
 
        input_set_drvdata(input, wac_i2c);
 
-       error = request_threaded_irq(client->irq, NULL, wacom_i2c_irq,
-                                    IRQF_TRIGGER_LOW | IRQF_ONESHOT,
-                                    "wacom_i2c", wac_i2c);
+       error = devm_request_threaded_irq(dev, client->irq, NULL, wacom_i2c_irq,
+                                         IRQF_ONESHOT, "wacom_i2c", wac_i2c);
        if (error) {
-               dev_err(&client->dev,
-                       "Failed to enable IRQ, error: %d\n", error);
-               goto err_free_mem;
+               dev_err(dev, "Failed to request IRQ: %d\n", error);
+               return error;
        }
 
        /* Disable the IRQ, we'll enable it in wac_i2c_open() */
@@ -208,31 +208,10 @@ static int wacom_i2c_probe(struct i2c_client *client,
 
        error = input_register_device(wac_i2c->input);
        if (error) {
-               dev_err(&client->dev,
-                       "Failed to register input device, error: %d\n", error);
-               goto err_free_irq;
+               dev_err(dev, "Failed to register input device: %d\n", error);
+               return error;
        }
 
-       i2c_set_clientdata(client, wac_i2c);
-       return 0;
-
-err_free_irq:
-       free_irq(client->irq, wac_i2c);
-err_free_mem:
-       input_free_device(input);
-       kfree(wac_i2c);
-
-       return error;
-}
-
-static int wacom_i2c_remove(struct i2c_client *client)
-{
-       struct wacom_i2c *wac_i2c = i2c_get_clientdata(client);
-
-       free_irq(client->irq, wac_i2c);
-       input_unregister_device(wac_i2c->input);
-       kfree(wac_i2c);
-
        return 0;
 }
 
@@ -269,7 +248,6 @@ static struct i2c_driver wacom_i2c_driver = {
        },
 
        .probe          = wacom_i2c_probe,
-       .remove         = wacom_i2c_remove,
        .id_table       = wacom_i2c_id,
 };
 module_i2c_driver(wacom_i2c_driver);
index bb1699e..319f57f 100644 (file)
@@ -317,14 +317,13 @@ static int wm831x_ts_probe(struct platform_device *pdev)
 
        error = request_threaded_irq(wm831x_ts->data_irq,
                                     NULL, wm831x_ts_data_irq,
-                                    irqf | IRQF_ONESHOT,
+                                    irqf | IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                     "Touchscreen data", wm831x_ts);
        if (error) {
                dev_err(&pdev->dev, "Failed to request data IRQ %d: %d\n",
                        wm831x_ts->data_irq, error);
                goto err_alloc;
        }
-       disable_irq(wm831x_ts->data_irq);
 
        if (pdata && pdata->pd_irqf)
                irqf = pdata->pd_irqf;
index 3b636be..b8d9010 100644 (file)
@@ -513,10 +513,10 @@ static int zinitix_ts_probe(struct i2c_client *client)
                return -EINVAL;
        }
 
-       irq_set_status_flags(client->irq, IRQ_NOAUTOEN);
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, zinitix_ts_irq_handler,
-                                         IRQF_ONESHOT, client->name, bt541);
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
+                                         client->name, bt541);
        if (error) {
                dev_err(&client->dev, "Failed to request IRQ: %d\n", error);
                return error;
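
The zinitix change above is the same conversion applied to mip4, mms114,
stmfts and wm831x elsewhere in this series: requesting the line with
IRQF_NO_AUTOEN leaves it disabled after the request, replacing the racy
request-then-disable_irq() sequence. A minimal sketch of the idiom
(hypothetical device, not taken from any file in this patch):

	error = devm_request_threaded_irq(dev, irq, NULL, my_thread_fn,
					  IRQF_ONESHOT | IRQF_NO_AUTOEN,
					  "my-device", priv);
	if (error)
		return error;
	/* The line stays masked until enable_irq() in the open() path. */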
index b5ed4ea..77e9512 100644 (file)
@@ -201,6 +201,7 @@ static ssize_t empty_read(struct file *file, char __user *buf,
 
 static const struct proc_ops empty_proc_ops = {
        .proc_read      = empty_read,
+       .proc_lseek     = default_llseek,
 };
 
 // ---------------------------------------------------------------------------
index 2b6d6e9..bea8c44 100644 (file)
@@ -16,6 +16,7 @@
 #include "features.h"
 
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/debugfs.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
index abfc883..68bc382 100644 (file)
@@ -9,9 +9,6 @@
    Please send bug reports and support requests to <luc@saillard.org>.
    The decompression routines have been implemented by reverse-engineering the
    Nemosoft binary pwcx module. Caveat emptor.
-
-
-   vim: set ts=8:
 */
 
 #include <asm/current.h>
index caedc4c..5e84bee 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-ti-emif-asm-offsets.h
+/ti-emif-asm-offsets.h
index 34073cd..3cf6de2 100644 (file)
@@ -1562,6 +1562,8 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
        int i;
        int putidx;
 
+       cdev->tx_skb = NULL;
+
        /* Generate ID field for TX buffer Element */
        /* Common to all supported M_CAN versions */
        if (cf->can_id & CAN_EFF_FLAG) {
@@ -1678,7 +1680,6 @@ static void m_can_tx_work_queue(struct work_struct *ws)
                                                   tx_work);
 
        m_can_tx_handler(cdev);
-       cdev->tx_skb = NULL;
 }
 
 static netdev_tx_t m_can_start_xmit(struct sk_buff *skb,
index 492f1bc..173c661 100644 (file)
@@ -956,8 +956,6 @@ static int mcp251x_stop(struct net_device *net)
 
        priv->force_quit = 1;
        free_irq(spi->irq, priv);
-       destroy_workqueue(priv->wq);
-       priv->wq = NULL;
 
        mutex_lock(&priv->mcp_lock);
 
@@ -1224,24 +1222,15 @@ static int mcp251x_open(struct net_device *net)
                goto out_close;
        }
 
-       priv->wq = alloc_workqueue("mcp251x_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
-                                  0);
-       if (!priv->wq) {
-               ret = -ENOMEM;
-               goto out_clean;
-       }
-       INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler);
-       INIT_WORK(&priv->restart_work, mcp251x_restart_work_handler);
-
        ret = mcp251x_hw_wake(spi);
        if (ret)
-               goto out_free_wq;
+               goto out_free_irq;
        ret = mcp251x_setup(net, spi);
        if (ret)
-               goto out_free_wq;
+               goto out_free_irq;
        ret = mcp251x_set_normal_mode(spi);
        if (ret)
-               goto out_free_wq;
+               goto out_free_irq;
 
        can_led_event(net, CAN_LED_EVENT_OPEN);
 
@@ -1250,9 +1239,7 @@ static int mcp251x_open(struct net_device *net)
 
        return 0;
 
-out_free_wq:
-       destroy_workqueue(priv->wq);
-out_clean:
+out_free_irq:
        free_irq(spi->irq, priv);
        mcp251x_hw_sleep(spi);
 out_close:
@@ -1373,6 +1360,15 @@ static int mcp251x_can_probe(struct spi_device *spi)
        if (ret)
                goto out_clk;
 
+       priv->wq = alloc_workqueue("mcp251x_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
+                                  0);
+       if (!priv->wq) {
+               ret = -ENOMEM;
+               goto out_clk;
+       }
+       INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler);
+       INIT_WORK(&priv->restart_work, mcp251x_restart_work_handler);
+
        priv->spi = spi;
        mutex_init(&priv->mcp_lock);
 
@@ -1417,6 +1413,8 @@ static int mcp251x_can_probe(struct spi_device *spi)
        return 0;
 
 error_probe:
+       destroy_workqueue(priv->wq);
+       priv->wq = NULL;
        mcp251x_power_enable(priv->power, 0);
 
 out_clk:
@@ -1438,6 +1436,9 @@ static int mcp251x_can_remove(struct spi_device *spi)
 
        mcp251x_power_enable(priv->power, 0);
 
+       destroy_workqueue(priv->wq);
+       priv->wq = NULL;
+
        clk_disable_unprepare(priv->clk);
 
        free_candev(net);
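
Moving alloc_workqueue() from open() into probe(), and destroy_workqueue() from stop() into remove(), ties the workqueue's lifetime to the device rather than to the network interface, so a pending restart work item can no longer outlive its queue across an interface down/up cycle. The pairing in outline, again with hypothetical example_* names:

static int example_init_wq(struct example_priv *priv)
{
        priv->wq = alloc_workqueue("example_wq",
                                   WQ_FREEZABLE | WQ_MEM_RECLAIM, 0);
        if (!priv->wq)
                return -ENOMEM;
        INIT_WORK(&priv->tx_work, example_tx_work);
        INIT_WORK(&priv->restart_work, example_restart_work);
        return 0;
}

static void example_destroy_wq(struct example_priv *priv)
{
        destroy_workqueue(priv->wq);    /* drains queued work first */
        priv->wq = NULL;
}
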
index 970dc57..e0ae00e 100644 (file)
@@ -2885,8 +2885,8 @@ static int mcp251xfd_probe(struct spi_device *spi)
 
        clk = devm_clk_get(&spi->dev, NULL);
        if (IS_ERR(clk))
-               dev_err_probe(&spi->dev, PTR_ERR(clk),
-                             "Failed to get Oscillator (clock)!\n");
+               return dev_err_probe(&spi->dev, PTR_ERR(clk),
+                                    "Failed to get Oscillator (clock)!\n");
        freq = clk_get_rate(clk);
 
        /* Sanity check */
@@ -2986,10 +2986,12 @@ static int mcp251xfd_probe(struct spi_device *spi)
 
        err = mcp251xfd_register(priv);
        if (err)
-               goto out_free_candev;
+               goto out_can_rx_offload_del;
 
        return 0;
 
+ out_can_rx_offload_del:
+       can_rx_offload_del(&priv->offload);
  out_free_candev:
        spi->max_speed_hz = priv->spi_max_speed_hz_orig;
 
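
In the first mcp251xfd hunk above, the missing return is the bug: dev_err_probe() logs the failure (demoting -EPROBE_DEFER to a debug message) and hands back the error code it was given, so the idiomatic call site returns its value directly. Without the return, probe carried on with an ERR_PTR-encoded clock. The intended shape, with a hypothetical helper:

        ret = example_get_resource(dev);
        if (ret < 0)
                return dev_err_probe(dev, ret, "failed to get resource\n");
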
index 85ba12a..ea7550d 100644 (file)
@@ -41,6 +41,9 @@ static int ksz8795_spi_probe(struct spi_device *spi)
        int i, ret = 0;
 
        ksz8 = devm_kzalloc(&spi->dev, sizeof(struct ksz8), GFP_KERNEL);
+       if (!ksz8)
+               return -ENOMEM;
+
        ksz8->priv = spi;
 
        dev = ksz_switch_alloc(&spi->dev, ksz8);
index 30d97ea..1129348 100644 (file)
@@ -147,11 +147,14 @@ static int ksz8863_smi_probe(struct mdio_device *mdiodev)
        int i;
 
        ksz8 = devm_kzalloc(&mdiodev->dev, sizeof(struct ksz8), GFP_KERNEL);
+       if (!ksz8)
+               return -ENOMEM;
+
        ksz8->priv = mdiodev;
 
        dev = ksz_switch_alloc(&mdiodev->dev, ksz8);
        if (!dev)
-               return -EINVAL;
+               return -ENOMEM;
 
        for (i = 0; i < ARRAY_SIZE(ksz8863_regmap_config); i++) {
                rc = ksz8863_regmap_config[i];
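
Both ksz hunks add the mandatory NULL check after devm_kzalloc() and correct the ksz_switch_alloc() failure code from -EINVAL to -ENOMEM, since a failed allocation is an out-of-memory condition rather than a bad argument. The canonical shape:

        ksz8 = devm_kzalloc(dev, sizeof(*ksz8), GFP_KERNEL);
        if (!ksz8)
                return -ENOMEM; /* never dereference an unchecked allocation */
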
index 5552997..7965e5e 100644 (file)
@@ -2070,11 +2070,3 @@ static void __exit starfire_cleanup (void)
 
 module_init(starfire_init);
 module_exit(starfire_cleanup);
-
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index 961796a..c1eab91 100644 (file)
@@ -1156,11 +1156,3 @@ static void __exit atarilance_module_exit(void)
 module_init(atarilance_module_init);
 module_exit(atarilance_module_exit);
 #endif /* MODULE */
-
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 4
- * End:
- */
index aa41250..4100ab0 100644 (file)
@@ -3029,10 +3029,3 @@ static void __exit pcnet32_cleanup_module(void)
 
 module_init(pcnet32_init_module);
 module_exit(pcnet32_cleanup_module);
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
index 9e02f88..b3d7433 100644 (file)
@@ -2016,7 +2016,7 @@ static struct pci_driver alx_driver = {
 module_pci_driver(alx_driver);
 MODULE_DEVICE_TABLE(pci, alx_pci_tbl);
 MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
-MODULE_AUTHOR("Qualcomm Corporation, <nic-devel@qualcomm.com>");
+MODULE_AUTHOR("Qualcomm Corporation");
 MODULE_DESCRIPTION(
        "Qualcomm Atheros(R) AR816x/AR817x PCI-E Ethernet Network Driver");
 MODULE_LICENSE("GPL");
index 1d17c24..c6263cf 100644 (file)
@@ -32,7 +32,7 @@ static const struct pci_device_id atl1c_pci_tbl[] = {
 MODULE_DEVICE_TABLE(pci, atl1c_pci_tbl);
 
 MODULE_AUTHOR("Jie Yang");
-MODULE_AUTHOR("Qualcomm Atheros Inc., <nic-devel@qualcomm.com>");
+MODULE_AUTHOR("Qualcomm Atheros Inc.");
 MODULE_DESCRIPTION("Qualcomm Atheros 100/1000M Ethernet Network Driver");
 MODULE_LICENSE("GPL");
 
index 9c2f51f..d21f085 100644 (file)
@@ -1192,7 +1192,6 @@ int bnx2x_iov_init_one(struct bnx2x *bp, int int_mode_param,
                return 0;
        }
 
-       err = -EIO;
        /* verify ari is enabled */
        if (!pci_ari_enabled(bp->pdev->bus)) {
                BNX2X_ERR("ARI not supported (check pci bridge ARI forwarding), SRIOV can not be enabled\n");
index 7e4e831..ba47777 100644 (file)
@@ -1764,7 +1764,7 @@ bnad_dim_timeout(struct timer_list *t)
                }
        }
 
-       /* Check for BNAD_CF_DIM_ENABLED, does not eleminate a race */
+       /* Check for BNAD_CF_DIM_ENABLED, does not eliminate a race */
        if (test_bit(BNAD_RF_DIM_TIMER_RUNNING, &bnad->run_flags))
                mod_timer(&bnad->dim_timer,
                          jiffies + msecs_to_jiffies(BNAD_DIM_TIMER_FREQ));
index 0e94db9..6bc7d41 100644 (file)
@@ -4852,7 +4852,7 @@ static int __maybe_unused macb_suspend(struct device *dev)
 {
        struct net_device *netdev = dev_get_drvdata(dev);
        struct macb *bp = netdev_priv(netdev);
-       struct macb_queue *queue = bp->queues;
+       struct macb_queue *queue;
        unsigned long flags;
        unsigned int q;
        int err;
@@ -4939,7 +4939,7 @@ static int __maybe_unused macb_resume(struct device *dev)
 {
        struct net_device *netdev = dev_get_drvdata(dev);
        struct macb *bp = netdev_priv(netdev);
-       struct macb_queue *queue = bp->queues;
+       struct macb_queue *queue;
        unsigned long flags;
        unsigned int q;
        int err;
index 256fae1..1e5f2ed 100644 (file)
@@ -2563,12 +2563,12 @@ int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc)
        spin_lock_bh(&eosw_txq->lock);
        if (tc != FW_SCHED_CLS_NONE) {
                if (eosw_txq->state != CXGB4_EO_STATE_CLOSED)
-                       goto out_unlock;
+                       goto out_free_skb;
 
                next_state = CXGB4_EO_STATE_FLOWC_OPEN_SEND;
        } else {
                if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE)
-                       goto out_unlock;
+                       goto out_free_skb;
 
                next_state = CXGB4_EO_STATE_FLOWC_CLOSE_SEND;
        }
@@ -2604,17 +2604,19 @@ int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc)
                eosw_txq_flush_pending_skbs(eosw_txq);
 
        ret = eosw_txq_enqueue(eosw_txq, skb);
-       if (ret) {
-               dev_consume_skb_any(skb);
-               goto out_unlock;
-       }
+       if (ret)
+               goto out_free_skb;
 
        eosw_txq->state = next_state;
        eosw_txq->flowc_idx = eosw_txq->pidx;
        eosw_txq_advance(eosw_txq, 1);
        ethofld_xmit(dev, eosw_txq);
 
-out_unlock:
+       spin_unlock_bh(&eosw_txq->lock);
+       return 0;
+
+out_free_skb:
+       dev_consume_skb_any(skb);
        spin_unlock_bh(&eosw_txq->lock);
        return ret;
 }
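
The reworked exits make sure the skb is consumed exactly once under the lock: the success path unlocks and returns 0, while every failure, including the early state checks that previously leaked the skb, funnels through out_free_skb. The unwind pattern in brief (example_step() stands in for each fallible step):

        ret = example_step();
        if (ret)
                goto out_free_skb;

        spin_unlock_bh(&eosw_txq->lock);
        return 0;

out_free_skb:
        dev_consume_skb_any(skb);
        spin_unlock_bh(&eosw_txq->lock);
        return ret;
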
index f48957a..d0a8f71 100644 (file)
@@ -768,7 +768,7 @@ static inline int enic_queue_wq_skb_encap(struct enic *enic, struct vnic_wq *wq,
        return err;
 }
 
-static inline void enic_queue_wq_skb(struct enic *enic,
+static inline int enic_queue_wq_skb(struct enic *enic,
        struct vnic_wq *wq, struct sk_buff *skb)
 {
        unsigned int mss = skb_shinfo(skb)->gso_size;
@@ -814,6 +814,7 @@ static inline void enic_queue_wq_skb(struct enic *enic,
                wq->to_use = buf->next;
                dev_kfree_skb(skb);
        }
+       return err;
 }
 
 /* netif_tx_lock held, process context with BHs disabled, or BH */
@@ -857,7 +858,8 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb,
                return NETDEV_TX_BUSY;
        }
 
-       enic_queue_wq_skb(enic, wq, skb);
+       if (enic_queue_wq_skb(enic, wq, skb))
+               goto error;
 
        if (vnic_wq_desc_avail(wq) < MAX_SKB_FRAGS + ENIC_DESC_MAX_SPLITS)
                netif_tx_stop_queue(txq);
@@ -865,6 +867,7 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb,
        if (!netdev_xmit_more() || netif_xmit_stopped(txq))
                vnic_wq_doorbell(wq);
 
+error:
        spin_unlock(&enic->wq_lock[txq_map]);
 
        return NETDEV_TX_OK;
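
enic_queue_wq_skb() used to free the skb on failure yet return nothing, so enic_hard_start_xmit() still ran the queue-stop and doorbell logic for a packet that was never posted. Propagating the error lets the caller jump straight to the unlock; condensed:

        if (enic_queue_wq_skb(enic, wq, skb))
                goto error;     /* skb already freed by the helper */

        /* reached only when the skb was actually posted */
        if (!netdev_xmit_more() || netif_xmit_stopped(txq))
                vnic_wq_doorbell(wq);
error:
        spin_unlock(&enic->wq_lock[txq_map]);
        return NETDEV_TX_OK;
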
index c21dd11..783fdaf 100644 (file)
@@ -575,8 +575,8 @@ static int hns3_nic_net_stop(struct net_device *netdev)
        if (h->ae_algo->ops->set_timer_task)
                h->ae_algo->ops->set_timer_task(priv->ae_handle, false);
 
-       netif_tx_stop_all_queues(netdev);
        netif_carrier_off(netdev);
+       netif_tx_disable(netdev);
 
        hns3_nic_net_down(netdev);
 
@@ -824,7 +824,7 @@ static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto,
  * and it is udp packet, which has a dest port as the IANA assigned.
  * the hardware is expected to do the checksum offload, but the
  * hardware will not do the checksum offload when udp dest port is
- * 4789 or 6081.
+ * 4789, 4790 or 6081.
  */
 static bool hns3_tunnel_csum_bug(struct sk_buff *skb)
 {
@@ -842,7 +842,8 @@ static bool hns3_tunnel_csum_bug(struct sk_buff *skb)
 
        if (!(!skb->encapsulation &&
              (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) ||
-             l4.udp->dest == htons(GENEVE_UDP_PORT))))
+             l4.udp->dest == htons(GENEVE_UDP_PORT) ||
+             l4.udp->dest == htons(4790))))
                return false;
 
        skb_checksum_help(skb);
@@ -4616,6 +4617,11 @@ static int hns3_reset_notify_up_enet(struct hnae3_handle *handle)
        struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev);
        int ret = 0;
 
+       if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) {
+               netdev_err(kinfo->netdev, "device is not initialized yet\n");
+               return -EFAULT;
+       }
+
        clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state);
 
        if (netif_running(kinfo->netdev)) {
index d252919..8223d69 100644 (file)
@@ -753,8 +753,9 @@ static int hclge_config_igu_egu_hw_err_int(struct hclge_dev *hdev, bool en)
 
        /* configure IGU,EGU error interrupts */
        hclge_cmd_setup_basic_desc(&desc, HCLGE_IGU_COMMON_INT_EN, false);
+       desc.data[0] = cpu_to_le32(HCLGE_IGU_ERR_INT_TYPE);
        if (en)
-               desc.data[0] = cpu_to_le32(HCLGE_IGU_ERR_INT_EN);
+               desc.data[0] |= cpu_to_le32(HCLGE_IGU_ERR_INT_EN);
 
        desc.data[1] = cpu_to_le32(HCLGE_IGU_ERR_INT_EN_MASK);
 
index 608fe26..d647f3c 100644 (file)
@@ -32,7 +32,8 @@
 #define HCLGE_TQP_ECC_ERR_INT_EN_MASK  0x0FFF
 #define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN_MASK    0x0F000000
 #define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN 0x0F000000
-#define HCLGE_IGU_ERR_INT_EN   0x0000066F
+#define HCLGE_IGU_ERR_INT_EN   0x0000000F
+#define HCLGE_IGU_ERR_INT_TYPE 0x00000660
 #define HCLGE_IGU_ERR_INT_EN_MASK      0x000F
 #define HCLGE_IGU_TNL_ERR_INT_EN    0x0002AABF
 #define HCLGE_IGU_TNL_ERR_INT_EN_MASK  0x003F
index c296ab6..6304aed 100644 (file)
@@ -3978,6 +3978,12 @@ static void hclge_update_reset_level(struct hclge_dev *hdev)
        struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
        enum hnae3_reset_type reset_level;
 
+       /* reset request will not be set during reset, so clear
+        * the pending reset request to avoid an unnecessary reset
+        * for the same reason.
+        */
+       hclge_get_reset_level(ae_dev, &hdev->reset_request);
+
        /* if default_reset_request has a higher level reset request,
         * it should be handled as soon as possible. since some errors
         * need this kind of reset to fix.
index 5512ffe..8e5f9dc 100644 (file)
@@ -533,7 +533,7 @@ static void hclge_get_link_mode(struct hclge_vport *vport,
        unsigned long advertising;
        unsigned long supported;
        unsigned long send_data;
-       u8 msg_data[10];
+       u8 msg_data[10] = {};
        u8 dest_vfid;
 
        advertising = hdev->hw.mac.advertising[0];
index 08e88d9..1231c34 100644 (file)
@@ -255,6 +255,8 @@ void hclge_mac_start_phy(struct hclge_dev *hdev)
        if (!phydev)
                return;
 
+       phy_loopback(phydev, false);
+
        phy_start(phydev);
 }
 
index 9067cd3..85d3dd3 100644 (file)
@@ -1144,7 +1144,6 @@ static inline bool i40e_is_sw_dcb(struct i40e_pf *pf)
        return !!(pf->flags & I40E_FLAG_DISABLE_FW_LLDP);
 }
 
-void i40e_set_lldp_forwarding(struct i40e_pf *pf, bool enable);
 #ifdef CONFIG_I40E_DCB
 void i40e_dcbnl_flush_apps(struct i40e_pf *pf,
                           struct i40e_dcbx_config *old_cfg,
index ce626ea..140b677 100644 (file)
@@ -1566,8 +1566,10 @@ enum i40e_aq_phy_type {
        I40E_PHY_TYPE_25GBASE_LR                = 0x22,
        I40E_PHY_TYPE_25GBASE_AOC               = 0x23,
        I40E_PHY_TYPE_25GBASE_ACC               = 0x24,
-       I40E_PHY_TYPE_2_5GBASE_T                = 0x30,
-       I40E_PHY_TYPE_5GBASE_T                  = 0x31,
+       I40E_PHY_TYPE_2_5GBASE_T                = 0x26,
+       I40E_PHY_TYPE_5GBASE_T                  = 0x27,
+       I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS    = 0x30,
+       I40E_PHY_TYPE_5GBASE_T_LINK_STATUS      = 0x31,
        I40E_PHY_TYPE_MAX,
        I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP   = 0xFD,
        I40E_PHY_TYPE_EMPTY                     = 0xFE,
index a2dba32..32f3fac 100644 (file)
@@ -375,6 +375,7 @@ void i40e_client_subtask(struct i40e_pf *pf)
                                clear_bit(__I40E_CLIENT_INSTANCE_OPENED,
                                          &cdev->state);
                                i40e_client_del_instance(pf);
+                               return;
                        }
                }
        }
index 41b813f..67cb0b4 100644 (file)
@@ -1154,8 +1154,8 @@ static enum i40e_media_type i40e_get_media_type(struct i40e_hw *hw)
                break;
        case I40E_PHY_TYPE_100BASE_TX:
        case I40E_PHY_TYPE_1000BASE_T:
-       case I40E_PHY_TYPE_2_5GBASE_T:
-       case I40E_PHY_TYPE_5GBASE_T:
+       case I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS:
+       case I40E_PHY_TYPE_5GBASE_T_LINK_STATUS:
        case I40E_PHY_TYPE_10GBASE_T:
                media = I40E_MEDIA_TYPE_BASET;
                break;
index 040a014..ccd5b94 100644 (file)
@@ -841,8 +841,8 @@ static void i40e_get_settings_link_up(struct i40e_hw *hw,
                                                             10000baseT_Full);
                break;
        case I40E_PHY_TYPE_10GBASE_T:
-       case I40E_PHY_TYPE_5GBASE_T:
-       case I40E_PHY_TYPE_2_5GBASE_T:
+       case I40E_PHY_TYPE_5GBASE_T_LINK_STATUS:
+       case I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS:
        case I40E_PHY_TYPE_1000BASE_T:
        case I40E_PHY_TYPE_100BASE_TX:
                ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
@@ -1409,7 +1409,8 @@ static int i40e_set_fec_cfg(struct net_device *netdev, u8 fec_cfg)
 
                memset(&config, 0, sizeof(config));
                config.phy_type = abilities.phy_type;
-               config.abilities = abilities.abilities;
+               config.abilities = abilities.abilities |
+                                  I40E_AQ_PHY_ENABLE_ATOMIC_LINK;
                config.phy_type_ext = abilities.phy_type_ext;
                config.link_speed = abilities.link_speed;
                config.eee_capability = abilities.eee_capability;
@@ -5281,7 +5282,6 @@ flags_complete:
                        i40e_aq_cfg_lldp_mib_change_event(&pf->hw, false, NULL);
                        i40e_aq_stop_lldp(&pf->hw, true, false, NULL);
                } else {
-                       i40e_set_lldp_forwarding(pf, false);
                        status = i40e_aq_start_lldp(&pf->hw, false, NULL);
                        if (status) {
                                adq_err = pf->hw.aq.asq_last_status;
index c2d145a..704e474 100644 (file)
@@ -6880,40 +6880,6 @@ out:
 #endif /* CONFIG_I40E_DCB */
 
 /**
- * i40e_set_lldp_forwarding - set forwarding of lldp frames
- * @pf: PF being configured
- * @enable: if forwarding to OS shall be enabled
- *
- * Toggle forwarding of lldp frames behavior,
- * When passing DCB control from firmware to software
- * lldp frames must be forwarded to the software based
- * lldp agent.
- */
-void i40e_set_lldp_forwarding(struct i40e_pf *pf, bool enable)
-{
-       if (pf->lan_vsi == I40E_NO_VSI)
-               return;
-
-       if (!pf->vsi[pf->lan_vsi])
-               return;
-
-       /* No need to check the outcome, commands may fail
-        * if desired value is already set
-        */
-       i40e_aq_add_rem_control_packet_filter(&pf->hw, NULL, ETH_P_LLDP,
-                                             I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TX |
-                                             I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC,
-                                             pf->vsi[pf->lan_vsi]->seid, 0,
-                                             enable, NULL, NULL);
-
-       i40e_aq_add_rem_control_packet_filter(&pf->hw, NULL, ETH_P_LLDP,
-                                             I40E_AQC_ADD_CONTROL_PACKET_FLAGS_RX |
-                                             I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC,
-                                             pf->vsi[pf->lan_vsi]->seid, 0,
-                                             enable, NULL, NULL);
-}
-
-/**
  * i40e_print_link_message - print link up or down
  * @vsi: the VSI for which link needs a message
  * @isup: true of link is up, false otherwise
@@ -10736,10 +10702,6 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired)
         */
        i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw,
                                                       pf->main_vsi_seid);
-#ifdef CONFIG_I40E_DCB
-       if (pf->flags & I40E_FLAG_DISABLE_FW_LLDP)
-               i40e_set_lldp_forwarding(pf, true);
-#endif /* CONFIG_I40E_DCB */
 
        /* restart the VSIs that were rebuilt and running before the reset */
        i40e_pf_unquiesce_all_vsi(pf);
@@ -15772,10 +15734,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
         */
        i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw,
                                                       pf->main_vsi_seid);
-#ifdef CONFIG_I40E_DCB
-       if (pf->flags & I40E_FLAG_DISABLE_FW_LLDP)
-               i40e_set_lldp_forwarding(pf, true);
-#endif /* CONFIG_I40E_DCB */
 
        if ((pf->hw.device_id == I40E_DEV_ID_10G_BASE_T) ||
                (pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4))
index 121cd99..de70c16 100644 (file)
@@ -1961,10 +1961,6 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb,
                                 union i40e_rx_desc *rx_desc)
 
 {
-       /* XDP packets use error pointer so abort at this point */
-       if (IS_ERR(skb))
-               return true;
-
        /* ERR_MASK will only have valid bits if EOP set, and
         * what we are doing here is actually checking
         * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
@@ -2534,7 +2530,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
                }
 
                /* exit if we failed to retrieve a buffer */
-               if (!skb) {
+               if (!xdp_res && !skb) {
                        rx_ring->rx_stats.alloc_buff_failed++;
                        rx_buffer->pagecnt_bias++;
                        break;
@@ -2547,7 +2543,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
                if (i40e_is_non_eop(rx_ring, rx_desc))
                        continue;
 
-               if (i40e_cleanup_headers(rx_ring, skb, rx_desc)) {
+               if (xdp_res || i40e_cleanup_headers(rx_ring, skb, rx_desc)) {
                        skb = NULL;
                        continue;
                }
index 5c10faa..c81109a 100644 (file)
@@ -239,11 +239,8 @@ struct i40e_phy_info {
 #define I40E_CAP_PHY_TYPE_25GBASE_ACC BIT_ULL(I40E_PHY_TYPE_25GBASE_ACC + \
                                             I40E_PHY_TYPE_OFFSET)
 /* Offset for 2.5G/5G PHY Types value to bit number conversion */
-#define I40E_PHY_TYPE_OFFSET2 (-10)
-#define I40E_CAP_PHY_TYPE_2_5GBASE_T BIT_ULL(I40E_PHY_TYPE_2_5GBASE_T + \
-                                            I40E_PHY_TYPE_OFFSET2)
-#define I40E_CAP_PHY_TYPE_5GBASE_T BIT_ULL(I40E_PHY_TYPE_5GBASE_T + \
-                                            I40E_PHY_TYPE_OFFSET2)
+#define I40E_CAP_PHY_TYPE_2_5GBASE_T BIT_ULL(I40E_PHY_TYPE_2_5GBASE_T)
+#define I40E_CAP_PHY_TYPE_5GBASE_T BIT_ULL(I40E_PHY_TYPE_5GBASE_T)
 #define I40E_HW_CAP_MAX_GPIO                   30
 /* Capabilities of a PF or a VF or the whole device */
 struct i40e_hw_capabilities {
index 7846a21..1f6bc0c 100644 (file)
@@ -535,6 +535,16 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
        u16 erif_index = 0;
        int err;
 
+       /* Add the eRIF */
+       if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
+               erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
+               err = mr->mr_ops->route_erif_add(mlxsw_sp,
+                                                rve->mr_route->route_priv,
+                                                erif_index);
+               if (err)
+                       return err;
+       }
+
        /* Update the route action, as the new eVIF can be a tunnel or a pimreg
         * device which will require updating the action.
         */
@@ -544,17 +554,7 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
                                                      rve->mr_route->route_priv,
                                                      route_action);
                if (err)
-                       return err;
-       }
-
-       /* Add the eRIF */
-       if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
-               erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
-               err = mr->mr_ops->route_erif_add(mlxsw_sp,
-                                                rve->mr_route->route_priv,
-                                                erif_index);
-               if (err)
-                       goto err_route_erif_add;
+                       goto err_route_action_update;
        }
 
        /* Update the minimum MTU */
@@ -572,14 +572,14 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
        return 0;
 
 err_route_min_mtu_update:
-       if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
-               mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv,
-                                          erif_index);
-err_route_erif_add:
        if (route_action != rve->mr_route->route_action)
                mr->mr_ops->route_action_update(mlxsw_sp,
                                                rve->mr_route->route_priv,
                                                rve->mr_route->route_action);
+err_route_action_update:
+       if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
+               mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv,
+                                          erif_index);
        return err;
 }
 
index 95864f0..f35c03c 100644 (file)
@@ -642,6 +642,7 @@ static void dwmac4_set_filter(struct mac_device_info *hw,
        value &= ~GMAC_PACKET_FILTER_PCF;
        value &= ~GMAC_PACKET_FILTER_PM;
        value &= ~GMAC_PACKET_FILTER_PR;
+       value &= ~GMAC_PACKET_FILTER_RA;
        if (dev->flags & IFF_PROMISC) {
                /* VLAN Tag Filter Fail Packets Queuing */
                if (hw->vlan_fail_q_en) {
index a602d16..5be8e6a 100644 (file)
@@ -232,7 +232,7 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode,
                                       u32 channel, int fifosz, u8 qmode)
 {
        unsigned int rqs = fifosz / 256 - 1;
-       u32 mtl_rx_op, mtl_rx_int;
+       u32 mtl_rx_op;
 
        mtl_rx_op = readl(ioaddr + MTL_CHAN_RX_OP_MODE(channel));
 
@@ -293,11 +293,6 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode,
        }
 
        writel(mtl_rx_op, ioaddr + MTL_CHAN_RX_OP_MODE(channel));
-
-       /* Enable MTL RX overflow */
-       mtl_rx_int = readl(ioaddr + MTL_CHAN_INT_CTRL(channel));
-       writel(mtl_rx_int | MTL_RX_OVERFLOW_INT_EN,
-              ioaddr + MTL_CHAN_INT_CTRL(channel));
 }
 
 static void dwmac4_dma_tx_chan_op_mode(void __iomem *ioaddr, int mode,
index 2cc9175..6d5e0f2 100644 (file)
@@ -564,7 +564,6 @@ struct stmmac_mode_ops {
 #define stmmac_clean_desc3(__priv, __args...) \
        stmmac_do_void_callback(__priv, mode, clean_desc3, __args)
 
-struct stmmac_priv;
 struct tc_cls_u32_offload;
 struct tc_cbs_qopt_offload;
 struct flow_cls_offload;
index a9a984c..345b4c6 100644 (file)
@@ -3180,6 +3180,7 @@ static int stmmac_fpe_start_wq(struct stmmac_priv *priv)
        char *name;
 
        clear_bit(__FPE_TASK_SCHED, &priv->fpe_task_state);
+       clear_bit(__FPE_REMOVING, &priv->fpe_task_state);
 
        name = priv->wq_name;
        sprintf(name, "%s-fpe", priv->dev->name);
@@ -5586,7 +5587,6 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv)
        /* To handle GMAC own interrupts */
        if ((priv->plat->has_gmac) || xmac) {
                int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats);
-               int mtl_status;
 
                if (unlikely(status)) {
                        /* For LPI we need to save the tx status */
@@ -5597,17 +5597,8 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv)
                }
 
                for (queue = 0; queue < queues_count; queue++) {
-                       struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
-
-                       mtl_status = stmmac_host_mtl_irq_status(priv, priv->hw,
-                                                               queue);
-                       if (mtl_status != -EINVAL)
-                               status |= mtl_status;
-
-                       if (status & CORE_IRQ_MTL_RX_OVERFLOW)
-                               stmmac_set_rx_tail_ptr(priv, priv->ioaddr,
-                                                      rx_q->rx_tail_addr,
-                                                      queue);
+                       status = stmmac_host_mtl_irq_status(priv, priv->hw,
+                                                           queue);
                }
 
                /* PCS link status */
index 9f06663..e374079 100644 (file)
@@ -211,8 +211,8 @@ static void gsi_irq_setup(struct gsi *gsi)
        iowrite32(0, gsi->virt + GSI_CNTXT_SRC_IEOB_IRQ_MSK_OFFSET);
 
        /* The inter-EE registers are in the non-adjusted address range */
-       iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_CH_IRQ_OFFSET);
-       iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_EV_CH_IRQ_OFFSET);
+       iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_CH_IRQ_MSK_OFFSET);
+       iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_EV_CH_IRQ_MSK_OFFSET);
 
        iowrite32(0, gsi->virt + GSI_CNTXT_GSI_IRQ_EN_OFFSET);
 }
index b4ac025..cb42c5a 100644 (file)
 #define GSI_EE_REG_ADJUST                      0x0000d000      /* IPA v4.5+ */
 
 /* The two inter-EE IRQ register offsets are relative to gsi->virt_raw */
-#define GSI_INTER_EE_SRC_CH_IRQ_OFFSET \
-                       GSI_INTER_EE_N_SRC_CH_IRQ_OFFSET(GSI_EE_AP)
-#define GSI_INTER_EE_N_SRC_CH_IRQ_OFFSET(ee) \
-                       (0x0000c018 + 0x1000 * (ee))
-
-#define GSI_INTER_EE_SRC_EV_CH_IRQ_OFFSET \
-                       GSI_INTER_EE_N_SRC_EV_CH_IRQ_OFFSET(GSI_EE_AP)
-#define GSI_INTER_EE_N_SRC_EV_CH_IRQ_OFFSET(ee) \
-                       (0x0000c01c + 0x1000 * (ee))
+#define GSI_INTER_EE_SRC_CH_IRQ_MSK_OFFSET \
+                       GSI_INTER_EE_N_SRC_CH_IRQ_MSK_OFFSET(GSI_EE_AP)
+#define GSI_INTER_EE_N_SRC_CH_IRQ_MSK_OFFSET(ee) \
+                       (0x0000c020 + 0x1000 * (ee))
+
+#define GSI_INTER_EE_SRC_EV_CH_IRQ_MSK_OFFSET \
+                       GSI_INTER_EE_N_SRC_EV_CH_IRQ_MSK_OFFSET(GSI_EE_AP)
+#define GSI_INTER_EE_N_SRC_EV_CH_IRQ_MSK_OFFSET(ee) \
+                       (0x0000c024 + 0x1000 * (ee))
 
 /* All other register offsets are relative to gsi->virt */
 
index 0b2cccb..e6721c1 100644 (file)
@@ -1088,6 +1088,38 @@ static int m88e1011_set_tunable(struct phy_device *phydev,
        }
 }
 
+static int m88e1112_config_init(struct phy_device *phydev)
+{
+       int err;
+
+       err = m88e1011_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
+
+       return m88e1111_config_init(phydev);
+}
+
+static int m88e1111gbe_config_init(struct phy_device *phydev)
+{
+       int err;
+
+       err = m88e1111_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
+
+       return m88e1111_config_init(phydev);
+}
+
+static int marvell_1011gbe_config_init(struct phy_device *phydev)
+{
+       int err;
+
+       err = m88e1011_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
+
+       return marvell_config_init(phydev);
+}
 static int m88e1116r_config_init(struct phy_device *phydev)
 {
        int err;
@@ -1168,6 +1200,9 @@ static int m88e1510_config_init(struct phy_device *phydev)
                if (err < 0)
                        return err;
        }
+       err = m88e1011_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
 
        return m88e1318_config_init(phydev);
 }
@@ -1320,6 +1355,9 @@ static int m88e1145_config_init(struct phy_device *phydev)
                if (err < 0)
                        return err;
        }
+       err = m88e1111_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
 
        err = marvell_of_reg_init(phydev);
        if (err < 0)
@@ -2698,7 +2736,7 @@ static struct phy_driver marvell_drivers[] = {
                .name = "Marvell 88E1112",
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = m88e1111_config_init,
+               .config_init = m88e1112_config_init,
                .config_aneg = marvell_config_aneg,
                .config_intr = marvell_config_intr,
                .handle_interrupt = marvell_handle_interrupt,
@@ -2718,7 +2756,7 @@ static struct phy_driver marvell_drivers[] = {
                .name = "Marvell 88E1111",
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = m88e1111_config_init,
+               .config_init = m88e1111gbe_config_init,
                .config_aneg = m88e1111_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -2739,7 +2777,7 @@ static struct phy_driver marvell_drivers[] = {
                .name = "Marvell 88E1111 (Finisar)",
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = m88e1111_config_init,
+               .config_init = m88e1111gbe_config_init,
                .config_aneg = m88e1111_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -2779,7 +2817,7 @@ static struct phy_driver marvell_drivers[] = {
                .driver_data = DEF_MARVELL_HWMON_OPS(m88e1121_hwmon_ops),
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1121_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -2859,7 +2897,7 @@ static struct phy_driver marvell_drivers[] = {
                .name = "Marvell 88E1240",
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = m88e1111_config_init,
+               .config_init = m88e1112_config_init,
                .config_aneg = marvell_config_aneg,
                .config_intr = marvell_config_intr,
                .handle_interrupt = marvell_handle_interrupt,
@@ -2929,7 +2967,7 @@ static struct phy_driver marvell_drivers[] = {
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -2955,7 +2993,7 @@ static struct phy_driver marvell_drivers[] = {
                .probe = marvell_probe,
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3000,7 +3038,7 @@ static struct phy_driver marvell_drivers[] = {
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e6390_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3026,7 +3064,7 @@ static struct phy_driver marvell_drivers[] = {
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e6390_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3052,7 +3090,7 @@ static struct phy_driver marvell_drivers[] = {
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3077,7 +3115,7 @@ static struct phy_driver marvell_drivers[] = {
                .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
                .probe = marvell_probe,
                /* PHY_GBIT_FEATURES */
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3099,7 +3137,7 @@ static struct phy_driver marvell_drivers[] = {
                .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
                .probe = marvell_probe,
                .features = PHY_GBIT_FIBRE_FEATURES,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
index 4d9dc7d..0720f5f 100644 (file)
@@ -415,7 +415,7 @@ static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev)
 
                if (pad > 0) { /* Pad the frame with zeros */
                        if (__skb_pad(skb, pad, false))
-                               goto out;
+                               goto drop;
                        skb_put(skb, pad);
                }
        }
@@ -448,9 +448,8 @@ static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev)
        return NETDEV_TX_OK;
 
 drop:
-       kfree_skb(skb);
-out:
        dev->stats.tx_dropped++;
+       kfree_skb(skb);
        return NETDEV_TX_OK;
 }
 
index 97c2708..51c847d 100644 (file)
@@ -227,6 +227,7 @@ static ssize_t prism2_aux_dump_proc_no_read(struct file *file, char __user *buf,
 
 static const struct proc_ops prism2_aux_dump_proc_ops = {
        .proc_read      = prism2_aux_dump_proc_no_read,
+       .proc_lseek     = default_llseek,
 };
 
 
index 96a03d1..18bd0d9 100644 (file)
@@ -312,11 +312,3 @@ static void __exit orinoco_nortel_exit(void)
 
 module_init(orinoco_nortel_init);
 module_exit(orinoco_nortel_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index f3c86b0..7e3a6dd 100644 (file)
@@ -255,11 +255,3 @@ static void __exit orinoco_pci_exit(void)
 
 module_init(orinoco_pci_init);
 module_exit(orinoco_pci_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index 16dada9..73e6ae1 100644 (file)
@@ -360,11 +360,3 @@ static void __exit orinoco_plx_exit(void)
 
 module_init(orinoco_plx_init);
 module_exit(orinoco_plx_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index 9a9d335..939d5a1 100644 (file)
@@ -235,11 +235,3 @@ static void __exit orinoco_tmd_exit(void)
 
 module_init(orinoco_tmd_init);
 module_exit(orinoco_tmd_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index 41aa1f0..18a267d 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/highmem.h>
 #include <linux/debugfs.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
index 7daac79..ed10a8b 100644 (file)
@@ -8,6 +8,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/hdreg.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
index b6f7815..522c9b2 100644 (file)
@@ -576,6 +576,11 @@ static void nvme_free_ns(struct kref *kref)
        kfree(ns);
 }
 
+static inline bool nvme_get_ns(struct nvme_ns *ns)
+{
+       return kref_get_unless_zero(&ns->kref);
+}
+
 void nvme_put_ns(struct nvme_ns *ns)
 {
        kref_put(&ns->kref, nvme_free_ns);
@@ -584,9 +589,6 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
 
 static inline void nvme_clear_nvme_request(struct request *req)
 {
-       struct nvme_command *cmd = nvme_req(req)->cmd;
-
-       memset(cmd, 0, sizeof(*cmd));
        nvme_req(req)->retries = 0;
        nvme_req(req)->flags = 0;
        req->rq_flags |= RQF_DONTPREP;
@@ -637,6 +639,66 @@ static struct request *nvme_alloc_request_qid(struct request_queue *q,
        return req;
 }
 
+/*
+ * For something we're not in a state to send to the device the default action
+ * is to busy it and retry it after the controller state is recovered.  However,
+ * if the controller is deleting or if anything is marked for failfast or
+ * nvme multipath it is immediately failed.
+ *
+ * Note: commands used to initialize the controller will be marked for failfast.
+ * Note: nvme cli/ioctl commands are marked for failfast.
+ */
+blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
+               struct request *rq)
+{
+       if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
+           ctrl->state != NVME_CTRL_DEAD &&
+           !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
+           !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
+               return BLK_STS_RESOURCE;
+       return nvme_host_path_error(rq);
+}
+EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
+
+bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
+               bool queue_live)
+{
+       struct nvme_request *req = nvme_req(rq);
+
+       /*
+        * currently we have a problem sending passthru commands
+        * on the admin_q if the controller is not LIVE because we can't
+        * make sure that they are going out after the admin connect,
+        * controller enable and/or other commands in the initialization
+        * sequence. Until the controller is LIVE, fail with
+        * BLK_STS_RESOURCE so that they will be rescheduled.
+        */
+       if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
+               return false;
+
+       if (ctrl->ops->flags & NVME_F_FABRICS) {
+               /*
+                * Only allow commands on a live queue, except for the connect
+                * command, which is required to set the queue live in the
+                * appropriate states.
+                */
+               switch (ctrl->state) {
+               case NVME_CTRL_CONNECTING:
+                       if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
+                           req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
+                               return true;
+                       break;
+               default:
+                       break;
+               case NVME_CTRL_DEAD:
+                       return false;
+               }
+       }
+
+       return queue_live;
+}
+EXPORT_SYMBOL_GPL(__nvme_check_ready);
+
 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
 {
        struct nvme_command c;
@@ -898,8 +960,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
        struct nvme_command *cmd = nvme_req(req)->cmd;
        blk_status_t ret = BLK_STS_OK;
 
-       if (!(req->rq_flags & RQF_DONTPREP))
+       if (!(req->rq_flags & RQF_DONTPREP)) {
                nvme_clear_nvme_request(req);
+               memset(cmd, 0, sizeof(*cmd));
+       }
 
        switch (req_op(req)) {
        case REQ_OP_DRV_IN:
@@ -1494,7 +1558,7 @@ static int nvme_ns_open(struct nvme_ns *ns)
        /* should never be called due to GENHD_FL_HIDDEN */
        if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
                goto fail;
-       if (!kref_get_unless_zero(&ns->kref))
+       if (!nvme_get_ns(ns))
                goto fail;
        if (!try_module_get(ns->ctrl->ops->module))
                goto fail_put_ns;
@@ -1999,28 +2063,6 @@ static const struct block_device_operations nvme_bdev_ops = {
        .pr_ops         = &nvme_pr_ops,
 };
 
-#ifdef CONFIG_NVME_MULTIPATH
-struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys)
-{
-       struct nvme_ctrl *ctrl;
-       int ret;
-
-       ret = mutex_lock_killable(&nvme_subsystems_lock);
-       if (ret)
-               return ERR_PTR(ret);
-       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
-               if (ctrl->state == NVME_CTRL_LIVE)
-                       goto found;
-       }
-       mutex_unlock(&nvme_subsystems_lock);
-       return ERR_PTR(-EWOULDBLOCK);
-found:
-       nvme_get_ctrl(ctrl);
-       mutex_unlock(&nvme_subsystems_lock);
-       return ctrl;
-}
-#endif /* CONFIG_NVME_MULTIPATH */
-
 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
 {
        unsigned long timeout =
@@ -3604,7 +3646,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                if (ns->head->ns_id == nsid) {
-                       if (!kref_get_unless_zero(&ns->kref))
+                       if (!nvme_get_ns(ns))
                                continue;
                        ret = ns;
                        break;
index 13c2747..a2bb7fc 100644 (file)
@@ -533,63 +533,6 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
        return NULL;
 }
 
-/*
- * For something we're not in a state to send to the device the default action
- * is to busy it and retry it after the controller state is recovered.  However,
- * if the controller is deleting or if anything is marked for failfast or
- * nvme multipath it is immediately failed.
- *
- * Note: commands used to initialize the controller will be marked for failfast.
- * Note: nvme cli/ioctl commands are marked for failfast.
- */
-blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
-               struct request *rq)
-{
-       if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
-           ctrl->state != NVME_CTRL_DEAD &&
-           !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
-           !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
-               return BLK_STS_RESOURCE;
-       return nvme_host_path_error(rq);
-}
-EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command);
-
-bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
-               bool queue_live)
-{
-       struct nvme_request *req = nvme_req(rq);
-
-       /*
-        * currently we have a problem sending passthru commands
-        * on the admin_q if the controller is not LIVE because we can't
-        * make sure that they are going out after the admin connect,
-        * controller enable and/or other commands in the initialization
-        * sequence. until the controller will be LIVE, fail with
-        * BLK_STS_RESOURCE so that they will be rescheduled.
-        */
-       if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
-               return false;
-
-       /*
-        * Only allow commands on a live queue, except for the connect command,
-        * which is require to set the queue live in the appropinquate states.
-        */
-       switch (ctrl->state) {
-       case NVME_CTRL_CONNECTING:
-               if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
-                   req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
-                       return true;
-               break;
-       default:
-               break;
-       case NVME_CTRL_DEAD:
-               return false;
-       }
-
-       return queue_live;
-}
-EXPORT_SYMBOL_GPL(__nvmf_check_ready);
-
 static const match_table_t opt_tokens = {
        { NVMF_OPT_TRANSPORT,           "transport=%s"          },
        { NVMF_OPT_TRADDR,              "traddr=%s"             },
index 888b108..d7f7974 100644 (file)
@@ -184,20 +184,7 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
 void nvmf_free_options(struct nvmf_ctrl_options *opts);
 int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
 bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
-blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
-               struct request *rq);
-bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
-               bool queue_live);
 bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
                struct nvmf_ctrl_options *opts);
 
-static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
-               bool queue_live)
-{
-       if (likely(ctrl->state == NVME_CTRL_LIVE ||
-                  ctrl->state == NVME_CTRL_DELETING))
-               return true;
-       return __nvmf_check_ready(ctrl, rq, queue_live);
-}
-
 #endif /* _NVME_FABRICS_H */
index 9b9b7be..d9ab9e7 100644 (file)
@@ -2766,8 +2766,8 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
        blk_status_t ret;
 
        if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
-           !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
-               return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+           !nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+               return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
 
        ret = nvme_setup_cmd(ns, rq);
        if (ret)
index 502f8e4..9557ead 100644 (file)
@@ -370,41 +370,45 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 }
 
 #ifdef CONFIG_NVME_MULTIPATH
-static int nvme_ns_head_ctrl_ioctl(struct nvme_ns_head *head,
-               unsigned int cmd, void __user *argp)
+static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
+               void __user *argp, struct nvme_ns_head *head, int srcu_idx)
 {
-       struct nvme_ctrl *ctrl = nvme_find_get_live_ctrl(head->subsys);
+       struct nvme_ctrl *ctrl = ns->ctrl;
        int ret;
 
-       if (IS_ERR(ctrl))
-               return PTR_ERR(ctrl);
-       ret = nvme_ctrl_ioctl(ctrl, cmd, argp);
-       nvme_put_ctrl(ctrl);
-       return ret;
-}
+       nvme_get_ctrl(ns->ctrl);
+       nvme_put_ns_from_disk(head, srcu_idx);
+       ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp);
 
-static int nvme_ns_head_ns_ioctl(struct nvme_ns_head *head,
-               unsigned int cmd, void __user *argp)
-{
-       int srcu_idx = srcu_read_lock(&head->srcu);
-       struct nvme_ns *ns = nvme_find_path(head);
-       int ret = -EWOULDBLOCK;
-
-       if (ns)
-               ret = nvme_ns_ioctl(ns, cmd, argp);
-       srcu_read_unlock(&head->srcu, srcu_idx);
+       nvme_put_ctrl(ctrl);
        return ret;
 }
 
 int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
                unsigned int cmd, unsigned long arg)
 {
-       struct nvme_ns_head *head = bdev->bd_disk->private_data;
+       struct nvme_ns_head *head = NULL;
        void __user *argp = (void __user *)arg;
+       struct nvme_ns *ns;
+       int srcu_idx, ret;
+
+       ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+       if (unlikely(!ns))
+               return -EWOULDBLOCK;
 
+       /*
+        * Handle ioctls that apply to the controller instead of the namespace
+        * separately and drop the ns SRCU reference early.  This avoids a
+        * deadlock when deleting namespaces using the passthrough interface.
+        */
        if (is_ctrl_ioctl(cmd))
-               return nvme_ns_head_ctrl_ioctl(head, cmd, argp);
-       return nvme_ns_head_ns_ioctl(head, cmd, argp);
+               ret = nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+       else {
+               ret = nvme_ns_ioctl(ns, cmd, argp);
+               nvme_put_ns_from_disk(head, srcu_idx);
+       }
+
+       return ret;
 }
 
 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
@@ -414,10 +418,23 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
        struct nvme_ns_head *head =
                container_of(cdev, struct nvme_ns_head, cdev);
        void __user *argp = (void __user *)arg;
+       struct nvme_ns *ns;
+       int srcu_idx, ret;
+
+       srcu_idx = srcu_read_lock(&head->srcu);
+       ns = nvme_find_path(head);
+       if (!ns) {
+               srcu_read_unlock(&head->srcu, srcu_idx);
+               return -EWOULDBLOCK;
+       }
 
        if (is_ctrl_ioctl(cmd))
-               return nvme_ns_head_ctrl_ioctl(head, cmd, argp);
-       return nvme_ns_head_ns_ioctl(head, cmd, argp);
+               return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+
+       ret = nvme_ns_ioctl(ns, cmd, argp);
+       nvme_put_ns_from_disk(head, srcu_idx);
+
+       return ret;
 }
 #endif /* CONFIG_NVME_MULTIPATH */
 
index 0d0de34..0551796 100644 (file)
@@ -70,6 +70,7 @@ void nvme_failover_req(struct request *req)
        struct nvme_ns *ns = req->q->queuedata;
        u16 status = nvme_req(req)->status & 0x7ff;
        unsigned long flags;
+       struct bio *bio;
 
        nvme_mpath_clear_current_path(ns);
 
@@ -84,6 +85,8 @@ void nvme_failover_req(struct request *req)
        }
 
        spin_lock_irqsave(&ns->head->requeue_lock, flags);
+       for (bio = req->bio; bio; bio = bio->bi_next)
+               bio_set_dev(bio, ns->head->disk->part0);
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 
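
Before blk_steal_bios() moves the failed request's bios onto the head's requeue list, each bio is re-pointed at the multipath node's own gendisk; the bios otherwise keep the bd_disk of the path they first went down, so a requeued bio could be resubmitted against a path device that is going away. Annotated:

        for (bio = req->bio; bio; bio = bio->bi_next)
                bio_set_dev(bio, ns->head->disk->part0);  /* retarget at the
                                                           * multipath node */
        blk_steal_bios(&ns->head->requeue_list, req);
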
index 773dde5..05f31a2 100644 (file)
@@ -638,6 +638,21 @@ struct request *nvme_alloc_request(struct request_queue *q,
                struct nvme_command *cmd, blk_mq_req_flags_t flags);
 void nvme_cleanup_cmd(struct request *req);
 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req);
+blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
+               struct request *req);
+bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
+               bool queue_live);
+
+static inline bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
+               bool queue_live)
+{
+       if (likely(ctrl->state == NVME_CTRL_LIVE))
+               return true;
+       if (ctrl->ops->flags & NVME_F_FABRICS &&
+           ctrl->state == NVME_CTRL_DELETING)
+               return true;
+       return __nvme_check_ready(ctrl, rq, queue_live);
+}
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
                void *buf, unsigned bufflen);
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
@@ -664,7 +679,6 @@ struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
 void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
 bool nvme_tryget_ns_head(struct nvme_ns_head *head);
 void nvme_put_ns_head(struct nvme_ns_head *head);
-struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys);
 int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
                const struct file_operations *fops, struct module *owner);
 void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device);
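
With the ready-check helpers moved from fabrics.c into the core, nvme_check_ready() keeps the common case inline: a LIVE controller passes immediately, a DELETING controller passes only for fabrics transports (which must still flush I/O during deletion), and everything else drops into the out-of-line __nvme_check_ready(). This is also what lets the PCIe driver, which never linked against the fabrics module, gain the same check in nvme_queue_rq(). The call pattern the transports below share:

        if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
                return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
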
index 09d4c5f..a29b170 100644 (file)
@@ -933,6 +933,9 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
        if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
                return BLK_STS_IOERR;
 
+       if (!nvme_check_ready(&dev->ctrl, req, true))
+               return nvme_fail_nonready_command(&dev->ctrl, req);
+
        ret = nvme_setup_cmd(ns, req);
        if (ret)
                return ret;
index 660c774..37943dc 100644 (file)
@@ -2050,8 +2050,8 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
        WARN_ON_ONCE(rq->tag < 0);
 
-       if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
-               return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+       if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+               return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
 
        dev = queue->device->dev;
 
index 75435cd..0222e23 100644 (file)
@@ -2338,8 +2338,8 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
        bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
        blk_status_t ret;
 
-       if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
-               return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+       if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+               return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
 
        ret = nvme_tcp_setup_cmd_pdu(ns, rq);
        if (unlikely(ret))
index d2a26ff..e7a367c 100644 (file)
@@ -307,7 +307,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
        case NVME_LOG_ANA:
                return nvmet_execute_get_log_page_ana(req);
        }
-       pr_err("unhandled lid %d on qid %d\n",
+       pr_debug("unhandled lid %d on qid %d\n",
               req->cmd->get_log_page.lid, req->sq->qid);
        req->error_loc = offsetof(struct nvme_get_log_page_command, lid);
        nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR);
@@ -659,7 +659,7 @@ static void nvmet_execute_identify(struct nvmet_req *req)
                return nvmet_execute_identify_desclist(req);
        }
 
-       pr_err("unhandled identify cns %d on qid %d\n",
+       pr_debug("unhandled identify cns %d on qid %d\n",
               req->cmd->identify.cns, req->sq->qid);
        req->error_loc = offsetof(struct nvme_identify, cns);
        nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR);
@@ -977,7 +977,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
                return 0;
        }
 
-       pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+       pr_debug("unhandled cmd %d on qid %d\n", cmd->common.opcode,
               req->sq->qid);
        req->error_loc = offsetof(struct nvme_common_command, opcode);
        return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
index 6665da3..74b3b15 100644
@@ -138,8 +138,8 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
        bool queue_ready = test_bit(NVME_LOOP_Q_LIVE, &queue->flags);
        blk_status_t ret;
 
-       if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready))
-               return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req);
+       if (!nvme_check_ready(&queue->ctrl->ctrl, req, queue_ready))
+               return nvme_fail_nonready_command(&queue->ctrl->ctrl, req);
 
        ret = nvme_setup_cmd(ns, req);
        if (ret)
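
The four queue_rq hunks above share one pattern: the fabrics-only nvmf_check_ready()/nvmf_fail_nonready_command() helpers become generic nvme_*() ones used by the RDMA, TCP and loop transports, and the PCIe path gains the same guard. A minimal sketch of that guard, assuming stand-in types (example_queue_rq() is hypothetical; only the two helper names come from the diff):

#include <stdbool.h>

typedef int blk_status_t;		/* stand-in for the kernel type */
#define BLK_STS_OK 0

struct nvme_ctrl;
struct request;

bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
		      bool queue_live);
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
					struct request *rq);

static blk_status_t example_queue_rq(struct nvme_ctrl *ctrl,
				     struct request *rq, bool queue_ready)
{
	/* fail or requeue early instead of issuing to a non-live queue */
	if (!nvme_check_ready(ctrl, rq, queue_ready))
		return nvme_fail_nonready_command(ctrl, rq);

	/* ...normal command setup and submission follows... */
	return BLK_STS_OK;
}
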
index 180c6fb..d80160c 100644
@@ -1024,7 +1024,6 @@ int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size,
        struct device_node *overlay_root = NULL;
 
        *ovcs_id = 0;
-       ret = 0;
 
        if (overlay_fdt_size < sizeof(struct fdt_header) ||
            fdt_check_header(overlay_fdt)) {
@@ -1195,8 +1194,6 @@ int of_overlay_remove(int *ovcs_id)
        struct overlay_changeset *ovcs;
        int ret, ret_apply, ret_tmp;
 
-       ret = 0;
-
        if (devicetree_corrupt()) {
                pr_err("suspect devicetree state, refuse to remove overlay\n");
                ret = -EBUSY;
index 48b084e..0919ed9 100644
@@ -2224,15 +2224,3 @@ MODULE_PARM_DESC(features,
                 ", bit 2: hardware SPP mode"
                 ", bit 3: hardware EPP mode"
                 ", bit 4: hardware ECP mode");
-
-/*--- Inform (X)Emacs about preferred coding style ---------------------*/
-/*
- * Local Variables:
- * mode: c
- * c-file-style: "linux"
- * indent-tabs-mode: t
- * tab-width: 8
- * fill-column: 78
- * ispell-local-dictionary: "american"
- * End:
- */
index f8f056b..0148687 100644
@@ -35,7 +35,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot)
                return rc;
        zdev->state = ZPCI_FN_STATE_CONFIGURED;
 
-       return zpci_configure_device(zdev, zdev->fh);
+       return zpci_scan_configured_device(zdev, zdev->fh);
 }
 
 static int disable_slot(struct hotplug_slot *hotplug_slot)
index e693910..948b763 100644
@@ -75,7 +75,7 @@ void release_cis_mem(struct pcmcia_socket *s)
        mutex_unlock(&s->ops_mutex);
 }
 
-/**
+/*
  * set_cis_map() - map the card memory at "card_offset" into virtual space.
  *
  * If flags & MAP_ATTRIB, map the attribute space, otherwise
@@ -126,7 +126,7 @@ static void __iomem *set_cis_map(struct pcmcia_socket *s,
 #define IS_ATTR                1
 #define IS_INDIRECT    8
 
-/**
+/*
  * pcmcia_read_cis_mem() - low-level function to read CIS memory
  *
  * must be called with ops_mutex held
@@ -206,7 +206,7 @@ int pcmcia_read_cis_mem(struct pcmcia_socket *s, int attr, u_int addr,
 }
 
 
-/**
+/*
  * pcmcia_write_cis_mem() - low-level function to write CIS memory
  *
  * Probably only useful for writing one-byte registers. Must be called
@@ -277,7 +277,7 @@ int pcmcia_write_cis_mem(struct pcmcia_socket *s, int attr, u_int addr,
 }
 
 
-/**
+/*
  * read_cis_cache() - read CIS memory or its associated cache
  *
  * This is a wrapper around read_cis_mem, with the same interface,
@@ -365,7 +365,7 @@ void destroy_cis_cache(struct pcmcia_socket *s)
        }
 }
 
-/**
+/*
  * verify_cis_cache() - does the CIS match what is in the CIS cache?
  */
 int verify_cis_cache(struct pcmcia_socket *s)
@@ -401,7 +401,7 @@ int verify_cis_cache(struct pcmcia_socket *s)
        return 0;
 }
 
-/**
+/*
  * pcmcia_replace_cis() - use a replacement CIS instead of the card's CIS
  *
  * For really bad cards, we provide a facility for uploading a
index 7211490..bd81aa6 100644
@@ -83,7 +83,7 @@ struct pcmcia_dynid {
 };
 
 /**
- * pcmcia_store_new_id - add a new PCMCIA device ID to this driver and re-probe devices
+ * new_id_store() - add a new PCMCIA device ID to this driver and re-probe devices
  * @driver: target device driver
  * @buf: buffer for scanning device ID data
  * @count: input size
@@ -371,9 +371,6 @@ static int pcmcia_device_remove(struct device *dev)
                pcmcia_card_remove(p_dev->socket, p_dev);
 
        /* detach the "instance" */
-       if (!p_drv)
-               return 0;
-
        if (p_drv->remove)
                p_drv->remove(p_dev);
 
@@ -389,7 +386,7 @@ static int pcmcia_device_remove(struct device *dev)
                                 "pcmcia: driver %s did not release window properly\n",
                                 p_drv->name);
 
-       /* references from pcmcia_probe_device */
+       /* references from pcmcia_device_probe */
        pcmcia_put_dev(p_dev);
        module_put(p_drv->owner);
 
index e4c4daf..d2d0ed4 100644
@@ -122,7 +122,7 @@ next_entry:
 }
 
 
-/**
+/*
  * pcmcia_io_cfg_data_width() - convert cfgtable to data path width parameter
  */
 static int pcmcia_io_cfg_data_width(unsigned int flags)
@@ -143,7 +143,7 @@ struct pcmcia_cfg_mem {
        cistpl_cftable_entry_t dflt;
 };
 
-/**
+/*
  * pcmcia_do_loop_config() - internal helper for pcmcia_loop_config()
  *
  * pcmcia_do_loop_config() is the internal callback for the call from
@@ -289,7 +289,7 @@ struct pcmcia_loop_mem {
                           void *priv_data);
 };
 
-/**
+/*
  * pcmcia_do_loop_tuple() - internal helper for pcmcia_loop_config()
  *
  * pcmcia_do_loop_tuple() is the internal callback for the call from
@@ -337,7 +337,7 @@ struct pcmcia_loop_get {
        cisdata_t **buf;
 };
 
-/**
+/*
  * pcmcia_do_get_tuple() - internal helper for pcmcia_get_tuple()
  *
  * pcmcia_do_get_tuple() is the internal callback for the call from
@@ -386,7 +386,7 @@ size_t pcmcia_get_tuple(struct pcmcia_device *p_dev, cisdata_t code,
 EXPORT_SYMBOL(pcmcia_get_tuple);
 
 
-/**
+/*
  * pcmcia_do_get_mac() - internal helper for pcmcia_get_mac_from_cis()
  *
  * pcmcia_do_get_mac() is the internal callback for the call from
index e3a6b6c..c1c1972 100644
@@ -144,7 +144,7 @@ static int alloc_io_space(struct pcmcia_socket *s, struct resource *res,
 }
 
 
-/**
+/*
  * pcmcia_access_config() - read or write card configuration registers
  *
  * pcmcia_access_config() reads and writes configuration registers in
@@ -184,7 +184,7 @@ static int pcmcia_access_config(struct pcmcia_device *p_dev,
 }
 
 
-/**
+/*
  * pcmcia_read_config_byte() - read a byte from a card configuration register
  *
  * pcmcia_read_config_byte() reads a byte from a configuration register in
@@ -197,7 +197,7 @@ int pcmcia_read_config_byte(struct pcmcia_device *p_dev, off_t where, u8 *val)
 EXPORT_SYMBOL(pcmcia_read_config_byte);
 
 
-/**
+/*
  * pcmcia_write_config_byte() - write a byte to a card configuration register
  *
  * pcmcia_write_config_byte() writes a byte to a configuration register in
@@ -720,7 +720,8 @@ static irqreturn_t test_action(int cpl, void *dev_id)
 
 /**
  * pcmcia_setup_isa_irq() - determine whether an ISA IRQ can be used
- * @p_dev - the associated PCMCIA device
+ * @p_dev: the associated PCMCIA device
+ * @type:  IRQ type (flags)
  *
  * locking note: must be called with ops_mutex locked.
  */
@@ -785,7 +786,7 @@ void pcmcia_cleanup_irq(struct pcmcia_socket *s)
 
 /**
  * pcmcia_setup_irq() - determine IRQ to be used for device
- * @p_dev - the associated PCMCIA device
+ * @p_dev: the associated PCMCIA device
  *
  * locking note: must be called with ops_mutex locked.
  */
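
The string of /** to /* conversions above, together with the @p_dev fixes, is one cleanup: a comment opening with /** is claimed by kernel-doc, which warns (under make W=1) unless the body carries a "name() - summary" line and "@arg:" descriptions. Internal helpers are therefore demoted to plain comments, while genuine kernel-doc such as pcmcia_setup_isa_irq() gets the colon syntax. A hedged sketch of the two forms (example_* names are hypothetical):

struct device;

/**
 * example_enable() - power up an example device
 * @dev:   device to operate on (colon syntax, not "@dev - ...")
 * @flags: behaviour modifiers
 *
 * Return: 0 on success or a negative errno.
 */
int example_enable(struct device *dev, unsigned int flags);

/*
 * Internal helper: a single "*" keeps kernel-doc away, so an
 * unstructured comment body no longer warns.
 */
static int example_helper(void);
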
index 3b05760..bb15a8b 100644
@@ -257,7 +257,7 @@ static void do_io_probe(struct pcmcia_socket *s, unsigned int base,
 
 /*======================================================================*/
 
-/**
+/*
  * readable() - iomem validation function for cards with a valid CIS
  */
 static int readable(struct pcmcia_socket *s, struct resource *res,
@@ -288,7 +288,7 @@ static int readable(struct pcmcia_socket *s, struct resource *res,
        return 0;
 }
 
-/**
+/*
  * checksum() - iomem validation function for simple memory cards
  */
 static int checksum(struct pcmcia_socket *s, struct resource *res,
@@ -343,9 +343,9 @@ static int checksum(struct pcmcia_socket *s, struct resource *res,
  */
 static int do_validate_mem(struct pcmcia_socket *s,
                           unsigned long base, unsigned long size,
-                          int validate (struct pcmcia_socket *s,
-                                        struct resource *res,
-                                        unsigned int *value))
+                          int (*validate)(struct pcmcia_socket *s,
+                                          struct resource *res,
+                                          unsigned int *value))
 {
        struct socket_data *s_data = s->resource_data;
        struct resource *res1, *res2;
@@ -398,12 +398,12 @@ static int do_validate_mem(struct pcmcia_socket *s,
  * function returns the size of the usable memory area.
  */
 static int do_mem_probe(struct pcmcia_socket *s, u_long base, u_long num,
-                       int validate (struct pcmcia_socket *s,
-                                     struct resource *res,
-                                     unsigned int *value),
-                       int fallback (struct pcmcia_socket *s,
-                                     struct resource *res,
-                                     unsigned int *value))
+                       int (*validate)(struct pcmcia_socket *s,
+                                       struct resource *res,
+                                       unsigned int *value),
+                       int (*fallback)(struct pcmcia_socket *s,
+                                       struct resource *res,
+                                       unsigned int *value))
 {
        struct socket_data *s_data = s->resource_data;
        u_long i, j, bad, fail, step;
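
The do_validate_mem()/do_mem_probe() change above replaces parameters declared with plain function type, "int validate (...)", by explicit pointer-to-function declarators. C adjusts the former to a pointer anyway, but the explicit form matches how the arguments are passed and is the kernel's preferred spelling. A standalone illustration (apply() and is_even() are made up):

#include <stdio.h>

/* explicit function-pointer parameter, as in the fixed signatures */
static int apply(int (*validate)(int value), int value)
{
	return validate ? validate(value) : -1;
}

static int is_even(int value)
{
	return value % 2 == 0;
}

int main(void)
{
	printf("%d %d\n", apply(is_even, 4), apply(is_even, 5));
	return 0;
}
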
index 651a36b..983ba98 100644
@@ -11,6 +11,7 @@
 #include <linux/gpio/driver.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
 #include <linux/of_device.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
@@ -3854,6 +3855,8 @@ static int __init ingenic_pinctrl_probe(struct platform_device *pdev)
        return 0;
 }
 
+#define IF_ENABLED(cfg, ptr)   PTR_IF(IS_ENABLED(cfg), (ptr))
+
 static const struct of_device_id ingenic_pinctrl_of_match[] = {
        {
                .compatible = "ingenic,jz4730-pinctrl",
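
The new IF_ENABLED() wrapper above builds on PTR_IF() (supplied by the freshly included linux/kernel.h): it keeps the pointer when the Kconfig symbol is built in and degrades to NULL otherwise, so the of_device_id table can carry per-SoC match data without #ifdef blocks. A userspace emulation, with the CONFIG_ values faked for the example:

#include <stdio.h>

#define IS_ENABLED(cfg)		cfg	/* normally derived from Kconfig */
#define PTR_IF(cond, ptr)	((cond) ? (ptr) : NULL)
#define IF_ENABLED(cfg, ptr)	PTR_IF(IS_ENABLED(cfg), (ptr))

#define CONFIG_MACH_JZ4730	1	/* pretend: enabled */
#define CONFIG_MACH_JZ4740	0	/* pretend: disabled */

static const char jz4730_data[] = "jz4730 pinctrl data";
static const char jz4740_data[] = "jz4740 pinctrl data";

int main(void)
{
	const char *a = IF_ENABLED(CONFIG_MACH_JZ4730, jz4730_data);
	const char *b = IF_ENABLED(CONFIG_MACH_JZ4740, jz4740_data);

	printf("%s / %s\n", a ? a : "NULL", b ? b : "NULL");
	return 0;
}
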
index 9035b17..bbc2884 100644
@@ -14,7 +14,7 @@
  * This mutex must be held while accessing the EMI unit. We can't rely on the
  * EC mutex because memmap data may be accessed without it being held.
  */
-static struct mutex io_mutex;
+static DEFINE_MUTEX(io_mutex);
 static u16 mec_emi_base, mec_emi_end;
 
 /**
@@ -142,7 +142,6 @@ EXPORT_SYMBOL(cros_ec_lpc_io_bytes_mec);
 
 void cros_ec_lpc_mec_init(unsigned int base, unsigned int end)
 {
-       mutex_init(&io_mutex);
        mec_emi_base = base;
        mec_emi_end = end;
 }
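
DEFINE_MUTEX() initializes the lock at compile time, which is why cros_ec_lpc_mec_init() can drop its mutex_init() call: the mutex is valid even if cros_ec_lpc_io_bytes_mec() runs before the init function does. Kernel-style sketch (example_* names are hypothetical):

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);	/* valid before any code runs */

static void example_access(void)
{
	mutex_lock(&example_mutex);
	/* ...touch the shared EMI window... */
	mutex_unlock(&example_mutex);
}
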
index 0811562..27c068c 100644
@@ -58,6 +58,7 @@ struct cros_typec_port {
        /* Variables keeping track of switch state. */
        struct typec_mux_state state;
        uint8_t mux_flags;
+       uint8_t role;
 
        /* Port alt modes. */
        struct typec_altmode p_altmode[CROS_EC_ALTMODE_MAX];
@@ -220,6 +221,9 @@ static void cros_typec_remove_partner(struct cros_typec_data *typec,
 {
        struct cros_typec_port *port = typec->ports[port_num];
 
+       if (!port->partner)
+               return;
+
        cros_typec_unregister_altmodes(typec, port_num, true);
 
        cros_typec_usb_disconnect_state(port);
@@ -235,6 +239,9 @@ static void cros_typec_remove_cable(struct cros_typec_data *typec,
 {
        struct cros_typec_port *port = typec->ports[port_num];
 
+       if (!port->cable)
+               return;
+
        cros_typec_unregister_altmodes(typec, port_num, false);
 
        typec_unregister_plug(port->plug);
@@ -253,11 +260,8 @@ static void cros_unregister_ports(struct cros_typec_data *typec)
                if (!typec->ports[i])
                        continue;
 
-               if (typec->ports[i]->partner)
-                       cros_typec_remove_partner(typec, i);
-
-               if (typec->ports[i]->cable)
-                       cros_typec_remove_cable(typec, i);
+               cros_typec_remove_partner(typec, i);
+               cros_typec_remove_cable(typec, i);
 
                usb_role_switch_put(typec->ports[i]->role_sw);
                typec_switch_put(typec->ports[i]->ori_sw);
@@ -483,6 +487,11 @@ static int cros_typec_enable_dp(struct cros_typec_data *typec,
                return -ENOTSUPP;
        }
 
+       if (!pd_ctrl->dp_mode) {
+               dev_err(typec->dev, "No valid DP mode provided.\n");
+               return -EINVAL;
+       }
+
        /* Status VDO. */
        dp_data.status = DP_STATUS_ENABLED;
        if (port->mux_flags & USB_PD_MUX_HPD_IRQ)
@@ -647,11 +656,8 @@ static void cros_typec_set_port_params_v1(struct cros_typec_data *typec,
                                 "Failed to register partner on port: %d\n",
                                 port_num);
        } else {
-               if (typec->ports[port_num]->partner)
-                       cros_typec_remove_partner(typec, port_num);
-
-               if (typec->ports[port_num]->cable)
-                       cros_typec_remove_cable(typec, port_num);
+               cros_typec_remove_partner(typec, port_num);
+               cros_typec_remove_cable(typec, port_num);
        }
 }
 
@@ -905,6 +911,19 @@ static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num
                return;
        }
 
+       /* If we got a hard reset, unregister everything and return. */
+       if (resp.events & PD_STATUS_EVENT_HARD_RESET) {
+               cros_typec_remove_partner(typec, port_num);
+               cros_typec_remove_cable(typec, port_num);
+
+               ret = cros_typec_send_clear_event(typec, port_num,
+                                                 PD_STATUS_EVENT_HARD_RESET);
+               if (ret < 0)
+                       dev_warn(typec->dev,
+                                "Failed hard reset event clear, port: %d\n", port_num);
+               return;
+       }
+
        /* Handle any events appropriately. */
        if (resp.events & PD_STATUS_EVENT_SOP_DISC_DONE && !typec->ports[port_num]->sop_disc_done) {
                u16 sop_revision;
@@ -995,10 +1014,12 @@ static int cros_typec_port_update(struct cros_typec_data *typec, int port_num)
        }
 
        /* No change needs to be made, let's exit early. */
-       if (typec->ports[port_num]->mux_flags == mux_resp.flags)
+       if (typec->ports[port_num]->mux_flags == mux_resp.flags &&
+           typec->ports[port_num]->role == resp.role)
                return 0;
 
        typec->ports[port_num]->mux_flags = mux_resp.flags;
+       typec->ports[port_num]->role = resp.role;
        ret = cros_typec_configure_mux(typec, port_num, mux_resp.flags, &resp);
        if (ret)
                dev_warn(typec->dev, "Configure muxes failed, err = %d\n", ret);
@@ -1027,8 +1048,8 @@ static int cros_typec_get_cmd_version(struct cros_typec_data *typec)
        else
                typec->pd_ctrl_ver = 0;
 
-       dev_dbg(typec->dev, "PD Control has version mask 0x%hhx\n",
-               typec->pd_ctrl_ver);
+       dev_dbg(typec->dev, "PD Control has version mask 0x%02x\n",
+               typec->pd_ctrl_ver & 0xff);
 
        return 0;
 }
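
Several of the cros_ec_typec hunks above are one refactor: the !port->partner and !port->cable guards move into cros_typec_remove_partner()/cros_typec_remove_cable() themselves, so cros_unregister_ports(), the set_port_params path and the new PD_STATUS_EVENT_HARD_RESET handler can all call them unconditionally. A sketch of that guard-in-callee shape, with simplified types:

struct example_port {
	void *partner;
	void *cable;
};

static void example_remove_partner(struct example_port *port)
{
	if (!port->partner)	/* idempotent: safe to call blindly */
		return;
	/* ...unregister alt modes, tear down state... */
	port->partner = NULL;
}

static void example_handle_hard_reset(struct example_port *port)
{
	/* callers no longer carry their own NULL checks */
	example_remove_partner(port);
}
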
index 7f36142..48a6617 100644
@@ -220,7 +220,8 @@ static int cros_usbpd_notify_plat(struct notifier_block *nb,
        if (!host_event)
                return NOTIFY_DONE;
 
-       if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU)) {
+       if (host_event & (EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU) |
+                         EC_HOST_EVENT_MASK(EC_HOST_EVENT_USB_MUX))) {
                cros_usbpd_get_event_and_notify(pdnotify->dev, ec_dev);
                return NOTIFY_OK;
        }
index e06d96f..60da7a2 100644
@@ -256,7 +256,7 @@ static int telem_open(struct inode *inode, struct file *filp)
        sess_data->dev_data = dev_data;
        sess_data->has_msg = false;
 
-       nonseekable_open(inode, filp);
+       stream_open(inode, filp);
        filp->private_data = sess_data;
 
        return 0;
index 03c3ff3..085ad0a 100644
@@ -675,6 +675,3 @@ static __exit void dcdrbu_exit(void)
 
 module_exit(dcdrbu_exit);
 module_init(dcdrbu_init);
-
-/* vim:noet:ts=8:sw=8
-*/
index ca24a78..7365121 100644
@@ -52,7 +52,7 @@
 #define DASD_ECKD_CCW_RCD               0xFA
 #define DASD_ECKD_CCW_DSO               0xF7
 
-/* Define Subssystem Function / Orders */
+/* Define Subsystem Function / Orders */
 #define DSO_ORDER_RAS                   0x81
 
 /*
 #define DASD_ECKD_PG_GROUPED            0x10
 
 /*
- * Size that is reportet for large volumes in the old 16-bit no_cyl field
+ * Size that is reported for large volumes in the old 16-bit no_cyl field
  */
 #define LV_COMPAT_CYL 0xFFFE
 
@@ -555,7 +555,7 @@ struct dasd_dso_ras_ext_range {
 } __packed;
 
 /*
- * Define Subsytem Operation - Release Allocated Space
+ * Define Subsystem Operation - Release Allocated Space
  */
 struct dasd_dso_ras_data {
        __u8 order;
@@ -676,7 +676,7 @@ struct dasd_eckd_private {
        struct dasd_ext_pool_sum eps;
        u32 real_cyl;
 
-       /* alias managemnet */
+       /* alias management */
        struct dasd_uid uid;
        struct alias_pav_group *pavgroup;
        struct alias_lcu *lcu;
index 3f02602..84f659c 100644
@@ -1532,8 +1532,7 @@ static int io_subchannel_sch_event(struct subchannel *sch, int process)
        switch (action) {
        case IO_SCH_ORPH_UNREG:
        case IO_SCH_UNREG:
-               if (!cdev)
-                       css_sch_device_unregister(sch);
+               css_sch_device_unregister(sch);
                break;
        case IO_SCH_ORPH_ATTACH:
        case IO_SCH_UNREG_ATTACH:
index ab42fea..77ccb96 100644
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* NCR (or Symbios) 53c700 and 53c700-66 Driver
  *
index c9f8c49..2df347c 100644
@@ -1,5 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* Driver for 53c700 and 53c700-66 chips from NCR and Symbios
  *
index cb74ab1..9b89c26 100644
@@ -1058,9 +1058,3 @@ static void __exit exit_ch_module(void)
 
 module_init(init_ch_module);
 module_exit(exit_ch_module);
-
-/*
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
index 5d9eeac..45ec9f1 100644
@@ -616,6 +616,7 @@ static const struct file_operations esas2r_proc_fops = {
 };
 
 static const struct proc_ops esas2r_proc_ops = {
+       .proc_lseek             = default_llseek,
        .proc_ioctl             = esas2r_proc_ioctl,
 #ifdef CONFIG_COMPAT
        .proc_compat_ioctl      = compat_ptr_ioctl,
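
The esas2r hunk restores lseek() on the driver's proc file: struct proc_ops, unlike the old file_operations path, supplies no implicit llseek, so seeking fails unless a handler is named explicitly. A hedged sketch (example_proc_ops is hypothetical):

#include <linux/fs.h>
#include <linux/proc_fs.h>

static const struct proc_ops example_proc_ops = {
	.proc_lseek = default_llseek,	/* restore ordinary f_pos seeking */
	/* .proc_open, .proc_ioctl, ... as the driver provides */
};
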
index e619a82..762cc8b 100644
@@ -102,7 +102,7 @@ static const char *fnic_fcpio_status_to_str(unsigned int status)
        return fcpio_status_str[status];
 }
 
-static void fnic_cleanup_io(struct fnic *fnic, int exclude_id);
+static void fnic_cleanup_io(struct fnic *fnic);
 
 static inline spinlock_t *fnic_io_lock_hash(struct fnic *fnic,
                                            struct scsi_cmnd *sc)
@@ -638,7 +638,7 @@ static int fnic_fcpio_fw_reset_cmpl_handler(struct fnic *fnic,
        atomic64_inc(&reset_stats->fw_reset_completions);
 
        /* Clean up all outstanding io requests */
-       fnic_cleanup_io(fnic, SCSI_NO_TAG);
+       fnic_cleanup_io(fnic);
 
        atomic64_set(&fnic->fnic_stats.fw_stats.active_fw_reqs, 0);
        atomic64_set(&fnic->fnic_stats.io_stats.active_ios, 0);
@@ -1361,93 +1361,90 @@ int fnic_wq_copy_cmpl_handler(struct fnic *fnic, int copy_work_to_do)
        return wq_work_done;
 }
 
-static void fnic_cleanup_io(struct fnic *fnic, int exclude_id)
+static bool fnic_cleanup_io_iter(struct scsi_cmnd *sc, void *data,
+                                bool reserved)
 {
-       int i;
+       struct fnic *fnic = data;
        struct fnic_io_req *io_req;
        unsigned long flags = 0;
-       struct scsi_cmnd *sc;
        spinlock_t *io_lock;
        unsigned long start_time = 0;
        struct fnic_stats *fnic_stats = &fnic->fnic_stats;
 
-       for (i = 0; i < fnic->fnic_max_tag_id; i++) {
-               if (i == exclude_id)
-                       continue;
-
-               io_lock = fnic_io_lock_tag(fnic, i);
-               spin_lock_irqsave(io_lock, flags);
-               sc = scsi_host_find_tag(fnic->lport->host, i);
-               if (!sc) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-
-               io_req = (struct fnic_io_req *)CMD_SP(sc);
-               if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
-                       !(CMD_FLAGS(sc) & FNIC_DEV_RST_DONE)) {
-                       /*
-                        * We will be here only when FW completes reset
-                        * without sending completions for outstanding ios.
-                        */
-                       CMD_FLAGS(sc) |= FNIC_DEV_RST_DONE;
-                       if (io_req && io_req->dr_done)
-                               complete(io_req->dr_done);
-                       else if (io_req && io_req->abts_done)
-                               complete(io_req->abts_done);
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               } else if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-               if (!io_req) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-
-               CMD_SP(sc) = NULL;
-
-               spin_unlock_irqrestore(io_lock, flags);
+       io_lock = fnic_io_lock_tag(fnic, sc->request->tag);
+       spin_lock_irqsave(io_lock, flags);
 
+       io_req = (struct fnic_io_req *)CMD_SP(sc);
+       if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
+           !(CMD_FLAGS(sc) & FNIC_DEV_RST_DONE)) {
                /*
-                * If there is a scsi_cmnd associated with this io_req, then
-                * free the corresponding state
+                * We will be here only when FW completes reset
+                * without sending completions for outstanding ios.
                 */
-               start_time = io_req->start_time;
-               fnic_release_ioreq_buf(fnic, io_req, sc);
-               mempool_free(io_req, fnic->io_req_pool);
+               CMD_FLAGS(sc) |= FNIC_DEV_RST_DONE;
+               if (io_req && io_req->dr_done)
+                       complete(io_req->dr_done);
+               else if (io_req && io_req->abts_done)
+                       complete(io_req->abts_done);
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       } else if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       }
+       if (!io_req) {
+               spin_unlock_irqrestore(io_lock, flags);
+               goto cleanup_scsi_cmd;
+       }
 
-               sc->result = DID_TRANSPORT_DISRUPTED << 16;
-               FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
-                             "%s: tag:0x%x : sc:0x%p duration = %lu DID_TRANSPORT_DISRUPTED\n",
-                             __func__, sc->request->tag, sc,
-                             (jiffies - start_time));
+       CMD_SP(sc) = NULL;
 
-               if (atomic64_read(&fnic->io_cmpl_skip))
-                       atomic64_dec(&fnic->io_cmpl_skip);
-               else
-                       atomic64_inc(&fnic_stats->io_stats.io_completions);
+       spin_unlock_irqrestore(io_lock, flags);
 
-               /* Complete the command to SCSI */
-               if (sc->scsi_done) {
-                       if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED))
-                               shost_printk(KERN_ERR, fnic->lport->host,
-                               "Calling done for IO not issued to fw: tag:0x%x sc:0x%p\n",
-                                sc->request->tag, sc);
+       /*
+        * If there is a scsi_cmnd associated with this io_req, then
+        * free the corresponding state
+        */
+       start_time = io_req->start_time;
+       fnic_release_ioreq_buf(fnic, io_req, sc);
+       mempool_free(io_req, fnic->io_req_pool);
 
-                       FNIC_TRACE(fnic_cleanup_io,
-                                 sc->device->host->host_no, i, sc,
-                                 jiffies_to_msecs(jiffies - start_time),
-                                 0, ((u64)sc->cmnd[0] << 32 |
-                                 (u64)sc->cmnd[2] << 24 |
-                                 (u64)sc->cmnd[3] << 16 |
-                                 (u64)sc->cmnd[4] << 8 | sc->cmnd[5]),
-                                 (((u64)CMD_FLAGS(sc) << 32) | CMD_STATE(sc)));
+cleanup_scsi_cmd:
+       sc->result = DID_TRANSPORT_DISRUPTED << 16;
+       FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
+                     "fnic_cleanup_io: tag:0x%x : sc:0x%p duration = %lu DID_TRANSPORT_DISRUPTED\n",
+                     sc->request->tag, sc, (jiffies - start_time));
 
-                       sc->scsi_done(sc);
-               }
+       if (atomic64_read(&fnic->io_cmpl_skip))
+               atomic64_dec(&fnic->io_cmpl_skip);
+       else
+               atomic64_inc(&fnic_stats->io_stats.io_completions);
+
+       /* Complete the command to SCSI */
+       if (sc->scsi_done) {
+               if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED))
+                       shost_printk(KERN_ERR, fnic->lport->host,
+                                    "Calling done for IO not issued to fw: tag:0x%x sc:0x%p\n",
+                                    sc->request->tag, sc);
+
+               FNIC_TRACE(fnic_cleanup_io,
+                          sc->device->host->host_no, sc->request->tag, sc,
+                          jiffies_to_msecs(jiffies - start_time),
+                          0, ((u64)sc->cmnd[0] << 32 |
+                              (u64)sc->cmnd[2] << 24 |
+                              (u64)sc->cmnd[3] << 16 |
+                              (u64)sc->cmnd[4] << 8 | sc->cmnd[5]),
+                          (((u64)CMD_FLAGS(sc) << 32) | CMD_STATE(sc)));
+
+               sc->scsi_done(sc);
        }
+       return true;
+}
+
+static void fnic_cleanup_io(struct fnic *fnic)
+{
+       scsi_host_busy_iter(fnic->lport->host,
+                           fnic_cleanup_io_iter, fnic);
 }
 
 void fnic_wq_copy_cleanup_handler(struct vnic_wq_copy *wq,
@@ -1558,143 +1555,141 @@ static inline int fnic_queue_abort_io_req(struct fnic *fnic, int tag,
        return 0;
 }
 
-static void fnic_rport_exch_reset(struct fnic *fnic, u32 port_id)
+struct fnic_rport_abort_io_iter_data {
+       struct fnic *fnic;
+       u32 port_id;
+       int term_cnt;
+};
+
+static bool fnic_rport_abort_io_iter(struct scsi_cmnd *sc, void *data,
+                                    bool reserved)
 {
-       int tag;
-       int abt_tag;
-       int term_cnt = 0;
+       struct fnic_rport_abort_io_iter_data *iter_data = data;
+       struct fnic *fnic = iter_data->fnic;
+       int abt_tag = sc->request->tag;
        struct fnic_io_req *io_req;
        spinlock_t *io_lock;
        unsigned long flags;
-       struct scsi_cmnd *sc;
        struct reset_stats *reset_stats = &fnic->fnic_stats.reset_stats;
        struct terminate_stats *term_stats = &fnic->fnic_stats.term_stats;
        struct scsi_lun fc_lun;
        enum fnic_ioreq_state old_ioreq_state;
 
-       FNIC_SCSI_DBG(KERN_DEBUG,
-                     fnic->lport->host,
-                     "fnic_rport_exch_reset called portid 0x%06x\n",
-                     port_id);
-
-       if (fnic->in_remove)
-               return;
-
-       for (tag = 0; tag < fnic->fnic_max_tag_id; tag++) {
-               abt_tag = tag;
-               io_lock = fnic_io_lock_tag(fnic, tag);
-               spin_lock_irqsave(io_lock, flags);
-               sc = scsi_host_find_tag(fnic->lport->host, tag);
-               if (!sc) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
+       io_lock = fnic_io_lock_tag(fnic, abt_tag);
+       spin_lock_irqsave(io_lock, flags);
 
-               io_req = (struct fnic_io_req *)CMD_SP(sc);
+       io_req = (struct fnic_io_req *)CMD_SP(sc);
 
-               if (!io_req || io_req->port_id != port_id) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
+       if (!io_req || io_req->port_id != iter_data->port_id) {
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       }
 
-               if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
-                       (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) {
-                       FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
+       if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
+           (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) {
+               FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
                        "fnic_rport_exch_reset dev rst not pending sc 0x%p\n",
                        sc);
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       }
 
-               /*
-                * Found IO that is still pending with firmware and
-                * belongs to rport that went away
-                */
-               if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-               if (io_req->abts_done) {
-                       shost_printk(KERN_ERR, fnic->lport->host,
+       /*
+        * Found IO that is still pending with firmware and
+        * belongs to rport that went away
+        */
+       if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) {
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       }
+       if (io_req->abts_done) {
+               shost_printk(KERN_ERR, fnic->lport->host,
                        "fnic_rport_exch_reset: io_req->abts_done is set "
                        "state is %s\n",
                        fnic_ioreq_state_to_str(CMD_STATE(sc)));
-               }
+       }
 
-               if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED)) {
-                       shost_printk(KERN_ERR, fnic->lport->host,
-                                 "rport_exch_reset "
-                                 "IO not yet issued %p tag 0x%x flags "
-                                 "%x state %d\n",
-                                 sc, tag, CMD_FLAGS(sc), CMD_STATE(sc));
-               }
-               old_ioreq_state = CMD_STATE(sc);
-               CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING;
-               CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE;
-               if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
-                       atomic64_inc(&reset_stats->device_reset_terminates);
-                       abt_tag = (tag | FNIC_TAG_DEV_RST);
-                       FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
-                       "fnic_rport_exch_reset dev rst sc 0x%p\n",
-                       sc);
-               }
+       if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED)) {
+               shost_printk(KERN_ERR, fnic->lport->host,
+                            "rport_exch_reset "
+                            "IO not yet issued %p tag 0x%x flags "
+                            "%x state %d\n",
+                            sc, abt_tag, CMD_FLAGS(sc), CMD_STATE(sc));
+       }
+       old_ioreq_state = CMD_STATE(sc);
+       CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING;
+       CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE;
+       if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
+               atomic64_inc(&reset_stats->device_reset_terminates);
+               abt_tag |= FNIC_TAG_DEV_RST;
+       }
+       FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
+                     "fnic_rport_exch_reset dev rst sc 0x%p\n", sc);
+       BUG_ON(io_req->abts_done);
 
-               BUG_ON(io_req->abts_done);
+       FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
+                     "fnic_rport_reset_exch: Issuing abts\n");
 
-               FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
-                             "fnic_rport_reset_exch: Issuing abts\n");
+       spin_unlock_irqrestore(io_lock, flags);
 
+       /* Now queue the abort command to firmware */
+       int_to_scsilun(sc->device->lun, &fc_lun);
+
+       if (fnic_queue_abort_io_req(fnic, abt_tag,
+                                   FCPIO_ITMF_ABT_TASK_TERM,
+                                   fc_lun.scsi_lun, io_req)) {
+               /*
+                * Revert the cmd state back to old state, if
+                * it hasn't changed in between. This cmd will get
+                * aborted later by scsi_eh, or cleaned up during
+                * lun reset
+                */
+               spin_lock_irqsave(io_lock, flags);
+               if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING)
+                       CMD_STATE(sc) = old_ioreq_state;
                spin_unlock_irqrestore(io_lock, flags);
+       } else {
+               spin_lock_irqsave(io_lock, flags);
+               if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET)
+                       CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED;
+               else
+                       CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED;
+               spin_unlock_irqrestore(io_lock, flags);
+               atomic64_inc(&term_stats->terminates);
+               iter_data->term_cnt++;
+       }
+       return true;
+}
 
-               /* Now queue the abort command to firmware */
-               int_to_scsilun(sc->device->lun, &fc_lun);
+static void fnic_rport_exch_reset(struct fnic *fnic, u32 port_id)
+{
+       struct terminate_stats *term_stats = &fnic->fnic_stats.term_stats;
+       struct fnic_rport_abort_io_iter_data iter_data = {
+               .fnic = fnic,
+               .port_id = port_id,
+               .term_cnt = 0,
+       };
 
-               if (fnic_queue_abort_io_req(fnic, abt_tag,
-                                           FCPIO_ITMF_ABT_TASK_TERM,
-                                           fc_lun.scsi_lun, io_req)) {
-                       /*
-                        * Revert the cmd state back to old state, if
-                        * it hasn't changed in between. This cmd will get
-                        * aborted later by scsi_eh, or cleaned up during
-                        * lun reset
-                        */
-                       spin_lock_irqsave(io_lock, flags);
-                       if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING)
-                               CMD_STATE(sc) = old_ioreq_state;
-                       spin_unlock_irqrestore(io_lock, flags);
-               } else {
-                       spin_lock_irqsave(io_lock, flags);
-                       if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET)
-                               CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED;
-                       else
-                               CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED;
-                       spin_unlock_irqrestore(io_lock, flags);
-                       atomic64_inc(&term_stats->terminates);
-                       term_cnt++;
-               }
-       }
-       if (term_cnt > atomic64_read(&term_stats->max_terminates))
-               atomic64_set(&term_stats->max_terminates, term_cnt);
+       FNIC_SCSI_DBG(KERN_DEBUG,
+                     fnic->lport->host,
+                     "fnic_rport_exch_reset called portid 0x%06x\n",
+                     port_id);
+
+       if (fnic->in_remove)
+               return;
+
+       scsi_host_busy_iter(fnic->lport->host, fnic_rport_abort_io_iter,
+                           &iter_data);
+       if (iter_data.term_cnt > atomic64_read(&term_stats->max_terminates))
+               atomic64_set(&term_stats->max_terminates, iter_data.term_cnt);
 
 }
 
 void fnic_terminate_rport_io(struct fc_rport *rport)
 {
-       int tag;
-       int abt_tag;
-       int term_cnt = 0;
-       struct fnic_io_req *io_req;
-       spinlock_t *io_lock;
-       unsigned long flags;
-       struct scsi_cmnd *sc;
-       struct scsi_lun fc_lun;
        struct fc_rport_libfc_priv *rdata;
        struct fc_lport *lport;
        struct fnic *fnic;
-       struct fc_rport *cmd_rport;
-       struct reset_stats *reset_stats;
-       struct terminate_stats *term_stats;
-       enum fnic_ioreq_state old_ioreq_state;
 
        if (!rport) {
                printk(KERN_ERR "fnic_terminate_rport_io: rport is NULL\n");
@@ -1722,108 +1717,7 @@ void fnic_terminate_rport_io(struct fc_rport *rport)
        if (fnic->in_remove)
                return;
 
-       reset_stats = &fnic->fnic_stats.reset_stats;
-       term_stats = &fnic->fnic_stats.term_stats;
-
-       for (tag = 0; tag < fnic->fnic_max_tag_id; tag++) {
-               abt_tag = tag;
-               io_lock = fnic_io_lock_tag(fnic, tag);
-               spin_lock_irqsave(io_lock, flags);
-               sc = scsi_host_find_tag(fnic->lport->host, tag);
-               if (!sc) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-
-               io_req = (struct fnic_io_req *)CMD_SP(sc);
-               if (!io_req) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-
-               cmd_rport = starget_to_rport(scsi_target(sc->device));
-               if (rport != cmd_rport) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-
-               if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
-                       (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) {
-                       FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
-                       "fnic_terminate_rport_io dev rst not pending sc 0x%p\n",
-                       sc);
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-               /*
-                * Found IO that is still pending with firmware and
-                * belongs to rport that went away
-                */
-               if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-               if (io_req->abts_done) {
-                       shost_printk(KERN_ERR, fnic->lport->host,
-                       "fnic_terminate_rport_io: io_req->abts_done is set "
-                       "state is %s\n",
-                       fnic_ioreq_state_to_str(CMD_STATE(sc)));
-               }
-               if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED)) {
-                       FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host,
-                                 "fnic_terminate_rport_io "
-                                 "IO not yet issued %p tag 0x%x flags "
-                                 "%x state %d\n",
-                                 sc, tag, CMD_FLAGS(sc), CMD_STATE(sc));
-               }
-               old_ioreq_state = CMD_STATE(sc);
-               CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING;
-               CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE;
-               if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
-                       atomic64_inc(&reset_stats->device_reset_terminates);
-                       abt_tag = (tag | FNIC_TAG_DEV_RST);
-                       FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
-                       "fnic_terminate_rport_io dev rst sc 0x%p\n", sc);
-               }
-
-               BUG_ON(io_req->abts_done);
-
-               FNIC_SCSI_DBG(KERN_DEBUG,
-                             fnic->lport->host,
-                             "fnic_terminate_rport_io: Issuing abts\n");
-
-               spin_unlock_irqrestore(io_lock, flags);
-
-               /* Now queue the abort command to firmware */
-               int_to_scsilun(sc->device->lun, &fc_lun);
-
-               if (fnic_queue_abort_io_req(fnic, abt_tag,
-                                           FCPIO_ITMF_ABT_TASK_TERM,
-                                           fc_lun.scsi_lun, io_req)) {
-                       /*
-                        * Revert the cmd state back to old state, if
-                        * it hasn't changed in between. This cmd will get
-                        * aborted later by scsi_eh, or cleaned up during
-                        * lun reset
-                        */
-                       spin_lock_irqsave(io_lock, flags);
-                       if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING)
-                               CMD_STATE(sc) = old_ioreq_state;
-                       spin_unlock_irqrestore(io_lock, flags);
-               } else {
-                       spin_lock_irqsave(io_lock, flags);
-                       if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET)
-                               CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED;
-                       else
-                               CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED;
-                       spin_unlock_irqrestore(io_lock, flags);
-                       atomic64_inc(&term_stats->terminates);
-                       term_cnt++;
-               }
-       }
-       if (term_cnt > atomic64_read(&term_stats->max_terminates))
-               atomic64_set(&term_stats->max_terminates, term_cnt);
-
+       fnic_rport_exch_reset(fnic, rport->port_id);
 }
 
 /*
@@ -2118,165 +2012,183 @@ lr_io_req_end:
        return ret;
 }
 
-/*
- * Clean up any pending aborts on the lun
- * For each outstanding IO on this lun, whose abort is not completed by fw,
- * issue a local abort. Wait for abort to complete. Return 0 if all commands
- * successfully aborted, 1 otherwise
- */
-static int fnic_clean_pending_aborts(struct fnic *fnic,
-                                    struct scsi_cmnd *lr_sc,
-                                        bool new_sc)
+struct fnic_pending_aborts_iter_data {
+       struct fnic *fnic;
+       struct scsi_cmnd *lr_sc;
+       struct scsi_device *lun_dev;
+       int ret;
+};
 
+static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc,
+                                    void *data, bool reserved)
 {
-       int tag, abt_tag;
+       struct fnic_pending_aborts_iter_data *iter_data = data;
+       struct fnic *fnic = iter_data->fnic;
+       struct scsi_device *lun_dev = iter_data->lun_dev;
+       int abt_tag = sc->request->tag;
        struct fnic_io_req *io_req;
        spinlock_t *io_lock;
        unsigned long flags;
-       int ret = 0;
-       struct scsi_cmnd *sc;
        struct scsi_lun fc_lun;
-       struct scsi_device *lun_dev = lr_sc->device;
        DECLARE_COMPLETION_ONSTACK(tm_done);
        enum fnic_ioreq_state old_ioreq_state;
 
-       for (tag = 0; tag < fnic->fnic_max_tag_id; tag++) {
-               io_lock = fnic_io_lock_tag(fnic, tag);
-               spin_lock_irqsave(io_lock, flags);
-               sc = scsi_host_find_tag(fnic->lport->host, tag);
-               /*
-                * ignore this lun reset cmd if issued using new SC
-                * or cmds that do not belong to this lun
-                */
-               if (!sc || ((sc == lr_sc) && new_sc) || sc->device != lun_dev) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-
-               io_req = (struct fnic_io_req *)CMD_SP(sc);
+       if (sc == iter_data->lr_sc || sc->device != lun_dev)
+               return true;
+       if (reserved)
+               return true;
 
-               if (!io_req || sc->device != lun_dev) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-
-               /*
-                * Found IO that is still pending with firmware and
-                * belongs to the LUN that we are resetting
-                */
-               FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
-                             "Found IO in %s on lun\n",
-                             fnic_ioreq_state_to_str(CMD_STATE(sc)));
-
-               if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
-               if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
-                       (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) {
-                       FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host,
-                               "%s dev rst not pending sc 0x%p\n", __func__,
-                               sc);
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
+       io_lock = fnic_io_lock_tag(fnic, abt_tag);
+       spin_lock_irqsave(io_lock, flags);
+       io_req = (struct fnic_io_req *)CMD_SP(sc);
+       if (!io_req) {
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       }
 
-               if (io_req->abts_done)
-                       shost_printk(KERN_ERR, fnic->lport->host,
-                         "%s: io_req->abts_done is set state is %s\n",
-                         __func__, fnic_ioreq_state_to_str(CMD_STATE(sc)));
-               old_ioreq_state = CMD_STATE(sc);
-               /*
-                * Any pending IO issued prior to reset is expected to be
-                * in abts pending state, if not we need to set
-                * FNIC_IOREQ_ABTS_PENDING to indicate the IO is abort pending.
-                * When IO is completed, the IO will be handed over and
-                * handled in this function.
-                */
-               CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING;
+       /*
+        * Found IO that is still pending with firmware and
+        * belongs to the LUN that we are resetting
+        */
+       FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
+                     "Found IO in %s on lun\n",
+                     fnic_ioreq_state_to_str(CMD_STATE(sc)));
 
-               BUG_ON(io_req->abts_done);
+       if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) {
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       }
+       if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
+           (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) {
+               FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host,
+                             "%s dev rst not pending sc 0x%p\n", __func__,
+                             sc);
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       }
 
-               abt_tag = tag;
-               if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
-                       abt_tag |= FNIC_TAG_DEV_RST;
-                       FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host,
-                                 "%s: dev rst sc 0x%p\n", __func__, sc);
-               }
+       if (io_req->abts_done)
+               shost_printk(KERN_ERR, fnic->lport->host,
+                            "%s: io_req->abts_done is set state is %s\n",
+                            __func__, fnic_ioreq_state_to_str(CMD_STATE(sc)));
+       old_ioreq_state = CMD_STATE(sc);
+       /*
+        * Any pending IO issued prior to reset is expected to be
+        * in abts pending state, if not we need to set
+        * FNIC_IOREQ_ABTS_PENDING to indicate the IO is abort pending.
+        * When IO is completed, the IO will be handed over and
+        * handled in this function.
+        */
+       CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING;
 
-               CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE;
-               io_req->abts_done = &tm_done;
-               spin_unlock_irqrestore(io_lock, flags);
+       BUG_ON(io_req->abts_done);
 
-               /* Now queue the abort command to firmware */
-               int_to_scsilun(sc->device->lun, &fc_lun);
+       if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
+               abt_tag |= FNIC_TAG_DEV_RST;
+               FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host,
+                             "%s: dev rst sc 0x%p\n", __func__, sc);
+       }
 
-               if (fnic_queue_abort_io_req(fnic, abt_tag,
-                                           FCPIO_ITMF_ABT_TASK_TERM,
-                                           fc_lun.scsi_lun, io_req)) {
-                       spin_lock_irqsave(io_lock, flags);
-                       io_req = (struct fnic_io_req *)CMD_SP(sc);
-                       if (io_req)
-                               io_req->abts_done = NULL;
-                       if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING)
-                               CMD_STATE(sc) = old_ioreq_state;
-                       spin_unlock_irqrestore(io_lock, flags);
-                       ret = 1;
-                       goto clean_pending_aborts_end;
-               } else {
-                       spin_lock_irqsave(io_lock, flags);
-                       if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET)
-                               CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED;
-                       spin_unlock_irqrestore(io_lock, flags);
-               }
-               CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED;
+       CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE;
+       io_req->abts_done = &tm_done;
+       spin_unlock_irqrestore(io_lock, flags);
 
-               wait_for_completion_timeout(&tm_done,
-                                           msecs_to_jiffies
-                                           (fnic->config.ed_tov));
+       /* Now queue the abort command to firmware */
+       int_to_scsilun(sc->device->lun, &fc_lun);
 
-               /* Recheck cmd state to check if it is now aborted */
+       if (fnic_queue_abort_io_req(fnic, abt_tag,
+                                   FCPIO_ITMF_ABT_TASK_TERM,
+                                   fc_lun.scsi_lun, io_req)) {
                spin_lock_irqsave(io_lock, flags);
                io_req = (struct fnic_io_req *)CMD_SP(sc);
-               if (!io_req) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_REQ_NULL;
-                       continue;
-               }
+               if (io_req)
+                       io_req->abts_done = NULL;
+               if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING)
+                       CMD_STATE(sc) = old_ioreq_state;
+               spin_unlock_irqrestore(io_lock, flags);
+               iter_data->ret = FAILED;
+               return false;
+       } else {
+               spin_lock_irqsave(io_lock, flags);
+               if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET)
+                       CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED;
+               spin_unlock_irqrestore(io_lock, flags);
+       }
+       CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED;
 
-               io_req->abts_done = NULL;
+       wait_for_completion_timeout(&tm_done, msecs_to_jiffies
+                                   (fnic->config.ed_tov));
 
-               /* if abort is still pending with fw, fail */
-               if (CMD_ABTS_STATUS(sc) == FCPIO_INVALID_CODE) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_DONE;
-                       ret = 1;
-                       goto clean_pending_aborts_end;
-               }
-               CMD_STATE(sc) = FNIC_IOREQ_ABTS_COMPLETE;
+       /* Recheck cmd state to check if it is now aborted */
+       spin_lock_irqsave(io_lock, flags);
+       io_req = (struct fnic_io_req *)CMD_SP(sc);
+       if (!io_req) {
+               spin_unlock_irqrestore(io_lock, flags);
+               CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_REQ_NULL;
+               return true;
+       }
 
-               /* original sc used for lr is handled by dev reset code */
-               if (sc != lr_sc)
-                       CMD_SP(sc) = NULL;
+       io_req->abts_done = NULL;
+
+       /* if abort is still pending with fw, fail */
+       if (CMD_ABTS_STATUS(sc) == FCPIO_INVALID_CODE) {
                spin_unlock_irqrestore(io_lock, flags);
+               CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_DONE;
+               iter_data->ret = FAILED;
+               return false;
+       }
+       CMD_STATE(sc) = FNIC_IOREQ_ABTS_COMPLETE;
 
-               /* original sc used for lr is handled by dev reset code */
-               if (sc != lr_sc) {
-                       fnic_release_ioreq_buf(fnic, io_req, sc);
-                       mempool_free(io_req, fnic->io_req_pool);
-               }
+       /* original sc used for lr is handled by dev reset code */
+       if (sc != iter_data->lr_sc)
+               CMD_SP(sc) = NULL;
+       spin_unlock_irqrestore(io_lock, flags);
 
-               /*
-                * Any IO is returned during reset, it needs to call scsi_done
-                * to return the scsi_cmnd to upper layer.
-                */
-               if (sc->scsi_done) {
-                       /* Set result to let upper SCSI layer retry */
-                       sc->result = DID_RESET << 16;
-                       sc->scsi_done(sc);
-               }
+       /* original sc used for lr is handled by dev reset code */
+       if (sc != iter_data->lr_sc) {
+               fnic_release_ioreq_buf(fnic, io_req, sc);
+               mempool_free(io_req, fnic->io_req_pool);
        }
 
+       /*
+        * Any IO is returned during reset, it needs to call scsi_done
+        * to return the scsi_cmnd to upper layer.
+        */
+       if (sc->scsi_done) {
+               /* Set result to let upper SCSI layer retry */
+               sc->result = DID_RESET << 16;
+               sc->scsi_done(sc);
+       }
+       return true;
+}
+
+/*
+ * Clean up any pending aborts on the lun
+ * For each outstanding IO on this lun, whose abort is not completed by fw,
+ * issue a local abort. Wait for abort to complete. Return 0 if all commands
+ * successfully aborted, 1 otherwise
+ */
+static int fnic_clean_pending_aborts(struct fnic *fnic,
+                                    struct scsi_cmnd *lr_sc,
+                                    bool new_sc)
+
+{
+       int ret = SUCCESS;
+       struct fnic_pending_aborts_iter_data iter_data = {
+               .fnic = fnic,
+               .lun_dev = lr_sc->device,
+               .ret = SUCCESS,
+       };
+
+       if (new_sc)
+               iter_data.lr_sc = lr_sc;
+
+       scsi_host_busy_iter(fnic->lport->host,
+                           fnic_pending_aborts_iter, &iter_data);
+       if (iter_data.ret == FAILED) {
+               ret = iter_data.ret;
+               goto clean_pending_aborts_end;
+       }
        schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov));
 
        /* walk again to check, if IOs are still pending in fw */
@@ -2775,58 +2687,72 @@ call_fc_exch_mgr_reset:
 
 }
 
-/*
- * fnic_is_abts_pending() is a helper function that
- * walks through tag map to check if there is any IOs pending,if there is one,
- * then it returns 1 (true), otherwise 0 (false)
- * if @lr_sc is non NULL, then it checks IOs specific to particular LUN,
- * otherwise, it checks for all IOs.
- */
-int fnic_is_abts_pending(struct fnic *fnic, struct scsi_cmnd *lr_sc)
+static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data,
+                                  bool reserved)
 {
-       int tag;
+       struct fnic_pending_aborts_iter_data *iter_data = data;
+       struct fnic *fnic = iter_data->fnic;
+       int cmd_state;
        struct fnic_io_req *io_req;
        spinlock_t *io_lock;
        unsigned long flags;
-       int ret = 0;
-       struct scsi_cmnd *sc;
-       struct scsi_device *lun_dev = NULL;
 
-       if (lr_sc)
-               lun_dev = lr_sc->device;
+       /*
+        * ignore this lun reset cmd or cmds that do not belong to
+        * this lun
+        */
+       if (iter_data->lr_sc && sc == iter_data->lr_sc)
+               return true;
+       if (iter_data->lun_dev && sc->device != iter_data->lun_dev)
+               return true;
 
-       /* walk again to check, if IOs are still pending in fw */
-       for (tag = 0; tag < fnic->fnic_max_tag_id; tag++) {
-               sc = scsi_host_find_tag(fnic->lport->host, tag);
-               /*
-                * ignore this lun reset cmd or cmds that do not belong to
-                * this lun
-                */
-               if (!sc || (lr_sc && (sc->device != lun_dev || sc == lr_sc)))
-                       continue;
+       io_lock = fnic_io_lock_hash(fnic, sc);
+       spin_lock_irqsave(io_lock, flags);
 
-               io_lock = fnic_io_lock_hash(fnic, sc);
-               spin_lock_irqsave(io_lock, flags);
+       io_req = (struct fnic_io_req *)CMD_SP(sc);
+       if (!io_req) {
+               spin_unlock_irqrestore(io_lock, flags);
+               return true;
+       }
 
-               io_req = (struct fnic_io_req *)CMD_SP(sc);
+       /*
+        * Found IO that is still pending with firmware and
+        * belongs to the LUN that we are resetting
+        */
+       FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host,
+                     "Found IO in %s on lun\n",
+                     fnic_ioreq_state_to_str(CMD_STATE(sc)));
+       cmd_state = CMD_STATE(sc);
+       spin_unlock_irqrestore(io_lock, flags);
+       if (cmd_state == FNIC_IOREQ_ABTS_PENDING)
+               iter_data->ret = 1;
 
-               if (!io_req || sc->device != lun_dev) {
-                       spin_unlock_irqrestore(io_lock, flags);
-                       continue;
-               }
+       return iter_data->ret ? false : true;
+}
 
-               /*
-                * Found IO that is still pending with firmware and
-                * belongs to the LUN that we are resetting
-                */
-               FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host,
-                             "Found IO in %s on lun\n",
-                             fnic_ioreq_state_to_str(CMD_STATE(sc)));
+/*
+ * fnic_is_abts_pending() is a helper function that
+ * walks through the tag map to check whether any IOs are pending; if so,
+ * it returns 1 (true), otherwise 0 (false).
+ * If @lr_sc is non-NULL, it checks IOs specific to that particular LUN;
+ * otherwise, it checks all IOs.
+ */
+int fnic_is_abts_pending(struct fnic *fnic, struct scsi_cmnd *lr_sc)
+{
+       struct fnic_pending_aborts_iter_data iter_data = {
+               .fnic = fnic,
+               .lun_dev = NULL,
+               .ret = 0,
+       };
 
-               if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING)
-                       ret = 1;
-               spin_unlock_irqrestore(io_lock, flags);
+       if (lr_sc) {
+               iter_data.lun_dev = lr_sc->device;
+               iter_data.lr_sc = lr_sc;
        }
 
-       return ret;
+       /* walk again to check if IOs are still pending in fw */
+       scsi_host_busy_iter(fnic->lport->host,
+                           fnic_abts_pending_iter, &iter_data);
+
+       return iter_data.ret;
 }
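
The two fnic conversions above share one shape: an open-coded walk over the tag map becomes a scsi_host_busy_iter() callback, with per-walk state carried in a small context struct and the bool return value steering the iteration (true to continue, false to stop early). A minimal sketch of that pattern, using illustrative demo_* names rather than the real fnic types:

    #include <scsi/scsi_cmnd.h>
    #include <scsi/scsi_device.h>
    #include <scsi/scsi_host.h>

    /* Per-walk context; the real drivers carry more state here. */
    struct demo_iter_data {
            struct scsi_device *lun_dev;    /* optional LUN filter */
            int pending;                    /* what the walk computes */
    };

    static bool demo_busy_iter(struct scsi_cmnd *sc, void *data, bool reserved)
    {
            struct demo_iter_data *iter = data;

            if (iter->lun_dev && sc->device != iter->lun_dev)
                    return true;            /* not ours: keep iterating */

            iter->pending++;                /* example bookkeeping */
            return true;                    /* return false to stop the walk */
    }

    static int demo_count_pending(struct Scsi_Host *shost,
                                  struct scsi_device *sdev)
    {
            struct demo_iter_data iter_data = { .lun_dev = sdev };

            scsi_host_busy_iter(shost, demo_busy_iter, &iter_data);
            return iter_data.pending;
    }

The gain over scsi_host_find_tag() loops is that the midlayer only hands the callback commands that are actually in flight, so the NULL and stale-tag checks disappear from the driver.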
index 1a3c534..bc33d54 100644 (file)
@@ -7099,23 +7099,3 @@ ips_init_phase2(int index)
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IBM ServeRAID Adapter Driver " IPS_VER_STRING);
 MODULE_VERSION(IPS_VER_STRING);
-
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 2
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -2
- * c-argdecl-indent: 2
- * c-label-offset: -2
- * c-continued-statement-offset: 2
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
index 6c0678f..65edf00 100644 (file)
@@ -1211,23 +1211,3 @@ typedef struct {
       IPS_COMPAT_TAMPA, \
       IPS_COMPAT_KEYWEST \
    }
-
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 2
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -2
- * c-argdecl-indent: 2
- * c-label-offset: -2
- * c-continued-statement-offset: 2
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
index de71d24..6d14a7a 100644 (file)
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* PARISC LASI driver for the 53c700 chip
  *
index c2776b8..38cfe1b 100644 (file)
@@ -934,7 +934,7 @@ lpfc_bsg_ct_unsol_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
        INIT_LIST_HEAD(&head);
        list_add_tail(&head, &piocbq->list);
 
-       ct_req = (struct lpfc_sli_ct_request *)bdeBuf1;
+       ct_req = (struct lpfc_sli_ct_request *)bdeBuf1->virt;
        evt_req_id = ct_req->FsType;
        cmd = ct_req->CommandResponse.bits.CmdRsp;
 
index 1e4c792..5f018d0 100644 (file)
@@ -254,13 +254,13 @@ lpfc_config_port_prep(struct lpfc_hba *phba)
                if (mb->un.varDmp.word_cnt == 0)
                        break;
 
-               i =  mb->un.varDmp.word_cnt * sizeof(uint32_t);
-               if (offset + i >  DMP_VPD_SIZE)
-                       i =  DMP_VPD_SIZE - offset;
+               if (mb->un.varDmp.word_cnt > DMP_VPD_SIZE - offset)
+                       mb->un.varDmp.word_cnt = DMP_VPD_SIZE - offset;
                lpfc_sli_pcimem_bcopy(((uint8_t *)mb) + DMP_RSP_OFFSET,
-                                     lpfc_vpd_data  + offset, i);
-               offset += i;
-       } while (offset < DMP_VPD_SIZE);
+                                     lpfc_vpd_data + offset,
+                                     mb->un.varDmp.word_cnt);
+               offset += mb->un.varDmp.word_cnt;
+       } while (mb->un.varDmp.word_cnt && offset < DMP_VPD_SIZE);
 
        lpfc_parse_vpd(phba, lpfc_vpd_data, offset);
 
index 06ccc01..573c859 100644 (file)
@@ -11804,13 +11804,20 @@ lpfc_sli_validate_fcp_iocb(struct lpfc_iocbq *iocbq, struct lpfc_vport *vport,
                           lpfc_ctx_cmd ctx_cmd)
 {
        struct lpfc_io_buf *lpfc_cmd;
+       IOCB_t *icmd = NULL;
        int rc = 1;
 
        if (!iocbq || iocbq->vport != vport)
                return rc;
 
-       if (!(iocbq->iocb_flag &  LPFC_IO_FCP) ||
-           !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ))
+       if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
+           !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ) ||
+             iocbq->iocb_flag & LPFC_DRIVER_ABORTED)
+               return rc;
+
+       icmd = &iocbq->iocb;
+       if (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
+           icmd->ulpCommand == CMD_CLOSE_XRI_CN)
                return rc;
 
        lpfc_cmd = container_of(iocbq, struct lpfc_io_buf, cur_iocbq);
@@ -19770,7 +19777,7 @@ lpfc_sli_get_config_region23(struct lpfc_hba *phba, char *rgn23_data)
        LPFC_MBOXQ_t *pmb = NULL;
        MAILBOX_t *mb;
        uint32_t offset = 0;
-       int i, rc;
+       int rc;
 
        if (!rgn23_data)
                return 0;
@@ -19801,13 +19808,14 @@ lpfc_sli_get_config_region23(struct lpfc_hba *phba, char *rgn23_data)
                if (mb->un.varDmp.word_cnt == 0)
                        break;
 
-               i =  mb->un.varDmp.word_cnt * sizeof(uint32_t);
-               if (offset + i >  DMP_RGN23_SIZE)
-                       i =  DMP_RGN23_SIZE - offset;
+               if (mb->un.varDmp.word_cnt > DMP_RGN23_SIZE - offset)
+                       mb->un.varDmp.word_cnt = DMP_RGN23_SIZE - offset;
+
                lpfc_sli_pcimem_bcopy(((uint8_t *)mb) + DMP_RSP_OFFSET,
-                                     rgn23_data  + offset, i);
-               offset += i;
-       } while (offset < DMP_RGN23_SIZE);
+                                      rgn23_data + offset,
+                                      mb->un.varDmp.word_cnt);
+               offset += mb->un.varDmp.word_cnt;
+       } while (mb->un.varDmp.word_cnt && offset < DMP_RGN23_SIZE);
 
        mempool_free(pmb, phba->mbox_mem_pool);
        return offset;
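
Both lpfc dump loops (VPD earlier, region 23 here) now clamp the device-reported word_cnt in place so the copy can never run past the destination, and they stop as soon as the device reports a zero count. The same guard pattern as a small standalone program; the 64-byte buffer and the chunk producer are invented for illustration:

    #include <stdio.h>
    #include <string.h>

    #define DST_SIZE 64

    /* Toy stand-in for the device: hands out 24-byte chunks, then 0. */
    static size_t read_next_chunk(char *buf, int *calls)
    {
            if (++*calls > 4)
                    return 0;
            memset(buf, 'A' + *calls, 24);
            return 24;
    }

    int main(void)
    {
            char dst[DST_SIZE], buf[32];
            size_t offset = 0, len;
            int calls = 0;

            do {
                    len = read_next_chunk(buf, &calls);
                    if (len == 0)
                            break;
                    if (len > DST_SIZE - offset)
                            len = DST_SIZE - offset; /* clamp: never overrun dst */
                    memcpy(dst + offset, buf, len);
                    offset += len;
            } while (len && offset < DST_SIZE);

            printf("copied %zu of a possible %d bytes\n", offset, DST_SIZE);
            return 0;
    }

Clamping the count variable itself, rather than computing a separate byte total, is what lets the loop condition and the copy agree on how much was consumed.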
index 01a1bfb..f0ef8f7 100644 (file)
@@ -781,5 +781,3 @@ typedef struct {
 } __attribute__ ((packed)) mbox_sgl32;
 
 #endif         // _MRAID_MBOX_DEFS_H_
-
-/* vim: set ts=8 sw=8 tw=78: */
index 3a7596e..2ad0aa2 100644 (file)
@@ -282,5 +282,3 @@ struct mraid_pci_blk {
 };
 
 #endif // _MEGA_COMMON_H_
-
-// vim: set ts=8 sw=8 tw=78:
index b1a2d35..145fde3 100644 (file)
@@ -4068,5 +4068,3 @@ megaraid_sysfs_show_ldnum(struct device *dev, struct device_attribute *attr, cha
  */
 module_init(megaraid_init);
 module_exit(megaraid_exit);
-
-/* vim: set ts=8 sw=8 tw=78 ai si: */
index 3e4347c..d2fe7f6 100644 (file)
@@ -230,5 +230,3 @@ typedef struct {
 #define WROUTDOOR(rdev, value) writel(value, (rdev)->baseaddr + 0x2C)
 
 #endif // _MEGARAID_H_
-
-// vim: set ts=8 sw=8 tw=78:
index 8f35174..928da90 100644 (file)
@@ -4403,15 +4403,3 @@ MODULE_FIRMWARE("qlogic/1040.bin");
 MODULE_FIRMWARE("qlogic/1280.bin");
 MODULE_FIRMWARE("qlogic/12160.bin");
 MODULE_VERSION(QLA1280_VERSION);
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
index 9c5782e..0de2505 100644 (file)
@@ -1195,6 +1195,9 @@ static int qla24xx_post_prli_work(struct scsi_qla_host *vha, fc_port_t *fcport)
 {
        struct qla_work_evt *e;
 
+       if (vha->host->active_mode == MODE_TARGET)
+               return QLA_FUNCTION_FAILED;
+
        e = qla2x00_alloc_work(vha, QLA_EVT_PRLI);
        if (!e)
                return QLA_FUNCTION_FAILED;
index d74c32f..4eab564 100644 (file)
@@ -7707,6 +7707,7 @@ struct scsi_host_template qla2xxx_driver_template = {
 
        .eh_timed_out           = fc_eh_timed_out,
        .eh_abort_handler       = qla2xxx_eh_abort,
+       .eh_should_retry_cmd    = fc_eh_should_retry_cmd,
        .eh_device_reset_handler = qla2xxx_eh_device_reset,
        .eh_target_reset_handler = qla2xxx_eh_target_reset,
        .eh_bus_reset_handler   = qla2xxx_eh_bus_reset,
index 70165be..a5d1633 100644 (file)
@@ -218,7 +218,7 @@ static const char *sdebug_version_date = "20200710";
  */
 #define SDEBUG_CANQUEUE_WORDS  3       /* a WORD is bits in a long */
 #define SDEBUG_CANQUEUE  (SDEBUG_CANQUEUE_WORDS * BITS_PER_LONG)
-#define DEF_CMD_PER_LUN  255
+#define DEF_CMD_PER_LUN  SDEBUG_CANQUEUE
 
 /* UA - Unit Attention; SA - Service Action; SSU - Start Stop Unit */
 #define F_D_IN                 1       /* Data-in command (e.g. READ) */
@@ -5695,8 +5695,8 @@ MODULE_PARM_DESC(lbpu, "enable LBP, support UNMAP command (def=0)");
 MODULE_PARM_DESC(lbpws, "enable LBP, support WRITE SAME(16) with UNMAP bit (def=0)");
 MODULE_PARM_DESC(lbpws10, "enable LBP, support WRITE SAME(10) with UNMAP bit (def=0)");
 MODULE_PARM_DESC(lowest_aligned, "lowest aligned lba (def=0)");
-MODULE_PARM_DESC(max_luns, "number of LUNs per target to simulate(def=1)");
 MODULE_PARM_DESC(lun_format, "LUN format: 0->peripheral (def); 1 --> flat address method");
+MODULE_PARM_DESC(max_luns, "number of LUNs per target to simulate (def=1)");
 MODULE_PARM_DESC(max_queue, "max number of queued commands (1 to max(def))");
 MODULE_PARM_DESC(medium_error_count, "count of sectors to return follow on MEDIUM error");
 MODULE_PARM_DESC(medium_error_start, "starting sector number to return MEDIUM error");
@@ -5710,7 +5710,7 @@ MODULE_PARM_DESC(opt_xferlen_exp, "optimal transfer length granularity exponent
 MODULE_PARM_DESC(opts, "1->noise, 2->medium_err, 4->timeout, 8->recovered_err... (def=0)");
 MODULE_PARM_DESC(per_host_store, "If set, next positive add_host will get new store (def=0)");
 MODULE_PARM_DESC(physblk_exp, "physical block exponent (def=0)");
-MODULE_PARM_DESC(poll_queues, "support for iouring iopoll queues (1 to max(submit_queues - 1)");
+MODULE_PARM_DESC(poll_queues, "support for iouring iopoll queues (1 to max(submit_queues - 1))");
 MODULE_PARM_DESC(ptype, "SCSI peripheral type(def=0[disk])");
 MODULE_PARM_DESC(random, "If set, uniformly randomize command duration between 0 and delay_in_ns");
 MODULE_PARM_DESC(removable, "claim to have removable media (def=0)");
@@ -7165,12 +7165,15 @@ static int sdebug_change_qdepth(struct scsi_device *sdev, int qdepth)
        }
        num_in_q = atomic_read(&devip->num_in_q);
 
+       if (qdepth > SDEBUG_CANQUEUE) {
+               qdepth = SDEBUG_CANQUEUE;
+               pr_warn("%s: requested qdepth [%d] exceeds canqueue [%d], trim\n", __func__,
+                       qdepth, SDEBUG_CANQUEUE);
+       }
        if (qdepth < 1)
                qdepth = 1;
-       /* allow to exceed max host qc_arr elements for testing */
-       if (qdepth > SDEBUG_CANQUEUE + 10)
-               qdepth = SDEBUG_CANQUEUE + 10;
-       scsi_change_queue_depth(sdev, qdepth);
+       if (qdepth != sdev->queue_depth)
+               scsi_change_queue_depth(sdev, qdepth);
 
        if (SDEBUG_OPT_Q_NOISE & sdebug_opts) {
                sdev_printk(KERN_INFO, sdev, "%s: qdepth=%d, num_in_q=%d\n",
@@ -7558,6 +7561,7 @@ static int sdebug_driver_probe(struct device *dev)
        sdbg_host = to_sdebug_host(dev);
 
        sdebug_driver_template.can_queue = sdebug_max_queue;
+       sdebug_driver_template.cmd_per_lun = sdebug_max_queue;
        if (!sdebug_clustering)
                sdebug_driver_template.dma_boundary = PAGE_SIZE - 1;
 
@@ -7593,7 +7597,11 @@ static int sdebug_driver_probe(struct device *dev)
         * If condition not met, trim poll_queues to 1 (just for simplicity).
         */
        if (poll_queues >= submit_queues) {
-               pr_warn("%s: trim poll_queues to 1\n", my_name);
+               if (submit_queues < 3)
+                       pr_warn("%s: trim poll_queues to 1\n", my_name);
+               else
+                       pr_warn("%s: trim poll_queues to 1. Perhaps try poll_queues=%d\n",
+                               my_name, submit_queues - 1);
                poll_queues = 1;
        }
        if (poll_queues)
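
The scsi_debug queue-depth change stops honoring depths beyond the qc_arr the driver actually allocates: requests are clamped into [1, SDEBUG_CANQUEUE], with a warning on trim, and scsi_change_queue_depth() runs only when the value really changes. The clamp itself reduces to a few lines, sketched with generic names:

    /* Clamp a requested depth to what the driver can really queue. */
    static int demo_clamp_qdepth(int requested, int canqueue)
    {
            if (requested > canqueue)
                    requested = canqueue;   /* never promise more slots
                                             * than the array holds */
            if (requested < 1)
                    requested = 1;
            return requested;
    }

The old "+ 10 for testing" headroom let the midlayer dispatch more commands than qc_arr has slots for, which is exactly what the new clamp forbids.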
index f1553a4..0ffdb8f 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/genhd.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/msdos_partition.h>
 #include <asm/unaligned.h>
 
index 97c6f81..678651b 100644 (file)
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* SNI RM driver
  *
index d7c3cff..5d0e98a 100644 (file)
@@ -9,7 +9,7 @@
 #include "ufs.h"
 #include "ufs-sysfs.h"
 
-static const char *ufschd_uic_link_state_to_string(
+static const char *ufshcd_uic_link_state_to_string(
                        enum uic_link_state state)
 {
        switch (state) {
@@ -21,7 +21,7 @@ static const char *ufschd_uic_link_state_to_string(
        }
 }
 
-static const char *ufschd_ufs_dev_pwr_mode_to_string(
+static const char *ufshcd_ufs_dev_pwr_mode_to_string(
                        enum ufs_dev_pwr_mode state)
 {
        switch (state) {
@@ -81,7 +81,7 @@ static ssize_t rpm_target_dev_state_show(struct device *dev,
 {
        struct ufs_hba *hba = dev_get_drvdata(dev);
 
-       return sysfs_emit(buf, "%s\n", ufschd_ufs_dev_pwr_mode_to_string(
+       return sysfs_emit(buf, "%s\n", ufshcd_ufs_dev_pwr_mode_to_string(
                        ufs_pm_lvl_states[hba->rpm_lvl].dev_state));
 }
 
@@ -90,7 +90,7 @@ static ssize_t rpm_target_link_state_show(struct device *dev,
 {
        struct ufs_hba *hba = dev_get_drvdata(dev);
 
-       return sysfs_emit(buf, "%s\n", ufschd_uic_link_state_to_string(
+       return sysfs_emit(buf, "%s\n", ufshcd_uic_link_state_to_string(
                        ufs_pm_lvl_states[hba->rpm_lvl].link_state));
 }
 
@@ -113,7 +113,7 @@ static ssize_t spm_target_dev_state_show(struct device *dev,
 {
        struct ufs_hba *hba = dev_get_drvdata(dev);
 
-       return sysfs_emit(buf, "%s\n", ufschd_ufs_dev_pwr_mode_to_string(
+       return sysfs_emit(buf, "%s\n", ufshcd_ufs_dev_pwr_mode_to_string(
                                ufs_pm_lvl_states[hba->spm_lvl].dev_state));
 }
 
@@ -122,7 +122,7 @@ static ssize_t spm_target_link_state_show(struct device *dev,
 {
        struct ufs_hba *hba = dev_get_drvdata(dev);
 
-       return sysfs_emit(buf, "%s\n", ufschd_uic_link_state_to_string(
+       return sysfs_emit(buf, "%s\n", ufshcd_uic_link_state_to_string(
                                ufs_pm_lvl_states[hba->spm_lvl].link_state));
 }
 
index 0625da7..3eb5493 100644 (file)
@@ -8593,7 +8593,7 @@ static void ufshcd_vreg_set_lpm(struct ufs_hba *hba)
        } else if (!ufshcd_is_ufs_dev_active(hba)) {
                ufshcd_toggle_vreg(hba->dev, hba->vreg_info.vcc, false);
                vcc_off = true;
-               if (!ufshcd_is_link_active(hba)) {
+               if (ufshcd_is_link_hibern8(hba) || ufshcd_is_link_off(hba)) {
                        ufshcd_config_vreg_lpm(hba, hba->vreg_info.vccq);
                        ufshcd_config_vreg_lpm(hba, hba->vreg_info.vccq2);
                }
@@ -8615,7 +8615,7 @@ static int ufshcd_vreg_set_hpm(struct ufs_hba *hba)
            !hba->dev_info.is_lu_power_on_wp) {
                ret = ufshcd_setup_vreg(hba, true);
        } else if (!ufshcd_is_ufs_dev_active(hba)) {
-               if (!ret && !ufshcd_is_link_active(hba)) {
+               if (!ufshcd_is_link_active(hba)) {
                        ret = ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq);
                        if (ret)
                                goto vcc_disable;
@@ -8975,10 +8975,13 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
        if (!hba->is_powered)
                return 0;
 
+       cancel_delayed_work_sync(&hba->rpm_dev_flush_recheck_work);
+
        if ((ufs_get_pm_lvl_to_dev_pwr_mode(hba->spm_lvl) ==
             hba->curr_dev_pwr_mode) &&
            (ufs_get_pm_lvl_to_link_pwr_state(hba->spm_lvl) ==
             hba->uic_link_state) &&
+            pm_runtime_suspended(hba->dev) &&
             !hba->dev_info.b_rpm_dev_flush_capable)
                goto out;
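
The vreg LPM hunk above is subtle: once the link-state enum also contains a broken state, "not active" is no longer the same thing as "hibern8 or off", so the explicit check keeps a broken link from being treated as safe for regulator low-power mode. A toy enum (illustrative states, not the ufshcd definitions) shows where the two predicates diverge:

    #include <stdbool.h>
    #include <stdio.h>

    enum demo_link_state { LINK_OFF, LINK_ACTIVE, LINK_HIBERN8, LINK_BROKEN };

    static bool not_active(enum demo_link_state s)
    {
            return s != LINK_ACTIVE;        /* true for BROKEN too */
    }

    static bool lpm_safe(enum demo_link_state s)
    {
            return s == LINK_HIBERN8 || s == LINK_OFF;
    }

    int main(void)
    {
            enum demo_link_state s = LINK_BROKEN;

            printf("!active=%d hibern8||off=%d\n", not_active(s), lpm_safe(s));
            return 0;   /* prints !active=1 hibern8||off=0 */
    }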
 
index eec2fd5..198d25a 100644 (file)
@@ -1413,7 +1413,7 @@ static int tcmu_run_tmr_queue(struct tcmu_dev *udev)
        return 1;
 }
 
-static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
+static bool tcmu_handle_completions(struct tcmu_dev *udev)
 {
        struct tcmu_mailbox *mb;
        struct tcmu_cmd *cmd;
@@ -1456,7 +1456,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
                        pr_err("cmd_id %u not found, ring is broken\n",
                               entry->hdr.cmd_id);
                        set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags);
-                       break;
+                       return false;
                }
 
                tcmu_handle_completion(cmd, entry);
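
The tcmu hunk turns "cmd_id not found, ring is broken" from a silent loop break into an immediate false return, so callers can stop servicing a corrupted ring instead of treating it as drained. The general shape of such a consumer, with hypothetical demo_* types and helpers:

    #include <stdbool.h>

    struct demo_ring;                       /* opaque; illustrative */
    struct demo_entry;

    /* Hypothetical helpers standing in for the real ring accessors. */
    struct demo_entry *demo_next_entry(struct demo_ring *r);
    bool demo_entry_valid(struct demo_ring *r, struct demo_entry *e);
    void demo_complete(struct demo_ring *r, struct demo_entry *e);

    /* Drain completions; return false if the ring turned out to be
     * broken so the caller stops re-arming the poller. */
    static bool demo_handle_completions(struct demo_ring *ring)
    {
            struct demo_entry *e;

            while ((e = demo_next_entry(ring)) != NULL) {
                    if (!demo_entry_valid(ring, e))
                            return false;   /* corrupted: bail out */
                    demo_complete(ring, e);
            }
            return true;                    /* drained and healthy */
    }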
index 3ecf422..0221709 100644 (file)
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-conmakehash
-consolemap_deftbl.c
-defkeymap.c
+/conmakehash
+/consolemap_deftbl.c
+/defkeymap.c
index a385342..4325bf7 100644 (file)
@@ -2608,12 +2608,3 @@ EXPORT_SYMBOL(matroxfb_register_driver);
 EXPORT_SYMBOL(matroxfb_unregister_driver);
 EXPORT_SYMBOL(matroxfb_wait_for_sync);
 EXPORT_SYMBOL(matroxfb_enable_irq);
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
-
index 1e8a38a..e2757ff 100644 (file)
@@ -1451,13 +1451,3 @@ MODULE_DESCRIPTION("Legacy VGA framebuffer device driver");
 MODULE_LICENSE("GPL");
 module_init(vga16fb_init);
 module_exit(vga16fb_exit);
-
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
-
index 39def02..cdb9950 100644 (file)
@@ -583,7 +583,7 @@ static struct attribute *v9fs_attrs[] = {
        NULL,
 };
 
-static struct attribute_group v9fs_attr_group = {
+static const struct attribute_group v9fs_attr_group = {
        .attrs = v9fs_attrs,
 };
 
index 649f04f..59c32c9 100644 (file)
@@ -86,8 +86,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
                 * to work.
                 */
                writeback_fid = v9fs_writeback_fid(file_dentry(file));
-               if (IS_ERR(fid)) {
-                       err = PTR_ERR(fid);
+               if (IS_ERR(writeback_fid)) {
+                       err = PTR_ERR(writeback_fid);
                        mutex_unlock(&v9inode->v_mutex);
                        goto out_error;
                }
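
The v9fs fix above is the classic copy-paste error around ERR_PTR-returning calls: the code tested fid, a pointer from an earlier call, instead of the writeback_fid it had just been handed, so failures were both missed and mis-decoded. The idiom is to test and decode exactly the value just assigned; a kernel-style sketch with a hypothetical demo_open_fid():

    #include <linux/err.h>

    struct demo_fid;                                /* opaque; illustrative */
    struct demo_ctx { struct demo_fid *writeback_fid; };

    /* Hypothetical: returns a valid pointer or ERR_PTR(-errno). */
    struct demo_fid *demo_open_fid(struct demo_ctx *ctx);

    static int demo_setup(struct demo_ctx *ctx)
    {
            struct demo_fid *writeback_fid = demo_open_fid(ctx);

            if (IS_ERR(writeback_fid))              /* test what you just got */
                    return PTR_ERR(writeback_fid);  /* decode the same value */

            ctx->writeback_fid = writeback_fid;
            return 0;
    }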
index 97e7b77..141a856 100644 (file)
@@ -223,10 +223,13 @@ config TMPFS_INODE64
 
          If unsure, say N.
 
+config ARCH_SUPPORTS_HUGETLBFS
+       def_bool n
+
 config HUGETLBFS
        bool "HugeTLB file system support"
        depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
-                  SYS_SUPPORTS_HUGETLBFS || BROKEN
+                  ARCH_SUPPORTS_HUGETLBFS || BROKEN
        help
          hugetlbfs is a filesystem backing for HugeTLB pages, based on
          ramfs. For architectures that support it, say Y here and read
@@ -335,8 +338,8 @@ config NFS_COMMON
        default y
 
 config NFS_V4_2_SSC_HELPER
-       tristate
-       default y if NFS_V4=y || NFS_FS=y
+       bool
+       default y if NFS_V4_2
 
 source "net/sunrpc/Kconfig"
 source "fs/ceph/Kconfig"
index a5244e0..b8abccd 100644 (file)
@@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev)
 {
        struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+       if (mapping_empty(mapping))
                return;
 
        invalidate_bh_lrus();
@@ -1677,6 +1677,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        struct blk_plug plug;
+       size_t shorted = 0;
        ssize_t ret;
 
        if (bdev_read_only(I_BDEV(bd_inode)))
@@ -1694,12 +1695,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
                return -EOPNOTSUPP;
 
-       iov_iter_truncate(from, size - iocb->ki_pos);
+       size -= iocb->ki_pos;
+       if (iov_iter_count(from) > size) {
+               shorted = iov_iter_count(from) - size;
+               iov_iter_truncate(from, size);
+       }
 
        blk_start_plug(&plug);
        ret = __generic_file_write_iter(iocb, from);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
+       iov_iter_reexpand(from, iov_iter_count(from) + shorted);
        blk_finish_plug(&plug);
        return ret;
 }
@@ -1711,13 +1717,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        loff_t pos = iocb->ki_pos;
+       size_t shorted = 0;
+       ssize_t ret;
 
        if (pos >= size)
                return 0;
 
        size -= pos;
-       iov_iter_truncate(to, size);
-       return generic_file_read_iter(iocb, to);
+       if (iov_iter_count(to) > size) {
+               shorted = iov_iter_count(to) - size;
+               iov_iter_truncate(to, size);
+       }
+
+       ret = generic_file_read_iter(iocb, to);
+       iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_read_iter);
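
Both blkdev paths now remember how many bytes they shaved off the iterator and hand them back with iov_iter_reexpand() once the I/O completes; previously a read or write clipped at end-of-device left the caller's iterator permanently shortened, confusing residual-count accounting. The skeleton of the pattern, with do_the_io() as a hypothetical stand-in for the actual I/O call:

    #include <linux/fs.h>
    #include <linux/uio.h>

    ssize_t do_the_io(struct kiocb *iocb, struct iov_iter *iter); /* hypothetical */

    /* Bound an iov_iter to `limit` bytes around an I/O call, then
     * restore whatever was shaved off. */
    static ssize_t demo_bounded_io(struct kiocb *iocb, struct iov_iter *iter,
                                   size_t limit)
    {
            size_t shorted = 0;
            ssize_t ret;

            if (iov_iter_count(iter) > limit) {
                    shorted = iov_iter_count(iter) - limit;
                    iov_iter_truncate(iter, limit);
            }

            ret = do_the_io(iocb, iter);

            /* Reexpand by exactly what we removed, regardless of how
             * much do_the_io() consumed. */
            iov_iter_reexpand(iter, iov_iter_count(iter) + shorted);
            return ret;
    }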
 
index 17f93fd..2bea01d 100644 (file)
@@ -591,16 +591,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                free_extent_map(em);
 
                if (page->index == end_index) {
-                       char *userpage;
                        size_t zero_offset = offset_in_page(isize);
 
                        if (zero_offset) {
                                int zeros;
                                zeros = PAGE_SIZE - zero_offset;
-                               userpage = kmap_atomic(page);
-                               memset(userpage + zero_offset, 0, zeros);
+                               memzero_page(page, zero_offset, zeros);
                                flush_dcache_page(page);
-                               kunmap_atomic(userpage);
                        }
                }
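
This hunk and the btrfs, zlib, and zstd hunks that follow all make the same substitution: the kmap/memset/kunmap triple collapses into one memzero_page() call, which maps, zeroes, and unmaps in a single step and stays highmem-safe. The before-and-after shape as a sketch:

    #include <linux/highmem.h>

    /* Zero a page's tail from `offset` to the end of the page. */
    static void demo_zero_tail(struct page *page, size_t offset)
    {
            /* Old open-coded form:
             *      char *kaddr = kmap_atomic(page);
             *      memset(kaddr + offset, 0, PAGE_SIZE - offset);
             *      kunmap_atomic(kaddr);
             */
            memzero_page(page, offset, PAGE_SIZE - offset);
            flush_dcache_page(page);
    }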
 
index f2d1bb2..074a78a 100644 (file)
@@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
        }
 
        if (page->index == last_byte >> PAGE_SHIFT) {
-               char *userpage;
                size_t zero_offset = offset_in_page(last_byte);
 
                if (zero_offset) {
                        iosize = PAGE_SIZE - zero_offset;
-                       userpage = kmap_atomic(page);
-                       memset(userpage + zero_offset, 0, iosize);
+                       memzero_page(page, zero_offset, iosize);
                        flush_dcache_page(page);
-                       kunmap_atomic(userpage);
                }
        }
        begin_page_read(fs_info, page);
@@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                u64 disk_bytenr;
 
                if (cur >= last_byte) {
-                       char *userpage;
                        struct extent_state *cached = NULL;
 
                        iosize = PAGE_SIZE - pg_offset;
-                       userpage = kmap_atomic(page);
-                       memset(userpage + pg_offset, 0, iosize);
+                       memzero_page(page, pg_offset, iosize);
                        flush_dcache_page(page);
-                       kunmap_atomic(userpage);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
                        unlock_extent_cached(tree, cur,
@@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 
                /* we've found a hole, just zero and go on */
                if (block_start == EXTENT_MAP_HOLE) {
-                       char *userpage;
                        struct extent_state *cached = NULL;
 
-                       userpage = kmap_atomic(page);
-                       memset(userpage + pg_offset, 0, iosize);
+                       memzero_page(page, pg_offset, iosize);
                        flush_dcache_page(page);
-                       kunmap_atomic(userpage);
 
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
@@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        }
 
        if (page->index == end_index) {
-               char *userpage;
-
-               userpage = kmap_atomic(page);
-               memset(userpage + pg_offset, 0,
-                      PAGE_SIZE - pg_offset);
-               kunmap_atomic(userpage);
+               memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
                flush_dcache_page(page);
        }
 
index b21d491..4af3360 100644 (file)
@@ -646,17 +646,12 @@ again:
                if (!ret) {
                        unsigned long offset = offset_in_page(total_compressed);
                        struct page *page = pages[nr_pages - 1];
-                       char *kaddr;
 
                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
-                       if (offset) {
-                               kaddr = kmap_atomic(page);
-                               memset(kaddr + offset, 0,
-                                      PAGE_SIZE - offset);
-                               kunmap_atomic(kaddr);
-                       }
+                       if (offset)
+                               memzero_page(page, offset, PAGE_SIZE - offset);
                        will_compress = 1;
                }
        }
@@ -4833,7 +4828,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
-       char *kaddr;
        bool only_release_metadata = false;
        u32 blocksize = fs_info->sectorsize;
        pgoff_t index = from >> PAGE_SHIFT;
@@ -4925,15 +4919,13 @@ again:
        if (offset != blocksize) {
                if (!len)
                        len = blocksize - offset;
-               kaddr = kmap(page);
                if (front)
-                       memset(kaddr + (block_start - page_offset(page)),
-                               0, offset);
+                       memzero_page(page, (block_start - page_offset(page)),
+                                    offset);
                else
-                       memset(kaddr + (block_start - page_offset(page)) +  offset,
-                               0, len);
+                       memzero_page(page, (block_start - page_offset(page)) + offset,
+                                    len);
                flush_dcache_page(page);
-               kunmap(page);
        }
        ClearPageChecked(page);
        set_page_dirty(page);
@@ -6832,11 +6824,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
         * cover that region here.
         */
 
-       if (max_size + pg_offset < PAGE_SIZE) {
-               char *map = kmap(page);
-               memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
-               kunmap(page);
-       }
+       if (max_size + pg_offset < PAGE_SIZE)
+       if (max_size + pg_offset < PAGE_SIZE)
+               memzero_page(page, pg_offset + max_size,
+                            PAGE_SIZE - max_size - pg_offset);
        kfree(tmp);
        return ret;
 }
@@ -8506,7 +8496,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
-       char *kaddr;
        unsigned long zero_start;
        loff_t size;
        vm_fault_t ret;
@@ -8620,10 +8609,8 @@ again:
                zero_start = PAGE_SIZE;
 
        if (zero_start != PAGE_SIZE) {
-               kaddr = kmap(page);
-               memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
+               memzero_page(page, zero_start, PAGE_SIZE - zero_start);
                flush_dcache_page(page);
-               kunmap(page);
        }
        ClearPageChecked(page);
        set_page_dirty(page);
index f4ec06b..3928ecc 100644 (file)
@@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
         * So what's in the range [500, 4095] corresponds to zeroes.
         */
        if (datal < block_size) {
-               char *map;
-
-               map = kmap(page);
-               memset(map + datal, 0, block_size - datal);
+               memzero_page(page, datal, block_size - datal);
                flush_dcache_page(page);
-               kunmap(page);
        }
 
        SetPageUptodate(page);
index d524acf..c3fa7d3 100644 (file)
@@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
        unsigned long bytes_left;
        unsigned long total_out = 0;
        unsigned long pg_offset = 0;
-       char *kaddr;
 
        destlen = min_t(unsigned long, destlen, PAGE_SIZE);
        bytes_left = destlen;
@@ -455,9 +454,7 @@ next:
         * end of the inline extent (destlen) to the end of the page
         */
        if (pg_offset < destlen) {
-               kaddr = kmap_atomic(dest_page);
-               memset(kaddr + pg_offset, 0, destlen - pg_offset);
-               kunmap_atomic(kaddr);
+               memzero_page(dest_page, pg_offset, destlen - pg_offset);
        }
        return ret;
 }
index 8e9626d..3e26b46 100644 (file)
@@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
        size_t ret2;
        unsigned long total_out = 0;
        unsigned long pg_offset = 0;
-       char *kaddr;
 
        stream = ZSTD_initDStream(
                        ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
@@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
        ret = 0;
 finish:
        if (pg_offset < destlen) {
-               kaddr = kmap_atomic(dest_page);
-               memset(kaddr + pg_offset, 0, destlen - pg_offset);
-               kunmap_atomic(kaddr);
+               memzero_page(dest_page, pg_offset, destlen - pg_offset);
        }
        return ret;
 }
index c2e052c..ea48c01 100644 (file)
@@ -1260,6 +1260,15 @@ static void bh_lru_install(struct buffer_head *bh)
        int i;
 
        check_irqs_on();
+       /*
+        * The refcount that bh_lru holds on a buffer_head prevents the
+        * attached page from being dropped (i.e., by try_to_free_buffers),
+        * which can make page migration fail.
+        * Skip putting upcoming bhs into bh_lru until migration is done.
+        */
+       if (lru_cache_disabled())
+               return;
+
        bh_lru_lock();
 
        b = this_cpu_ptr(&bh_lrus);
@@ -1400,6 +1409,15 @@ __bread_gfp(struct block_device *bdev, sector_t block,
 }
 EXPORT_SYMBOL(__bread_gfp);
 
+static void __invalidate_bh_lrus(struct bh_lru *b)
+{
+       int i;
+
+       for (i = 0; i < BH_LRU_SIZE; i++) {
+               brelse(b->bhs[i]);
+               b->bhs[i] = NULL;
+       }
+}
 /*
  * invalidate_bh_lrus() is called rarely - but not only at unmount.
  * This doesn't race because it runs in each cpu either in irq
@@ -1408,16 +1426,12 @@ EXPORT_SYMBOL(__bread_gfp);
 static void invalidate_bh_lru(void *arg)
 {
        struct bh_lru *b = &get_cpu_var(bh_lrus);
-       int i;
 
-       for (i = 0; i < BH_LRU_SIZE; i++) {
-               brelse(b->bhs[i]);
-               b->bhs[i] = NULL;
-       }
+       __invalidate_bh_lrus(b);
        put_cpu_var(bh_lrus);
 }
 
-static bool has_bh_in_lru(int cpu, void *dummy)
+bool has_bh_in_lru(int cpu, void *dummy)
 {
        struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
        int i;
@@ -1436,6 +1450,16 @@ void invalidate_bh_lrus(void)
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
 
+void invalidate_bh_lrus_cpu(int cpu)
+{
+       struct bh_lru *b;
+
+       bh_lru_lock();
+       b = per_cpu_ptr(&bh_lrus, cpu);
+       __invalidate_bh_lrus(b);
+       bh_lru_unlock();
+}
+
 void set_bh_page(struct buffer_head *bh,
                struct page *page, unsigned long offset)
 {
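
The buffer.c rework factors the per-slot brelse loop into __invalidate_bh_lrus() so that a single CPU's LRU can be flushed via the new invalidate_bh_lrus_cpu() without broadcasting to every CPU, while bh_lru_install() now refuses to cache buffer_heads while the LRU is disabled for page migration. The factored helper is just the loop the old code inlined; a sketch with an illustrative array size:

    #include <linux/buffer_head.h>

    #define DEMO_LRU_SIZE 16  /* chosen to look like BH_LRU_SIZE; illustrative */

    struct demo_lru { struct buffer_head *bhs[DEMO_LRU_SIZE]; };

    /* Drop every cached reference in one per-CPU LRU instance. */
    static void demo_invalidate_lru(struct demo_lru *b)
    {
            int i;

            for (i = 0; i < DEMO_LRU_SIZE; i++) {
                    brelse(b->bhs[i]);      /* NULL-safe reference drop */
                    b->bhs[i] = NULL;
            }
    }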
index 471e401..94df854 100644 (file)
@@ -6,6 +6,7 @@ config CEPH_FS
        select LIBCRC32C
        select CRYPTO_AES
        select CRYPTO
+       select NETFS_SUPPORT
        default n
        help
          Choose Y or M here to include support for mounting the
index 26e6643..c1570fa 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/signal.h>
 #include <linux/iversion.h>
 #include <linux/ktime.h>
+#include <linux/netfs.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -61,6 +62,9 @@
        (CONGESTION_ON_THRESH(congestion_kb) -                          \
         (CONGESTION_ON_THRESH(congestion_kb) >> 2))
 
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+                                       struct page *page, void **_fsdata);
+
 static inline struct ceph_snap_context *page_snap_context(struct page *page)
 {
        if (PagePrivate(page))
@@ -124,8 +128,7 @@ static int ceph_set_page_dirty(struct page *page)
         * PagePrivate so that we get invalidatepage callback.
         */
        BUG_ON(PagePrivate(page));
-       page->private = (unsigned long)snapc;
-       SetPagePrivate(page);
+       attach_page_private(page, snapc);
 
        ret = __set_page_dirty_nobuffers(page);
        WARN_ON(!PageLocked(page));
@@ -144,19 +147,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
 {
        struct inode *inode;
        struct ceph_inode_info *ci;
-       struct ceph_snap_context *snapc = page_snap_context(page);
+       struct ceph_snap_context *snapc;
+
+       wait_on_page_fscache(page);
 
        inode = page->mapping->host;
        ci = ceph_inode(inode);
 
-       if (offset != 0 || length != PAGE_SIZE) {
+       if (offset != 0 || length != thp_size(page)) {
                dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
                     inode, page, page->index, offset, length);
                return;
        }
 
-       ceph_invalidate_fscache_page(inode, page);
-
        WARN_ON(!PageLocked(page));
        if (!PagePrivate(page))
                return;
@@ -164,333 +167,222 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
        dout("%p invalidatepage %p idx %lu full dirty page\n",
             inode, page, page->index);
 
+       snapc = detach_page_private(page);
        ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
        ceph_put_snap_context(snapc);
-       page->private = 0;
-       ClearPagePrivate(page);
 }
 
-static int ceph_releasepage(struct page *page, gfp_t g)
+static int ceph_releasepage(struct page *page, gfp_t gfp)
 {
        dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
             page, page->index, PageDirty(page) ? "" : "not ");
 
-       /* Can we release the page from the cache? */
-       if (!ceph_release_fscache_page(page, g))
-               return 0;
-
+       if (PageFsCache(page)) {
+               if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
+                       return 0;
+               wait_on_page_fscache(page);
+       }
        return !PagePrivate(page);
 }
 
-/* read a single page, without unlocking it. */
-static int ceph_do_readpage(struct file *filp, struct page *page)
+static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
 {
-       struct inode *inode = file_inode(filp);
+       struct inode *inode = rreq->mapping->host;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_osd_client *osdc = &fsc->client->osdc;
-       struct ceph_osd_request *req;
-       struct ceph_vino vino = ceph_vino(inode);
-       int err = 0;
-       u64 off = page_offset(page);
-       u64 len = PAGE_SIZE;
-
-       if (off >= i_size_read(inode)) {
-               zero_user_segment(page, 0, PAGE_SIZE);
-               SetPageUptodate(page);
-               return 0;
-       }
-
-       if (ci->i_inline_version != CEPH_INLINE_NONE) {
-               /*
-                * Uptodate inline data should have been added
-                * into page cache while getting Fcr caps.
-                */
-               if (off == 0)
-                       return -EINVAL;
-               zero_user_segment(page, 0, PAGE_SIZE);
-               SetPageUptodate(page);
-               return 0;
-       }
-
-       err = ceph_readpage_from_fscache(inode, page);
-       if (err == 0)
-               return -EINPROGRESS;
-
-       dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
-            vino.ino, vino.snap, filp, off, len, page, page->index);
-       req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
-                                   CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
-                                   ci->i_truncate_seq, ci->i_truncate_size,
-                                   false);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
+       struct ceph_file_layout *lo = &ci->i_layout;
+       u32 blockoff;
+       u64 blockno;
 
-       osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+       /* Expand the start downward */
+       blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
+       rreq->start = blockno * lo->stripe_unit;
+       rreq->len += blockoff;
 
-       err = ceph_osdc_start_request(osdc, req, false);
-       if (!err)
-               err = ceph_osdc_wait_request(osdc, req);
-
-       ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
-                                req->r_end_latency, err);
-
-       ceph_osdc_put_request(req);
-       dout("readpage result %d\n", err);
-
-       if (err == -ENOENT)
-               err = 0;
-       if (err < 0) {
-               ceph_fscache_readpage_cancel(inode, page);
-               if (err == -EBLOCKLISTED)
-                       fsc->blocklisted = true;
-               goto out;
-       }
-       if (err < PAGE_SIZE)
-               /* zero fill remainder of page */
-               zero_user_segment(page, err, PAGE_SIZE);
-       else
-               flush_dcache_page(page);
-
-       SetPageUptodate(page);
-       ceph_readpage_to_fscache(inode, page);
-
-out:
-       return err < 0 ? err : 0;
+       /* Now, round up the length to the next block */
+       rreq->len = roundup(rreq->len, lo->stripe_unit);
 }
 
-static int ceph_readpage(struct file *filp, struct page *page)
+static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
 {
-       int r = ceph_do_readpage(filp, page);
-       if (r != -EINPROGRESS)
-               unlock_page(page);
-       else
-               r = 0;
-       return r;
+       struct inode *inode = subreq->rreq->mapping->host;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       u64 objno, objoff;
+       u32 xlen;
+
+       /* Truncate the extent at the end of the current block */
+       ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
+                                     &objno, &objoff, &xlen);
+       subreq->len = min(xlen, fsc->mount_options->rsize);
+       return true;
 }
 
-/*
- * Finish an async read(ahead) op.
- */
-static void finish_read(struct ceph_osd_request *req)
+static void finish_netfs_read(struct ceph_osd_request *req)
 {
-       struct inode *inode = req->r_inode;
-       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_osd_data *osd_data;
-       int rc = req->r_result <= 0 ? req->r_result : 0;
-       int bytes = req->r_result >= 0 ? req->r_result : 0;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
+       struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+       struct netfs_read_subrequest *subreq = req->r_priv;
        int num_pages;
-       int i;
+       int err = req->r_result;
 
-       dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
-       if (rc == -EBLOCKLISTED)
-               ceph_inode_to_client(inode)->blocklisted = true;
+       ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
+                                req->r_end_latency, err);
 
-       /* unlock all pages, zeroing any data we didn't read */
-       osd_data = osd_req_op_extent_osd_data(req, 0);
-       BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
-       num_pages = calc_pages_for((u64)osd_data->alignment,
-                                       (u64)osd_data->length);
-       for (i = 0; i < num_pages; i++) {
-               struct page *page = osd_data->pages[i];
-
-               if (rc < 0 && rc != -ENOENT) {
-                       ceph_fscache_readpage_cancel(inode, page);
-                       goto unlock;
-               }
-               if (bytes < (int)PAGE_SIZE) {
-                       /* zero (remainder of) page */
-                       int s = bytes < 0 ? 0 : bytes;
-                       zero_user_segment(page, s, PAGE_SIZE);
-               }
-               dout("finish_read %p uptodate %p idx %lu\n", inode, page,
-                    page->index);
-               flush_dcache_page(page);
-               SetPageUptodate(page);
-               ceph_readpage_to_fscache(inode, page);
-unlock:
-               unlock_page(page);
-               put_page(page);
-               bytes -= PAGE_SIZE;
-       }
+       dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
+            subreq->len, i_size_read(req->r_inode));
 
-       ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
-                                req->r_end_latency, rc);
+       /* no object means success but no data */
+       if (err == -ENOENT)
+               err = 0;
+       else if (err == -EBLOCKLISTED)
+               fsc->blocklisted = true;
+
+       if (err >= 0 && err < subreq->len)
+               __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+
+       netfs_subreq_terminated(subreq, err, true);
 
-       kfree(osd_data->pages);
+       num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
+       ceph_put_page_vector(osd_data->pages, num_pages, false);
+       iput(req->r_inode);
 }
 
-/*
- * start an async read(ahead) operation.  return nr_pages we submitted
- * a read for on success, or negative error code.
- */
-static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
-                     struct list_head *page_list, int max)
+static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
 {
-       struct ceph_osd_client *osdc =
-               &ceph_inode_to_client(inode)->client->osdc;
+       struct netfs_read_request *rreq = subreq->rreq;
+       struct inode *inode = rreq->mapping->host;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct page *page = lru_to_page(page_list);
-       struct ceph_vino vino;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_osd_request *req;
-       u64 off;
-       u64 len;
-       int i;
+       struct ceph_vino vino = ceph_vino(inode);
+       struct iov_iter iter;
        struct page **pages;
-       pgoff_t next_index;
-       int nr_pages = 0;
-       int got = 0;
-       int ret = 0;
-
-       if (!rw_ctx) {
-               /* caller of readpages does not hold buffer and read caps
-                * (fadvise, madvise and readahead cases) */
-               int want = CEPH_CAP_FILE_CACHE;
-               ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
-                                       true, &got);
-               if (ret < 0) {
-                       dout("start_read %p, error getting cap\n", inode);
-               } else if (!(got & want)) {
-                       dout("start_read %p, no cache cap\n", inode);
-                       ret = 0;
-               }
-               if (ret <= 0) {
-                       if (got)
-                               ceph_put_cap_refs(ci, got);
-                       while (!list_empty(page_list)) {
-                               page = lru_to_page(page_list);
-                               list_del(&page->lru);
-                               put_page(page);
-                       }
-                       return ret;
-               }
-       }
-
-       off = (u64) page_offset(page);
+       size_t page_off;
+       int err = 0;
+       u64 len = subreq->len;
 
-       /* count pages */
-       next_index = page->index;
-       list_for_each_entry_reverse(page, page_list, lru) {
-               if (page->index != next_index)
-                       break;
-               nr_pages++;
-               next_index++;
-               if (max && nr_pages == max)
-                       break;
-       }
-       len = nr_pages << PAGE_SHIFT;
-       dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
-            off, len);
-       vino = ceph_vino(inode);
-       req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
-                                   0, 1, CEPH_OSD_OP_READ,
-                                   CEPH_OSD_FLAG_READ, NULL,
-                                   ci->i_truncate_seq, ci->i_truncate_size,
-                                   false);
+       req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
+                       0, 1, CEPH_OSD_OP_READ,
+                       CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
+                       NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
        if (IS_ERR(req)) {
-               ret = PTR_ERR(req);
+               err = PTR_ERR(req);
+               req = NULL;
                goto out;
        }
 
-       /* build page vector */
-       nr_pages = calc_pages_for(0, len);
-       pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
-       if (!pages) {
-               ret = -ENOMEM;
-               goto out_put;
-       }
-       for (i = 0; i < nr_pages; ++i) {
-               page = list_entry(page_list->prev, struct page, lru);
-               BUG_ON(PageLocked(page));
-               list_del(&page->lru);
-
-               dout("start_read %p adding %p idx %lu\n", inode, page,
-                    page->index);
-               if (add_to_page_cache_lru(page, &inode->i_data, page->index,
-                                         GFP_KERNEL)) {
-                       ceph_fscache_uncache_page(inode, page);
-                       put_page(page);
-                       dout("start_read %p add_to_page_cache failed %p\n",
-                            inode, page);
-                       nr_pages = i;
-                       if (nr_pages > 0) {
-                               len = nr_pages << PAGE_SHIFT;
-                               osd_req_op_extent_update(req, 0, len);
-                               break;
-                       }
-                       goto out_pages;
-               }
-               pages[i] = page;
+       dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+       iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+       err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
+       if (err < 0) {
+               dout("%s: iov_iter_get_pages_alloc returned %d\n", __func__, err);
+               goto out;
        }
+
+       /* should always give us a page-aligned read */
+       WARN_ON_ONCE(page_off);
+       len = err;
+
        osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
-       req->r_callback = finish_read;
+       req->r_callback = finish_netfs_read;
+       req->r_priv = subreq;
        req->r_inode = inode;
+       ihold(inode);
 
-       dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
-       ret = ceph_osdc_start_request(osdc, req, false);
-       if (ret < 0)
-               goto out_pages;
+       err = ceph_osdc_start_request(req->r_osdc, req, false);
+       if (err)
+               iput(inode);
+out:
        ceph_osdc_put_request(req);
+       if (err)
+               netfs_subreq_terminated(subreq, err, false);
+       dout("%s: result %d\n", __func__, err);
+}
 
-       /* After adding locked pages to page cache, the inode holds cache cap.
-        * So we can drop our cap refs. */
-       if (got)
-               ceph_put_cap_refs(ci, got);
+static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file)
+{
+}
 
-       return nr_pages;
+static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
+{
+       struct inode *inode = mapping->host;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int got = (uintptr_t)priv;
 
-out_pages:
-       for (i = 0; i < nr_pages; ++i) {
-               ceph_fscache_readpage_cancel(inode, pages[i]);
-               unlock_page(pages[i]);
-       }
-       ceph_put_page_vector(pages, nr_pages, false);
-out_put:
-       ceph_osdc_put_request(req);
-out:
        if (got)
                ceph_put_cap_refs(ci, got);
-       return ret;
 }
 
+const struct netfs_read_request_ops ceph_netfs_read_ops = {
+       .init_rreq              = ceph_init_rreq,
+       .is_cache_enabled       = ceph_is_cache_enabled,
+       .begin_cache_operation  = ceph_begin_cache_operation,
+       .issue_op               = ceph_netfs_issue_op,
+       .expand_readahead       = ceph_netfs_expand_readahead,
+       .clamp_length           = ceph_netfs_clamp_length,
+       .check_write_begin      = ceph_netfs_check_write_begin,
+       .cleanup                = ceph_readahead_cleanup,
+};
 
-/*
- * Read multiple pages.  Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
-                         struct list_head *page_list, unsigned nr_pages)
+/* read a single page, without unlocking it. */
+static int ceph_readpage(struct file *file, struct page *page)
 {
        struct inode *inode = file_inode(file);
-       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_file_info *fi = file->private_data;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_vino vino = ceph_vino(inode);
+       u64 off = page_offset(page);
+       u64 len = thp_size(page);
+
+       if (ci->i_inline_version != CEPH_INLINE_NONE) {
+               /*
+                * Uptodate inline data should have been added
+                * into page cache while getting Fcr caps.
+                */
+               if (off == 0) {
+                       unlock_page(page);
+                       return -EINVAL;
+               }
+               zero_user_segment(page, 0, thp_size(page));
+               SetPageUptodate(page);
+               unlock_page(page);
+               return 0;
+       }
+
+       dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
+            vino.ino, vino.snap, file, off, len, page, page->index);
+
+       return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL);
+}
+
+static void ceph_readahead(struct readahead_control *ractl)
+{
+       struct inode *inode = file_inode(ractl->file);
+       struct ceph_file_info *fi = ractl->file->private_data;
        struct ceph_rw_context *rw_ctx;
-       int rc = 0;
-       int max = 0;
+       int got = 0;
+       int ret = 0;
 
        if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
-               return -EINVAL;
+               return;
 
-       rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
-                                        &nr_pages);
+       rw_ctx = ceph_find_rw_context(fi);
+       if (!rw_ctx) {
+               /*
+                * readahead callers do not necessarily hold Fcb caps
+                * (e.g. fadvise, madvise).
+                */
+               int want = CEPH_CAP_FILE_CACHE;
 
-       if (rc == 0)
-               goto out;
+               ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+               if (ret < 0)
+                       dout("start_read %p, error getting cap\n", inode);
+               else if (!(got & want))
+                       dout("start_read %p, no cache cap\n", inode);
 
-       rw_ctx = ceph_find_rw_context(fi);
-       max = fsc->mount_options->rsize >> PAGE_SHIFT;
-       dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
-            inode, file, rw_ctx, nr_pages, max);
-       while (!list_empty(page_list)) {
-               rc = start_read(inode, rw_ctx, page_list, max);
-               if (rc < 0)
-                       goto out;
+               if (ret <= 0)
+                       return;
        }
-out:
-       ceph_fscache_readpages_cancel(inode, page_list);
-
-       dout("readpages %p file %p ret %d\n", inode, file, rc);
-       return rc;
+       netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got);
 }
 
 struct ceph_writeback_ctl
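
ceph's expand_readahead hook rounds the netfs request outward to RADOS stripe-unit boundaries: div_u64_rem() gives the offset into the current stripe unit, the start is pushed down by that amount, the length absorbs the slack, and roundup() takes the length to the next boundary. The same arithmetic as a standalone program with made-up sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* Expand [start, start + len) outward to multiples of `unit`. */
    static void demo_expand(uint64_t *start, uint64_t *len, uint64_t unit)
    {
            uint64_t off = *start % unit;   /* kernel uses div_u64_rem() */

            *start -= off;                  /* push start down to a boundary */
            *len += off;                    /* still cover the original range */
            *len = (*len + unit - 1) / unit * unit; /* roundup() */
    }

    int main(void)
    {
            uint64_t start = 4096 * 3 + 100, len = 5000, unit = 65536;

            demo_expand(&start, &len, unit);
            printf("start=%llu len=%llu\n", (unsigned long long)start,
                   (unsigned long long)len); /* start=0 len=65536 */
            return 0;
    }

Aligning to stripe units means each netfs subrequest can then be clamped (ceph_netfs_clamp_length) so it never crosses a RADOS object boundary.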
@@ -585,8 +477,8 @@ static u64 get_writepages_data_length(struct inode *inode,
                spin_unlock(&ci->i_ceph_lock);
                WARN_ON(!found);
        }
-       if (end > page_offset(page) + PAGE_SIZE)
-               end = page_offset(page) + PAGE_SIZE;
+       if (end > page_offset(page) + thp_size(page))
+               end = page_offset(page) + thp_size(page);
        return end > start ? end - start : 0;
 }
 
@@ -604,7 +496,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        struct ceph_snap_context *snapc, *oldest;
        loff_t page_off = page_offset(page);
        int err;
-       loff_t len = PAGE_SIZE;
+       loff_t len = thp_size(page);
        struct ceph_writeback_ctl ceph_wbc;
        struct ceph_osd_client *osdc = &fsc->client->osdc;
        struct ceph_osd_request *req;
@@ -632,7 +524,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        /* is this a partial page at end of file? */
        if (page_off >= ceph_wbc.i_size) {
                dout("%p page eof %llu\n", page, ceph_wbc.i_size);
-               page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
+               page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
                return 0;
        }
 
@@ -658,7 +550,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        }
 
        /* it may be a short write due to an object boundary */
-       WARN_ON_ONCE(len > PAGE_SIZE);
+       WARN_ON_ONCE(len > thp_size(page));
        osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
        dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
 
@@ -667,7 +559,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        if (!err)
                err = ceph_osdc_wait_request(osdc, req);
 
-       ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+       ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
                                  req->r_end_latency, err);
 
        ceph_osdc_put_request(req);
@@ -695,8 +587,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
                dout("writepage cleaned page %p\n", page);
                err = 0;  /* vfs expects us to return 0 */
        }
-       page->private = 0;
-       ClearPagePrivate(page);
+       oldest = detach_page_private(page);
+       WARN_ON_ONCE(oldest != snapc);
        end_page_writeback(page);
        ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
        ceph_put_snap_context(snapc);  /* page's reference */
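detach_page_private() replaces the open-coded teardown of page->private: it hands back the snap context that was attached to the page (so it can be sanity-checked against the local snapc) and drops the page reference that attach_page_private() took. For reference, the generic helper of this era is roughly:

static inline void *detach_page_private(struct page *page)
{
        void *data = (void *)page_private(page);

        if (!PagePrivate(page))
                return NULL;
        ClearPagePrivate(page);
        set_page_private(page, 0);
        put_page(page);         /* pairs with get_page() in attach_page_private() */

        return data;
}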
@@ -755,7 +647,7 @@ static void writepages_finish(struct ceph_osd_request *req)
                ceph_clear_error_write(ci);
        }
 
-       ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+       ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
                                  req->r_end_latency, rc);
 
        /*
@@ -788,11 +680,9 @@ static void writepages_finish(struct ceph_osd_request *req)
                                clear_bdi_congested(inode_to_bdi(inode),
                                                    BLK_RW_ASYNC);
 
-                       ceph_put_snap_context(page_snap_context(page));
-                       page->private = 0;
-                       ClearPagePrivate(page);
-                       dout("unlocking %p\n", page);
+                       ceph_put_snap_context(detach_page_private(page));
                        end_page_writeback(page);
+                       dout("unlocking %p\n", page);
 
                        if (remove_page)
                                generic_error_remove_page(inode->i_mapping,
@@ -949,7 +839,7 @@ get_more_pages:
                                    page_offset(page) >= i_size_read(inode)) &&
                                    clear_page_dirty_for_io(page))
                                        mapping->a_ops->invalidatepage(page,
-                                                               0, PAGE_SIZE);
+                                                               0, thp_size(page));
                                unlock_page(page);
                                continue;
                        }
@@ -1038,7 +928,7 @@ get_more_pages:
                        pages[locked_pages++] = page;
                        pvec.pages[i] = NULL;
 
-                       len += PAGE_SIZE;
+                       len += thp_size(page);
                }
 
                /* did we get anything? */
@@ -1087,7 +977,7 @@ new_request:
                        BUG_ON(IS_ERR(req));
                }
                BUG_ON(len < page_offset(pages[locked_pages - 1]) +
-                            PAGE_SIZE - offset);
+                            thp_size(page) - offset);
 
                req->r_callback = writepages_finish;
                req->r_inode = inode;
@@ -1117,7 +1007,7 @@ new_request:
                        }
 
                        set_page_writeback(pages[i]);
-                       len += PAGE_SIZE;
+                       len += thp_size(page);
                }
 
                if (ceph_wbc.size_stable) {
@@ -1126,7 +1016,7 @@ new_request:
                        /* writepages_finish() clears writeback pages
                         * according to the data length, so make sure
                         * data length covers all locked pages */
-                       u64 min_len = len + 1 - PAGE_SIZE;
+                       u64 min_len = len + 1 - thp_size(page);
                        len = get_writepages_data_length(inode, pages[i - 1],
                                                         offset);
                        len = max(len, min_len);
@@ -1302,6 +1192,31 @@ ceph_find_incompatible(struct page *page)
        return NULL;
 }
 
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+                                       struct page *page, void **_fsdata)
+{
+       struct inode *inode = file_inode(file);
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_snap_context *snapc;
+
+       snapc = ceph_find_incompatible(page);
+       if (snapc) {
+               int r;
+
+               unlock_page(page);
+               put_page(page);
+               if (IS_ERR(snapc))
+                       return PTR_ERR(snapc);
+
+               ceph_queue_writeback(inode);
+               r = wait_event_killable(ci->i_cap_wq,
+                                       context_is_writeable_or_written(inode, snapc));
+               ceph_put_snap_context(snapc);
+               return r == 0 ? -EAGAIN : r;
+       }
+       return 0;
+}
+
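The -EAGAIN return is significant: netfs_write_begin() interprets it as "the hook unlocked and released the page, retry from the top", which is exactly what the snapc-conflict path above does after waiting for writeback. Schematically, the caller side looks like this (a sketch of the retry contract, not the netfs code verbatim):

        for (;;) {
                page = grab_cache_page_write_begin(mapping, index, flags);
                if (!page)
                        return -ENOMEM;

                ret = ops->check_write_begin(file, pos, len, page, &fsdata);
                if (ret != -EAGAIN)
                        break;  /* 0: proceed with the write; <0: hard error */
                /* the hook dropped the page lock and reference; regrab */
        }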
 /*
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
@@ -1312,75 +1227,47 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 {
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_snap_context *snapc;
        struct page *page = NULL;
        pgoff_t index = pos >> PAGE_SHIFT;
-       int pos_in_page = pos & ~PAGE_MASK;
-       int r = 0;
+       int r;
 
-       dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
-
-       for (;;) {
+       /*
+        * Uninlining should have already been done and everything updated, EXCEPT
+        * for inline_version sent to the MDS.
+        */
+       if (ci->i_inline_version != CEPH_INLINE_NONE) {
                page = grab_cache_page_write_begin(mapping, index, flags);
-               if (!page) {
-                       r = -ENOMEM;
-                       break;
-               }
-
-               snapc = ceph_find_incompatible(page);
-               if (snapc) {
-                       if (IS_ERR(snapc)) {
-                               r = PTR_ERR(snapc);
-                               break;
-                       }
-                       unlock_page(page);
-                       put_page(page);
-                       page = NULL;
-                       ceph_queue_writeback(inode);
-                       r = wait_event_killable(ci->i_cap_wq,
-                                               context_is_writeable_or_written(inode, snapc));
-                       ceph_put_snap_context(snapc);
-                       if (r != 0)
-                               break;
-                       continue;
-               }
-
-               if (PageUptodate(page)) {
-                       dout(" page %p already uptodate\n", page);
-                       break;
-               }
+               if (!page)
+                       return -ENOMEM;
 
                /*
-                * In some cases we don't need to read at all:
-                * - full page write
-                * - write that lies completely beyond EOF
-                * - write that covers the page from start to EOF or beyond it
+                * The inline_version on a new inode is set to 1. If that's the
+                * case, then the page is brand new and isn't yet Uptodate.
                 */
-               if ((pos_in_page == 0 && len == PAGE_SIZE) ||
-                   (pos >= i_size_read(inode)) ||
-                   (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
-                       zero_user_segments(page, 0, pos_in_page,
-                                          pos_in_page + len, PAGE_SIZE);
-                       break;
+               r = 0;
+               if (index == 0 && ci->i_inline_version != 1) {
+                       if (!PageUptodate(page)) {
+                               WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
+                                         ci->i_inline_version);
+                               r = -EINVAL;
+                       }
+                       goto out;
                }
-
-               /*
-                * We need to read it. If we get back -EINPROGRESS, then the page was
-                * handed off to fscache and it will be unlocked when the read completes.
-                * Refind the page in that case so we can reacquire the page lock. Otherwise
-                * we got a hard error or the read was completed synchronously.
-                */
-               r = ceph_do_readpage(file, page);
-               if (r != -EINPROGRESS)
-                       break;
+               zero_user_segment(page, 0, thp_size(page));
+               SetPageUptodate(page);
+               goto out;
        }
 
+       r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL,
+                             &ceph_netfs_read_ops, NULL);
+out:
+       if (r == 0)
+               wait_on_page_fscache(page);
        if (r < 0) {
-               if (page) {
-                       unlock_page(page);
+               if (page)
                        put_page(page);
-               }
        } else {
+               WARN_ON_ONCE(!PageLocked(page));
                *pagep = page;
        }
        return r;
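Two details worth noting in the converted ceph_write_begin(): netfs_write_begin() returns the page locked with a reference held, which is what the WARN_ON_ONCE(!PageLocked(page)) asserts before the page is handed back in *pagep, and wait_on_page_fscache() blocks while PG_fscache is set so the caller cannot modify a page that the cache is still writing out.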
@@ -1438,7 +1325,7 @@ static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
 
 const struct address_space_operations ceph_aops = {
        .readpage = ceph_readpage,
-       .readpages = ceph_readpages,
+       .readahead = ceph_readahead,
        .writepage = ceph_writepage,
        .writepages = ceph_writepages_start,
        .write_begin = ceph_write_begin,
@@ -1470,7 +1357,6 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
        struct inode *inode = file_inode(vma->vm_file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_file_info *fi = vma->vm_file->private_data;
-       struct page *pinned_page = NULL;
        loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
        int want, got, err;
        sigset_t oldset;
@@ -1478,21 +1364,20 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
 
        ceph_block_sigs(&oldset);
 
-       dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
-            inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
+       dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
+            inode, ceph_vinop(inode), off);
        if (fi->fmode & CEPH_FILE_MODE_LAZY)
                want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_CACHE;
 
        got = 0;
-       err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
-                           &got, &pinned_page);
+       err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
        if (err < 0)
                goto out_restore;
 
-       dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
-            inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
+       dout("filemap_fault %p %llu got cap refs on %s\n",
+            inode, off, ceph_cap_string(got));
 
        if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
            ci->i_inline_version == CEPH_INLINE_NONE) {
@@ -1500,14 +1385,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
                ceph_add_rw_context(fi, &rw_ctx);
                ret = filemap_fault(vmf);
                ceph_del_rw_context(fi, &rw_ctx);
-               dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n",
-                       inode, off, (size_t)PAGE_SIZE,
-                               ceph_cap_string(got), ret);
+               dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
+                    inode, off, ceph_cap_string(got), ret);
        } else
                err = -EAGAIN;
 
-       if (pinned_page)
-               put_page(pinned_page);
        ceph_put_cap_refs(ci, got);
 
        if (err != -EAGAIN)
@@ -1542,8 +1424,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
                vmf->page = page;
                ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
 out_inline:
-               dout("filemap_fault %p %llu~%zd read inline data ret %x\n",
-                    inode, off, (size_t)PAGE_SIZE, ret);
+               dout("filemap_fault %p %llu read inline data ret %x\n",
+                    inode, off, ret);
        }
 out_restore:
        ceph_restore_sigs(&oldset);
@@ -1553,9 +1435,6 @@ out_restore:
        return ret;
 }
 
-/*
- * Reuse write_begin here for simplicity.
- */
 static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
@@ -1591,10 +1470,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
                        goto out_free;
        }
 
-       if (off + PAGE_SIZE <= size)
-               len = PAGE_SIZE;
+       if (off + thp_size(page) <= size)
+               len = thp_size(page);
        else
-               len = size & ~PAGE_MASK;
+               len = offset_in_thp(page, size);
 
        dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
             inode, ceph_vinop(inode), off, len, size);
@@ -1604,8 +1483,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
                want = CEPH_CAP_FILE_BUFFER;
 
        got = 0;
-       err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
-                           &got, NULL);
+       err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
        if (err < 0)
                goto out_free;
 
@@ -1832,7 +1710,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
        if (!err)
                err = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
-       ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+       ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
                                  req->r_end_latency, err);
 
 out_put:
@@ -2057,6 +1935,10 @@ int ceph_pool_perm_check(struct inode *inode, int need)
        s64 pool;
        int ret, flags;
 
+       /* Only need to do this for regular files */
+       if (!S_ISREG(inode->i_mode))
+               return 0;
+
        if (ci->i_vino.snap != CEPH_NOSNAP) {
                /*
                 * Pool permission check needs to write to the first object.
index 2f5cb6b..9cfadbb 100644 (file)
@@ -173,7 +173,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
 
        ci->fscache = NULL;
 
-       fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
        fscache_relinquish_cookie(cookie, &ci->i_vino, false);
 }
 
@@ -194,7 +193,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
                dout("fscache_file_set_cookie %p %p disabling cache\n",
                     inode, filp);
                fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
-               fscache_uncache_all_inode_pages(ci->fscache, inode);
        } else {
                fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
                                      ceph_fscache_can_enable, inode);
@@ -205,108 +203,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
        }
 }
 
-static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error)
-{
-       if (!error)
-               SetPageUptodate(page);
-
-       unlock_page(page);
-}
-
-static inline bool cache_valid(struct ceph_inode_info *ci)
-{
-       return ci->i_fscache_gen == ci->i_rdcache_gen;
-}
-
-
-/* Attempt to read from the fscache,
- *
- * This function is called from the readpage_nounlock context. DO NOT attempt to
- * unlock the page here (or in the callback).
- */
-int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       int ret;
-
-       if (!cache_valid(ci))
-               return -ENOBUFS;
-
-       ret = fscache_read_or_alloc_page(ci->fscache, page,
-                                        ceph_readpage_from_fscache_complete, NULL,
-                                        GFP_KERNEL);
-
-       switch (ret) {
-               case 0: /* Page found */
-                       dout("page read submitted\n");
-                       return 0;
-               case -ENOBUFS: /* Pages were not found, and can't be */
-               case -ENODATA: /* Pages were not found */
-                       dout("page/inode not in cache\n");
-                       return ret;
-               default:
-                       dout("%s: unknown error ret = %i\n", __func__, ret);
-                       return ret;
-       }
-}
-
-int ceph_readpages_from_fscache(struct inode *inode,
-                                 struct address_space *mapping,
-                                 struct list_head *pages,
-                                 unsigned *nr_pages)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       int ret;
-
-       if (!cache_valid(ci))
-               return -ENOBUFS;
-
-       ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
-                                         ceph_readpage_from_fscache_complete,
-                                         NULL, mapping_gfp_mask(mapping));
-
-       switch (ret) {
-               case 0: /* All pages found */
-                       dout("all-page read submitted\n");
-                       return 0;
-               case -ENOBUFS: /* Some pages were not found, and can't be */
-               case -ENODATA: /* some pages were not found */
-                       dout("page/inode not in cache\n");
-                       return ret;
-               default:
-                       dout("%s: unknown error ret = %i\n", __func__, ret);
-                       return ret;
-       }
-}
-
-void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       int ret;
-
-       if (!PageFsCache(page))
-               return;
-
-       if (!cache_valid(ci))
-               return;
-
-       ret = fscache_write_page(ci->fscache, page, i_size_read(inode),
-                                GFP_KERNEL);
-       if (ret)
-                fscache_uncache_page(ci->fscache, page);
-}
-
-void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-
-       if (!PageFsCache(page))
-               return;
-
-       fscache_wait_on_page_write(ci->fscache, page);
-       fscache_uncache_page(ci->fscache, page);
-}
-
 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
 {
        if (fscache_cookie_valid(fsc->fscache)) {
@@ -329,24 +225,3 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
        }
        fsc->fscache = NULL;
 }
-
-/*
- * caller should hold CEPH_CAP_FILE_{RD,CACHE}
- */
-void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
-{
-       if (cache_valid(ci))
-               return;
-
-       /* reuse i_truncate_mutex. There should be no pending
-        * truncate while the caller holds CEPH_CAP_FILE_RD */
-       mutex_lock(&ci->i_truncate_mutex);
-       if (!cache_valid(ci)) {
-               if (fscache_check_consistency(ci->fscache, &ci->i_vino))
-                       fscache_invalidate(ci->fscache);
-               spin_lock(&ci->i_ceph_lock);
-               ci->i_fscache_gen = ci->i_rdcache_gen;
-               spin_unlock(&ci->i_ceph_lock);
-       }
-       mutex_unlock(&ci->i_truncate_mutex);
-}
index 89dbdd1..1409d61 100644 (file)
@@ -9,6 +9,8 @@
 #ifndef _CEPH_CACHE_H
 #define _CEPH_CACHE_H
 
+#include <linux/netfs.h>
+
 #ifdef CONFIG_CEPH_FSCACHE
 
 extern struct fscache_netfs ceph_cache_netfs;
@@ -29,54 +31,37 @@ int ceph_readpages_from_fscache(struct inode *inode,
                                struct address_space *mapping,
                                struct list_head *pages,
                                unsigned *nr_pages);
-void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
-void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
 
 static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
 {
        ci->fscache = NULL;
-       ci->i_fscache_gen = 0;
 }
 
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
 {
-       fscache_invalidate(ceph_inode(inode)->fscache);
+       return ci->fscache;
 }
 
-static inline void ceph_fscache_uncache_page(struct inode *inode,
-                                            struct page *page)
+static inline void ceph_fscache_invalidate(struct inode *inode)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       return fscache_uncache_page(ci->fscache, page);
+       fscache_invalidate(ceph_inode(inode)->fscache);
 }
 
-static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
 {
-       struct inode* inode = page->mapping->host;
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       return fscache_maybe_release_page(ci->fscache, page, gfp);
-}
+       struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode));
 
-static inline void ceph_fscache_readpage_cancel(struct inode *inode,
-                                               struct page *page)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
-               __fscache_uncache_page(ci->fscache, page);
+       if (!cookie)
+               return false;
+       return fscache_cookie_enabled(cookie);
 }
 
-static inline void ceph_fscache_readpages_cancel(struct inode *inode,
-                                                struct list_head *pages)
+static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       return fscache_readpages_cancel(ci->fscache, pages);
-}
+       struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
 
-static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
-{
-       ci->i_fscache_gen = ci->i_rdcache_gen - 1;
+       return fscache_begin_read_operation(rreq, cookie);
 }
-
 #else
 
 static inline int ceph_fscache_register(void)
@@ -102,6 +87,11 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
 {
 }
 
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+{
+       return NULL;
+}
+
 static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
 {
 }
@@ -115,62 +105,19 @@ static inline void ceph_fscache_file_set_cookie(struct inode *inode,
 {
 }
 
-static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
-{
-}
-
-static inline void ceph_fscache_uncache_page(struct inode *inode,
-                                            struct page *pages)
-{
-}
-
-static inline int ceph_readpage_from_fscache(struct inode* inode,
-                                            struct page *page)
-{
-       return -ENOBUFS;
-}
-
-static inline int ceph_readpages_from_fscache(struct inode *inode,
-                                             struct address_space *mapping,
-                                             struct list_head *pages,
-                                             unsigned *nr_pages)
-{
-       return -ENOBUFS;
-}
-
-static inline void ceph_readpage_to_fscache(struct inode *inode,
-                                           struct page *page)
-{
-}
-
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
 }
 
-static inline void ceph_invalidate_fscache_page(struct inode *inode,
-                                               struct page *page)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
 {
+       return false;
 }
 
-static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
-{
-       return 1;
-}
-
-static inline void ceph_fscache_readpage_cancel(struct inode *inode,
-                                               struct page *page)
-{
-}
-
-static inline void ceph_fscache_readpages_cancel(struct inode *inode,
-                                                struct list_head *pages)
-{
-}
-
-static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
 {
+       return -ENOBUFS;
 }
-
 #endif
 
-#endif
+#endif /* _CEPH_CACHE_H */
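For context, ceph_is_cache_enabled() and ceph_begin_cache_operation() are not called directly by ceph; they are consumed through the netfs ops table referenced by the addr.c hunks above. The wiring in the full series looks approximately like this (hook names per the 5.13 netfs API):

static const struct netfs_read_request_ops ceph_netfs_read_ops = {
        .init_rreq              = ceph_init_rreq,
        .is_cache_enabled       = ceph_is_cache_enabled,
        .begin_cache_operation  = ceph_begin_cache_operation,
        .issue_op               = ceph_netfs_issue_op,
        .expand_readahead       = ceph_netfs_expand_readahead,
        .clamp_length           = ceph_netfs_clamp_length,
        .check_write_begin      = ceph_netfs_check_write_begin,
        .cleanup                = ceph_readahead_cleanup,
};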
index 3c03fa3..a5e93b1 100644 (file)
@@ -1390,7 +1390,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
        arg->flush_tid = flush_tid;
        arg->oldest_flush_tid = oldest_flush_tid;
 
-       arg->size = inode->i_size;
+       arg->size = i_size_read(inode);
        ci->i_reported_size = arg->size;
        arg->max_size = ci->i_wanted_max_size;
        if (cap == ci->i_auth_cap) {
@@ -1867,6 +1867,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
        u32 invalidating_gen = ci->i_rdcache_gen;
 
        spin_unlock(&ci->i_ceph_lock);
+       ceph_fscache_invalidate(inode);
        invalidate_mapping_pages(&inode->i_data, 0, -1);
        spin_lock(&ci->i_ceph_lock);
 
@@ -1884,7 +1885,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
 
 bool __ceph_should_report_size(struct ceph_inode_info *ci)
 {
-       loff_t size = ci->vfs_inode.i_size;
+       loff_t size = i_size_read(&ci->vfs_inode);
        /* mds will adjust max size according to the reported size */
        if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
                return false;
@@ -2730,10 +2731,6 @@ again:
                                *got = need | want;
                        else
                                *got = need;
-                       if (S_ISREG(inode->i_mode) &&
-                           (need & CEPH_CAP_FILE_RD) &&
-                           !(*got & CEPH_CAP_FILE_CACHE))
-                               ceph_disable_fscache_readpage(ci);
                        ceph_take_cap_refs(ci, *got, true);
                        ret = 1;
                }
@@ -2858,8 +2855,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
  * due to a small max_size, make sure we check_max_size (and possibly
  * ask the mds) so we don't get hung up indefinitely.
  */
-int ceph_get_caps(struct file *filp, int need, int want,
-                 loff_t endoff, int *got, struct page **pinned_page)
+int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got)
 {
        struct ceph_file_info *fi = filp->private_data;
        struct inode *inode = file_inode(filp);
@@ -2957,11 +2953,11 @@ int ceph_get_caps(struct file *filp, int need, int want,
                        struct page *page =
                                find_get_page(inode->i_mapping, 0);
                        if (page) {
-                               if (PageUptodate(page)) {
-                                       *pinned_page = page;
-                                       break;
-                               }
+                               bool uptodate = PageUptodate(page);
+
                                put_page(page);
+                               if (uptodate)
+                                       break;
                        }
                        /*
                         * drop cap refs first because getattr while
@@ -2983,11 +2979,6 @@ int ceph_get_caps(struct file *filp, int need, int want,
                }
                break;
        }
-
-       if (S_ISREG(ci->vfs_inode.i_mode) &&
-           (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
-               ceph_fscache_revalidate_cookie(ci);
-
        *got = _got;
        return 0;
 }
@@ -3308,7 +3299,7 @@ static void handle_cap_grant(struct inode *inode,
        dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
             inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
        dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
-               inode->i_size);
+               i_size_read(inode));
 
 
        /*
index 66989c8..425f335 100644 (file)
@@ -162,34 +162,34 @@ static int metric_show(struct seq_file *s, void *p)
        seq_printf(s, "item          total       avg_lat(us)     min_lat(us)     max_lat(us)     stdev(us)\n");
        seq_printf(s, "-----------------------------------------------------------------------------------\n");
 
-       spin_lock(&m->read_latency_lock);
+       spin_lock(&m->read_metric_lock);
        total = m->total_reads;
        sum = m->read_latency_sum;
        avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
        min = m->read_latency_min;
        max = m->read_latency_max;
        sq = m->read_latency_sq_sum;
-       spin_unlock(&m->read_latency_lock);
+       spin_unlock(&m->read_metric_lock);
        CEPH_METRIC_SHOW("read", total, avg, min, max, sq);
 
-       spin_lock(&m->write_latency_lock);
+       spin_lock(&m->write_metric_lock);
        total = m->total_writes;
        sum = m->write_latency_sum;
        avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
        min = m->write_latency_min;
        max = m->write_latency_max;
        sq = m->write_latency_sq_sum;
-       spin_unlock(&m->write_latency_lock);
+       spin_unlock(&m->write_metric_lock);
        CEPH_METRIC_SHOW("write", total, avg, min, max, sq);
 
-       spin_lock(&m->metadata_latency_lock);
+       spin_lock(&m->metadata_metric_lock);
        total = m->total_metadatas;
        sum = m->metadata_latency_sum;
        avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
        min = m->metadata_latency_min;
        max = m->metadata_latency_max;
        sq = m->metadata_latency_sq_sum;
-       spin_unlock(&m->metadata_latency_lock);
+       spin_unlock(&m->metadata_metric_lock);
        CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq);
 
        seq_printf(s, "\n");
index f7a790e..5624fae 100644 (file)
@@ -631,10 +631,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
        switch (whence) {
        case SEEK_CUR:
                offset += file->f_pos;
+               break;
        case SEEK_SET:
                break;
        case SEEK_END:
                retval = -EOPNOTSUPP;
+               goto out;
        default:
                goto out;
        }
@@ -665,8 +667,8 @@ out:
 /*
  * Handle lookups for the hidden .snap directory.
  */
-int ceph_handle_snapdir(struct ceph_mds_request *req,
-                       struct dentry *dentry, int err)
+struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
+                                  struct dentry *dentry, int err)
 {
        struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
        struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
@@ -674,18 +676,17 @@ int ceph_handle_snapdir(struct ceph_mds_request *req,
        /* .snap dir? */
        if (err == -ENOENT &&
            ceph_snap(parent) == CEPH_NOSNAP &&
-           strcmp(dentry->d_name.name,
-                  fsc->mount_options->snapdir_name) == 0) {
+           strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) {
+               struct dentry *res;
                struct inode *inode = ceph_get_snapdir(parent);
-               if (IS_ERR(inode))
-                       return PTR_ERR(inode);
-               dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
-                    dentry, dentry, inode);
-               BUG_ON(!d_unhashed(dentry));
-               d_add(dentry, inode);
-               err = 0;
+
+               res = d_splice_alias(inode, dentry);
+               dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
+                    dentry, dentry, inode, res);
+               if (res)
+                       dentry = res;
        }
-       return err;
+       return dentry;
 }
 
 /*
@@ -741,6 +742,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
        struct ceph_mds_request *req;
+       struct dentry *res;
        int op;
        int mask;
        int err;
@@ -791,7 +793,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
        req->r_parent = dir;
        set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc, NULL, req);
-       err = ceph_handle_snapdir(req, dentry, err);
+       res = ceph_handle_snapdir(req, dentry, err);
+       if (IS_ERR(res)) {
+               err = PTR_ERR(res);
+       } else {
+               dentry = res;
+               err = 0;
+       }
        dentry = ceph_finish_lookup(req, dentry, err);
        ceph_mdsc_put_request(req);  /* will dput(dentry) */
        dout("lookup result=%p\n", dentry);
index f22156e..65540a4 100644 (file)
@@ -129,6 +129,10 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
 
        vino.ino = ino;
        vino.snap = CEPH_NOSNAP;
+
+       if (ceph_vino_is_reserved(vino))
+               return ERR_PTR(-ESTALE);
+
        inode = ceph_find_inode(sb, vino);
        if (!inode) {
                struct ceph_mds_request *req;
@@ -178,8 +182,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
                return ERR_CAST(inode);
        /* We need LINK caps to reliably check i_nlink */
        err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false);
-       if (err)
+       if (err) {
+               iput(inode);
                return ERR_PTR(err);
+       }
        /* -ESTALE if inode as been unlinked and no file is open */
        if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) {
                iput(inode);
@@ -212,6 +218,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
                vino.ino = sfh->ino;
                vino.snap = sfh->snapid;
        }
+
+       if (ceph_vino_is_reserved(vino))
+               return ERR_PTR(-ESTALE);
+
        inode = ceph_find_inode(sb, vino);
        if (inode)
                return d_obtain_alias(inode);
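Both NFS export paths now reject file handles that decode to a reserved inode number before touching the inode cache, returning -ESTALE as for any unresolvable handle. ceph_vino_is_reserved() is defined in super.h (whose tail appears at the end of this diff); it is presumably a range check along these lines (sketch):

static inline bool ceph_vino_is_reserved(const struct ceph_vino vino)
{
        if (vino.ino < CEPH_INO_SYSTEM_BASE &&
            vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) {
                WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx",
                               vino.ino);
                return true;
        }
        return false;
}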
index 209535d..77fc037 100644 (file)
@@ -739,9 +739,12 @@ retry:
        err = ceph_mdsc_do_request(mdsc,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
                                   req);
-       err = ceph_handle_snapdir(req, dentry, err);
-       if (err)
+       dentry = ceph_handle_snapdir(req, dentry, err);
+       if (IS_ERR(dentry)) {
+               err = PTR_ERR(dentry);
                goto out_req;
+       }
+       err = 0;
 
        if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
@@ -892,7 +895,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
                if (!ret)
                        ret = ceph_osdc_wait_request(osdc, req);
 
-               ceph_update_read_latency(&fsc->mdsc->metric,
+               ceph_update_read_metrics(&fsc->mdsc->metric,
                                         req->r_start_latency,
                                         req->r_end_latency,
                                         ret);
@@ -1034,16 +1037,6 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
        dout("ceph_aio_complete_req %p rc %d bytes %u\n",
             inode, rc, osd_data->bvec_pos.iter.bi_size);
 
-       /* r_start_latency == 0 means the request was not submitted */
-       if (req->r_start_latency) {
-               if (aio_req->write)
-                       ceph_update_write_latency(metric, req->r_start_latency,
-                                                 req->r_end_latency, rc);
-               else
-                       ceph_update_read_latency(metric, req->r_start_latency,
-                                                req->r_end_latency, rc);
-       }
-
        if (rc == -EOLDSNAPC) {
                struct ceph_aio_work *aio_work;
                BUG_ON(!aio_req->write);
@@ -1086,6 +1079,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
                }
        }
 
+       /* r_start_latency == 0 means the request was not submitted */
+       if (req->r_start_latency) {
+               if (aio_req->write)
+                       ceph_update_write_metrics(metric, req->r_start_latency,
+                                                 req->r_end_latency, rc);
+               else
+                       ceph_update_read_metrics(metric, req->r_start_latency,
+                                                req->r_end_latency, rc);
+       }
+
        put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
                  aio_req->should_dirty);
        ceph_osdc_put_request(req);
@@ -1290,10 +1293,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                        ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
                if (write)
-                       ceph_update_write_latency(metric, req->r_start_latency,
+                       ceph_update_write_metrics(metric, req->r_start_latency,
                                                  req->r_end_latency, ret);
                else
-                       ceph_update_read_latency(metric, req->r_start_latency,
+                       ceph_update_read_metrics(metric, req->r_start_latency,
                                                 req->r_end_latency, ret);
 
                size = i_size_read(inode);
@@ -1467,7 +1470,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                if (!ret)
                        ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
-               ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+               ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
                                          req->r_end_latency, ret);
 out:
                ceph_osdc_put_request(req);
@@ -1510,7 +1513,6 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
        size_t len = iov_iter_count(to);
        struct inode *inode = file_inode(filp);
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct page *pinned_page = NULL;
        bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
        ssize_t ret;
        int want, got = 0;
@@ -1529,8 +1531,7 @@ again:
                want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_CACHE;
-       ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
-                           &got, &pinned_page);
+       ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
        if (ret < 0) {
                if (iocb->ki_flags & IOCB_DIRECT)
                        ceph_end_io_direct(inode);
@@ -1571,10 +1572,6 @@ again:
 
        dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
             inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
-       if (pinned_page) {
-               put_page(pinned_page);
-               pinned_page = NULL;
-       }
        ceph_put_cap_refs(ci, got);
 
        if (direct_lock)
@@ -1753,8 +1750,7 @@ retry_snap:
        else
                want = CEPH_CAP_FILE_BUFFER;
        got = 0;
-       err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
-                           &got, NULL);
+       err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
        if (err < 0)
                goto out;
 
@@ -2083,7 +2079,7 @@ static long ceph_fallocate(struct file *file, int mode,
        else
                want = CEPH_CAP_FILE_BUFFER;
 
-       ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
+       ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got);
        if (ret < 0)
                goto unlock;
 
@@ -2121,7 +2117,7 @@ static int get_rd_wr_caps(struct file *src_filp, int *src_got,
 
 retry_caps:
        ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
-                           dst_endoff, dst_got, NULL);
+                           dst_endoff, dst_got);
        if (ret < 0)
                return ret;
 
@@ -2143,7 +2139,7 @@ retry_caps:
                        return ret;
                }
                ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
-                                   CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
+                                   CEPH_CAP_FILE_SHARED, -1, src_got);
                if (ret < 0)
                        return ret;
                /*... drop src_ci caps too, and retry */
index 689e3ff..e1c63ad 100644 (file)
@@ -56,6 +56,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
 {
        struct inode *inode;
 
+       if (ceph_vino_is_reserved(vino))
+               return ERR_PTR(-EREMOTEIO);
+
        inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
                             ceph_set_ino_cb, &vino);
        if (!inode)
@@ -99,14 +102,15 @@ struct inode *ceph_get_snapdir(struct inode *parent)
        inode->i_mtime = parent->i_mtime;
        inode->i_ctime = parent->i_ctime;
        inode->i_atime = parent->i_atime;
-       inode->i_op = &ceph_snapdir_iops;
-       inode->i_fop = &ceph_snapdir_fops;
-       ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
        ci->i_rbytes = 0;
        ci->i_btime = ceph_inode(parent)->i_btime;
 
-       if (inode->i_state & I_NEW)
+       if (inode->i_state & I_NEW) {
+               inode->i_op = &ceph_snapdir_iops;
+               inode->i_fop = &ceph_snapdir_fops;
+               ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
                unlock_new_inode(inode);
+       }
 
        return inode;
 }
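Moving the op-table setup under the I_NEW check fixes re-initialization of a snapdir inode that was already live in the inode cache: only a freshly allocated inode (I_NEW set) may have its ops and pinned caps set up, and unlock_new_inode() must then clear I_NEW. The generic pattern, for reference (test_cb, set_cb and my_iops are placeholders):

        inode = iget5_locked(sb, hashval, test_cb, set_cb, data);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        if (inode->i_state & I_NEW) {
                /* one-time setup of a freshly allocated inode */
                inode->i_op = &my_iops;
                unlock_new_inode(inode);        /* clears I_NEW, wakes waiters */
        }
        return inode;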
@@ -628,10 +632,11 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int queue_trunc = 0;
+       loff_t isize = i_size_read(inode);
 
        if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
-           (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
-               dout("size %lld -> %llu\n", inode->i_size, size);
+           (truncate_seq == ci->i_truncate_seq && size > isize)) {
+               dout("size %lld -> %llu\n", isize, size);
                if (size > 0 && S_ISDIR(inode->i_mode)) {
                        pr_err("fill_file_size non-zero size for directory\n");
                        size = 0;
@@ -925,6 +930,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
                        ci->i_rfiles = le64_to_cpu(info->rfiles);
                        ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
                        ci->i_dir_pin = iinfo->dir_pin;
+                       ci->i_rsnaps = iinfo->rsnaps;
                        ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
                }
        }
@@ -1818,7 +1824,7 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
        bool ret;
 
        spin_lock(&ci->i_ceph_lock);
-       dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+       dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
        i_size_write(inode, size);
        inode->i_blocks = calc_inode_blocks(size);
 
@@ -1894,6 +1900,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
        orig_gen = ci->i_rdcache_gen;
        spin_unlock(&ci->i_ceph_lock);
 
+       ceph_fscache_invalidate(inode);
        if (invalidate_inode_pages2(inode->i_mapping) < 0) {
                pr_err("invalidate_pages %p fails\n", inode);
        }
@@ -2124,20 +2131,19 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
                }
        }
        if (ia_valid & ATTR_SIZE) {
-               dout("setattr %p size %lld -> %lld\n", inode,
-                    inode->i_size, attr->ia_size);
-               if ((issued & CEPH_CAP_FILE_EXCL) &&
-                   attr->ia_size > inode->i_size) {
+               loff_t isize = i_size_read(inode);
+
+               dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
+               if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) {
                        i_size_write(inode, attr->ia_size);
                        inode->i_blocks = calc_inode_blocks(attr->ia_size);
                        ci->i_reported_size = attr->ia_size;
                        dirtied |= CEPH_CAP_FILE_EXCL;
                        ia_valid |= ATTR_MTIME;
                } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-                          attr->ia_size != inode->i_size) {
+                          attr->ia_size != isize) {
                        req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
-                       req->r_args.setattr.old_size =
-                               cpu_to_le64(inode->i_size);
+                       req->r_args.setattr.old_size = cpu_to_le64(isize);
                        mask |= CEPH_SETATTR_SIZE;
                        release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
                                   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
@@ -2247,7 +2253,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                return err;
 
        if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size > max(inode->i_size, fsc->max_file_size))
+           attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
                return -EFBIG;
 
        if ((attr->ia_valid & ATTR_SIZE) &&
index 97602ea..c456509 100644 (file)
@@ -118,7 +118,7 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
 }
 
 /**
- * ceph_end_io_direct - declare the file is being used for direct i/o
+ * ceph_start_io_direct - declare the file is being used for direct i/o
  * @inode: file inode
  *
  * Declare that a direct I/O operation is about to start, and ensure
index d87bd85..e5af591 100644 (file)
@@ -176,6 +176,13 @@ static int parse_reply_info_in(void **p, void *end,
                        memset(&info->snap_btime, 0, sizeof(info->snap_btime));
                }
 
+               /* snapshot count, remains zero for v<=3 */
+               if (struct_v >= 4) {
+                       ceph_decode_64_safe(p, end, info->rsnaps, bad);
+               } else {
+                       info->rsnaps = 0;
+               }
+
                *p = end;
        } else {
                if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
@@ -214,7 +221,7 @@ static int parse_reply_info_in(void **p, void *end,
                }
 
                info->dir_pin = -ENODATA;
-               /* info->snap_btime remains zero */
+               /* info->snap_btime and info->rsnaps remain zero */
        }
        return 0;
 bad:
@@ -433,6 +440,13 @@ static int ceph_parse_deleg_inos(void **p, void *end,
 
                ceph_decode_64_safe(p, end, start, bad);
                ceph_decode_64_safe(p, end, len, bad);
+
+               /* Don't accept a delegation of system inodes */
+               if (start < CEPH_INO_SYSTEM_BASE) {
+                       pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
+                                       start, len);
+                       continue;
+               }
                while (len--) {
                        int err = xa_insert(&s->s_delegated_inos, ino = start++,
                                            DELEGATED_INO_AVAILABLE,
@@ -3306,7 +3320,7 @@ out_err:
        /* kick calling process */
        complete_request(mdsc, req);
 
-       ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency,
+       ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,
                                     req->r_end_latency, err);
 out:
        ceph_mdsc_put_request(req);
@@ -3780,7 +3794,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
                rec.v1.cap_id = cpu_to_le64(cap->cap_id);
                rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
                rec.v1.issued = cpu_to_le32(cap->issued);
-               rec.v1.size = cpu_to_le64(inode->i_size);
+               rec.v1.size = cpu_to_le64(i_size_read(inode));
                ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
                ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
                rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
index eaa7c54..15c11a0 100644 (file)
@@ -88,6 +88,7 @@ struct ceph_mds_reply_info_in {
        s32 dir_pin;
        struct ceph_timespec btime;
        struct ceph_timespec snap_btime;
+       u64 rsnaps;
        u64 change_attr;
 };
 
index 5ec94bd..28b6b42 100644 (file)
@@ -17,6 +17,9 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
        struct ceph_metric_write_latency *write;
        struct ceph_metric_metadata_latency *meta;
        struct ceph_metric_dlease *dlease;
+       struct ceph_opened_files *files;
+       struct ceph_pinned_icaps *icaps;
+       struct ceph_opened_inodes *inodes;
        struct ceph_client_metric *m = &mdsc->metric;
        u64 nr_caps = atomic64_read(&m->total_caps);
        struct ceph_msg *msg;
@@ -26,7 +29,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
        s32 len;
 
        len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
-             + sizeof(*meta) + sizeof(*dlease);
+             + sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
+             + sizeof(*icaps) + sizeof(*inodes);
 
        msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
        if (!msg) {
@@ -95,6 +99,38 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
        dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries));
        items++;
 
+       sum = percpu_counter_sum(&m->total_inodes);
+
+       /* encode the opened files metric */
+       files = (struct ceph_opened_files *)(dlease + 1);
+       files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES);
+       files->ver = 1;
+       files->compat = 1;
+       files->data_len = cpu_to_le32(sizeof(*files) - 10);
+       files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files));
+       files->total = cpu_to_le64(sum);
+       items++;
+
+       /* encode the pinned icaps metric */
+       icaps = (struct ceph_pinned_icaps *)(files + 1);
+       icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS);
+       icaps->ver = 1;
+       icaps->compat = 1;
+       icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10);
+       icaps->pinned_icaps = cpu_to_le64(nr_caps);
+       icaps->total = cpu_to_le64(sum);
+       items++;
+
+       /* encode the opened inodes metric */
+       inodes = (struct ceph_opened_inodes *)(icaps + 1);
+       inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES);
+       inodes->ver = 1;
+       inodes->compat = 1;
+       inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10);
+       inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes));
+       inodes->total = cpu_to_le64(sum);
+       items++;
+
        put_unaligned_le32(items, &head->num);
        msg->front.iov_len = len;
        msg->hdr.version = cpu_to_le16(1);
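Each metric record on the wire starts with a fixed 10-byte header (a __le32 type, __u8 ver, __u8 compat, and the __le32 data_len itself), which is where the recurring sizeof(*x) - 10 comes from: data_len covers only the payload, here two __le64 fields. An illustrative compile-time check of that layout assumption (not part of the patch):

        BUILD_BUG_ON(sizeof(struct ceph_opened_files)  - 10 != 2 * sizeof(__le64));
        BUILD_BUG_ON(sizeof(struct ceph_pinned_icaps)  - 10 != 2 * sizeof(__le64));
        BUILD_BUG_ON(sizeof(struct ceph_opened_inodes) - 10 != 2 * sizeof(__le64));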
@@ -183,21 +219,21 @@ int ceph_metric_init(struct ceph_client_metric *m)
        if (ret)
                goto err_i_caps_mis;
 
-       spin_lock_init(&m->read_latency_lock);
+       spin_lock_init(&m->read_metric_lock);
        m->read_latency_sq_sum = 0;
        m->read_latency_min = KTIME_MAX;
        m->read_latency_max = 0;
        m->total_reads = 0;
        m->read_latency_sum = 0;
 
-       spin_lock_init(&m->write_latency_lock);
+       spin_lock_init(&m->write_metric_lock);
        m->write_latency_sq_sum = 0;
        m->write_latency_min = KTIME_MAX;
        m->write_latency_max = 0;
        m->total_writes = 0;
        m->write_latency_sum = 0;
 
-       spin_lock_init(&m->metadata_latency_lock);
+       spin_lock_init(&m->metadata_metric_lock);
        m->metadata_latency_sq_sum = 0;
        m->metadata_latency_min = KTIME_MAX;
        m->metadata_latency_max = 0;
@@ -274,7 +310,7 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump,
        *sq_sump += sq;
 }
 
-void ceph_update_read_latency(struct ceph_client_metric *m,
+void ceph_update_read_metrics(struct ceph_client_metric *m,
                              ktime_t r_start, ktime_t r_end,
                              int rc)
 {
@@ -283,14 +319,14 @@ void ceph_update_read_latency(struct ceph_client_metric *m,
        if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT))
                return;
 
-       spin_lock(&m->read_latency_lock);
+       spin_lock(&m->read_metric_lock);
        __update_latency(&m->total_reads, &m->read_latency_sum,
                         &m->read_latency_min, &m->read_latency_max,
                         &m->read_latency_sq_sum, lat);
-       spin_unlock(&m->read_latency_lock);
+       spin_unlock(&m->read_metric_lock);
 }
 
-void ceph_update_write_latency(struct ceph_client_metric *m,
+void ceph_update_write_metrics(struct ceph_client_metric *m,
                               ktime_t r_start, ktime_t r_end,
                               int rc)
 {
@@ -299,14 +335,14 @@ void ceph_update_write_latency(struct ceph_client_metric *m,
        if (unlikely(rc && rc != -ETIMEDOUT))
                return;
 
-       spin_lock(&m->write_latency_lock);
+       spin_lock(&m->write_metric_lock);
        __update_latency(&m->total_writes, &m->write_latency_sum,
                         &m->write_latency_min, &m->write_latency_max,
                         &m->write_latency_sq_sum, lat);
-       spin_unlock(&m->write_latency_lock);
+       spin_unlock(&m->write_metric_lock);
 }
 
-void ceph_update_metadata_latency(struct ceph_client_metric *m,
+void ceph_update_metadata_metrics(struct ceph_client_metric *m,
                                  ktime_t r_start, ktime_t r_end,
                                  int rc)
 {
@@ -315,9 +351,9 @@ void ceph_update_metadata_latency(struct ceph_client_metric *m,
        if (unlikely(rc && rc != -ENOENT))
                return;
 
-       spin_lock(&m->metadata_latency_lock);
+       spin_lock(&m->metadata_metric_lock);
        __update_latency(&m->total_metadatas, &m->metadata_latency_sum,
                         &m->metadata_latency_min, &m->metadata_latency_max,
                         &m->metadata_latency_sq_sum, lat);
-       spin_unlock(&m->metadata_latency_lock);
+       spin_unlock(&m->metadata_metric_lock);
 }
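The renamed locks guard the same running statistics as before. For reference, __update_latency() (its tail is visible above) maintains them with Welford's online algorithm, which lets metric_show() report a sample standard deviation without storing individual samples:

        /*
         * For each sample x with running count n and latency sum s:
         *   new_avg  = s / n
         *   sq_sum  += (x - old_avg) * (x - new_avg)
         * and the reported stdev is sqrt(sq_sum / (n - 1)).
         */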
index af6038f..e984eb2 100644 (file)
@@ -14,8 +14,11 @@ enum ceph_metric_type {
        CLIENT_METRIC_TYPE_WRITE_LATENCY,
        CLIENT_METRIC_TYPE_METADATA_LATENCY,
        CLIENT_METRIC_TYPE_DENTRY_LEASE,
+       CLIENT_METRIC_TYPE_OPENED_FILES,
+       CLIENT_METRIC_TYPE_PINNED_ICAPS,
+       CLIENT_METRIC_TYPE_OPENED_INODES,
 
-       CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+       CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES,
 };
 
 /*
@@ -28,6 +31,9 @@ enum ceph_metric_type {
        CLIENT_METRIC_TYPE_WRITE_LATENCY,       \
        CLIENT_METRIC_TYPE_METADATA_LATENCY,    \
        CLIENT_METRIC_TYPE_DENTRY_LEASE,        \
+       CLIENT_METRIC_TYPE_OPENED_FILES,        \
+       CLIENT_METRIC_TYPE_PINNED_ICAPS,        \
+       CLIENT_METRIC_TYPE_OPENED_INODES,       \
                                                \
        CLIENT_METRIC_TYPE_MAX,                 \
 }
@@ -94,6 +100,42 @@ struct ceph_metric_dlease {
        __le64 total;
 } __packed;
 
+/* metric opened files header */
+struct ceph_opened_files {
+       __le32 type;     /* ceph metric type */
+
+       __u8  ver;
+       __u8  compat;
+
+       __le32 data_len; /* payload length: sizeof(opened_files) + sizeof(total) */
+       __le64 opened_files;
+       __le64 total;
+} __packed;
+
+/* metric pinned i_caps header */
+struct ceph_pinned_icaps {
+       __le32 type;     /* ceph metric type */
+
+       __u8  ver;
+       __u8  compat;
+
+       __le32 data_len; /* payload length: sizeof(pinned_icaps) + sizeof(total) */
+       __le64 pinned_icaps;
+       __le64 total;
+} __packed;
+
+/* metric opened inodes header */
+struct ceph_opened_inodes {
+       __le32 type;     /* ceph metric type */
+
+       __u8  ver;
+       __u8  compat;
+
+       __le32 data_len; /* payload length: sizeof(opened_inodes) + sizeof(total) */
+       __le64 opened_inodes;
+       __le64 total;
+} __packed;
+
 struct ceph_metric_head {
        __le32 num;     /* the number of metrics that will be sent */
 } __packed;
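
For illustration: the three new record types share one wire layout: a type code, version/compat bytes, a payload length, and two 64-bit counters, little-endian with no padding. A user-space sketch of the opened-files record, assuming a little-endian host and using plain fixed-width types in place of __le32/__le64:

#include <stdint.h>

/* Sketch of the new opened-files record; field names mirror the header
 * above, with __le32/__le64 as plain fixed-width fields (little-endian
 * host assumed). */
struct opened_files_rec {
        uint32_t type;          /* CLIENT_METRIC_TYPE_OPENED_FILES */
        uint8_t  ver;
        uint8_t  compat;
        uint32_t data_len;      /* bytes following this field */
        uint64_t opened_files;
        uint64_t total;
} __attribute__((packed));

/* Without the packed attribute the compiler would insert padding after
 * 'compat' and the on-wire offsets would no longer line up. */
_Static_assert(sizeof(struct opened_files_rec) == 4 + 1 + 1 + 4 + 8 + 8,
               "record must have no padding");
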
@@ -108,21 +150,21 @@ struct ceph_client_metric {
        struct percpu_counter i_caps_hit;
        struct percpu_counter i_caps_mis;
 
-       spinlock_t read_latency_lock;
+       spinlock_t read_metric_lock;
        u64 total_reads;
        ktime_t read_latency_sum;
        ktime_t read_latency_sq_sum;
        ktime_t read_latency_min;
        ktime_t read_latency_max;
 
-       spinlock_t write_latency_lock;
+       spinlock_t write_metric_lock;
        u64 total_writes;
        ktime_t write_latency_sum;
        ktime_t write_latency_sq_sum;
        ktime_t write_latency_min;
        ktime_t write_latency_max;
 
-       spinlock_t metadata_latency_lock;
+       spinlock_t metadata_metric_lock;
        u64 total_metadatas;
        ktime_t metadata_latency_sum;
        ktime_t metadata_latency_sq_sum;
@@ -162,13 +204,13 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m)
        percpu_counter_inc(&m->i_caps_mis);
 }
 
-extern void ceph_update_read_latency(struct ceph_client_metric *m,
+extern void ceph_update_read_metrics(struct ceph_client_metric *m,
                                     ktime_t r_start, ktime_t r_end,
                                     int rc);
-extern void ceph_update_write_latency(struct ceph_client_metric *m,
+extern void ceph_update_write_metrics(struct ceph_client_metric *m,
                                      ktime_t r_start, ktime_t r_end,
                                      int rc);
-extern void ceph_update_metadata_latency(struct ceph_client_metric *m,
+extern void ceph_update_metadata_metrics(struct ceph_client_metric *m,
                                         ktime_t r_start, ktime_t r_end,
                                         int rc);
 #endif /* _FS_CEPH_MDS_METRIC_H */
index 0728b01..4ce1805 100644 (file)
@@ -605,7 +605,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
        struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 
        BUG_ON(capsnap->writing);
-       capsnap->size = inode->i_size;
+       capsnap->size = i_size_read(inode);
        capsnap->mtime = inode->i_mtime;
        capsnap->atime = inode->i_atime;
        capsnap->ctime = inode->i_ctime;
index c48bb30..db80d89 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/ceph/libceph.h>
 
 #ifdef CONFIG_CEPH_FSCACHE
+#define FSCACHE_USE_NEW_IO_API
 #include <linux/fscache.h>
 #endif
 
@@ -333,7 +334,7 @@ struct ceph_inode_info {
 
        /* for dirs */
        struct timespec64 i_rctime;
-       u64 i_rbytes, i_rfiles, i_rsubdirs;
+       u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps;
        u64 i_files, i_subdirs;
 
        /* quotas */
@@ -427,7 +428,6 @@ struct ceph_inode_info {
 
 #ifdef CONFIG_CEPH_FSCACHE
        struct fscache_cookie *fscache;
-       u32 i_fscache_gen;
 #endif
        errseq_t i_meta_err;
 
@@ -529,10 +529,34 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
                ci->i_vino.snap == pvino->snap;
 }
 
+/*
+ * The MDS reserves a set of inodes for its own usage. These should never
+ * be accessible by clients, and so the MDS has no reason to ever hand these
+ * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE.
+ *
+ * These come from src/mds/mdstypes.h in the ceph sources.
+ */
+#define CEPH_MAX_MDS           0x100
+#define CEPH_NUM_STRAY         10
+#define CEPH_MDS_INO_MDSDIR_OFFSET     (1 * CEPH_MAX_MDS)
+#define CEPH_INO_SYSTEM_BASE           ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY))
+
+static inline bool ceph_vino_is_reserved(const struct ceph_vino vino)
+{
+       if (vino.ino < CEPH_INO_SYSTEM_BASE &&
+           vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) {
+               WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino);
+               return true;
+       }
+       return false;
+}
 
 static inline struct inode *ceph_find_inode(struct super_block *sb,
                                            struct ceph_vino vino)
 {
+       if (ceph_vino_is_reserved(vino))
+               return NULL;
+
        /*
         * NB: The hashval will be run through the fs/inode.c hash function
         * anyway, so there is no need to squash the inode number down to
@@ -1156,7 +1180,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
                                      int mds, int drop, int unless);
 
 extern int ceph_get_caps(struct file *filp, int need, int want,
-                        loff_t endoff, int *got, struct page **pinned_page);
+                        loff_t endoff, int *got);
 extern int ceph_try_get_caps(struct inode *inode,
                             int need, int want, bool nonblock, int *got);
 
@@ -1193,7 +1217,7 @@ extern const struct dentry_operations ceph_dentry_ops;
 
 extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
-extern int ceph_handle_snapdir(struct ceph_mds_request *req,
+extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
                               struct dentry *dentry, int err);
 extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
                                         struct dentry *dentry, int err);
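
The reserved-inode window added above is easy to compute by hand: with CEPH_MAX_MDS = 0x100 and CEPH_NUM_STRAY = 10, it comes out to [0x100, 0x1000). A standalone sketch of the same predicate, minus the kernel's rate-limited warning:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CEPH_MAX_MDS               0x100
#define CEPH_NUM_STRAY             10
#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS)
#define CEPH_INO_SYSTEM_BASE       ((6 * CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY))

static bool ino_is_reserved(uint64_t ino)
{
        return ino >= CEPH_MDS_INO_MDSDIR_OFFSET && ino < CEPH_INO_SYSTEM_BASE;
}

int main(void)
{
        printf("reserved range: [0x%x, 0x%x)\n",
               CEPH_MDS_INO_MDSDIR_OFFSET, CEPH_INO_SYSTEM_BASE);
        printf("0xff   -> %d\n", ino_is_reserved(0xff));   /* 0: below the range */
        printf("0x200  -> %d\n", ino_is_reserved(0x200));  /* 1: reserved */
        printf("0x1000 -> %d\n", ino_is_reserved(0x1000)); /* 0: first ordinary ino */
        return 0;
}
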
index 02f59bc..1242db8 100644 (file)
@@ -233,6 +233,12 @@ static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
        return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs);
 }
 
+static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val,
+                                         size_t size)
+{
+       return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps);
+}
+
 static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
                                        size_t size)
 {
@@ -384,6 +390,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
        XATTR_RSTAT_FIELD(dir, rentries),
        XATTR_RSTAT_FIELD(dir, rfiles),
        XATTR_RSTAT_FIELD(dir, rsubdirs),
+       XATTR_RSTAT_FIELD(dir, rsnaps),
        XATTR_RSTAT_FIELD(dir, rbytes),
        XATTR_RSTAT_FIELD(dir, rctime),
        {
index 9a3aed2..c039536 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset:8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * configfs_internal.h - Internal stuff for configfs
  *
  * Based on sysfs:
index b6098e0..ac5e0c0 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.c - Operations for configfs directories.
  *
  * Based on sysfs:
index da8351d..e26060d 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.c - operations for regular (text) files.
  *
  * Based on sysfs:
index 42c348b..eb5ec3e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.c - basic inode and dentry operations.
  *
  * Based on sysfs:
index 704a435..254170a 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * item.c - library routines for handling generic config items
  *
  * Based on kobject:
index 0c6e8cf..c2d8200 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * mount.c - operations for initializing and mounting configfs.
  *
  * Based on sysfs:
index 77c8543..0623c3e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * symlink.c - operations for configfs symlinks.
  *
  * Based on sysfs:
index b3d27fd..6921624 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -525,7 +525,7 @@ retry:
                dax_disassociate_entry(entry, mapping, false);
                xas_store(xas, NULL);   /* undo the PMD join */
                dax_wake_entry(xas, entry, true);
-               mapping->nrexceptional--;
+               mapping->nrpages -= PG_PMD_NR;
                entry = NULL;
                xas_set(xas, index);
        }
@@ -541,7 +541,7 @@ retry:
                dax_lock_entry(xas, entry);
                if (xas_error(xas))
                        goto out_unlock;
-               mapping->nrexceptional++;
+               mapping->nrpages += 1UL << order;
        }
 
 out_unlock:
@@ -661,7 +661,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
                goto out;
        dax_disassociate_entry(entry, mapping, trunc);
        xas_store(&xas, NULL);
-       mapping->nrexceptional--;
+       mapping->nrpages -= 1UL << dax_entry_order(entry);
        ret = 1;
 out:
        put_unlocked_entry(&xas, entry);
@@ -965,7 +965,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
        if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
                return -EIO;
 
-       if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+       if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
                return 0;
 
        trace_dax_writeback_range(inode, xas.xa_index, end_index);
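
The pattern in the dax.c hunks above is uniform: where the old code adjusted ->nrexceptional by one per entry, the new code adjusts ->nrpages by 1UL << order, so a PMD-sized entry is counted as a full complement of base pages. A toy sketch of the bookkeeping, assuming PMD_ORDER = 9 (x86-64 with 4 KiB pages):

#include <stdio.h>

#define PMD_ORDER 9     /* assumed: 2 MiB huge entry over 4 KiB base pages */

int main(void)
{
        unsigned long nrpages = 0;

        nrpages += 1UL << 0;            /* PTE-sized DAX entry: +1 */
        nrpages += 1UL << PMD_ORDER;    /* PMD-sized DAX entry: +512 */
        printf("nrpages = %lu\n", nrpages);     /* 513 */

        nrpages -= 1UL << PMD_ORDER;    /* same entry invalidated again */
        printf("nrpages = %lu\n", nrpages);     /* 1 */
        return 0;
}
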
index 943e523..345f806 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2004 Erez Zadok
@@ -350,7 +350,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * lower_offset_for_page
  *
  * Convert an eCryptfs page index into a lower byte offset
@@ -535,7 +535,7 @@ int ecryptfs_decrypt_page(struct page *page)
                rc = crypt_extent(crypt_stat, page, page,
                                  extent_offset, DECRYPT);
                if (rc) {
-                       printk(KERN_ERR "%s: Error encrypting extent; "
+                       printk(KERN_ERR "%s: Error decrypting extent; "
                               "rc = [%d]\n", __func__, rc);
                        goto out;
                }
@@ -627,9 +627,8 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
        }
 }
 
-/**
+/*
  * ecryptfs_compute_root_iv
- * @crypt_stats
  *
  * On error, sets the root IV to all 0's.
  */
@@ -1370,7 +1369,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
        return rc;
 }
 
-/**
+/*
  * ecryptfs_read_metadata
  *
  * Common entry point for reading file metadata. From here, we could
@@ -1448,7 +1447,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * ecryptfs_encrypt_filename - encrypt filename
  *
  * CBC-encrypts the filename. We do not want to encrypt the same
@@ -1590,11 +1589,10 @@ out:
 
 struct kmem_cache *ecryptfs_key_tfm_cache;
 static struct list_head key_tfm_list;
-struct mutex key_tfm_list_mutex;
+DEFINE_MUTEX(key_tfm_list_mutex);
 
 int __init ecryptfs_init_crypto(void)
 {
-       mutex_init(&key_tfm_list_mutex);
        INIT_LIST_HEAD(&key_tfm_list);
        return 0;
 }
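
Converting key_tfm_list_mutex to DEFINE_MUTEX() means the lock is fully initialized at link time, so there is no window in which it could be taken before ecryptfs_init_crypto() runs. The same idea in a user-space sketch, with PTHREAD_MUTEX_INITIALIZER playing the role of DEFINE_MUTEX():

#include <pthread.h>
#include <stdio.h>

/* Statically initialized, like DEFINE_MUTEX(): usable from program
 * start, with no init call that callers could race against. */
static pthread_mutex_t key_tfm_list_mutex = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
        /* Safe even though no initialization function ever ran. */
        pthread_mutex_lock(&key_tfm_list_mutex);
        puts("locked");
        pthread_mutex_unlock(&key_tfm_list_mutex);
        return 0;
}
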
@@ -1877,10 +1875,11 @@ out:
 
 /**
  * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
- * @crypt_stat: The crypt_stat struct associated with the file anem to encode
+ * @encoded_name: The encrypted name
+ * @encoded_name_size: Length of the encrypted name
+ * @mount_crypt_stat: The crypt_stat struct associated with the file name to encode
  * @name: The plaintext name
- * @length: The length of the plaintext
- * @encoded_name: The encypted name
+ * @name_size: The length of the plaintext name
  *
  * Encrypts and encodes a filename into something that constitutes a
  * valid filename for a filesystem, with printable characters.
@@ -1992,7 +1991,7 @@ static bool is_dot_dotdot(const char *name, size_t name_size)
  * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
  * @plaintext_name: The plaintext name
  * @plaintext_name_size: The plaintext name size
- * @ecryptfs_dir_dentry: eCryptfs directory dentry
+ * @sb: Ecryptfs's super_block
  * @name: The filename in cipher text
  * @name_size: The cipher text name size
  *
index 1f65e99..cf6d0e8 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  * Functions only useful for debugging.
  *
@@ -9,7 +9,7 @@
 
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * ecryptfs_dump_auth_tok - debug function to print auth toks
  *
  * This function will print the contents of an ecryptfs authentication
index 44606f0..acaa082 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2003 Erez Zadok
index 495fb45..5f2b49e 100644 (file)
@@ -513,7 +513,7 @@ ecryptfs_dentry_to_lower_path(struct dentry *dentry)
 }
 
 #define ecryptfs_printk(type, fmt, arg...) \
-        __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+        __ecryptfs_printk(type "%s: " fmt, __func__, ## arg)
 __printf(1, 2)
 void __ecryptfs_printk(const char *fmt, ...);
 
index 5fb45d8..18d5b91 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2004 Erez Zadok
@@ -19,7 +19,7 @@
 #include <linux/fs_stack.h>
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * ecryptfs_read_update_atime
  *
  * generic_file_read updates the atime of upper layer inode.  But, it
index 0a1ab1d..16d50df 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2004 Erez Zadok
@@ -199,7 +199,7 @@ out_lock:
        return inode;
 }
 
-/**
+/*
  * ecryptfs_initialize_file
  *
  * Cause the file to be changed from a basic empty file to an ecryptfs
@@ -242,10 +242,8 @@ out:
        return rc;
 }
 
-/**
+/*
  * ecryptfs_create
- * @dir: The inode of the directory in which to create the file.
- * @dentry: The eCryptfs dentry
  * @mode: The mode of the new file.
  *
  * Creates a new file.
@@ -313,7 +311,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
        return 0;
 }
 
-/**
+/*
  * ecryptfs_lookup_interpose - Dentry interposition for a lookup
  */
 static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
@@ -873,6 +871,7 @@ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
 
 /**
  * ecryptfs_setattr
+ * @mnt_userns: user namespace of the target mount
  * @dentry: dentry handle to the inode to modify
  * @ia: Structure with flags of what to change and values
  *
index f6a17d2..3fe4196 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  * In-kernel key management code.  Includes functions to parse and
  * write authentication token-related packets with the underlying
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * request_key returned an error instead of a valid key address;
  * determine the type of error, make appropriate log entries, and
  * return an error code.
@@ -536,8 +536,9 @@ out:
 
 /**
  * ecryptfs_find_auth_tok_for_sig
+ * @auth_tok_key: key containing the authentication token
  * @auth_tok: Set to the matching auth_tok; NULL if not found
- * @crypt_stat: inode crypt_stat crypto context
+ * @mount_crypt_stat: inode crypt_stat crypto context
  * @sig: Sig of auth_tok to find
  *
  * For now, this function simply looks at the registered auth_tok's
@@ -576,7 +577,7 @@ ecryptfs_find_auth_tok_for_sig(
        return rc;
 }
 
-/**
+/*
  * write_tag_70_packet can gobble a lot of stack space. We stuff most
  * of the function's parameters in a kmalloc'd struct to help reduce
  * eCryptfs' overall stack usage.
@@ -604,7 +605,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
        struct shash_desc *hash_desc;
 };
 
-/**
+/*
  * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
  * @filename: NULL-terminated filename string
  *
@@ -873,7 +874,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
 };
 
 /**
- * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
+ * ecryptfs_parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
  * @filename: This function kmalloc's the memory for the filename
  * @filename_size: This function sets this to the amount of memory
  *                 kmalloc'd for the filename
@@ -1172,7 +1173,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
        rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code);
        if (rc) {
                ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n",
-                               cipher_code)
+                               cipher_code);
                goto out;
        }
        crypt_stat->flags |= ECRYPTFS_KEY_VALID;
index a7c903c..ae4cb4e 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2008 International Business Machines Corp.
@@ -108,6 +108,7 @@ void ecryptfs_destroy_kthread(void)
  * @lower_file: Result of dentry_open by root on lower dentry
  * @lower_dentry: Lower dentry for file to open
  * @lower_mnt: Lower vfsmount for file to open
+ * @cred: credential to use for this call
  *
  * This function gets a r/w file opened against the lower dentry.
  *
index cdf40a5..d66bbd2 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2003 Erez Zadok
@@ -24,7 +24,7 @@
 #include <linux/magic.h>
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * Module parameter that defines the ecryptfs_verbosity level.
  */
 int ecryptfs_verbosity = 0;
@@ -34,7 +34,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity,
                 "Initial verbosity level (0 or 1; defaults to "
                 "0, which is Quiet)");
 
-/**
+/*
  * Module parameter that defines the number of message buffer elements
  */
 unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS;
@@ -43,7 +43,7 @@ module_param(ecryptfs_message_buf_len, uint, 0);
 MODULE_PARM_DESC(ecryptfs_message_buf_len,
                 "Number of message buffer elements");
 
-/**
+/*
  * Module parameter that defines the maximum guaranteed amount of time to wait
  * for a response from ecryptfsd.  The actual sleep time will be, more than
  * likely, a small amount greater than this specified value, but only less if
@@ -57,7 +57,7 @@ MODULE_PARM_DESC(ecryptfs_message_wait_timeout,
                 "sleep while waiting for a message response from "
                 "userspace");
 
-/**
+/*
  * Module parameter that is an estimate of the maximum number of users
  * that will be concurrently using eCryptfs. Set this to the right
  * value to balance performance and memory use.
@@ -80,7 +80,7 @@ void __ecryptfs_printk(const char *fmt, ...)
        va_end(args);
 }
 
-/**
+/*
  * ecryptfs_init_lower_file
  * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with
  *                   the lower dentry and the lower mount set
@@ -221,7 +221,7 @@ static void ecryptfs_init_mount_crypt_stat(
 
 /**
  * ecryptfs_parse_options
- * @sb: The ecryptfs super block
+ * @sbi: The ecryptfs super block
  * @options: The options passed to the kernel
  * @check_ruid: set to 1 if device uid should be checked against the ruid
  *
@@ -466,10 +466,10 @@ out:
 struct kmem_cache *ecryptfs_sb_info_cache;
 static struct file_system_type ecryptfs_fs_type;
 
-/**
- * ecryptfs_get_sb
- * @fs_type
- * @flags
+/*
+ * ecryptfs_mount
+ * @fs_type: The filesystem type that the superblock should belong to
+ * @flags: The flags associated with the mount
  * @dev_name: The path to mount over
  * @raw_data: The options passed into the kernel
  */
@@ -492,6 +492,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
                goto out;
        }
 
+       if (!dev_name) {
+               rc = -EINVAL;
+               err = "Device name cannot be null";
+               goto out;
+       }
+
        rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid);
        if (rc) {
                err = "Error parsing options";
@@ -635,7 +641,7 @@ static struct file_system_type ecryptfs_fs_type = {
 };
 MODULE_ALIAS_FS("ecryptfs");
 
-/**
+/*
  * inode_info_init_once
  *
  * Initializes the ecryptfs_inode_info_cache when it is created
index c0dfd96..6318f35 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2004-2008 International Business Machines Corp.
 
 static LIST_HEAD(ecryptfs_msg_ctx_free_list);
 static LIST_HEAD(ecryptfs_msg_ctx_alloc_list);
-static struct mutex ecryptfs_msg_ctx_lists_mux;
+static DEFINE_MUTEX(ecryptfs_msg_ctx_lists_mux);
 
 static struct hlist_head *ecryptfs_daemon_hash;
-struct mutex ecryptfs_daemon_hash_mux;
+DEFINE_MUTEX(ecryptfs_daemon_hash_mux);
 static int ecryptfs_hash_bits;
 #define ecryptfs_current_euid_hash(uid) \
        hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits)
@@ -147,7 +147,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * ecryptfs_exorcise_daemon - Destroy the daemon struct
  *
  * Must be called ceremoniously while in possession of
@@ -181,7 +181,8 @@ out:
 }
 
 /**
- * ecryptfs_process_reponse
+ * ecryptfs_process_response
+ * @daemon: eCryptfs daemon object
  * @msg: The ecryptfs message received; the caller should sanity check
  *       msg->data_len and free the memory
  * @seq: The sequence number of the message; must match the sequence
@@ -250,6 +251,7 @@ out:
  * ecryptfs_send_message_locked
  * @data: The data to send
  * @data_len: The length of data
+ * @msg_type: Type of message
  * @msg_ctx: The message context allocated for the send
  *
  * Must be called with ecryptfs_daemon_hash_mux held.
@@ -359,7 +361,6 @@ int __init ecryptfs_init_messaging(void)
                       "too large, defaulting to [%d] users\n", __func__,
                       ecryptfs_number_of_users);
        }
-       mutex_init(&ecryptfs_daemon_hash_mux);
        mutex_lock(&ecryptfs_daemon_hash_mux);
        ecryptfs_hash_bits = 1;
        while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
@@ -383,7 +384,6 @@ int __init ecryptfs_init_messaging(void)
                rc = -ENOMEM;
                goto out;
        }
-       mutex_init(&ecryptfs_msg_ctx_lists_mux);
        mutex_lock(&ecryptfs_msg_ctx_lists_mux);
        ecryptfs_msg_counter = 0;
        for (i = 0; i < ecryptfs_message_buf_len; i++) {
index 742ece2..4e62c3c 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2008 International Business Machines Corp.
@@ -312,6 +312,7 @@ out_unlock_daemon:
 
 /**
  * ecryptfs_miscdev_response - miscdev's response to message previously sent to daemon
+ * @daemon: eCryptfs daemon object
  * @data: Bytes comprising struct ecryptfs_message
  * @data_size: sizeof(struct ecryptfs_message) + data len
  * @seq: Sequence number for miscdev response packet
index 2f333a4..392e721 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  * This is where eCryptfs coordinates the symmetric encryption and
  * decryption of the file data as it passes between the lower
@@ -22,7 +22,7 @@
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * ecryptfs_get_locked_page
  *
  * Get one page from cache or lower f/s, return error otherwise.
@@ -41,6 +41,7 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
 /**
  * ecryptfs_writepage
  * @page: Page that is locked before this call is made
+ * @wbc: Write-back control structure
  *
  * Returns zero on success; non-zero otherwise
  *
@@ -78,7 +79,7 @@ static void strip_xattr_flag(char *page_virt,
        }
 }
 
-/**
+/*
  *   Header Extent:
  *     Octets 0-7:        Unencrypted file size (big-endian)
  *     Octets 8-15:       eCryptfs special marker
@@ -229,7 +230,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * Called with lower inode mutex held.
  */
 static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
@@ -368,7 +369,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * ecryptfs_write_inode_size_to_header
  *
  * Writes the lower file size to the first 8 bytes of the header.
index 0438997..60bdcad 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2007 International Business Machines Corp.
@@ -230,6 +230,8 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
  * ecryptfs_read_lower_page_segment
  * @page_for_ecryptfs: The page into which data for eCryptfs will be
  *                     written
+ * @page_index: Page index in @page_for_ecryptfs from which to start
+ *             writing
  * @offset_in_page: Offset in @page_for_ecryptfs from which to start
  *                  writing
  * @size: The number of bytes to write into @page_for_ecryptfs
index 6b1853f..39116af 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2003 Erez Zadok
@@ -81,7 +81,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 
 /**
  * ecryptfs_statfs
- * @sb: The ecryptfs super block
+ * @dentry: The ecryptfs dentry
  * @buf: The struct kstatfs to fill in with stats
  *
  * Get the filesystem statistics. Currently, we let this pass right through
@@ -108,7 +108,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 /**
  * ecryptfs_evict_inode
- * @inode - The ecryptfs inode
+ * @inode: The ecryptfs inode
  *
  * Called by iput() when the inode reference count reached zero
  * and the inode is not hashed anywhere.  Used to clear anything
@@ -123,7 +123,7 @@ static void ecryptfs_evict_inode(struct inode *inode)
        iput(ecryptfs_inode_to_lower(inode));
 }
 
-/**
+/*
  * ecryptfs_show_options
  *
  * Prints the mount options for a given superblock.
index 73138ea..1e596e1 100644 (file)
@@ -657,6 +657,12 @@ static void ep_done_scan(struct eventpoll *ep,
         */
        list_splice(txlist, &ep->rdllist);
        __pm_relax(ep->ws);
+
+       if (!list_empty(&ep->rdllist)) {
+               if (waitqueue_active(&ep->wq))
+                       wake_up(&ep->wq);
+       }
+
        write_unlock_irq(&ep->lock);
 }
 
index f7e3304..860e884 100644 (file)
@@ -771,7 +771,7 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range)
        /*
         * FAT data is organized as clusters, trim at the granularity of cluster.
         *
-        * fstrim_range is in byte, convert vaules to cluster index.
+        * fstrim_range is in byte, convert values to cluster index.
         * Treat sectors before data region as all used, not to trim them.
         */
        ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT);
index 84c3810..ea7fc5c 100644 (file)
@@ -273,8 +273,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
        if (mapping) {
                truncate_inode_pages_final(mapping);
                if (!gfs2_withdrawn(sdp))
-                       GLOCK_BUG_ON(gl, mapping->nrpages ||
-                                    mapping->nrexceptional);
+                       GLOCK_BUG_ON(gl, !mapping_empty(mapping));
        }
        trace_gfs2_glock_put(gl);
        sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
index 302f451..d92c4af 100644 (file)
@@ -356,7 +356,8 @@ struct hpfs_dirent {
   u8 no_of_acls;                       /* number of ACL's (low 3 bits) */
   u8 ix;                               /* code page index (of filename), see
                                           struct code_page_data */
-  u8 namelen, name[1];                 /* file name */
+  u8 namelen;                          /* file name length */
+  u8 name[];                           /* file name */
   /* dnode_secno down;   btree down pointer, if present,
                          follows name on next word boundary, or maybe it
                          precedes next dirent, which is on a word boundary. */
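
The hpfs_dirent change replaces the old one-byte name[1] struct hack with a C99 flexible array member, so sizeof(struct hpfs_dirent) no longer counts a phantom first byte of the name. An illustrative user-space sketch of allocating such a structure:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative mirror of the change: a flexible array member carries
 * the variable-length name and contributes nothing to sizeof(). */
struct dirent_fam {
        uint8_t namelen;        /* file name length */
        uint8_t name[];         /* file name */
};

int main(void)
{
        const char *fname = "readme.txt";
        size_t len = strlen(fname);
        struct dirent_fam *d = malloc(sizeof(*d) + len);

        if (!d)
                return 1;
        d->namelen = (uint8_t)len;
        memcpy(d->name, fname, len);
        printf("header=%zu bytes, namelen=%u\n", sizeof(*d), (unsigned)d->namelen);
        free(d);
        return 0;
}
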
index 701c82c..a2a4233 100644 (file)
@@ -463,14 +463,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
        struct address_space *mapping = &inode->i_data;
        const pgoff_t start = lstart >> huge_page_shift(h);
        const pgoff_t end = lend >> huge_page_shift(h);
-       struct vm_area_struct pseudo_vma;
        struct pagevec pvec;
        pgoff_t next, index;
        int i, freed = 0;
        bool truncate_op = (lend == LLONG_MAX);
 
-       vma_init(&pseudo_vma, current->mm);
-       pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
        pagevec_init(&pvec);
        next = start;
        while (next < end) {
@@ -482,10 +479,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];
-                       u32 hash;
+                       u32 hash = 0;
 
                        index = page->index;
-                       hash = hugetlb_fault_mutex_hash(mapping, index);
                        if (!truncate_op) {
                                /*
                                 * Only need to hold the fault mutex in the
@@ -493,6 +489,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                                 * page faults.  Races are not possible in the
                                 * case of truncation.
                                 */
+                               hash = hugetlb_fault_mutex_hash(mapping, index);
                                mutex_lock(&hugetlb_fault_mutex_table[hash]);
                        }
 
@@ -1435,7 +1432,7 @@ static int get_hstate_idx(int page_size_log)
 
        if (!h)
                return -1;
-       return h - hstates;
+       return hstate_index(h);
 }
 
 /*
index 9e192be..c93500d 100644 (file)
@@ -529,7 +529,14 @@ void clear_inode(struct inode *inode)
         */
        xa_lock_irq(&inode->i_data.i_pages);
        BUG_ON(inode->i_data.nrpages);
-       BUG_ON(inode->i_data.nrexceptional);
+       /*
+        * Almost always, mapping_empty(&inode->i_data) here; but there are
+        * two known and long-standing ways in which nodes may get left behind
+        * (when deep radix-tree node allocation failed partway; or when THP
+        * collapse_file() failed). Until those two known cases are cleaned up,
+        * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
+        * nor even WARN_ON(!mapping_empty).
+        */
        xa_unlock_irq(&inode->i_data.i_pages);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
index 360f813..f46acbb 100644 (file)
@@ -251,7 +251,7 @@ struct io_rsrc_data {
 struct io_buffer {
        struct list_head list;
        __u64 addr;
-       __s32 len;
+       __u32 len;
        __u16 bid;
 };
 
@@ -456,6 +456,7 @@ struct io_ring_ctx {
        spinlock_t                      rsrc_ref_lock;
        struct io_rsrc_node             *rsrc_node;
        struct io_rsrc_node             *rsrc_backup_node;
+       struct io_mapped_ubuf           *dummy_ubuf;
 
        struct io_restriction           restrictions;
 
@@ -702,7 +703,8 @@ enum {
        REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
        REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 
-       REQ_F_FAIL_LINK_BIT,
+       /* first byte is taken by user flags, shift it to not overlap */
+       REQ_F_FAIL_LINK_BIT     = 8,
        REQ_F_INFLIGHT_BIT,
        REQ_F_CUR_POS_BIT,
        REQ_F_NOWAIT_BIT,
@@ -1157,6 +1159,12 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
                goto err;
        __hash_init(ctx->cancel_hash, 1U << hash_bits);
 
+       ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
+       if (!ctx->dummy_ubuf)
+               goto err;
+       /* set an invalid range, so io_import_fixed() fails when it hits it */
+       ctx->dummy_ubuf->ubuf = -1UL;
+
        if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
                            PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
                goto err;
@@ -1184,6 +1192,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
        return ctx;
 err:
+       kfree(ctx->dummy_ubuf);
        kfree(ctx->cancel_hash);
        kfree(ctx);
        return NULL;
@@ -3977,7 +3986,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
                        break;
 
                buf->addr = addr;
-               buf->len = pbuf->len;
+               buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                buf->bid = bid;
                addr += pbuf->len;
                bid++;
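
Two related hardening changes meet in this hunk: io_buffer.len becomes unsigned (__u32 rather than __s32), and io_add_buffers() clamps it to MAX_RW_COUNT, so an enormous user-supplied length can no longer wrap into a negative value downstream. A sketch of the clamp, assuming 4 KiB pages and MAX_RW_COUNT's kernel definition of INT_MAX & PAGE_MASK:

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK    (~4095UL)                          /* assumes 4 KiB pages */
#define MAX_RW_COUNT ((uint32_t)(INT32_MAX & PAGE_MASK))

static uint32_t clamp_buf_len(uint32_t requested)
{
        return requested < MAX_RW_COUNT ? requested : MAX_RW_COUNT;
}

int main(void)
{
        printf("%u\n", clamp_buf_len(4096));            /* 4096 */
        printf("%u\n", clamp_buf_len(0xffffffffu));     /* capped at MAX_RW_COUNT */
        return 0;
}
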
@@ -6503,14 +6512,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        req->work.creds = NULL;
 
        /* enforce forwards compatibility on users */
-       if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
-               req->flags = 0;
+       if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
                return -EINVAL;
-       }
-
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
-
        if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
                return -EACCES;
 
@@ -7539,6 +7544,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
                        io_ring_submit_lock(ctx, lock_ring);
                        spin_lock_irqsave(&ctx->completion_lock, flags);
                        io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
+                       ctx->cq_extra++;
                        io_commit_cqring(ctx);
                        spin_unlock_irqrestore(&ctx->completion_lock, flags);
                        io_cqring_ev_posted(ctx);
@@ -8111,11 +8117,13 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
        struct io_mapped_ubuf *imu = *slot;
        unsigned int i;
 
-       for (i = 0; i < imu->nr_bvecs; i++)
-               unpin_user_page(imu->bvec[i].bv_page);
-       if (imu->acct_pages)
-               io_unaccount_mem(ctx, imu->acct_pages);
-       kvfree(imu);
+       if (imu != ctx->dummy_ubuf) {
+               for (i = 0; i < imu->nr_bvecs; i++)
+                       unpin_user_page(imu->bvec[i].bv_page);
+               if (imu->acct_pages)
+                       io_unaccount_mem(ctx, imu->acct_pages);
+               kvfree(imu);
+       }
        *slot = NULL;
 }
 
@@ -8132,7 +8140,7 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
        for (i = 0; i < ctx->nr_user_bufs; i++)
                io_buffer_unmap(ctx, &ctx->user_bufs[i]);
        kfree(ctx->user_bufs);
-       kfree(ctx->buf_data);
+       io_rsrc_data_free(ctx->buf_data);
        ctx->user_bufs = NULL;
        ctx->buf_data = NULL;
        ctx->nr_user_bufs = 0;
@@ -8255,6 +8263,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
        size_t size;
        int ret, pret, nr_pages, i;
 
+       if (!iov->iov_base) {
+               *pimu = ctx->dummy_ubuf;
+               return 0;
+       }
+
        ubuf = (unsigned long) iov->iov_base;
        end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        start = ubuf >> PAGE_SHIFT;
@@ -8352,7 +8365,9 @@ static int io_buffer_validate(struct iovec *iov)
         * constraints here, we'll -EINVAL later when IO is
         * submitted if they are wrong.
         */
-       if (!iov->iov_base || !iov->iov_len)
+       if (!iov->iov_base)
+               return iov->iov_len ? -EFAULT : 0;
+       if (!iov->iov_len)
                return -EFAULT;
 
        /* arbitrary limit, but we need something */
@@ -8385,7 +8400,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                return -ENOMEM;
        ret = io_buffers_map_alloc(ctx, nr_args);
        if (ret) {
-               kfree(data);
+               io_rsrc_data_free(data);
                return ret;
        }
 
@@ -8402,6 +8417,10 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                ret = io_buffer_validate(&iov);
                if (ret)
                        break;
+               if (!iov.iov_base && tag) {
+                       ret = -EINVAL;
+                       break;
+               }
 
                ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
                                             &last_hpage);
@@ -8451,12 +8470,16 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
                err = io_buffer_validate(&iov);
                if (err)
                        break;
+               if (!iov.iov_base && tag) {
+                       err = -EINVAL;
+                       break;
+               }
                err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
                if (err)
                        break;
 
                i = array_index_nospec(offset, ctx->nr_user_bufs);
-               if (ctx->user_bufs[i]) {
+               if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
                        err = io_queue_rsrc_removal(ctx->buf_data, offset,
                                                    ctx->rsrc_node, ctx->user_bufs[i]);
                        if (unlikely(err)) {
@@ -8604,6 +8627,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
        if (ctx->hash_map)
                io_wq_put_hash(ctx->hash_map);
        kfree(ctx->cancel_hash);
+       kfree(ctx->dummy_ubuf);
        kfree(ctx);
 }
 
@@ -9607,7 +9631,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
        if (ret)
                goto err;
        /* always set a rsrc node */
-       io_rsrc_node_switch_start(ctx);
+       ret = io_rsrc_node_switch_start(ctx);
+       if (ret)
+               goto err;
        io_rsrc_node_switch(ctx, NULL);
 
        memset(&p->sq_off, 0, sizeof(p->sq_off));
@@ -10136,6 +10162,13 @@ static int __init io_uring_init(void)
        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
        BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 
+       BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
+                    sizeof(struct io_uring_rsrc_update));
+       BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
+                    sizeof(struct io_uring_rsrc_update2));
+       /* should fit into one byte */
+       BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
        BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
index 0129e6b..f2cd203 100644 (file)
@@ -1134,9 +1134,7 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 }
 
 void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
-               void (*merge_private)(struct iomap_ioend *ioend,
-                               struct iomap_ioend *next))
+iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
 {
        struct iomap_ioend *next;
 
@@ -1148,8 +1146,6 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
                        break;
                list_move_tail(&next->io_list, &ioend->io_list);
                ioend->io_size += next->io_size;
-               if (next->io_private && merge_private)
-                       merge_private(ioend, next);
        }
 }
 EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
@@ -1236,7 +1232,6 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
        ioend->io_inode = inode;
        ioend->io_size = 0;
        ioend->io_offset = offset;
-       ioend->io_private = NULL;
        ioend->io_bio = bio;
        return ioend;
 }
index 94ef92f..4880146 100644 (file)
@@ -767,6 +767,7 @@ repeat:
                        rs.cont_extent = isonum_733(rr->u.CE.extent);
                        rs.cont_offset = isonum_733(rr->u.CE.offset);
                        rs.cont_size = isonum_733(rr->u.CE.size);
+                       break;
                default:
                        break;
                }
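
The one-line isofs fix above adds the break that kept the continuation-extent case from falling through into default:. A minimal sketch of what such a fallthrough does (names are illustrative):

#include <stdio.h>

static void classify(int sig)
{
        switch (sig) {
        case 1:
                puts("continuation extent");
                break;          /* the line the patch adds */
        default:
                puts("unknown signature");
                break;
        }
}

int main(void)
{
        classify(1);    /* with the break: one line; without it: both run */
        classify(2);
        return 0;
}
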
index 5c42363..74b2a1d 100644 (file)
@@ -1808,6 +1808,9 @@ check_conflicting_open(struct file *filp, const long arg, int flags)
 
        if (flags & FL_LAYOUT)
                return 0;
+       if (flags & FL_DELEG)
+               /* We leave these checks to the caller */
+               return 0;
 
        if (arg == F_RDLCK)
                return inode_is_open_for_write(inode) ? -EAGAIN : 0;
index f7786e0..ed9d580 100644 (file)
@@ -137,12 +137,12 @@ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
                list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
                        if (!pnfs_layout_is_valid(lo))
                                continue;
-                       if (stateid != NULL &&
-                           !nfs4_stateid_match_other(stateid, &lo->plh_stateid))
+                       if (!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
                                continue;
-                       if (!nfs_sb_active(server->super))
-                               continue;
-                       inode = igrab(lo->plh_inode);
+                       if (nfs_sb_active(server->super))
+                               inode = igrab(lo->plh_inode);
+                       else
+                               inode = ERR_PTR(-EAGAIN);
                        rcu_read_unlock();
                        if (inode)
                                return inode;
@@ -176,9 +176,10 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
                                continue;
                        if (nfsi->layout != lo)
                                continue;
-                       if (!nfs_sb_active(server->super))
-                               continue;
-                       inode = igrab(lo->plh_inode);
+                       if (nfs_sb_active(server->super))
+                               inode = igrab(lo->plh_inode);
+                       else
+                               inode = ERR_PTR(-EAGAIN);
                        rcu_read_unlock();
                        if (inode)
                                return inode;
index ff5c4d0..cfeaadf 100644 (file)
@@ -476,7 +476,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
                        to->to_maxval = to->to_initval;
                to->to_exponential = 0;
                break;
-#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT
        case XPRT_TRANSPORT_UDP:
                if (retrans == NFS_UNSPEC_RETRANS)
                        to->to_retries = NFS_DEF_UDP_RETRANS;
@@ -487,7 +486,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
                to->to_maxval = NFS_MAX_UDP_TIMEOUT;
                to->to_exponential = 1;
                break;
-#endif
        default:
                BUG();
        }
@@ -698,9 +696,18 @@ static int nfs_init_server(struct nfs_server *server,
        /* Initialise the client representation from the mount data */
        server->flags = ctx->flags;
        server->options = ctx->options;
-       server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
-               NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
-               NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
+       server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
+
+       switch (clp->rpc_ops->version) {
+       case 2:
+               server->fattr_valid = NFS_ATTR_FATTR_V2;
+               break;
+       case 3:
+               server->fattr_valid = NFS_ATTR_FATTR_V3;
+               break;
+       default:
+               server->fattr_valid = NFS_ATTR_FATTR_V4;
+       }
 
        if (ctx->rsize)
                server->rsize = nfs_block_size(ctx->rsize, NULL);
@@ -794,6 +801,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
        server->maxfilesize = fsinfo->maxfilesize;
 
        server->time_delta = fsinfo->time_delta;
+       server->change_attr_type = fsinfo->change_attr_type;
 
        server->clone_blksize = fsinfo->clone_blksize;
        /* We're airborne. Set socket buffersize */
@@ -935,6 +943,8 @@ struct nfs_server *nfs_alloc_server(void)
                return NULL;
        }
 
+       server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
        ida_init(&server->openowner_id);
        ida_init(&server->lockowner_id);
        pnfs_init_server(server);
index 04bf806..e6ec6f0 100644 (file)
@@ -114,7 +114,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
        return ret;
 }
 /**
- * nfs_have_delegation - check if inode has a delegation, mark it
+ * nfs4_have_delegation - check if inode has a delegation, mark it
  * NFS_DELEGATION_REFERENCED if there is one.
  * @inode: inode to check
  * @flags: delegation types to check for
@@ -481,6 +481,22 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
        if (freeme == NULL)
                goto out;
 add_new:
+       /*
+        * If we didn't revalidate the change attribute before setting
+        * the delegation, then pre-emptively ask for a full attribute
+        * cache revalidation.
+        */
+       spin_lock(&inode->i_lock);
+       if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_CHANGE)
+               nfs_set_cache_invalid(inode,
+                       NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
+                       NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+                       NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
+                       NFS_INO_INVALID_OTHER | NFS_INO_INVALID_DATA |
+                       NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
+                       NFS_INO_INVALID_XATTR);
+       spin_unlock(&inode->i_lock);
+
        list_add_tail_rcu(&delegation->super_list, &server->delegations);
        rcu_assign_pointer(nfsi->delegation, delegation);
        delegation = NULL;
@@ -488,11 +504,6 @@ add_new:
        atomic_long_inc(&nfs_active_delegations);
 
        trace_nfs4_set_delegation(inode, type);
-
-       spin_lock(&inode->i_lock);
-       if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME))
-               NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED;
-       spin_unlock(&inode->i_lock);
 out:
        spin_unlock(&clp->cl_lock);
        if (delegation != NULL)
@@ -674,7 +685,7 @@ void nfs_inode_evict_delegation(struct inode *inode)
 }
 
 /**
- * nfs_inode_return_delegation - synchronously return a delegation
+ * nfs4_inode_return_delegation - synchronously return a delegation
  * @inode: inode to process
  *
  * This routine will always flush any dirty data to disk on the
@@ -697,7 +708,7 @@ int nfs4_inode_return_delegation(struct inode *inode)
 }
 
 /**
- * nfs_inode_return_delegation_on_close - asynchronously return a delegation
+ * nfs4_inode_return_delegation_on_close - asynchronously return a delegation
  * @inode: inode to process
  *
  * This routine is called on file close in order to determine if the
@@ -811,7 +822,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
 }
 
 /**
- * nfs_super_return_all_delegations - return delegations for one superblock
+ * nfs_server_return_all_delegations - return delegations for one superblock
  * @server: pointer to nfs_server to process
  *
  */
index 9b00a0b..c19b4fd 100644 (file)
@@ -84,8 +84,7 @@ int nfs4_inode_make_writeable(struct inode *inode);
 
 static inline int nfs_have_delegated_attributes(struct inode *inode)
 {
-       return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) &&
-               !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+       return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
 }
 
 #endif
index fc4f490..1a6d286 100644 (file)
@@ -866,6 +866,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
                        break;
                }
 
+               verf_arg = verf_res;
+
                status = nfs_readdir_page_filler(desc, entry, pages, pglen,
                                                 arrays, narrays);
        } while (!status && nfs_readdir_page_needs_filling(page));
@@ -927,7 +929,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
                        }
                        return res;
                }
-               memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf));
+               /*
+                * Set the cookie verifier if the page cache was empty
+                */
+               if (desc->page_index == 0)
+                       memcpy(nfsi->cookieverf, verf,
+                              sizeof(nfsi->cookieverf));
        }
        res = nfs_readdir_search_array(desc);
        if (res == 0) {
@@ -974,10 +981,10 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
 /*
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
-static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
+static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
+                          const __be32 *verf)
 {
        struct file     *file = desc->file;
-       struct nfs_inode *nfsi = NFS_I(file_inode(file));
        struct nfs_cache_array *array;
        unsigned int i = 0;
 
@@ -991,7 +998,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
                        desc->eof = true;
                        break;
                }
-               memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf));
+               memcpy(desc->verf, verf, sizeof(desc->verf));
                if (i < (array->size-1))
                        desc->dir_cookie = array->array[i+1].cookie;
                else
@@ -1048,7 +1055,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 
        for (i = 0; !desc->eof && i < sz && arrays[i]; i++) {
                desc->page = arrays[i];
-               nfs_do_filldir(desc);
+               nfs_do_filldir(desc, verf);
        }
        desc->page = NULL;
 
@@ -1069,6 +1076,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 {
        struct dentry   *dentry = file_dentry(file);
        struct inode    *inode = d_inode(dentry);
+       struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_open_dir_context *dir_ctx = file->private_data;
        struct nfs_readdir_descriptor *desc;
        int res;
@@ -1122,7 +1130,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
                        break;
                }
                if (res == -ETOOSMALL && desc->plus) {
-                       clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+                       clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
                        nfs_zap_caches(inode);
                        desc->page_index = 0;
                        desc->plus = false;
@@ -1132,7 +1140,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
                if (res < 0)
                        break;
 
-               nfs_do_filldir(desc);
+               nfs_do_filldir(desc, nfsi->cookieverf);
                nfs_readdir_page_unlock_and_put_cached(desc);
        } while (!desc->eof);
 
@@ -1703,7 +1711,7 @@ static void nfs_drop_nlink(struct inode *inode)
        NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
        nfs_set_cache_invalid(
                inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
-                              NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED);
+                              NFS_INO_INVALID_NLINK);
        spin_unlock(&inode->i_lock);
 }
 
@@ -2940,7 +2948,7 @@ static int nfs_execute_ok(struct inode *inode, int mask)
 
        if (S_ISDIR(inode->i_mode))
                return 0;
-       if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_OTHER)) {
+       if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) {
                if (mask & MAY_NOT_BLOCK)
                        return -ECHILD;
                ret = __nfs_revalidate_inode(server, inode);
@@ -2998,16 +3006,10 @@ out_notsup:
        if (mask & MAY_NOT_BLOCK)
                return -ECHILD;
 
-       res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+       res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE |
+                                                 NFS_INO_INVALID_OTHER);
        if (res == 0)
                res = generic_permission(&init_user_ns, inode, mask);
        goto out;
 }
 EXPORT_SYMBOL_GPL(nfs_permission);
-
-/*
- * Local variables:
- *  version-control: t
- *  kept-new-versions: 5
- * End:
- */
index f2b34cf..37a1a88 100644 (file)
@@ -169,19 +169,8 @@ out:
 
 static u64 nfs_fetch_iversion(struct inode *inode)
 {
-       struct nfs_server *server = NFS_SERVER(inode);
-
-       /* Is this the right call?: */
-       nfs_revalidate_inode(server, inode);
-       /*
-        * Also, note we're ignoring any returned error.  That seems to be
-        * the practice for cache consistency information elsewhere in
-        * the server, but I'm not sure why.
-        */
-       if (server->nfs_client->rpc_ops->version >= 4)
-               return inode_peek_iversion_raw(inode);
-       else
-               return time_to_chattr(&inode->i_ctime);
+       nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
+       return inode_peek_iversion_raw(inode);
 }
 
 const struct export_operations nfs_export_ops = {
index 16ad505..1fef107 100644 (file)
@@ -105,7 +105,7 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
 
        if (filp->f_flags & O_DIRECT)
                goto force_reval;
-       if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE))
+       if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_SIZE))
                goto force_reval;
        return 0;
 force_reval:
index 872112b..d383de0 100644 (file)
@@ -106,7 +106,7 @@ static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
        if (unlikely(!p))
                return -ENOBUFS;
        fh->size = be32_to_cpup(p++);
-       if (fh->size > sizeof(struct nfs_fh)) {
+       if (fh->size > NFS_MAXFHSIZE) {
                printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
                       fh->size);
                return -EOVERFLOW;
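
The flexfiles fix swaps a bounds check against sizeof(struct nfs_fh), which also counts the embedded size field, for one against the data array's actual capacity, NFS_MAXFHSIZE. A user-space sketch of why the old check let slightly-too-big handles through, assuming the kernel's layout of a 16-bit size field plus a 128-byte data array:

#include <stdint.h>
#include <stdio.h>

#define NFS_MAXFHSIZE 128       /* assumed capacity of the data array */

struct nfs_fh {
        uint16_t size;
        uint8_t  data[NFS_MAXFHSIZE];
};

int main(void)
{
        unsigned int wire_size = 130;   /* hostile length from the wire */

        /* Old check: sizeof(struct nfs_fh) is 130 here, so 130 is not
         * "too big" and the oversized handle slips through, overrunning
         * fh.data by two bytes on the later copy. */
        printf("old check rejects: %d\n", wire_size > sizeof(struct nfs_fh));

        /* New check compares against the array's real capacity. */
        printf("new check rejects: %d\n", wire_size > NFS_MAXFHSIZE);
        return 0;
}
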
index a06d213..d95c9a3 100644 (file)
@@ -283,20 +283,40 @@ static int nfs_verify_server_address(struct sockaddr *addr)
        return 0;
 }
 
+#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT
+static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx)
+{
+       return true;
+}
+#else
+static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx)
+{
+       if (ctx->version == 4)
+               return true;
+       return false;
+}
+#endif
+
 /*
  * Sanity check the NFS transport protocol.
- *
  */
-static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx)
+static int nfs_validate_transport_protocol(struct fs_context *fc,
+                                          struct nfs_fs_context *ctx)
 {
        switch (ctx->nfs_server.protocol) {
        case XPRT_TRANSPORT_UDP:
+               if (nfs_server_transport_udp_invalid(ctx))
+                       goto out_invalid_transport_udp;
+               break;
        case XPRT_TRANSPORT_TCP:
        case XPRT_TRANSPORT_RDMA:
                break;
        default:
                ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
        }
+       return 0;
+out_invalid_transport_udp:
+       return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
 }
 
 /*
@@ -305,8 +325,6 @@ static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx)
  */
 static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx)
 {
-       nfs_validate_transport_protocol(ctx);
-
        if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP ||
            ctx->mount_server.protocol == XPRT_TRANSPORT_TCP)
                        return;
@@ -932,6 +950,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
        struct nfs_fh *mntfh = ctx->mntfh;
        struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
        int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
+       int ret;
 
        if (data == NULL)
                goto out_no_data;
@@ -977,6 +996,15 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
                               sizeof(mntfh->data) - mntfh->size);
 
                /*
+                * proto == XPRT_TRANSPORT_UDP uses an exponential backoff,
+                * so data->retrans acts as a shift count: limit it to below
+                * BITS_PER_LONG (majortimeo is an unsigned long)
+                */
+               if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */
+                       if (data->retrans >= 64) /* shift value is too large */
+                               goto out_invalid_data;
+
+               /*
                 * Translate to nfs_fs_context, which nfs_fill_super
                 * can deal with.
                 */
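
The retrans guard above exists because a UDP mount derives its major timeout
by left-shifting by the retransmit count, and shifting an unsigned long by
BITS_PER_LONG or more is undefined. A compilable userspace sketch of the
failure mode (the backoff formula here is a simplification, not the kernel's
exact arithmetic):

    #include <stdio.h>

    int main(void)
    {
        unsigned long timeo = 7;    /* 0.7 s, in deciseconds */
        unsigned int retrans = 70;  /* bogus value from binary mount data */

        if (retrans >= 64) {        /* mirrors the new check above */
            fprintf(stderr, "NFS: invalid binary mount data\n");
            return 1;
        }
        /* with exponential backoff, roughly timeo << retrans */
        printf("major timeout ~ %lu ds\n", timeo << retrans);
        return 0;
    }
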
@@ -1048,6 +1076,10 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
                goto generic;
        }
 
+       ret = nfs_validate_transport_protocol(fc, ctx);
+       if (ret)
+               return ret;
+
        ctx->skip_reconfig_option_check = true;
        return 0;
 
@@ -1076,6 +1108,9 @@ out_no_address:
 
 out_invalid_fh:
        return nfs_invalf(fc, "NFS: invalid root filehandle");
+
+out_invalid_data:
+       return nfs_invalf(fc, "NFS: invalid binary mount data");
 }
 
 #if IS_ENABLED(CONFIG_NFS_V4)
@@ -1146,6 +1181,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
 {
        struct nfs_fs_context *ctx = nfs_fc2context(fc);
        struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+       int ret;
        char *c;
 
        if (!data) {
@@ -1218,9 +1254,9 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
        ctx->acdirmin   = data->acdirmin;
        ctx->acdirmax   = data->acdirmax;
        ctx->nfs_server.protocol = data->proto;
-       nfs_validate_transport_protocol(ctx);
-       if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
-               goto out_invalid_transport_udp;
+       ret = nfs_validate_transport_protocol(fc, ctx);
+       if (ret)
+               return ret;
 done:
        ctx->skip_reconfig_option_check = true;
        return 0;
@@ -1231,9 +1267,6 @@ out_inval_auth:
 
 out_no_address:
        return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
-
-out_invalid_transport_udp:
-       return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
 }
 #endif
 
@@ -1298,6 +1331,10 @@ static int nfs_fs_context_validate(struct fs_context *fc)
        if (!nfs_verify_server_address(sap))
                goto out_no_address;
 
+       ret = nfs_validate_transport_protocol(fc, ctx);
+       if (ret)
+               return ret;
+
        if (ctx->version == 4) {
                if (IS_ENABLED(CONFIG_NFS_V4)) {
                        if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
@@ -1306,9 +1343,6 @@ static int nfs_fs_context_validate(struct fs_context *fc)
                                port = NFS_PORT;
                        max_namelen = NFS4_MAXNAMLEN;
                        max_pathlen = NFS4_MAXPATHLEN;
-                       nfs_validate_transport_protocol(ctx);
-                       if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
-                               goto out_invalid_transport_udp;
                        ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL |
                                        NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK |
                                        NFS_MOUNT_LOCAL_FCNTL);
@@ -1317,10 +1351,6 @@ static int nfs_fs_context_validate(struct fs_context *fc)
                }
        } else {
                nfs_set_mount_transport_protocol(ctx);
-#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT
-              if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
-                      goto out_invalid_transport_udp;
-#endif
                if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
                        port = NFS_RDMA_PORT;
        }
@@ -1354,8 +1384,6 @@ out_no_device_name:
 out_v4_not_compiled:
        nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
        return -EPROTONOSUPPORT;
-out_invalid_transport_udp:
-       return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
 out_no_address:
        return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
 out_mountproto_mismatch:
index 5a8854d..529c409 100644 (file)
@@ -164,34 +164,19 @@ static int nfs_attribute_timeout(struct inode *inode)
        return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
-static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags)
+static bool nfs_check_cache_flags_invalid(struct inode *inode,
+                                         unsigned long flags)
 {
        unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
 
-       /* Special case for the pagecache or access cache */
-       if (flags == NFS_INO_REVAL_PAGECACHE &&
-           !(cache_validity & NFS_INO_REVAL_FORCED))
-               return false;
        return (cache_validity & flags) != 0;
 }
 
-static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags)
-{
-       unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
-
-       if ((cache_validity & flags) != 0)
-               return true;
-       if (nfs_attribute_timeout(inode))
-               return true;
-       return false;
-}
-
 bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
 {
-       if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
-               return nfs_check_cache_invalid_delegated(inode, flags);
-
-       return nfs_check_cache_invalid_not_delegated(inode, flags);
+       if (nfs_check_cache_flags_invalid(inode, flags))
+               return true;
+       return nfs_attribute_cache_expired(inode);
 }
 EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
 
@@ -214,20 +199,21 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 
        if (have_delegation) {
                if (!(flags & NFS_INO_REVAL_FORCED))
-                       flags &= ~NFS_INO_INVALID_OTHER;
-               flags &= ~(NFS_INO_INVALID_CHANGE
-                               | NFS_INO_INVALID_SIZE
-                               | NFS_INO_REVAL_PAGECACHE
-                               | NFS_INO_INVALID_XATTR);
-       }
+                       flags &= ~(NFS_INO_INVALID_MODE |
+                                  NFS_INO_INVALID_OTHER |
+                                  NFS_INO_INVALID_XATTR);
+               flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
+       } else if (flags & NFS_INO_REVAL_PAGECACHE)
+               flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE;
 
        if (!nfs_has_xattr_cache(nfsi))
                flags &= ~NFS_INO_INVALID_XATTR;
+       if (flags & NFS_INO_INVALID_DATA)
+               nfs_fscache_invalidate(inode);
        if (inode->i_mapping->nrpages == 0)
                flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
+       flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED);
        nfsi->cache_validity |= flags;
-       if (flags & NFS_INO_INVALID_DATA)
-               nfs_fscache_invalidate(inode);
 }
 EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
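
A minimal, compilable sketch of the flag translation above, using
illustrative flag values rather than the kernel's: with a delegation the
client is authoritative for change and size, so those invalidations are
dropped; without one, a legacy NFS_INO_REVAL_PAGECACHE request expands to
change + size before the legacy bit itself is cleared.

    #include <stdio.h>

    #define INVALID_CHANGE  0x1UL   /* illustrative values only */
    #define INVALID_SIZE    0x2UL
    #define REVAL_PAGECACHE 0x4UL

    static unsigned long adjust_flags(unsigned long flags, int have_delegation)
    {
        if (have_delegation)
            flags &= ~(INVALID_CHANGE | INVALID_SIZE);
        else if (flags & REVAL_PAGECACHE)
            flags |= INVALID_CHANGE | INVALID_SIZE;
        return flags & ~REVAL_PAGECACHE;
    }

    int main(void)
    {
        printf("%#lx\n", adjust_flags(REVAL_PAGECACHE, 0)); /* 0x3 */
        printf("%#lx\n", adjust_flags(REVAL_PAGECACHE, 1)); /* 0 */
        return 0;
    }
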
 
@@ -452,6 +438,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                .fattr  = fattr
        };
        struct inode *inode = ERR_PTR(-ENOENT);
+       u64 fattr_supported = NFS_SB(sb)->fattr_valid;
        unsigned long hash;
 
        nfs_attr_check_mountpoint(sb, fattr);
@@ -484,8 +471,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                inode->i_mode = fattr->mode;
                nfsi->cache_validity = 0;
                if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
-                               && nfs_server_capable(inode, NFS_CAP_MODE))
-                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+                               && (fattr_supported & NFS_ATTR_FATTR_MODE))
+                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
                /* Why so? Because we want revalidate for devices/FIFOs, and
                 * that's precisely what we have in nfs_file_inode_operations.
                 */
@@ -530,15 +517,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                nfsi->attr_gencount = fattr->gencount;
                if (fattr->valid & NFS_ATTR_FATTR_ATIME)
                        inode->i_atime = fattr->atime;
-               else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+               else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
                if (fattr->valid & NFS_ATTR_FATTR_MTIME)
                        inode->i_mtime = fattr->mtime;
-               else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+               else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
                if (fattr->valid & NFS_ATTR_FATTR_CTIME)
                        inode->i_ctime = fattr->ctime;
-               else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+               else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
                if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
                        inode_set_iversion_raw(inode, fattr->change_attr);
@@ -550,29 +537,31 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE);
                if (fattr->valid & NFS_ATTR_FATTR_NLINK)
                        set_nlink(inode, fattr->nlink);
-               else if (nfs_server_capable(inode, NFS_CAP_NLINK))
-                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+               else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
+                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
                if (fattr->valid & NFS_ATTR_FATTR_OWNER)
                        inode->i_uid = fattr->uid;
-               else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+               else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
                if (fattr->valid & NFS_ATTR_FATTR_GROUP)
                        inode->i_gid = fattr->gid;
-               else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+               else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
                if (nfs_server_capable(inode, NFS_CAP_XATTR))
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
                if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
                        inode->i_blocks = fattr->du.nfs2.blocks;
+               else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED &&
+                        fattr->size != 0)
+                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
                if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
                        /*
                         * report the blocks in 512byte units
                         */
                        inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-               }
-
-               if (nfsi->cache_validity != 0)
-                       nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
+               } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED &&
+                          fattr->size != 0)
+                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
 
                nfs_setsecurity(inode, fattr, label);
 
@@ -634,8 +623,7 @@ nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
        }
 
        /* Optimization: if the end result is no change, don't RPC */
-       attr->ia_valid &= NFS_VALID_ATTRS;
-       if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+       if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0)
                return 0;
 
        trace_nfs_setattr_enter(inode);
@@ -710,12 +698,20 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
        spin_lock(&inode->i_lock);
        NFS_I(inode)->attr_gencount = fattr->gencount;
        if ((attr->ia_valid & ATTR_SIZE) != 0) {
-               nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+               nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME |
+                                                    NFS_INO_INVALID_BLOCKS);
                nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
                nfs_vmtruncate(inode, attr->ia_size);
        }
        if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
                NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME;
+               if ((attr->ia_valid & ATTR_KILL_SUID) != 0 &&
+                   inode->i_mode & S_ISUID)
+                       inode->i_mode &= ~S_ISUID;
+               if ((attr->ia_valid & ATTR_KILL_SGID) != 0 &&
+                   (inode->i_mode & (S_ISGID | S_IXGRP)) ==
+                    (S_ISGID | S_IXGRP))
+                       inode->i_mode &= ~S_ISGID;
                if ((attr->ia_valid & ATTR_MODE) != 0) {
                        int mode = attr->ia_mode & S_IALLUGO;
                        mode |= inode->i_mode & ~S_IALLUGO;
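
A standalone illustration of the bit-clearing added above, simplified by
assuming both ATTR_KILL_SUID and ATTR_KILL_SGID are requested: SUID is
dropped unconditionally, while SGID is dropped only when group-execute is
also set (SGID without group-execute denotes mandatory locking and must
survive).

    #include <stdio.h>
    #include <sys/stat.h>

    static unsigned int kill_priv_bits(unsigned int mode)
    {
        mode &= ~S_ISUID;
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
            mode &= ~S_ISGID;
        return mode;
    }

    int main(void)
    {
        printf("%04o -> %04o\n", 06775u, kill_priv_bits(06775)); /* drops both */
        printf("%04o -> %04o\n", 02664u, kill_priv_bits(02664)); /* keeps SGID */
        return 0;
    }
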
@@ -793,14 +789,28 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
        dput(parent);
 }
 
-static bool nfs_need_revalidate_inode(struct inode *inode)
+static u32 nfs_get_valid_attrmask(struct inode *inode)
 {
-       if (NFS_I(inode)->cache_validity &
-                       (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
-               return true;
-       if (nfs_attribute_cache_expired(inode))
-               return true;
-       return false;
+       unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+       u32 reply_mask = STATX_INO | STATX_TYPE;
+
+       if (!(cache_validity & NFS_INO_INVALID_ATIME))
+               reply_mask |= STATX_ATIME;
+       if (!(cache_validity & NFS_INO_INVALID_CTIME))
+               reply_mask |= STATX_CTIME;
+       if (!(cache_validity & NFS_INO_INVALID_MTIME))
+               reply_mask |= STATX_MTIME;
+       if (!(cache_validity & NFS_INO_INVALID_SIZE))
+               reply_mask |= STATX_SIZE;
+       if (!(cache_validity & NFS_INO_INVALID_NLINK))
+               reply_mask |= STATX_NLINK;
+       if (!(cache_validity & NFS_INO_INVALID_MODE))
+               reply_mask |= STATX_MODE;
+       if (!(cache_validity & NFS_INO_INVALID_OTHER))
+               reply_mask |= STATX_UID | STATX_GID;
+       if (!(cache_validity & NFS_INO_INVALID_BLOCKS))
+               reply_mask |= STATX_BLOCKS;
+       return reply_mask;
 }
 
 int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
@@ -815,9 +825,13 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 
        trace_nfs_getattr_enter(inode);
 
+       request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID |
+                       STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME |
+                       STATX_INO | STATX_SIZE | STATX_BLOCKS;
+
        if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
                nfs_readdirplus_parent_cache_hit(path->dentry);
-               goto out_no_update;
+               goto out_no_revalidate;
        }
 
        /* Flush out writes to the server in order to update c/mtime.  */
@@ -850,14 +864,24 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
        /* Check whether the cached attributes are stale */
        do_update |= force_sync || nfs_attribute_cache_expired(inode);
        cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
-       do_update |= cache_validity &
-               (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL);
+       do_update |= cache_validity & NFS_INO_INVALID_CHANGE;
        if (request_mask & STATX_ATIME)
                do_update |= cache_validity & NFS_INO_INVALID_ATIME;
-       if (request_mask & (STATX_CTIME|STATX_MTIME))
-               do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE;
+       if (request_mask & STATX_CTIME)
+               do_update |= cache_validity & NFS_INO_INVALID_CTIME;
+       if (request_mask & STATX_MTIME)
+               do_update |= cache_validity & NFS_INO_INVALID_MTIME;
+       if (request_mask & STATX_SIZE)
+               do_update |= cache_validity & NFS_INO_INVALID_SIZE;
+       if (request_mask & STATX_NLINK)
+               do_update |= cache_validity & NFS_INO_INVALID_NLINK;
+       if (request_mask & STATX_MODE)
+               do_update |= cache_validity & NFS_INO_INVALID_MODE;
+       if (request_mask & (STATX_UID | STATX_GID))
+               do_update |= cache_validity & NFS_INO_INVALID_OTHER;
        if (request_mask & STATX_BLOCKS)
                do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
+
        if (do_update) {
                /* Update the attribute cache */
                if (!(server->flags & NFS_MOUNT_NOAC))
@@ -871,8 +895,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
                nfs_readdirplus_parent_cache_hit(path->dentry);
 out_no_revalidate:
        /* Only return attributes that were revalidated. */
-       stat->result_mask &= request_mask;
-out_no_update:
+       stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask;
+
        generic_fillattr(&init_user_ns, inode, stat);
        stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
        if (S_ISDIR(inode->i_mode))
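
From userspace, the finer-grained checks mean a statx() that asks only for
specific fields can be served from still-valid cached attributes, and
AT_STATX_DONT_SYNC skips revalidation entirely. A sketch using a
hypothetical NFS path:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/stat.h>
    #include <stdio.h>

    int main(void)
    {
        struct statx stx;

        /* take whatever is cached; no GETATTR round trip */
        if (statx(AT_FDCWD, "/mnt/nfs/file", AT_STATX_DONT_SYNC,
                  STATX_SIZE, &stx) == 0)
            printf("cached size: %llu\n",
                   (unsigned long long)stx.stx_size);
        return 0;
    }
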
@@ -963,7 +987,6 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 {
        struct nfs_inode *nfsi;
        struct inode *inode;
-       struct nfs_server *server;
 
        if (!(ctx->mode & FMODE_WRITE))
                return;
@@ -979,10 +1002,10 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
                return;
        if (!list_empty(&nfsi->open_files))
                return;
-       server = NFS_SERVER(inode);
-       if (server->flags & NFS_MOUNT_NOCTO)
+       if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO)
                return;
-       nfs_revalidate_inode(server, inode);
+       nfs_revalidate_inode(inode,
+                            NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
 }
 EXPORT_SYMBOL_GPL(nfs_close_context);
 
@@ -1237,16 +1260,16 @@ int nfs_attribute_cache_expired(struct inode *inode)
 
 /**
  * nfs_revalidate_inode - Revalidate the inode attributes
- * @server: pointer to nfs_server struct
  * @inode: pointer to inode struct
+ * @flags: cache flags to check
  *
  * Updates inode attribute information by retrieving the data from the server.
  */
-int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+int nfs_revalidate_inode(struct inode *inode, unsigned long flags)
 {
-       if (!nfs_need_revalidate_inode(inode))
+       if (!nfs_check_cache_invalid(inode, flags))
                return NFS_STALE(inode) ? -ESTALE : 0;
-       return __nfs_revalidate_inode(server, inode);
+       return __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 }
 EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
 
@@ -1332,7 +1355,7 @@ out:
 
 bool nfs_mapping_need_revalidate_inode(struct inode *inode)
 {
-       return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
+       return nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE) ||
                NFS_STALE(inode);
 }
 
@@ -1468,8 +1491,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        if (!nfs_file_has_buffered_writers(nfsi)) {
                /* Verify a few of the more important attributes */
                if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
-                       invalid |= NFS_INO_INVALID_CHANGE
-                               | NFS_INO_REVAL_PAGECACHE;
+                       invalid |= NFS_INO_INVALID_CHANGE;
 
                ts = inode->i_mtime;
                if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
@@ -1483,28 +1505,21 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
                        cur_size = i_size_read(inode);
                        new_isize = nfs_size_to_loff_t(fattr->size);
                        if (cur_size != new_isize)
-                               invalid |= NFS_INO_INVALID_SIZE
-                                       | NFS_INO_REVAL_PAGECACHE;
+                               invalid |= NFS_INO_INVALID_SIZE;
                }
        }
 
        /* Have any file permissions changed? */
        if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
-               invalid |= NFS_INO_INVALID_ACCESS
-                       | NFS_INO_INVALID_ACL
-                       | NFS_INO_INVALID_OTHER;
+               invalid |= NFS_INO_INVALID_MODE;
        if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
-               invalid |= NFS_INO_INVALID_ACCESS
-                       | NFS_INO_INVALID_ACL
-                       | NFS_INO_INVALID_OTHER;
+               invalid |= NFS_INO_INVALID_OTHER;
        if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))
-               invalid |= NFS_INO_INVALID_ACCESS
-                       | NFS_INO_INVALID_ACL
-                       | NFS_INO_INVALID_OTHER;
+               invalid |= NFS_INO_INVALID_OTHER;
 
        /* Has the link count changed? */
        if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
-               invalid |= NFS_INO_INVALID_OTHER;
+               invalid |= NFS_INO_INVALID_NLINK;
 
        ts = inode->i_atime;
        if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
@@ -1642,41 +1657,142 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
 #endif
 
 /**
- * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * nfs_inode_attrs_cmp_generic - compare attributes
+ * @fattr: attributes
  * @inode: pointer to inode
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ * Note also the check for wraparound of 'attr_gencount'.
+ *
+ * The function returns '1' if it thinks the attributes in @fattr are
+ * more recent than the ones cached in @inode. Otherwise it returns
+ * the value '0'.
+ */
+static int nfs_inode_attrs_cmp_generic(const struct nfs_fattr *fattr,
+                                      const struct inode *inode)
+{
+       unsigned long attr_gencount = NFS_I(inode)->attr_gencount;
+
+       return (long)(fattr->gencount - attr_gencount) > 0 ||
+              (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0;
+}
+
+/**
+ * nfs_inode_attrs_cmp_monotonic - compare attributes
  * @fattr: attributes
+ * @inode: pointer to inode
  *
  * Attempt to divine whether or not an RPC call reply carrying stale
  * attributes got scheduled after another call carrying updated ones.
  *
- * To do so, the function first assumes that a more recent ctime means
- * that the attributes in fattr are newer, however it also attempt to
- * catch the case where ctime either didn't change, or went backwards
- * (if someone reset the clock on the server) by looking at whether
- * or not this RPC call was started after the inode was last updated.
- * Note also the check for wraparound of 'attr_gencount'
+ * We assume that the server observes monotonic semantics for
+ * the change attribute, so a larger value means that the attributes in
+ * @fattr are more recent, in which case the function returns the
+ * value '1'.
+ * A return value of '0' indicates no measurable change.
+ * A return value of '-1' means that the attributes in @inode are
+ * more recent.
+ */
+static int nfs_inode_attrs_cmp_monotonic(const struct nfs_fattr *fattr,
+                                        const struct inode *inode)
+{
+       s64 diff = fattr->change_attr - inode_peek_iversion_raw(inode);
+       if (diff > 0)
+               return 1;
+       return diff == 0 ? 0 : -1;
+}
+
+/**
+ * nfs_inode_attrs_cmp_strict_monotonic - compare attributes
+ * @fattr: attributes
+ * @inode: pointer to inode
  *
- * The function returns 'true' if it thinks the attributes in 'fattr' are
- * more recent than the ones cached in the inode.
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
  *
+ * We assume that the server observes strictly monotonic semantics for
+ * the change attribute, so a larger value means that the attributes in
+ * @fattr are more recent, in which case the function returns the
+ * value '1'.
+ * A return value of '-1' means that the attributes in @inode are
+ * more recent or unchanged.
  */
-static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+static int nfs_inode_attrs_cmp_strict_monotonic(const struct nfs_fattr *fattr,
+                                               const struct inode *inode)
 {
-       const struct nfs_inode *nfsi = NFS_I(inode);
+       return nfs_inode_attrs_cmp_monotonic(fattr, inode) > 0 ? 1 : -1;
+}
 
-       return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
-               ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
+/**
+ * nfs_inode_attrs_cmp - compare attributes
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * This function returns '1' if it thinks the attributes in @fattr are
+ * more recent than the ones cached in @inode. It returns '-1' if
+ * the attributes in @inode are more recent than the ones in @fattr,
+ * and it returns 0 if not sure.
+ */
+static int nfs_inode_attrs_cmp(const struct nfs_fattr *fattr,
+                              const struct inode *inode)
+{
+       if (nfs_inode_attrs_cmp_generic(fattr, inode) > 0)
+               return 1;
+       switch (NFS_SERVER(inode)->change_attr_type) {
+       case NFS4_CHANGE_TYPE_IS_UNDEFINED:
+               break;
+       case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+               if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE))
+                       break;
+               return nfs_inode_attrs_cmp_monotonic(fattr, inode);
+       default:
+               if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE))
+                       break;
+               return nfs_inode_attrs_cmp_strict_monotonic(fattr, inode);
+       }
+       return 0;
+}
+
+/**
+ * nfs_inode_finish_partial_attr_update - complete a previous inode update
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * Returns '1' if the last attribute update left the inode's cached
+ * attributes in a partially unrevalidated state, and @fattr
+ * matches the change attribute of that partial update.
+ * Otherwise returns '0'.
+ */
+static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
+                                               const struct inode *inode)
+{
+       const unsigned long check_valid =
+               NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
+               NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+               NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER |
+               NFS_INO_INVALID_NLINK;
+       unsigned long cache_validity = NFS_I(inode)->cache_validity;
+
+       if (!(cache_validity & NFS_INO_INVALID_CHANGE) &&
+           (cache_validity & check_valid) != 0 &&
+           (fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+           nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0)
+               return 1;
+       return 0;
 }
 
-static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+static int nfs_refresh_inode_locked(struct inode *inode,
+                                   struct nfs_fattr *fattr)
 {
-       int ret;
+       int attr_cmp = nfs_inode_attrs_cmp(fattr, inode);
+       int ret = 0;
 
        trace_nfs_refresh_inode_enter(inode);
 
-       if (nfs_inode_attrs_need_update(inode, fattr))
+       if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode))
                ret = nfs_update_inode(inode, fattr);
-       else
+       else if (attr_cmp == 0)
                ret = nfs_check_inode_attributes(inode, fattr);
 
        trace_nfs_refresh_inode_exit(inode, ret);
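
The comparison helpers above all rely on the same wraparound-safe idiom:
subtract as unsigned, then interpret the difference as signed, so "newer"
stays well defined even after the counter wraps. A compilable sketch:

    #include <stdio.h>

    static int gen_newer(unsigned long a, unsigned long b)
    {
        return (long)(a - b) > 0;  /* as in nfs_inode_attrs_cmp_generic() */
    }

    int main(void)
    {
        unsigned long old = ~0UL - 1;  /* counter about to wrap */
        unsigned long new = old + 3;   /* wrapped past zero */

        printf("plain compare says newer: %d\n", new > old);            /* 0 */
        printf("wrap-safe compare says newer: %d\n", gen_newer(new, old)); /* 1 */
        return 0;
    }
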
@@ -1761,11 +1877,13 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
  */
 int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
+       int attr_cmp = nfs_inode_attrs_cmp(fattr, inode);
        int status;
 
        /* Don't do a WCC update if these attributes are already stale */
-       if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
-                       !nfs_inode_attrs_need_update(inode, fattr)) {
+       if (attr_cmp < 0)
+               return 0;
+       if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) {
                fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
                                | NFS_ATTR_FATTR_PRESIZE
                                | NFS_ATTR_FATTR_PREMTIME
@@ -1839,9 +1957,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
  */
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
-       struct nfs_server *server;
+       struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
        loff_t cur_isize, new_isize;
+       u64 fattr_supported = server->fattr_valid;
        unsigned long invalid = 0;
        unsigned long now = jiffies;
        unsigned long save_cache_validity;
@@ -1885,7 +2004,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                goto out_err;
        }
 
-       server = NFS_SERVER(inode);
        /* Update the fsid? */
        if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
                        !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
@@ -1904,14 +2022,17 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
                        | NFS_INO_INVALID_ATIME
                        | NFS_INO_REVAL_FORCED
-                       | NFS_INO_REVAL_PAGECACHE
                        | NFS_INO_INVALID_BLOCKS);
 
        /* Do atomic weak cache consistency updates */
        nfs_wcc_update_inode(inode, fattr);
 
        if (pnfs_layoutcommit_outstanding(inode)) {
-               nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
+               nfsi->cache_validity |=
+                       save_cache_validity &
+                       (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
+                        NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+                        NFS_INO_INVALID_BLOCKS);
                cache_revalidated = false;
        }
 
@@ -1928,6 +2049,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                save_cache_validity |= NFS_INO_INVALID_CTIME
                                        | NFS_INO_INVALID_MTIME
                                        | NFS_INO_INVALID_SIZE
+                                       | NFS_INO_INVALID_BLOCKS
+                                       | NFS_INO_INVALID_NLINK
+                                       | NFS_INO_INVALID_MODE
                                        | NFS_INO_INVALID_OTHER;
                                if (S_ISDIR(inode->i_mode))
                                        nfs_force_lookup_revalidate(inode);
@@ -1940,28 +2064,24 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        attr_changed = true;
                }
        } else {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_CHANGE
-                               | NFS_INO_REVAL_PAGECACHE
-                               | NFS_INO_REVAL_FORCED);
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_CHANGE;
                cache_revalidated = false;
        }
 
        if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
                inode->i_mtime = fattr->mtime;
-       } else if (server->caps & NFS_CAP_MTIME) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_MTIME
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_MTIME) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_MTIME;
                cache_revalidated = false;
        }
 
        if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
                inode->i_ctime = fattr->ctime;
-       } else if (server->caps & NFS_CAP_CTIME) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_CTIME
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_CTIME) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_CTIME;
                cache_revalidated = false;
        }
 
@@ -1985,21 +2105,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                        (long long)cur_isize,
                                        (long long)new_isize);
                }
+               if (new_isize == 0 &&
+                   !(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED |
+                                     NFS_ATTR_FATTR_BLOCKS_USED))) {
+                       fattr->du.nfs3.used = 0;
+                       fattr->valid |= NFS_ATTR_FATTR_SPACE_USED;
+               }
        } else {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_SIZE
-                               | NFS_INO_REVAL_PAGECACHE
-                               | NFS_INO_REVAL_FORCED);
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_SIZE;
                cache_revalidated = false;
        }
 
-
        if (fattr->valid & NFS_ATTR_FATTR_ATIME)
                inode->i_atime = fattr->atime;
-       else if (server->caps & NFS_CAP_ATIME) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_ATIME
-                               | NFS_INO_REVAL_FORCED);
+       else if (fattr_supported & NFS_ATTR_FATTR_ATIME) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_ATIME;
                cache_revalidated = false;
        }
 
@@ -2012,10 +2134,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                | NFS_INO_INVALID_ACL;
                        attr_changed = true;
                }
-       } else if (server->caps & NFS_CAP_MODE) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_OTHER
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_MODE) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_MODE;
                cache_revalidated = false;
        }
 
@@ -2026,10 +2147,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        inode->i_uid = fattr->uid;
                        attr_changed = true;
                }
-       } else if (server->caps & NFS_CAP_OWNER) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_OTHER
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_OTHER;
                cache_revalidated = false;
        }
 
@@ -2040,10 +2160,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        inode->i_gid = fattr->gid;
                        attr_changed = true;
                }
-       } else if (server->caps & NFS_CAP_OWNER_GROUP) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_OTHER
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_OTHER;
                cache_revalidated = false;
        }
 
@@ -2054,10 +2173,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        set_nlink(inode, fattr->nlink);
                        attr_changed = true;
                }
-       } else if (server->caps & NFS_CAP_NLINK) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_OTHER
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_NLINK;
                cache_revalidated = false;
        }
 
@@ -2066,18 +2184,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                 * report the blocks in 512byte units
                 */
                inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-       } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+       } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_BLOCKS;
+               cache_revalidated = false;
+       }
+
+       if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) {
                inode->i_blocks = fattr->du.nfs2.blocks;
-       else {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_BLOCKS
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_BLOCKS;
                cache_revalidated = false;
        }
 
        /* Update attrtimeo value if we're out of the unstable period */
        if (attr_changed) {
-               invalid &= ~NFS_INO_INVALID_ATTR;
                nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
                nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
                nfsi->attrtimeo_timestamp = now;
@@ -2094,7 +2216,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        nfsi->attrtimeo_timestamp = now;
                }
                /* Set the barrier to be more recent than this fattr */
-               if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+               if ((long)(fattr->gencount - nfsi->attr_gencount) > 0)
                        nfsi->attr_gencount = fattr->gencount;
        }
 
index 7395d09..a36af04 100644 (file)
@@ -181,7 +181,7 @@ struct nfs_mount_request {
        struct net              *net;
 };
 
-extern int nfs_mount(struct nfs_mount_request *info);
+extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans);
 extern void nfs_umount(const struct nfs_mount_request *info);
 
 /* client.c */
index 5088fda..b5551ed 100644 (file)
@@ -104,7 +104,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
 }
 
 /**
- * nfs_end_io_direct - declare the file is being used for direct i/o
+ * nfs_start_io_direct - declare the file is being used for direct i/o
  * @inode: file inode
  *
  * Declare that a direct I/O operation is about to start, and ensure
index dda5c3e..c5e3b6b 100644 (file)
@@ -136,14 +136,16 @@ struct mnt_fhstatus {
 /**
  * nfs_mount - Obtain an NFS file handle for the given host and path
  * @info: pointer to mount request arguments
+ * @timeo: deciseconds the mount waits for a response before it retries
+ * @retrans: number of times the mount retries a request
  *
- * Uses default timeout parameters specified by underlying transport. On
- * successful return, the auth_flavs list and auth_flav_len will be populated
- * with the list from the server or a faked-up list if the server didn't
- * provide one.
+ * Uses timeout parameters specified by the caller. On successful return, the
+ * auth_flavs list and auth_flav_len will be populated with the list from the
+ * server or a faked-up list if the server didn't provide one.
  */
-int nfs_mount(struct nfs_mount_request *info)
+int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans)
 {
+       struct rpc_timeout mnt_timeout;
        struct mountres result = {
                .fh             = info->fh,
                .auth_count     = info->auth_flav_len,
@@ -158,6 +160,7 @@ int nfs_mount(struct nfs_mount_request *info)
                .protocol       = info->protocol,
                .address        = info->sap,
                .addrsize       = info->salen,
+               .timeout        = &mnt_timeout,
                .servername     = info->hostname,
                .program        = &mnt_program,
                .version        = info->version,
@@ -177,6 +180,7 @@ int nfs_mount(struct nfs_mount_request *info)
        if (info->noresvport)
                args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 
+       nfs_init_timeout_values(&mnt_timeout, info->protocol, timeo, retrans);
        mnt_clnt = rpc_create(&args);
        if (IS_ERR(mnt_clnt))
                goto out_clnt_err;
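
A hedged illustration of what the caller-supplied values mean: timeo is in
deciseconds and retrans bounds the retry count. The arithmetic below is a
simplification of what nfs_init_timeout_values() computes (the kernel also
clamps the values and applies exponential backoff on UDP), not its exact
math:

    #include <stdio.h>

    int main(void)
    {
        unsigned int timeo = 600;  /* e.g. mount -o timeo=600 -> 60 s */
        unsigned int retrans = 2;  /* e.g. mount -o retrans=2 */

        for (unsigned int attempt = 0; attempt <= retrans; attempt++)
            printf("attempt %u: wait %u ms\n", attempt + 1, timeo * 100);
        return 0;
    }
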
index bb386a6..9ec560a 100644 (file)
@@ -65,7 +65,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
        if (!nfs_server_capable(inode, NFS_CAP_ACLS))
                return ERR_PTR(-EOPNOTSUPP);
 
-       status = nfs_revalidate_inode(server, inode);
+       status = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
        if (status < 0)
                return ERR_PTR(status);
 
index ed1c837..e6eca1d 100644 (file)
@@ -433,7 +433,7 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
        if (unlikely(!p))
                return -EIO;
        length = be32_to_cpup(p++);
-       if (unlikely(length > NFS3_FHSIZE))
+       if (unlikely(length > NFS3_FHSIZE || length == 0))
                goto out_toobig;
        p = xdr_inline_decode(xdr, length);
        if (unlikely(!p))
@@ -442,7 +442,7 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
        memcpy(fh->data, p, length);
        return 0;
 out_toobig:
-       dprintk("NFS: file handle size (%u) too big\n", length);
+       trace_nfs_xdr_bad_filehandle(xdr, NFSERR_BADHANDLE);
        return -E2BIG;
 }
 
@@ -2227,6 +2227,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
 
        /* ignore properties */
        result->lease_time = 0;
+       result->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
        return 0;
 }
 
index 094024b..a243495 100644 (file)
@@ -46,11 +46,12 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 {
        struct inode *inode = file_inode(filep);
        struct nfs_server *server = NFS_SERVER(inode);
+       u32 bitmask[3];
        struct nfs42_falloc_args args = {
                .falloc_fh      = NFS_FH(inode),
                .falloc_offset  = offset,
                .falloc_length  = len,
-               .falloc_bitmask = nfs4_fattr_bitmap,
+               .falloc_bitmask = bitmask,
        };
        struct nfs42_falloc_res res = {
                .falloc_server  = server,
@@ -68,6 +69,10 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
                return status;
        }
 
+       memcpy(bitmask, server->cache_consistency_bitmask, sizeof(bitmask));
+       if (server->attr_bitmask[1] & FATTR4_WORD1_SPACE_USED)
+               bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+
        res.falloc_fattr = nfs_alloc_fattr();
        if (!res.falloc_fattr)
                return -ENOMEM;
@@ -75,7 +80,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
        status = nfs4_call_sync(server->client, server, msg,
                                &args.seq_args, &res.seq_res, 0);
        if (status == 0)
-               status = nfs_post_op_update_inode(inode, res.falloc_fattr);
+               status = nfs_post_op_update_inode_force_wcc(inode,
+                                                           res.falloc_fattr);
 
        kfree(res.falloc_fattr);
        return status;
@@ -84,7 +90,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
                                loff_t offset, loff_t len)
 {
-       struct nfs_server *server = NFS_SERVER(file_inode(filep));
+       struct inode *inode = file_inode(filep);
+       struct nfs_server *server = NFS_SERVER(inode);
        struct nfs4_exception exception = { };
        struct nfs_lock_context *lock;
        int err;
@@ -93,9 +100,13 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
        if (IS_ERR(lock))
                return PTR_ERR(lock);
 
-       exception.inode = file_inode(filep);
+       exception.inode = inode;
        exception.state = lock->open_context->state;
 
+       err = nfs_sync_inode(inode);
+       if (err)
+               goto out;
+
        do {
                err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
                if (err == -ENOTSUPP) {
@@ -104,7 +115,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
                }
                err = nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
-
+out:
        nfs_put_lock_context(lock);
        return err;
 }
@@ -142,16 +153,13 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
                return -EOPNOTSUPP;
 
        inode_lock(inode);
-       err = nfs_sync_inode(inode);
-       if (err)
-               goto out_unlock;
 
        err = nfs42_proc_fallocate(&msg, filep, offset, len);
        if (err == 0)
                truncate_pagecache_range(inode, offset, (offset + len) -1);
        if (err == -EOPNOTSUPP)
                NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
-out_unlock:
+
        inode_unlock(inode);
        return err;
 }
@@ -261,6 +269,33 @@ out:
        return status;
 }
 
+/**
+ * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload
+ * @inode: pointer to destination inode
+ * @pos: destination offset
+ * @len: copy length
+ *
+ * Punch a hole in the inode page cache, so that the NFS client will
+ * know to retrieve new data.
+ * Update the file size if necessary, and then mark the inode as having
+ * invalid cached values for change attribute, ctime, mtime and space used.
+ */
+static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
+{
+       loff_t newsize = pos + len;
+       loff_t end = newsize - 1;
+
+       truncate_pagecache_range(inode, pos, end);
+       spin_lock(&inode->i_lock);
+       if (newsize > i_size_read(inode))
+               i_size_write(inode, newsize);
+       nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+                                            NFS_INO_INVALID_CTIME |
+                                            NFS_INO_INVALID_MTIME |
+                                            NFS_INO_INVALID_BLOCKS);
+       spin_unlock(&inode->i_lock);
+}
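
From userspace the effect is that a server-side copy immediately invalidates
the destination's cached pages and attributes, so subsequent reads see the
copied bytes. A sketch with hypothetical paths:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
        int in = open("/mnt/nfs/src", O_RDONLY);
        int out = open("/mnt/nfs/dst", O_WRONLY | O_CREAT, 0644);
        if (in < 0 || out < 0)
            return 1;

        /* offloaded to the server when both files live on NFSv4.2 */
        ssize_t n = copy_file_range(in, NULL, out, NULL, 1 << 20, 0);
        printf("copied %zd bytes\n", n);
        return 0;
    }
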
+
 static ssize_t _nfs42_proc_copy(struct file *src,
                                struct nfs_lock_context *src_lock,
                                struct file *dst,
@@ -354,19 +389,8 @@ static ssize_t _nfs42_proc_copy(struct file *src,
                        goto out;
        }
 
-       truncate_pagecache_range(dst_inode, pos_dst,
-                                pos_dst + res->write_res.count);
-       spin_lock(&dst_inode->i_lock);
-       nfs_set_cache_invalid(
-               dst_inode, NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED |
-                                  NFS_INO_INVALID_SIZE | NFS_INO_INVALID_ATTR |
-                                  NFS_INO_INVALID_DATA);
-       spin_unlock(&dst_inode->i_lock);
-       spin_lock(&src_inode->i_lock);
-       nfs_set_cache_invalid(src_inode, NFS_INO_REVAL_PAGECACHE |
-                                                NFS_INO_REVAL_FORCED |
-                                                NFS_INO_INVALID_ATIME);
-       spin_unlock(&src_inode->i_lock);
+       nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count);
+       nfs_invalidate_atime(src_inode);
        status = res->write_res.count;
 out:
        if (args->sync)
@@ -659,7 +683,10 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
        if (status)
                return status;
 
-       return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
+       if (whence == SEEK_DATA && res.sr_eof)
+               return -NFS4ERR_NXIO;
+       else
+               return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
 }
 
 loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
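
The visible effect of the sr_eof check: seeking for data at or beyond the
end of the file now fails with ENXIO (NFS4ERR_NXIO shares its numeric
value), matching lseek(2) semantics for SEEK_DATA, instead of silently
positioning at EOF. A sketch with a hypothetical path:

    #define _GNU_SOURCE
    #include <unistd.h>
    #include <fcntl.h>
    #include <errno.h>
    #include <stdio.h>

    int main(void)
    {
        int fd = open("/mnt/nfs/sparse", O_RDONLY);
        if (fd < 0)
            return 1;
        if (lseek(fd, 1 << 30, SEEK_DATA) < 0 && errno == ENXIO)
            puts("no data at or after this offset");
        close(fd);
        return 0;
    }
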
@@ -1044,8 +1071,10 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
 
        status = nfs4_call_sync(server->client, server, msg,
                                &args.seq_args, &res.seq_res, 0);
-       if (status == 0)
+       if (status == 0) {
+               nfs42_copy_dest_done(dst_inode, dst_offset, count);
                status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+       }
 
        kfree(res.dst_fattr);
        return status;
index 6c2ce79..1c4d2a0 100644 (file)
@@ -168,7 +168,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
  *        make it easier to copy the value after an RPC, even if
  *        the value will not be passed up to application (e.g.
  *        for a 'query' getxattr with NULL buffer).
- * @len:   Length of the value. Can be 0 for zero-length attribues.
+ * @len:   Length of the value. Can be 0 for zero-length attributes.
  *         @value and @pages will be NULL if @len is 0.
  */
 static struct nfs4_xattr_entry *
index 441a2fa..57b3821 100644 (file)
@@ -420,9 +420,7 @@ static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = {
  */
 void nfs42_ssc_register_ops(void)
 {
-#ifdef CONFIG_NFSD_V4
        nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl);
-#endif
 }
 
 /**
@@ -433,9 +431,7 @@ void nfs42_ssc_register_ops(void)
  */
 void nfs42_ssc_unregister_ops(void)
 {
-#ifdef CONFIG_NFSD_V4
        nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl);
-#endif
 }
 #endif /* CONFIG_NFS_V4_2 */
 
index c65c4b4..87d04f2 100644 (file)
@@ -108,9 +108,10 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
 static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
                const struct cred *, bool);
 #endif
-static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
-               struct nfs_server *server,
-               struct nfs4_label *label);
+static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ],
+                            const __u32 *src, struct inode *inode,
+                            struct nfs_server *server,
+                            struct nfs4_label *label);
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 static inline struct nfs4_label *
@@ -263,6 +264,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
                        | FATTR4_WORD1_FS_LAYOUT_TYPES,
                        FATTR4_WORD2_LAYOUT_BLKSIZE
                        | FATTR4_WORD2_CLONE_BLKSIZE
+                       | FATTR4_WORD2_CHANGE_ATTR_TYPE
                        | FATTR4_WORD2_XATTR_SUPPORT
 };
 
@@ -283,7 +285,7 @@ const u32 nfs4_fs_locations_bitmap[3] = {
 };
 
 static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
-               struct inode *inode)
+                                   struct inode *inode, unsigned long flags)
 {
        unsigned long cache_validity;
 
@@ -291,22 +293,20 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
        if (!inode || !nfs4_have_delegation(inode, FMODE_READ))
                return;
 
-       cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
-       if (!(cache_validity & NFS_INO_REVAL_FORCED))
-               cache_validity &= ~(NFS_INO_INVALID_CHANGE
-                               | NFS_INO_INVALID_SIZE);
+       cache_validity = READ_ONCE(NFS_I(inode)->cache_validity) | flags;
 
+       /* Remove the attributes over which we have full control */
+       dst[1] &= ~FATTR4_WORD1_RAWDEV;
        if (!(cache_validity & NFS_INO_INVALID_SIZE))
                dst[0] &= ~FATTR4_WORD0_SIZE;
 
        if (!(cache_validity & NFS_INO_INVALID_CHANGE))
                dst[0] &= ~FATTR4_WORD0_CHANGE;
-}
 
-static void nfs4_bitmap_copy_adjust_setattr(__u32 *dst,
-               const __u32 *src, struct inode *inode)
-{
-       nfs4_bitmap_copy_adjust(dst, src, inode);
+       if (!(cache_validity & NFS_INO_INVALID_MODE))
+               dst[1] &= ~FATTR4_WORD1_MODE;
+       if (!(cache_validity & NFS_INO_INVALID_OTHER))
+               dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP);
 }
 
 static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -1169,14 +1169,26 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
 static void
 nfs4_inc_nlink_locked(struct inode *inode)
 {
-       nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+       nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+                                            NFS_INO_INVALID_CTIME |
+                                            NFS_INO_INVALID_NLINK);
        inc_nlink(inode);
 }
 
 static void
+nfs4_inc_nlink(struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+       nfs4_inc_nlink_locked(inode);
+       spin_unlock(&inode->i_lock);
+}
+
+static void
 nfs4_dec_nlink_locked(struct inode *inode)
 {
-       nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+       nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+                                            NFS_INO_INVALID_CTIME |
+                                            NFS_INO_INVALID_NLINK);
        drop_nlink(inode);
 }
 
@@ -1186,11 +1198,23 @@ nfs4_update_changeattr_locked(struct inode *inode,
                unsigned long timestamp, unsigned long cache_validity)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
+       u64 change_attr = inode_peek_iversion_raw(inode);
 
        cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
 
-       if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) {
-               nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
+       switch (NFS_SERVER(inode)->change_attr_type) {
+       case NFS4_CHANGE_TYPE_IS_UNDEFINED:
+               break;
+       case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+               if ((s64)(change_attr - cinfo->after) > 0)
+                       goto out;
+               break;
+       default:
+               if ((s64)(change_attr - cinfo->after) >= 0)
+                       goto out;
+       }
+
+       if (cinfo->atomic && cinfo->before == change_attr) {
                nfsi->attrtimeo_timestamp = jiffies;
        } else {
                if (S_ISDIR(inode->i_mode)) {
@@ -1202,7 +1226,7 @@ nfs4_update_changeattr_locked(struct inode *inode,
                                cache_validity |= NFS_INO_REVAL_PAGECACHE;
                }
 
-               if (cinfo->before != inode_peek_iversion_raw(inode))
+               if (cinfo->before != change_attr)
                        cache_validity |= NFS_INO_INVALID_ACCESS |
                                          NFS_INO_INVALID_ACL |
                                          NFS_INO_INVALID_XATTR;
@@ -1210,8 +1234,9 @@ nfs4_update_changeattr_locked(struct inode *inode,
        inode_set_iversion_raw(inode, cinfo->after);
        nfsi->read_cache_jiffies = timestamp;
        nfsi->attr_gencount = nfs_inc_attr_generation_counter();
-       nfs_set_cache_invalid(inode, cache_validity);
        nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE;
+out:
+       nfs_set_cache_invalid(inode, cache_validity);
 }
 
 void
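
The (s64)(change_attr - cinfo->after) tests above order change attributes correctly even if the 64-bit counter wraps, and the > versus >= split means an NFS4_CHANGE_TYPE_IS_TIME_METADATA server, whose value can legitimately repeat, still gets equal values applied. A self-contained sketch of the signed-difference ordering trick:

#include <stdio.h>
#include <stdint.h>

/* wraparound-safe: is a "after" b in modulo-2^64 arithmetic? */
static int after64(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) > 0;
}

int main(void)
{
        uint64_t near_max = UINT64_MAX - 1;

        printf("%d\n", after64(5, 2));          /* 1: plainly newer */
        printf("%d\n", after64(2, near_max));   /* 1: counter wrapped */
        printf("%d\n", after64(near_max, 2));   /* 0: older */
        return 0;
}
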
@@ -3344,12 +3369,17 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
                .inode = inode,
                .stateid = &arg.stateid,
        };
+       unsigned long adjust_flags = NFS_INO_INVALID_CHANGE;
        int err;
 
+       if (sattr->ia_valid & (ATTR_MODE | ATTR_KILL_SUID | ATTR_KILL_SGID))
+               adjust_flags |= NFS_INO_INVALID_MODE;
+       if (sattr->ia_valid & (ATTR_UID | ATTR_GID))
+               adjust_flags |= NFS_INO_INVALID_OTHER;
+
        do {
-               nfs4_bitmap_copy_adjust_setattr(bitmask,
-                               nfs4_bitmask(server, olabel),
-                               inode);
+               nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, olabel),
+                                       inode, adjust_flags);
 
                err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx);
                switch (err) {
@@ -3591,6 +3621,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
        struct nfs4_closedata *calldata = data;
        struct nfs4_state *state = calldata->state;
        struct inode *inode = calldata->inode;
+       struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layout_hdr *lo;
        bool is_rdonly, is_wronly, is_rdwr;
        int call_close = 0;
@@ -3647,8 +3678,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
        if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
                /* Close-to-open cache consistency revalidation */
                if (!nfs4_have_delegation(inode, FMODE_READ)) {
-                       calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
-                       nfs4_bitmask_adjust(calldata->arg.bitmask, inode, NFS_SERVER(inode), NULL);
+                       nfs4_bitmask_set(calldata->arg.bitmask_store,
+                                        server->cache_consistency_bitmask,
+                                        inode, server, NULL);
+                       calldata->arg.bitmask = calldata->arg.bitmask_store;
                } else
                        calldata->arg.bitmask = NULL;
        }
@@ -3835,12 +3868,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
                        res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
                }
                memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
-               server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
-                               NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
-                               NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
-                               NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
-                               NFS_CAP_CTIME|NFS_CAP_MTIME|
-                               NFS_CAP_SECURITY_LABEL);
+               server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
+                                 NFS_CAP_SYMLINKS | NFS_CAP_SECURITY_LABEL);
+               server->fattr_valid = NFS_ATTR_FATTR_V4;
                if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
                                res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
                        server->caps |= NFS_CAP_ACLS;
@@ -3848,25 +3878,29 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
                        server->caps |= NFS_CAP_HARDLINKS;
                if (res.has_symlinks != 0)
                        server->caps |= NFS_CAP_SYMLINKS;
-               if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
-                       server->caps |= NFS_CAP_FILEID;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
-                       server->caps |= NFS_CAP_MODE;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
-                       server->caps |= NFS_CAP_NLINK;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
-                       server->caps |= NFS_CAP_OWNER;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
-                       server->caps |= NFS_CAP_OWNER_GROUP;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
-                       server->caps |= NFS_CAP_ATIME;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
-                       server->caps |= NFS_CAP_CTIME;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
-                       server->caps |= NFS_CAP_MTIME;
+               if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_MODE;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_NLINK;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER))
+                       server->fattr_valid &= ~(NFS_ATTR_FATTR_OWNER |
+                               NFS_ATTR_FATTR_OWNER_NAME);
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP))
+                       server->fattr_valid &= ~(NFS_ATTR_FATTR_GROUP |
+                               NFS_ATTR_FATTR_GROUP_NAME);
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_SPACE_USED))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_SPACE_USED;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_ATIME;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
-               if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
-                       server->caps |= NFS_CAP_SECURITY_LABEL;
+               if (!(res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_V4_SECURITY_LABEL;
 #endif
                memcpy(server->attr_bitmask_nl, res.attr_bitmask,
                                sizeof(server->attr_bitmask));
@@ -4154,8 +4188,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
        if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
                task_flags |= RPC_TASK_TIMEOUT;
 
-       nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode);
-
+       nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode, 0);
        nfs_fattr_init(fattr);
        nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
        return nfs4_do_call_sync(server->client, server, &msg,
@@ -4582,11 +4615,11 @@ _nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype)
        status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
        if (status == 0) {
                spin_lock(&dir->i_lock);
-               nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp,
-                                             NFS_INO_INVALID_DATA);
                /* Removing a directory decrements nlink in the parent */
                if (ftype == NF4DIR && dir->i_nlink > 2)
                        nfs4_dec_nlink_locked(dir);
+               nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp,
+                                             NFS_INO_INVALID_DATA);
                spin_unlock(&dir->i_lock);
        }
        return status;
@@ -4715,11 +4748,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
                        /* Note: If we moved a directory, nlink will change */
                        nfs4_update_changeattr(old_dir, &res->old_cinfo,
                                        res->old_fattr->time_start,
-                                       NFS_INO_INVALID_OTHER |
+                                       NFS_INO_INVALID_NLINK |
                                            NFS_INO_INVALID_DATA);
                        nfs4_update_changeattr(new_dir, &res->new_cinfo,
                                        res->new_fattr->time_start,
-                                       NFS_INO_INVALID_OTHER |
+                                       NFS_INO_INVALID_NLINK |
                                            NFS_INO_INVALID_DATA);
                } else
                        nfs4_update_changeattr(old_dir, &res->old_cinfo,
@@ -4761,12 +4794,13 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
        }
 
        nfs4_inode_make_writeable(inode);
-       nfs4_bitmap_copy_adjust_setattr(bitmask, nfs4_bitmask(server, res.label), inode);
-
+       nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.label), inode,
+                               NFS_INO_INVALID_CHANGE);
        status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
        if (!status) {
                nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start,
                                       NFS_INO_INVALID_DATA);
+               nfs4_inc_nlink(inode);
                status = nfs_post_op_update_inode(inode, res.fattr);
                if (!status)
                        nfs_setsecurity(inode, res.fattr, res.label);
@@ -4844,12 +4878,12 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
                                    &data->arg.seq_args, &data->res.seq_res, 1);
        if (status == 0) {
                spin_lock(&dir->i_lock);
-               nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
-                               data->res.fattr->time_start,
-                               NFS_INO_INVALID_DATA);
                /* Creating a directory bumps nlink in the parent */
                if (data->arg.ftype == NF4DIR)
                        nfs4_inc_nlink_locked(dir);
+               nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+                                             data->res.fattr->time_start,
+                                             NFS_INO_INVALID_DATA);
                spin_unlock(&dir->i_lock);
                status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
        }
@@ -5416,37 +5450,39 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
        return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
 }
 
-static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
-                               struct nfs_server *server,
-                               struct nfs4_label *label)
+static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src,
+                            struct inode *inode, struct nfs_server *server,
+                            struct nfs4_label *label)
 {
-
        unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+       unsigned int i;
 
-       if ((cache_validity & NFS_INO_INVALID_DATA) ||
-               (cache_validity & NFS_INO_REVAL_PAGECACHE) ||
-               (cache_validity & NFS_INO_REVAL_FORCED) ||
-               (cache_validity & NFS_INO_INVALID_OTHER))
-               nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode);
+       memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ);
 
+       if (cache_validity & NFS_INO_INVALID_CHANGE)
+               bitmask[0] |= FATTR4_WORD0_CHANGE;
        if (cache_validity & NFS_INO_INVALID_ATIME)
                bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
+       if (cache_validity & NFS_INO_INVALID_MODE)
+               bitmask[1] |= FATTR4_WORD1_MODE;
        if (cache_validity & NFS_INO_INVALID_OTHER)
-               bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
-                               FATTR4_WORD1_OWNER_GROUP |
-                               FATTR4_WORD1_NUMLINKS;
+               bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP;
+       if (cache_validity & NFS_INO_INVALID_NLINK)
+               bitmask[1] |= FATTR4_WORD1_NUMLINKS;
        if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL)
                bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
-       if (cache_validity & NFS_INO_INVALID_CHANGE)
-               bitmask[0] |= FATTR4_WORD0_CHANGE;
        if (cache_validity & NFS_INO_INVALID_CTIME)
                bitmask[1] |= FATTR4_WORD1_TIME_METADATA;
        if (cache_validity & NFS_INO_INVALID_MTIME)
                bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
-       if (cache_validity & NFS_INO_INVALID_SIZE)
-               bitmask[0] |= FATTR4_WORD0_SIZE;
        if (cache_validity & NFS_INO_INVALID_BLOCKS)
                bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+
+       if (cache_validity & NFS_INO_INVALID_SIZE)
+               bitmask[0] |= FATTR4_WORD0_SIZE;
+
+       for (i = 0; i < NFS4_BITMASK_SZ; i++)
+               bitmask[i] &= server->attr_bitmask[i];
 }
 
 static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
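
nfs4_bitmask_set() now fills caller-owned storage instead of scribbling on the shared server->cache_consistency_bitmask (which racing requests could otherwise observe half-modified), and its final loop clamps the request to the attributes the server actually advertised. A compact sketch of that build-then-clamp flow, with illustrative flag values:

#include <stdio.h>
#include <stdint.h>

#define BITMASK_SZ 3

static void bitmask_set(uint32_t dst[BITMASK_SZ], const uint32_t *src,
                        const uint32_t supported[BITMASK_SZ],
                        uint32_t extra_word1)
{
        unsigned int i;

        for (i = 0; i < BITMASK_SZ; i++)
                dst[i] = src[i];
        dst[1] |= extra_word1;                  /* attrs we must revalidate */
        for (i = 0; i < BITMASK_SZ; i++)
                dst[i] &= supported[i];         /* never ask for unsupported */
}

int main(void)
{
        uint32_t base[BITMASK_SZ] = { 0x3, 0x0, 0x0 };
        uint32_t supported[BITMASK_SZ] = { 0xff, 0x1, 0x0 };
        uint32_t out[BITMASK_SZ];

        bitmask_set(out, base, supported, 0x3);
        printf("%x %x %x\n", (unsigned)out[0], (unsigned)out[1],
               (unsigned)out[2]);               /* 3 1 0 */
        return 0;
}
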
@@ -5459,8 +5495,10 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
                hdr->args.bitmask = NULL;
                hdr->res.fattr = NULL;
        } else {
-               hdr->args.bitmask = server->cache_consistency_bitmask;
-               nfs4_bitmask_adjust(hdr->args.bitmask, hdr->inode, server, NULL);
+               nfs4_bitmask_set(hdr->args.bitmask_store,
+                                server->cache_consistency_bitmask,
+                                hdr->inode, server, NULL);
+               hdr->args.bitmask = hdr->args.bitmask_store;
        }
 
        if (!hdr->pgio_done_cb)
@@ -5858,7 +5896,7 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
 
        if (!nfs4_server_supports_acls(server))
                return -EOPNOTSUPP;
-       ret = nfs_revalidate_inode(server, inode);
+       ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
        if (ret < 0)
                return ret;
        if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
@@ -6502,8 +6540,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
 
        data->args.fhandle = &data->fh;
        data->args.stateid = &data->stateid;
-       data->args.bitmask = server->cache_consistency_bitmask;
-       nfs4_bitmask_adjust(data->args.bitmask, inode, server, NULL);
+       nfs4_bitmask_set(data->args.bitmask_store,
+                        server->cache_consistency_bitmask, inode, server,
+                        NULL);
+       data->args.bitmask = data->args.bitmask_store;
        nfs_copy_fh(&data->fh, NFS_FH(inode));
        nfs4_stateid_copy(&data->stateid, stateid);
        data->res.fattr = &data->fattr;
@@ -7250,22 +7290,22 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
 
 #ifdef CONFIG_NFS_V4_1
 struct nfs4_lock_waiter {
-       struct task_struct      *task;
        struct inode            *inode;
-       struct nfs_lowner       *owner;
+       struct nfs_lowner       owner;
+       wait_queue_entry_t      wait;
 };
 
 static int
 nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
 {
-       int ret;
-       struct nfs4_lock_waiter *waiter = wait->private;
+       struct nfs4_lock_waiter *waiter =
+               container_of(wait, struct nfs4_lock_waiter, wait);
 
        /* NULL key means to wake up everyone */
        if (key) {
                struct cb_notify_lock_args      *cbnl = key;
                struct nfs_lowner               *lowner = &cbnl->cbnl_owner,
-                                               *wowner = waiter->owner;
+                                               *wowner = &waiter->owner;
 
                /* Only wake if the callback was for the same owner. */
                if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev)
@@ -7276,53 +7316,45 @@ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, vo
                        return 0;
        }
 
-       /* override "private" so we can use default_wake_function */
-       wait->private = waiter->task;
-       ret = woken_wake_function(wait, mode, flags, key);
-       if (ret)
-               list_del_init(&wait->entry);
-       wait->private = waiter;
-       return ret;
+       return woken_wake_function(wait, mode, flags, key);
 }
 
 static int
 nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-       int status = -ERESTARTSYS;
        struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
        struct nfs_server *server = NFS_SERVER(state->inode);
        struct nfs_client *clp = server->nfs_client;
        wait_queue_head_t *q = &clp->cl_lock_waitq;
-       struct nfs_lowner owner = { .clientid = clp->cl_clientid,
-                                   .id = lsp->ls_seqid.owner_id,
-                                   .s_dev = server->s_dev };
-       struct nfs4_lock_waiter waiter = { .task  = current,
-                                          .inode = state->inode,
-                                          .owner = &owner};
-       wait_queue_entry_t wait;
+       struct nfs4_lock_waiter waiter = {
+               .inode = state->inode,
+               .owner = { .clientid = clp->cl_clientid,
+                          .id = lsp->ls_seqid.owner_id,
+                          .s_dev = server->s_dev },
+       };
+       int status;
 
        /* Don't bother with waitqueue if we don't expect a callback */
        if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
                return nfs4_retry_setlk_simple(state, cmd, request);
 
-       init_wait(&wait);
-       wait.private = &waiter;
-       wait.func = nfs4_wake_lock_waiter;
+       init_wait(&waiter.wait);
+       waiter.wait.func = nfs4_wake_lock_waiter;
+       add_wait_queue(q, &waiter.wait);
 
-       while(!signalled()) {
-               add_wait_queue(q, &wait);
+       do {
                status = nfs4_proc_setlk(state, cmd, request);
-               if ((status != -EAGAIN) || IS_SETLK(cmd)) {
-                       finish_wait(q, &wait);
+               if (status != -EAGAIN || IS_SETLK(cmd))
                        break;
-               }
 
                status = -ERESTARTSYS;
                freezer_do_not_count();
-               wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT);
+               wait_woken(&waiter.wait, TASK_INTERRUPTIBLE,
+                          NFS4_LOCK_MAXTIMEOUT);
                freezer_count();
-               finish_wait(q, &wait);
-       }
+       } while (!signalled());
+
+       remove_wait_queue(q, &waiter.wait);
 
        return status;
 }
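
Embedding the wait_queue_entry_t inside struct nfs4_lock_waiter lets the wake function recover its container with container_of() instead of smuggling a task pointer through wait->private, which is what made the old override/restore dance necessary. The pattern in standalone form (container_of defined locally to mirror the kernel macro):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct wait_entry {
        int placeholder;                /* stands in for wait_queue_entry_t */
};

struct lock_waiter {
        int owner_id;
        struct wait_entry wait;         /* embedded, not pointed-to */
};

static void wake_fn(struct wait_entry *w)
{
        struct lock_waiter *lw = container_of(w, struct lock_waiter, wait);

        printf("woke waiter for owner %d\n", lw->owner_id);
}

int main(void)
{
        struct lock_waiter lw = { .owner_id = 42 };

        wake_fn(&lw.wait);              /* prints: woke waiter for owner 42 */
        return 0;
}
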
@@ -7615,7 +7647,7 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler,
                        return -EACCES;
        }
 
-       ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+       ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
        if (ret)
                return ret;
 
@@ -7646,7 +7678,7 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
                        return 0;
        }
 
-       ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+       ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
        if (ret)
                return ret;
 
@@ -10427,9 +10459,3 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {
 #endif
        NULL
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index ff876dd..db3811a 100644 (file)
@@ -149,9 +149,3 @@ void nfs4_set_lease_period(struct nfs_client *clp,
        /* Cap maximum reconnect timeout at 1/2 lease period */
        rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1);
 }
-
-/*
- * Local variables:
- *   c-basic-offset: 8
- * End:
- */
index 3a51351..f22818a 100644 (file)
@@ -645,7 +645,7 @@ void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head)
 }
 
 /**
- * nfs4_purge_state_owners - Release all cached state owners
+ * nfs4_free_state_owners - Release all cached state owners
  * @head: resulting list of state owners
  *
  * Frees a list of state owners that was generated by
@@ -2695,9 +2695,3 @@ static int nfs4_run_state_manager(void *ptr)
        module_put_and_exit(0);
        return 0;
 }
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 48d761e..2ef75ca 100644 (file)
@@ -666,7 +666,42 @@ TRACE_EVENT(nfs4_state_mgr_failed,
                )
 )
 
-TRACE_EVENT(nfs4_xdr_status,
+TRACE_EVENT(nfs4_xdr_bad_operation,
+               TP_PROTO(
+                       const struct xdr_stream *xdr,
+                       u32 op,
+                       u32 expected
+               ),
+
+               TP_ARGS(xdr, op, expected),
+
+               TP_STRUCT__entry(
+                       __field(unsigned int, task_id)
+                       __field(unsigned int, client_id)
+                       __field(u32, xid)
+                       __field(u32, op)
+                       __field(u32, expected)
+               ),
+
+               TP_fast_assign(
+                       const struct rpc_rqst *rqstp = xdr->rqst;
+                       const struct rpc_task *task = rqstp->rq_task;
+
+                       __entry->task_id = task->tk_pid;
+                       __entry->client_id = task->tk_client->cl_clid;
+                       __entry->xid = be32_to_cpu(rqstp->rq_xid);
+                       __entry->op = op;
+                       __entry->expected = expected;
+               ),
+
+               TP_printk(
+                       "task:%u@%d xid=0x%08x operation=%u, expected=%u",
+                       __entry->task_id, __entry->client_id, __entry->xid,
+                       __entry->op, __entry->expected
+               )
+);
+
+DECLARE_EVENT_CLASS(nfs4_xdr_event,
                TP_PROTO(
                        const struct xdr_stream *xdr,
                        u32 op,
@@ -701,6 +736,16 @@ TRACE_EVENT(nfs4_xdr_status,
                        __entry->op
                )
 );
+#define DEFINE_NFS4_XDR_EVENT(name) \
+       DEFINE_EVENT(nfs4_xdr_event, name, \
+                       TP_PROTO( \
+                               const struct xdr_stream *xdr, \
+                               u32 op, \
+                               u32 error \
+                       ), \
+                       TP_ARGS(xdr, op, error))
+DEFINE_NFS4_XDR_EVENT(nfs4_xdr_status);
+DEFINE_NFS4_XDR_EVENT(nfs4_xdr_bad_filehandle);
 
 DECLARE_EVENT_CLASS(nfs4_cb_error_class,
                TP_PROTO(
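
Turning nfs4_xdr_status from a standalone TRACE_EVENT into a DECLARE_EVENT_CLASS plus DEFINE_EVENT pair means nfs4_xdr_bad_filehandle reuses the class's assign and print logic; each additional event costs one line. A rough plain-C analogue of that macro-level deduplication:

#include <stdio.h>

/* one shared "class" body ... */
static void xdr_event_report(const char *name, unsigned int op, int error)
{
        printf("%s: op=%u error=%d\n", name, op, error);
}

/* ... and a one-line definition per event name */
#define DEFINE_XDR_EVENT(name)                                  \
        static void trace_##name(unsigned int op, int error)    \
        {                                                       \
                xdr_event_report(#name, op, error);             \
        }

DEFINE_XDR_EVENT(xdr_status)
DEFINE_XDR_EVENT(xdr_bad_filehandle)

int main(void)
{
        trace_xdr_status(22, -5);
        trace_xdr_bad_filehandle(10, -121);
        return 0;
}
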
index ac6b79e..a8cff19 100644 (file)
@@ -144,7 +144,17 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
  * layout types will be returned.
  */
 #define decode_fsinfo_maxsz    (op_decode_hdr_maxsz + \
-                                nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
+                                nfs4_fattr_bitmap_maxsz + 1 + \
+                                1 /* lease time */ + \
+                                2 /* max filesize */ + \
+                                2 /* max read */ + \
+                                2 /* max write */ + \
+                                nfstime4_maxsz /* time delta */ + \
+                                5 /* fs layout types */ + \
+                                1 /* layout blksize */ + \
+                                1 /* clone blksize */ + \
+                                1 /* change attr type */ + \
+                                1 /* xattr support */)
 #define encode_renew_maxsz     (op_encode_hdr_maxsz + 3)
 #define decode_renew_maxsz     (op_decode_hdr_maxsz)
 #define encode_setclientid_maxsz \
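
For the tally: assuming nfstime4_maxsz is 3 XDR words (a 64-bit seconds field plus a 32-bit nseconds field), the itemized reserve comes to

        1 + 1 + 2 + 2 + 2 + 3 + 5 + 1 + 1 + 1 + 1 = 20 words

past the op header and bitmap, up from the old opaque 4 + 8 + 5 = 17, making room for the newly decoded change-attr-type and xattr-support words while documenting what every word pays for.
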
@@ -3200,9 +3210,7 @@ out_status:
        *nfs_retval = nfs4_stat_to_errno(nfserr);
        return true;
 out_bad_operation:
-       dprintk("nfs: Server returned operation"
-               " %d but we issued a request for %d\n",
-                       opnum, expected);
+       trace_nfs4_xdr_bad_operation(xdr, opnum, expected);
        *nfs_retval = -EREMOTEIO;
        return false;
 out_overflow:
@@ -3487,8 +3495,11 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
                if (unlikely(!p))
                        return -EIO;
                len = be32_to_cpup(p);
-               if (len > NFS4_FHSIZE)
-                       return -EIO;
+               if (len > NFS4_FHSIZE || len == 0) {
+                       trace_nfs4_xdr_bad_filehandle(xdr, OP_READDIR,
+                                                     NFS4ERR_BADHANDLE);
+                       return -EREMOTEIO;
+               }
                p = xdr_inline_decode(xdr, len);
                if (unlikely(!p))
                        return -EIO;
@@ -4837,6 +4848,32 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
        return 0;
 }
 
+static int decode_attr_change_attr_type(struct xdr_stream *xdr,
+                                       uint32_t *bitmap,
+                                       enum nfs4_change_attr_type *res)
+{
+       u32 tmp = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
+       dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+       if (bitmap[2] & FATTR4_WORD2_CHANGE_ATTR_TYPE) {
+               if (xdr_stream_decode_u32(xdr, &tmp))
+                       return -EIO;
+               bitmap[2] &= ~FATTR4_WORD2_CHANGE_ATTR_TYPE;
+       }
+
+       switch (tmp) {
+       case NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR:
+       case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER:
+       case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS:
+       case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+               *res = tmp;
+               break;
+       default:
+               *res = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+       }
+       return 0;
+}
+
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
        unsigned int savep;
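
decode_attr_change_attr_type() above accepts only the enum values it knows and degrades anything else to NFS4_CHANGE_TYPE_IS_UNDEFINED, so a future or misbehaving server cannot push the client into an unsupported changeattr model. The defensive decode reduced to a sketch (enum values illustrative, not the RFC's):

#include <stdio.h>
#include <stdint.h>

enum change_type {
        CHANGE_UNDEFINED = 0,           /* safe fallback */
        CHANGE_MONOTONIC,
        CHANGE_VERSION_COUNTER,
        CHANGE_TIME_METADATA,
};

static enum change_type validate_change_type(uint32_t wire)
{
        switch (wire) {
        case CHANGE_MONOTONIC:
        case CHANGE_VERSION_COUNTER:
        case CHANGE_TIME_METADATA:
                return (enum change_type)wire;
        default:
                return CHANGE_UNDEFINED;        /* unknown value from the wire */
        }
}

int main(void)
{
        printf("%d\n", validate_change_type(2));        /* 2: accepted */
        printf("%d\n", validate_change_type(99));       /* 0: fell back */
        return 0;
}
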
@@ -4885,6 +4922,11 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
        if (status)
                goto xdr_error;
 
+       status = decode_attr_change_attr_type(xdr, bitmap,
+                                             &fsinfo->change_attr_type);
+       if (status)
+               goto xdr_error;
+
        status = decode_attr_xattrsupport(xdr, bitmap,
                                          &fsinfo->xattr_support);
        if (status)
@@ -4913,8 +4955,10 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
        if (unlikely(!p))
                return -EIO;
        len = be32_to_cpup(p);
-       if (len > NFS4_FHSIZE)
-               return -EIO;
+       if (len > NFS4_FHSIZE || len == 0) {
+               trace_nfs4_xdr_bad_filehandle(xdr, OP_GETFH, NFS4ERR_BADHANDLE);
+               return -EREMOTEIO;
+       }
        fh->size = len;
        p = xdr_inline_decode(xdr, len);
        if (unlikely(!p))
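
Both filehandle decoders now treat a zero-length handle as just as bogus as an oversized one, and they return -EREMOTEIO (a malformed reply, with a trace event) rather than the generic -EIO used for a short buffer. The length check on its own:

#include <stdio.h>

#define FHSIZE_MAX 128          /* NFS4_FHSIZE's value in current kernels */

/* 0 if len could describe a real filehandle, -1 otherwise */
static int check_fh_len(unsigned int len)
{
        if (len == 0 || len > FHSIZE_MAX)
                return -1;      /* kernel: trace event + -EREMOTEIO */
        return 0;
}

int main(void)
{
        printf("%d %d %d\n", check_fh_len(0), check_fh_len(32),
               check_fh_len(4096));     /* -1 0 -1 */
        return 0;
}
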
@@ -7629,9 +7673,3 @@ const struct rpc_version nfs_version4 = {
        .procs                  = nfs4_procedures,
        .counts                 = nfs_version4_counts,
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index a90b363..5d1bfcc 100644 (file)
@@ -12,3 +12,4 @@
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_bad_filehandle);
index 5a59dcd..eb1ef34 100644 (file)
@@ -45,6 +45,11 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_CTIME);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_MTIME);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_SIZE);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
+TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE);
 
 #define nfs_show_cache_validity(v) \
        __print_flags(v, "|", \
@@ -60,7 +65,11 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
                        { NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \
                        { NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \
                        { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \
-                       { NFS_INO_INVALID_XATTR, "INVALID_XATTR" })
+                       { NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \
+                       { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \
+                       { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \
+                       { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \
+                       { NFS_INO_INVALID_MODE, "INVALID_MODE" })
 
 TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
 TRACE_DEFINE_ENUM(NFS_INO_STALE);
@@ -1392,7 +1401,7 @@ TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
                        { NFSERR_BADTYPE, "BADTYPE" }, \
                        { NFSERR_JUKEBOX, "JUKEBOX" })
 
-TRACE_EVENT(nfs_xdr_status,
+DECLARE_EVENT_CLASS(nfs_xdr_event,
                TP_PROTO(
                        const struct xdr_stream *xdr,
                        int error
@@ -1434,6 +1443,15 @@ TRACE_EVENT(nfs_xdr_status,
                        nfs_show_status(__entry->error)
                )
 );
+#define DEFINE_NFS_XDR_EVENT(name) \
+       DEFINE_EVENT(nfs_xdr_event, name, \
+                       TP_PROTO( \
+                               const struct xdr_stream *xdr, \
+                               int error \
+                       ), \
+                       TP_ARGS(xdr, error))
+DEFINE_NFS_XDR_EVENT(nfs_xdr_status);
+DEFINE_NFS_XDR_EVENT(nfs_xdr_bad_filehandle);
 
 #endif /* _TRACE_NFS_H */
 
index 78c9c4b..6c20b28 100644 (file)
@@ -577,7 +577,7 @@ static void nfs_clear_request(struct nfs_page *req)
 }
 
 /**
- * nfs_release_request - Release the count on an NFS read/write request
+ * nfs_free_request - Release the count on an NFS read/write request
  * @req: request to release
  *
  * Note: Should never be called with the spinlock held!
@@ -1152,7 +1152,7 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
 }
 
 /**
- * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
+ * __nfs_pageio_add_request - Attempt to coalesce a request into a page list.
  * @desc: destination io descriptor
  * @req: request
  *
index 102b66e..03e0b34 100644 (file)
@@ -1344,7 +1344,7 @@ _pnfs_return_layout(struct inode *ino)
        }
        valid_layout = pnfs_layout_is_valid(lo);
        pnfs_clear_layoutcommit(ino, &tmp_list);
-       pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
+       pnfs_mark_matching_lsegs_return(lo, &tmp_list, NULL, 0);
 
        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                struct pnfs_layout_range range = {
@@ -2410,9 +2410,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
                        .iomode = IOMODE_ANY,
                        .length = NFS4_MAX_UINT64,
                };
-               pnfs_set_plh_return_info(lo, IOMODE_ANY, 0);
-               pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
-                                               &range, 0);
+               pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0);
                goto out_forget;
        } else {
                /* We have a completely new layout */
@@ -2468,6 +2466,9 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
 
        assert_spin_locked(&lo->plh_inode->i_lock);
 
+       if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+               tmp_list = &lo->plh_return_segs;
+
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
                        dprintk("%s: marking lseg %p iomode %d "
@@ -2475,6 +2476,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                lseg, lseg->pls_range.iomode,
                                lseg->pls_range.offset,
                                lseg->pls_range.length);
+                       if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+                               tmp_list = &lo->plh_return_segs;
                        if (mark_lseg_invalid(lseg, tmp_list))
                                continue;
                        remaining++;
index 73ab7c5..ea19dbf 100644 (file)
@@ -91,6 +91,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
        info->dtpref = fsinfo.tsize;
        info->maxfilesize = 0x7FFFFFFF;
        info->lease_time = 0;
+       info->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
        return 0;
 }
 
index 4aaa1f5..19a212f 100644 (file)
@@ -116,16 +116,12 @@ static void unregister_nfs4_fs(void)
 #ifdef CONFIG_NFS_V4_2
 static void nfs_ssc_register_ops(void)
 {
-#ifdef CONFIG_NFSD_V4
        nfs_ssc_register(&nfs_ssc_clnt_ops_tbl);
-#endif
 }
 
 static void nfs_ssc_unregister_ops(void)
 {
-#ifdef CONFIG_NFSD_V4
        nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl);
-#endif
 }
 #endif /* CONFIG_NFS_V4_2 */
 
@@ -867,7 +863,7 @@ static int nfs_request_mount(struct fs_context *fc,
         * Now ask the mount server to map our export path
         * to a file handle.
         */
-       status = nfs_mount(&request);
+       status = nfs_mount(&request, ctx->timeo, ctx->retrans);
        if (status != 0) {
                dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
                                request.hostname, status);
index f05a903..3bf8217 100644 (file)
@@ -764,9 +764,6 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
         * with invalidate/truncate.
         */
        spin_lock(&mapping->private_lock);
-       if (!nfs_have_writebacks(inode) &&
-           NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
-               inode_inc_iversion_raw(inode);
        if (likely(!PageSwapCache(req->wb_page))) {
                set_bit(PG_MAPPED, &req->wb_flags);
                SetPagePrivate(req->wb_page);
@@ -1293,7 +1290,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode,
        if (nfs_have_delegated_attributes(inode))
                goto out;
        if (nfsi->cache_validity &
-           (NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE))
+           (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE))
                return false;
        smp_rmb();
        if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0)
@@ -1604,7 +1601,7 @@ static int nfs_writeback_done(struct rpc_task *task,
        /* Deal with the suid/sgid bit corner case */
        if (nfs_should_remove_suid(inode)) {
                spin_lock(&inode->i_lock);
-               nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+               nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
                spin_unlock(&inode->i_lock);
        }
        return 0;
index 5fa38ad..f229172 100644 (file)
@@ -138,7 +138,7 @@ config NFSD_FLEXFILELAYOUT
 
 config NFSD_V4_2_INTER_SSC
        bool "NFSv4.2 inter server to server COPY"
-       depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2
+       depends on NFSD_V4 && NFS_V4_2
        help
          This option enables support for NFSv4.2 inter server to
          server copy where the destination server calls the NFSv4.2
index daf43b9..f4ce93d 100644 (file)
@@ -3317,9 +3317,3 @@ const struct svc_version nfsd_version4 = {
        .vs_rpcb_optnl          = true,
        .vs_need_cong_ctrl      = true,
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 7698172..b517a87 100644 (file)
@@ -354,6 +354,124 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
        .release        = nfsd4_cb_notify_lock_release,
 };
 
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used.  This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempts to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ *     OPEN allow read, deny write
+ *     OPEN allow both, deny none
+ *     DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
+static unsigned int
+bmap_to_share_mode(unsigned long bmap)
+{
+       int i;
+       unsigned int access = 0;
+
+       for (i = 1; i < 4; i++) {
+               if (test_bit(i, &bmap))
+                       access |= i;
+       }
+       return access;
+}
+
+/* set share access for a given stateid */
+static inline void
+set_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << access;
+
+       WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+       stp->st_access_bmap |= mask;
+}
+
+/* clear share access for a given stateid */
+static inline void
+clear_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << access;
+
+       WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+       stp->st_access_bmap &= ~mask;
+}
+
+/* test whether a given stateid has access */
+static inline bool
+test_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << access;
+
+       return (bool)(stp->st_access_bmap & mask);
+}
+
+/* set share deny for a given stateid */
+static inline void
+set_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << deny;
+
+       WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+       stp->st_deny_bmap |= mask;
+}
+
+/* clear share deny for a given stateid */
+static inline void
+clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << deny;
+
+       WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+       stp->st_deny_bmap &= ~mask;
+}
+
+/* test whether a given stateid is denying specific access */
+static inline bool
+test_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << deny;
+
+       return (bool)(stp->st_deny_bmap & mask);
+}
+
+static int nfs4_access_to_omode(u32 access)
+{
+       switch (access & NFS4_SHARE_ACCESS_BOTH) {
+       case NFS4_SHARE_ACCESS_READ:
+               return O_RDONLY;
+       case NFS4_SHARE_ACCESS_WRITE:
+               return O_WRONLY;
+       case NFS4_SHARE_ACCESS_BOTH:
+               return O_RDWR;
+       }
+       WARN_ON_ONCE(1);
+       return O_RDONLY;
+}
+
+static inline int
+access_permit_read(struct nfs4_ol_stateid *stp)
+{
+       return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
+               test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
+               test_access(NFS4_SHARE_ACCESS_WRITE, stp);
+}
+
+static inline int
+access_permit_write(struct nfs4_ol_stateid *stp)
+{
+       return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
+               test_access(NFS4_SHARE_ACCESS_BOTH, stp);
+}
+
 static inline struct nfs4_stateowner *
 nfs4_get_stateowner(struct nfs4_stateowner *sop)
 {
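
The share-mode helpers hoisted above rely on NFS4_SHARE_ACCESS_READ, _WRITE and _BOTH being 1, 2 and 3: a stateid keeps one bit per mode ever used, and bmap_to_share_mode() recovers the union by OR-ing the indices of the set bits. Demonstrated standalone:

#include <stdio.h>

#define SHARE_READ  1
#define SHARE_WRITE 2
#define SHARE_BOTH  3   /* READ | WRITE */

static unsigned int bmap_to_share_mode(unsigned long bmap)
{
        unsigned int i, access = 0;

        for (i = 1; i < 4; i++)
                if (bmap & (1UL << i))
                        access |= i;
        return access;
}

int main(void)
{
        unsigned long bmap = 0;

        bmap |= 1UL << SHARE_READ;      /* an OPEN for read */
        bmap |= 1UL << SHARE_BOTH;      /* a later OPEN for read+write */
        printf("%u\n", bmap_to_share_mode(bmap));       /* 3 == BOTH */
        return 0;
}
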
@@ -543,14 +661,12 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
 #define FILE_HASH_BITS                   8
 #define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
 
-static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh)
+static unsigned int file_hashval(struct svc_fh *fh)
 {
-       return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0);
-}
+       struct inode *inode = d_inode(fh->fh_dentry);
 
-static unsigned int file_hashval(struct knfsd_fh *fh)
-{
-       return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
+       /* XXX: why not (here & in file cache) use inode? */
+       return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
 }
 
 static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
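
Hashing on the inode instead of the raw filehandle bytes is what makes the aliased-filehandle detection further down possible: two handles for the same inode now land in the same chain. hash_long() boils the inode number down to FILE_HASH_BITS bits; a userspace stand-in using the same multiplicative idea (the golden-ratio constant is assumed here, not quoted from the kernel header):

#include <stdio.h>
#include <stdint.h>

#define HASH_BITS 8

static unsigned int hash_ino(uint64_t ino)
{
        /* multiplicative hash: mix, then keep the top HASH_BITS bits */
        return (unsigned int)((ino * 0x61C8864680B583EBULL) >> (64 - HASH_BITS));
}

int main(void)
{
        /* nearby inode numbers still spread across the table */
        printf("%u %u %u\n", hash_ino(2), hash_ino(3), hash_ino(4));
        return 0;
}
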
@@ -1153,108 +1269,6 @@ static unsigned int clientstr_hashval(struct xdr_netobj name)
 }
 
 /*
- * We store the NONE, READ, WRITE, and BOTH bits separately in the
- * st_{access,deny}_bmap field of the stateid, in order to track not
- * only what share bits are currently in force, but also what
- * combinations of share bits previous opens have used.  This allows us
- * to enforce the recommendation of rfc 3530 14.2.19 that the server
- * return an error if the client attempt to downgrade to a combination
- * of share bits not explicable by closing some of its previous opens.
- *
- * XXX: This enforcement is actually incomplete, since we don't keep
- * track of access/deny bit combinations; so, e.g., we allow:
- *
- *     OPEN allow read, deny write
- *     OPEN allow both, deny none
- *     DOWNGRADE allow read, deny none
- *
- * which we should reject.
- */
-static unsigned int
-bmap_to_share_mode(unsigned long bmap) {
-       int i;
-       unsigned int access = 0;
-
-       for (i = 1; i < 4; i++) {
-               if (test_bit(i, &bmap))
-                       access |= i;
-       }
-       return access;
-}
-
-/* set share access for a given stateid */
-static inline void
-set_access(u32 access, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << access;
-
-       WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
-       stp->st_access_bmap |= mask;
-}
-
-/* clear share access for a given stateid */
-static inline void
-clear_access(u32 access, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << access;
-
-       WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
-       stp->st_access_bmap &= ~mask;
-}
-
-/* test whether a given stateid has access */
-static inline bool
-test_access(u32 access, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << access;
-
-       return (bool)(stp->st_access_bmap & mask);
-}
-
-/* set share deny for a given stateid */
-static inline void
-set_deny(u32 deny, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << deny;
-
-       WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
-       stp->st_deny_bmap |= mask;
-}
-
-/* clear share deny for a given stateid */
-static inline void
-clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << deny;
-
-       WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
-       stp->st_deny_bmap &= ~mask;
-}
-
-/* test whether a given stateid is denying specific access */
-static inline bool
-test_deny(u32 deny, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << deny;
-
-       return (bool)(stp->st_deny_bmap & mask);
-}
-
-static int nfs4_access_to_omode(u32 access)
-{
-       switch (access & NFS4_SHARE_ACCESS_BOTH) {
-       case NFS4_SHARE_ACCESS_READ:
-               return O_RDONLY;
-       case NFS4_SHARE_ACCESS_WRITE:
-               return O_WRONLY;
-       case NFS4_SHARE_ACCESS_BOTH:
-               return O_RDWR;
-       }
-       WARN_ON_ONCE(1);
-       return O_RDONLY;
-}
-
-/*
  * A stateid that had a deny mode associated with it is being released
  * or downgraded. Recalculate the deny mode on the file.
  */
@@ -3125,6 +3139,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out_nolock;
                }
                new->cl_mach_cred = true;
+               break;
        case SP4_NONE:
                break;
        default:                                /* checked by xdr code */
@@ -4072,7 +4087,7 @@ static struct nfs4_file *nfsd4_alloc_file(void)
 }
 
 /* OPEN Share state helper functions */
-static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
+static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
                                struct nfs4_file *fp)
 {
        lockdep_assert_held(&state_lock);
@@ -4082,12 +4097,14 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
        INIT_LIST_HEAD(&fp->fi_stateids);
        INIT_LIST_HEAD(&fp->fi_delegations);
        INIT_LIST_HEAD(&fp->fi_clnt_odstate);
-       fh_copy_shallow(&fp->fi_fhandle, fh);
+       fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle);
        fp->fi_deleg_file = NULL;
        fp->fi_had_conflict = false;
        fp->fi_share_deny = 0;
        memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
        memset(fp->fi_access, 0, sizeof(fp->fi_access));
+       fp->fi_aliased = false;
+       fp->fi_inode = d_inode(fh->fh_dentry);
 #ifdef CONFIG_NFSD_PNFS
        INIT_LIST_HEAD(&fp->fi_lo_states);
        atomic_set(&fp->fi_lo_recalls, 0);
@@ -4426,13 +4443,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
 
 /* search file_hashtbl[] for file */
 static struct nfs4_file *
-find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
+find_file_locked(struct svc_fh *fh, unsigned int hashval)
 {
        struct nfs4_file *fp;
 
        hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
                                lockdep_is_held(&state_lock)) {
-               if (fh_match(&fp->fi_fhandle, fh)) {
+               if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
                        if (refcount_inc_not_zero(&fp->fi_ref))
                                return fp;
                }
@@ -4440,8 +4457,32 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
        return NULL;
 }
 
-struct nfs4_file *
-find_file(struct knfsd_fh *fh)
+static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
+                                    unsigned int hashval)
+{
+       struct nfs4_file *fp;
+       struct nfs4_file *ret = NULL;
+       bool alias_found = false;
+
+       spin_lock(&state_lock);
+       hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
+                                lockdep_is_held(&state_lock)) {
+               if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
+                       if (refcount_inc_not_zero(&fp->fi_ref))
+                               ret = fp;
+               } else if (d_inode(fh->fh_dentry) == fp->fi_inode)
+                       fp->fi_aliased = alias_found = true;
+       }
+       if (likely(ret == NULL)) {
+               nfsd4_init_file(fh, hashval, new);
+               new->fi_aliased = alias_found;
+               ret = new;
+       }
+       spin_unlock(&state_lock);
+       return ret;
+}
+
+static struct nfs4_file *find_file(struct svc_fh *fh)
 {
        struct nfs4_file *fp;
        unsigned int hashval = file_hashval(fh);
@@ -4453,7 +4494,7 @@ find_file(struct knfsd_fh *fh)
 }
 
 static struct nfs4_file *
-find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
+find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
 {
        struct nfs4_file *fp;
        unsigned int hashval = file_hashval(fh);
@@ -4464,15 +4505,7 @@ find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
        if (fp)
                return fp;
 
-       spin_lock(&state_lock);
-       fp = find_file_locked(fh, hashval);
-       if (likely(fp == NULL)) {
-               nfsd4_init_file(fh, hashval, new);
-               fp = new;
-       }
-       spin_unlock(&state_lock);
-
-       return fp;
+       return insert_file(new, fh, hashval);
 }
 
 /*
@@ -4485,7 +4518,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
        struct nfs4_file *fp;
        __be32 ret = nfs_ok;
 
-       fp = find_file(&current_fh->fh_handle);
+       fp = find_file(current_fh);
        if (!fp)
                return ret;
        /* Check for conflicting share reservations */
@@ -4880,6 +4913,11 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
        if (nf)
                nfsd_file_put(nf);
 
+       status = nfserrno(nfsd_open_break_lease(cur_fh->fh_dentry->d_inode,
+                                                               access));
+       if (status)
+               goto out_put_access;
+
        status = nfsd4_truncate(rqstp, cur_fh, open);
        if (status)
                goto out_put_access;
@@ -4951,6 +4989,65 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
        return fl;
 }
 
+static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
+                                        struct nfs4_file *fp)
+{
+       struct nfs4_ol_stateid *st;
+       struct file *f = fp->fi_deleg_file->nf_file;
+       struct inode *ino = locks_inode(f);
+       int writes;
+
+       writes = atomic_read(&ino->i_writecount);
+       if (!writes)
+               return 0;
+       /*
+        * There could be multiple filehandles (hence multiple
+        * nfs4_files) referencing this file, but that's not too
+        * common; let's just give up in that case rather than
+        * trying to go look up all the clients using that other
+        * nfs4_file as well:
+        */
+       if (fp->fi_aliased)
+               return -EAGAIN;
+       /*
+        * If there's a close in progress, make sure that we see it
+        * clear any fi_fds[] entries before we see it decrement
+        * i_writecount:
+        */
+       smp_mb__after_atomic();
+
+       if (fp->fi_fds[O_WRONLY])
+               writes--;
+       if (fp->fi_fds[O_RDWR])
+               writes--;
+       if (writes > 0)
+               return -EAGAIN; /* There may be non-NFSv4 writers */
+       /*
+        * It's possible there are non-NFSv4 write opens in progress,
+        * but if they haven't incremented i_writecount yet then they
+        * also haven't called break lease yet; so, they'll break this
+        * lease soon enough.  So, all that's left to check for is NFSv4
+        * opens:
+        */
+       spin_lock(&fp->fi_lock);
+       list_for_each_entry(st, &fp->fi_stateids, st_perfile) {
+               if (st->st_openstp == NULL /* it's an open */ &&
+                   access_permit_write(st) &&
+                   st->st_stid.sc_client != clp) {
+                       spin_unlock(&fp->fi_lock);
+                       return -EAGAIN;
+               }
+       }
+       spin_unlock(&fp->fi_lock);
+       /*
+        * There's a small chance that we could be racing with another
+        * NFSv4 open.  However, any open that hasn't added itself to
+        * the fi_stateids list also hasn't called break_lease yet; so,
+        * they'll break this lease soon enough.
+        */
+       return 0;
+}
+
 static struct nfs4_delegation *
 nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
                    struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
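
nfsd4_check_conflicting_opens() decides whether a read delegation is safe by starting from the inode's total writer count and subtracting the write-mode opens nfsd itself holds; any remainder is a potential (possibly local) conflicting writer, and aliased filehandles make the attribution impossible. The accounting, minus the final per-client stateid scan, as a sketch:

#include <stdio.h>

/*
 * writers: the inode's i_writecount analogue
 * own_wr:  write-mode opens this server holds (O_WRONLY/O_RDWR fds)
 * aliased: other filehandles may reference the same inode
 * Returns 0 when a read delegation looks safe, -1 (kernel: -EAGAIN) otherwise.
 */
static int check_conflicting_opens(int writers, int own_wr, int aliased)
{
        if (writers == 0)
                return 0;       /* nobody writes: always safe */
        if (aliased)
                return -1;      /* cannot attribute the writers */
        if (writers - own_wr > 0)
                return -1;      /* unexplained, possibly non-NFS, writers */
        return 0;               /* every writer is one of our own fds */
}

int main(void)
{
        printf("%d\n", check_conflicting_opens(0, 0, 0));       /* 0 */
        printf("%d\n", check_conflicting_opens(2, 2, 0));       /* 0 */
        printf("%d\n", check_conflicting_opens(3, 2, 0));       /* -1 */
        return 0;
}
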
@@ -4970,9 +5067,12 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 
        nf = find_readable_file(fp);
        if (!nf) {
-               /* We should always have a readable file here */
-               WARN_ON_ONCE(1);
-               return ERR_PTR(-EBADF);
+               /*
+                * We probably could attempt another open and get a read
+                * delegation, but for now, don't bother until the
+                * client actually sends us one.
+                */
+               return ERR_PTR(-EAGAIN);
        }
        spin_lock(&state_lock);
        spin_lock(&fp->fi_lock);
@@ -5007,6 +5107,9 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
                locks_free_lock(fl);
        if (status)
                goto out_clnt_odstate;
+       status = nfsd4_check_conflicting_opens(clp, fp);
+       if (status)
+               goto out_unlock;
 
        spin_lock(&state_lock);
        spin_lock(&fp->fi_lock);
@@ -5088,17 +5191,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
                                goto out_no_deleg;
                        if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
                                goto out_no_deleg;
-                       /*
-                        * Also, if the file was opened for write or
-                        * create, there's a good chance the client's
-                        * about to write to it, resulting in an
-                        * immediate recall (since we don't support
-                        * write delegations):
-                        */
-                       if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-                               goto out_no_deleg;
-                       if (open->op_create == NFS4_OPEN_CREATE)
-                               goto out_no_deleg;
                        break;
                default:
                        goto out_no_deleg;
@@ -5161,7 +5253,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
         * and check for delegations in the process of being recalled.
         * If not found, create the nfs4_file struct
         */
-       fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
+       fp = find_or_add_file(open->op_file, current_fh);
        if (fp != open->op_file) {
                status = nfs4_check_deleg(cl, open, &dp);
                if (status)
@@ -5502,21 +5594,6 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
        return nfs_ok;
 }
 
-static inline int
-access_permit_read(struct nfs4_ol_stateid *stp)
-{
-       return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
-               test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
-               test_access(NFS4_SHARE_ACCESS_WRITE, stp);
-}
-
-static inline int
-access_permit_write(struct nfs4_ol_stateid *stp)
-{
-       return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
-               test_access(NFS4_SHARE_ACCESS_BOTH, stp);
-}
-
 static
 __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
 {
@@ -6288,15 +6365,6 @@ out:
        return status;
 }
 
-static inline u64
-end_offset(u64 start, u64 len)
-{
-       u64 end;
-
-       end = start + len;
-       return end >= start ? end: NFS4_MAX_UINT64;
-}
-
 /* last octet in a range */
 static inline u64
 last_byte_offset(u64 start, u64 len)
@@ -6865,11 +6933,20 @@ out:
 static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
 {
        struct nfsd_file *nf;
-       __be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
-       if (!err) {
-               err = nfserrno(vfs_test_lock(nf->nf_file, lock));
-               nfsd_file_put(nf);
-       }
+       __be32 err;
+
+       err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+       if (err)
+               return err;
+       fh_lock(fhp); /* to block new leases till after test_lock: */
+       err = nfserrno(nfsd_open_break_lease(fhp->fh_dentry->d_inode,
+                                                       NFSD_MAY_READ));
+       if (err)
+               goto out;
+       err = nfserrno(vfs_test_lock(nf->nf_file, lock));
+out:
+       fh_unlock(fhp);
+       nfsd_file_put(nf);
        return err;
 }
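
The rewritten nfsd_test_lock() holds the filehandle lock across both the lease break and vfs_test_lock(), so no new lease can sneak in between the two, and every failure path funnels through one unlock/put exit. The check-then-act-under-lock shape, with a pthread mutex standing in for fh_lock/fh_unlock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t fh_mutex = PTHREAD_MUTEX_INITIALIZER;

static int break_lease(void) { return 0; }     /* stand-in: always succeeds */
static int test_lock(void)   { return 0; }     /* stand-in: no conflict found */

static int test_lock_sketch(void)
{
        int err;

        pthread_mutex_lock(&fh_mutex);  /* block new leases until we're done */
        err = break_lease();
        if (err)
                goto out;
        err = test_lock();
out:
        pthread_mutex_unlock(&fh_mutex);
        return err;
}

int main(void)
{
        printf("%d\n", test_lock_sketch());     /* 0 */
        return 0;
}
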
 
index e0f06d3..7abeccb 100644 (file)
@@ -5448,9 +5448,3 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
        nfsd4_sequence_done(resp);
        return 1;
 }
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 853bf50..c2c3d90 100644 (file)
@@ -1166,6 +1166,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
                inode->i_fop = &simple_dir_operations;
                inode->i_op = &simple_dir_inode_operations;
                inc_nlink(inode);
+               break;
        default:
                break;
        }
index 82ba034..dd5d699 100644 (file)
@@ -308,7 +308,7 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred)
 
 static int nfsd_users = 0;
 
-static int nfsd_startup_generic(int nrservs)
+static int nfsd_startup_generic(void)
 {
        int ret;
 
@@ -374,7 +374,7 @@ void nfsd_reset_boot_verifier(struct nfsd_net *nn)
        write_sequnlock(&nn->boot_lock);
 }
 
-static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
+static int nfsd_startup_net(struct net *net, const struct cred *cred)
 {
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        int ret;
@@ -382,7 +382,7 @@ static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cre
        if (nn->nfsd_net_up)
                return 0;
 
-       ret = nfsd_startup_generic(nrservs);
+       ret = nfsd_startup_generic();
        if (ret)
                return ret;
        ret = nfsd_init_socks(net, cred);
@@ -790,7 +790,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
 
        nfsd_up_before = nn->nfsd_net_up;
 
-       error = nfsd_startup_net(nrservs, net, cred);
+       error = nfsd_startup_net(net, cred);
        if (error)
                goto out_destroy;
        error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
index 54cab65..e73bdbb 100644 (file)
@@ -516,6 +516,8 @@ struct nfs4_clnt_odstate {
  */
 struct nfs4_file {
        refcount_t              fi_ref;
+       struct inode *          fi_inode;
+       bool                    fi_aliased;
        spinlock_t              fi_lock;
        struct hlist_node       fi_hash;        /* hash on fi_fhandle */
        struct list_head        fi_stateids;
@@ -669,7 +671,6 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name
                                struct xdr_netobj princhash, struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
 
-struct nfs4_file *find_file(struct knfsd_fh *fh);
 void put_nfs4_file(struct nfs4_file *fi);
 extern void nfs4_put_copy(struct nfsd4_copy *copy);
 extern struct nfsd4_copy *
index fe540a3..a7c4252 100644 (file)
@@ -866,9 +866,3 @@ struct nfsd4_operation {
 
 
 #endif
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 025fb08..ce14477 100644 (file)
@@ -293,7 +293,7 @@ void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
  * nilfs_cpfile_delete_checkpoints - delete checkpoints
  * @cpfile: inode of checkpoint file
  * @start: start checkpoint number
- * @end: end checkpoint numer
+ * @end: end checkpoint number
  *
  * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
  * the period from @start to @end, excluding @end itself. The checkpoints
index 3fcb935..640ac8f 100644 (file)
@@ -1043,7 +1043,7 @@ out:
  * @inode: inode object
  * @argp: pointer on argument from userspace
  *
- * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
+ * Description: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
  * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
  * performs the actual trim operation.
  *
@@ -1085,7 +1085,7 @@ static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
  * @inode: inode object
  * @argp: pointer on argument from userspace
  *
- * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit
+ * Description: nilfs_ioctl_set_alloc_range() function defines lower limit
  * of segments in bytes and upper limit of segments in bytes.
  * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility.
  *
index cd4da95..686c8ee 100644 (file)
@@ -2214,7 +2214,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
  * nilfs_construct_segment - construct a logical segment
  * @sb: super block
  *
- * Return Value: On success, 0 is retured. On errors, one of the following
+ * Return Value: On success, 0 is returned. On errors, one of the following
  * negative error code is returned.
  *
  * %-EROFS - Read only filesystem.
@@ -2251,7 +2251,7 @@ int nilfs_construct_segment(struct super_block *sb)
  * @start: start byte offset
  * @end: end byte offset (inclusive)
  *
- * Return Value: On success, 0 is retured. On errors, one of the following
+ * Return Value: On success, 0 is returned. On errors, one of the following
  * negative error code is returned.
  *
  * %-EROFS - Read only filesystem.
index 221a1cc..8b7b01a 100644 (file)
@@ -195,7 +195,7 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
 /**
  * load_nilfs - load and recover the nilfs
  * @nilfs: the_nilfs structure to be released
- * @sb: super block isntance used to recover past segment
+ * @sb: super block instance used to recover past segment
  *
  * load_nilfs() searches and loads the latest super root,
  * attaches the last segment, and does recovery if needed.
index 5259bad..5c72a7e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * acl.c
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
index 4e86450..f59d8d0 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * acl.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
index 7871078..e032f2e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * alloc.c
  *
  * Extent allocs and frees
index 7f973dd..4af7aba 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * alloc.h
  *
  * Function prototypes
index ad20403..1294925 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  */
 
index 70ed438..3a52011 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
  */
 
index dabfef9..863a531 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * blockcheck.c
  *
  * Checksum and ECC codes for the OCFS2 userspace library.
index 8f17d2c..d0578e9 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * blockcheck.h
  *
  * Checksum and ECC codes for the OCFS2 userspace library.
index f0b104e..e775877 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * io.c
  *
  * Buffer cache handling
index 1c5e533..2d51649 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_buffer_head.h
  *
  * Buffer cache handling functions defined
index 12a7590..e829c25 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
index beed31e..1d4100a 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.h
  *
  * Function prototypes
index 1d696c9..810d328 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
index 446e452..b73fc42 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
index 667a5c5..7524994 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * netdebug.c
  *
  * debug functionality for o2net
index 7a7640c..bb82e6b 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
index 3e00066..3490e77 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * nodemanager.h
  *
  * Function prototypes
index 760d850..6088c9f 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_heartbeat.h
  *
  * On-disk structures for ocfs2_heartbeat
index 21ad307..c9a0b77 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_nodemanager.h
  *
  * Header describing the interface between userspace and the kernel
index cea739b..189c111 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+/*
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
index 6d45ce8..d64bf44 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
index d6067c3..022f716 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sys.c
  *
  * OCFS2 cluster sysfs interface
index ce38051..70aaba6 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sys.h
  *
  * Function prototypes for o2cb sysfs interface
index 3bd8119..f660c0d 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+/*
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
index 736338f..a75b551 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * tcp.h
  *
  * Function prototypes
index e6a2b9d..601c99b 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
index 42a61ee..04fc834 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dcache.c
  *
  * dentry cache handling code
index 3686a52..7f246c5 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dcache.h
  *
  * Function prototypes
index bdfba9d..bd8d534 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.c
  *
  * Creates, reads, walks and deletes directory-nodes
index e3e7d5d..4b9f5a1 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.h
  *
  * Function prototypes
index 6456c0f..bae60ca 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmapi.h
  *
  * externally exported dlm interfaces
index 70a1076..c681ba9 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmast.c
  *
  * AST and BAST functionality for local and remote nodes
index 58d57e2..fd20227 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmcommon.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
index 6051edc..450d46e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmconvert.c
  *
  * underlying calls for lock conversion
index 12d9c28..1f37171 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmconvert.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
index 4b8b41d..d442cf5 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdebug.c
  *
  * debug functionality for the dlm
index f8fd868..e08f735 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdebug.h
  *
  * Copyright (C) 2008 Oracle.  All rights reserved.
index 357cfc7..9f90fc9 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdomain.c
  *
  * defines domain join / leave apis
index 7c21664..815abe3 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdomain.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
index 83f0760..041fd17 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmlock.c
  *
  * underlying calls for lock creation
index f105746..4960a6d 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmmod.c
  *
  * standalone DLM module
index afc5173..0e7aad1 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmrecovery.c
  *
  * recovery stuff
index 5ccc4ff..c350bd4 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmthread.c
  *
  * standalone DLM module
index dcb17ca..61103b2 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmunlock.c
  *
  * underlying calls for unlocking locks
index b2870f1..fa0a14f 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmfs.c
  *
  * Code which implements the kernel side of a minimal userspace
index 339f098..29f183a 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * userdlm.c
  *
  * Code which implements the kernel side of a minimal userspace
index 0558ae7..47ba18e 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * userdlm.h
  *
  * Userspace dlm defines
index 0fbe8bf..48fd369 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmglue.c
  *
  * Code which implements an OCFS2 specific interface to our DLM.
index b8fbed2..e5da580 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmglue.h
  *
  * description here
index 69ed278..eaa8c80 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * export.c
  *
  * Functions to facilitate NFS exporting
index d485da0..6363574 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * export.h
  *
  * Function prototypes
index 7b93e9c..70a768b 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * extent_map.c
  *
  * Block/Cluster mapping functions
index e5464f6..bc4ed59 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * extent_map.h
  *
  * In-memory file extent mappings for OCFS2.
index db8a626..f17c3d3 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.c
  *
  * File open, close, extend, truncate
index 8536cec..71db8f3 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.h
  *
  * Function prototypes
index 50f11bf..90b8d30 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * filecheck.c
  *
  * Code which implements online file check.
index 4d00677..d3bcb8b 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * filecheck.h
  *
  * Online file check.
index 60c5f99..9099d8f 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.c
  *
  * Register ourselves with the heartbeat service, keep our node maps
index 5fedb2d..f1f8b18 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.h
  *
  * Function prototypes
index 7c9dfd5..bc8f32f 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.c
  *
  * vfs' aops, fops, dops and iops
index 51a4f71..82b28fd 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.h
  *
  * Function prototypes
index db52e84..4e589ce 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * journal.c
  *
  * Defines functions of journalling api
index bfe611e..d158acb 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * journal.h
  *
  * Defines journalling api and structures.
index fc8252a..5f6bacb 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * localalloc.c
  *
  * Node local data allocation
index e8a5cea..08f925b 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * localalloc.h
  *
  * Function prototypes
index 7edc4e5..fab7c6a 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * locks.c
  *
  * Userspace file locking support
index 389fe1f..b52de39 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * locks.h
  *
  * Function prototypes for Userspace file locking support
index 25cabbf..1834f26 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * mmap.c
  *
  * Code to deal with the mess that is clustered mmap.
index 758d966..192cad0 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * move_extents.c
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
index 28cac43..987f9e5 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * move_extents.h
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
index 05ced86..2c46ff6 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * namei.c
  *
  * Create and rename file, directory, symlinks
index cc091ed..9cc891e 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * namei.h
  *
  * Function prototypes
index 01ae48c..6dbcf3d 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs1_fs_compat.h
  *
  * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
index 7993d52..bb62cc2 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2.h
  *
  * Defines macros and structures used in OCFS2
index 19137c6..638d875 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_fs.h
  *
  * On-disk structures for OCFS2.
index 273616b..9680797 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_ioctl.h
  *
  * Defines OCFS2 ioctls.
index b4be849..8ac357c 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_lockid.h
  *
  * Defines OCFS2 lockid bits.
index 5c9c105..31a5e16 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_lockingver.h
  *
  * Defines OCFS2 Locking version values.
index c19a463..7f6355c 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * refcounttree.c
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
index 0b90144..8197a94 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * refcounttree.h
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
index bf3842e..769e466 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * reservations.c
  *
  * Allocation reservations implementation
index 6ac8812..677c506 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * reservations.h
  *
  * Allocation reservations function prototypes and structures.
index 24eb52f..d65d43c 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * resize.c
  *
  * volume resize.
index 0af0c02..4990637 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * resize.h
  *
  * Function prototypes
index 4da0e4b..0b0ae3e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * slot_map.c
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
index 93b53e7..a436445 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * slotmap.h
  *
  * description here
index f700120..88f75f7 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stack_o2cb.c
  *
  * Code which interfaces ocfs2 with the o2cb stack.
index 7397064..85a4762 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stack_user.c
  *
  * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
index 8d33ebc..d50e8b8 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stackglue.c
  *
  * Code which implements an OCFS2 specific interface to underlying
index e9d26cb..3636847 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stackglue.h
  *
  * Glue to the underlying cluster stack.
index 8c8cf7f..8521942 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * suballoc.c
  *
  * metadata alloc and free
index 50b3625..5805a03 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * suballoc.h
  *
  * Defines sub allocator api
index 079f882..c86bd4e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * super.c
  *
  * load/unload driver, mount/dismount volumes
index 76facaf..8312651 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * super.h
  *
  * Function prototypes
index 94cfacc..f755a49 100644 (file)
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  *  linux/cluster/ssi/cfs/symlink.c
  *
  *     This program is free software; you can redistribute it and/or
index 167094d..ffcf021 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * symlink.h
  *
  * Function prototypes
index bb701c4..53a945d 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sysfile.c
  *
  * Initialize, read, write, etc. system files.
index a83dd96..2b38c75 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sysfile.h
  *
  * Function prototypes
index 580852b..0985492 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * uptodate.c
  *
  * Tracking the up-to-date-ness of a local buffer_head with respect to
index 77a30ca..85d9413 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * uptodate.h
  *
  * Cluster uptodate tracking
index 36ae47a..dd784eb 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * xattr.c
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
index 9c80382..00308b5 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * xattr.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
index bc86aa8..5b78739 100644 (file)
@@ -166,15 +166,8 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
        const char              *cp = name, *next;
        struct proc_dir_entry   *de;
 
-       de = *ret;
-       if (!de)
-               de = &proc_root;
-
-       while (1) {
-               next = strchr(cp, '/');
-               if (!next)
-                       break;
-
+       de = *ret ?: &proc_root;
+       while ((next = strchr(cp, '/')) != NULL) {
                de = pde_subdir_find(de, cp, next - cp);
                if (!de) {
                        WARN(1, "name '%s'\n", name);
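
The rewritten lookup leans on two idioms: the GNU "x ?: y" shorthand (x if non-zero, else y, evaluating x only once) and a strchr()-driven loop over '/'-separated path components. A standalone sketch of the same component walk, using an illustrative path:

#include <stdio.h>
#include <string.h>

/* Walk "net/ipv4/conf", printing each component, as the loop above does. */
int main(void)
{
	const char *cp = "net/ipv4/conf";
	const char *next;

	while ((next = strchr(cp, '/')) != NULL) {
		printf("component: %.*s\n", (int)(next - cp), cp);
		cp = next + 1;
	}
	printf("leaf: %s\n", cp);
	return 0;
}
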
@@ -756,7 +749,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
        while (1) {
                next = pde_subdir_first(de);
                if (next) {
-                       if (unlikely(pde_is_permanent(root))) {
+                       if (unlikely(pde_is_permanent(next))) {
                                write_unlock(&proc_subdir_lock);
                                WARN(1, "removing permanent /proc entry '%s/%s'",
                                        next->parent->name, next->name);
index bde6b6f..599eb72 100644 (file)
@@ -273,25 +273,15 @@ void proc_entry_rundown(struct proc_dir_entry *de)
        spin_unlock(&de->pde_unload_lock);
 }
 
-static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
-{
-       typeof_member(struct proc_ops, proc_lseek) lseek;
-
-       lseek = pde->proc_ops->proc_lseek;
-       if (!lseek)
-               lseek = default_llseek;
-       return lseek(file, offset, whence);
-}
-
 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        loff_t rv = -EINVAL;
 
        if (pde_is_permanent(pde)) {
-               return pde_lseek(pde, file, offset, whence);
+               return pde->proc_ops->proc_lseek(file, offset, whence);
        } else if (use_pde(pde)) {
-               rv = pde_lseek(pde, file, offset, whence);
+               rv = pde->proc_ops->proc_lseek(file, offset, whence);
                unuse_pde(pde);
        }
        return rv;
@@ -493,7 +483,6 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
 
 static int proc_reg_open(struct inode *inode, struct file *file)
 {
-       struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
        struct proc_dir_entry *pde = PDE(inode);
        int rv = 0;
        typeof_member(struct proc_ops, proc_open) open;
@@ -507,9 +496,6 @@ static int proc_reg_open(struct inode *inode, struct file *file)
                return rv;
        }
 
-       if (fs_info->pidonly == PROC_PIDONLY_ON)
-               return -ENOENT;
-
        /*
         * Ensure that
         * 1) PDE's ->release hook will be called no matter what
index 66c7dd1..dea0f5e 100644 (file)
@@ -1563,7 +1563,7 @@ err_register_leaves:
 }
 
 /**
- * register_sysctl_table_path - register a sysctl table hierarchy
+ * register_sysctl_paths - register a sysctl table hierarchy
  * @path: The path to the directory the sysctl table is in.
  * @table: the top-level table structure
  *
index e862cab..fc97845 100644 (file)
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                [ilog2(VM_PKEY_BIT4)]   = "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+               [ilog2(VM_UFFD_MINOR)]  = "ui",
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
        };
        size_t i;
 
index 155b828..4a7cb16 100644 (file)
@@ -488,13 +488,3 @@ int reiserfs_proc_info_global_done(void)
  * (available at http://www.namesys.com/legalese.html)
  *
  */
-
-/*
- * Make Linus happy.
- * Local variables:
- * c-indentation-style: "K&R"
- * mode-name: "LC"
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
index 9b2467e..3612945 100644 (file)
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-mkutf8data
-utf8data.h
+/mkutf8data
+/utf8data.h
index 0be8cdd..14f9228 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
@@ -196,24 +197,21 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;
        msg.arg.pagefault.address = address;
+       /*
+        * These flags indicate why the userfault occurred:
+        * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
+        * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
+        * - Neither of these flags being set indicates a MISSING fault.
+        *
+        * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
+        * fault. Otherwise, it was a read fault.
+        */
        if (flags & FAULT_FLAG_WRITE)
-               /*
-                * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-                * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
-                * was not set in a UFFD_EVENT_PAGEFAULT, it means it
-                * was a read fault, otherwise if set it means it's
-                * a write fault.
-                */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
        if (reason & VM_UFFD_WP)
-               /*
-                * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-                * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
-                * not set in a UFFD_EVENT_PAGEFAULT, it means it was
-                * a missing fault, otherwise if set it means it's a
-                * write protect fault.
-                */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+       if (reason & VM_UFFD_MINOR)
+               msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
        if (features & UFFD_FEATURE_THREAD_ID)
                msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
        return msg;
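
Given the comment above, a userspace handler can decode a message it has already read from the descriptor along these lines; the flag and struct names are the real UAPI from linux/userfaultfd.h, while the helper itself is illustrative:

#include <linux/userfaultfd.h>
#include <stdio.h>

/* Decode a previously read uffd_msg the way the comment above describes. */
static void describe_fault(const struct uffd_msg *msg)
{
	__u64 flags;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		return;
	flags = msg->arg.pagefault.flags;

	if (flags & UFFD_PAGEFAULT_FLAG_WP)
		printf("write-protect fault");
	else if (flags & UFFD_PAGEFAULT_FLAG_MINOR)
		printf("minor fault");
	else
		printf("missing fault");

	printf(" (%s access)\n",
	       (flags & UFFD_PAGEFAULT_FLAG_WRITE) ? "write" : "read");
}
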
@@ -400,8 +398,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
        BUG_ON(ctx->mm != mm);
 
-       VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
-       VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+       /* Any unrecognized flag is a bug. */
+       VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+       /* 0 or > 1 flags set is a bug; we expect exactly 1. */
+       VM_BUG_ON(!reason || (reason & (reason - 1)));
 
        if (ctx->features & UFFD_FEATURE_SIGBUS)
                goto out;
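
The second VM_BUG_ON uses the classic x & (x - 1) trick: that expression clears the lowest set bit, so it is zero exactly when at most one bit is set; pairing it with !x also rules out zero. A standalone demonstration:

#include <assert.h>
#include <stdint.h>

/* True only when exactly one bit of x is set. */
static int exactly_one_bit(uint64_t x)
{
	return x && !(x & (x - 1));
}

int main(void)
{
	assert( exactly_one_bit(0x4));   /* one flag set: ok */
	assert(!exactly_one_bit(0x0));   /* no flag set: bug */
	assert(!exactly_one_bit(0x6));   /* two flags set: bug */
	return 0;
}
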
@@ -611,7 +611,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
                for (vma = mm->mmap; vma; vma = vma->vm_next)
                        if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
                                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-                               vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+                               vma->vm_flags &= ~__VM_UFFD_FLAGS;
                        }
                mmap_write_unlock(mm);
 
@@ -643,7 +643,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
        octx = vma->vm_userfaultfd_ctx.ctx;
        if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-               vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+               vma->vm_flags &= ~__VM_UFFD_FLAGS;
                return 0;
        }
 
@@ -725,7 +725,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
        } else {
                /* Drop uffd context if remap feature not enabled */
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-               vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+               vma->vm_flags &= ~__VM_UFFD_FLAGS;
        }
 }
 
@@ -866,12 +866,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                cond_resched();
                BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
-                      !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+                      !!(vma->vm_flags & __VM_UFFD_FLAGS));
                if (vma->vm_userfaultfd_ctx.ctx != ctx) {
                        prev = vma;
                        continue;
                }
-               new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+               new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
                prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
                                 new_flags, vma->anon_vma,
                                 vma->vm_file, vma->vm_pgoff,
@@ -1261,9 +1261,19 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
                                     unsigned long vm_flags)
 {
        /* FIXME: add WP support to hugetlbfs and shmem */
-       return vma_is_anonymous(vma) ||
-               ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
-                !(vm_flags & VM_UFFD_WP));
+       if (vm_flags & VM_UFFD_WP) {
+               if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
+                       return false;
+       }
+
+       if (vm_flags & VM_UFFD_MINOR) {
+               /* FIXME: Add minor fault interception for shmem. */
+               if (!is_vm_hugetlb_page(vma))
+                       return false;
+       }
+
+       return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+              vma_is_shmem(vma);
 }
 
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
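
A userspace restatement of the decision table vma_can_userfault() now encodes, as of this series; the helper and its parameters are illustrative, not kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Which VMA kinds may register for which userfaultfd modes (5.13 rules). */
static bool can_userfault(bool anon, bool hugetlb, bool shmem,
			  bool want_wp, bool want_minor)
{
	if (want_wp && (hugetlb || shmem))
		return false;           /* WP: anonymous memory only, for now */
	if (want_minor && !hugetlb)
		return false;           /* MINOR: hugetlbfs only, for now */
	return anon || hugetlb || shmem;
}

int main(void)
{
	printf("%d\n", can_userfault(true,  false, false, true,  false)); /* 1 */
	printf("%d\n", can_userfault(false, true,  false, false, true));  /* 1 */
	printf("%d\n", can_userfault(false, false, true,  false, true));  /* 0 */
	return 0;
}
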
@@ -1289,14 +1299,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        ret = -EINVAL;
        if (!uffdio_register.mode)
                goto out;
-       if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
-                                    UFFDIO_REGISTER_MODE_WP))
+       if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
                goto out;
        vm_flags = 0;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
                vm_flags |= VM_UFFD_MISSING;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
                vm_flags |= VM_UFFD_WP;
+       if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+               goto out;
+#endif
+               vm_flags |= VM_UFFD_MINOR;
+       }
 
        ret = validate_range(mm, &uffdio_register.range.start,
                             uffdio_register.range.len);
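
From userspace, requesting the new mode is one more flag in the registration ioctl. A sketch against the real UAPI, assuming uffd is an open userfaultfd descriptor and the range is page-aligned:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Register [addr, addr + len) for minor-fault interception. */
static int register_minor(int uffd, void *addr, unsigned long len)
{
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MINOR,
	};

	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1) {
		perror("UFFDIO_REGISTER");   /* fails if the arch lacks support */
		return -1;
	}
	/* reg.ioctls now advertises _UFFDIO_CONTINUE for this range. */
	return 0;
}
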
@@ -1340,7 +1355,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                cond_resched();
 
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-                      !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+                      !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
                /* check not compatible vmas */
                ret = -EINVAL;
@@ -1420,8 +1435,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);
 
-               new_flags = (vma->vm_flags &
-                            ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
+               new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
@@ -1449,6 +1463,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                vma->vm_flags = new_flags;
                vma->vm_userfaultfd_ctx.ctx = ctx;
 
+               if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+                       hugetlb_unshare_all_pmds(vma);
+
        skip:
                prev = vma;
                start = vma->vm_end;
@@ -1470,6 +1487,10 @@ out_unlock:
                if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
                        ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
 
+               /* CONTINUE ioctl is only supported for MINOR ranges. */
+               if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+                       ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+
                /*
                 * Now that we scanned all vmas we can already tell
                 * userland which ioctls methods are guaranteed to
@@ -1540,7 +1561,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                cond_resched();
 
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-                      !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+                      !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
                /*
                 * Check not compatible vmas, not strictly required
@@ -1591,7 +1612,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                        wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
                }
 
-               new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+               new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
@@ -1823,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
        return ret;
 }
 
+static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+       __s64 ret;
+       struct uffdio_continue uffdio_continue;
+       struct uffdio_continue __user *user_uffdio_continue;
+       struct userfaultfd_wake_range range;
+
+       user_uffdio_continue = (struct uffdio_continue __user *)arg;
+
+       ret = -EAGAIN;
+       if (READ_ONCE(ctx->mmap_changing))
+               goto out;
+
+       ret = -EFAULT;
+       if (copy_from_user(&uffdio_continue, user_uffdio_continue,
+                          /* don't copy the output fields */
+                          sizeof(uffdio_continue) - (sizeof(__s64))))
+               goto out;
+
+       ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+                            uffdio_continue.range.len);
+       if (ret)
+               goto out;
+
+       ret = -EINVAL;
+       /* double check for wraparound just in case. */
+       if (uffdio_continue.range.start + uffdio_continue.range.len <=
+           uffdio_continue.range.start) {
+               goto out;
+       }
+       if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+               goto out;
+
+       if (mmget_not_zero(ctx->mm)) {
+               ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
+                                    uffdio_continue.range.len,
+                                    &ctx->mmap_changing);
+               mmput(ctx->mm);
+       } else {
+               return -ESRCH;
+       }
+
+       if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+               return -EFAULT;
+       if (ret < 0)
+               goto out;
+
+       /* len == 0 would wake all */
+       BUG_ON(!ret);
+       range.len = ret;
+       if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
+               range.start = uffdio_continue.range.start;
+               wake_userfault(ctx, &range);
+       }
+       ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
+
+out:
+       return ret;
+}
+
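
The matching userspace call, sketched with the real UAPI structures and abbreviated error handling:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

/* Resolve a minor fault: the page cache page already exists, so CONTINUE
 * only installs page table entries and wakes the faulting thread. */
static long long uffd_continue(int uffd, unsigned long long start,
			       unsigned long long len)
{
	struct uffdio_continue cont = {
		.range = { .start = start, .len = len },
		.mode  = 0,        /* or UFFDIO_CONTINUE_MODE_DONTWAKE */
	};

	if (ioctl(uffd, UFFDIO_CONTINUE, &cont) == -1)
		return -1;
	return cont.mapped;        /* bytes mapped, as put_user()'d above */
}
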
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
        /*
@@ -1859,6 +1940,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
                goto err_out;
        /* report all available features and ioctls to userland */
        uffdio_api.features = UFFD_API_FEATURES;
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+       uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+#endif
        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
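
Userspace can detect the masked-off feature during the API handshake. A sketch against the real UAPI, assuming uffd came from the userfaultfd(2) syscall:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Returns 1 if hugetlbfs minor faults are available on this kernel/arch. */
static int probe_minor_faults(int uffd)
{
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };

	if (ioctl(uffd, UFFDIO_API, &api) == -1)
		return -1;
	if (!(api.features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
		fprintf(stderr, "minor faults not supported here\n");
		return 0;
	}
	return 1;
}
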
@@ -1907,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
        case UFFDIO_WRITEPROTECT:
                ret = userfaultfd_writeprotect(ctx, arg);
                break;
+       case UFFDIO_CONTINUE:
+               ret = userfaultfd_continue(ctx, arg);
+               break;
        }
        return ret;
 }
index 6c5f8d1..e32a183 100644 (file)
@@ -253,7 +253,8 @@ xfs_ag_resv_init(
        xfs_agnumber_t                  agno = pag->pag_agno;
        xfs_extlen_t                    ask;
        xfs_extlen_t                    used;
-       int                             error = 0;
+       int                             error = 0, error2;
+       bool                            has_resv = false;
 
        /* Create the metadata reservation. */
        if (pag->pag_meta_resv.ar_asked == 0) {
@@ -291,6 +292,8 @@ xfs_ag_resv_init(
                        if (error)
                                goto out;
                }
+               if (ask)
+                       has_resv = true;
        }
 
        /* Create the RMAPBT metadata reservation */
@@ -304,19 +307,28 @@ xfs_ag_resv_init(
                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
                if (error)
                        goto out;
+               if (ask)
+                       has_resv = true;
        }
 
-#ifdef DEBUG
-       /* need to read in the AGF for the ASSERT below to work */
-       error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0);
-       if (error)
-               return error;
-
-       ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
-              xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
-              pag->pagf_freeblks + pag->pagf_flcount);
-#endif
 out:
+       /*
+        * Initialize the pagf if we have at least one active reservation on the
+        * AG. This may have occurred already via reservation calculation, but
+        * fall back to an explicit init to ensure the in-core allocbt usage
+        * counters are initialized as soon as possible. This is important
+        * because filesystems with large perag reservations are susceptible to
+        * free space reservation problems that the allocbt counter is used to
+        * address.
+        */
+       if (has_resv) {
+               error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
+               if (error2)
+                       return error2;
+               ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+                      xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
+                      pag->pagf_freeblks + pag->pagf_flcount);
+       }
        return error;
 }
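
The re-checked ASSERT encodes a simple invariant: blocks promised to the metadata and rmapbt reservations can never exceed what the AG actually has free plus its freelist. A standalone restatement with hypothetical field names:

#include <assert.h>

/* Illustrative model of the per-AG counters the ASSERT above compares. */
struct ag {
	unsigned meta_reserved, rmapbt_reserved;
	unsigned freeblks, flcount;
};

static void check_resv(const struct ag *ag)
{
	assert(ag->meta_reserved + ag->rmapbt_reserved <=
	       ag->freeblks + ag->flcount);
}

int main(void)
{
	struct ag ag = { .meta_reserved = 100, .rmapbt_reserved = 20,
			 .freeblks = 500, .flcount = 8 };
	check_resv(&ag);
	return 0;
}
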
 
index aaa1910..82b7cbb 100644 (file)
@@ -718,7 +718,6 @@ xfs_alloc_update_counters(
        agbp->b_pag->pagf_freeblks += len;
        be32_add_cpu(&agf->agf_freeblks, len);
 
-       xfs_trans_agblocks_delta(tp, len);
        if (unlikely(be32_to_cpu(agf->agf_freeblks) >
                     be32_to_cpu(agf->agf_length))) {
                xfs_buf_mark_corrupt(agbp);
@@ -2739,7 +2738,6 @@ xfs_alloc_get_freelist(
        pag = agbp->b_pag;
        ASSERT(!pag->pagf_agflreset);
        be32_add_cpu(&agf->agf_flcount, -1);
-       xfs_trans_agflist_delta(tp, -1);
        pag->pagf_flcount--;
 
        logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
@@ -2846,7 +2844,6 @@ xfs_alloc_put_freelist(
        pag = agbp->b_pag;
        ASSERT(!pag->pagf_agflreset);
        be32_add_cpu(&agf->agf_flcount, 1);
-       xfs_trans_agflist_delta(tp, 1);
        pag->pagf_flcount++;
 
        logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
@@ -3036,6 +3033,7 @@ xfs_alloc_read_agf(
        struct xfs_agf          *agf;           /* ag freelist header */
        struct xfs_perag        *pag;           /* per allocation group data */
        int                     error;
+       int                     allocbt_blks;
 
        trace_xfs_alloc_read_agf(mp, agno);
 
@@ -3066,6 +3064,19 @@ xfs_alloc_read_agf(
                pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
                pag->pagf_init = 1;
                pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
+
+               /*
+                * Update the in-core allocbt counter. Filter out the rmapbt
+                * subset of the btreeblks counter because the rmapbt is managed
+                * by perag reservation. Subtract one for the rmapbt root block
+                * because the rmap counter includes it while the btreeblks
+                * counter only tracks non-root blocks.
+                */
+               allocbt_blks = pag->pagf_btreeblks;
+               if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+                       allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
+               if (allocbt_blks > 0)
+                       atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
        }
 #ifdef DEBUG
        else if (!XFS_FORCED_SHUTDOWN(mp)) {
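
A worked instance of the filter described in the comment above, with illustrative numbers: pagf_btreeblks counts all non-root btree blocks in the AG, while agf_rmap_blocks counts rmapbt blocks including the root, and the rmapbt subset is excluded because it is covered by the perag reservation.

#include <stdio.h>

int main(void)
{
	int pagf_btreeblks = 10;   /* bnobt + cntbt + rmapbt, minus all roots */
	int agf_rmap_blocks = 4;   /* rmapbt blocks including its root */

	int allocbt_blks = pagf_btreeblks - (agf_rmap_blocks - 1);
	printf("allocbt blocks: %d\n", allocbt_blks);   /* 10 - 3 = 7 */
	return 0;
}
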
index 8e01231..a43e4c5 100644 (file)
@@ -71,9 +71,9 @@ xfs_allocbt_alloc_block(
                return 0;
        }
 
+       atomic64_inc(&cur->bc_mp->m_allocbt_blks);
        xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);
 
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
        new->s = cpu_to_be32(bno);
 
        *stat = 1;
@@ -95,9 +95,9 @@ xfs_allocbt_free_block(
        if (error)
                return error;
 
+       atomic64_dec(&cur->bc_mp->m_allocbt_blks);
        xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
                              XFS_EXTENT_BUSY_SKIP_DISCARD);
-       xfs_trans_agbtree_delta(cur->bc_tp, -1);
        return 0;
 }
 
index 8bd00da..3e15ea2 100644 (file)
@@ -368,10 +368,10 @@ static inline int xfs_ilog_fdata(int w)
  * directly mirrors the xfs_dinode structure as it must contain all the same
  * information.
  */
-typedef uint64_t xfs_ictimestamp_t;
+typedef uint64_t xfs_log_timestamp_t;
 
 /* Legacy timestamp encoding format. */
-struct xfs_legacy_ictimestamp {
+struct xfs_log_legacy_timestamp {
        int32_t         t_sec;          /* timestamp seconds */
        int32_t         t_nsec;         /* timestamp nanoseconds */
 };
@@ -393,9 +393,9 @@ struct xfs_log_dinode {
        uint16_t        di_projid_hi;   /* higher part of owner's project id */
        uint8_t         di_pad[6];      /* unused, zeroed space */
        uint16_t        di_flushiter;   /* incremented on flush */
-       xfs_ictimestamp_t di_atime;     /* time last accessed */
-       xfs_ictimestamp_t di_mtime;     /* time last modified */
-       xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
+       xfs_log_timestamp_t di_atime;   /* time last accessed */
+       xfs_log_timestamp_t di_mtime;   /* time last modified */
+       xfs_log_timestamp_t di_ctime;   /* time created/inode modified */
        xfs_fsize_t     di_size;        /* number of bytes in file */
        xfs_rfsblock_t  di_nblocks;     /* # of direct & btree blocks used */
        xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
@@ -420,7 +420,7 @@ struct xfs_log_dinode {
        uint8_t         di_pad2[12];    /* more padding for future expansion */
 
        /* fields only written to during inode creation */
-       xfs_ictimestamp_t di_crtime;    /* time created */
+       xfs_log_timestamp_t di_crtime;  /* time created */
        xfs_ino_t       di_ino;         /* inode number */
        uuid_t          di_uuid;        /* UUID of the filesystem */
 
index beb81c8..9f5bcbd 100644 (file)
@@ -103,7 +103,6 @@ xfs_rmapbt_alloc_block(
        xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1,
                        false);
 
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
        new->s = cpu_to_be32(bno);
        be32_add_cpu(&agf->agf_rmap_blocks, 1);
        xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
@@ -136,7 +135,6 @@ xfs_rmapbt_free_block(
 
        xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
                              XFS_EXTENT_BUSY_SKIP_DISCARD);
-       xfs_trans_agbtree_delta(cur->bc_tp, -1);
 
        pag = cur->bc_ag.agbp->b_pag;
        xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
index 60e6d25..dfbbcbd 100644 (file)
@@ -926,9 +926,19 @@ xfs_log_sb(
        struct xfs_mount        *mp = tp->t_mountp;
        struct xfs_buf          *bp = xfs_trans_getsb(tp);
 
-       mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
-       mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
-       mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+       /*
+        * Lazy sb counters don't update the in-core superblock so do that now.
+        * If this is at unmount, the counters will be exactly correct, but at
+        * any other time they will only be ballpark correct because of
+        * reservations that have been taken out of the percpu counters. If we have an
+        * unclean shutdown, this will be corrected by log recovery rebuilding
+        * the counters from the AGF block counts.
+        */
+       if (xfs_sb_version_haslazysbcount(&mp->m_sb)) {
+               mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+               mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+               mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+       }
 
        xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
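
A toy model of the lazy-counter behavior the new comment describes: hot paths touch per-CPU slots, and only the fold at superblock-log time produces a total, which is exact only once all activity has quiesced. The names and fixed CPU count are illustrative:

#include <stdio.h>

#define NR_CPUS 4

static long percpu_free[NR_CPUS];

/* Fold the per-CPU deltas into one value, as percpu_counter_sum() would. */
static long fold_counter(void)
{
	long sum = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += percpu_free[cpu];
	return sum;
}

int main(void)
{
	percpu_free[0] = 100;
	percpu_free[2] = -25;     /* a reservation taken out on CPU 2 */
	printf("sb_fdblocks at log time: %ld\n", fold_counter());
	return 0;
}
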
index 749faa1..7a2f9b5 100644 (file)
@@ -416,6 +416,10 @@ xchk_agf_xref_btreeblks(
        xfs_agblock_t           btreeblks;
        int                     error;
 
+       /* agf_btreeblks didn't exist before lazysbcount */
+       if (!xfs_sb_version_haslazysbcount(&sc->mp->m_sb))
+               return;
+
        /* Check agf_rmap_blocks; set up for agf_btreeblks check */
        if (sc->sa.rmap_cur) {
                error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks);
@@ -581,7 +585,8 @@ xchk_agf(
                xchk_block_set_corrupt(sc, sc->sa.agf_bp);
        if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount))
                xchk_block_set_corrupt(sc, sc->sa.agf_bp);
-       if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
+       if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb) &&
+           pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
                xchk_block_set_corrupt(sc, sc->sa.agf_bp);
        xfs_perag_put(pag);
 
index 7b4386c..f1d1a8c 100644 (file)
@@ -13,6 +13,7 @@
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_health.h"
+#include "xfs_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -143,6 +144,35 @@ xchk_setup_fscounters(
        return xchk_trans_alloc(sc, 0);
 }
 
+/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
+static int
+xchk_fscount_btreeblks(
+       struct xfs_scrub        *sc,
+       struct xchk_fscounters  *fsc,
+       xfs_agnumber_t          agno)
+{
+       xfs_extlen_t            blocks;
+       int                     error;
+
+       error = xchk_ag_init(sc, agno, &sc->sa);
+       if (error)
+               return error;
+
+       error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
+       if (error)
+               goto out_free;
+       fsc->fdblocks += blocks - 1;
+
+       error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
+       if (error)
+               goto out_free;
+       fsc->fdblocks += blocks - 1;
+
+out_free:
+       xchk_ag_free(sc, &sc->sa);
+       return error;
+}
+
 /*
  * Calculate what the global in-core counters ought to be from the incore
  * per-AG structure.  Callers can compare this to the actual in-core counters
@@ -182,7 +212,15 @@ retry:
                /* Add up the free/freelist/bnobt/cntbt blocks */
                fsc->fdblocks += pag->pagf_freeblks;
                fsc->fdblocks += pag->pagf_flcount;
-               fsc->fdblocks += pag->pagf_btreeblks;
+               if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) {
+                       fsc->fdblocks += pag->pagf_btreeblks;
+               } else {
+                       error = xchk_fscount_btreeblks(sc, fsc, agno);
+                       if (error) {
+                               xfs_perag_put(pag);
+                               break;
+                       }
+               }
 
                /*
                 * Per-AG reservations are taken out of the incore counters,
index 9b08db4..826caa6 100644 (file)
@@ -146,7 +146,7 @@ xfs_end_io(
        while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
                        io_list))) {
                list_del_init(&ioend->io_list);
-               iomap_ioend_try_merge(ioend, &tmp, NULL);
+               iomap_ioend_try_merge(ioend, &tmp);
                xfs_end_ioend(ioend);
        }
 }
index b33c894..be9cf88 100644 (file)
@@ -69,8 +69,6 @@ xfs_resizefs_init_new_ags(
        if (error)
                return error;
 
-       xfs_trans_agblocks_delta(tp, id->nfree);
-
        if (delta) {
                *lastag_extended = true;
                error = xfs_ag_extend_space(mp, tp, id, delta);
index c1b3268..6764d12 100644 (file)
@@ -299,18 +299,18 @@ xfs_inode_item_format_attr_fork(
  * Convert an incore timestamp to a log timestamp.  Note that the log format
  * specifies host endian format!
  */
-static inline xfs_ictimestamp_t
+static inline xfs_log_timestamp_t
 xfs_inode_to_log_dinode_ts(
        struct xfs_inode                *ip,
        const struct timespec64         tv)
 {
-       struct xfs_legacy_ictimestamp   *lits;
-       xfs_ictimestamp_t               its;
+       struct xfs_log_legacy_timestamp *lits;
+       xfs_log_timestamp_t             its;
 
        if (xfs_inode_has_bigtime(ip))
                return xfs_inode_encode_bigtime(tv);
 
-       lits = (struct xfs_legacy_ictimestamp *)&its;
+       lits = (struct xfs_log_legacy_timestamp *)&its;
        lits->t_sec = tv.tv_sec;
        lits->t_nsec = tv.tv_nsec;
 
index cb44f76..7b79518 100644 (file)
@@ -125,17 +125,17 @@ static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
 static inline xfs_timestamp_t
 xfs_log_dinode_to_disk_ts(
        struct xfs_log_dinode           *from,
-       const xfs_ictimestamp_t         its)
+       const xfs_log_timestamp_t       its)
 {
        struct xfs_legacy_timestamp     *lts;
-       struct xfs_legacy_ictimestamp   *lits;
+       struct xfs_log_legacy_timestamp *lits;
        xfs_timestamp_t                 ts;
 
        if (xfs_log_dinode_has_bigtime(from))
                return cpu_to_be64(its);
 
        lts = (struct xfs_legacy_timestamp *)&ts;
-       lits = (struct xfs_legacy_ictimestamp *)&its;
+       lits = (struct xfs_log_legacy_timestamp *)&its;
        lts->t_sec = cpu_to_be32(lits->t_sec);
        lts->t_nsec = cpu_to_be32(lits->t_nsec);
 
index 0604183..c19a82a 100644 (file)
@@ -355,13 +355,15 @@ xfs_log_writable(
        struct xfs_mount        *mp)
 {
        /*
-        * Never write to the log on norecovery mounts, if the block device is
-        * read-only, or if the filesystem is shutdown. Read-only mounts still
-        * allow internal writes for log recovery and unmount purposes, so don't
-        * restrict that case here.
+        * Do not write to the log on norecovery mounts, if the data or log
+        * devices are read-only, or if the filesystem is shutdown. Read-only
+        * mounts allow internal writes for log recovery and unmount purposes,
+        * so don't restrict that case.
         */
        if (mp->m_flags & XFS_MOUNT_NORECOVERY)
                return false;
+       if (xfs_readonly_buftarg(mp->m_ddev_targp))
+               return false;
        if (xfs_readonly_buftarg(mp->m_log->l_targ))
                return false;
        if (XFS_FORCED_SHUTDOWN(mp))
index cb1e2c4..bdfee19 100644 (file)
@@ -1188,6 +1188,7 @@ xfs_mod_fdblocks(
        int64_t                 lcounter;
        long long               res_used;
        s32                     batch;
+       uint64_t                set_aside;
 
        if (delta > 0) {
                /*
@@ -1227,8 +1228,20 @@ xfs_mod_fdblocks(
        else
                batch = XFS_FDBLOCKS_BATCH;
 
+       /*
+        * Set aside allocbt blocks because these blocks are tracked as free
+        * space but not available for allocation. Technically this means that a
+        * single reservation cannot consume all remaining free space, but the
+        * ratio of allocbt blocks to usable free blocks should be rather small.
+        * The tradeoff without this is that filesystems that maintain high
+        * perag block reservations can over-reserve physical block availability
+        * and fail physical allocation, which leads to much more serious
+        * problems (i.e. transaction abort, pagecache discards, etc.) than
+        * slightly premature -ENOSPC.
+        */
+       set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
        percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
-       if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
+       if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
                                     XFS_FDBLOCKS_BATCH) >= 0) {
                /* we had space! */
                return 0;
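A compilable toy model of the set-aside check above, with invented numbers: blocks occupied by the allocation btrees are subtracted from the free-space floor before the comparison, so a single reservation can no longer consume them. Only the shape of the check mirrors xfs_mod_fdblocks(); the percpu batching is omitted.

#include <stdbool.h>
#include <stdio.h>

static bool mod_fdblocks(long *fdblocks, long delta,
			 long alloc_set_aside, long allocbt_blks)
{
	long set_aside = alloc_set_aside + allocbt_blks;

	*fdblocks += delta;
	if (*fdblocks >= set_aside)
		return true;	/* we had space */
	*fdblocks -= delta;	/* undo; the caller would see -ENOSPC */
	return false;
}

int main(void)
{
	long fdblocks = 100;

	/* 100 "free" blocks, 5 set aside plus 3 in the allocbt: 95 must fail */
	printf("ask 95: %d\n", mod_fdblocks(&fdblocks, -95, 5, 3));	/* 0 */
	printf("ask 80: %d\n", mod_fdblocks(&fdblocks, -80, 5, 3));	/* 1 */
	return 0;
}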
index 81829d1..bb67274 100644 (file)
@@ -170,6 +170,12 @@ typedef struct xfs_mount {
         * extents or anything related to the rt device.
         */
        struct percpu_counter   m_delalloc_blks;
+       /*
+        * Global count of allocation btree blocks in use across all AGs. Only
+        * used when perag reservation is enabled. Helps prevent block
+        * reservation from attempting to reserve allocation btree blocks.
+        */
+       atomic64_t              m_allocbt_blks;
 
        struct radix_tree_root  m_perag_tree;   /* per-ag accounting info */
        spinlock_t              m_perag_lock;   /* lock for m_perag_tree */
index 0aa87c2..2599192 100644 (file)
@@ -126,8 +126,8 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64,             16);
        XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode,            176);
        XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log,           28);
-       XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t,                8);
-       XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp,    8);
+       XFS_CHECK_STRUCT_SIZE(xfs_log_timestamp_t,              8);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_log_legacy_timestamp,  8);
        XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32,   52);
        XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,      56);
        XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,        20);
index 4dd4af6..060695d 100644 (file)
@@ -1522,7 +1522,8 @@ xfs_reflink_unshare(
        if (error)
                goto out;
 
-       error = filemap_write_and_wait_range(inode->i_mapping, offset, len);
+       error = filemap_write_and_wait_range(inode->i_mapping, offset,
+                       offset + len - 1);
        if (error)
                goto out;
 
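The hunk above fixes a range bug: filemap_write_and_wait_range() takes an inclusive last-byte offset, not a length, so the end of the range must be offset + len - 1. A standalone check of the arithmetic (values invented):

#include <stdio.h>

int main(void)
{
	long long offset = 4096, len = 8192;

	/* the old call passed `len` as the inclusive end byte */
	printf("buggy end byte:   %lld\n", len);		/* 8192  */
	/* the fixed call covers the full [offset, offset + len - 1] range */
	printf("correct end byte: %lld\n", offset + len - 1);	/* 12287 */
	return 0;
}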
index bcc9780..586f299 100644 (file)
@@ -488,13 +488,6 @@ xfs_trans_apply_sb_deltas(
        sbp = bp->b_addr;
 
        /*
-        * Check that superblock mods match the mods made to AGF counters.
-        */
-       ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
-              (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
-               tp->t_ag_btree_delta));
-
-       /*
         * Only update the superblock counters if we are logging them
         */
        if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) {
@@ -629,6 +622,9 @@ xfs_trans_unreserve_and_mod_sb(
 
        /* apply remaining deltas */
        spin_lock(&mp->m_sb_lock);
+       mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
+       mp->m_sb.sb_icount += idelta;
+       mp->m_sb.sb_ifree += ifreedelta;
        mp->m_sb.sb_frextents += rtxdelta;
        mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
        mp->m_sb.sb_agcount += tp->t_agcount_delta;
index 9dd745c..ee42d98 100644 (file)
@@ -140,11 +140,6 @@ typedef struct xfs_trans {
        int64_t                 t_res_fdblocks_delta; /* on-disk only chg */
        int64_t                 t_frextents_delta;/* superblock freextents chg*/
        int64_t                 t_res_frextents_delta; /* on-disk only chg */
-#if defined(DEBUG) || defined(XFS_WARN)
-       int64_t                 t_ag_freeblks_delta; /* debugging counter */
-       int64_t                 t_ag_flist_delta; /* debugging counter */
-       int64_t                 t_ag_btree_delta; /* debugging counter */
-#endif
        int64_t                 t_dblocks_delta;/* superblock dblocks change */
        int64_t                 t_agcount_delta;/* superblock agcount change */
        int64_t                 t_imaxpct_delta;/* superblock imaxpct change */
@@ -165,16 +160,6 @@ typedef struct xfs_trans {
  */
 #define        xfs_trans_set_sync(tp)          ((tp)->t_flags |= XFS_TRANS_SYNC)
 
-#if defined(DEBUG) || defined(XFS_WARN)
-#define        xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d)
-#define        xfs_trans_agflist_delta(tp, d)  ((tp)->t_ag_flist_delta += (int64_t)d)
-#define        xfs_trans_agbtree_delta(tp, d)  ((tp)->t_ag_btree_delta += (int64_t)d)
-#else
-#define        xfs_trans_agblocks_delta(tp, d)
-#define        xfs_trans_agflist_delta(tp, d)
-#define        xfs_trans_agbtree_delta(tp, d)
-#endif
-
 /*
  * XFS transaction mechanism exported interfaces.
  */
index 9fdf213..0d132ee 100644 (file)
@@ -2,6 +2,13 @@
 #ifndef _ASM_GENERIC_BITOPS_FIND_H_
 #define _ASM_GENERIC_BITOPS_FIND_H_
 
+extern unsigned long _find_next_bit(const unsigned long *addr1,
+               const unsigned long *addr2, unsigned long nbits,
+               unsigned long start, unsigned long invert, unsigned long le);
+extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
+
 #ifndef find_next_bit
 /**
  * find_next_bit - find the next set bit in a memory region
@@ -12,8 +19,22 @@
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
-               size, unsigned long offset);
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+                           unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+}
 #endif
 
 #ifndef find_next_and_bit
@@ -27,9 +48,23 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-extern unsigned long find_next_and_bit(const unsigned long *addr1,
+static inline
+unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
-               unsigned long offset);
+               unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr1 & *addr2 & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+}
 #endif
 
 #ifndef find_next_zero_bit
@@ -42,8 +77,22 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1,
  * Returns the bit number of the next zero bit
  * If no bits are zero, returns @size.
  */
-extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
-               long size, unsigned long offset);
+static inline
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+                                unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr | ~GENMASK(size - 1, offset);
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+}
 #endif
 
 #ifdef CONFIG_GENERIC_FIND_FIRST_BIT
@@ -56,8 +105,17 @@ extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
  * Returns the bit number of the first set bit.
  * If no bits are set, returns @size.
  */
-extern unsigned long find_first_bit(const unsigned long *addr,
-                                   unsigned long size);
+static inline
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr & GENMASK(size - 1, 0);
+
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_first_bit(addr, size);
+}
 
 /**
  * find_first_zero_bit - find the first cleared bit in a memory region
@@ -67,8 +125,17 @@ extern unsigned long find_first_bit(const unsigned long *addr,
  * Returns the bit number of the first cleared bit.
  * If no bits are zero, returns @size.
  */
-extern unsigned long find_first_zero_bit(const unsigned long *addr,
-                                        unsigned long size);
+static inline
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr | ~GENMASK(size - 1, 0);
+
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_first_zero_bit(addr, size);
+}
 #else /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
 #ifndef find_first_bit
@@ -80,6 +147,27 @@ extern unsigned long find_first_zero_bit(const unsigned long *addr,
 
 #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
+#ifndef find_last_bit
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The number of bits to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+static inline
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr & GENMASK(size - 1, 0);
+
+               return val ? __fls(val) : size;
+       }
+
+       return _find_last_bit(addr, size);
+}
+#endif
+
 /**
  * find_next_clump8 - find next 8-bit clump with set bits in a memory region
  * @clump: location to store copy of found clump
index 188d3eb..5a28629 100644 (file)
@@ -2,8 +2,10 @@
 #ifndef _ASM_GENERIC_BITOPS_LE_H_
 #define _ASM_GENERIC_BITOPS_LE_H_
 
+#include <asm-generic/bitops/find.h>
 #include <asm/types.h>
 #include <asm/byteorder.h>
+#include <linux/swab.h>
 
 #if defined(__LITTLE_ENDIAN)
 
@@ -32,13 +34,41 @@ static inline unsigned long find_first_zero_bit_le(const void *addr,
 #define BITOP_LE_SWIZZLE       ((BITS_PER_LONG-1) & ~0x7)
 
 #ifndef find_next_zero_bit_le
-extern unsigned long find_next_zero_bit_le(const void *addr,
-               unsigned long size, unsigned long offset);
+static inline
+unsigned long find_next_zero_bit_le(const void *addr, unsigned
+               long size, unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *(const unsigned long *)addr;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = swab(val) | ~GENMASK(size - 1, offset);
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
+}
 #endif
 
 #ifndef find_next_bit_le
-extern unsigned long find_next_bit_le(const void *addr,
-               unsigned long size, unsigned long offset);
+static inline
+unsigned long find_next_bit_le(const void *addr, unsigned
+               long size, unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *(const unsigned long *)addr;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = swab(val) & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
+}
 #endif
 
 #ifndef find_first_zero_bit_le
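A standalone sketch of the byte-swap trick used by these little-endian wrappers: searching little-endian bit order on a big-endian host reduces to an ordinary bit search over swab(word). The sketch assumes 64-bit longs and only demonstrates the swab step.

#include <stdio.h>

/* stand-in for the kernel's swab() on a 64-bit long */
static unsigned long swab(unsigned long x)
{
	return __builtin_bswap64(x);
}

int main(void)
{
	/* how a big-endian CPU loads an LE bitmap word with bit 0 set */
	unsigned long word = 0x0100000000000000UL;

	printf("first set LE bit = %d\n", __builtin_ctzl(swab(word)));	/* 0 */
	return 0;
}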
index 3905c1c..1023e2a 100644 (file)
 #define BITS_PER_LONG_LONG 64
 #endif
 
+/*
+ * small_const_nbits(n) is true precisely when it is known at compile-time
+ * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows
+ * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps
+ * of size 0 are very rare, and a compile-time-known-size 0 is most likely
+ * a sign of error. They will be handled correctly by the bit/bitmap APIs,
+ * but using the out-of-line functions, so that the inline implementations
+ * can unconditionally dereference the pointer(s).
+ */
+#define small_const_nbits(nbits) \
+       (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
+
 #endif /* __ASM_GENERIC_BITS_PER_LONG */
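A userspace sketch of how small_const_nbits() enables the inline fast path in the find_*_bit() wrappers above: when the size is a compile-time constant within one word, the search collapses to a single mask plus ctz; otherwise an out-of-line helper runs. GENMASK and the slow path below are simplified stand-ins, and the fast path relies on the compiler inlining the function.

#include <stdio.h>

#define BITS_PER_LONG	((int)(8 * sizeof(unsigned long)))
#define GENMASK(h, l) \
	(((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
#define small_const_nbits(nbits) \
	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)

static inline unsigned long find_first_bit(const unsigned long *addr,
					   unsigned long size)
{
	if (small_const_nbits(size)) {
		unsigned long val = *addr & GENMASK(size - 1, 0);

		return val ? (unsigned long)__builtin_ctzl(val) : size;
	}
	/* stand-in for the out-of-line _find_first_bit() slow path */
	for (unsigned long i = 0; i < size; i++)
		if (addr[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return size;
}

int main(void)
{
	unsigned long map = 0x28;	/* bits 3 and 5 set */

	printf("first bit: %lu\n", find_first_bit(&map, 16));	/* 3 */
	return 0;
}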
index 76d456c..e93375c 100644 (file)
@@ -1064,17 +1064,6 @@ static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 #endif
 #endif /* CONFIG_GENERIC_IOMAP */
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#ifndef xlate_dev_kmem_ptr
-#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr
-static inline void *xlate_dev_kmem_ptr(void *addr)
-{
-       return addr;
-}
-#endif
-
 #ifndef xlate_dev_mem_ptr
 #define xlate_dev_mem_ptr xlate_dev_mem_ptr
 static inline void *xlate_dev_mem_ptr(phys_addr_t addr)
diff --git a/include/dt-bindings/input/atmel-maxtouch.h b/include/dt-bindings/input/atmel-maxtouch.h
new file mode 100644 (file)
index 0000000..7345ab3
--- /dev/null
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef _DT_BINDINGS_ATMEL_MAXTOUCH_H
+#define _DT_BINDINGS_ATMEL_MAXTOUCH_H
+
+#define ATMEL_MXT_WAKEUP_NONE          0
+#define ATMEL_MXT_WAKEUP_I2C_SCL       1
+#define ATMEL_MXT_WAKEUP_GPIO          2
+
+#endif /* _DT_BINDINGS_ATMEL_MAXTOUCH_H */
diff --git a/include/linux/align.h b/include/linux/align.h
new file mode 100644 (file)
index 0000000..2b4acec
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ALIGN_H
+#define _LINUX_ALIGN_H
+
+#include <linux/const.h>
+
+/* @a is a power of 2 value */
+#define ALIGN(x, a)            __ALIGN_KERNEL((x), (a))
+#define ALIGN_DOWN(x, a)       __ALIGN_KERNEL((x) - ((a) - 1), (a))
+#define __ALIGN_MASK(x, mask)  __ALIGN_KERNEL_MASK((x), (mask))
+#define PTR_ALIGN(p, a)                ((typeof(p))ALIGN((unsigned long)(p), (a)))
+#define PTR_ALIGN_DOWN(p, a)   ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
+#define IS_ALIGNED(x, a)               (((x) & ((typeof(x))(a) - 1)) == 0)
+
+#endif /* _LINUX_ALIGN_H */
index 0a17cd2..cce4ad3 100644 (file)
@@ -112,7 +112,6 @@ async_schedule_dev_domain(async_func_t func, struct device *dev,
        return async_schedule_node_domain(func, dev, dev_to_node(dev), domain);
 }
 
-void async_unregister_domain(struct async_domain *domain);
 extern void async_synchronize_full(void);
 extern void async_synchronize_full_domain(struct async_domain *domain);
 extern void async_synchronize_cookie(async_cookie_t cookie);
index a0b4cfd..f1a99f0 100644 (file)
@@ -106,6 +106,8 @@ static inline void *bio_data(struct bio *bio)
        return NULL;
 }
 
+extern unsigned int bio_max_size(struct bio *bio);
+
 /**
  * bio_full - check if the bio is full
  * @bio:       bio to check
@@ -119,7 +121,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
        if (bio->bi_vcnt >= bio->bi_max_vecs)
                return true;
 
-       if (bio->bi_iter.bi_size > UINT_MAX - len)
+       if (bio->bi_iter.bi_size > bio_max_size(bio) - len)
                return true;
 
        return false;
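A toy model of the bio_full() change, with an invented 1 MiB cap standing in for whatever bio_max_size() would report for the queue: the bio now counts as full once appending len bytes would cross that limit instead of UINT_MAX.

#include <stdbool.h>
#include <stdio.h>

struct bio { unsigned int vcnt, max_vecs, size; };

/* invented stand-in: pretend the underlying queue caps bios at 1 MiB */
static unsigned int bio_max_size(const struct bio *bio)
{
	(void)bio;
	return 1u << 20;
}

static bool bio_full(const struct bio *bio, unsigned int len)
{
	if (bio->vcnt >= bio->max_vecs)
		return true;
	return bio->size > bio_max_size(bio) - len;
}

int main(void)
{
	struct bio bio = { .vcnt = 1, .max_vecs = 256,
			   .size = (1u << 20) - 512 };

	printf("add 256:  full=%d\n", bio_full(&bio, 256));	/* 0 */
	printf("add 4096: full=%d\n", bio_full(&bio, 4096));	/* 1 */
	return 0;
}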
index 73d0394..a36cfce 100644 (file)
@@ -4,8 +4,9 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/align.h>
 #include <linux/bitops.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
 #include <linux/string.h>
 #include <linux/types.h>
 
@@ -229,14 +230,6 @@ int bitmap_print_to_pagebuf(bool list, char *buf,
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
-/*
- * The static inlines below do not handle constant nbits==0 correctly,
- * so make such users (should any ever turn up) call the out-of-line
- * versions.
- */
-#define small_const_nbits(nbits) \
-       (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
-
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
        unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
index a5a4830..26bf15e 100644 (file)
@@ -286,17 +286,5 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
 })
 #endif
 
-#ifndef find_last_bit
-/**
- * find_last_bit - find the last set bit in a memory region
- * @addr: The address to start the search at
- * @size: The number of bits to search
- *
- * Returns the bit number of the last set bit, or size.
- */
-extern unsigned long find_last_bit(const unsigned long *addr,
-                                  unsigned long size);
-#endif
-
 #endif /* __KERNEL__ */
 #endif
index 3bd3ee6..3594869 100644 (file)
@@ -313,12 +313,12 @@ struct blk_mq_ops {
         */
        void (*put_budget)(struct request_queue *, int);
 
-       /*
-        * @set_rq_budget_toekn: store rq's budget token
+       /**
+        * @set_rq_budget_token: store rq's budget token
         */
        void (*set_rq_budget_token)(struct request *, int);
-       /*
-        * @get_rq_budget_toekn: retrieve rq's budget token
+       /**
+        * @get_rq_budget_token: retrieve rq's budget token
         */
        int (*get_rq_budget_token)(struct request *);
 
index b91ba62..9fb255b 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/minmax.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
-#include <linux/pagemap.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
@@ -327,6 +326,8 @@ enum blk_bounce {
 };
 
 struct queue_limits {
+       unsigned int            bio_max_bytes;
+
        enum blk_bounce         bounce;
        unsigned long           seg_boundary_mask;
        unsigned long           virt_boundary_mask;
index 6023a13..0684151 100644 (file)
@@ -302,10 +302,11 @@ struct bpf_verifier_state_list {
 };
 
 /* Possible states for alu_state member. */
-#define BPF_ALU_SANITIZE_SRC           1U
-#define BPF_ALU_SANITIZE_DST           2U
+#define BPF_ALU_SANITIZE_SRC           (1U << 0)
+#define BPF_ALU_SANITIZE_DST           (1U << 1)
 #define BPF_ALU_NEG_VALUE              (1U << 2)
 #define BPF_ALU_NON_POINTER            (1U << 3)
+#define BPF_ALU_IMMEDIATE              (1U << 4)
 #define BPF_ALU_SANITIZE               (BPF_ALU_SANITIZE_SRC | \
                                         BPF_ALU_SANITIZE_DST)
 
index 6b47f94..e7e99da 100644 (file)
@@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
 struct buffer_head *__bread_gfp(struct block_device *,
                                sector_t block, unsigned size, gfp_t gfp);
 void invalidate_bh_lrus(void);
+void invalidate_bh_lrus_cpu(int cpu);
+bool has_bh_in_lru(int cpu, void *dummy);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
@@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
 static inline void invalidate_inode_buffers(struct inode *inode) {}
 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bh_lrus_cpu(int cpu) {}
+static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; }
 #define buffer_heads_over_limit 0
 
 #endif /* CONFIG_BLOCK */
index 217999c..53fd8c3 100644 (file)
@@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
                                        unsigned int order_per_bit,
                                        const char *name,
                                        struct cma **res_cma);
-extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
+extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align,
                              bool no_warn);
-extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
+extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
 
 extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
 #endif
index ed4070e..4221888 100644 (file)
@@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order)
 }
 
 #ifdef CONFIG_COMPACTION
-extern int sysctl_compact_memory;
 extern unsigned int sysctl_compaction_proactiveness;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
                        void *buffer, size_t *length, loff_t *ppos);
index acac0b5..98dd7b3 100644 (file)
@@ -75,7 +75,6 @@
        __diag_push();                                                          \
        __diag_ignore(GCC, 8, "-Wattribute-alias",                              \
                      "Type aliasing is used to sanitize syscall arguments");\
-       asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));       \
        asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))        \
                __attribute__((alias(__stringify(__se_compat_sys##name))));     \
        ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO);                         \
index 2e8c69b..97cfd13 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * configfs.h - definitions for the device driver filesystem
  *
  * Based on sysfs:
index bceb064..4d7fced 100644 (file)
@@ -71,6 +71,19 @@ static inline void exception_exit(enum ctx_state prev_ctx)
        }
 }
 
+static __always_inline bool context_tracking_guest_enter(void)
+{
+       if (context_tracking_enabled())
+               __context_tracking_enter(CONTEXT_GUEST);
+
+       return context_tracking_enabled_this_cpu();
+}
+
+static __always_inline void context_tracking_guest_exit(void)
+{
+       if (context_tracking_enabled())
+               __context_tracking_exit(CONTEXT_GUEST);
+}
 
 /**
  * ct_state() - return the current context tracking state if known
@@ -92,6 +105,9 @@ static inline void user_exit_irqoff(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
 static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
+static inline bool context_tracking_guest_enter(void) { return false; }
+static inline void context_tracking_guest_exit(void) { }
+
 #endif /* !CONFIG_CONTEXT_TRACKING */
 
 #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
@@ -102,80 +118,4 @@ extern void context_tracking_init(void);
 static inline void context_tracking_init(void) { }
 #endif /* CONFIG_CONTEXT_TRACKING_FORCE */
 
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-/* must be called with irqs disabled */
-static __always_inline void guest_enter_irqoff(void)
-{
-       instrumentation_begin();
-       if (vtime_accounting_enabled_this_cpu())
-               vtime_guest_enter(current);
-       else
-               current->flags |= PF_VCPU;
-       instrumentation_end();
-
-       if (context_tracking_enabled())
-               __context_tracking_enter(CONTEXT_GUEST);
-
-       /* KVM does not hold any references to rcu protected data when it
-        * switches CPU into a guest mode. In fact switching to a guest mode
-        * is very similar to exiting to userspace from rcu point of view. In
-        * addition CPU may stay in a guest mode for quite a long time (up to
-        * one time slice). Lets treat guest mode as quiescent state, just like
-        * we do with user-mode execution.
-        */
-       if (!context_tracking_enabled_this_cpu()) {
-               instrumentation_begin();
-               rcu_virt_note_context_switch(smp_processor_id());
-               instrumentation_end();
-       }
-}
-
-static __always_inline void guest_exit_irqoff(void)
-{
-       if (context_tracking_enabled())
-               __context_tracking_exit(CONTEXT_GUEST);
-
-       instrumentation_begin();
-       if (vtime_accounting_enabled_this_cpu())
-               vtime_guest_exit(current);
-       else
-               current->flags &= ~PF_VCPU;
-       instrumentation_end();
-}
-
-#else
-static __always_inline void guest_enter_irqoff(void)
-{
-       /*
-        * This is running in ioctl context so its safe
-        * to assume that it's the stime pending cputime
-        * to flush.
-        */
-       instrumentation_begin();
-       vtime_account_kernel(current);
-       current->flags |= PF_VCPU;
-       rcu_virt_note_context_switch(smp_processor_id());
-       instrumentation_end();
-}
-
-static __always_inline void guest_exit_irqoff(void)
-{
-       instrumentation_begin();
-       /* Flush the guest cputime we spent on the guest */
-       vtime_account_kernel(current);
-       current->flags &= ~PF_VCPU;
-       instrumentation_end();
-}
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-
-static inline void guest_exit(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       guest_exit_irqoff();
-       local_irq_restore(flags);
-}
-
 #endif
index 13c8dab..674045c 100644 (file)
@@ -96,6 +96,6 @@ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial);
  * Williams, Ross N., ross<at>ross.net
  * (see URL http://www.ross.net/crc/download/crc_v3.txt).
  */
-u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc);
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc);
 
 #endif /* __CRC8_H_ */
index ac0e5f9..1497132 100644 (file)
@@ -53,7 +53,6 @@ do {                                                  \
                groups_free(group_info);                \
 } while (0)
 
-extern struct group_info init_groups;
 #ifdef CONFIG_MULTIUSER
 extern struct group_info *groups_alloc(int);
 extern void groups_free(struct group_info *);
index 2d3bdcc..21651f9 100644 (file)
@@ -82,16 +82,16 @@ static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
                return 0;
 }
 
-static inline void delayacct_set_flag(int flag)
+static inline void delayacct_set_flag(struct task_struct *p, int flag)
 {
-       if (current->delays)
-               current->delays->flags |= flag;
+       if (p->delays)
+               p->delays->flags |= flag;
 }
 
-static inline void delayacct_clear_flag(int flag)
+static inline void delayacct_clear_flag(struct task_struct *p, int flag)
 {
-       if (current->delays)
-               current->delays->flags &= ~flag;
+       if (p->delays)
+               p->delays->flags &= ~flag;
 }
 
 static inline void delayacct_tsk_init(struct task_struct *tsk)
@@ -114,7 +114,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
 
 static inline void delayacct_blkio_start(void)
 {
-       delayacct_set_flag(DELAYACCT_PF_BLKIO);
+       delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
        if (current->delays)
                __delayacct_blkio_start();
 }
@@ -123,7 +123,7 @@ static inline void delayacct_blkio_end(struct task_struct *p)
 {
        if (p->delays)
                __delayacct_blkio_end(p);
-       delayacct_clear_flag(DELAYACCT_PF_BLKIO);
+       delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
 }
 
 static inline int delayacct_add_tsk(struct taskstats *d,
@@ -166,9 +166,9 @@ static inline void delayacct_thrashing_end(void)
 }
 
 #else
-static inline void delayacct_set_flag(int flag)
+static inline void delayacct_set_flag(struct task_struct *p, int flag)
 {}
-static inline void delayacct_clear_flag(int flag)
+static inline void delayacct_clear_flag(struct task_struct *p, int flag)
 {}
 static inline void delayacct_init(void)
 {}
index 12766ed..c3c88fd 100644 (file)
@@ -145,7 +145,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* Expect random access pattern */
 #define FMODE_RANDOM           ((__force fmode_t)0x1000)
 
-/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
+/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
 #define FMODE_UNSIGNED_OFFSET  ((__force fmode_t)0x2000)
 
 /* File is opened with O_PATH; almost nothing can be done with it */
@@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
  * @nrpages: Number of page entries, protected by the i_pages lock.
- * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
  * @writeback_index: Writeback starts here.
  * @a_ops: Methods.
  * @flags: Error bits and flags (AS_*).
@@ -463,7 +462,6 @@ struct address_space {
        struct rb_root_cached   i_mmap;
        struct rw_semaphore     i_mmap_rwsem;
        unsigned long           nrpages;
-       unsigned long           nrexceptional;
        pgoff_t                 writeback_index;
        const struct address_space_operations *a_ops;
        unsigned long           flags;
index 6cb8230..939b1a8 100644 (file)
@@ -404,4 +404,3 @@ s_fields                                                            \
 
 /* }}}1 */
 #endif /* GENL_MAGIC_FUNC_H */
-/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */
index 35d21fd..f81d489 100644 (file)
@@ -283,4 +283,3 @@ enum {                                                                      \
 
 /* }}}1 */
 #endif /* GENL_MAGIC_STRUCT_H */
-/* vim: set foldmethod=marker nofoldenable : */
index 26f4d90..11da8af 100644 (file)
@@ -490,7 +490,7 @@ static inline int gfp_zonelist(gfp_t flags)
 
 /*
  * We get the zone list from the current node and the gfp_mask.
- * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
+ * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
  * There are two zonelists per node, one for all zones with memory and
  * one containing just zones from the node the zonelist belongs to.
  *
@@ -657,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end,
 extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
                                       int nid, nodemask_t *nodemask);
 #endif
-void free_contig_range(unsigned long pfn, unsigned int nr_pages);
+void free_contig_range(unsigned long pfn, unsigned long nr_pages);
 
 #ifdef CONFIG_CMA
 /* CMA stuff */
index 44170f3..832b49b 100644 (file)
@@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
        kunmap_local(to);
 }
 
+static inline void memzero_page(struct page *page, size_t offset, size_t len)
+{
+       char *addr = kmap_atomic(page);
+       memset(addr + offset, 0, len);
+       kunmap_atomic(addr);
+}
+
 #endif /* _LINUX_HIGHMEM_H */
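The new memzero_page() helper is just a kmap_atomic()/memset()/kunmap_atomic() triple. A standalone analogue, with an ordinary buffer standing in for a highmem page:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* userspace stand-in: the kernel version brackets the memset with
 * kmap_atomic()/kunmap_atomic() to get a temporary mapping */
static void memzero_page(unsigned char *page, size_t offset, size_t len)
{
	memset(page + offset, 0, len);
}

int main(void)
{
	unsigned char page[PAGE_SIZE];

	memset(page, 0xff, sizeof(page));
	memzero_page(page, 128, 64);
	/* bytes 128..191 are now zero, their neighbours untouched */
	printf("%#x %#x %#x %#x\n",
	       page[127], page[128], page[191], page[192]);
	return 0;
}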
index ba973ef..9626fda 100644 (file)
@@ -87,9 +87,6 @@ enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
-#ifdef CONFIG_DEBUG_VM
-       TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
-#endif
 };
 
 struct kobject;
index cccd1aa..b92f25c 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kref.h>
 #include <linux/pgtable.h>
 #include <linux/gfp.h>
+#include <linux/userfaultfd_k.h>
 
 struct ctl_table;
 struct user_struct;
@@ -134,11 +135,14 @@ void hugetlb_show_meminfo(void);
 unsigned long hugetlb_total_pages(void);
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
+#ifdef CONFIG_USERFAULTFD
 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
                                struct vm_area_struct *dst_vma,
                                unsigned long dst_addr,
                                unsigned long src_addr,
+                               enum mcopy_atomic_mode mode,
                                struct page **pagep);
+#endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                vm_flags_t vm_flags);
@@ -152,7 +156,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
 
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud);
 
 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
 
@@ -161,7 +166,7 @@ extern struct list_head huge_boot_pages;
 
 /* arch callbacks */
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz);
 pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz);
@@ -187,6 +192,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -308,16 +314,19 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
        BUG();
 }
 
+#ifdef CONFIG_USERFAULTFD
 static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                                                pte_t *dst_pte,
                                                struct vm_area_struct *dst_vma,
                                                unsigned long dst_addr,
                                                unsigned long src_addr,
+                                               enum mcopy_atomic_mode mode,
                                                struct page **pagep)
 {
        BUG();
        return 0;
 }
+#endif /* CONFIG_USERFAULTFD */
 
 static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
                                        unsigned long sz)
@@ -368,6 +377,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
        return 0;
 }
 
+static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support
@@ -555,6 +566,7 @@ HPAGEFLAG(Freed, freed)
 #define HSTATE_NAME_LEN 32
 /* Defines one hugetlb page size */
 struct hstate {
+       struct mutex resize_lock;
        int next_nid_to_alloc;
        int next_nid_to_free;
        unsigned int order;
@@ -583,6 +595,7 @@ struct huge_bootmem_page {
        struct hstate *hstate;
 };
 
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 struct page *alloc_huge_page(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
@@ -865,6 +878,12 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 #else  /* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
+static inline int isolate_or_dissolve_huge_page(struct page *page,
+                                               struct list_head *list)
+{
+       return -ENOMEM;
+}
+
 static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
@@ -1039,4 +1058,14 @@ static inline __init void hugetlb_cma_check(void)
 }
 #endif
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
+
+#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+/*
+ * Architectures with special requirements for evicting hugetlb-backed TLB
+ * entries can implement this.
+ */
+#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
+#endif
+
 #endif /* _LINUX_HUGETLB_H */
index b2412b4..40fc581 100644 (file)
@@ -25,7 +25,6 @@
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 extern struct nsproxy init_nsproxy;
-extern struct group_info init_groups;
 extern struct cred init_cred;
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
index 85c1571..1bbe9af 100644 (file)
@@ -20,8 +20,10 @@ extern void free_initrd_mem(unsigned long, unsigned long);
 
 #ifdef CONFIG_BLK_DEV_INITRD
 extern void __init reserve_initrd_mem(void);
+extern void wait_for_initramfs(void);
 #else
 static inline void __init reserve_initrd_mem(void) {}
+static inline void wait_for_initramfs(void) {}
 #endif
 
 extern phys_addr_t phys_initrd_start;
index d202fd2..c87d0cb 100644 (file)
@@ -198,7 +198,6 @@ struct iomap_ioend {
        struct inode            *io_inode;      /* file being written to */
        size_t                  io_size;        /* size of the extent */
        loff_t                  io_offset;      /* offset in the file */
-       void                    *io_private;    /* file system private data */
        struct bio              *io_bio;        /* bio being built */
        struct bio              io_inline_bio;  /* MUST BE LAST! */
 };
@@ -234,9 +233,7 @@ struct iomap_writepage_ctx {
 
 void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
 void iomap_ioend_try_merge(struct iomap_ioend *ioend,
-               struct list_head *more_ioends,
-               void (*merge_private)(struct iomap_ioend *ioend,
-                               struct iomap_ioend *next));
+               struct list_head *more_ioends);
 void iomap_sort_ioends(struct list_head *ioend_list);
 int iomap_writepage(struct page *page, struct writeback_control *wbc,
                struct iomap_writepage_ctx *wpc,
index 24a59cb..cc8fa10 100644 (file)
  */
 #define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option))
 
-/*
- * IF_ENABLED(CONFIG_FOO, ptr) evaluates to (ptr) if CONFIG_FOO is set to 'y'
- * or 'm', NULL otherwise.
- */
-#define IF_ENABLED(option, ptr) (IS_ENABLED(option) ? (ptr) : NULL)
-
 #endif /* __LINUX_KCONFIG_H */
index 5b7ed6d..15d8bad 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_KERNEL_H
 
 #include <stdarg.h>
+#include <linux/align.h>
 #include <linux/limits.h>
 #include <linux/linkage.h>
 #include <linux/stddef.h>
  */
 #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
 
-/* @a is a power of 2 value */
-#define ALIGN(x, a)            __ALIGN_KERNEL((x), (a))
-#define ALIGN_DOWN(x, a)       __ALIGN_KERNEL((x) - ((a) - 1), (a))
-#define __ALIGN_MASK(x, mask)  __ALIGN_KERNEL_MASK((x), (mask))
-#define PTR_ALIGN(p, a)                ((typeof(p))ALIGN((unsigned long)(p), (a)))
-#define PTR_ALIGN_DOWN(p, a)   ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
-#define IS_ALIGNED(x, a)               (((x) & ((typeof(x))(a) - 1)) == 0)
-
 /* generic data direction definitions */
 #define READ                   0
 #define WRITE                  1
@@ -48,6 +41,8 @@
  */
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 
+#define PTR_IF(cond, ptr)      ((cond) ? (ptr) : NULL)
+
 #define u64_to_user_ptr(x) (           \
 {                                      \
        typecheck(u64, (x));            \
index 8895b95..2f34487 100644 (file)
@@ -338,6 +338,51 @@ struct kvm_vcpu {
        struct kvm_dirty_ring dirty_ring;
 };
 
+/* must be called with irqs disabled */
+static __always_inline void guest_enter_irqoff(void)
+{
+       /*
+        * This is running in ioctl context, so it's safe to assume that it's
+        * the stime pending cputime to flush.
+        */
+       instrumentation_begin();
+       vtime_account_guest_enter();
+       instrumentation_end();
+
+       /*
+        * KVM does not hold any references to RCU-protected data when it
+        * switches the CPU into guest mode. In fact, switching to guest mode
+        * is very similar to exiting to userspace from an RCU point of view.
+        * In addition, the CPU may stay in guest mode for quite a long time
+        * (up to one time slice). Let's treat guest mode as a quiescent
+        * state, just like we do with user-mode execution.
+        */
+       if (!context_tracking_guest_enter()) {
+               instrumentation_begin();
+               rcu_virt_note_context_switch(smp_processor_id());
+               instrumentation_end();
+       }
+}
+
+static __always_inline void guest_exit_irqoff(void)
+{
+       context_tracking_guest_exit();
+
+       instrumentation_begin();
+       /* Flush the guest cputime we spent on the guest */
+       vtime_account_guest_exit();
+       instrumentation_end();
+}
+
+static inline void guest_exit(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       guest_exit_irqoff();
+       local_irq_restore(flags);
+}
+
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 {
        /*
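A userspace model (illustration only; none of this is KVM code) of the guest_enter/guest_exit pairing that now lives in kvm_host.h: the flags merely mimic the irq and accounting state, and guest_exit() is just the irq-safe wrapper, so exit paths that may run with interrupts enabled still get the vtime bookkeeping.

#include <stdbool.h>
#include <stdio.h>

static bool irqs_enabled = true;
static bool in_guest;

static void guest_enter_irqoff(void) { in_guest = true; }  /* vtime enter */
static void guest_exit_irqoff(void)  { in_guest = false; } /* vtime exit  */

/* irq-safe wrapper, mirroring the local_irq_save/restore bracket */
static void guest_exit(void)
{
	bool flags = irqs_enabled;	/* local_irq_save() */

	irqs_enabled = false;
	guest_exit_irqoff();
	irqs_enabled = flags;		/* local_irq_restore() */
}

int main(void)
{
	irqs_enabled = false;		/* arch entry code runs irqs-off */
	guest_enter_irqoff();
	printf("in guest: %d\n", in_guest);

	irqs_enabled = true;		/* some exit paths re-enable irqs */
	guest_exit();			/* safe from either irq state */
	printf("in guest: %d (irqs on: %d)\n", in_guest, irqs_enabled);
	return 0;
}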
index 5904716..c193be7 100644 (file)
@@ -114,12 +114,13 @@ struct batched_lruvec_stat {
 };
 
 /*
- * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
- * which have elements charged to this memcg.
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to this memcg.
  */
-struct memcg_shrinker_map {
+struct shrinker_info {
        struct rcu_head rcu;
-       unsigned long map[];
+       atomic_long_t *nr_deferred;
+       unsigned long *map;
 };
 
 /*
@@ -145,7 +146,7 @@ struct mem_cgroup_per_node {
 
        struct mem_cgroup_reclaim_iter  iter;
 
-       struct memcg_shrinker_map __rcu *shrinker_map;
+       struct shrinker_info __rcu      *shrinker_info;
 
        struct rb_node          tree_node;      /* RB tree node */
        unsigned long           usage_in_excess;/* Set to the value by which */
@@ -1610,10 +1611,10 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
        return false;
 }
 
-extern int memcg_expand_shrinker_maps(int new_id);
-
-extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
-                                  int nid, int shrinker_id);
+int alloc_shrinker_info(struct mem_cgroup *memcg);
+void free_shrinker_info(struct mem_cgroup *memcg);
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
+void reparent_shrinker_deferred(struct mem_cgroup *memcg);
 #else
 #define mem_cgroup_sockets_enabled 0
 static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1623,8 +1624,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
        return false;
 }
 
-static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
-                                         int nid, int shrinker_id)
+static inline void set_shrinker_bit(struct mem_cgroup *memcg,
+                                   int nid, int shrinker_id)
 {
 }
 #endif
index 4da95e6..97e92e8 100644 (file)
@@ -29,6 +29,11 @@ struct memory_block {
        int online_type;                /* for passing data to online routine */
        int nid;                        /* NID for this memory block */
        struct device dev;
+       /*
+        * Number of vmemmap pages. These pages
+        * lie at the beginning of the memory block.
+        */
+       unsigned long nr_vmemmap_pages;
 };
 
 int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v)
 #else
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
-int create_memory_block_devices(unsigned long start, unsigned long size);
+int create_memory_block_devices(unsigned long start, unsigned long size,
+                               unsigned long vmemmap_pages);
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
index 7288aa5..28f32fd 100644 (file)
@@ -56,6 +56,14 @@ typedef int __bitwise mhp_t;
 #define MHP_MERGE_RESOURCE     ((__force mhp_t)BIT(0))
 
 /*
+ * We want the memmap (struct page array) to be self-contained.
+ * To do so, we will use the beginning of the hot-added range to build
+ * the page tables for the memmap array that describes the entire range.
+ * Only selected architectures support it with SPARSE_VMEMMAP.
+ */
+#define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
+
+/*
  * Extended parameters for memory hotplug:
  * altmap: alternative allocator for memmap array (optional)
  * pgprot: page protection flags to apply to newly created page tables
@@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone)
 extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
 extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+extern void adjust_present_page_count(struct zone *zone, long nr_pages);
 /* VM interface that may be used by firmware interface */
+extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+                                    struct zone *zone);
+extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
-                       int online_type, int nid);
+                       struct zone *zone);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
                                         unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
                                      struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
+extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
index f5b464d..45a79da 100644 (file)
@@ -17,7 +17,7 @@ struct device;
  * @alloc: track pages consumed, private to vmemmap_populate()
  */
 struct vmem_altmap {
-       const unsigned long base_pfn;
+       unsigned long base_pfn;
        const unsigned long end_pfn;
        const unsigned long reserve;
        unsigned long free;
index 3a38963..4bb4e51 100644 (file)
@@ -27,6 +27,7 @@ enum migrate_reason {
        MR_MEMPOLICY_MBIND,
        MR_NUMA_MISPLACED,
        MR_CONTIG_RANGE,
+       MR_LONGTERM_PIN,
        MR_TYPES
 };
 
@@ -43,10 +44,7 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
                unsigned long private, enum migrate_mode mode, int reason);
 extern struct page *alloc_migration_target(struct page *page, unsigned long private);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
-extern void putback_movable_page(struct page *page);
 
-extern void migrate_prep(void);
-extern void migrate_prep_local(void);
 extern void migrate_page_states(struct page *newpage, struct page *page);
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
@@ -66,9 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page,
 static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
        { return -EBUSY; }
 
-static inline int migrate_prep(void) { return -ENOSYS; }
-static inline int migrate_prep_local(void) { return -ENOSYS; }
-
 static inline void migrate_page_states(struct page *newpage, struct page *page)
 {
 }
index 011f436..322ec61 100644 (file)
@@ -106,7 +106,7 @@ extern int mmap_rnd_compat_bits __read_mostly;
  * embedding these tags into addresses that point to these memory regions, and
  * checking that the memory and the pointer tags match on memory accesses)
  * redefine this macro to strip tags from pointers.
- * It's defined as noop for arcitectures that don't support memory tagging.
+ * It's defined as noop for architectures that don't support memory tagging.
  */
 #ifndef untagged_addr
 #define untagged_addr(addr) (addr)
@@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_GROWSUP    VM_NONE
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define VM_UFFD_MINOR_BIT     37
+# define VM_UFFD_MINOR         BIT(VM_UFFD_MINOR_BIT)  /* UFFD minor faults */
+#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+# define VM_UFFD_MINOR         VM_NONE
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP      (VM_RAND_READ | VM_SEQ_READ)
 
@@ -1134,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page)
 }
 #endif
 
+static inline bool is_zone_movable_page(const struct page *page)
+{
+       return page_zonenum(page) == ZONE_MOVABLE;
+}
+
 #ifdef CONFIG_DEV_PAGEMAP_OPS
 void free_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
@@ -1543,6 +1555,20 @@ static inline unsigned long page_to_section(const struct page *page)
 }
 #endif
 
+/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
+#ifdef CONFIG_MIGRATION
+static inline bool is_pinnable_page(struct page *page)
+{
+       return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
+               is_zero_pfn(page_to_pfn(page));
+}
+#else
+static inline bool is_pinnable_page(struct page *page)
+{
+       return true;
+}
+#endif
+
 static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
        page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
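A compilable toy version of the is_pinnable_page() policy introduced above (the struct fields are invented; the kernel derives them from the zone and the pageblock migratetype): long-term pins are refused for ZONE_MOVABLE and CMA pages, with the never-migrated zero page as the one exception.

#include <stdbool.h>
#include <stdio.h>

/* invented flags; the kernel checks the zone and migratetype instead */
struct page { bool movable_zone, cma, zero_page; };

static bool is_pinnable_page(const struct page *page)
{
	return !(page->movable_zone || page->cma) || page->zero_page;
}

int main(void)
{
	struct page normal = { 0 };
	struct page movable = { .movable_zone = true };
	struct page zero = { .movable_zone = true, .zero_page = true };

	printf("normal=%d movable=%d zero=%d\n",
	       is_pinnable_page(&normal),
	       is_pinnable_page(&movable),
	       is_pinnable_page(&zero));	/* 1 0 1 */
	return 0;
}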
index 3b22057..0d53eba 100644 (file)
@@ -55,7 +55,7 @@ enum migratetype {
         * pageblocks to MIGRATE_CMA which can be done by
         * __free_pageblock_cma() function.  What is important though
         * is that a range of pageblocks must be aligned to
-        * MAX_ORDER_NR_PAGES should biggest page be bigger then
+        * MAX_ORDER_NR_PAGES should biggest page be bigger than
         * a single pageblock.
         */
        MIGRATE_CMA,
@@ -407,8 +407,13 @@ enum zone_type {
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
-        *    essentially turn such pages unmovable. Memory offlining might
-        *    retry a long time.
+        *    essentially turn such pages unmovable. Therefore, we do not allow
+        *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
+        *    faulted, they come from the right zone right away. However, it is
+        *    still possible that the address space already has pages in
+        *    ZONE_MOVABLE at the time the pages are pinned (i.e. the user
+        *    touched that memory before pinning). In that case we migrate
+        *    them to a different zone; when migration fails, pinning fails.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
@@ -427,6 +432,15 @@ enum zone_type {
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
+        * 6. ZERO_PAGE(0): kernelcore/movablecore setups might create
+        *    situations where ZERO_PAGE(0), which is allocated differently
+        *    on different platforms, may end up in a movable zone. ZERO_PAGE(0)
+        *    cannot be migrated.
+        * 7. Memory hotplug: when using memmap_on_memory and onlining the
+        *    memory to the MOVABLE zone, the vmemmap pages are also placed in
+        *    that zone. Such pages cannot really be moved around as they are
+        *    self-stored in the range, but they are treated as movable when
+        *    the range they describe is about to be offlined.
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
@@ -1383,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
 #endif
-#endif
 
 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
index 2aab961..4f9a4b3 100644 (file)
@@ -53,8 +53,7 @@ int arpt_register_table(struct net *net, const struct xt_table *table,
                        const struct arpt_replace *repl,
                        const struct nf_hook_ops *ops);
 void arpt_unregister_table(struct net *net, const char *name);
-void arpt_unregister_table_pre_exit(struct net *net, const char *name,
-                                   const struct nf_hook_ops *ops);
+void arpt_unregister_table_pre_exit(struct net *net, const char *name);
 extern unsigned int arpt_do_table(struct sk_buff *skb,
                                  const struct nf_hook_state *state,
                                  struct xt_table *table);
index 5b4c67c..15004c4 100644 (file)
@@ -452,6 +452,7 @@ enum lock_type4 {
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
 #define FATTR4_WORD2_CLONE_BLKSIZE     (1UL << 13)
+#define FATTR4_WORD2_CHANGE_ATTR_TYPE  (1UL << 15)
 #define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
 #define FATTR4_WORD2_MODE_UMASK                (1UL << 17)
 #define FATTR4_WORD2_XATTR_SUPPORT     (1UL << 18)
@@ -709,6 +710,14 @@ struct nl4_server {
        } u;
 };
 
+enum nfs4_change_attr_type {
+       NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0,
+       NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1,
+       NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2,
+       NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3,
+       NFS4_CHANGE_TYPE_IS_UNDEFINED = 4,
+};
+
 /*
  * Options for setxattr. These match the flags for setxattr(2).
  */
index eadaabd..ffba254 100644 (file)
@@ -246,11 +246,15 @@ struct nfs4_copy_state {
                                BIT(13)         /* Deferred cache invalidation */
 #define NFS_INO_INVALID_BLOCKS BIT(14)         /* cached blocks are invalid */
 #define NFS_INO_INVALID_XATTR  BIT(15)         /* xattrs are invalid */
+#define NFS_INO_INVALID_NLINK  BIT(16)         /* cached nlink is invalid */
+#define NFS_INO_INVALID_MODE   BIT(17)         /* cached mode is invalid */
 
 #define NFS_INO_INVALID_ATTR   (NFS_INO_INVALID_CHANGE \
                | NFS_INO_INVALID_CTIME \
                | NFS_INO_INVALID_MTIME \
                | NFS_INO_INVALID_SIZE \
+               | NFS_INO_INVALID_NLINK \
+               | NFS_INO_INVALID_MODE \
                | NFS_INO_INVALID_OTHER)        /* inode metadata is invalid */
 
 /*
@@ -386,7 +390,7 @@ extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct user_namespace *, struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_attribute_cache_expired(struct inode *inode);
-extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_clear_invalid_mapping(struct address_space *mapping);
 extern bool nfs_mapping_need_revalidate_inode(struct inode *inode);
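With nfs_revalidate_inode() now taking the invalidation flags directly, a caller can revalidate a narrow set of attributes. A hedged sketch (which flags a real caller passes depends on context):

	/* Revalidate only the attributes a chmod/link operation can change. */
	error = nfs_revalidate_inode(inode,
				     NFS_INO_INVALID_MODE | NFS_INO_INVALID_NLINK);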
index a28d71b..d71a0e9 100644 (file)
@@ -156,6 +156,7 @@ struct nfs_server {
 #define NFS_MOUNT_WRITE_EAGER          0x01000000
 #define NFS_MOUNT_WRITE_WAIT           0x02000000
 
+       unsigned int            fattr_valid;    /* Valid attributes */
        unsigned int            caps;           /* server capabilities */
        unsigned int            rsize;          /* read size */
        unsigned int            rpages;         /* read size (in pages) */
@@ -180,6 +181,9 @@ struct nfs_server {
 #define NFS_OPTION_FSCACHE     0x00000001      /* - local caching enabled */
 #define NFS_OPTION_MIGRATION   0x00000002      /* - NFSv4 migration enabled */
 
+       enum nfs4_change_attr_type
+                               change_attr_type;/* Description of change attribute */
+
        struct nfs_fsid         fsid;
        __u64                   maxfilesize;    /* maximum file size */
        struct timespec64       time_delta;     /* smallest time granularity */
@@ -265,16 +269,7 @@ struct nfs_server {
 #define NFS_CAP_SYMLINKS       (1U << 2)
 #define NFS_CAP_ACLS           (1U << 3)
 #define NFS_CAP_ATOMIC_OPEN    (1U << 4)
-/* #define NFS_CAP_CHANGE_ATTR (1U << 5) */
 #define NFS_CAP_LGOPEN         (1U << 5)
-#define NFS_CAP_FILEID         (1U << 6)
-#define NFS_CAP_MODE           (1U << 7)
-#define NFS_CAP_NLINK          (1U << 8)
-#define NFS_CAP_OWNER          (1U << 9)
-#define NFS_CAP_OWNER_GROUP    (1U << 10)
-#define NFS_CAP_ATIME          (1U << 11)
-#define NFS_CAP_CTIME          (1U << 12)
-#define NFS_CAP_MTIME          (1U << 13)
 #define NFS_CAP_POSIX_LOCK     (1U << 14)
 #define NFS_CAP_UIDGID_NOMAP   (1U << 15)
 #define NFS_CAP_STATEID_NFSV41 (1U << 16)
index 3327239..717ecc8 100644 (file)
@@ -15,6 +15,8 @@
 #define NFS_DEF_FILE_IO_SIZE   (4096U)
 #define NFS_MIN_FILE_IO_SIZE   (1024U)
 
+#define NFS_BITMASK_SZ         3
+
 struct nfs4_string {
        unsigned int len;
        char *data;
@@ -150,6 +152,8 @@ struct nfs_fsinfo {
        __u32                   layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */
        __u32                   blksize; /* preferred pnfs io block size */
        __u32                   clone_blksize; /* granularity of a CLONE operation */
+       enum nfs4_change_attr_type
+                               change_attr_type; /* Info about change attr */
        __u32                   xattr_support; /* User xattrs supported */
 };
 
@@ -525,7 +529,8 @@ struct nfs_closeargs {
        struct nfs_seqid *      seqid;
        fmode_t                 fmode;
        u32                     share_access;
-       u32 *                   bitmask;
+       const u32 *             bitmask;
+       u32                     bitmask_store[NFS_BITMASK_SZ];
        struct nfs4_layoutreturn_args *lr_args;
 };
 
@@ -608,7 +613,8 @@ struct nfs4_delegreturnargs {
        struct nfs4_sequence_args       seq_args;
        const struct nfs_fh *fhandle;
        const nfs4_stateid *stateid;
-       u32 * bitmask;
+       const u32 *bitmask;
+       u32 bitmask_store[NFS_BITMASK_SZ];
        struct nfs4_layoutreturn_args *lr_args;
 };
 
@@ -648,7 +654,8 @@ struct nfs_pgio_args {
        union {
                unsigned int            replen;                 /* used by read */
                struct {
-                       u32 *                   bitmask;        /* used by write */
+                       const u32 *             bitmask;        /* used by write */
+                       u32 bitmask_store[NFS_BITMASK_SZ];      /* used by write */
                        enum nfs3_stable_how    stable;         /* used by write */
                };
        };
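The new bitmask_store arrays give each request a private copy of the attribute bitmask for the const pointer to refer to. An illustrative fill pattern; the attr_bitmask source shown is an assumption, not code from this patch:

	args->bitmask_store[0] = server->attr_bitmask[0];	/* assumed source */
	args->bitmask_store[1] = server->attr_bitmask[1];
	args->bitmask_store[2] = server->attr_bitmask[2];
	args->bitmask = args->bitmask_store;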
index 469fa7f..a4bd411 100644 (file)
 
 struct pagevec;
 
+static inline bool mapping_empty(struct address_space *mapping)
+{
+       return xa_empty(&mapping->i_pages);
+}
+
 /*
  * Bits in mapping->flags.
  */
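A sketch of the new mapping_empty() helper in use; the surrounding fast path is illustrative only:

	/* Cheap check: does this address_space cache any pages at all? */
	if (mapping_empty(inode->i_mapping))
		return;		/* nothing to truncate or write back */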
index 5e77239..46b1378 100644 (file)
@@ -426,7 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 
 /*
  * On some architectures hardware does not set page access bit when accessing
- * memory page, it is responsibilty of software setting this bit. It brings
+ * memory page, it is the responsibility of software to set this bit. It brings
 * an extra page fault penalty to track the page access bit. For optimization, the
 * page access bit can be set during all page fault flows on these arches.
 * To differentiate it from the macro pte_mkyoung, this macro is used on platforms
@@ -519,7 +519,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 /*
  * This is an implementation of pmdp_establish() that is only suitable for an
  * architecture that doesn't have hardware dirty/accessed bits. In this case we
- * can't race with CPU which sets these bits and non-atomic aproach is fine.
+ * can't race with the CPU that sets these bits, so a non-atomic approach is fine.
  */
 static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
@@ -852,7 +852,7 @@ static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
  * updates, but to prevent any updates it may make from being lost.
  *
  * This does not protect against other software modifications of the
- * pte; the appropriate pte lock must be held over the transation.
+ * pte; the appropriate pte lock must be held over the transaction.
  *
  * Note that this interface is intended to be batchable, meaning that
  * ptep_modify_prot_commit may not actually update the pte, but merely
@@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 extern void untrack_pfn_moved(struct vm_area_struct *vma);
 #endif
 
+#ifdef CONFIG_MMU
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
 {
@@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr)
        return zero_pfn;
 }
 #endif
+#else
+static inline int is_zero_pfn(unsigned long pfn)
+{
+       return 0;
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+       return 0;
+}
+#endif /* CONFIG_MMU */
 
 #ifdef CONFIG_MMU
 
@@ -1269,13 +1281,13 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
         *
         * The complete check uses is_pmd_migration_entry() in linux/swapops.h
         * But using that requires moving current function and pmd_trans_unstable()
-        * to linux/swapops.h to resovle dependency, which is too much code move.
+        * to linux/swapops.h to resolve the dependency, which is too much code movement.
         *
         * !pmd_present() is equivalent to is_pmd_migration_entry() currently,
         * because !pmd_present() pages can only be under migration not swapped
         * out.
         *
-        * pmd_none() is preseved for future condition checks on pmd migration
+        * pmd_none() is preserved for future condition checks on pmd migration
         * entries, and to avoid confusion with this function name, although it is
         * redundant with !pmd_present().
         */
index 6035d9a..45f53af 100644 (file)
@@ -5679,6 +5679,7 @@ enum tcpc_cc_polarity {
 
 #define PD_STATUS_EVENT_SOP_DISC_DONE          BIT(0)
 #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE    BIT(1)
+#define PD_STATUS_EVENT_HARD_RESET             BIT(2)
 
 struct ec_params_typec_status {
        uint8_t port;
index 000cc05..069c7fd 100644 (file)
@@ -32,6 +32,7 @@ struct proc_ops {
        ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *);
        ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *);
        ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *);
+       /* mandatory unless nonseekable_open() or equivalent is used */
        loff_t  (*proc_lseek)(struct file *, loff_t, int);
        int     (*proc_release)(struct inode *, struct file *);
        __poll_t (*proc_poll)(struct file *, struct poll_table_struct *);
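With proc_lseek now documented as mandatory, a minimal proc_ops table might look like the sketch below; example_proc_open is hypothetical, while seq_read/seq_lseek/single_release are the usual seq_file helpers:

	static const struct proc_ops example_proc_ops = {
		.proc_open	= example_proc_open,	/* hypothetical single_open() wrapper */
		.proc_read	= seq_read,
		.proc_lseek	= seq_lseek,		/* mandatory unless the open is nonseekable */
		.proc_release	= single_release,
	};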
index bad18ca..fd18ca9 100644 (file)
@@ -15,7 +15,6 @@
 #define KVM_PROFILING  4
 
 struct proc_dir_entry;
-struct pt_regs;
 struct notifier_block;
 
 #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS)
@@ -84,8 +83,6 @@ int task_handoff_unregister(struct notifier_block * n);
 int profile_event_register(enum profile_type, struct notifier_block * n);
 int profile_event_unregister(enum profile_type, struct notifier_block * n);
 
-struct pt_regs;
-
 #else
 
 #define prof_on 0
index 9c25c8e..d2c8813 100644 (file)
@@ -1583,7 +1583,7 @@ extern struct pid *cad_pid;
 #define PF_SWAPWRITE           0x00800000      /* Allowed to write to swap */
 #define PF_NO_SETAFFINITY      0x04000000      /* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY           0x08000000      /* Early kill for mce process policy */
-#define PF_MEMALLOC_NOCMA      0x10000000      /* All allocation request will have _GFP_MOVABLE cleared */
+#define PF_MEMALLOC_PIN                0x10000000      /* Allocation context constrained to zones which allow long term pinning. */
 #define PF_FREEZER_SKIP                0x40000000      /* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
index 90b2a0b..e24b1fe 100644 (file)
@@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk)
  * Applies per-task gfp context to the given allocation flags.
  * PF_MEMALLOC_NOIO implies GFP_NOIO
  * PF_MEMALLOC_NOFS implies GFP_NOFS
+ * PF_MEMALLOC_PIN  implies !GFP_MOVABLE
  */
 static inline gfp_t current_gfp_context(gfp_t flags)
 {
        unsigned int pflags = READ_ONCE(current->flags);
 
-       if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
+       if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
                /*
                 * NOIO implies both NOIO and NOFS and it is a weaker context
                 * so always make sure it makes precedence
@@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags)
                        flags &= ~(__GFP_IO | __GFP_FS);
                else if (pflags & PF_MEMALLOC_NOFS)
                        flags &= ~__GFP_FS;
+
+               if (pflags & PF_MEMALLOC_PIN)
+                       flags &= ~__GFP_MOVABLE;
        }
        return flags;
 }
@@ -271,29 +275,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
        current->flags = (current->flags & ~PF_MEMALLOC) | flags;
 }
 
-#ifdef CONFIG_CMA
-static inline unsigned int memalloc_nocma_save(void)
+static inline unsigned int memalloc_pin_save(void)
 {
-       unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
+       unsigned int flags = current->flags & PF_MEMALLOC_PIN;
 
-       current->flags |= PF_MEMALLOC_NOCMA;
+       current->flags |= PF_MEMALLOC_PIN;
        return flags;
 }
 
-static inline void memalloc_nocma_restore(unsigned int flags)
-{
-       current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
-}
-#else
-static inline unsigned int memalloc_nocma_save(void)
-{
-       return 0;
-}
-
-static inline void memalloc_nocma_restore(unsigned int flags)
+static inline void memalloc_pin_restore(unsigned int flags)
 {
+       current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
 }
-#endif
 
 #ifdef CONFIG_MEMCG
 DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
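The renamed helpers keep the usual save/modify/restore scoping pattern. A minimal sketch; the allocation in the middle is illustrative:

	unsigned int flags = memalloc_pin_save();

	/*
	 * Allocations in this scope lose __GFP_MOVABLE via
	 * current_gfp_context(), so pages come from zones that
	 * permit long-term pinning.
	 */
	page = alloc_page(GFP_HIGHUSER_MOVABLE);

	memalloc_pin_restore(flags);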
index 0f80123..1eac79c 100644 (file)
@@ -79,13 +79,14 @@ struct shrinker {
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
 /* Flags */
-#define SHRINKER_NUMA_AWARE    (1 << 0)
-#define SHRINKER_MEMCG_AWARE   (1 << 1)
+#define SHRINKER_REGISTERED    (1 << 0)
+#define SHRINKER_NUMA_AWARE    (1 << 1)
+#define SHRINKER_MEMCG_AWARE   (1 << 2)
 /*
  * It just makes sense when the shrinker is also MEMCG_AWARE for now,
  * non-MEMCG_AWARE shrinker should not have this flag set.
  */
-#define SHRINKER_NONSLAB       (1 << 2)
+#define SHRINKER_NONSLAB       (1 << 3)
 
 extern int prealloc_shrinker(struct shrinker *shrinker);
 extern void register_shrinker_prepared(struct shrinker *shrinker);
index 84a0b48..510519e 100644 (file)
@@ -53,7 +53,15 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
                           void *info, bool wait, const struct cpumask *mask);
 
-int smp_call_function_single_async(int cpu, call_single_data_t *csd);
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
+
+/*
+ * CPU-stopping functions used during panic. All have default weak
+ * definitions. Architecture-dependent code may override them.
+ */
+void panic_smp_self_stop(void);
+void nmi_panic_self_stop(struct pt_regs *regs);
+void crash_smp_send_stop(void);
 
 /*
  * Call a function on all processors
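An architecture can override the weak defaults declared above; a hedged sketch of one such override (the busy-wait body is illustrative):

	void panic_smp_self_stop(void)
	{
		/* Park this CPU for good once panic() has been handled. */
		local_irq_disable();
		while (1)
			cpu_relax();
	}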
index d2e97ee..d81fe8b 100644 (file)
@@ -247,6 +247,7 @@ struct rpc_xprt {
        struct rpc_task *       snd_task;       /* Task blocked in send */
 
        struct list_head        xmit_queue;     /* Send queue */
+       atomic_long_t           xmit_queuelen;
 
        struct svc_xprt         *bc_xprt;       /* NFSv4.1 backchannel */
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
index 4cc6ec3..1447270 100644 (file)
 #include <linux/sched.h>
 #include <linux/node.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
+#include <uapi/linux/mempolicy.h>
 #include <asm/page.h>
 
 struct notifier_block;
@@ -339,6 +341,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file,
 extern void lru_note_cost_page(struct page *);
 extern void lru_cache_add(struct page *);
 extern void mark_page_accessed(struct page *);
+
+extern atomic_t lru_disable_count;
+
+static inline bool lru_cache_disabled(void)
+{
+       return atomic_read(&lru_disable_count);
+}
+
+static inline void lru_cache_enable(void)
+{
+       atomic_dec(&lru_disable_count);
+}
+
+extern void lru_cache_disable(void);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
@@ -378,6 +394,12 @@ extern int sysctl_min_slab_ratio;
 #define node_reclaim_mode 0
 #endif
 
+static inline bool node_reclaim_enabled(void)
+{
+       /* Is any node_reclaim_mode bit set? */
+       return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
 extern void check_move_unevictable_pages(struct pagevec *pvec);
 
 extern int kswapd_run(int nid);
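The new lru_cache_disable()/lru_cache_enable() pair is meant to bracket sections that cannot tolerate per-CPU LRU batching, such as migration of candidate pages. A sketch of the intended pairing; the work in the middle is illustrative:

	lru_cache_disable();	/* drain and suppress per-CPU LRU caches */

	/* ... isolate and migrate the target pages ... */

	lru_cache_enable();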
index a8e5f3e..794d153 100644 (file)
@@ -17,6 +17,9 @@
 #include <linux/mm.h>
 #include <asm-generic/pgtable_uffd.h>
 
+/* The set of all possible UFFD-related VM flags. */
+#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)
+
 /*
  * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
  * new flags, since they might collide with O_* ones. We want
@@ -34,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
 
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
+/*
+ * The mode of operation for __mcopy_atomic and its helpers.
+ *
+ * This is almost an implementation detail (mcopy_atomic below doesn't take this
+ * as a parameter), but it's exposed here because memory-kind-specific
+ * implementations (e.g. hugetlbfs) need to know the mode of operation.
+ */
+enum mcopy_atomic_mode {
+       /* A normal copy_from_user into the destination range. */
+       MCOPY_ATOMIC_NORMAL,
+       /* Don't copy; map the destination range to the zero page. */
+       MCOPY_ATOMIC_ZEROPAGE,
+       /* Just install pte(s) with the existing page(s) in the page cache. */
+       MCOPY_ATOMIC_CONTINUE,
+};
+
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                            unsigned long src_start, unsigned long len,
                            bool *mmap_changing, __u64 mode);
@@ -41,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
                              unsigned long dst_start,
                              unsigned long len,
                              bool *mmap_changing);
+extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+                             unsigned long len, bool *mmap_changing);
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
                               unsigned long start, unsigned long len,
                               bool enable_wp, bool *mmap_changing);
@@ -52,6 +73,22 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
        return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
 }
 
+/*
+ * Never enable huge pmd sharing on certain uffd-registered vmas:
+ *
+ * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry.
+ *
+ * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for
+ *   VMAs which share huge pmds. (If you have two mappings to the same
+ *   underlying pages, and fault in the non-UFFD-registered one with a write,
+ *   with huge pmd sharing this would *also* set up the second UFFD-registered
+ *   mapping, and we'd not get minor faults.)
+ */
+static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
+}
+
 static inline bool userfaultfd_missing(struct vm_area_struct *vma)
 {
        return vma->vm_flags & VM_UFFD_MISSING;
@@ -62,6 +99,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
        return vma->vm_flags & VM_UFFD_WP;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_UFFD_MINOR;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
                                      pte_t pte)
 {
@@ -76,7 +118,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
 
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
-       return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+       return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
 extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
@@ -123,6 +165,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
        return false;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+       return false;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
                                      pte_t pte)
 {
index 18e7597..ae0dd19 100644 (file)
@@ -71,6 +71,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_HUGETLB_PAGE
                HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
 #endif
+#ifdef CONFIG_CMA
+               CMA_ALLOC_SUCCESS,
+               CMA_ALLOC_FAIL,
+#endif
                UNEVICTABLE_PGCULLED,   /* culled to noreclaim list */
                UNEVICTABLE_PGSCANNED,  /* scanned for reclaimability */
                UNEVICTABLE_PGRESCUED,  /* rescued from noreclaim list */
@@ -121,6 +125,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                SWAP_RA,
                SWAP_RA_HIT,
 #endif
+#ifdef CONFIG_X86
+               DIRECT_MAP_LEVEL2_SPLIT,
+               DIRECT_MAP_LEVEL3_SPLIT,
+#endif
                NR_VM_EVENT_ITEMS
 };
 
index 394d03c..4d668ab 100644 (file)
@@ -33,7 +33,7 @@ struct notifier_block;                /* in notifier.h */
  *
  * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
  * shadow memory has been mapped. It's used to handle allocation errors so that
- * we don't try to poision shadow on free if it was never allocated.
+ * we don't try to poison shadow on free if it was never allocated.
  *
  * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to
  * determine which allocations need the module shadow freed.
@@ -43,7 +43,7 @@ struct notifier_block;                /* in notifier.h */
 
 /*
  * Maximum alignment for ioremap() regions.
- * Can be overriden by arch-specific value.
+ * Can be overridden by arch-specific value.
  */
 #ifndef IOREMAP_MAX_ORDER
 #define IOREMAP_MAX_ORDER      (7 + PAGE_SHIFT)        /* 128 pages */
@@ -227,9 +227,8 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 #endif
 
-/* for /dev/kmem */
+/* for /proc/kcore */
 extern long vread(char *buf, char *addr, unsigned long count);
-extern long vwrite(char *buf, char *addr, unsigned long count);
 
 /*
 *     Internals.  Don't use..
index 041d652..3684487 100644 (file)
@@ -3,12 +3,46 @@
 #define _LINUX_KERNEL_VTIME_H
 
 #include <linux/context_tracking_state.h>
+#include <linux/sched.h>
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 #include <asm/vtime.h>
 #endif
 
+/*
+ * Common vtime APIs
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+extern void vtime_account_kernel(struct task_struct *tsk);
+extern void vtime_account_idle(struct task_struct *tsk);
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
-struct task_struct;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+extern void arch_vtime_task_switch(struct task_struct *tsk);
+extern void vtime_user_enter(struct task_struct *tsk);
+extern void vtime_user_exit(struct task_struct *tsk);
+extern void vtime_guest_enter(struct task_struct *tsk);
+extern void vtime_guest_exit(struct task_struct *tsk);
+extern void vtime_init_idle(struct task_struct *tsk, int cpu);
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
+static inline void vtime_user_enter(struct task_struct *tsk) { }
+static inline void vtime_user_exit(struct task_struct *tsk) { }
+static inline void vtime_guest_enter(struct task_struct *tsk) { }
+static inline void vtime_guest_exit(struct task_struct *tsk) { }
+static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
+#endif
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
+extern void vtime_account_softirq(struct task_struct *tsk);
+extern void vtime_account_hardirq(struct task_struct *tsk);
+extern void vtime_flush(struct task_struct *tsk);
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
+static inline void vtime_account_softirq(struct task_struct *tsk) { }
+static inline void vtime_account_hardirq(struct task_struct *tsk) { }
+static inline void vtime_flush(struct task_struct *tsk) { }
+#endif
 
 /*
  * vtime_accounting_enabled_this_cpu() definitions/declarations
@@ -18,6 +52,18 @@ struct task_struct;
 static inline bool vtime_accounting_enabled_this_cpu(void) { return true; }
 extern void vtime_task_switch(struct task_struct *prev);
 
+static __always_inline void vtime_account_guest_enter(void)
+{
+       vtime_account_kernel(current);
+       current->flags |= PF_VCPU;
+}
+
+static __always_inline void vtime_account_guest_exit(void)
+{
+       vtime_account_kernel(current);
+       current->flags &= ~PF_VCPU;
+}
+
 #elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)
 
 /*
@@ -49,49 +95,37 @@ static inline void vtime_task_switch(struct task_struct *prev)
                vtime_task_switch_generic(prev);
 }
 
+static __always_inline void vtime_account_guest_enter(void)
+{
+       if (vtime_accounting_enabled_this_cpu())
+               vtime_guest_enter(current);
+       else
+               current->flags |= PF_VCPU;
+}
+
+static __always_inline void vtime_account_guest_exit(void)
+{
+       if (vtime_accounting_enabled_this_cpu())
+               vtime_guest_exit(current);
+       else
+               current->flags &= ~PF_VCPU;
+}
+
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
-static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; }
 static inline bool vtime_accounting_enabled_this_cpu(void) { return false; }
 static inline void vtime_task_switch(struct task_struct *prev) { }
 
-#endif
-
-/*
- * Common vtime APIs
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void vtime_account_kernel(struct task_struct *tsk);
-extern void vtime_account_idle(struct task_struct *tsk);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-static inline void vtime_account_kernel(struct task_struct *tsk) { }
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+static __always_inline void vtime_account_guest_enter(void)
+{
+       current->flags |= PF_VCPU;
+}
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_user_enter(struct task_struct *tsk);
-extern void vtime_user_exit(struct task_struct *tsk);
-extern void vtime_guest_enter(struct task_struct *tsk);
-extern void vtime_guest_exit(struct task_struct *tsk);
-extern void vtime_init_idle(struct task_struct *tsk, int cpu);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
-static inline void vtime_user_enter(struct task_struct *tsk) { }
-static inline void vtime_user_exit(struct task_struct *tsk) { }
-static inline void vtime_guest_enter(struct task_struct *tsk) { }
-static inline void vtime_guest_exit(struct task_struct *tsk) { }
-static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
-#endif
+static __always_inline void vtime_account_guest_exit(void)
+{
+       current->flags &= ~PF_VCPU;
+}
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
-extern void vtime_account_softirq(struct task_struct *tsk);
-extern void vtime_account_hardirq(struct task_struct *tsk);
-extern void vtime_flush(struct task_struct *tsk);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
-static inline void vtime_account_softirq(struct task_struct *tsk) { }
-static inline void vtime_account_hardirq(struct task_struct *tsk) { }
-static inline void vtime_flush(struct task_struct *tsk) { }
 #endif
 
 
index e8df72e..5e84888 100644 (file)
@@ -68,7 +68,6 @@ enum sctp_verb {
        SCTP_CMD_ASSOC_FAILED,   /* Handle association failure. */
        SCTP_CMD_DISCARD_PACKET, /* Discard the whole packet. */
        SCTP_CMD_GEN_SHUTDOWN,   /* Generate a SHUTDOWN chunk. */
-       SCTP_CMD_UPDATE_ASSOC,   /* Update association information. */
        SCTP_CMD_PURGE_OUTQUEUE, /* Purge all data waiting to be sent. */
        SCTP_CMD_SETUP_T2,       /* Hi-level, setup T2-shutdown parms.  */
        SCTP_CMD_RTO_PENDING,    /* Set transport's rto_pending. */
index 5017a88..c3d3547 100644 (file)
@@ -8,28 +8,31 @@
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
-TRACE_EVENT(cma_alloc,
+DECLARE_EVENT_CLASS(cma_alloc_class,
 
-       TP_PROTO(unsigned long pfn, const struct page *page,
-                unsigned int count, unsigned int align),
+       TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+                unsigned long count, unsigned int align),
 
-       TP_ARGS(pfn, page, count, align),
+       TP_ARGS(name, pfn, page, count, align),
 
        TP_STRUCT__entry(
+               __string(name, name)
                __field(unsigned long, pfn)
                __field(const struct page *, page)
-               __field(unsigned int, count)
+               __field(unsigned long, count)
                __field(unsigned int, align)
        ),
 
        TP_fast_assign(
+               __assign_str(name, name);
                __entry->pfn = pfn;
                __entry->page = page;
                __entry->count = count;
                __entry->align = align;
        ),
 
-       TP_printk("pfn=%lx page=%p count=%u align=%u",
+       TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u",
+                 __get_str(name),
                  __entry->pfn,
                  __entry->page,
                  __entry->count,
@@ -38,29 +41,72 @@ TRACE_EVENT(cma_alloc,
 
 TRACE_EVENT(cma_release,
 
-       TP_PROTO(unsigned long pfn, const struct page *page,
-                unsigned int count),
+       TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+                unsigned long count),
 
-       TP_ARGS(pfn, page, count),
+       TP_ARGS(name, pfn, page, count),
 
        TP_STRUCT__entry(
+               __string(name, name)
                __field(unsigned long, pfn)
                __field(const struct page *, page)
-               __field(unsigned int, count)
+               __field(unsigned long, count)
        ),
 
        TP_fast_assign(
+               __assign_str(name, name);
                __entry->pfn = pfn;
                __entry->page = page;
                __entry->count = count;
        ),
 
-       TP_printk("pfn=%lx page=%p count=%u",
+       TP_printk("name=%s pfn=%lx page=%p count=%lu",
+                 __get_str(name),
                  __entry->pfn,
                  __entry->page,
                  __entry->count)
 );
 
+TRACE_EVENT(cma_alloc_start,
+
+       TP_PROTO(const char *name, unsigned long count, unsigned int align),
+
+       TP_ARGS(name, count, align),
+
+       TP_STRUCT__entry(
+               __string(name, name)
+               __field(unsigned long, count)
+               __field(unsigned int, align)
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, name);
+               __entry->count = count;
+               __entry->align = align;
+       ),
+
+       TP_printk("name=%s count=%lu align=%u",
+                 __get_str(name),
+                 __entry->count,
+                 __entry->align)
+);
+
+DEFINE_EVENT(cma_alloc_class, cma_alloc_finish,
+
+       TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+                unsigned long count, unsigned int align),
+
+       TP_ARGS(name, pfn, page, count, align)
+);
+
+DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
+
+       TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+                unsigned long count, unsigned int align),
+
+       TP_ARGS(name, pfn, page, count, align)
+);
+
 #endif /* _TRACE_CMA_H */
 
 /* This part must be outside protection */
index 4d43439..9fb2a3b 100644 (file)
@@ -20,7 +20,8 @@
        EM( MR_SYSCALL,         "syscall_or_cpuset")            \
        EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind")              \
        EM( MR_NUMA_MISPLACED,  "numa_misplaced")               \
-       EMe(MR_CONTIG_RANGE,    "contig_range")
+       EM( MR_CONTIG_RANGE,    "contig_range")                 \
+       EMe(MR_LONGTERM_PIN,    "longterm_pin")
 
 /*
  * First define the enums in the above macros to be exported to userspace
@@ -81,6 +82,28 @@ TRACE_EVENT(mm_migrate_pages,
                __print_symbolic(__entry->mode, MIGRATE_MODE),
                __print_symbolic(__entry->reason, MIGRATE_REASON))
 );
+
+TRACE_EVENT(mm_migrate_pages_start,
+
+       TP_PROTO(enum migrate_mode mode, int reason),
+
+       TP_ARGS(mode, reason),
+
+       TP_STRUCT__entry(
+               __field(enum migrate_mode, mode)
+               __field(int, reason)
+       ),
+
+       TP_fast_assign(
+               __entry->mode   = mode;
+               __entry->reason = reason;
+       ),
+
+       TP_printk("mode=%s reason=%s",
+                 __print_symbolic(__entry->mode, MIGRATE_MODE),
+                 __print_symbolic(__entry->reason, MIGRATE_REASON))
+);
+
 #endif /* _TRACE_MIGRATE_H */
 
 /* This part must be outside protection */
index 67018d3..629c7a0 100644 (file)
@@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,               "arch_2"        )
 #define IF_HAVE_VM_SOFTDIRTY(flag,name)
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name},
+#else
+# define IF_HAVE_UFFD_MINOR(flag, name)
+#endif
+
 #define __def_vmaflag_names                                            \
        {VM_READ,                       "read"          },              \
        {VM_WRITE,                      "write"         },              \
@@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,                "arch_2"        )
        {VM_MAYSHARE,                   "mayshare"      },              \
        {VM_GROWSDOWN,                  "growsdown"     },              \
        {VM_UFFD_MISSING,               "uffd_missing"  },              \
+IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR,      "uffd_minor"    )               \
        {VM_PFNMAP,                     "pfnmap"        },              \
        {VM_DENYWRITE,                  "denywrite"     },              \
        {VM_UFFD_WP,                    "uffd_wp"       },              \
index c838e7a..bd55908 100644 (file)
@@ -60,6 +60,46 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class,
                                ),                                      \
                                TP_ARGS(wc, cid))
 
+DECLARE_EVENT_CLASS(rpcrdma_mr_completion_class,
+       TP_PROTO(
+               const struct ib_wc *wc,
+               const struct rpc_rdma_cid *cid
+       ),
+
+       TP_ARGS(wc, cid),
+
+       TP_STRUCT__entry(
+               __field(u32, cq_id)
+               __field(int, completion_id)
+               __field(unsigned long, status)
+               __field(unsigned int, vendor_err)
+       ),
+
+       TP_fast_assign(
+               __entry->cq_id = cid->ci_queue_id;
+               __entry->completion_id = cid->ci_completion_id;
+               __entry->status = wc->status;
+               if (wc->status)
+                       __entry->vendor_err = wc->vendor_err;
+               else
+                       __entry->vendor_err = 0;
+       ),
+
+       TP_printk("cq.id=%u mr.id=%d status=%s (%lu/0x%x)",
+               __entry->cq_id, __entry->completion_id,
+               rdma_show_wc_status(__entry->status),
+               __entry->status, __entry->vendor_err
+       )
+);
+
+#define DEFINE_MR_COMPLETION_EVENT(name)                               \
+               DEFINE_EVENT(rpcrdma_mr_completion_class, name,         \
+                               TP_PROTO(                               \
+                                       const struct ib_wc *wc,         \
+                                       const struct rpc_rdma_cid *cid  \
+                               ),                                      \
+                               TP_ARGS(wc, cid))
+
 DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class,
        TP_PROTO(
                const struct ib_wc *wc,
@@ -150,19 +190,17 @@ DECLARE_EVENT_CLASS(xprtrdma_rxprt,
        TP_ARGS(r_xprt),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __string(addr, rpcrdma_addrstr(r_xprt))
                __string(port, rpcrdma_portstr(r_xprt))
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p",
-               __get_str(addr), __get_str(port), __entry->r_xprt
+       TP_printk("peer=[%s]:%s",
+               __get_str(addr), __get_str(port)
        )
 );
 
@@ -182,7 +220,6 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class,
        TP_ARGS(r_xprt, rc),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(int, rc)
                __field(int, connect_status)
                __string(addr, rpcrdma_addrstr(r_xprt))
@@ -190,15 +227,14 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class,
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->rc = rc;
                __entry->connect_status = r_xprt->rx_ep->re_connect_status;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
+       TP_printk("peer=[%s]:%s rc=%d connection status=%d",
+               __get_str(addr), __get_str(port),
                __entry->rc, __entry->connect_status
        )
 );
@@ -343,7 +379,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr_class,
 
                __entry->task_id = task->tk_pid;
                __entry->client_id = task->tk_client->cl_clid;
-               __entry->mr_id  = mr->frwr.fr_mr->res.id;
+               __entry->mr_id  = mr->mr_ibmr->res.id;
                __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
@@ -384,7 +420,7 @@ DECLARE_EVENT_CLASS(xprtrdma_anonymous_mr_class,
        ),
 
        TP_fast_assign(
-               __entry->mr_id  = mr->frwr.fr_mr->res.id;
+               __entry->mr_id  = mr->mr_ibmr->res.id;
                __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
@@ -495,22 +531,19 @@ TRACE_EVENT(xprtrdma_op_connect,
        TP_ARGS(r_xprt, delay),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(unsigned long, delay)
                __string(addr, rpcrdma_addrstr(r_xprt))
                __string(port, rpcrdma_portstr(r_xprt))
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->delay = delay;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p delay=%lu",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
-               __entry->delay
+       TP_printk("peer=[%s]:%s delay=%lu",
+               __get_str(addr), __get_str(port), __entry->delay
        )
 );
 
@@ -525,7 +558,6 @@ TRACE_EVENT(xprtrdma_op_set_cto,
        TP_ARGS(r_xprt, connect, reconnect),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(unsigned long, connect)
                __field(unsigned long, reconnect)
                __string(addr, rpcrdma_addrstr(r_xprt))
@@ -533,51 +565,18 @@ TRACE_EVENT(xprtrdma_op_set_cto,
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->connect = connect;
                __entry->reconnect = reconnect;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: connect=%lu reconnect=%lu",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
+       TP_printk("peer=[%s]:%s connect=%lu reconnect=%lu",
+               __get_str(addr), __get_str(port),
                __entry->connect / HZ, __entry->reconnect / HZ
        )
 );
 
-TRACE_EVENT(xprtrdma_qp_event,
-       TP_PROTO(
-               const struct rpcrdma_ep *ep,
-               const struct ib_event *event
-       ),
-
-       TP_ARGS(ep, event),
-
-       TP_STRUCT__entry(
-               __field(unsigned long, event)
-               __string(name, event->device->name)
-               __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
-               __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
-       ),
-
-       TP_fast_assign(
-               const struct rdma_cm_id *id = ep->re_id;
-
-               __entry->event = event->event;
-               __assign_str(name, event->device->name);
-               memcpy(__entry->srcaddr, &id->route.addr.src_addr,
-                      sizeof(struct sockaddr_in6));
-               memcpy(__entry->dstaddr, &id->route.addr.dst_addr,
-                      sizeof(struct sockaddr_in6));
-       ),
-
-       TP_printk("%pISpc -> %pISpc device=%s %s (%lu)",
-               __entry->srcaddr, __entry->dstaddr, __get_str(name),
-               rdma_show_ib_event(__entry->event), __entry->event
-       )
-);
-
 /**
  ** Call events
  **/
@@ -591,22 +590,19 @@ TRACE_EVENT(xprtrdma_createmrs,
        TP_ARGS(r_xprt, count),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __string(addr, rpcrdma_addrstr(r_xprt))
                __string(port, rpcrdma_portstr(r_xprt))
                __field(unsigned int, count)
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->count = count;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
-               __entry->count
+       TP_printk("peer=[%s]:%s created %u MRs",
+               __get_str(addr), __get_str(port), __entry->count
        )
 );
 
@@ -829,7 +825,7 @@ TRACE_EVENT(xprtrdma_post_recvs,
        TP_ARGS(r_xprt, count, status),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
+               __field(u32, cq_id)
                __field(unsigned int, count)
                __field(int, status)
                __field(int, posted)
@@ -838,16 +834,18 @@ TRACE_EVENT(xprtrdma_post_recvs,
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
+               const struct rpcrdma_ep *ep = r_xprt->rx_ep;
+
+               __entry->cq_id = ep->re_attr.recv_cq->res.id;
                __entry->count = count;
                __entry->status = status;
-               __entry->posted = r_xprt->rx_ep->re_receive_count;
+               __entry->posted = ep->re_receive_count;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: %u new recvs, %d active (rc %d)",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
+       TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active (rc %d)",
+               __get_str(addr), __get_str(port), __entry->cq_id,
                __entry->count, __entry->posted, __entry->status
        )
 );
@@ -886,10 +884,10 @@ TRACE_EVENT(xprtrdma_post_linv_err,
 DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive);
 
 DEFINE_COMPLETION_EVENT(xprtrdma_wc_send);
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg);
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_li);
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_wake);
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_done);
+DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_fastreg);
+DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li);
+DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li_wake);
+DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li_done);
 
 TRACE_EVENT(xprtrdma_frwr_alloc,
        TP_PROTO(
@@ -905,7 +903,7 @@ TRACE_EVENT(xprtrdma_frwr_alloc,
        ),
 
        TP_fast_assign(
-               __entry->mr_id = mr->frwr.fr_mr->res.id;
+               __entry->mr_id = mr->mr_ibmr->res.id;
                __entry->rc = rc;
        ),
 
@@ -933,7 +931,7 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
        ),
 
        TP_fast_assign(
-               __entry->mr_id  = mr->frwr.fr_mr->res.id;
+               __entry->mr_id  = mr->mr_ibmr->res.id;
                __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
@@ -966,7 +964,7 @@ TRACE_EVENT(xprtrdma_frwr_sgerr,
        ),
 
        TP_fast_assign(
-               __entry->mr_id = mr->frwr.fr_mr->res.id;
+               __entry->mr_id = mr->mr_ibmr->res.id;
                __entry->addr = mr->mr_sg->dma_address;
                __entry->dir = mr->mr_dir;
                __entry->nents = sg_nents;
@@ -996,7 +994,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
        ),
 
        TP_fast_assign(
-               __entry->mr_id = mr->frwr.fr_mr->res.id;
+               __entry->mr_id = mr->mr_ibmr->res.id;
                __entry->addr = mr->mr_sg->dma_address;
                __entry->dir = mr->mr_dir;
                __entry->num_mapped = num_mapped;
@@ -1010,11 +1008,12 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
        )
 );
 
+DEFINE_MR_EVENT(fastreg);
 DEFINE_MR_EVENT(localinv);
+DEFINE_MR_EVENT(reminv);
 DEFINE_MR_EVENT(map);
 
 DEFINE_ANON_MR_EVENT(unmap);
-DEFINE_ANON_MR_EVENT(recycle);
 
 TRACE_EVENT(xprtrdma_dma_maperr,
        TP_PROTO(
@@ -1248,22 +1247,19 @@ TRACE_EVENT(xprtrdma_cb_setup,
        TP_ARGS(r_xprt, reqs),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(unsigned int, reqs)
                __string(addr, rpcrdma_addrstr(r_xprt))
                __string(port, rpcrdma_portstr(r_xprt))
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->reqs = reqs;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: %u reqs",
-               __get_str(addr), __get_str(port),
-               __entry->r_xprt, __entry->reqs
+       TP_printk("peer=[%s]:%s %u reqs",
+               __get_str(addr), __get_str(port), __entry->reqs
        )
 );
 
index bda16e9..d02e01a 100644 (file)
@@ -1079,6 +1079,46 @@ TRACE_EVENT(xprt_transmit,
                __entry->seqno, __entry->status)
 );
 
+TRACE_EVENT(xprt_retransmit,
+       TP_PROTO(
+               const struct rpc_rqst *rqst
+       ),
+
+       TP_ARGS(rqst),
+
+       TP_STRUCT__entry(
+               __field(unsigned int, task_id)
+               __field(unsigned int, client_id)
+               __field(u32, xid)
+               __field(int, ntrans)
+               __field(int, version)
+               __string(progname,
+                        rqst->rq_task->tk_client->cl_program->name)
+               __string(procedure,
+                        rqst->rq_task->tk_msg.rpc_proc->p_name)
+       ),
+
+       TP_fast_assign(
+               struct rpc_task *task = rqst->rq_task;
+
+               __entry->task_id = task->tk_pid;
+               __entry->client_id = task->tk_client ?
+                       task->tk_client->cl_clid : -1;
+               __entry->xid = be32_to_cpu(rqst->rq_xid);
+               __entry->ntrans = rqst->rq_ntrans;
+               __assign_str(progname,
+                            task->tk_client->cl_program->name)
+               __entry->version = task->tk_client->cl_vers;
+               __assign_str(procedure, task->tk_msg.rpc_proc->p_name)
+       ),
+
+       TP_printk(
+               "task:%u@%u xid=0x%08x %sv%d %s ntrans=%d",
+               __entry->task_id, __entry->client_id, __entry->xid,
+               __get_str(progname), __entry->version, __get_str(procedure),
+               __entry->ntrans)
+);
+
 TRACE_EVENT(xprt_ping,
        TP_PROTO(const struct rpc_xprt *xprt, int status),
 
@@ -1141,7 +1181,6 @@ DECLARE_EVENT_CLASS(xprt_writelock_event,
 
 DEFINE_WRITELOCK_EVENT(reserve_xprt);
 DEFINE_WRITELOCK_EVENT(release_xprt);
-DEFINE_WRITELOCK_EVENT(transmit_queued);
 
 DECLARE_EVENT_CLASS(xprt_cong_event,
        TP_PROTO(
index e8eb4ad..d174914 100644 (file)
@@ -153,14 +153,3 @@ enum {
 #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1)
 
 #endif /* _LINUX_IF_BONDING_H */
-
-/*
- * Local variables:
- *  version-control: t
- *  kept-new-versions: 5
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
-
index 05669c8..778dc19 100644 (file)
@@ -42,6 +42,7 @@
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS    ( 8 << 16)
 #define KEXEC_ARCH_AARCH64 (183 << 16)
+#define KEXEC_ARCH_RISCV   (243 << 16)
 
 /* The artificial cap on the number of segments passed to kexec_load. */
 #define KEXEC_SEGMENT_MAX 16
index 8948467..4832fd0 100644 (file)
@@ -64,5 +64,12 @@ enum {
 #define MPOL_F_MOF     (1 << 3) /* this policy wants migrate on fault */
 #define MPOL_F_MORON   (1 << 4) /* Migrate On protnone Reference On Node */
 
+/*
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
+ * ABI.  New bits are OK, but existing bits can never change.
+ */
+#define RECLAIM_ZONE   (1<<0)  /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE  (1<<1)  /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP  (1<<2)  /* Unmap pages during reclaim */
 
 #endif /* _UAPI_LINUX_MEMPOLICY_H */
index 1f2a708..beb2cad 100644 (file)
@@ -20,4 +20,10 @@ struct xt_secmark_target_info {
        char secctx[SECMARK_SECCTX_MAX];
 };
 
+struct xt_secmark_target_info_v1 {
+       __u8 mode;
+       char secctx[SECMARK_SECCTX_MAX];
+       __u32 secid;
+};
+
 #endif /*_XT_SECMARK_H_target */
index ed5415e..800bb0f 100644 (file)
 #define NFS4_MAX_BACK_CHANNEL_OPS 2
 
 #endif /* _UAPI_LINUX_NFS4_H */
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 3b39ef1..5ae3ace 100644 (file)
@@ -27,6 +27,7 @@ enum {
        SEG6_LOCAL_OIF,
        SEG6_LOCAL_BPF,
        SEG6_LOCAL_VRFTABLE,
+       SEG6_LOCAL_COUNTERS,
        __SEG6_LOCAL_MAX,
 };
 #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
@@ -78,4 +79,33 @@ enum {
 
 #define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1)
 
+/* SRv6 Behavior counters are encoded as netlink attributes guaranteeing the
+ * correct alignment.
+ * Each counter is identified by a different attribute type (i.e.
+ * SEG6_LOCAL_CNT_PACKETS).
+ *
+ * - SEG6_LOCAL_CNT_PACKETS: identifies a counter that counts the number of
+ *   packets that have been CORRECTLY processed by an SRv6 Behavior instance
+ *   (i.e., packets that generate errors or are dropped are NOT counted).
+ *
+ * - SEG6_LOCAL_CNT_BYTES: identifies a counter that counts the total amount
+ *   of traffic in bytes of all packets that have been CORRECTLY processed by
+ *   an SRv6 Behavior instance (i.e., packets that generate errors or are
+ *   dropped are NOT counted).
+ *
+ * - SEG6_LOCAL_CNT_ERRORS: identifies a counter that counts the number of
+ *   packets that have NOT been properly processed by an SRv6 Behavior instance
+ *   (i.e., packets that generate errors or are dropped).
+ */
+enum {
+       SEG6_LOCAL_CNT_UNSPEC,
+       SEG6_LOCAL_CNT_PAD,             /* pad for 64 bits values */
+       SEG6_LOCAL_CNT_PACKETS,
+       SEG6_LOCAL_CNT_BYTES,
+       SEG6_LOCAL_CNT_ERRORS,
+       __SEG6_LOCAL_CNT_MAX,
+};
+
+#define SEG6_LOCAL_CNT_MAX (__SEG6_LOCAL_CNT_MAX - 1)
+
 #endif
index 5f2d882..bafbeb1 100644 (file)
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
+#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING |        \
+                                UFFDIO_REGISTER_MODE_WP |      \
+                                UFFDIO_REGISTER_MODE_MINOR)
 #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP |    \
                           UFFD_FEATURE_EVENT_FORK |            \
                           UFFD_FEATURE_EVENT_REMAP |           \
-                          UFFD_FEATURE_EVENT_REMOVE |  \
+                          UFFD_FEATURE_EVENT_REMOVE |          \
                           UFFD_FEATURE_EVENT_UNMAP |           \
                           UFFD_FEATURE_MISSING_HUGETLBFS |     \
                           UFFD_FEATURE_MISSING_SHMEM |         \
                           UFFD_FEATURE_SIGBUS |                \
-                          UFFD_FEATURE_THREAD_ID)
+                          UFFD_FEATURE_THREAD_ID |             \
+                          UFFD_FEATURE_MINOR_HUGETLBFS)
 #define UFFD_API_IOCTLS                                \
        ((__u64)1 << _UFFDIO_REGISTER |         \
         (__u64)1 << _UFFDIO_UNREGISTER |       \
         (__u64)1 << _UFFDIO_API)
 #define UFFD_API_RANGE_IOCTLS                  \
        ((__u64)1 << _UFFDIO_WAKE |             \
         (__u64)1 << _UFFDIO_COPY |             \
         (__u64)1 << _UFFDIO_ZEROPAGE |         \
-        (__u64)1 << _UFFDIO_WRITEPROTECT)
+        (__u64)1 << _UFFDIO_WRITEPROTECT |     \
+        (__u64)1 << _UFFDIO_CONTINUE)
 #define UFFD_API_RANGE_IOCTLS_BASIC            \
        ((__u64)1 << _UFFDIO_WAKE |             \
-        (__u64)1 << _UFFDIO_COPY)
+        (__u64)1 << _UFFDIO_COPY |             \
+        (__u64)1 << _UFFDIO_CONTINUE)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
@@ -55,6 +61,7 @@
 #define _UFFDIO_COPY                   (0x03)
 #define _UFFDIO_ZEROPAGE               (0x04)
 #define _UFFDIO_WRITEPROTECT           (0x06)
+#define _UFFDIO_CONTINUE               (0x07)
 #define _UFFDIO_API                    (0x3F)
 
 /* userfaultfd ioctl ids */
@@ -73,6 +80,8 @@
                                      struct uffdio_zeropage)
 #define UFFDIO_WRITEPROTECT    _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
                                      struct uffdio_writeprotect)
+#define UFFDIO_CONTINUE                _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
+                                    struct uffdio_continue)
 
 /* read() structure */
 struct uffd_msg {
@@ -127,6 +136,7 @@ struct uffd_msg {
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE      (1<<0)  /* If this was a write fault */
 #define UFFD_PAGEFAULT_FLAG_WP         (1<<1)  /* If reason is VM_UFFD_WP */
+#define UFFD_PAGEFAULT_FLAG_MINOR      (1<<2)  /* If reason is VM_UFFD_MINOR */
 
 struct uffdio_api {
        /* userland asks for an API number and the features to enable */
@@ -171,6 +181,10 @@ struct uffdio_api {
         *
         * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
         * be returned, if feature is not requested 0 will be returned.
+        *
+        * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
+        * can be intercepted (via REGISTER_MODE_MINOR) for
+        * hugetlbfs-backed pages.
         */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP         (1<<0)
 #define UFFD_FEATURE_EVENT_FORK                        (1<<1)
@@ -181,6 +195,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_EVENT_UNMAP               (1<<6)
 #define UFFD_FEATURE_SIGBUS                    (1<<7)
 #define UFFD_FEATURE_THREAD_ID                 (1<<8)
+#define UFFD_FEATURE_MINOR_HUGETLBFS           (1<<9)
        __u64 features;
 
        __u64 ioctls;
@@ -195,6 +210,7 @@ struct uffdio_register {
        struct uffdio_range range;
 #define UFFDIO_REGISTER_MODE_MISSING   ((__u64)1<<0)
 #define UFFDIO_REGISTER_MODE_WP                ((__u64)1<<1)
+#define UFFDIO_REGISTER_MODE_MINOR     ((__u64)1<<2)
        __u64 mode;
 
        /*
@@ -257,6 +273,18 @@ struct uffdio_writeprotect {
        __u64 mode;
 };
 
+struct uffdio_continue {
+       struct uffdio_range range;
+#define UFFDIO_CONTINUE_MODE_DONTWAKE          ((__u64)1<<0)
+       __u64 mode;
+
+       /*
+        * Fields below here are written by the ioctl and must be at the end:
+        * the copy_from_user will not read past here.
+        */
+       __s64 mapped;
+};
+
 /*
  * Flags for the userfaultfd(2) system call itself.
  */
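
To make the new mode concrete for review, here is a minimal userspace sketch, assuming the hugetlbfs mapping already exists: register a range for minor faults, then resolve each fault with UFFDIO_CONTINUE once the page-cache contents are known to be good. Error handling is elided and both helpers are hypothetical:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/* Sketch: set up minor-fault interception on [addr, addr + len). */
static int uffd_minor_setup(void *addr, unsigned long len)
{
	struct uffdio_api api = { .api = UFFD_API,
				  .features = UFFD_FEATURE_MINOR_HUGETLBFS };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MINOR,
	};
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0 || ioctl(fd, UFFDIO_API, &api) ||
	    ioctl(fd, UFFDIO_REGISTER, &reg))
		return -1;
	return fd;
}

/* Sketch: resolve one minor fault reported in msg by installing the
 * existing page-cache page at the faulting address. */
static void uffd_resolve_minor(int fd, const struct uffd_msg *msg,
			       unsigned long hpage_size)
{
	struct uffdio_continue cont = {
		.range = {
			.start = msg->arg.pagefault.address & ~(hpage_size - 1),
			.len = hpage_size,
		},
	};

	ioctl(fd, UFFDIO_CONTINUE, &cont);
}

Unlike UFFDIO_COPY, no data is transferred here: the page already sits in the page cache, and CONTINUE merely installs the mapping.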
index 34b1f53..ef33ea0 100644 (file)
@@ -333,10 +333,21 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
 
 /* 10de vendor PCI sub-types */
-/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */
+/*
+ * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
+ *
+ * Deprecated, region no longer provided
+ */
+#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1)
 
 /* 1014 vendor PCI sub-types */
-/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */
+/*
+ * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
+ * to do TLB invalidation on a GPU.
+ *
+ * Deprecated, region no longer provided
+ */
+#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD   (1)
 
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
@@ -630,9 +641,36 @@ struct vfio_device_migration_info {
  */
 #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE     3
 
-/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */
+/*
+ * Capability with compressed real address (aka SSA - small system address)
+ * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing
+ * and by the userspace to associate a NVLink bridge with a GPU.
+ *
+ * Deprecated, capability no longer provided
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT    4
+
+struct vfio_region_info_cap_nvlink2_ssatgt {
+       struct vfio_info_cap_header header;
+       __u64 tgt;
+};
 
-/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */
+/*
+ * Capability with an NVLink link speed. The value is read by
+ * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
+ * property in the device tree. The value is fixed in the hardware
+ * and failing to provide the correct value results in the link
+ * not working with no indication from the driver why.
+ *
+ * Deprecated, capability no longer provided
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD    5
+
+struct vfio_region_info_cap_nvlink2_lnkspd {
+       struct vfio_info_cap_header header;
+       __u32 link_speed;
+       __u32 __pad;
+};
 
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
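
Since the SSATGT/LNKSPD capabilities are restored here purely as ABI documentation, a hedged reminder of how userspace locates them — region_cap_find() is an illustrative helper, not a VFIO library function:

#include <stddef.h>
#include <linux/vfio.h>

/* Sketch: find a capability such as VFIO_REGION_INFO_CAP_NVLINK2_SSATGT
 * in a region-info capability chain. Assumes info was returned by
 * VFIO_DEVICE_GET_REGION_INFO with argsz large enough for the chain. */
static struct vfio_info_cap_header *
region_cap_find(struct vfio_region_info *info, __u16 id)
{
	__u32 off;

	if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS))
		return NULL;

	for (off = info->cap_offset; off; ) {
		struct vfio_info_cap_header *hdr =
			(struct vfio_info_cap_header *)((char *)info + off);

		if (hdr->id == id)
			return hdr;
		off = hdr->next;
	}
	return NULL;
}

A match for VFIO_REGION_INFO_CAP_NVLINK2_SSATGT would then be cast to struct vfio_region_info_cap_nvlink2_ssatgt to read tgt.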
index 9e9f9bf..449bd38 100644 (file)
 #define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY
 
 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
index 32ca83e..bfc2138 100644 (file)
@@ -131,13 +131,3 @@ struct vcpu_hvm_context {
 typedef struct vcpu_hvm_context vcpu_hvm_context_t;
 
 #endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
index aaf2951..fb87161 100644 (file)
@@ -39,13 +39,3 @@ enum xenbus_state
 };
 
 #endif /* _XEN_PUBLIC_IO_XENBUS_H */
-
-/*
- * Local variables:
- *  c-file-style: "linux"
- *  indent-tabs-mode: t
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index b71bf0c..1ea12c6 100644 (file)
@@ -1644,6 +1644,11 @@ config HAVE_ARCH_USERFAULTFD_WP
        help
          Arch has userfaultfd write protection support
 
+config HAVE_ARCH_USERFAULTFD_MINOR
+       bool
+       help
+         Arch has userfaultfd minor fault support
+
 config MEMBARRIER
        bool "Enable membarrier() system call" if EXPERT
        default y
@@ -2294,6 +2299,18 @@ config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
 
          If unsure, say N.
 
+config MODPROBE_PATH
+       string "Path to modprobe binary"
+       default "/sbin/modprobe"
+       help
+         When kernel code requests a module, it does so by calling
+         the "modprobe" userspace utility. This option allows you to
+         set the path where that binary is found. This can be changed
+         at runtime via the sysctl file
+         /proc/sys/kernel/modprobe. Setting this to the empty string
+         removes the kernel's ability to request modules (but
+         userspace can still load modules explicitly).
+
 config TRIM_UNUSED_KSYMS
        bool "Trim unused exported kernel symbols" if EXPERT
        depends on !COMPILE_TEST
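
For context on MODPROBE_PATH above: in-kernel users reach the configured binary through request_module(). A hedged sketch of such a caller — demand_load_fs() and its error policy are made up for illustration:

#include <linux/errno.h>
#include <linux/kmod.h>

/* Hypothetical caller that demand-loads a filesystem module by alias.
 * request_module() runs CONFIG_MODPROBE_PATH (or the runtime override
 * from /proc/sys/kernel/modprobe); if that was set to "", the request
 * fails and the caller simply sees the module as absent. */
static int demand_load_fs(const char *fstype)
{
	if (request_module("fs-%s", fstype) != 0)
		return -ENOENT;	/* assumption: treat any failure as absent */
	return 0;
}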
index d677e8e..af27abc 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/init.h>
+#include <linux/async.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -541,6 +542,14 @@ static int __init keepinitrd_setup(char *__unused)
 __setup("keepinitrd", keepinitrd_setup);
 #endif
 
+static bool __initdata initramfs_async = true;
+static int __init initramfs_async_setup(char *str)
+{
+       strtobool(str, &initramfs_async);
+       return 1;
+}
+__setup("initramfs_async=", initramfs_async_setup);
+
 extern char __initramfs_start[];
 extern unsigned long __initramfs_size;
 #include <linux/initrd.h>
@@ -658,7 +667,7 @@ static void __init populate_initrd_image(char *err)
 }
 #endif /* CONFIG_BLK_DEV_RAM */
 
-static int __init populate_rootfs(void)
+static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
 {
        /* Load the built in initramfs */
        char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
@@ -693,6 +702,33 @@ done:
        initrd_end = 0;
 
        flush_delayed_fput();
+}
+
+static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain);
+static async_cookie_t initramfs_cookie;
+
+void wait_for_initramfs(void)
+{
+       if (!initramfs_cookie) {
+               /*
+                * Something before rootfs_initcall wants to access
+                * the filesystem/initramfs. Probably a bug. Make a
+                * note, avoid deadlocking the machine, and let the
+                * caller's access fail as it used to.
+                */
+               pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n");
+               return;
+       }
+       async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain);
+}
+EXPORT_SYMBOL_GPL(wait_for_initramfs);
+
+static int __init populate_rootfs(void)
+{
+       initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL,
+                                                &initramfs_domain);
+       if (!initramfs_async)
+               wait_for_initramfs();
        return 0;
 }
 rootfs_initcall(populate_rootfs);
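
Because unpacking is now asynchronous, anything that runs around rootfs_initcall time and expects initramfs contents must call wait_for_initramfs() first. A hedged sketch of a consumer — the initcall, the path, and the "file is optional" policy are all assumptions of this example:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/initrd.h>	/* wait_for_initramfs() */

/* Hypothetical early consumer: make sure the initramfs contents are
 * visible before trying to open a config file shipped inside it. */
static int __init example_late_setup(void)
{
	struct file *f;

	wait_for_initramfs();
	f = filp_open("/etc/example.conf", O_RDONLY, 0);
	if (IS_ERR(f))
		return 0;	/* file is optional in this sketch */
	/* ... parse ... */
	filp_close(f, NULL);
	return 0;
}
late_initcall(example_late_setup);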
index 543fbe3..eb01e12 100644 (file)
@@ -1561,6 +1561,7 @@ static noinline void __init kernel_init_freeable(void)
 
        kunit_run_all_tests();
 
+       wait_for_initramfs();
        console_on_rootfs();
 
        /*
index f6c30a8..e0ec239 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -36,7 +36,7 @@
  * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
  * - undo adjustments at process exit are limited to 0..SEMVMX.
  * - namespace are supported.
- * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtine by writing
+ * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
  *   to /proc/sys/kernel/sem.
  * - statistics about the usage are reported in /proc/sysvipc/sem.
  *
@@ -224,7 +224,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
  * Setting it to a result code is a RELEASE, this is ensured by both a
  * smp_store_release() (for case a) and while holding sem_lock()
  * (for case b).
- * The AQUIRE when reading the result code without holding sem_lock() is
+ * The ACQUIRE when reading the result code without holding sem_lock() is
  * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
  * (case a above).
  * Reading the result code while holding sem_lock() needs no further barriers,
@@ -786,7 +786,7 @@ static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
 {
        get_task_struct(q->sleeper);
 
-       /* see SEM_BARRIER_2 for purpuse/pairing */
+       /* see SEM_BARRIER_2 for purpose/pairing */
        smp_store_release(&q->status, error);
 
        wake_q_add_safe(wake_q, q->sleeper);
@@ -821,7 +821,7 @@ static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
 
        /* It is impossible that someone waits for the new value:
         * - complex operations always restart.
-        * - wait-for-zero are handled seperately.
+        * - wait-for-zero are handled separately.
         * - q is a previously sleeping simple operation that
         *   altered the array. It must be a decrement, because
         *   simple increments never sleep.
@@ -1046,7 +1046,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
                         * - No complex ops, thus all sleeping ops are
                         *   decrease.
                         * - if we decreased the value, then any sleeping
-                        *   semaphore ops wont be able to run: If the
+                        *   semaphore ops won't be able to run: If the
                         *   previous value was too small, then the new
                         *   value will be too small, too.
                         */
@@ -2108,7 +2108,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops,
        queue.dupsop = dupsop;
 
        error = perform_atomic_semop(sma, &queue);
-       if (error == 0) { /* non-blocking succesfull path */
+       if (error == 0) { /* non-blocking successful path */
                DEFINE_WAKE_Q(wake_q);
 
                /*
index 78701ea..c6b299a 100644 (file)
@@ -1,4 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-kheaders.md5
-timeconst.h
-hz.bc
+/config_data
+/kheaders.md5
index e8a6715..4df609b 100644 (file)
@@ -142,10 +142,15 @@ obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
 
 $(obj)/configs.o: $(obj)/config_data.gz
 
-targets += config_data.gz
-$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
+targets += config_data config_data.gz
+$(obj)/config_data.gz: $(obj)/config_data FORCE
        $(call if_changed,gzip)
 
+filechk_cat = cat $<
+
+$(obj)/config_data: $(KCONFIG_CONFIG) FORCE
+       $(call filechk,cat)
+
 $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
 
 quiet_cmd_genikh = CHK     $(obj)/kheaders_data.tar.xz
index 33258e6..b8d7a66 100644 (file)
@@ -78,6 +78,12 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
 
 static atomic_t entry_count;
 
+static long long microseconds_since(ktime_t start)
+{
+       ktime_t now = ktime_get();
+       return ktime_to_ns(ktime_sub(now, start)) >> 10;
+}
+
 static async_cookie_t lowest_in_progress(struct async_domain *domain)
 {
        struct async_entry *first = NULL;
@@ -111,24 +117,18 @@ static void async_run_entry_fn(struct work_struct *work)
        struct async_entry *entry =
                container_of(work, struct async_entry, work);
        unsigned long flags;
-       ktime_t calltime, delta, rettime;
+       ktime_t calltime;
 
        /* 1) run (and print duration) */
-       if (initcall_debug && system_state < SYSTEM_RUNNING) {
-               pr_debug("calling  %lli_%pS @ %i\n",
-                       (long long)entry->cookie,
-                       entry->func, task_pid_nr(current));
-               calltime = ktime_get();
-       }
+       pr_debug("calling  %lli_%pS @ %i\n", (long long)entry->cookie,
+                entry->func, task_pid_nr(current));
+       calltime = ktime_get();
+
        entry->func(entry->data, entry->cookie);
-       if (initcall_debug && system_state < SYSTEM_RUNNING) {
-               rettime = ktime_get();
-               delta = ktime_sub(rettime, calltime);
-               pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
-                       (long long)entry->cookie,
-                       entry->func,
-                       (long long)ktime_to_ns(delta) >> 10);
-       }
+
+       pr_debug("initcall %lli_%pS returned after %lld usecs\n",
+                (long long)entry->cookie, entry->func,
+                microseconds_since(calltime));
 
        /* 2) remove self from the pending queues */
        spin_lock_irqsave(&async_lock, flags);
@@ -246,24 +246,6 @@ void async_synchronize_full(void)
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
 /**
- * async_unregister_domain - ensure no more anonymous waiters on this domain
- * @domain: idle domain to flush out of any async_synchronize_full instances
- *
- * async_synchronize_{cookie|full}_domain() are not flushed since callers
- * of these routines should know the lifetime of @domain
- *
- * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
- */
-void async_unregister_domain(struct async_domain *domain)
-{
-       spin_lock_irq(&async_lock);
-       WARN_ON(!domain->registered || !list_empty(&domain->pending));
-       domain->registered = 0;
-       spin_unlock_irq(&async_lock);
-}
-EXPORT_SYMBOL_GPL(async_unregister_domain);
-
-/**
  * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
  * @domain: the domain to synchronize
  *
@@ -287,23 +269,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
  */
 void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
 {
-       ktime_t starttime, delta, endtime;
+       ktime_t starttime;
 
-       if (initcall_debug && system_state < SYSTEM_RUNNING) {
-               pr_debug("async_waiting @ %i\n", task_pid_nr(current));
-               starttime = ktime_get();
-       }
+       pr_debug("async_waiting @ %i\n", task_pid_nr(current));
+       starttime = ktime_get();
 
        wait_event(async_done, lowest_in_progress(domain) >= cookie);
 
-       if (initcall_debug && system_state < SYSTEM_RUNNING) {
-               endtime = ktime_get();
-               delta = ktime_sub(endtime, starttime);
-
-               pr_debug("async_continuing @ %i after %lli usec\n",
-                       task_pid_nr(current),
-                       (long long)ktime_to_ns(delta) >> 10);
-       }
+       pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
+                microseconds_since(starttime));
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
 
index 8fd552c..757476c 100644 (file)
@@ -6496,6 +6496,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 {
        struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
        struct bpf_verifier_state *vstate = env->cur_state;
+       bool off_is_imm = tnum_is_const(off_reg->var_off);
        bool off_is_neg = off_reg->smin_value < 0;
        bool ptr_is_dst_reg = ptr_reg == dst_reg;
        u8 opcode = BPF_OP(insn->code);
@@ -6526,6 +6527,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
                alu_limit = abs(tmp_aux->alu_limit - alu_limit);
        } else {
                alu_state  = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+               alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
                alu_state |= ptr_is_dst_reg ?
                             BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
        }
@@ -12371,7 +12373,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
                        const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
                        struct bpf_insn *patch = &insn_buf[0];
-                       bool issrc, isneg;
+                       bool issrc, isneg, isimm;
                        u32 off_reg;
 
                        aux = &env->insn_aux_data[i + delta];
@@ -12382,28 +12384,29 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
                        issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
                                BPF_ALU_SANITIZE_SRC;
+                       isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
 
                        off_reg = issrc ? insn->src_reg : insn->dst_reg;
-                       if (isneg)
-                               *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
-                       *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
-                       *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
-                       *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
-                       *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
-                       *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
-                       if (issrc) {
-                               *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
-                                                        off_reg);
-                               insn->src_reg = BPF_REG_AX;
+                       if (isimm) {
+                               *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
                        } else {
-                               *patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
-                                                        BPF_REG_AX);
+                               if (isneg)
+                                       *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+                               *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+                               *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+                               *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+                               *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+                               *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+                               *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
                        }
+                       if (!issrc)
+                               *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
+                       insn->src_reg = BPF_REG_AX;
                        if (isneg)
                                insn->code = insn->code == code_add ?
                                             code_sub : code_add;
                        *patch++ = *insn;
-                       if (issrc && isneg)
+                       if (issrc && isneg && !isimm)
                                *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
                        cnt = patch - insn_buf;
 
index d3fd428..eb701b2 100644 (file)
@@ -1,5 +1,4 @@
 #  KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVKMEM is not set
 # CONFIG_DEVMEM is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_INET_LRO is not set
index 421b114..e1d274c 100644 (file)
@@ -33,7 +33,7 @@ do {                                                                  \
 static struct kmem_cache *cred_jar;
 
 /* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
 
 /*
  * The initial credentials for the initial task
index 0596526..fd1c041 100644 (file)
@@ -1440,9 +1440,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
                           TASK_INTERRUPTIBLE, p);
 }
 
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+                                struct task_struct *target)
+{
+       struct task_struct *parent =
+               !ptrace ? target->real_parent : target->parent;
+
+       return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+                                    same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+       bool ptrace;
+       struct task_struct *target;
+       int retval;
+
+       ptrace = false;
+       target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+       if (target && is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       ptrace = true;
+       target = pid_task(wo->wo_pid, PIDTYPE_PID);
+       if (target && target->ptrace &&
+           is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       return 0;
+}
+
 static long do_wait(struct wait_opts *wo)
 {
-       struct task_struct *tsk;
        int retval;
 
        trace_sched_process_wait(wo->wo_pid);
@@ -1464,19 +1503,27 @@ repeat:
 
        set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
-       tsk = current;
-       do {
-               retval = do_wait_thread(wo, tsk);
-               if (retval)
-                       goto end;
 
-               retval = ptrace_do_wait(wo, tsk);
+       if (wo->wo_type == PIDTYPE_PID) {
+               retval = do_wait_pid(wo);
                if (retval)
                        goto end;
+       } else {
+               struct task_struct *tsk = current;
+
+               do {
+                       retval = do_wait_thread(wo, tsk);
+                       if (retval)
+                               goto end;
 
-               if (wo->wo_flags & __WNOTHREAD)
-                       break;
-       } while_each_thread(current, tsk);
+                       retval = ptrace_do_wait(wo, tsk);
+                       if (retval)
+                               goto end;
+
+                       if (wo->wo_flags & __WNOTHREAD)
+                               break;
+               } while_each_thread(current, tsk);
+       }
        read_unlock(&tasklist_lock);
 
 notask:
index 771e0ea..dc06afd 100644 (file)
@@ -1145,7 +1145,7 @@ void mmput_async(struct mm_struct *mm)
  * invocations: in mmput() nobody alive left, in execve task is single
  * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
  * mm->exe_file, but does so without using set_mm_exe_file() in order
- * to do avoid the need for any locks.
+ * to avoid the need for any locks.
  */
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
@@ -1396,7 +1396,6 @@ fail_nomem:
 static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 {
        struct mm_struct *mm, *oldmm;
-       int retval;
 
        tsk->min_flt = tsk->maj_flt = 0;
        tsk->nvcsw = tsk->nivcsw = 0;
@@ -1423,21 +1422,15 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
        if (clone_flags & CLONE_VM) {
                mmget(oldmm);
                mm = oldmm;
-               goto good_mm;
+       } else {
+               mm = dup_mm(tsk, current->mm);
+               if (!mm)
+                       return -ENOMEM;
        }
 
-       retval = -ENOMEM;
-       mm = dup_mm(tsk, current->mm);
-       if (!mm)
-               goto fail_nomem;
-
-good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;
-
-fail_nomem:
-       return retval;
 }
 
 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
@@ -1743,7 +1736,7 @@ static int pidfd_release(struct inode *inode, struct file *file)
  * /proc/<pid>/status where Pid and NSpid are always shown relative to
  * the  pid namespace of the procfs instance. The difference becomes
  * obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestoral relation is
+ * different branch of the tree, i.e. where no ancestral relation is
  * present between the pid namespaces:
  * - create two new pid namespaces ns1 and ns2 in the initial pid
  *   namespace (also take care to create new mount namespaces in the
@@ -2735,8 +2728,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
                return false;
 
        /*
-        * - make the CLONE_DETACHED bit reuseable for clone3
-        * - make the CSIGNAL bits reuseable for clone3
+        * - make the CLONE_DETACHED bit reusable for clone3
+        * - make the CSIGNAL bits reusable for clone3
         */
        if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
                return false;
index c98b825..4938a00 100644 (file)
@@ -3710,8 +3710,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 
        if (op & FUTEX_CLOCK_REALTIME) {
                flags |= FLAGS_CLOCKRT;
-               if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
-                   cmd != FUTEX_WAIT_REQUEUE_PI)
+               if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
                        return -ENOSYS;
        }
 
@@ -3758,42 +3757,52 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
        return -ENOSYS;
 }
 
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+       switch (cmd) {
+       case FUTEX_WAIT:
+       case FUTEX_LOCK_PI:
+       case FUTEX_WAIT_BITSET:
+       case FUTEX_WAIT_REQUEUE_PI:
+               return true;
+       }
+       return false;
+}
+
+static __always_inline int
+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
+{
+       if (!timespec64_valid(ts))
+               return -EINVAL;
+
+       *t = timespec64_to_ktime(*ts);
+       if (cmd == FUTEX_WAIT)
+               *t = ktime_add_safe(ktime_get(), *t);
+       else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
+               *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
+       return 0;
+}
 
 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
                const struct __kernel_timespec __user *, utime,
                u32 __user *, uaddr2, u32, val3)
 {
-       struct timespec64 ts;
+       int ret, cmd = op & FUTEX_CMD_MASK;
        ktime_t t, *tp = NULL;
-       u32 val2 = 0;
-       int cmd = op & FUTEX_CMD_MASK;
+       struct timespec64 ts;
 
-       if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-                     cmd == FUTEX_WAIT_BITSET ||
-                     cmd == FUTEX_WAIT_REQUEUE_PI)) {
+       if (utime && futex_cmd_has_timeout(cmd)) {
                if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
                        return -EFAULT;
                if (get_timespec64(&ts, utime))
                        return -EFAULT;
-               if (!timespec64_valid(&ts))
-                       return -EINVAL;
-
-               t = timespec64_to_ktime(ts);
-               if (cmd == FUTEX_WAIT)
-                       t = ktime_add_safe(ktime_get(), t);
-               else if (!(op & FUTEX_CLOCK_REALTIME))
-                       t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
+               ret = futex_init_timeout(cmd, op, &ts, &t);
+               if (ret)
+                       return ret;
                tp = &t;
        }
-       /*
-        * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
-        * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
-        */
-       if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-           cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
-               val2 = (u32) (unsigned long) utime;
 
-       return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+       return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
 }
 
 #ifdef CONFIG_COMPAT
@@ -3959,31 +3968,20 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
                const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
                u32, val3)
 {
-       struct timespec64 ts;
+       int ret, cmd = op & FUTEX_CMD_MASK;
        ktime_t t, *tp = NULL;
-       int val2 = 0;
-       int cmd = op & FUTEX_CMD_MASK;
+       struct timespec64 ts;
 
-       if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-                     cmd == FUTEX_WAIT_BITSET ||
-                     cmd == FUTEX_WAIT_REQUEUE_PI)) {
+       if (utime && futex_cmd_has_timeout(cmd)) {
                if (get_old_timespec32(&ts, utime))
                        return -EFAULT;
-               if (!timespec64_valid(&ts))
-                       return -EINVAL;
-
-               t = timespec64_to_ktime(ts);
-               if (cmd == FUTEX_WAIT)
-                       t = ktime_add_safe(ktime_get(), t);
-               else if (!(op & FUTEX_CLOCK_REALTIME))
-                       t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
+               ret = futex_init_timeout(cmd, op, &ts, &t);
+               if (ret)
+                       return ret;
                tp = &t;
        }
-       if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-           cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
-               val2 = (int) (unsigned long) utime;
 
-       return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+       return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
 }
 #endif /* CONFIG_COMPAT_32BIT_TIME */
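
For reference, a hedged userspace sketch of the one timeout case this refactor treats specially — FUTEX_WAIT_BITSET with FUTEX_CLOCK_REALTIME takes an absolute deadline, which is why futex_init_timeout() only adds the current time for plain FUTEX_WAIT:

#include <stdint.h>
#include <time.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/futex.h>

/* Sketch: wait on *uaddr == val until an absolute CLOCK_REALTIME
 * deadline; setup of the futex word itself is assumed elsewhere. */
static int futex_wait_deadline(uint32_t *uaddr, uint32_t val,
			       const struct timespec *abs_deadline)
{
	return syscall(SYS_futex, uaddr,
		       FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME,
		       val, abs_deadline, NULL, FUTEX_BITSET_MATCH_ANY);
}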
 
index f62de2d..58f87a3 100644 (file)
@@ -4,6 +4,7 @@ menu "GCOV-based kernel profiling"
 config GCOV_KERNEL
        bool "Enable gcov-based kernel profiling"
        depends on DEBUG_FS
+       depends on !CC_IS_CLANG || CLANG_VERSION >= 110000
        select CONSTRUCTORS
        default n
        help
index 0ffe9f1..073a373 100644 (file)
@@ -49,6 +49,55 @@ void gcov_enable_events(void)
        mutex_unlock(&gcov_lock);
 }
 
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+       u32 *data;
+
+       if (buffer) {
+               data = buffer + off;
+               *data = v;
+       }
+
+       return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+       u32 *data;
+
+       if (buffer) {
+               data = buffer + off;
+
+               data[0] = (v & 0xffffffffUL);
+               data[1] = (v >> 32);
+       }
+
+       return sizeof(*data) * 2;
+}
+
 #ifdef CONFIG_MODULES
 /* Update list and generate events when modules are unloaded. */
 static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
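
The store helpers are made global above precisely because both convert_to_gcda() implementations rely on their NULL-buffer behavior: a dry run returns the byte count without storing anything. A hedged sketch of that sizing idiom — write_record() and record_alloc_and_fill() are made-up illustrations, with the helper declarations taken from kernel/gcov/gcov.h as changed later in this series:

#include <linux/mm.h>
#include <linux/types.h>
/* store_gcov_u32()/store_gcov_u64() are declared in kernel/gcov/gcov.h */

/* Made-up record writer following the convert_to_gcda() pattern:
 * with a NULL buffer the helpers only report sizes. */
static size_t write_record(void *buffer, u32 tag, u64 value)
{
	size_t pos = 0;

	pos += store_gcov_u32(buffer, pos, tag);
	pos += store_gcov_u64(buffer, pos, value);
	return pos;
}

static void *record_alloc_and_fill(u32 tag, u64 value, size_t *size)
{
	void *buf;

	*size = write_record(NULL, tag, value);		/* dry run: size only */
	buf = kvmalloc(*size, GFP_KERNEL);
	if (buf)
		write_record(buf, tag, value);		/* real pass: fill */
	return buf;
}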
index c466c7f..cbb0bed 100644 (file)
@@ -48,9 +48,8 @@
 #include <linux/list.h>
 #include <linux/printk.h>
 #include <linux/ratelimit.h>
-#include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "gcov.h"
 
 typedef void (*llvm_gcov_callback)(void);
@@ -70,16 +69,10 @@ struct gcov_fn_info {
 
        u32 ident;
        u32 checksum;
-#if CONFIG_CLANG_VERSION < 110000
-       u8 use_extra_checksum;
-#endif
        u32 cfg_checksum;
 
        u32 num_counters;
        u64 *counters;
-#if CONFIG_CLANG_VERSION < 110000
-       const char *function_name;
-#endif
 };
 
 static struct gcov_info *current_info;
@@ -109,16 +102,6 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
 }
 EXPORT_SYMBOL(llvm_gcov_init);
 
-#if CONFIG_CLANG_VERSION < 110000
-void llvm_gcda_start_file(const char *orig_filename, const char version[4],
-               u32 checksum)
-{
-       current_info->filename = orig_filename;
-       memcpy(&current_info->version, version, sizeof(current_info->version));
-       current_info->checksum = checksum;
-}
-EXPORT_SYMBOL(llvm_gcda_start_file);
-#else
 void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
 {
        current_info->filename = orig_filename;
@@ -126,28 +109,7 @@ void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
        current_info->checksum = checksum;
 }
 EXPORT_SYMBOL(llvm_gcda_start_file);
-#endif
-
-#if CONFIG_CLANG_VERSION < 110000
-void llvm_gcda_emit_function(u32 ident, const char *function_name,
-               u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
-{
-       struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
-
-       if (!info)
-               return;
-
-       INIT_LIST_HEAD(&info->head);
-       info->ident = ident;
-       info->checksum = func_checksum;
-       info->use_extra_checksum = use_extra_checksum;
-       info->cfg_checksum = cfg_checksum;
-       if (function_name)
-               info->function_name = kstrdup(function_name, GFP_KERNEL);
 
-       list_add_tail(&info->head, &current_info->functions);
-}
-#else
 void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
 {
        struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
@@ -161,7 +123,6 @@ void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
        info->cfg_checksum = cfg_checksum;
        list_add_tail(&info->head, &current_info->functions);
 }
-#endif
 EXPORT_SYMBOL(llvm_gcda_emit_function);
 
 void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
@@ -292,16 +253,8 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
                !list_is_last(&fn_ptr2->head, &info2->functions)) {
                if (fn_ptr1->checksum != fn_ptr2->checksum)
                        return false;
-#if CONFIG_CLANG_VERSION < 110000
-               if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
-                       return false;
-               if (fn_ptr1->use_extra_checksum &&
-                       fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
-                       return false;
-#else
                if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
                        return false;
-#endif
                fn_ptr1 = list_next_entry(fn_ptr1, head);
                fn_ptr2 = list_next_entry(fn_ptr2, head);
        }
@@ -330,35 +283,6 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
        }
 }
 
-#if CONFIG_CLANG_VERSION < 110000
-static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
-{
-       size_t cv_size; /* counter values size */
-       struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
-                       GFP_KERNEL);
-       if (!fn_dup)
-               return NULL;
-       INIT_LIST_HEAD(&fn_dup->head);
-
-       fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
-       if (!fn_dup->function_name)
-               goto err_name;
-
-       cv_size = fn->num_counters * sizeof(fn->counters[0]);
-       fn_dup->counters = vmalloc(cv_size);
-       if (!fn_dup->counters)
-               goto err_counters;
-       memcpy(fn_dup->counters, fn->counters, cv_size);
-
-       return fn_dup;
-
-err_counters:
-       kfree(fn_dup->function_name);
-err_name:
-       kfree(fn_dup);
-       return NULL;
-}
-#else
 static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
 {
        size_t cv_size; /* counter values size */
@@ -369,7 +293,7 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
        INIT_LIST_HEAD(&fn_dup->head);
 
        cv_size = fn->num_counters * sizeof(fn->counters[0]);
-       fn_dup->counters = vmalloc(cv_size);
+       fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL);
        if (!fn_dup->counters) {
                kfree(fn_dup);
                return NULL;
@@ -379,7 +303,6 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
 
        return fn_dup;
 }
-#endif
 
 /**
  * gcov_info_dup - duplicate profiling data set
@@ -420,99 +343,18 @@ err:
  * gcov_info_free - release memory for profiling data set duplicate
  * @info: profiling data set duplicate to free
  */
-#if CONFIG_CLANG_VERSION < 110000
-void gcov_info_free(struct gcov_info *info)
-{
-       struct gcov_fn_info *fn, *tmp;
-
-       list_for_each_entry_safe(fn, tmp, &info->functions, head) {
-               kfree(fn->function_name);
-               vfree(fn->counters);
-               list_del(&fn->head);
-               kfree(fn);
-       }
-       kfree(info->filename);
-       kfree(info);
-}
-#else
 void gcov_info_free(struct gcov_info *info)
 {
        struct gcov_fn_info *fn, *tmp;
 
        list_for_each_entry_safe(fn, tmp, &info->functions, head) {
-               vfree(fn->counters);
+               kvfree(fn->counters);
                list_del(&fn->head);
                kfree(fn);
        }
        kfree(info->filename);
        kfree(info);
 }
-#endif
-
-#define ITER_STRIDE    PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
-       struct gcov_info *info;
-       void *buffer;
-       size_t size;
-       loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
-       u32 *data;
-
-       if (buffer) {
-               data = buffer + off;
-               *data = v;
-       }
-
-       return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
-       u32 *data;
-
-       if (buffer) {
-               data = buffer + off;
-
-               data[0] = (v & 0xffffffffUL);
-               data[1] = (v >> 32);
-       }
-
-       return sizeof(*data) * 2;
-}
 
 /**
  * convert_to_gcda - convert profiling data set to gcda file format
@@ -521,7 +363,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
  *
  * Returns the number of bytes that were/would have been stored into the buffer.
  */
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
 {
        struct gcov_fn_info *fi_ptr;
        size_t pos = 0;
@@ -535,21 +377,10 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
                u32 i;
 
                pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
-#if CONFIG_CLANG_VERSION < 110000
-               pos += store_gcov_u32(buffer, pos,
-                       fi_ptr->use_extra_checksum ? 3 : 2);
-#else
                pos += store_gcov_u32(buffer, pos, 3);
-#endif
                pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
                pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
-#if CONFIG_CLANG_VERSION < 110000
-               if (fi_ptr->use_extra_checksum)
-                       pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
-#else
                pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
-#endif
-
                pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
                pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
                for (i = 0; i < fi_ptr->num_counters; i++)
@@ -558,102 +389,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
 
        return pos;
 }
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
-       struct gcov_iterator *iter;
-
-       iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
-       if (!iter)
-               goto err_free;
-
-       iter->info = info;
-       /* Dry-run to get the actual buffer size. */
-       iter->size = convert_to_gcda(NULL, info);
-       iter->buffer = vmalloc(iter->size);
-       if (!iter->buffer)
-               goto err_free;
-
-       convert_to_gcda(iter->buffer, info);
-
-       return iter;
-
-err_free:
-       kfree(iter);
-       return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
-       vfree(iter->buffer);
-       kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
-       return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
-       iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
-       if (iter->pos < iter->size)
-               iter->pos += ITER_STRIDE;
-
-       if (iter->pos >= iter->size)
-               return -EINVAL;
-
-       return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
-       size_t len;
-
-       if (iter->pos >= iter->size)
-               return -EINVAL;
-
-       len = ITER_STRIDE;
-       if (iter->pos + len > iter->size)
-               len = iter->size - iter->pos;
-
-       seq_write(seq, iter->buffer + iter->pos, len);
-
-       return 0;
-}
index 82babf5..5c3086c 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/seq_file.h>
+#include <linux/mm.h>
 #include "gcov.h"
 
 /**
@@ -85,6 +86,115 @@ static int __init gcov_persist_setup(char *str)
 }
 __setup("gcov_persist=", gcov_persist_setup);
 
+#define ITER_STRIDE    PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+       struct gcov_info *info;
+       size_t size;
+       loff_t pos;
+       char buffer[];
+};
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+static struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+       struct gcov_iterator *iter;
+       size_t size;
+
+       /* Dry-run to get the actual buffer size. */
+       size = convert_to_gcda(NULL, info);
+
+       iter = kvmalloc(struct_size(iter, buffer, size), GFP_KERNEL);
+       if (!iter)
+               return NULL;
+
+       iter->info = info;
+       iter->size = size;
+       convert_to_gcda(iter->buffer, info);
+
+       return iter;
+}
+
+
+/**
+ * gcov_iter_free - free iterator data
+ * @iter: file iterator
+ */
+static void gcov_iter_free(struct gcov_iterator *iter)
+{
+       kvfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+static struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+       return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+static void gcov_iter_start(struct gcov_iterator *iter)
+{
+       iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+static int gcov_iter_next(struct gcov_iterator *iter)
+{
+       if (iter->pos < iter->size)
+               iter->pos += ITER_STRIDE;
+
+       if (iter->pos >= iter->size)
+               return -EINVAL;
+
+       return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+static int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+       size_t len;
+
+       if (iter->pos >= iter->size)
+               return -EINVAL;
+
+       len = ITER_STRIDE;
+       if (iter->pos + len > iter->size)
+               len = iter->size - iter->pos;
+
+       seq_write(seq, iter->buffer + iter->pos, len);
+
+       return 0;
+}
+
 /*
  * seq_file.start() implementation for gcov data files. Note that the
  * gcov_iterator interface is designed to be more restrictive than seq_file
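
One design note on gcov_iter_new() above: iterator bookkeeping and file data now live in a single kvmalloc'd object through a flexible array member, so gcov_iter_free() is a single kvfree(). A hedged, generic sketch of the same allocation pattern, with illustrative names:

#include <linux/mm.h>		/* kvmalloc()/kvfree() */
#include <linux/overflow.h>	/* struct_size() */

/* Generic flexible-array allocation sketch (names are made up). */
struct blob {
	size_t size;
	char data[];
};

static struct blob *blob_new(size_t size)
{
	/* struct_size() guards the header + payload sum against overflow. */
	struct blob *b = kvmalloc(struct_size(b, data, size), GFP_KERNEL);

	if (b)
		b->size = size;
	return b;
}

Teardown is then just kvfree(b), with no separate buffer pointer to track.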
index c53408a..460c12b 100644 (file)
@@ -15,8 +15,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "gcov.h"
 
 #if (__GNUC__ >= 10)
@@ -310,7 +309,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
 
                        cv_size = sizeof(gcov_type) * sci_ptr->num;
 
-                       dci_ptr->values = vmalloc(cv_size);
+                       dci_ptr->values = kvmalloc(cv_size, GFP_KERNEL);
 
                        if (!dci_ptr->values)
                                goto err_free;
@@ -352,7 +351,7 @@ void gcov_info_free(struct gcov_info *info)
                ci_ptr = info->functions[fi_idx]->ctrs;
 
                for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
-                       vfree(ci_ptr->values);
+                       kvfree(ci_ptr->values);
 
                kfree(info->functions[fi_idx]);
        }
@@ -363,71 +362,6 @@ free_info:
        kfree(info);
 }
 
-#define ITER_STRIDE    PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
-       struct gcov_info *info;
-       void *buffer;
-       size_t size;
-       loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
-       u32 *data;
-
-       if (buffer) {
-               data = buffer + off;
-               *data = v;
-       }
-
-       return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
-       u32 *data;
-
-       if (buffer) {
-               data = buffer + off;
-
-               data[0] = (v & 0xffffffffUL);
-               data[1] = (v >> 32);
-       }
-
-       return sizeof(*data) * 2;
-}
-
 /**
  * convert_to_gcda - convert profiling data set to gcda file format
  * @buffer: the buffer to store file data or %NULL if no data should be stored
@@ -435,7 +369,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
  *
  * Returns the number of bytes that were/would have been stored into the buffer.
  */
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
 {
        struct gcov_fn_info *fi_ptr;
        struct gcov_ctr_info *ci_ptr;
@@ -481,102 +415,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
 
        return pos;
 }
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
-       struct gcov_iterator *iter;
-
-       iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
-       if (!iter)
-               goto err_free;
-
-       iter->info = info;
-       /* Dry-run to get the actual buffer size. */
-       iter->size = convert_to_gcda(NULL, info);
-       iter->buffer = vmalloc(iter->size);
-       if (!iter->buffer)
-               goto err_free;
-
-       convert_to_gcda(iter->buffer, info);
-
-       return iter;
-
-err_free:
-       kfree(iter);
-       return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
-       vfree(iter->buffer);
-       kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
-       return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
-       iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
-       if (iter->pos < iter->size)
-               iter->pos += ITER_STRIDE;
-
-       if (iter->pos >= iter->size)
-               return -EINVAL;
-
-       return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
-       size_t len;
-
-       if (iter->pos >= iter->size)
-               return -EINVAL;
-
-       len = ITER_STRIDE;
-       if (iter->pos + len > iter->size)
-               len = iter->size - iter->pos;
-
-       seq_write(seq, iter->buffer + iter->pos, len);
-
-       return 0;
-}
index 6ab2c18..912b8ea 100644 (file)
@@ -48,6 +48,7 @@ struct gcov_info *gcov_info_next(struct gcov_info *info);
 void gcov_info_link(struct gcov_info *info);
 void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
 bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
+size_t convert_to_gcda(char *buffer, struct gcov_info *info);
 
 /* Base interface. */
 enum gcov_action {
@@ -58,16 +59,9 @@ enum gcov_action {
 void gcov_event(enum gcov_action action, struct gcov_info *info);
 void gcov_enable_events(void);
 
-/* Iterator control. */
-struct seq_file;
-struct gcov_iterator;
-
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
-void gcov_iter_free(struct gcov_iterator *iter);
-void gcov_iter_start(struct gcov_iterator *iter);
-int gcov_iter_next(struct gcov_iterator *iter);
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+/* writing helpers */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v);
+size_t store_gcov_u64(void *buffer, size_t off, u64 v);
 
 /* gcov_info control. */
 void gcov_info_reset(struct gcov_info *info);
index a0b6780..f099bae 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/compiler.h>
 #include <linux/hugetlb.h>
 #include <linux/objtool.h>
+#include <linux/kmsg_dump.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -1165,7 +1166,7 @@ int kernel_kexec(void)
 #endif
        {
                kexec_in_progress = true;
-               kernel_restart_prepare(NULL);
+               kernel_restart_prepare("kexec reboot");
                migrate_to_reboot_cpu();
 
                /*
@@ -1179,6 +1180,7 @@ int kernel_kexec(void)
                machine_shutdown();
        }
 
+       kmsg_dump(KMSG_DUMP_SHUTDOWN);
        machine_kexec(kexec_image);
 
 #ifdef CONFIG_KEXEC_JUMP
index 5c3447c..33400ff 100644 (file)
@@ -740,8 +740,10 @@ static int kexec_calculate_store_digests(struct kimage *image)
 
        sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
        sha_regions = vzalloc(sha_region_sz);
-       if (!sha_regions)
+       if (!sha_regions) {
+               ret = -ENOMEM;
                goto out_free_desc;
+       }
 
        desc->tfm   = tfm;
 
index 3cd075c..b717134 100644 (file)
@@ -58,7 +58,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
 /*
        modprobe_path is set via /proc/sys.
 */
-char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;
 
 static void free_modprobe_argv(struct subprocess_info *info)
 {
index b94f383..ec36b73 100644 (file)
@@ -66,12 +66,12 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
        arch_spin_lock(&lock->wait_lock);
 
        /* Try to acquire the lock directly if no reader is present */
-       if (!atomic_read(&lock->cnts) &&
-           (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
+       if (!(cnts = atomic_read(&lock->cnts)) &&
+           atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED))
                goto unlock;
 
        /* Set the waiting flag to notify readers that a writer is pending */
-       atomic_add(_QW_WAITING, &lock->cnts);
+       atomic_or(_QW_WAITING, &lock->cnts);
 
        /* When no more readers or writers, set the locked flag */
        do {
index 627e61b..028a5ab 100644 (file)
@@ -64,12 +64,8 @@ static DEFINE_RWLOCK(resource_lock);
 static struct resource *bootmem_resource_free;
 static DEFINE_SPINLOCK(bootmem_resource_lock);
 
-static struct resource *next_resource(struct resource *p, bool sibling_only)
+static struct resource *next_resource(struct resource *p)
 {
-       /* Caller wants to traverse through siblings only */
-       if (sibling_only)
-               return p->sibling;
-
        if (p->child)
                return p->child;
        while (!p->sibling && p->parent)
@@ -81,7 +77,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
 {
        struct resource *p = v;
        (*pos)++;
-       return (void *)next_resource(p, false);
+       return (void *)next_resource(p);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -330,14 +326,10 @@ EXPORT_SYMBOL(release_resource);
  * of the resource that's within [@start..@end]; if none is found, returns
  * -ENODEV.  Returns -EINVAL for invalid parameters.
  *
- * This function walks the whole tree and not just first level children
- * unless @first_lvl is true.
- *
  * @start:     start address of the resource searched for
  * @end:       end address of same resource
  * @flags:     flags which the resource must have
  * @desc:      descriptor the resource must have
- * @first_lvl: walk only the first level children, if set
  * @res:       return ptr, if resource found
  *
  * The caller must specify @start, @end, @flags, and @desc
@@ -345,9 +337,8 @@ EXPORT_SYMBOL(release_resource);
  */
 static int find_next_iomem_res(resource_size_t start, resource_size_t end,
                               unsigned long flags, unsigned long desc,
-                              bool first_lvl, struct resource *res)
+                              struct resource *res)
 {
-       bool siblings_only = true;
        struct resource *p;
 
        if (!res)
@@ -358,7 +349,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 
        read_lock(&resource_lock);
 
-       for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) {
+       for (p = iomem_resource.child; p; p = next_resource(p)) {
                /* If we passed the resource we are looking for, stop */
                if (p->start > end) {
                        p = NULL;
@@ -369,13 +360,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
                if (p->end < start)
                        continue;
 
-               /*
-                * Now that we found a range that matches what we look for,
-                * check the flags and the descriptor. If we were not asked to
-                * use only the first level, start looking at children as well.
-                */
-               siblings_only = first_lvl;
-
                if ((p->flags & flags) != flags)
                        continue;
                if ((desc != IORES_DESC_NONE) && (desc != p->desc))
@@ -402,14 +386,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 
 static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
                                 unsigned long flags, unsigned long desc,
-                                bool first_lvl, void *arg,
+                                void *arg,
                                 int (*func)(struct resource *, void *))
 {
        struct resource res;
        int ret = -EINVAL;
 
        while (start < end &&
-              !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
+              !find_next_iomem_res(start, end, flags, desc, &res)) {
                ret = (*func)(&res, arg);
                if (ret)
                        break;
@@ -431,7 +415,6 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
  * @arg: function argument for the callback @func
  * @func: callback function that is called for each qualifying resource area
  *
- * This walks through whole tree and not just first level children.
  * All the memory ranges which overlap start,end and also match flags and
  * desc are valid candidates.
  *
@@ -441,7 +424,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
 int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
                u64 end, void *arg, int (*func)(struct resource *, void *))
 {
-       return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
+       return __walk_iomem_res_desc(start, end, flags, desc, arg, func);
 }
 EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
 
@@ -457,8 +440,8 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 {
        unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
-                                    arg, func);
+       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+                                    func);
 }
 
 /*
@@ -470,17 +453,14 @@ int walk_mem_res(u64 start, u64 end, void *arg,
 {
        unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 
-       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
-                                    arg, func);
+       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+                                    func);
 }
 
 /*
  * This function calls the @func callback against all memory ranges of type
  * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
  * It is to be used only for System RAM.
- *
- * This will find System RAM ranges that are children of top-level resources
- * in addition to top-level System RAM resources.
  */
 int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                          void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -495,8 +475,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
        end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
        flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        while (start < end &&
-              !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
-                                   false, &res)) {
+              !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) {
                pfn = PFN_UP(res.start);
                end_pfn = PFN_DOWN(res.end + 1);
                if (end_pfn > pfn)
@@ -523,6 +502,34 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
+static int __region_intersects(resource_size_t start, size_t size,
+                       unsigned long flags, unsigned long desc)
+{
+       struct resource res;
+       int type = 0; int other = 0;
+       struct resource *p;
+
+       res.start = start;
+       res.end = start + size - 1;
+
+       for (p = iomem_resource.child; p ; p = p->sibling) {
+               bool is_type = (((p->flags & flags) == flags) &&
+                               ((desc == IORES_DESC_NONE) ||
+                                (desc == p->desc)));
+
+               if (resource_overlaps(p, &res))
+                       is_type ? type++ : other++;
+       }
+
+       if (type == 0)
+               return REGION_DISJOINT;
+
+       if (other == 0)
+               return REGION_INTERSECTS;
+
+       return REGION_MIXED;
+}
+
 /**
  * region_intersects() - determine intersection of region with known resources
  * @start: region start address
@@ -546,31 +553,13 @@ EXPORT_SYMBOL_GPL(page_is_ram);
 int region_intersects(resource_size_t start, size_t size, unsigned long flags,
                      unsigned long desc)
 {
-       struct resource res;
-       int type = 0; int other = 0;
-       struct resource *p;
-
-       res.start = start;
-       res.end = start + size - 1;
+       int ret;
 
        read_lock(&resource_lock);
-       for (p = iomem_resource.child; p ; p = p->sibling) {
-               bool is_type = (((p->flags & flags) == flags) &&
-                               ((desc == IORES_DESC_NONE) ||
-                                (desc == p->desc)));
-
-               if (resource_overlaps(p, &res))
-                       is_type ? type++ : other++;
-       }
+       ret = __region_intersects(start, size, flags, desc);
        read_unlock(&resource_lock);
 
-       if (type == 0)
-               return REGION_DISJOINT;
-
-       if (other == 0)
-               return REGION_INTERSECTS;
-
-       return REGION_MIXED;
+       return ret;
 }
 EXPORT_SYMBOL_GPL(region_intersects);
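Factoring the body into __region_intersects() lets __request_free_mem_region() (further down in this file) perform the same check while already holding resource_lock. The exported wrapper keeps its contract; a typical call site, sketched:

	/* Sketch: only proceed when the range is entirely System RAM. */
	if (region_intersects(addr, size, IORESOURCE_SYSTEM_RAM,
			      IORES_DESC_NONE) != REGION_INTERSECTS)
		return -ENXIO;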
 
@@ -1171,31 +1160,16 @@ struct address_space *iomem_get_mapping(void)
        return smp_load_acquire(&iomem_inode)->i_mapping;
 }
 
-/**
- * __request_region - create a new busy resource region
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- * @name: reserving caller's ID string
- * @flags: IO resource flags
- */
-struct resource * __request_region(struct resource *parent,
+static int __request_region_locked(struct resource *res, struct resource *parent,
                                   resource_size_t start, resource_size_t n,
                                   const char *name, int flags)
 {
        DECLARE_WAITQUEUE(wait, current);
-       struct resource *res = alloc_resource(GFP_KERNEL);
-       struct resource *orig_parent = parent;
-
-       if (!res)
-               return NULL;
 
        res->name = name;
        res->start = start;
        res->end = start + n - 1;
 
-       write_lock(&resource_lock);
-
        for (;;) {
                struct resource *conflict;
 
@@ -1231,13 +1205,40 @@ struct resource * __request_region(struct resource *parent,
                        continue;
                }
                /* Uhhuh, that didn't work out.. */
-               free_resource(res);
-               res = NULL;
-               break;
+               return -EBUSY;
        }
+
+       return 0;
+}
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ */
+struct resource *__request_region(struct resource *parent,
+                                 resource_size_t start, resource_size_t n,
+                                 const char *name, int flags)
+{
+       struct resource *res = alloc_resource(GFP_KERNEL);
+       int ret;
+
+       if (!res)
+               return NULL;
+
+       write_lock(&resource_lock);
+       ret = __request_region_locked(res, parent, start, n, name, flags);
        write_unlock(&resource_lock);
 
-       if (res && orig_parent == &iomem_resource)
+       if (ret) {
+               free_resource(res);
+               return NULL;
+       }
+
+       if (parent == &iomem_resource)
                revoke_iomem(res);
 
        return res;
@@ -1779,25 +1780,56 @@ static struct resource *__request_free_mem_region(struct device *dev,
 {
        resource_size_t end, addr;
        struct resource *res;
+       struct region_devres *dr = NULL;
 
        size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
        end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
        addr = end - size + 1UL;
 
+       res = alloc_resource(GFP_KERNEL);
+       if (!res)
+               return ERR_PTR(-ENOMEM);
+
+       if (dev) {
+               dr = devres_alloc(devm_region_release,
+                               sizeof(struct region_devres), GFP_KERNEL);
+               if (!dr) {
+                       free_resource(res);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+
+       write_lock(&resource_lock);
        for (; addr > size && addr >= base->start; addr -= size) {
-               if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
+               if (__region_intersects(addr, size, 0, IORES_DESC_NONE) !=
                                REGION_DISJOINT)
                        continue;
 
-               if (dev)
-                       res = devm_request_mem_region(dev, addr, size, name);
-               else
-                       res = request_mem_region(addr, size, name);
-               if (!res)
-                       return ERR_PTR(-ENOMEM);
+               if (!__request_region_locked(res, &iomem_resource, addr, size,
+                                               name, 0))
+                       break;
+
+               if (dev) {
+                       dr->parent = &iomem_resource;
+                       dr->start = addr;
+                       dr->n = size;
+                       devres_add(dev, dr);
+               }
+
                res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+               write_unlock(&resource_lock);
+
+               /*
+                * A driver is claiming this region so revoke any mappings.
+                */
+               revoke_iomem(res);
                return res;
        }
+       write_unlock(&resource_lock);
+
+       free_resource(res);
+       if (dr)
+               devres_free(dr);
 
        return ERR_PTR(-ERANGE);
 }
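The rework above closes a time-of-check/time-of-use window: the intersection test and the region request used to be two independently locked steps, so a concurrent caller could claim the candidate range in between. Both steps now run under a single write_lock. The pattern in miniature (sketch, error paths elided):

	write_lock(&resource_lock);
	if (__region_intersects(addr, size, 0, IORES_DESC_NONE) ==
		    REGION_DISJOINT &&
	    !__request_region_locked(res, &iomem_resource, addr, size,
				     name, 0)) {
		/* the range is ours; no window for a concurrent claimer */
	}
	write_unlock(&resource_lock);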
index 9143163..5226cc2 100644 (file)
@@ -938,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
 
 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
 {
-       return clamp_value / UCLAMP_BUCKET_DELTA;
+       return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
 }
 
 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
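The min_t() clamp matters because UCLAMP_BUCKET_DELTA is a rounded division: for bucket counts where delta * buckets < SCHED_CAPACITY_SCALE, the maximum clamp value indexes one past the last bucket. A self-contained demo of the arithmetic (assuming 20 buckets, the largest CONFIG_UCLAMP_BUCKETS_COUNT; DIV_ROUND_CLOSEST is simplified for unsigned operands):

	#include <stdio.h>

	#define SCHED_CAPACITY_SCALE	1024
	#define UCLAMP_BUCKETS		20	/* assumed config value */
	#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))
	#define UCLAMP_BUCKET_DELTA \
		DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

	int main(void)
	{
		unsigned int v = SCHED_CAPACITY_SCALE;

		/* delta is 51, and 1024 / 51 == 20 -- but valid ids are 0..19 */
		printf("unclamped bucket id = %u\n", v / UCLAMP_BUCKET_DELTA);
		return 0;
	}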
index 1d75af1..20aa234 100644 (file)
@@ -10878,16 +10878,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq;
 
+       list_add_leaf_cfs_rq(cfs_rq_of(se));
+
        /* Start to propagate at parent */
        se = se->parent;
 
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
 
-               if (cfs_rq_throttled(cfs_rq))
-                       break;
+               if (!cfs_rq_throttled(cfs_rq)) {
+                       update_load_avg(cfs_rq, se, UPDATE_TG);
+                       list_add_leaf_cfs_rq(cfs_rq);
+                       continue;
+               }
 
-               update_load_avg(cfs_rq, se, UPDATE_TG);
+               if (list_add_leaf_cfs_rq(cfs_rq))
+                       break;
        }
 }
 #else
index db27b69..cc25a3c 100644 (file)
@@ -972,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
  */
 void cgroup_move_task(struct task_struct *task, struct css_set *to)
 {
-       unsigned int task_flags = 0;
+       unsigned int task_flags;
        struct rq_flags rf;
        struct rq *rq;
 
@@ -987,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
        rq = task_rq_lock(task, &rf);
 
-       if (task_on_rq_queued(task)) {
-               task_flags = TSK_RUNNING;
-               if (task_current(rq, task))
-                       task_flags |= TSK_ONCPU;
-       } else if (task->in_iowait)
-               task_flags = TSK_IOWAIT;
-
-       if (task->in_memstall)
-               task_flags |= TSK_MEMSTALL;
+       /*
+        * We may race with schedule() dropping the rq lock between
+        * deactivating prev and switching to next. Because the psi
+        * updates from the deactivation are deferred to the switch
+        * callback to save cgroup tree updates, the task's scheduling
+        * state here is not coherent with its psi state:
+        *
+        * schedule()                   cgroup_move_task()
+        *   rq_lock()
+        *   deactivate_task()
+        *     p->on_rq = 0
+        *     psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+        *   pick_next_task()
+        *     rq_unlock()
+        *                                rq_lock()
+        *                                psi_task_change() // old cgroup
+        *                                task->cgroups = to
+        *                                psi_task_change() // new cgroup
+        *                                rq_unlock()
+        *     rq_lock()
+        *   psi_sched_switch() // does deferred updates in new cgroup
+        *
+        * Don't rely on the scheduling state. Use psi_flags instead.
+        */
+       task_flags = task->psi_flags;
 
        if (task_flags)
                psi_task_change(task, task_flags, 0);
index e210749..52bf159 100644 (file)
@@ -211,7 +211,7 @@ static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type)
        } while (0)
 
 /* Record current CSD work for current CPU, NULL to erase. */
-static void __csd_lock_record(call_single_data_t *csd)
+static void __csd_lock_record(struct __call_single_data *csd)
 {
        if (!csd) {
                smp_mb(); /* NULL cur_csd after unlock. */
@@ -226,13 +226,13 @@ static void __csd_lock_record(call_single_data_t *csd)
                  /* Or before unlock, as the case may be. */
 }
 
-static __always_inline void csd_lock_record(call_single_data_t *csd)
+static __always_inline void csd_lock_record(struct __call_single_data *csd)
 {
        if (static_branch_unlikely(&csdlock_debug_enabled))
                __csd_lock_record(csd);
 }
 
-static int csd_lock_wait_getcpu(call_single_data_t *csd)
+static int csd_lock_wait_getcpu(struct __call_single_data *csd)
 {
        unsigned int csd_type;
 
@@ -282,7 +282,7 @@ static const char *csd_lock_get_type(unsigned int type)
        return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type];
 }
 
-static void csd_lock_print_extended(call_single_data_t *csd, int cpu)
+static void csd_lock_print_extended(struct __call_single_data *csd, int cpu)
 {
        struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu);
        unsigned int srccpu = csd->node.src;
@@ -321,7 +321,7 @@ static void csd_lock_print_extended(call_single_data_t *csd, int cpu)
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
  * so waiting on other types gets much less information.
  */
-static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
 {
        int cpu = -1;
        int cpux;
@@ -387,7 +387,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static void __csd_lock_wait(call_single_data_t *csd)
+static void __csd_lock_wait(struct __call_single_data *csd)
 {
        int bug_id = 0;
        u64 ts0, ts1;
@@ -401,7 +401,7 @@ static void __csd_lock_wait(call_single_data_t *csd)
        smp_acquire__after_ctrl_dep();
 }
 
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 {
        if (static_branch_unlikely(&csdlock_debug_enabled)) {
                __csd_lock_wait(csd);
@@ -431,17 +431,17 @@ static void __smp_call_single_queue_debug(int cpu, struct llist_node *node)
 #else
 #define cfd_seq_store(var, src, dst, type)
 
-static void csd_lock_record(call_single_data_t *csd)
+static void csd_lock_record(struct __call_single_data *csd)
 {
 }
 
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 {
        smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #endif
 
-static __always_inline void csd_lock(call_single_data_t *csd)
+static __always_inline void csd_lock(struct __call_single_data *csd)
 {
        csd_lock_wait(csd);
        csd->node.u_flags |= CSD_FLAG_LOCK;
@@ -454,7 +454,7 @@ static __always_inline void csd_lock(call_single_data_t *csd)
        smp_wmb();
 }
 
-static __always_inline void csd_unlock(call_single_data_t *csd)
+static __always_inline void csd_unlock(struct __call_single_data *csd)
 {
        WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 
@@ -501,7 +501,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, call_single_data_t *csd)
+static int generic_exec_single(int cpu, struct __call_single_data *csd)
 {
        if (cpu == smp_processor_id()) {
                smp_call_func_t func = csd->func;
@@ -784,7 +784,7 @@ EXPORT_SYMBOL(smp_call_function_single);
  * NOTE: Be careful, there is unfortunately no current debugging facility to
  * validate the correctness of this serialization.
  */
-int smp_call_function_single_async(int cpu, call_single_data_t *csd)
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
 {
        int err = 0;
 
index 3d62c95..3a583a2 100644 (file)
@@ -1590,7 +1590,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 
        /*
         * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
-        * infite. In case of RLIM_INFINITY the posix CPU timer code
+        * infinite. In case of RLIM_INFINITY the posix CPU timer code
         * ignores the rlimit.
         */
         if (!retval && new_rlim && resource == RLIMIT_CPU &&
@@ -2029,7 +2029,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
        }
 
        /*
-        * arg_lock protects concurent updates but we still need mmap_lock for
+        * arg_lock protects concurrent updates but we still need mmap_lock for
         * read to exclude races with sys_brk.
         */
        mmap_read_lock(mm);
@@ -2041,7 +2041,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
         * output in procfs mostly, except
         *
         *  - @start_brk/@brk which are used in do_brk_flags but kernel lookups
-        *    for VMAs when updating these memvers so anything wrong written
+        *    for VMAs when updating these members so anything wrong written
         *    here cause kernel to swear at userspace program but won't lead
         *    to any problem in kernel itself
         */
@@ -2143,7 +2143,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
        error = -EINVAL;
 
        /*
-        * arg_lock protects concurent updates of arg boundaries, we need
+        * arg_lock protects concurrent updates of arg boundaries, we need
         * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
         * validation.
         */
@@ -2210,7 +2210,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
         * If command line arguments and environment
         * are placed somewhere else on stack, we can
         * set them up here, ARG_START/END to setup
-        * command line argumets and ENV_START/END
+        * command line arguments and ENV_START/END
         * for environment.
         */
        case PR_SET_MM_START_STACK:
@@ -2258,8 +2258,8 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti
 static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 {
        /*
-        * If task has has_child_subreaper - all its decendants
-        * already have these flag too and new decendants will
+        * If task has has_child_subreaper - all its descendants
+        * already have this flag too and new descendants will
         * inherit it on fork, skip them.
         *
         * If we've found child_reaper - skip descendants in
index f91d327..14edf84 100644 (file)
@@ -2830,7 +2830,7 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_COMPACTION
        {
                .procname       = "compact_memory",
-               .data           = &sysctl_compact_memory,
+               .data           = NULL,
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = sysctl_compaction_handler,
index 792b558..2e8a3fd 100644 (file)
@@ -5624,7 +5624,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 
        parser = &iter->parser;
        if (trace_parser_loaded(parser)) {
-               ftrace_match_records(iter->hash, parser->buffer, parser->idx);
+               int enable = !(iter->flags & FTRACE_ITER_NOTRACE);
+
+               ftrace_process_regex(iter, parser->buffer,
+                                    parser->idx, enable);
        }
 
        trace_parser_put(parser);
index 3f64661..36c1233 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
+#include <linux/initrd.h>
 
 #include <trace/events/module.h>
 
@@ -107,6 +108,7 @@ static int call_usermodehelper_exec_async(void *data)
 
        commit_creds(new);
 
+       wait_for_initramfs();
        retval = kernel_execve(sub_info->path,
                               (const char *const *)sub_info->argv,
                               (const char *const *)sub_info->envp);
@@ -336,8 +338,8 @@ static void helper_unlock(void)
  * @argv: arg vector for process
  * @envp: environment for process
  * @gfp_mask: gfp mask for memory allocation
- * @cleanup: a cleanup function
  * @init: an init function
+ * @cleanup: a cleanup function
  * @data: arbitrary context sensitive data
  *
  * Returns either %NULL on allocation failure, or a subprocess_info
@@ -348,7 +350,7 @@ static void helper_unlock(void)
  * exec.  A non-zero return code causes the process to error out, exit,
  * and return the failure to the calling process
  *
- * The cleanup function is just before ethe subprocess_info is about to
+ * The cleanup function is called just before the subprocess_info is about to
  * be freed.  This can be used for freeing the argv and envp.  The
  * Function must be runnable in either a process context or the
  * context in which call_usermodehelper_exec is called.
@@ -384,7 +386,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
 
 /**
  * call_usermodehelper_exec - start a usermode application
- * @sub_info: information about the subprocessa
+ * @sub_info: information about the subprocess
  * @wait: wait for the application to finish and return status.
  *        when UMH_NO_WAIT don't wait at all, but you get no useful error back
  *        when the program couldn't be exec'ed. This makes it safe to call
index bf20b4a..a38b8b0 100644 (file)
@@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
-int smp_call_function_single_async(int cpu, call_single_data_t *csd)
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
 {
        unsigned long flags;
 
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(smp_call_function_single_async);
 
 /*
  * Preemption is disabled here to make sure the cond_func is called under the
- * same condtions in UP and SMP.
+ * same conditions in UP and SMP.
  */
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
                           void *info, bool wait, const struct cpumask *mask)
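The prototype changes in the smp.c and up.c hunks above work because call_single_data_t is (from memory; treat this as a sketch) an over-aligned typedef of the same struct:

	typedef struct __call_single_data call_single_data_t
		__aligned(sizeof(struct __call_single_data));

Taking struct __call_single_data in the prototypes means these functions no longer demand that alignment from every pointer passed in; only storage declarations keep using the aligned typedef.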
index 9a4b980..8d62863 100644 (file)
@@ -85,7 +85,7 @@ int create_user_ns(struct cred *new)
        /*
         * Verify that we can not violate the policy of which files
         * may be accessed that is specified by the root directory,
-        * by verifing that the root directory is at the root of the
+        * by verifying that the root directory is at the root of the
         * mount namespace which allows all files to be accessed.
         */
        ret = -EPERM;
@@ -1014,7 +1014,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
                        goto out;
                ret = -EINVAL;
        }
-       /* Be very certaint the new map actually exists */
+       /* Be very certain the new map actually exists */
        if (new_map.nr_extents == 0)
                goto out;
 
@@ -1169,7 +1169,7 @@ static bool new_idmap_permitted(const struct file *file,
 
        /* Allow the specified ids if we have the appropriate capability
         * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
-        * And the opener of the id file also had the approprpiate capability.
+        * And the opener of the id file also has the appropriate capability.
         */
        if (ns_capable(ns->parent, cap_setid) &&
            file_ns_capable(file, ns->parent, cap_setid))
index 327cb2c..5e7fa54 100644 (file)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
-gen_crc32table
-gen_crc64table
-crc32table.h
-crc64table.h
-oid_registry_data.c
+/crc32table.h
+/crc64table.h
+/gen_crc32table
+/gen_crc64table
+/oid_registry_data.c
index 78f50cc..e641add 100644 (file)
@@ -7,6 +7,7 @@ menuconfig KFENCE
        bool "KFENCE: low-overhead sampling-based memory safety error detector"
        depends on HAVE_ARCH_KFENCE && (SLAB || SLUB)
        select STACKTRACE
+       select IRQ_WORK
        help
          KFENCE is a low-overhead sampling-based detector of heap out-of-bounds
          access, use-after-free, and invalid-free errors. KFENCE is designed
index 7c031ee..c8095f3 100644 (file)
--- a/lib/bch.c
+++ b/lib/bch.c
@@ -584,7 +584,7 @@ static int find_affine4_roots(struct bch_control *bch, unsigned int a,
        k = a_log(bch, a);
        rows[0] = c;
 
-       /* buid linear system to solve X^4+aX^2+bX+c = 0 */
+       /* build linear system to solve X^4+aX^2+bX+c = 0 */
        for (i = 0; i < m; i++) {
                rows[i+1] = bch->a_pow_tab[4*i]^
                        (a ? bch->a_pow_tab[mod_s(bch, k)] : 0)^
index 595a5a7..1ad8e50 100644 (file)
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(crc8_populate_lsb);
  * @nbytes: number of bytes in data buffer.
  * @crc: previous returned crc8 value.
  */
-u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc)
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc)
 {
        /* loop over the buffer data */
        while (nbytes-- > 0)
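Adding const lets callers hand crc8() read-only buffers without a cast. Typical use of the API, as a hedged sketch (table name, helper names and polynomial are illustrative):

	#include <linux/crc8.h>

	DECLARE_CRC8_TABLE(my_crc_table);

	static void my_crc_init(void)
	{
		crc8_populate_msb(my_crc_table, 0x07);	/* example polynomial */
	}

	static u8 my_checksum(const u8 *msg, size_t len)
	{
		/* a const buffer now passes cleanly */
		return crc8(my_crc_table, msg, len, CRC8_INIT_VALUE);
	}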
index 1cf409e..20a8580 100644 (file)
@@ -391,7 +391,7 @@ static inline int INIT process_bit0(struct writer *wr, struct rc *rc,
 static inline int INIT process_bit1(struct writer *wr, struct rc *rc,
                                            struct cstate *cst, uint16_t *p,
                                            int pos_state, uint16_t *prob) {
-  int offset;
+       int offset;
        uint16_t *prob_len;
        int num_bits;
        int len;
index f67f86f..0f8e2e3 100644 (file)
@@ -29,7 +29,7 @@
  *    searching it for one bits.
  *  - The optional "addr2", which is anded with "addr1" if present.
  */
-static unsigned long _find_next_bit(const unsigned long *addr1,
+unsigned long _find_next_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long nbits,
                unsigned long start, unsigned long invert, unsigned long le)
 {
@@ -68,44 +68,14 @@ static unsigned long _find_next_bit(const unsigned long *addr1,
 
        return min(start + __ffs(tmp), nbits);
 }
-#endif
-
-#ifndef find_next_bit
-/*
- * Find the next set bit in a memory region.
- */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-                           unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
-}
-EXPORT_SYMBOL(find_next_bit);
-#endif
-
-#ifndef find_next_zero_bit
-unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
-                                unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
-}
-EXPORT_SYMBOL(find_next_zero_bit);
-#endif
-
-#if !defined(find_next_and_bit)
-unsigned long find_next_and_bit(const unsigned long *addr1,
-               const unsigned long *addr2, unsigned long size,
-               unsigned long offset)
-{
-       return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
-}
-EXPORT_SYMBOL(find_next_and_bit);
+EXPORT_SYMBOL(_find_next_bit);
 #endif
 
 #ifndef find_first_bit
 /*
  * Find the first set bit in a memory region.
  */
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
 {
        unsigned long idx;
 
@@ -116,14 +86,14 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
 
        return size;
 }
-EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(_find_first_bit);
 #endif
 
 #ifndef find_first_zero_bit
 /*
  * Find the first cleared bit in a memory region.
  */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
 {
        unsigned long idx;
 
@@ -134,11 +104,11 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
 
        return size;
 }
-EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(_find_first_zero_bit);
 #endif
 
 #ifndef find_last_bit
-unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
 {
        if (size) {
                unsigned long val = BITMAP_LAST_WORD_MASK(size);
@@ -154,31 +124,9 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
        }
        return size;
 }
-EXPORT_SYMBOL(find_last_bit);
-#endif
-
-#ifdef __BIG_ENDIAN
-
-#ifndef find_next_zero_bit_le
-unsigned long find_next_zero_bit_le(const void *addr, unsigned
-               long size, unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
-}
-EXPORT_SYMBOL(find_next_zero_bit_le);
-#endif
-
-#ifndef find_next_bit_le
-unsigned long find_next_bit_le(const void *addr, unsigned
-               long size, unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
-}
-EXPORT_SYMBOL(find_next_bit_le);
+EXPORT_SYMBOL(_find_last_bit);
 #endif
 
-#endif /* __BIG_ENDIAN */
-
 unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
                               unsigned long size, unsigned long offset)
 {
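With _find_first_bit()/_find_next_bit() exported, the public find_* entry points become header inlines, letting constant small-sized bitmaps collapse to a single word operation at compile time. A simplified sketch of the header-side wrapper (the real one also carries a small_const_nbits() fast path):

	static inline unsigned long find_next_bit(const unsigned long *addr,
						  unsigned long size,
						  unsigned long offset)
	{
		return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
	}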
index 5dcf9cd..9a57257 100644 (file)
@@ -642,6 +642,7 @@ EXPORT_SYMBOL(gen_pool_set_algo);
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
  * @pool: pool to find the fit region memory from
+ * @start_addr: not used in this function
  */
 unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
                unsigned long start, unsigned int nr, void *data,
@@ -660,6 +661,7 @@ EXPORT_SYMBOL(gen_pool_first_fit);
  * @nr: The number of zeroed bits we're looking for
  * @data: data for alignment
  * @pool: pool to get order from
+ * @start_addr: start addr of allocation chunk
  */
 unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size,
                unsigned long start, unsigned int nr, void *data,
@@ -687,6 +689,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_align);
  * @nr: The number of zeroed bits we're looking for
  * @data: data for alignment
  * @pool: pool to get order from
+ * @start_addr: not used in this function
  */
 unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size,
                unsigned long start, unsigned int nr, void *data,
@@ -721,6 +724,7 @@ EXPORT_SYMBOL(gen_pool_fixed_alloc);
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
  * @pool: pool to find the fit region memory from
+ * @start_addr: not used in this function
  */
 unsigned long gen_pool_first_fit_order_align(unsigned long *map,
                unsigned long size, unsigned long start,
@@ -735,13 +739,14 @@ EXPORT_SYMBOL(gen_pool_first_fit_order_align);
 
 /**
  * gen_pool_best_fit - find the best fitting region of memory
- * macthing the size requirement (no alignment constraint)
+ * matching the size requirement (no alignment constraint)
  * @map: The address to base the search on
  * @size: The bitmap size in bits
  * @start: The bitnumber to start searching at
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
  * @pool: pool to find the fit region memory from
+ * @start_addr: not used in this function
  *
  * Iterate over the bitmap to find the smallest free region
  * which we can allocate the memory.
index 61228a6..c701b7a 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/fault-inject-usercopy.h>
 #include <linux/uio.h>
 #include <linux/pagemap.h>
+#include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/splice.h>
@@ -507,13 +508,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static void memzero_page(struct page *page, size_t offset, size_t len)
-{
-       char *addr = kmap_atomic(page);
-       memset(addr + offset, 0, len);
-       kunmap_atomic(addr);
-}
-
 static inline bool allocated(struct pipe_buffer *buf)
 {
        return buf->ops == &default_pipe_buf_ops;
index a926d96..1e1e377 100644 (file)
@@ -137,7 +137,7 @@ static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head,
  *
  *
  * The merging is controlled by "count", the number of elements in the
- * pending lists.  This is beautiully simple code, but rather subtle.
+ * pending lists.  This is beautifully simple code, but rather subtle.
  *
  * Each time we increment "count", we set one bit (bit k) and clear
  * bits k-1 .. 0.  Each time this happens (except the very first time
index 5b6116e..1d051ef 100644 (file)
@@ -828,7 +828,7 @@ int nla_strcmp(const struct nlattr *nla, const char *str)
        int attrlen = nla_len(nla);
        int d;
 
-       if (attrlen > 0 && buf[attrlen - 1] == '\0')
+       while (attrlen > 0 && buf[attrlen - 1] == '\0')
                attrlen--;
 
        d = attrlen - len;
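Switching from if to while trims every trailing NUL, so an attribute padded with several NUL bytes still compares equal to the bare string. The effect in isolation, as a runnable demo:

	#include <stdio.h>

	int main(void)
	{
		const char buf[] = { 'f', 'o', 'o', '\0', '\0' };
		int attrlen = sizeof(buf);	/* 5 */

		while (attrlen > 0 && buf[attrlen - 1] == '\0')
			attrlen--;

		/* 'while' yields 3; the old 'if' stopped at 4 */
		printf("attrlen = %d\n", attrlen);
		return 0;
	}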
index 7a5769d..f1a6d90 100644 (file)
@@ -98,7 +98,7 @@ static int match_one(char *s, const char *p, substring_t args[])
  * locations.
  *
  * Description: Detects which if any of a set of token strings has been passed
- * to it. Tokens can include up to MAX_OPT_ARGS instances of basic c-style
+ * to it. Tokens can include up to %MAX_OPT_ARGS instances of basic c-style
  * format identifiers which will be taken into account when matching the
  * tokens, and whose locations will be returned in the @args array.
  */
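For context, the usual shape of a match_token() call site, sketched with illustrative option names (match_table_t and MAX_OPT_ARGS come from <linux/parser.h>):

	enum { Opt_uid, Opt_err };

	static const match_table_t tokens = {
		{ Opt_uid,	"uid=%u" },
		{ Opt_err,	NULL }
	};

	/* parse one comma-separated option string 'p' */
	static int parse_one(char *p, int *uid)
	{
		substring_t args[MAX_OPT_ARGS];

		switch (match_token(p, tokens, args)) {
		case Opt_uid:
			return match_int(&args[0], uid);
		default:
			return -EINVAL;
		}
	}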
@@ -120,8 +120,10 @@ EXPORT_SYMBOL(match_token);
  * @base: base to use when converting string
  *
  * Description: Given a &substring_t and a base, attempts to parse the substring
- * as a number in that base. On success, sets @result to the integer represented
- * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * as a number in that base.
+ *
+ * Return: On success, sets @result to the integer represented by the
+ * string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 static int match_number(substring_t *s, int *result, int base)
 {
@@ -153,8 +155,10 @@ static int match_number(substring_t *s, int *result, int base)
  * @base: base to use when converting string
  *
  * Description: Given a &substring_t and a base, attempts to parse the substring
- * as a number in that base. On success, sets @result to the integer represented
- * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * as a number in that base.
+ *
+ * Return: On success, sets @result to the integer represented by the
+ * string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 static int match_u64int(substring_t *s, u64 *result, int base)
 {
@@ -178,9 +182,10 @@ static int match_u64int(substring_t *s, u64 *result, int base)
  * @s: substring_t to be scanned
  * @result: resulting integer on success
  *
- * Description: Attempts to parse the &substring_t @s as a decimal integer. On
- * success, sets @result to the integer represented by the string and returns 0.
- * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * Description: Attempts to parse the &substring_t @s as a decimal integer.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_int(substring_t *s, int *result)
 {
@@ -188,14 +193,15 @@ int match_int(substring_t *s, int *result)
 }
 EXPORT_SYMBOL(match_int);
 
-/*
+/**
  * match_uint - scan a decimal representation of an integer from a substring_t
  * @s: substring_t to be scanned
  * @result: resulting integer on success
  *
- * Description: Attempts to parse the &substring_t @s as a decimal integer. On
- * success, sets @result to the integer represented by the string and returns 0.
- * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * Description: Attempts to parse the &substring_t @s as a decimal integer.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_uint(substring_t *s, unsigned int *result)
 {
@@ -217,9 +223,10 @@ EXPORT_SYMBOL(match_uint);
  * @result: resulting unsigned long long on success
  *
  * Description: Attempts to parse the &substring_t @s as a long decimal
- * integer. On success, sets @result to the integer represented by the
- * string and returns 0.
- * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * integer.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_u64(substring_t *s, u64 *result)
 {
@@ -232,9 +239,10 @@ EXPORT_SYMBOL(match_u64);
  * @s: substring_t to be scanned
  * @result: resulting integer on success
  *
- * Description: Attempts to parse the &substring_t @s as an octal integer. On
- * success, sets @result to the integer represented by the string and returns
- * 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * Description: Attempts to parse the &substring_t @s as an octal integer.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_octal(substring_t *s, int *result)
 {
@@ -248,8 +256,9 @@ EXPORT_SYMBOL(match_octal);
  * @result: resulting integer on success
  *
  * Description: Attempts to parse the &substring_t @s as a hexadecimal integer.
- * On success, sets @result to the integer represented by the string and
- * returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_hex(substring_t *s, int *result)
 {
@@ -263,10 +272,11 @@ EXPORT_SYMBOL(match_hex);
  * @str: the string to be parsed
  *
  * Description: Parse the string @str to check if matches wildcard
- * pattern @pattern. The pattern may contain two type wildcardes:
+ * pattern @pattern. The pattern may contain two types of wildcards:
  *   '*' - matches zero or more characters
  *   '?' - matches one character
- * If it's matched, return true, else return false.
+ *
+ * Return: If the @str matches the @pattern, return true, else return false.
  */
 bool match_wildcard(const char *pattern, const char *str)
 {
@@ -316,7 +326,9 @@ EXPORT_SYMBOL(match_wildcard);
  *
  * Description: Copy the characters in &substring_t @src to the
  * c-style string @dest.  Copy no more than @size - 1 characters, plus
- * the terminating NUL.  Return length of @src.
+ * the terminating NUL.
+ *
+ * Return: length of @src.
  */
 size_t match_strlcpy(char *dest, const substring_t *src, size_t size)
 {
@@ -338,6 +350,9 @@ EXPORT_SYMBOL(match_strlcpy);
  * Description: Allocates and returns a string filled with the contents of
  * the &substring_t @s. The caller is responsible for freeing the returned
  * string with kfree().
+ *
+ * Return: the address of the newly allocated NUL-terminated string or
+ * %NULL on error.
  */
 char *match_strdup(const substring_t *s)
 {
index 00f666d..ed610b7 100644 (file)
@@ -72,7 +72,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 }
 EXPORT_SYMBOL(percpu_counter_set);
 
-/**
+/*
  * This function is both preempt and irq safe. The former is due to explicit
  * preemption disable. The latter is guaranteed by the fact that the slow path
  * is explicitly protected by an irq-safe spinlock whereas the fast patch uses
index 49f67a0..df9179f 100644 (file)
@@ -71,7 +71,7 @@ static void *stack_slabs[STACK_ALLOC_MAX_SLABS];
 static int depot_index;
 static int next_slab_inited;
 static size_t depot_offset;
-static DEFINE_SPINLOCK(depot_lock);
+static DEFINE_RAW_SPINLOCK(depot_lock);
 
 static bool init_stack_slab(void **prealloc)
 {
@@ -305,7 +305,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
                        prealloc = page_address(page);
        }
 
-       spin_lock_irqsave(&depot_lock, flags);
+       raw_spin_lock_irqsave(&depot_lock, flags);
 
        found = find_stack(*bucket, entries, nr_entries, hash);
        if (!found) {
@@ -329,7 +329,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
                WARN_ON(!init_stack_slab(&prealloc));
        }
 
-       spin_unlock_irqrestore(&depot_lock, flags);
+       raw_spin_unlock_irqrestore(&depot_lock, flags);
 exit:
        if (prealloc) {
                /* Nobody used this memory, ok to free it. */
index 3636da2..02d44e3 100644 (file)
@@ -148,6 +148,9 @@ config MEMORY_ISOLATION
 config HAVE_BOOTMEM_INFO_NODE
        def_bool n
 
+config ARCH_ENABLE_MEMORY_HOTPLUG
+       bool
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
@@ -176,12 +179,20 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
          Say N here if you want the default policy to keep all hot-plugged
          memory blocks in 'offline' state.
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+       bool
+
 config MEMORY_HOTREMOVE
        bool "Allow for memory hot remove"
        select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
        depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
        depends on MIGRATION
 
+config MHP_MEMMAP_ON_MEMORY
+       def_bool y
+       depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+       depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
@@ -273,6 +284,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
 config ARCH_ENABLE_THP_MIGRATION
        bool
 
+config HUGETLB_PAGE_SIZE_VARIABLE
+       def_bool n
+       help
+         Allows the pageblock_order value to be dynamic instead of just standard
+         HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
+         on a platform.
+
 config CONTIG_ALLOC
        def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
 
@@ -511,6 +529,13 @@ config CMA_DEBUGFS
        help
          Turns on the DebugFS interface for CMA.
 
+config CMA_SYSFS
+       bool "CMA information through sysfs interface"
+       depends on CMA && SYSFS
+       help
+         This option exposes some sysfs attributes to get information
+         from CMA.
+
 config CMA_AREAS
        int "Maximum count of the CMA areas"
        depends on CMA
@@ -758,6 +783,9 @@ config IDLE_PAGE_TRACKING
          See Documentation/admin-guide/mm/idle_page_tracking.rst for
          more details.
 
+config ARCH_HAS_CACHE_LINE_SIZE
+       bool
+
 config ARCH_HAS_PTE_DEVMAP
        bool
 
index c0135e3..bf71e29 100644 (file)
@@ -58,9 +58,13 @@ obj-y                        := filemap.o mempool.o oom_kill.o fadvise.o \
 page-alloc-y := page_alloc.o
 page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
 
+# Give 'memory_hotplug' its own module-parameter namespace
+memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+
 obj-y += page-alloc.o
 obj-y += init-mm.o
 obj-y += memblock.o
+obj-y += $(memory-hotplug-y)
 
 ifdef CONFIG_MMU
        obj-$(CONFIG_ADVISE_SYSCALLS)   += madvise.o
@@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KASAN)    += kasan/
 obj-$(CONFIG_KFENCE) += kfence/
 obj-$(CONFIG_FAILSLAB) += failslab.o
-obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_MEMTEST)          += memtest.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
@@ -109,6 +112,7 @@ obj-$(CONFIG_CMA)   += cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
index 26de020..907fefd 100644 (file)
@@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
 /**
  * balloon_page_list_dequeue() - removes pages from balloon's page list and
  *                              returns a list of the pages.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
  * @pages: pointer to the list of pages that would be returned to the caller.
  * @n_req_pages: number of requested pages.
  *
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
 /*
  * balloon_page_dequeue - removes a page from balloon's page list and returns
  *                       its address to allow the driver to release the page.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
  *
  * Driver must call this function to properly dequeue a previously enqueued page
  * before definitively releasing it back to the guest system.
index 54eee21..995e154 100644 (file)
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -24,7 +24,6 @@
 #include <linux/memblock.h>
 #include <linux/err.h>
 #include <linux/mm.h>
-#include <linux/mutex.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
 #include <linux/log2.h>
@@ -80,16 +79,17 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
 }
 
 static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
-                            unsigned int count)
+                            unsigned long count)
 {
        unsigned long bitmap_no, bitmap_count;
+       unsigned long flags;
 
        bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
        bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
-       mutex_lock(&cma->lock);
+       spin_lock_irqsave(&cma->lock, flags);
        bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
-       mutex_unlock(&cma->lock);
+       spin_unlock_irqrestore(&cma->lock, flags);
 }
 
 static void __init cma_activate_area(struct cma *cma)
@@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma)
             pfn += pageblock_nr_pages)
                init_cma_reserved_pageblock(pfn_to_page(pfn));
 
-       mutex_init(&cma->lock);
+       spin_lock_init(&cma->lock);
 
 #ifdef CONFIG_CMA_DEBUGFS
        INIT_HLIST_HEAD(&cma->mem_head);
@@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma)
        unsigned long nr_part, nr_total = 0;
        unsigned long nbits = cma_bitmap_maxno(cma);
 
-       mutex_lock(&cma->lock);
+       spin_lock_irq(&cma->lock);
        pr_info("number of available pages: ");
        for (;;) {
                next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
@@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma)
                start = next_zero_bit + nr_zero;
        }
        pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
-       mutex_unlock(&cma->lock);
+       spin_unlock_irq(&cma->lock);
 }
 #else
 static inline void cma_debug_show_areas(struct cma *cma) { }
@@ -423,25 +423,27 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
  * This function allocates part of contiguous memory on specific
  * contiguous memory area.
  */
-struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
-                      bool no_warn)
+struct page *cma_alloc(struct cma *cma, unsigned long count,
+                      unsigned int align, bool no_warn)
 {
        unsigned long mask, offset;
        unsigned long pfn = -1;
        unsigned long start = 0;
        unsigned long bitmap_maxno, bitmap_no, bitmap_count;
-       size_t i;
+       unsigned long i;
        struct page *page = NULL;
        int ret = -ENOMEM;
 
        if (!cma || !cma->count || !cma->bitmap)
-               return NULL;
+               goto out;
 
-       pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
+       pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
                 count, align);
 
        if (!count)
-               return NULL;
+               goto out;
+
+       trace_cma_alloc_start(cma->name, count, align);
 
        mask = cma_bitmap_aligned_mask(cma, align);
        offset = cma_bitmap_aligned_offset(cma, align);
@@ -449,15 +451,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
        bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
        if (bitmap_count > bitmap_maxno)
-               return NULL;
+               goto out;
 
        for (;;) {
-               mutex_lock(&cma->lock);
+               spin_lock_irq(&cma->lock);
                bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
                                bitmap_maxno, start, bitmap_count, mask,
                                offset);
                if (bitmap_no >= bitmap_maxno) {
-                       mutex_unlock(&cma->lock);
+                       spin_unlock_irq(&cma->lock);
                        break;
                }
                bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
@@ -466,7 +468,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
                 * our exclusive use. If the migration fails we will take the
                 * lock again and unmark it.
                 */
-               mutex_unlock(&cma->lock);
+               spin_unlock_irq(&cma->lock);
 
                pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
                ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
@@ -483,11 +485,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 
                pr_debug("%s(): memory range at %p is busy, retrying\n",
                         __func__, pfn_to_page(pfn));
+
+               trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
+                                          count, align);
                /* try again with a bit different memory target */
                start = bitmap_no + mask + 1;
        }
 
-       trace_cma_alloc(pfn, page, count, align);
+       trace_cma_alloc_finish(cma->name, pfn, page, count, align);
 
        /*
         * CMA can allocate multiple page blocks, which results in different
@@ -500,12 +505,22 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
        }
 
        if (ret && !no_warn) {
-               pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
-                      __func__, cma->name, count, ret);
+               pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n",
+                                  __func__, cma->name, count, ret);
                cma_debug_show_areas(cma);
        }
 
        pr_debug("%s(): returned %p\n", __func__, page);
+out:
+       if (page) {
+               count_vm_event(CMA_ALLOC_SUCCESS);
+               cma_sysfs_account_success_pages(cma, count);
+       } else {
+               count_vm_event(CMA_ALLOC_FAIL);
+               if (cma)
+                       cma_sysfs_account_fail_pages(cma, count);
+       }
+
        return page;
 }
 
@@ -519,14 +534,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
  * It returns false when provided pages do not belong to contiguous area and
  * true otherwise.
  */
-bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
+bool cma_release(struct cma *cma, const struct page *pages,
+                unsigned long count)
 {
        unsigned long pfn;
 
        if (!cma || !pages)
                return false;
 
-       pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count);
+       pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
 
        pfn = page_to_pfn(pages);
 
@@ -537,7 +553,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
 
        free_contig_range(pfn, count);
        cma_clear_bitmap(cma, pfn, count);
-       trace_cma_release(pfn, pages, count);
+       trace_cma_release(cma->name, pfn, pages, count);
 
        return true;
 }
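Widening count to unsigned long matches cma->count and removes truncation for very large areas. The caller shape is unchanged apart from the type (sketch; nr_pages is an example value):

	static void cma_example(struct cma *cma)
	{
		unsigned long nr_pages = 1UL << 20;	/* e.g. 4GiB of 4KiB pages */
		struct page *pages;

		pages = cma_alloc(cma, nr_pages, 0 /* align order */, false);
		if (!pages)
			return;
		/* use the physically contiguous range ... */
		cma_release(cma, pages, nr_pages);
	}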
index 42ae082..2c77587 100644 (file)
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -3,19 +3,33 @@
 #define __MM_CMA_H__
 
 #include <linux/debugfs.h>
+#include <linux/kobject.h>
+
+struct cma_kobject {
+       struct kobject kobj;
+       struct cma *cma;
+};
 
 struct cma {
        unsigned long   base_pfn;
        unsigned long   count;
        unsigned long   *bitmap;
        unsigned int order_per_bit; /* Order of pages represented by one bit */
-       struct mutex    lock;
+       spinlock_t      lock;
 #ifdef CONFIG_CMA_DEBUGFS
        struct hlist_head mem_head;
        spinlock_t mem_head_lock;
        struct debugfs_u32_array dfs_bitmap;
 #endif
        char name[CMA_MAX_NAME];
+#ifdef CONFIG_CMA_SYSFS
+       /* the number of successfully allocated CMA pages */
+       atomic64_t nr_pages_succeeded;
+       /* the number of CMA pages that could not be allocated */
+       atomic64_t nr_pages_failed;
+       /* kobjects must be dynamically allocated, hence the indirection */
+       struct cma_kobject *cma_kobj;
+#endif
 };
 
 extern struct cma cma_areas[MAX_CMA_AREAS];
@@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
        return cma->count >> cma->order_per_bit;
 }
 
+#ifdef CONFIG_CMA_SYSFS
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
+#else
+static inline void cma_sysfs_account_success_pages(struct cma *cma,
+                                                  unsigned long nr_pages) {}
+static inline void cma_sysfs_account_fail_pages(struct cma *cma,
+                                               unsigned long nr_pages) {}
+#endif
 #endif
index d5bf8aa..2e77049 100644 (file)
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val)
        struct cma *cma = data;
        unsigned long used;
 
-       mutex_lock(&cma->lock);
+       spin_lock_irq(&cma->lock);
        /* the page count is known to fit in an int, hence the cast below */
        used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
-       mutex_unlock(&cma->lock);
+       spin_unlock_irq(&cma->lock);
        *val = (u64)used << cma->order_per_bit;
 
        return 0;
@@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
        unsigned long start, end = 0;
        unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
 
-       mutex_lock(&cma->lock);
+       spin_lock_irq(&cma->lock);
        for (;;) {
                start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
                if (start >= bitmap_maxno)
@@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
                end = find_next_bit(cma->bitmap, bitmap_maxno, start);
                maxchunk = max(end - start, maxchunk);
        }
-       mutex_unlock(&cma->lock);
+       spin_unlock_irq(&cma->lock);
        *val = (u64)maxchunk << cma->order_per_bit;
 
        return 0;
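
A hedged userspace sketch of polling these debugfs values; the /sys/kernel/debug/cma/<name>/used path layout is an assumption based on mm/cma_debug.c, and debugfs must be mounted:

	#include <stdio.h>

	static unsigned long long read_counter(const char *path)
	{
		unsigned long long val = 0;
		FILE *f = fopen(path, "r");

		if (f) {
			if (fscanf(f, "%llu", &val) != 1)
				val = 0;
			fclose(f);
		}
		return val;
	}

	/* e.g. read_counter("/sys/kernel/debug/cma/reserved/used") */
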
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
new file mode 100644 (file)
index 0000000..eb2f39c
--- /dev/null
+++ b/mm/cma_sysfs.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CMA sysfs interface
+ *
+ * Copyright (c) 2021 Minchan Kim <minchan@kernel.org>
+ */
+
+#include <linux/cma.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "cma.h"
+
+#define CMA_ATTR_RO(_name) \
+       static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages)
+{
+       atomic64_add(nr_pages, &cma->nr_pages_succeeded);
+}
+
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
+{
+       atomic64_add(nr_pages, &cma->nr_pages_failed);
+}
+
+static inline struct cma *cma_from_kobj(struct kobject *kobj)
+{
+       return container_of(kobj, struct cma_kobject, kobj)->cma;
+}
+
+static ssize_t alloc_pages_success_show(struct kobject *kobj,
+                                       struct kobj_attribute *attr, char *buf)
+{
+       struct cma *cma = cma_from_kobj(kobj);
+
+       return sysfs_emit(buf, "%llu\n",
+                         atomic64_read(&cma->nr_pages_succeeded));
+}
+CMA_ATTR_RO(alloc_pages_success);
+
+static ssize_t alloc_pages_fail_show(struct kobject *kobj,
+                                    struct kobj_attribute *attr, char *buf)
+{
+       struct cma *cma = cma_from_kobj(kobj);
+
+       return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed));
+}
+CMA_ATTR_RO(alloc_pages_fail);
+
+static void cma_kobj_release(struct kobject *kobj)
+{
+       struct cma *cma = cma_from_kobj(kobj);
+       struct cma_kobject *cma_kobj = cma->cma_kobj;
+
+       kfree(cma_kobj);
+       cma->cma_kobj = NULL;
+}
+
+static struct attribute *cma_attrs[] = {
+       &alloc_pages_success_attr.attr,
+       &alloc_pages_fail_attr.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(cma);
+
+static struct kobj_type cma_ktype = {
+       .release = cma_kobj_release,
+       .sysfs_ops = &kobj_sysfs_ops,
+       .default_groups = cma_groups,
+};
+
+static int __init cma_sysfs_init(void)
+{
+       struct kobject *cma_kobj_root;
+       struct cma_kobject *cma_kobj;
+       struct cma *cma;
+       int i, err;
+
+       cma_kobj_root = kobject_create_and_add("cma", mm_kobj);
+       if (!cma_kobj_root)
+               return -ENOMEM;
+
+       for (i = 0; i < cma_area_count; i++) {
+               cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL);
+               if (!cma_kobj) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               cma = &cma_areas[i];
+               cma->cma_kobj = cma_kobj;
+               cma_kobj->cma = cma;
+               err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype,
+                                          cma_kobj_root, "%s", cma->name);
+               if (err) {
+                       kobject_put(&cma_kobj->kobj);
+                       goto out;
+               }
+       }
+
+       return 0;
+out:
+       while (--i >= 0) {
+               cma = &cma_areas[i];
+               kobject_put(&cma->cma_kobj->kobj);
+       }
+       kobject_put(cma_kobj_root);
+
+       return err;
+}
+subsys_initcall(cma_sysfs_init);
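
Since cma_kobj_root hangs off mm_kobj, the counters appear under /sys/kernel/mm/cma/<name>/. A minimal reader sketch; the area name "reserved" is an assumption, real names come from the cma= setup or reserved-memory nodes:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long ok = 0, fail = 0;
		FILE *f;

		f = fopen("/sys/kernel/mm/cma/reserved/alloc_pages_success", "r");
		if (f) { fscanf(f, "%llu", &ok); fclose(f); }
		f = fopen("/sys/kernel/mm/cma/reserved/alloc_pages_fail", "r");
		if (f) { fscanf(f, "%llu", &fail); fclose(f); }
		printf("cma: %llu pages allocated, %llu pages failed\n", ok, fail);
		return 0;
	}
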
index e04f447..84fde27 100644 (file)
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -787,15 +787,14 @@ static bool too_many_isolated(pg_data_t *pgdat)
  *
  * Isolate all pages that can be migrated from the range specified by
  * [low_pfn, end_pfn). The range is expected to be within the same pageblock.
- * Returns zero if there is a fatal signal pending, otherwise PFN of the
- * first page that was not scanned (which may be both less, equal to or more
- * than end_pfn).
+ * Returns an errno, e.g. -EAGAIN or -EINTR in case of a pending signal or
+ * congestion, -ENOMEM if we could not allocate a page, or 0 on success.
+ * cc->migrate_pfn will contain the next pfn to scan.
  *
  * The pages are isolated on cc->migratepages list (not required to be empty),
- * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
- * is neither read nor updated.
+ * and cc->nr_migratepages is updated accordingly.
  */
-static unsigned long
+static int
 isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        unsigned long end_pfn, isolate_mode_t isolate_mode)
 {
@@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
        bool skip_on_failure = false;
        unsigned long next_skip_pfn = 0;
        bool skip_updated = false;
+       int ret = 0;
+
+       cc->migrate_pfn = low_pfn;
 
        /*
         * Ensure that there are not too many pages isolated from the LRU
@@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
        while (unlikely(too_many_isolated(pgdat))) {
                /* stop isolation if there are still pages not migrated */
                if (cc->nr_migratepages)
-                       return 0;
+                       return -EAGAIN;
 
                /* async migration should just abort */
                if (cc->mode == MIGRATE_ASYNC)
-                       return 0;
+                       return -EAGAIN;
 
                congestion_wait(BLK_RW_ASYNC, HZ/10);
 
                if (fatal_signal_pending(current))
-                       return 0;
+                       return -EINTR;
        }
 
        cond_resched();
@@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
                        if (fatal_signal_pending(current)) {
                                cc->contended = true;
+                               ret = -EINTR;
 
-                               low_pfn = 0;
                                goto fatal_pending;
                        }
 
@@ -904,6 +906,38 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        valid_page = page;
                }
 
+               if (PageHuge(page) && cc->alloc_contig) {
+                       ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+
+                       /*
+                        * Fail isolation in case isolate_or_dissolve_huge_page()
+                        * reports an error. In case of -ENOMEM, abort right away.
+                        */
+                       if (ret < 0) {
+                               /* Do not report -EBUSY down the chain */
+                               if (ret == -EBUSY)
+                                       ret = 0;
+                               low_pfn += (1UL << compound_order(page)) - 1;
+                               goto isolate_fail;
+                       }
+
+                       if (PageHuge(page)) {
+                               /*
+                                * Hugepage was successfully isolated and placed
+                                * on the cc->migratepages list.
+                                */
+                               low_pfn += compound_nr(page) - 1;
+                               goto isolate_success_no_list;
+                       }
+
+                       /*
+                        * Ok, the hugepage was dissolved. Now these pages are
+                        * Buddy and cannot be re-allocated because they are
+                        * isolated. Fall-through as the check below handles
+                        * Buddy pages.
+                        */
+               }
+
                /*
                 * Skip if free. We read page order here without zone lock
                 * which is generally unsafe, but the race window is small and
@@ -1037,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 isolate_success:
                list_add(&page->lru, &cc->migratepages);
+isolate_success_no_list:
                cc->nr_migratepages += compound_nr(page);
                nr_isolated += compound_nr(page);
 
@@ -1063,7 +1098,7 @@ isolate_fail_put:
                put_page(page);
 
 isolate_fail:
-               if (!skip_on_failure)
+               if (!skip_on_failure && ret != -ENOMEM)
                        continue;
 
                /*
@@ -1089,6 +1124,9 @@ isolate_fail:
                         */
                        next_skip_pfn += 1UL << cc->order;
                }
+
+               if (ret == -ENOMEM)
+                       break;
        }
 
        /*
@@ -1130,7 +1168,9 @@ fatal_pending:
        if (nr_isolated)
                count_compact_events(COMPACTISOLATED, nr_isolated);
 
-       return low_pfn;
+       cc->migrate_pfn = low_pfn;
+
+       return ret;
 }
 
 /**
@@ -1139,15 +1179,15 @@ fatal_pending:
  * @start_pfn: The first PFN to start isolating.
  * @end_pfn:   The one-past-last PFN.
  *
- * Returns zero if isolation fails fatally due to e.g. pending signal.
- * Otherwise, function returns one-past-the-last PFN of isolated page
- * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ * Returns -EAGAIN when contended, -EINTR in case of a pending signal, -ENOMEM
+ * if we could not allocate a page, or 0 on success.
  */
-unsigned long
+int
 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
                                                        unsigned long end_pfn)
 {
        unsigned long pfn, block_start_pfn, block_end_pfn;
+       int ret = 0;
 
        /* Scan block by block. First and last block may be incomplete */
        pfn = start_pfn;
@@ -1166,17 +1206,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
                                        block_end_pfn, cc->zone))
                        continue;
 
-               pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
-                                                       ISOLATE_UNEVICTABLE);
+               ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
+                                                ISOLATE_UNEVICTABLE);
 
-               if (!pfn)
+               if (ret)
                        break;
 
                if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
                        break;
        }
 
-       return pfn;
+       return ret;
 }
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
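
A sketch of the resulting caller pattern; the retry policy shown is illustrative, the real consumer being the alloc_contig_range() migrate loop:

	int ret = isolate_migratepages_range(cc, pfn, end_pfn);

	if (ret == -EINTR || ret == -ENOMEM)
		return ret;		/* fatal: signal or failed dissolve */
	/* on 0 or -EAGAIN, resume from the pfn the scanner recorded */
	pfn = cc->migrate_pfn;
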
@@ -1847,7 +1887,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
         */
        for (; block_end_pfn <= cc->free_pfn;
                        fast_find_block = false,
-                       low_pfn = block_end_pfn,
+                       cc->migrate_pfn = low_pfn = block_end_pfn,
                        block_start_pfn = block_end_pfn,
                        block_end_pfn += pageblock_nr_pages) {
 
@@ -1889,10 +1929,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
                }
 
                /* Perform the isolation */
-               low_pfn = isolate_migratepages_block(cc, low_pfn,
-                                               block_end_pfn, isolate_mode);
-
-               if (!low_pfn)
+               if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
+                                               isolate_mode))
                        return ISOLATE_ABORT;
 
                /*
@@ -1903,9 +1941,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
                break;
        }
 
-       /* Record where migration scanner will be restarted. */
-       cc->migrate_pfn = low_pfn;
-
        return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
 
@@ -1977,8 +2012,8 @@ static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
        unsigned int wmark_low;
 
        /*
-        * Cap the low watermak to avoid excessive compaction
-        * activity in case a user sets the proactivess tunable
+        * Cap the low watermark to avoid excessive compaction
+        * activity in case a user sets the proactiveness tunable
         * close to 100 (maximum).
         */
        wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
@@ -2319,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
        trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync);
 
-       migrate_prep_local();
+       /* lru_add_drain_all() could be expensive since it involves other CPUs */
+       lru_add_drain();
 
        while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
                int err;
@@ -2494,6 +2530,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
         */
        WRITE_ONCE(current->capture_control, NULL);
        *capture = READ_ONCE(capc.page);
+       /*
+        * Technically, it is also possible that compaction is skipped but
+        * the page is still captured out of luck (an IRQ came in and freed
+        * the page). Returning COMPACT_SUCCESS in such cases helps in
+        * properly accounting COMPACT[STALL|FAIL] when compaction is skipped.
+        */
+       if (*capture)
+               ret = COMPACT_SUCCESS;
 
        return ret;
 }
@@ -2657,9 +2701,6 @@ static void compact_nodes(void)
                compact_node(nid);
 }
 
-/* The written value is actually unused, all memory is compacted */
-int sysctl_compact_memory;
-
 /*
  * Tunable for proactive compaction. It determines how
  * aggressively the kernel should compact memory in the
@@ -2844,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
  */
 static int kcompactd(void *p)
 {
-       pg_data_t *pgdat = (pg_data_t*)p;
+       pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
        unsigned int proactive_defer = 0;
 
index 5be57ba..66f7e9f 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping,
 
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
-
-       if (shadow) {
-               mapping->nrexceptional += nr;
-               /*
-                * Make sure the nrexceptional update is committed before
-                * the nrpages update so that final truncate racing
-                * with reclaim does not see both counters 0 at the
-                * same time and miss a shadow entry.
-                */
-               smp_wmb();
-       }
        mapping->nrpages -= nr;
 }
 
@@ -629,9 +618,6 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
 /* Returns true if writeback might be needed or already in progress. */
 static bool mapping_needs_writeback(struct address_space *mapping)
 {
-       if (dax_mapping(mapping))
-               return mapping->nrexceptional;
-
        return mapping->nrpages;
 }
 
@@ -925,8 +911,6 @@ noinline int __add_to_page_cache_locked(struct page *page,
                if (xas_error(&xas))
                        goto unlock;
 
-               if (old)
-                       mapping->nrexceptional--;
                mapping->nrpages++;
 
                /* hugetlb pages do not participate in page cache accounting */
@@ -2771,7 +2755,7 @@ unsigned int seek_page_size(struct xa_state *xas, struct page *page)
  * entirely memory-based such as tmpfs, and filesystems which support
  * unwritten extents.
  *
- * Return: The requested offset on successs, or -ENXIO if @whence specifies
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
  * SEEK_DATA and there is no data after @start.  There is an implicit hole
  * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
  * and @end contain data.
@@ -3283,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
 
 /* This is used for a general mmap of a disk file */
 
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct address_space *mapping = file->f_mapping;
 
@@ -3308,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
 {
        return VM_FAULT_SIGBUS;
 }
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        return -ENOSYS;
 }
-int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
 {
        return -ENOSYS;
 }
@@ -3740,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write);
 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
        struct file *file = iocb->ki_filp;
-       struct address_space * mapping = file->f_mapping;
+       struct address_space *mapping = file->f_mapping;
        struct inode    *inode = mapping->host;
        ssize_t         written = 0;
        ssize_t         err;
index 2183a56..130e301 100644 (file)
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -60,16 +60,20 @@ static u64 frontswap_succ_stores;
 static u64 frontswap_failed_stores;
 static u64 frontswap_invalidates;
 
-static inline void inc_frontswap_loads(void) {
+static inline void inc_frontswap_loads(void)
+{
        data_race(frontswap_loads++);
 }
-static inline void inc_frontswap_succ_stores(void) {
+static inline void inc_frontswap_succ_stores(void)
+{
        data_race(frontswap_succ_stores++);
 }
-static inline void inc_frontswap_failed_stores(void) {
+static inline void inc_frontswap_failed_stores(void)
+{
        data_race(frontswap_failed_stores++);
 }
-static inline void inc_frontswap_invalidates(void) {
+static inline void inc_frontswap_invalidates(void)
+{
        data_race(frontswap_invalidates++);
 }
 #else
index 71e546e..0697134 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
                int orig_refs = refs;
 
                /*
-                * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
-                * path, so fail and let the caller fall back to the slow path.
+                * Can't do FOLL_LONGTERM + FOLL_PIN via the gup fast path if
+                * the page is not in the right zone, so fail and let the
+                * caller fall back to the slow path.
                 */
-               if (unlikely(flags & FOLL_LONGTERM) &&
-                               is_migrate_cma_page(page))
+               if (unlikely((flags & FOLL_LONGTERM) &&
+                            !is_pinnable_page(page)))
                        return NULL;
 
                /*
@@ -1527,7 +1528,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
 {
        struct vm_area_struct *vma;
        unsigned long vm_flags;
-       int i;
+       long i;
 
        /* calculate required read or write permissions.
         * If FOLL_FORCE is set, we only require the "MAY" flags.
@@ -1574,7 +1575,7 @@ finish_or_fault:
  * Returns NULL on any kind of failure - a hole must then be inserted into
  * the corefile, to preserve alignment with its headers; and also returns
  * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
+ * allowing a hole to be left in the corefile to save disk space.
  *
  * Called without mmap_lock (takes and releases the mmap_lock by itself).
  */
@@ -1600,112 +1601,92 @@ struct page *get_dump_page(unsigned long addr)
 }
 #endif /* CONFIG_ELF_CORE */
 
-#ifdef CONFIG_CMA
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
-                                       unsigned long start,
-                                       unsigned long nr_pages,
-                                       struct page **pages,
-                                       struct vm_area_struct **vmas,
-                                       unsigned int gup_flags)
+#ifdef CONFIG_MIGRATION
+/*
+ * Check whether all pages are pinnable; if so, return the number of pages.
+ * If some pages are not pinnable, migrate them and unpin all pages. Return
+ * zero if pages were migrated or if some pages could not be isolated.
+ * Return a negative error if migration fails.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+                                           struct page **pages,
+                                           unsigned int gup_flags)
 {
        unsigned long i;
-       unsigned long step;
+       unsigned long isolation_error_count = 0;
        bool drain_allow = true;
-       bool migrate_allow = true;
-       LIST_HEAD(cma_page_list);
-       long ret = nr_pages;
+       LIST_HEAD(movable_page_list);
+       long ret = 0;
+       struct page *prev_head = NULL;
+       struct page *head;
        struct migration_target_control mtc = {
                .nid = NUMA_NO_NODE,
-               .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
+               .gfp_mask = GFP_USER | __GFP_NOWARN,
        };
 
-check_again:
-       for (i = 0; i < nr_pages;) {
-
-               struct page *head = compound_head(pages[i]);
-
-               /*
-                * gup may start from a tail page. Advance step by the left
-                * part.
-                */
-               step = compound_nr(head) - (pages[i] - head);
+       for (i = 0; i < nr_pages; i++) {
+               head = compound_head(pages[i]);
+               if (head == prev_head)
+                       continue;
+               prev_head = head;
                /*
-                * If we get a page from the CMA zone, since we are going to
-                * be pinning these entries, we might as well move them out
-                * of the CMA zone if possible.
+                * If we get a page that is not pinnable, try to move it out
+                * of the way, since we are going to be pinning these entries.
                 */
-               if (is_migrate_cma_page(head)) {
-                       if (PageHuge(head))
-                               isolate_huge_page(head, &cma_page_list);
-                       else {
+               if (!is_pinnable_page(head)) {
+                       if (PageHuge(head)) {
+                               if (!isolate_huge_page(head, &movable_page_list))
+                                       isolation_error_count++;
+                       } else {
                                if (!PageLRU(head) && drain_allow) {
                                        lru_add_drain_all();
                                        drain_allow = false;
                                }
 
-                               if (!isolate_lru_page(head)) {
-                                       list_add_tail(&head->lru, &cma_page_list);
-                                       mod_node_page_state(page_pgdat(head),
-                                                           NR_ISOLATED_ANON +
-                                                           page_is_file_lru(head),
-                                                           thp_nr_pages(head));
+                               if (isolate_lru_page(head)) {
+                                       isolation_error_count++;
+                                       continue;
                                }
+                               list_add_tail(&head->lru, &movable_page_list);
+                               mod_node_page_state(page_pgdat(head),
+                                                   NR_ISOLATED_ANON +
+                                                   page_is_file_lru(head),
+                                                   thp_nr_pages(head));
                        }
                }
-
-               i += step;
        }
 
-       if (!list_empty(&cma_page_list)) {
-               /*
-                * drop the above get_user_pages reference.
-                */
-               if (gup_flags & FOLL_PIN)
-                       unpin_user_pages(pages, nr_pages);
-               else
-                       for (i = 0; i < nr_pages; i++)
-                               put_page(pages[i]);
-
-               if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
-                       (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
-                       /*
-                        * some of the pages failed migration. Do get_user_pages
-                        * without migration.
-                        */
-                       migrate_allow = false;
+       /*
+        * If the list is empty and there were no isolation errors, all
+        * pages are already in the correct zone.
+        */
+       if (list_empty(&movable_page_list) && !isolation_error_count)
+               return nr_pages;
 
-                       if (!list_empty(&cma_page_list))
-                               putback_movable_pages(&cma_page_list);
-               }
-               /*
-                * We did migrate all the pages, Try to get the page references
-                * again migrating any new CMA pages which we failed to isolate
-                * earlier.
-                */
-               ret = __get_user_pages_locked(mm, start, nr_pages,
-                                                  pages, vmas, NULL,
-                                                  gup_flags);
-
-               if ((ret > 0) && migrate_allow) {
-                       nr_pages = ret;
-                       drain_allow = true;
-                       goto check_again;
-               }
+       if (gup_flags & FOLL_PIN) {
+               unpin_user_pages(pages, nr_pages);
+       } else {
+               for (i = 0; i < nr_pages; i++)
+                       put_page(pages[i]);
+       }
+       if (!list_empty(&movable_page_list)) {
+               ret = migrate_pages(&movable_page_list, alloc_migration_target,
+                                   NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+                                   MR_LONGTERM_PIN);
+               if (ret && !list_empty(&movable_page_list))
+                       putback_movable_pages(&movable_page_list);
        }
 
-       return ret;
+       return ret > 0 ? -ENOMEM : ret;
 }
 #else
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
-                                       unsigned long start,
-                                       unsigned long nr_pages,
-                                       struct page **pages,
-                                       struct vm_area_struct **vmas,
-                                       unsigned int gup_flags)
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+                                           struct page **pages,
+                                           unsigned int gup_flags)
 {
        return nr_pages;
 }
-#endif /* CONFIG_CMA */
+#endif /* CONFIG_MIGRATION */
 
 /*
  * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1718,21 +1699,22 @@ static long __gup_longterm_locked(struct mm_struct *mm,
                                  struct vm_area_struct **vmas,
                                  unsigned int gup_flags)
 {
-       unsigned long flags = 0;
+       unsigned int flags;
        long rc;
 
-       if (gup_flags & FOLL_LONGTERM)
-               flags = memalloc_nocma_save();
-
-       rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
-                                    gup_flags);
+       if (!(gup_flags & FOLL_LONGTERM))
+               return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+                                              NULL, gup_flags);
+       flags = memalloc_pin_save();
+       do {
+               rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+                                            NULL, gup_flags);
+               if (rc <= 0)
+                       break;
+               rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
+       } while (!rc);
+       memalloc_pin_restore(flags);
 
-       if (gup_flags & FOLL_LONGTERM) {
-               if (rc > 0)
-                       rc = check_and_migrate_cma_pages(mm, start, rc, pages,
-                                                        vmas, gup_flags);
-               memalloc_nocma_restore(flags);
-       }
        return rc;
 }
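
From a driver's point of view the contract is unchanged; FOLL_LONGTERM now migrates pages out of any unpinnable placement (CMA and, with this series, ZONE_MOVABLE) before the pin returns. A minimal sketch, with NR_PAGES and the surrounding cleanup path assumed:

	struct page *pages[NR_PAGES];
	long pinned;

	pinned = pin_user_pages(start, NR_PAGES,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	if (pinned < 0)
		return pinned;		/* e.g. -ENOMEM if migration failed */
	/* ... DMA for as long as needed ... */
	unpin_user_pages(pages, pinned);
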
 
index e3cf78e..d974dec 100644 (file)
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
 
                                dump_page(page, "gup_test failure");
                                break;
+                       } else if (cmd == PIN_LONGTERM_BENCHMARK &&
+                               WARN(!is_pinnable_page(page),
+                                    "pages[%lu] is NOT pinnable but pinned\n",
+                                    i)) {
+                               dump_page(page, "gup_test failure");
+                               break;
                        }
                }
                break;
@@ -94,7 +100,7 @@ static int __gup_test_ioctl(unsigned int cmd,
 {
        ktime_t start_time, end_time;
        unsigned long i, nr_pages, addr, next;
-       int nr;
+       long nr;
        struct page **pages;
        int ret = 0;
        bool needs_mmap_lock =
@@ -126,37 +132,34 @@ static int __gup_test_ioctl(unsigned int cmd,
                        nr = (next - addr) / PAGE_SIZE;
                }
 
-               /* Filter out most gup flags: only allow a tiny subset here: */
-               gup->flags &= FOLL_WRITE;
-
                switch (cmd) {
                case GUP_FAST_BENCHMARK:
-                       nr = get_user_pages_fast(addr, nr, gup->flags,
+                       nr = get_user_pages_fast(addr, nr, gup->gup_flags,
                                                 pages + i);
                        break;
                case GUP_BASIC_TEST:
-                       nr = get_user_pages(addr, nr, gup->flags, pages + i,
+                       nr = get_user_pages(addr, nr, gup->gup_flags, pages + i,
                                            NULL);
                        break;
                case PIN_FAST_BENCHMARK:
-                       nr = pin_user_pages_fast(addr, nr, gup->flags,
+                       nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
                                                 pages + i);
                        break;
                case PIN_BASIC_TEST:
-                       nr = pin_user_pages(addr, nr, gup->flags, pages + i,
+                       nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i,
                                            NULL);
                        break;
                case PIN_LONGTERM_BENCHMARK:
                        nr = pin_user_pages(addr, nr,
-                                           gup->flags | FOLL_LONGTERM,
+                                           gup->gup_flags | FOLL_LONGTERM,
                                            pages + i, NULL);
                        break;
                case DUMP_USER_PAGES_TEST:
-                       if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
-                               nr = pin_user_pages(addr, nr, gup->flags,
+                       if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
+                               nr = pin_user_pages(addr, nr, gup->gup_flags,
                                                    pages + i, NULL);
                        else
-                               nr = get_user_pages(addr, nr, gup->flags,
+                               nr = get_user_pages(addr, nr, gup->gup_flags,
                                                    pages + i, NULL);
                        break;
                default:
@@ -187,7 +190,7 @@ static int __gup_test_ioctl(unsigned int cmd,
 
        start_time = ktime_get();
 
-       put_back_pages(cmd, pages, nr_pages, gup->flags);
+       put_back_pages(cmd, pages, nr_pages, gup->test_flags);
 
        end_time = ktime_get();
        gup->put_delta_usec = ktime_us_delta(end_time, start_time);
index 90a6713..887ac1d 100644 (file)
--- a/mm/gup_test.h
+++ b/mm/gup_test.h
@@ -21,7 +21,8 @@ struct gup_test {
        __u64 addr;
        __u64 size;
        __u32 nr_pages_per_call;
-       __u32 flags;
+       __u32 gup_flags;
+       __u32 test_flags;
        /*
         * Each non-zero entry is the number of the page (1-based: first page is
         * page 1, so that zero entries mean "do nothing") from the .addr base.
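
With the flags split, a test fills both fields explicitly. A hedged fragment modelled on tools/testing/selftests/vm/gup_test.c; fd handling and the userspace FOLL_WRITE definition are assumed from that selftest:

	struct gup_test gup = {
		.addr = (__u64)(unsigned long)buf,
		.size = size,
		.nr_pages_per_call = 64,
		.gup_flags = FOLL_WRITE,	/* forwarded to gup/pup */
		.test_flags = GUP_TEST_FLAG_DUMP_PAGES_USE_PIN,
	};
	int fd = open("/sys/kernel/debug/gup_test", O_RDWR);

	if (fd >= 0)
		ioctl(fd, DUMP_USER_PAGES_TEST, &gup);
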
index 6ef8f5e..4fb51d7 100644 (file)
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
 atomic_long_t _totalhigh_pages __read_mostly;
 EXPORT_SYMBOL(_totalhigh_pages);
 
-unsigned int __nr_free_highpages (void)
+unsigned int __nr_free_highpages(void)
 {
        struct zone *zone;
        unsigned int pages = 0;
@@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void)
 static int pkmap_count[LAST_PKMAP];
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
 
-pte_t * pkmap_page_table;
+pte_t *pkmap_page_table;
 
 /*
  * Most architectures have no use for kmap_high_get(), so let's abstract
@@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr)
 
        if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
                int i = PKMAP_NR(addr);
+
                return pte_page(pkmap_page_table[i]);
        }
 
@@ -278,9 +279,8 @@ void *kmap_high(struct page *page)
        pkmap_count[PKMAP_NR(vaddr)]++;
        BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
        unlock_kmap();
-       return (void*) vaddr;
+       return (void *) vaddr;
 }
-
 EXPORT_SYMBOL(kmap_high);
 
 #ifdef ARCH_NEEDS_KMAP_HIGH_GET
@@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page)
                pkmap_count[PKMAP_NR(vaddr)]++;
        }
        unlock_kmap_any(flags);
-       return (void*) vaddr;
+       return (void *) vaddr;
 }
 #endif
 
@@ -519,7 +519,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
 
        /*
         * Disable migration so resulting virtual address is stable
-        * accross preemption.
+        * across preemption.
         */
        migrate_disable();
        preempt_disable();
@@ -737,7 +737,6 @@ done:
        spin_unlock_irqrestore(&pas->lock, flags);
        return ret;
 }
-
 EXPORT_SYMBOL(page_address);
 
 /**
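
The migrate_disable() above is what keeps a kmap-local address stable across preemption; usage remains the strictly nested pattern. A minimal sketch:

	void *va = kmap_local_page(page);

	memset(va, 0, PAGE_SIZE);	/* address stays valid across preemption */
	kunmap_local(va);
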
index ae907a9..63ed6b2 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -7,6 +7,7 @@
 
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/highmem.h>
@@ -77,18 +78,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
        return false;
 }
 
-static struct page *get_huge_zero_page(void)
+static bool get_huge_zero_page(void)
 {
        struct page *zero_page;
 retry:
        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-               return READ_ONCE(huge_zero_page);
+               return true;
 
        zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                        HPAGE_PMD_ORDER);
        if (!zero_page) {
                count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
-               return NULL;
+               return false;
        }
        count_vm_event(THP_ZERO_PAGE_ALLOC);
        preempt_disable();
@@ -101,7 +102,7 @@ retry:
        /* We take additional reference here. It will be put back by shrinker */
        atomic_set(&huge_zero_refcount, 2);
        preempt_enable();
-       return READ_ONCE(huge_zero_page);
+       return true;
 }
 
 static void put_huge_zero_page(void)
@@ -624,14 +625,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 
                /* Deliver the page fault to userland */
                if (userfaultfd_missing(vma)) {
-                       vm_fault_t ret2;
-
                        spin_unlock(vmf->ptl);
                        put_page(page);
                        pte_free(vma->vm_mm, pgtable);
-                       ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
-                       VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
-                       return ret2;
+                       ret = handle_userfault(vmf, VM_UFFD_MISSING);
+                       VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+                       return ret;
                }
 
                entry = mk_huge_pmd(page, vma->vm_page_prot);
@@ -1293,7 +1292,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
        }
 
        page = pmd_page(orig_pmd);
-       VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+       VM_BUG_ON_PAGE(!PageHead(page), page);
 
        /* Lock page for reuse_swap_page() */
        if (!trylock_page(page)) {
@@ -1464,12 +1463,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
         */
        page_locked = trylock_page(page);
        target_nid = mpol_misplaced(page, vma, haddr);
-       if (target_nid == NUMA_NO_NODE) {
-               /* If the page was locked, there are no parallel migrations */
-               if (page_locked)
-                       goto clear_pmdnuma;
-       }
-
        /* Migration could have started since the pmd_trans_migrating check */
        if (!page_locked) {
                page_nid = NUMA_NO_NODE;
@@ -1478,6 +1471,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
                spin_unlock(vmf->ptl);
                put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
                goto out;
+       } else if (target_nid == NUMA_NO_NODE) {
+               /* There are no parallel migrations and the page is in the
+                * right node. Clear the NUMA hinting info in this pmd.
+                */
+               goto clear_pmdnuma;
        }
 
        /*
@@ -1696,7 +1694,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                        entry = pmd_to_swp_entry(orig_pmd);
-                       page = pfn_to_page(swp_offset(entry));
+                       page = migration_entry_to_page(entry);
                        flush_needed = 0;
                } else
                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1794,8 +1792,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 /*
  * Returns
  *  - 0 if PMD could not be locked
- *  - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
@@ -2104,7 +2102,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                swp_entry_t entry;
 
                entry = pmd_to_swp_entry(old_pmd);
-               page = pfn_to_page(swp_offset(entry));
+               page = migration_entry_to_page(entry);
                write = is_write_migration_entry(entry);
                young = false;
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
@@ -2303,44 +2301,38 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
        __split_huge_pmd(vma, pmd, address, freeze, page);
 }
 
+static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
+{
+       /*
+        * If the new address isn't hpage aligned and it could previously
+        * contain a hugepage: check if we need to split a huge pmd.
+        */
+       if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
+           range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
+                        ALIGN(address, HPAGE_PMD_SIZE)))
+               split_huge_pmd_address(vma, address, false, NULL);
+}
+
 void vma_adjust_trans_huge(struct vm_area_struct *vma,
                             unsigned long start,
                             unsigned long end,
                             long adjust_next)
 {
-       /*
-        * If the new start address isn't hpage aligned and it could
-        * previously contain an hugepage: check if we need to split
-        * an huge pmd.
-        */
-       if (start & ~HPAGE_PMD_MASK &&
-           (start & HPAGE_PMD_MASK) >= vma->vm_start &&
-           (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_pmd_address(vma, start, false, NULL);
+       /* Check if we need to split start first. */
+       split_huge_pmd_if_needed(vma, start);
 
-       /*
-        * If the new end address isn't hpage aligned and it could
-        * previously contain an hugepage: check if we need to split
-        * an huge pmd.
-        */
-       if (end & ~HPAGE_PMD_MASK &&
-           (end & HPAGE_PMD_MASK) >= vma->vm_start &&
-           (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_pmd_address(vma, end, false, NULL);
+       /* Check if we need to split end next. */
+       split_huge_pmd_if_needed(vma, end);
 
        /*
-        * If we're also updating the vma->vm_next->vm_start, if the new
-        * vm_next->vm_start isn't hpage aligned and it could previously
-        * contain an hugepage: check if we need to split an huge pmd.
+        * If we're also updating the vma->vm_next->vm_start,
+        * check if we need to split it.
         */
        if (adjust_next > 0) {
                struct vm_area_struct *next = vma->vm_next;
                unsigned long nstart = next->vm_start;
                nstart += adjust_next;
-               if (nstart & ~HPAGE_PMD_MASK &&
-                   (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
-                   (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-                       split_huge_pmd_address(next, nstart, false, NULL);
+               split_huge_pmd_if_needed(next, nstart);
        }
 }
 
@@ -2477,7 +2469,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
                xa_lock(&swap_cache->i_pages);
        }
 
-       /* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+       /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
        lruvec = lock_page_lruvec(head);
 
        for (i = nr - 1; i >= 1; i--) {
@@ -2838,8 +2830,8 @@ void deferred_split_huge_page(struct page *page)
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
                if (memcg)
-                       memcg_set_shrinker_bit(memcg, page_to_nid(page),
-                                              deferred_split_shrinker.id);
+                       set_shrinker_bit(memcg, page_to_nid(page),
+                                        deferred_split_shrinker.id);
 #endif
        }
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -2924,16 +2916,14 @@ static struct shrinker deferred_split_shrinker = {
 };
 
 #ifdef CONFIG_DEBUG_FS
-static int split_huge_pages_set(void *data, u64 val)
+static void split_huge_pages_all(void)
 {
        struct zone *zone;
        struct page *page;
        unsigned long pfn, max_zone_pfn;
        unsigned long total = 0, split = 0;
 
-       if (val != 1)
-               return -EINVAL;
-
+       pr_debug("Split all THPs\n");
        for_each_populated_zone(zone) {
                max_zone_pfn = zone_end_pfn(zone);
                for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
@@ -2957,15 +2947,243 @@ static int split_huge_pages_set(void *data, u64 val)
                        unlock_page(page);
 next:
                        put_page(page);
+                       cond_resched();
                }
        }
 
-       pr_info("%lu of %lu THP split\n", split, total);
+       pr_debug("%lu of %lu THP split\n", split, total);
+}
 
-       return 0;
+static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
+{
+       return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
+                   is_vm_hugetlb_page(vma);
+}
+
+static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
+                               unsigned long vaddr_end)
+{
+       int ret = 0;
+       struct task_struct *task;
+       struct mm_struct *mm;
+       unsigned long total = 0, split = 0;
+       unsigned long addr;
+
+       vaddr_start &= PAGE_MASK;
+       vaddr_end &= PAGE_MASK;
+
+       /* Find the task_struct from pid */
+       rcu_read_lock();
+       task = find_task_by_vpid(pid);
+       if (!task) {
+               rcu_read_unlock();
+               ret = -ESRCH;
+               goto out;
+       }
+       get_task_struct(task);
+       rcu_read_unlock();
+
+       /* Find the mm_struct */
+       mm = get_task_mm(task);
+       put_task_struct(task);
+
+       if (!mm) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
+                pid, vaddr_start, vaddr_end);
+
+       mmap_read_lock(mm);
+       /*
+        * Always increase addr by PAGE_SIZE, since we could have a PTE page
+        * table filled with PTE-mapped THPs, each of which is distinct.
+        */
+       for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+               struct vm_area_struct *vma = find_vma(mm, addr);
+               unsigned int follflags;
+               struct page *page;
+
+               if (!vma || addr < vma->vm_start)
+                       break;
+
+               /* skip special VMA and hugetlb VMA */
+               if (vma_not_suitable_for_thp_split(vma)) {
+                       addr = vma->vm_end;
+                       continue;
+               }
+
+               /* FOLL_DUMP to ignore special (like zero) pages */
+               follflags = FOLL_GET | FOLL_DUMP;
+               page = follow_page(vma, addr, follflags);
+
+               if (IS_ERR(page))
+                       continue;
+               if (!page)
+                       continue;
+
+               if (!is_transparent_hugepage(page))
+                       goto next;
+
+               total++;
+               if (!can_split_huge_page(compound_head(page), NULL))
+                       goto next;
+
+               if (!trylock_page(page))
+                       goto next;
+
+               if (!split_huge_page(page))
+                       split++;
+
+               unlock_page(page);
+next:
+               put_page(page);
+               cond_resched();
+       }
+       mmap_read_unlock(mm);
+       mmput(mm);
+
+       pr_debug("%lu of %lu THP split\n", split, total);
+
+out:
+       return ret;
+}
+
+static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
+                               pgoff_t off_end)
+{
+       struct filename *file;
+       struct file *candidate;
+       struct address_space *mapping;
+       int ret = -EINVAL;
+       pgoff_t index;
+       int nr_pages = 1;
+       unsigned long total = 0, split = 0;
+
+       file = getname_kernel(file_path);
+       if (IS_ERR(file))
+               return ret;
+
+       candidate = file_open_name(file, O_RDONLY, 0);
+       if (IS_ERR(candidate))
+               goto out;
+
+       pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
+                file_path, off_start, off_end);
+
+       mapping = candidate->f_mapping;
+
+       for (index = off_start; index < off_end; index += nr_pages) {
+               struct page *fpage = pagecache_get_page(mapping, index,
+                                               FGP_ENTRY | FGP_HEAD, 0);
+
+               nr_pages = 1;
+               if (xa_is_value(fpage) || !fpage)
+                       continue;
+
+               if (!is_transparent_hugepage(fpage))
+                       goto next;
+
+               total++;
+               nr_pages = thp_nr_pages(fpage);
+
+               if (!trylock_page(fpage))
+                       goto next;
+
+               if (!split_huge_page(fpage))
+                       split++;
+
+               unlock_page(fpage);
+next:
+               put_page(fpage);
+               cond_resched();
+       }
+
+       filp_close(candidate, NULL);
+       ret = 0;
+
+       pr_debug("%lu of %lu file-backed THP split\n", split, total);
+out:
+       putname(file);
+       return ret;
 }
-DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
-               "%llu\n");
+
+#define MAX_INPUT_BUF_SZ 255
+
+static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       static DEFINE_MUTEX(split_debug_mutex);
+       ssize_t ret;
+       /* holds pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
+       char input_buf[MAX_INPUT_BUF_SZ];
+       int pid;
+       unsigned long vaddr_start, vaddr_end;
+
+       ret = mutex_lock_interruptible(&split_debug_mutex);
+       if (ret)
+               return ret;
+
+       ret = -EFAULT;
+
+       memset(input_buf, 0, MAX_INPUT_BUF_SZ);
+       if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
+               goto out;
+
+       input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
+
+       if (input_buf[0] == '/') {
+               char *tok;
+               char *tok_buf = input_buf;      /* don't shadow the __user buf */
+               char file_path[MAX_INPUT_BUF_SZ];
+               pgoff_t off_start = 0, off_end = 0;
+               size_t input_len = strlen(input_buf);
+
+               tok = strsep(&tok_buf, ",");
+               if (tok) {
+                       strncpy(file_path, tok, MAX_INPUT_BUF_SZ);
+               } else {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               ret = sscanf(tok_buf, "0x%lx,0x%lx", &off_start, &off_end);
+               if (ret != 2) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ret = split_huge_pages_in_file(file_path, off_start, off_end);
+               if (!ret)
+                       ret = input_len;
+
+               goto out;
+       }
+
+       ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+       if (ret == 1 && pid == 1) {
+               split_huge_pages_all();
+               ret = strlen(input_buf);
+               goto out;
+       } else if (ret != 3) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
+       if (!ret)
+               ret = strlen(input_buf);
+out:
+       mutex_unlock(&split_debug_mutex);
+       return ret;
+}
+
+static const struct file_operations split_huge_pages_fops = {
+       .owner   = THIS_MODULE,
+       .write   = split_huge_pages_write,
+       .llseek  = no_llseek,
+};
 
 static int __init split_huge_pages_debugfs(void)
 {
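
The rewritten debugfs entry accepts three write formats parsed above: "1" (split all THPs), "<pid>,0x<vaddr_start>,0x<vaddr_end>", or "<file_path>,0x<off_start>,0x<off_end>". A hedged userspace sketch of the second form:

	#include <stdio.h>

	static int split_pid_range(int pid, unsigned long start, unsigned long end)
	{
		FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

		if (!f)
			return -1;
		fprintf(f, "%d,0x%lx,0x%lx", pid, start, end);
		return fclose(f);
	}
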
index 6c72433..3db405d 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -39,7 +39,6 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
 
@@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool)
        return true;
 }
 
-static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
+                                               unsigned long irq_flags)
 {
-       spin_unlock(&spool->lock);
+       spin_unlock_irqrestore(&spool->lock, irq_flags);
 
        /* If no pages are used, and no other handles to the subpool
         * remain, give up any reservations based on minimum size and
@@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
 
 void hugepage_put_subpool(struct hugepage_subpool *spool)
 {
-       spin_lock(&spool->lock);
+       unsigned long flags;
+
+       spin_lock_irqsave(&spool->lock, flags);
        BUG_ON(!spool->count);
        spool->count--;
-       unlock_or_release_subpool(spool);
+       unlock_or_release_subpool(spool, flags);
 }
 
 /*
@@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
        if (!spool)
                return ret;
 
-       spin_lock(&spool->lock);
+       spin_lock_irq(&spool->lock);
 
        if (spool->max_hpages != -1) {          /* maximum size accounting */
                if ((spool->used_hpages + delta) <= spool->max_hpages)
@@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
        }
 
 unlock_ret:
-       spin_unlock(&spool->lock);
+       spin_unlock_irq(&spool->lock);
        return ret;
 }
 
@@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                       long delta)
 {
        long ret = delta;
+       unsigned long flags;
 
        if (!spool)
                return delta;
 
-       spin_lock(&spool->lock);
+       spin_lock_irqsave(&spool->lock, flags);
 
        if (spool->max_hpages != -1)            /* maximum size accounting */
                spool->used_hpages -= delta;
@@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
         * If hugetlbfs_put_super couldn't free spool due to an outstanding
         * quota reference, free it now.
         */
-       unlock_or_release_subpool(spool);
+       unlock_or_release_subpool(spool, flags);
 
        return ret;
 }
@@ -463,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv,
                              resv->region_cache_count;
 
                /* At this point, we should have enough entries in the cache
-                * for all the existings adds_in_progress. We should only be
+                * for all the existing adds_in_progress. We should only be
                 * needing to allocate for regions_needed.
                 */
                VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
@@ -553,7 +556,6 @@ retry:
        resv->adds_in_progress -= in_regions_needed;
 
        spin_unlock(&resv->lock);
-       VM_BUG_ON(add < 0);
        return add;
 }
 
@@ -743,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
 {
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust;
+       bool reserved = false;
 
        rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-       if (rsv_adjust) {
+       if (rsv_adjust > 0) {
                struct hstate *h = hstate_inode(inode);
 
-               hugetlb_acct_memory(h, 1);
+               if (!hugetlb_acct_memory(h, 1))
+                       reserved = true;
+       } else if (!rsv_adjust) {
+               reserved = true;
        }
+
+       if (!reserved)
+               pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
 }
 
 /*
@@ -1059,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
        int nid = page_to_nid(page);
+
+       lockdep_assert_held(&hugetlb_lock);
        list_move(&page->lru, &h->hugepage_freelists[nid]);
        h->free_huge_pages++;
        h->free_huge_pages_node[nid]++;
@@ -1068,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 {
        struct page *page;
-       bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+       bool pin = !!(current->flags & PF_MEMALLOC_PIN);
 
+       lockdep_assert_held(&hugetlb_lock);
        list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-               if (nocma && is_migrate_cma_page(page))
+               if (pin && !is_pinnable_page(page))
                        continue;
 
                if (PageHWPoison(page))
@@ -1205,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
 }
 
 /*
- * helper for free_pool_huge_page() - return the previously saved
+ * helper for remove_pool_huge_page() - return the previously saved
  * node ["this node"] from which to free a huge page.  Advance the
  * next node id whether or not we find a free huge page to free so
  * that the next attempt to free addresses the next node.
@@ -1273,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nodemask)
 {
-       unsigned long nr_pages = 1UL << huge_page_order(h);
+       unsigned long nr_pages = pages_per_huge_page(h);
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
 
@@ -1327,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
 #endif
 
+/*
+ * Remove hugetlb page from lists, and update dtor so that page appears
+ * as just a compound page.  A reference is held on the page.
+ *
+ * Must be called with hugetlb lock held.
+ */
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+                                                       bool adjust_surplus)
+{
+       int nid = page_to_nid(page);
+
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+
+       lockdep_assert_held(&hugetlb_lock);
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return;
+
+       list_del(&page->lru);
+
+       if (HPageFreed(page)) {
+               h->free_huge_pages--;
+               h->free_huge_pages_node[nid]--;
+       }
+       if (adjust_surplus) {
+               h->surplus_huge_pages--;
+               h->surplus_huge_pages_node[nid]--;
+       }
+
+       set_page_refcounted(page);
+       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+
+       h->nr_huge_pages--;
+       h->nr_huge_pages_node[nid]--;
+}
+
 static void update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
@@ -1335,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
-       h->nr_huge_pages--;
-       h->nr_huge_pages_node[page_to_nid(page)]--;
        for (i = 0; i < pages_per_huge_page(h);
             i++, subpage = mem_map_next(subpage, page, i)) {
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1344,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                1 << PG_active | 1 << PG_private |
                                1 << PG_writeback);
        }
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
-       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
-       set_page_refcounted(page);
        if (hstate_is_gigantic(h)) {
-               /*
-                * Temporarily drop the hugetlb_lock, because
-                * we might block in free_gigantic_page().
-                */
-               spin_unlock(&hugetlb_lock);
                destroy_compound_gigantic_page(page, huge_page_order(h));
                free_gigantic_page(page, huge_page_order(h));
-               spin_lock(&hugetlb_lock);
        } else {
                __free_pages(page, huge_page_order(h));
        }
 }
 
+static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+{
+       struct page *page, *t_page;
+
+       list_for_each_entry_safe(page, t_page, list, lru) {
+               update_and_free_page(h, page);
+               cond_resched();
+       }
+}
+
 struct hstate *size_to_hstate(unsigned long size)
 {
        struct hstate *h;
@@ -1373,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size)
        return NULL;
 }
 
-static void __free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
 {
        /*
         * Can't pass hstate in here because it is called from the
@@ -1383,6 +1429,7 @@ static void __free_huge_page(struct page *page)
        int nid = page_to_nid(page);
        struct hugepage_subpool *spool = hugetlb_page_subpool(page);
        bool restore_reserve;
+       unsigned long flags;
 
        VM_BUG_ON_PAGE(page_count(page), page);
        VM_BUG_ON_PAGE(page_mapcount(page), page);
@@ -1411,7 +1458,7 @@ static void __free_huge_page(struct page *page)
                        restore_reserve = true;
        }
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irqsave(&hugetlb_lock, flags);
        ClearHPageMigratable(page);
        hugetlb_cgroup_uncharge_page(hstate_index(h),
                                     pages_per_huge_page(h), page);
@@ -1421,82 +1468,46 @@ static void __free_huge_page(struct page *page)
                h->resv_huge_pages++;
 
        if (HPageTemporary(page)) {
-               list_del(&page->lru);
-               ClearHPageTemporary(page);
+               remove_hugetlb_page(h, page, false);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
                update_and_free_page(h, page);
        } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
-               list_del(&page->lru);
+               remove_hugetlb_page(h, page, true);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
                update_and_free_page(h, page);
-               h->surplus_huge_pages--;
-               h->surplus_huge_pages_node[nid]--;
        } else {
                arch_clear_hugepage_flags(page);
                enqueue_huge_page(h, page);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
        }
-       spin_unlock(&hugetlb_lock);
 }
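
The motivation for the irqsave conversion, sketched as hypothetical caller code (names are illustrative, not from this patch): once hugetlb_lock is IRQ-safe, the final put_page() of a huge page may happen directly from (soft)IRQ context, which is why the llist/workqueue deferral removed below is no longer needed:

	/* last reference dropped from interrupt context */
	static void example_irq_teardown(struct page *hpage)
	{
		/* compound dtor -> free_huge_page() -> spin_lock_irqsave() */
		put_page(hpage);
	}
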
 
 /*
- * As free_huge_page() can be called from a non-task context, we have
- * to defer the actual freeing in a workqueue to prevent potential
- * hugetlb_lock deadlock.
- *
- * free_hpage_workfn() locklessly retrieves the linked list of pages to
- * be freed and frees them one-by-one. As the page->mapping pointer is
- * going to be cleared in __free_huge_page() anyway, it is reused as the
- * llist_node structure of a lockless linked list of huge pages to be freed.
+ * Must be called with the hugetlb lock held
  */
-static LLIST_HEAD(hpage_freelist);
-
-static void free_hpage_workfn(struct work_struct *work)
+static void __prep_account_new_huge_page(struct hstate *h, int nid)
 {
-       struct llist_node *node;
-       struct page *page;
-
-       node = llist_del_all(&hpage_freelist);
-
-       while (node) {
-               page = container_of((struct address_space **)node,
-                                    struct page, mapping);
-               node = node->next;
-               __free_huge_page(page);
-       }
-}
-static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
-
-void free_huge_page(struct page *page)
-{
-       /*
-        * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
-        */
-       if (!in_task()) {
-               /*
-                * Only call schedule_work() if hpage_freelist is previously
-                * empty. Otherwise, schedule_work() had been called but the
-                * workfn hasn't retrieved the list yet.
-                */
-               if (llist_add((struct llist_node *)&page->mapping,
-                             &hpage_freelist))
-                       schedule_work(&free_hpage_work);
-               return;
-       }
-
-       __free_huge_page(page);
+       lockdep_assert_held(&hugetlb_lock);
+       h->nr_huge_pages++;
+       h->nr_huge_pages_node[nid]++;
 }
 
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+static void __prep_new_huge_page(struct page *page)
 {
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
        set_hugetlb_cgroup(page, NULL);
        set_hugetlb_cgroup_rsvd(page, NULL);
-       spin_lock(&hugetlb_lock);
-       h->nr_huge_pages++;
-       h->nr_huge_pages_node[nid]++;
-       ClearHPageFreed(page);
-       spin_unlock(&hugetlb_lock);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+       __prep_new_huge_page(page);
+       spin_lock_irq(&hugetlb_lock);
+       __prep_account_new_huge_page(h, nid);
+       spin_unlock_irq(&hugetlb_lock);
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1693,17 +1704,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 }
 
 /*
- * Free huge page from pool from next node to free.
- * Attempt to keep persistent huge pages more or less
- * balanced over allowed nodes.
+ * Remove huge page from pool from next node to free.  Attempt to keep
+ * persistent huge pages more or less balanced over allowed nodes.
+ * This routine only 'removes' the hugetlb page.  The caller must make
+ * an additional call to free the page to low-level allocators.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
-                                                        bool acct_surplus)
+static struct page *remove_pool_huge_page(struct hstate *h,
+                                               nodemask_t *nodes_allowed,
+                                                bool acct_surplus)
 {
        int nr_nodes, node;
-       int ret = 0;
+       struct page *page = NULL;
 
+       lockdep_assert_held(&hugetlb_lock);
        for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                /*
                 * If we're returning unused surplus pages, only examine
@@ -1711,23 +1725,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
                 */
                if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
                    !list_empty(&h->hugepage_freelists[node])) {
-                       struct page *page =
-                               list_entry(h->hugepage_freelists[node].next,
+                       page = list_entry(h->hugepage_freelists[node].next,
                                          struct page, lru);
-                       list_del(&page->lru);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[node]--;
-                       if (acct_surplus) {
-                               h->surplus_huge_pages--;
-                               h->surplus_huge_pages_node[node]--;
-                       }
-                       update_and_free_page(h, page);
-                       ret = 1;
+                       remove_hugetlb_page(h, page, acct_surplus);
                        break;
                }
        }
 
-       return ret;
+       return page;
 }
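
A sketch of the collect-then-free pattern these helpers enable (hypothetical caller; the real users appear later in this patch): pages are unlinked from the pool under hugetlb_lock, then handed to the low-level allocators with the lock dropped and IRQs enabled:

	static void example_trim_pool(struct hstate *h)
	{
		LIST_HEAD(page_list);
		struct page *page;

		spin_lock_irq(&hugetlb_lock);
		while ((page = remove_pool_huge_page(h, &node_states[N_MEMORY], 0)))
			list_add(&page->lru, &page_list);
		spin_unlock_irq(&hugetlb_lock);

		/* may sleep: called with IRQs enabled and no locks held */
		update_and_free_pages_bulk(h, &page_list);
	}
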
 
 /*
@@ -1749,7 +1754,7 @@ retry:
        if (!PageHuge(page))
                return 0;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        if (!PageHuge(page)) {
                rc = 0;
                goto out;
@@ -1758,7 +1763,6 @@ retry:
        if (!page_count(page)) {
                struct page *head = compound_head(page);
                struct hstate *h = page_hstate(head);
-               int nid = page_to_nid(head);
                if (h->free_huge_pages - h->resv_huge_pages == 0)
                        goto out;
 
@@ -1767,7 +1771,7 @@ retry:
                 * when it is dissolved.
                 */
                if (unlikely(!HPageFreed(head))) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
                        cond_resched();
 
                        /*
@@ -1789,15 +1793,14 @@ retry:
                        SetPageHWPoison(page);
                        ClearPageHWPoison(head);
                }
-               list_del(&head->lru);
-               h->free_huge_pages--;
-               h->free_huge_pages_node[nid]--;
+               remove_hugetlb_page(h, page, false);
                h->max_huge_pages--;
+               spin_unlock_irq(&hugetlb_lock);
                update_and_free_page(h, head);
-               rc = 0;
+               return 0;
        }
 out:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        return rc;
 }
 
@@ -1839,16 +1842,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
        if (hstate_is_gigantic(h))
                return NULL;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
                goto out_unlock;
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
        if (!page)
                return NULL;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        /*
         * We could have raced with the pool size change.
         * Double check that and simply deallocate the new page
@@ -1858,7 +1861,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
         */
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                SetHPageTemporary(page);
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
                put_page(page);
                return NULL;
        } else {
@@ -1867,7 +1870,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
        }
 
 out_unlock:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        return page;
 }
@@ -1917,17 +1920,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask, gfp_t gfp_mask)
 {
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        if (h->free_huge_pages - h->resv_huge_pages > 0) {
                struct page *page;
 
                page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
                if (page) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
                        return page;
                }
        }
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
 }
@@ -1964,6 +1967,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
        long needed, allocated;
        bool alloc_ok = true;
 
+       lockdep_assert_held(&hugetlb_lock);
        needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
        if (needed <= 0) {
                h->resv_huge_pages += delta;
@@ -1975,7 +1979,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
 
        ret = -ENOMEM;
 retry:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
                page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                NUMA_NO_NODE, NULL);
@@ -1992,7 +1996,7 @@ retry:
         * After retaking hugetlb_lock, we need to recalculate 'needed'
         * because either resv_huge_pages or free_huge_pages may have changed.
         */
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        needed = (h->resv_huge_pages + delta) -
                        (h->free_huge_pages + allocated);
        if (needed > 0) {
@@ -2032,12 +2036,12 @@ retry:
                enqueue_huge_page(h, page);
        }
 free:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        /* Free unnecessary surplus pages to the buddy allocator */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru)
                put_page(page);
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
 
        return ret;
 }
@@ -2049,17 +2053,17 @@ free:
  *    to the associated reservation map.
  * 2) Free any unused surplus pages that may have been allocated to satisfy
  *    the reservation.  As many as unused_resv_pages may be freed.
- *
- * Called with hugetlb_lock held.  However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
- * we must make sure nobody else can claim pages we are in the process of
- * freeing.  Do this by ensuring resv_huge_page always is greater than the
- * number of huge pages we plan to free when dropping the lock.
  */
 static void return_unused_surplus_pages(struct hstate *h,
                                        unsigned long unused_resv_pages)
 {
        unsigned long nr_pages;
+       struct page *page;
+       LIST_HEAD(page_list);
+
+       lockdep_assert_held(&hugetlb_lock);
+       /* Uncommit the reservation */
+       h->resv_huge_pages -= unused_resv_pages;
 
        /* Cannot return gigantic pages currently */
        if (hstate_is_gigantic(h))
@@ -2076,24 +2080,21 @@ static void return_unused_surplus_pages(struct hstate *h,
         * evenly across all nodes with memory. Iterate across these nodes
         * until we can no longer free unreserved surplus pages. This occurs
         * when the nodes with surplus pages have no free pages.
-        * free_pool_huge_page() will balance the freed pages across the
+        * remove_pool_huge_page() will balance the freed pages across the
         * on-line nodes with memory and will handle the hstate accounting.
-        *
-        * Note that we decrement resv_huge_pages as we free the pages.  If
-        * we drop the lock, resv_huge_pages will still be sufficiently large
-        * to cover subsequent pages we may free.
         */
        while (nr_pages--) {
-               h->resv_huge_pages--;
-               unused_resv_pages--;
-               if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+               page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
+               if (!page)
                        goto out;
-               cond_resched_lock(&hugetlb_lock);
+
+               list_add(&page->lru, &page_list);
        }
 
 out:
-       /* Fully uncommit the reservation */
-       h->resv_huge_pages -= unused_resv_pages;
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
 }
 
 
@@ -2175,27 +2176,26 @@ static long __vma_reservation_common(struct hstate *h,
 
        if (vma->vm_flags & VM_MAYSHARE)
                return ret;
-       else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
-               /*
-                * In most cases, reserves always exist for private mappings.
-                * However, a file associated with mapping could have been
-                * hole punched or truncated after reserves were consumed.
-                * As subsequent fault on such a range will not use reserves.
-                * Subtle - The reserve map for private mappings has the
-                * opposite meaning than that of shared mappings.  If NO
-                * entry is in the reserve map, it means a reservation exists.
-                * If an entry exists in the reserve map, it means the
-                * reservation has already been consumed.  As a result, the
-                * return value of this routine is the opposite of the
-                * value returned from reserve map manipulation routines above.
-                */
-               if (ret)
-                       return 0;
-               else
-                       return 1;
-       }
-       else
-               return ret < 0 ? ret : 0;
+       /*
+        * We know private mapping must have HPAGE_RESV_OWNER set.
+        *
+        * In most cases, reserves always exist for private mappings.
+        * However, a file associated with the mapping could have been
+        * hole punched or truncated after reserves were consumed. A
+        * subsequent fault on such a range will not use reserves.
+        * Subtle - The reserve map for private mappings has the
+        * opposite meaning than that of shared mappings.  If NO
+        * entry is in the reserve map, it means a reservation exists.
+        * If an entry exists in the reserve map, it means the
+        * reservation has already been consumed.  As a result, the
+        * return value of this routine is the opposite of the
+        * value returned from reserve map manipulation routines above.
+        */
+       if (ret > 0)
+               return 0;
+       if (ret == 0)
+               return 1;
+       return ret;
 }
 
 static long vma_needs_reservation(struct hstate *h,
@@ -2266,6 +2266,134 @@ static void restore_reserve_on_error(struct hstate *h,
        }
 }
 
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * @list: List to isolate the page in case we need to
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
+                                       struct list_head *list)
+{
+       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+       int nid = page_to_nid(old_page);
+       struct page *new_page;
+       int ret = 0;
+
+       /*
+        * Before dissolving the page, we need to allocate a new one for the
+        * pool to remain stable. Using alloc_buddy_huge_page() allows us to
+        * avoid dealing with prep_new_huge_page() and any counters. This
+        * simplifies things and lets us do the whole thing under the
+        * lock.
+        */
+       new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+       if (!new_page)
+               return -ENOMEM;
+
+retry:
+       spin_lock_irq(&hugetlb_lock);
+       if (!PageHuge(old_page)) {
+               /*
+                * Freed from under us. Drop new_page too.
+                */
+               goto free_new;
+       } else if (page_count(old_page)) {
+               /*
+                * Someone has grabbed the page, try to isolate it here.
+                * Fail with -EBUSY if not possible.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               if (!isolate_huge_page(old_page, list))
+                       ret = -EBUSY;
+               spin_lock_irq(&hugetlb_lock);
+               goto free_new;
+       } else if (!HPageFreed(old_page)) {
+               /*
+                * Page's refcount is 0 but it has not been enqueued in the
+                * freelist yet. Race window is small, so we can succeed here if
+                * we retry.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               cond_resched();
+               goto retry;
+       } else {
+               /*
+                * Ok, old_page is still a genuine free hugepage. Remove it from
+                * the freelist and decrease the counters. These will be
+                * incremented again when calling __prep_account_new_huge_page()
+                * and enqueue_huge_page() for new_page. The counters will remain
+                * stable since this happens under the lock.
+                */
+               remove_hugetlb_page(h, old_page, false);
+
+               /*
+                * new_page needs to be initialized with the standard hugetlb
+                * state. This is normally done by prep_new_huge_page() but
+                * that takes hugetlb_lock which is already held so we need to
+                * open code it here.
+                * The reference count trick is needed because the allocator
+                * gives us a referenced page but the pool requires pages
+                * with a 0 refcount.
+                */
+               __prep_new_huge_page(new_page);
+               __prep_account_new_huge_page(h, nid);
+               page_ref_dec(new_page);
+               enqueue_huge_page(h, new_page);
+
+               /*
+                * Pages have been replaced, we can safely free the old one.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               update_and_free_page(h, old_page);
+       }
+
+       return ret;
+
+free_new:
+       spin_unlock_irq(&hugetlb_lock);
+       __free_pages(new_page, huge_page_order(h));
+
+       return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+{
+       struct hstate *h;
+       struct page *head;
+       int ret = -EBUSY;
+
+       /*
+        * The page might have been dissolved from under our feet, so make sure
+        * to carefully check the state under the lock.
+        * Return success when racing as if we dissolved the page ourselves.
+        */
+       spin_lock_irq(&hugetlb_lock);
+       if (PageHuge(page)) {
+               head = compound_head(page);
+               h = page_hstate(head);
+       } else {
+               spin_unlock_irq(&hugetlb_lock);
+               return 0;
+       }
+       spin_unlock_irq(&hugetlb_lock);
+
+       /*
+        * Fence off gigantic pages as there is a cyclic dependency between
+        * alloc_contig_range and them. Return -ENOMEM as this has the effect
+        * of bailing out right away without further retrying.
+        */
+       if (hstate_is_gigantic(h))
+               return -ENOMEM;
+
+       if (page_count(head) && isolate_huge_page(head, list))
+               ret = 0;
+       else if (!page_count(head))
+               ret = alloc_and_dissolve_huge_page(h, head, list);
+
+       return ret;
+}
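
A hypothetical sketch of how an alloc_contig_range()-style caller might consume the return values of this helper (the function name below is illustrative, not from this patch):

	static int example_take_range_page(struct page *page,
					   struct list_head *movable)
	{
		int ret = isolate_or_dissolve_huge_page(page, movable);

		if (ret == -ENOMEM)	/* gigantic page: bail out, do not retry */
			return ret;
		if (ret == -EBUSY)	/* transient failure: caller may retry */
			return ret;
		return 0;		/* dissolved, replaced, or isolated */
	}
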
+
 struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr, int avoid_reserve)
 {
@@ -2316,7 +2444,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 
        /* If this allocation is not consuming a reservation, charge it now.
         */
-       deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+       deferred_reserve = map_chg || avoid_reserve;
        if (deferred_reserve) {
                ret = hugetlb_cgroup_charge_cgroup_rsvd(
                        idx, pages_per_huge_page(h), &h_cg);
@@ -2328,7 +2456,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
        if (ret)
                goto out_uncharge_cgroup_reservation;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        /*
         * gbl_chg is passed to indicate whether or not a page must be taken
         * from the global free pool (global change).  gbl_chg == 0 indicates
@@ -2336,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
         */
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
                page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
@@ -2344,7 +2472,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                        SetHPageRestoreReserve(page);
                        h->resv_huge_pages--;
                }
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                list_add(&page->lru, &h->hugepage_activelist);
                /* Fall through */
        }
@@ -2357,7 +2485,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                                                  h_cg, page);
        }
 
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        hugetlb_set_page_subpool(page, spool);
 
@@ -2547,24 +2675,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
                                                nodemask_t *nodes_allowed)
 {
        int i;
+       LIST_HEAD(page_list);
 
+       lockdep_assert_held(&hugetlb_lock);
        if (hstate_is_gigantic(h))
                return;
 
+       /*
+        * Collect pages to be freed on a list, and free after dropping lock
+        */
        for_each_node_mask(i, *nodes_allowed) {
                struct page *page, *next;
                struct list_head *freel = &h->hugepage_freelists[i];
                list_for_each_entry_safe(page, next, freel, lru) {
                        if (count >= h->nr_huge_pages)
-                               return;
+                               goto out;
                        if (PageHighMem(page))
                                continue;
-                       list_del(&page->lru);
-                       update_and_free_page(h, page);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[page_to_nid(page)]--;
+                       remove_hugetlb_page(h, page, false);
+                       list_add(&page->lru, &page_list);
                }
        }
+
+out:
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
 }
 #else
 static inline void try_to_free_low(struct hstate *h, unsigned long count,
@@ -2583,6 +2719,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
 {
        int nr_nodes, node;
 
+       lockdep_assert_held(&hugetlb_lock);
        VM_BUG_ON(delta != -1 && delta != 1);
 
        if (delta < 0) {
@@ -2610,6 +2747,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                              nodemask_t *nodes_allowed)
 {
        unsigned long min_count, ret;
+       struct page *page;
+       LIST_HEAD(page_list);
        NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
 
        /*
@@ -2622,7 +2761,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        else
                return -ENOMEM;
 
-       spin_lock(&hugetlb_lock);
+       /*
+        * resize_lock mutex prevents concurrent adjustments to number of
+        * pages in hstate via the proc/sysfs interfaces.
+        */
+       mutex_lock(&h->resize_lock);
+       spin_lock_irq(&hugetlb_lock);
 
        /*
         * Check for a node specific request.
@@ -2653,7 +2797,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
         */
        if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
                if (count > persistent_huge_pages(h)) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
+                       mutex_unlock(&h->resize_lock);
                        NODEMASK_FREE(node_alloc_noretry);
                        return -EINVAL;
                }
@@ -2682,14 +2827,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                 * page, free_huge_page will handle it by freeing the page
                 * and reducing the surplus.
                 */
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
 
                /* yield cpu to avoid soft lockup */
                cond_resched();
 
                ret = alloc_pool_huge_page(h, nodes_allowed,
                                                node_alloc_noretry);
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                if (!ret)
                        goto out;
 
@@ -2716,18 +2861,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
        min_count = max(count, min_count);
        try_to_free_low(h, min_count, nodes_allowed);
+
+       /*
+        * Collect pages to be removed on list without dropping lock
+        */
        while (min_count < persistent_huge_pages(h)) {
-               if (!free_pool_huge_page(h, nodes_allowed, 0))
+               page = remove_pool_huge_page(h, nodes_allowed, 0);
+               if (!page)
                        break;
-               cond_resched_lock(&hugetlb_lock);
+
+               list_add(&page->lru, &page_list);
        }
+       /* free the pages after dropping lock */
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
+
        while (count < persistent_huge_pages(h)) {
                if (!adjust_pool_surplus(h, nodes_allowed, 1))
                        break;
        }
 out:
        h->max_huge_pages = persistent_huge_pages(h);
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
+       mutex_unlock(&h->resize_lock);
 
        NODEMASK_FREE(node_alloc_noretry);
 
@@ -2882,9 +3039,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
        if (err)
                return err;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        h->nr_overcommit_huge_pages = input;
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        return count;
 }
@@ -3215,6 +3372,7 @@ void __init hugetlb_add_hstate(unsigned int order)
        BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
        BUG_ON(order == 0);
        h = &hstates[hugetlb_max_hstate++];
+       mutex_init(&h->resize_lock);
        h->order = order;
        h->mask = ~(huge_page_size(h) - 1);
        for (i = 0; i < MAX_NUMNODES; ++i)
@@ -3267,10 +3425,10 @@ static int __init hugepages_setup(char *s)
 
        /*
         * Global state is always initialized later in hugetlb_init.
-        * But we need to allocate >= MAX_ORDER hstates here early to still
+        * But we need to allocate gigantic hstates here early to still
         * use the bootmem allocator.
         */
-       if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+       if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
                hugetlb_hstate_alloc_pages(parsed_hstate);
 
        last_mhp = mhp;
@@ -3470,9 +3628,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
                goto out;
 
        if (write) {
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                h->nr_overcommit_huge_pages = tmp;
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
        }
 out:
        return ret;
@@ -3568,7 +3726,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
        if (!delta)
                return 0;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        /*
         * When cpuset is configured, it breaks the strict hugetlb page
         * reservation as the accounting is done on a global variable. Such
@@ -3607,7 +3765,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
                return_unused_surplus_pages(h, (unsigned long) -delta);
 
 out:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        return ret;
 }
 
@@ -3795,7 +3953,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
-               dst_pte = huge_pte_alloc(dst, addr, sz);
+               dst_pte = huge_pte_alloc(dst, vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@ -4310,6 +4468,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
        return 0;
 }
 
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+                                                 struct address_space *mapping,
+                                                 pgoff_t idx,
+                                                 unsigned int flags,
+                                                 unsigned long haddr,
+                                                 unsigned long reason)
+{
+       vm_fault_t ret;
+       u32 hash;
+       struct vm_fault vmf = {
+               .vma = vma,
+               .address = haddr,
+               .flags = flags,
+
+               /*
+                * Hard to debug if it ends up being
+                * used by a callee that assumes
+                * something about the other
+                * uninitialized fields... same as in
+                * memory.c
+                */
+       };
+
+       /*
+        * hugetlb_fault_mutex and i_mmap_rwsem must be
+        * dropped before handling userfault.  Reacquire
+        * after handling fault to make calling code simpler.
+        */
+       hash = hugetlb_fault_mutex_hash(mapping, idx);
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+       i_mmap_unlock_read(mapping);
+       ret = handle_userfault(&vmf, reason);
+       i_mmap_lock_read(mapping);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+       return ret;
+}
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
@@ -4348,35 +4544,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
-               /*
-                * Check for page in userfault range
-                */
+               /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
-                       u32 hash;
-                       struct vm_fault vmf = {
-                               .vma = vma,
-                               .address = haddr,
-                               .flags = flags,
-                               /*
-                                * Hard to debug if it ends up being
-                                * used by a callee that assumes
-                                * something about the other
-                                * uninitialized fields... same as in
-                                * memory.c
-                                */
-                       };
-
-                       /*
-                        * hugetlb_fault_mutex and i_mmap_rwsem must be
-                        * dropped before handling userfault.  Reacquire
-                        * after handling fault to make calling code simpler.
-                        */
-                       hash = hugetlb_fault_mutex_hash(mapping, idx);
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       i_mmap_unlock_read(mapping);
-                       ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-                       i_mmap_lock_read(mapping);
-                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                                                      flags, haddr,
+                                                      VM_UFFD_MISSING);
                        goto out;
                }
 
@@ -4395,13 +4567,10 @@ retry:
                         * sure there really is no pte entry.
                         */
                        ptl = huge_pte_lock(h, mm, ptep);
-                       if (!huge_pte_none(huge_ptep_get(ptep))) {
-                               ret = 0;
-                               spin_unlock(ptl);
-                               goto out;
-                       }
+                       ret = 0;
+                       if (huge_pte_none(huge_ptep_get(ptep)))
+                               ret = vmf_error(PTR_ERR(page));
                        spin_unlock(ptl);
-                       ret = vmf_error(PTR_ERR(page));
                        goto out;
                }
                clear_huge_page(page, address, pages_per_huge_page(h));
@@ -4435,6 +4604,16 @@ retry:
                                VM_FAULT_SET_HINDEX(hstate_index(h));
                        goto backout_unlocked;
                }
+
+               /* Check for page in userfault range. */
+               if (userfaultfd_minor(vma)) {
+                       unlock_page(page);
+                       put_page(page);
+                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                                                      flags, haddr,
+                                                      VM_UFFD_MINOR);
+                       goto out;
+               }
        }
 
        /*
@@ -4563,7 +4742,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        mapping = vma->vm_file->f_mapping;
        i_mmap_lock_read(mapping);
-       ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
        if (!ptep) {
                i_mmap_unlock_read(mapping);
                return VM_FAULT_OOM;
@@ -4675,6 +4854,7 @@ out_mutex:
        return ret;
 }
 
+#ifdef CONFIG_USERFAULTFD
 /*
  * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
  * modifications for huge pages.
@@ -4684,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
+                           enum mcopy_atomic_mode mode,
                            struct page **pagep)
 {
+       bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
        struct address_space *mapping;
        pgoff_t idx;
        unsigned long size;
@@ -4695,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        spinlock_t *ptl;
        int ret;
        struct page *page;
+       int writable;
+
+       mapping = dst_vma->vm_file->f_mapping;
+       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
 
-       if (!*pagep) {
+       if (is_continue) {
+               ret = -EFAULT;
+               page = find_lock_page(mapping, idx);
+               if (!page)
+                       goto out;
+       } else if (!*pagep) {
                ret = -ENOMEM;
                page = alloc_huge_page(dst_vma, dst_addr, 0);
                if (IS_ERR(page))
@@ -4725,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         */
        __SetPageUptodate(page);
 
-       mapping = dst_vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
-       /*
-        * If shared, add to page cache
-        */
-       if (vm_shared) {
+       /* Add shared, newly allocated pages to the page cache. */
+       if (vm_shared && !is_continue) {
                size = i_size_read(mapping->host) >> huge_page_shift(h);
                ret = -EFAULT;
                if (idx >= size)
@@ -4776,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
        }
 
-       _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
-       if (dst_vma->vm_flags & VM_WRITE)
+       /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+       if (is_continue && !vm_shared)
+               writable = 0;
+       else
+               writable = dst_vma->vm_flags & VM_WRITE;
+
+       _dst_pte = make_huge_pte(dst_vma, page, writable);
+       if (writable)
                _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);
 
@@ -4791,20 +4983,22 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
        spin_unlock(ptl);
-       SetHPageMigratable(page);
-       if (vm_shared)
+       if (!is_continue)
+               SetHPageMigratable(page);
+       if (vm_shared || is_continue)
                unlock_page(page);
        ret = 0;
 out:
        return ret;
 out_release_unlock:
        spin_unlock(ptl);
-       if (vm_shared)
+       if (vm_shared || is_continue)
                unlock_page(page);
 out_release_nounlock:
        put_page(page);
        goto out;
 }
+#endif /* CONFIG_USERFAULTFD */
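
For context, a hypothetical userspace sketch of the minor-fault flow that the CONTINUE mode above serves: after the contents have been populated through a second mapping of the same file, UFFDIO_CONTINUE asks the kernel to install PTEs for the faulting range (struct and ioctl names are from the userfaultfd uapi; error handling elided):

	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	static int uffd_continue(int uffd, unsigned long addr, unsigned long len)
	{
		struct uffdio_continue cont = {
			.range = { .start = addr, .len = len },
			.mode  = 0,
		};

		/* resolve a minor fault by mapping the existing page cache page */
		return ioctl(uffd, UFFDIO_CONTINUE, &cont);
	}
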
 
 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
                                 int refs, struct page **pages,
@@ -4996,14 +5190,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        return i ? i : err;
 }
 
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
-#endif
-
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
 {
@@ -5280,6 +5466,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
        /*
         * If the subpool has a minimum size, the number of global
         * reservations to be released may be adjusted.
+        *
+        * Note that !resv_map implies freed == 0. So (chg - freed)
+        * won't go negative.
         */
        gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
        hugetlb_acct_memory(h, -gbl_reserve);
@@ -5326,6 +5515,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
        return false;
 }
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_USERFAULTFD
+       if (uffd_disable_huge_pmd_share(vma))
+               return false;
+#endif
+       return vma_shareable(vma, addr);
+}
+
 /*
  * Determine if start,end range within vma could be mapped by shared pmd.
  * If yes, adjust start and end to cover range associated with possible
@@ -5338,8 +5536,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
 
        /*
-        * vma need span at least one aligned PUD size and the start,end range
-        * must at least partialy within it.
+        * vma needs to span at least one aligned PUD size, and the range
+        * must be at least partially within it.
         */
        if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
                (*end <= v_start) || (*start >= v_end))
@@ -5370,9 +5568,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
  * only required for subsequent processing.
  */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
 {
-       struct vm_area_struct *vma = find_vma(mm, addr);
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
@@ -5382,9 +5580,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (!vma_shareable(vma, addr))
-               return (pte_t *)pmd_alloc(mm, pud, addr);
-
        i_mmap_assert_locked(mapping);
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
@@ -5448,9 +5643,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
        return 1;
 }
-#define want_pmd_share()       (1)
+
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
 {
        return NULL;
 }
@@ -5465,11 +5661,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
 {
 }
-#define want_pmd_share()       (0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+       return false;
+}
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
@@ -5487,8 +5687,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
-                       if (want_pmd_share() && pud_none(*pud))
-                               pte = huge_pmd_share(mm, addr, pud);
+                       if (want_pmd_share(vma, addr) && pud_none(*pud))
+                               pte = huge_pmd_share(mm, vma, addr, pud);
                        else
                                pte = (pte_t *)pmd_alloc(mm, pud, addr);
                }
@@ -5632,7 +5832,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
 {
        bool ret = true;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        if (!PageHeadHuge(page) ||
            !HPageMigratable(page) ||
            !get_page_unless_zero(page)) {
@@ -5642,16 +5842,16 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
        ClearHPageMigratable(page);
        list_move_tail(&page->lru, list);
 unlock:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        SetHPageMigratable(page);
        list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        put_page(page);
 }
 
@@ -5679,13 +5879,70 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
                SetHPageTemporary(oldpage);
                ClearHPageTemporary(newpage);
 
-               spin_lock(&hugetlb_lock);
+               /*
+                * There is no need to transfer the per-node surplus state
+                * when we do not cross the node.
+                */
+               if (new_nid == old_nid)
+                       return;
+               spin_lock_irq(&hugetlb_lock);
                if (h->surplus_huge_pages_node[old_nid]) {
                        h->surplus_huge_pages_node[old_nid]--;
                        h->surplus_huge_pages_node[new_nid]++;
                }
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
+       }
+}
+
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+       struct hstate *h = hstate_vma(vma);
+       unsigned long sz = huge_page_size(h);
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_notifier_range range;
+       unsigned long address, start, end;
+       spinlock_t *ptl;
+       pte_t *ptep;
+
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               return;
+
+       start = ALIGN(vma->vm_start, PUD_SIZE);
+       end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+       if (start >= end)
+               return;
+
+       /*
+        * No need to call adjust_range_if_pmd_sharing_possible(), because
+        * we have already done the PUD_SIZE alignment.
+        */
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               start, end);
+       mmu_notifier_invalidate_range_start(&range);
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+       for (address = start; address < end; address += PUD_SIZE) {
+               unsigned long tmp = address;
+
+               ptep = huge_pte_offset(mm, address, sz);
+               if (!ptep)
+                       continue;
+               ptl = huge_pte_lock(h, mm, ptep);
+               /* We don't want 'address' to be changed */
+               huge_pmd_unshare(mm, vma, &tmp, ptep);
+               spin_unlock(ptl);
        }
+       flush_hugetlb_tlb_range(vma, start, end);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+       /*
+        * No need to call mmu_notifier_invalidate_range(), see
+        * Documentation/vm/mmu_notifier.rst.
+        */
+       mmu_notifier_invalidate_range_end(&range);
 }
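
A hypothetical caller sketch: any path that must guarantee private page tables for a hugetlbfs VMA (for instance before tracking its faults) can invoke the helper unconditionally, since it bails out early for non-shareable ranges:

	static void example_stop_pmd_sharing(struct vm_area_struct *vma)
	{
		if (is_vm_hugetlb_page(vma))
			hugetlb_unshare_all_pmds(vma);	/* no-op unless VM_MAYSHARE */
	}
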
 
 #ifdef CONFIG_CMA
index 603a131..5383023 100644 (file)
@@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
        do {
                idx = 0;
                for_each_hstate(h) {
-                       spin_lock(&hugetlb_lock);
+                       spin_lock_irq(&hugetlb_lock);
                        list_for_each_entry(page, &h->hugepage_activelist, lru)
                                hugetlb_cgroup_move_parent(idx, h_cg, page);
 
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
                        idx++;
                }
                cond_resched();
@@ -784,8 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
        if (hugetlb_cgroup_disabled())
                return;
 
-       VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        h_cg = hugetlb_cgroup_from_page(oldhpage);
        h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
        set_hugetlb_cgroup(oldhpage, NULL);
@@ -795,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
        set_hugetlb_cgroup(newhpage, h_cg);
        set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
        list_move(&newhpage->lru, &h->hugepage_activelist);
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        return;
 }
 
index ef5f336..54bd0dc 100644 (file)
@@ -244,7 +244,13 @@ struct compact_control {
        unsigned int nr_freepages;      /* Number of isolated free pages */
        unsigned int nr_migratepages;   /* Number of pages to migrate */
        unsigned long free_pfn;         /* isolate_freepages search base */
-       unsigned long migrate_pfn;      /* isolate_migratepages search base */
+       /*
+        * Acts as an in/out parameter to page isolation for migration.
+        * isolate_migratepages uses it as a search base.
+        * isolate_migratepages_block will update the value to the next pfn
+        * after the last isolated one.
+        */
+       unsigned long migrate_pfn;
        unsigned long fast_start_pfn;   /* a pfn to start linear scan from */
        struct zone *zone;
        unsigned long total_migrate_scanned;
@@ -280,7 +286,7 @@ struct capture_control {
 unsigned long
 isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn);
-unsigned long
+int
 isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);
 int find_suitable_fallback(struct free_area *area, unsigned int order,
@@ -328,7 +334,7 @@ static inline bool is_exec_mapping(vm_flags_t flags)
 }
 
 /*
- * Stack area - atomatically grows in one direction
+ * Stack area - automatically grows in one direction
  *
  * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
  * do_mmap() forbids all other combinations.
index 3820ca5..8f450bc 100644 (file)
@@ -55,9 +55,9 @@ extern bool kasan_flag_async __ro_after_init;
 #define KASAN_TAG_MAX          0xFD /* maximum value for random tags */
 
 #ifdef CONFIG_KASAN_HW_TAGS
-#define KASAN_TAG_MIN          0xF0 /* mimimum value for random tags */
+#define KASAN_TAG_MIN          0xF0 /* minimum value for random tags */
 #else
-#define KASAN_TAG_MIN          0x00 /* mimimum value for random tags */
+#define KASAN_TAG_MIN          0x00 /* minimum value for random tags */
 #endif
 
 #ifdef CONFIG_KASAN_GENERIC
@@ -403,7 +403,7 @@ static inline bool kasan_byte_accessible(const void *addr)
 #else /* CONFIG_KASAN_HW_TAGS */
 
 /**
- * kasan_poison - mark the memory range as unaccessible
+ * kasan_poison - mark the memory range as inaccessible
  * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
  * @size - range size, must be aligned to KASAN_GRANULE_SIZE
  * @value - value that's written to metadata for the range
@@ -434,7 +434,7 @@ bool kasan_byte_accessible(const void *addr);
 
 /**
  * kasan_poison_last_granule - mark the last granule of the memory range as
- * unaccessible
+ * inaccessible
  * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
  * @size - range size
  *
index 728fb24..d8ccff4 100644 (file)
@@ -27,7 +27,7 @@
 /* Data structure and operations for quarantine queues. */
 
 /*
- * Each queue is a signle-linked list, which also stores the total size of
+ * Each queue is a single-linked list, which also stores the total size of
  * objects inside of it.
  */
 struct qlist_head {
@@ -138,7 +138,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
                local_irq_save(flags);
 
        /*
-        * As the object now gets freed from the quaratine, assume that its
+        * As the object now gets freed from the quarantine, assume that its
         * free track is no longer valid.
         */
        *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
index 727ad46..082ee5b 100644 (file)
@@ -316,7 +316,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
         * // rest of vmalloc process           <data dependency>
         * STORE p, a                           LOAD shadow(x+99)
         *
-        * If there is no barrier between the end of unpoisioning the shadow
+        * If there is no barrier between the end of unpoisoning the shadow
         * and the store of the result to p, the stores could be committed
         * in a different order by CPU#0, and CPU#1 could erroneously observe
         * poison in the shadow.
@@ -384,7 +384,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  * How does this work?
  * -------------------
  *
- * We have a region that is page aligned, labelled as A.
+ * We have a region that is page aligned, labeled as A.
  * That might not map onto the shadow in a way that is page-aligned:
  *
  *                    start                     end
index d53c91f..e18fbbd 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/atomic.h>
 #include <linux/bug.h>
 #include <linux/debugfs.h>
+#include <linux/irq_work.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
 #include <linux/kmemleak.h>
@@ -19,6 +20,7 @@
 #include <linux/moduleparam.h>
 #include <linux/random.h>
 #include <linux/rcupdate.h>
+#include <linux/sched/sysctl.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
@@ -372,6 +374,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 
        /* Restore page protection if there was an OOB access. */
        if (meta->unprotected_page) {
+               memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
                kfence_protect(meta->unprotected_page);
                meta->unprotected_page = 0;
        }
@@ -586,6 +589,17 @@ late_initcall(kfence_debugfs_init);
 
 /* === Allocation Gate Timer ================================================ */
 
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* Wait queue to wake up allocation-gate timer task. */
+static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
+
+static void wake_up_kfence_timer(struct irq_work *work)
+{
+       wake_up(&allocation_wait);
+}
+static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
+#endif
+
 /*
  * Set up delayed work, which will enable and disable the static key. We need to
  * use a work queue (rather than a simple timer), since enabling and disabling a
@@ -603,29 +617,27 @@ static void toggle_allocation_gate(struct work_struct *work)
        if (!READ_ONCE(kfence_enabled))
                return;
 
-       /* Enable static key, and await allocation to happen. */
        atomic_set(&kfence_allocation_gate, 0);
 #ifdef CONFIG_KFENCE_STATIC_KEYS
+       /* Enable static key, and await allocation to happen. */
        static_branch_enable(&kfence_allocation_key);
-       /*
-        * Await an allocation. Timeout after 1 second, in case the kernel stops
-        * doing allocations, to avoid stalling this worker task for too long.
-        */
-       {
-               unsigned long end_wait = jiffies + HZ;
-
-               do {
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       if (atomic_read(&kfence_allocation_gate) != 0)
-                               break;
-                       schedule_timeout(1);
-               } while (time_before(jiffies, end_wait));
-               __set_current_state(TASK_RUNNING);
+
+       if (sysctl_hung_task_timeout_secs) {
+               /*
+                * During low activity with no allocations we might wait a
+                * while; let's avoid the hung task warning.
+                */
+               wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
+                                  sysctl_hung_task_timeout_secs * HZ / 2);
+       } else {
+               wait_event(allocation_wait, atomic_read(&kfence_allocation_gate));
        }
+
        /* Disable static key and reset timer. */
        static_branch_disable(&kfence_allocation_key);
 #endif
-       schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
+       queue_delayed_work(system_power_efficient_wq, &kfence_timer,
+                          msecs_to_jiffies(kfence_sample_interval));
 }
 static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
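
With this change the timer task no longer polls in a schedule_timeout() loop: it sleeps on allocation_wait until the first allocation trips the gate, bounded by half of sysctl_hung_task_timeout_secs so the hung-task detector stays quiet, and the allocation path (in the __kfence_alloc hunk below) wakes it through an irq_work because a direct wake_up() from timer context could deadlock. A rough userspace analogue of that sleep/wake shape using POSIX condition variables; this is only a sketch of the pattern, not kernel code, and the 2-second bound is a stand-in for the hung-task limit:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

/* Userspace analogue of the allocation gate: the "timer" thread sleeps
 * until an allocation opens the gate, instead of polling in a loop. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t allocation_wait = PTHREAD_COND_INITIALIZER;
static atomic_int allocation_gate;

static void *alloc_path(void *arg)
{
	atomic_fetch_add(&allocation_gate, 1);	/* first allocation through */
	pthread_mutex_lock(&lock);
	pthread_cond_signal(&allocation_wait);	/* the kernel defers this via irq_work */
	pthread_mutex_unlock(&lock);
	return arg;
}

int main(void)
{
	struct timespec deadline;
	pthread_t t;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 2;	/* stand-in for the hung-task bound */

	pthread_create(&t, NULL, alloc_path, NULL);
	pthread_mutex_lock(&lock);
	while (!atomic_load(&allocation_gate) &&
	       pthread_cond_timedwait(&allocation_wait, &lock, &deadline) == 0)
		;	/* re-check the gate after each wakeup */
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	printf("gate=%d\n", atomic_load(&allocation_gate));
	return 0;
}
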
 
@@ -654,7 +666,7 @@ void __init kfence_init(void)
        }
 
        WRITE_ONCE(kfence_enabled, true);
-       schedule_delayed_work(&kfence_timer, 0);
+       queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0);
        pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
                CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
                (void *)(__kfence_pool + KFENCE_POOL_SIZE));
@@ -728,6 +740,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
         */
        if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
                return NULL;
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+       /*
+        * waitqueue_active() is fully ordered after the update of
+        * kfence_allocation_gate per atomic_inc_return().
+        */
+       if (waitqueue_active(&allocation_wait)) {
+               /*
+                * Calling wake_up() here may deadlock when allocations happen
+                * from within timer code. Use an irq_work to defer it.
+                */
+               irq_work_queue(&wake_up_kfence_timer_work);
+       }
+#endif
 
        if (!READ_ONCE(kfence_enabled))
                return NULL;
index e3f7145..2a319c2 100644 (file)
@@ -263,6 +263,6 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
        if (panic_on_warn)
                panic("panic_on_warn set ...\n");
 
-       /* We encountered a memory unsafety error, taint the kernel! */
+       /* We encountered a memory safety error, taint the kernel! */
        add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
 }
index a7d6cb9..6c0185f 100644 (file)
@@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm)
                return -ENOMEM;
 
        /* __khugepaged_exit() must not run from under us */
-       VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+       VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
                free_mm_slot(mm_slot);
                return 0;
@@ -667,7 +667,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 *
                 * The page table that maps the page has been already unlinked
                 * from the page table tree and this process cannot get
-                * an additinal pin on the page.
+                * an additional pin on the page.
                 *
                 * New pins can come later if the page is shared across fork,
                 * but not from this process. The other process cannot write to
@@ -716,17 +716,17 @@ next:
                if (pte_write(pteval))
                        writable = true;
        }
-       if (likely(writable)) {
-               if (likely(referenced)) {
-                       result = SCAN_SUCCEED;
-                       trace_mm_collapse_huge_page_isolate(page, none_or_zero,
-                                                           referenced, writable, result);
-                       return 1;
-               }
-       } else {
+
+       if (unlikely(!writable)) {
                result = SCAN_PAGE_RO;
+       } else if (unlikely(!referenced)) {
+               result = SCAN_LACK_REFERENCED_PAGE;
+       } else {
+               result = SCAN_SUCCEED;
+               trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+                                                   referenced, writable, result);
+               return 1;
        }
-
 out:
        release_pte_pages(pte, _pte, compound_pagelist);
        trace_mm_collapse_huge_page_isolate(page, none_or_zero,
@@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid)
         * If node_reclaim_mode is disabled, then no extra effort is made to
         * allocate memory locally.
         */
-       if (!node_reclaim_mode)
+       if (!node_reclaim_enabled())
                return false;
 
        /* If there is a count for this node already, it must be acceptable */
@@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm,
        mmap_write_lock(mm);
        result = hugepage_vma_revalidate(mm, address, &vma);
        if (result)
-               goto out;
+               goto out_up_write;
        /* check if the pmd is still valid */
        if (mm_find_pmd(mm, address) != pmd)
-               goto out;
+               goto out_up_write;
 
        anon_vma_lock_write(vma->anon_vma);
 
@@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
                result = SCAN_FAIL;
-               goto out;
+               goto out_up_write;
        }
 
        /*
@@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm,
        __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
                        &compound_pagelist);
        pte_unmap(pte);
+       /*
+        * spin_lock() below is not the equivalent of smp_wmb(), but
+        * the smp_wmb() inside __SetPageUptodate() can be reused to
+        * avoid the copy_huge_page writes becoming visible after
+        * the set_pmd_at() write.
+        */
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
 
        _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
-       /*
-        * spin_lock() below is not the equivalent of smp_wmb(), so
-        * this is needed to avoid the copy_huge_page writes to become
-        * visible after the set_pmd_at() write.
-        */
-       smp_wmb();
-
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
        page_add_new_anon_rmap(new_page, vma, address, true);
@@ -1216,8 +1215,6 @@ out_nolock:
                mem_cgroup_uncharge(*hpage);
        trace_mm_collapse_huge_page(mm, isolated, result);
        return;
-out:
-       goto out_up_write;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1274,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                                goto out_unmap;
                        }
                }
-               if (!pte_present(pteval)) {
-                       result = SCAN_PTE_NON_PRESENT;
-                       goto out_unmap;
-               }
                if (pte_uffd_wp(pteval)) {
                        /*
                         * Don't collapse the page if any of the small
@@ -1447,7 +1440,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        int i;
 
        if (!vma || !vma->vm_file ||
-           vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+           !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
                return;
 
        /*
@@ -1533,16 +1526,16 @@ abort:
        goto drop_hpage;
 }
 
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 {
        struct mm_struct *mm = mm_slot->mm;
        int i;
 
        if (likely(mm_slot->nr_pte_mapped_thp == 0))
-               return 0;
+               return;
 
        if (!mmap_write_trylock(mm))
-               return -EBUSY;
+               return;
 
        if (unlikely(khugepaged_test_exit(mm)))
                goto out;
@@ -1553,7 +1546,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 out:
        mm_slot->nr_pte_mapped_thp = 0;
        mmap_write_unlock(mm);
-       return 0;
 }
 
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
@@ -2057,9 +2049,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
        BUILD_BUG();
 }
 
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 {
-       return 0;
 }
 #endif
 
@@ -2205,11 +2196,9 @@ static void khugepaged_do_scan(void)
 {
        struct page *hpage = NULL;
        unsigned int progress = 0, pass_through_head = 0;
-       unsigned int pages = khugepaged_pages_to_scan;
+       unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
        bool wait = true;
 
-       barrier(); /* write khugepaged_pages_to_scan to local stack */
-
        lru_add_drain_all();
 
        while (progress < pages) {
index 9694ee2..6bbe314 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -215,8 +215,6 @@ struct rmap_item {
 #define SEQNR_MASK     0x0ff   /* low bits of unstable tree seqnr */
 #define UNSTABLE_FLAG  0x100   /* is a node of the unstable tree */
 #define STABLE_FLAG    0x200   /* is listed from the stable tree */
-#define KSM_FLAG_MASK  (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
-                               /* to mask all the flags */
 
 /* The stable and unstable tree heads */
 static struct rb_root one_stable_tree[1] = { RB_ROOT };
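
KSM_FLAG_MASK can go because rmap_item->address is page aligned, so all of the flag and seqnr bits live below PAGE_SHIFT and a plain `address & PAGE_MASK` (as the lookup hunk further down now does) strips them in one go. A minimal sketch of the packing, assuming 4 KiB pages and a made-up user address:

#include <stdio.h>

#define PAGE_MASK	(~0xfffUL)	/* assumes 4 KiB pages, for illustration */
#define SEQNR_MASK	0x0ffUL
#define UNSTABLE_FLAG	0x100UL
#define STABLE_FLAG	0x200UL

int main(void)
{
	/* A page-aligned address with an unstable-tree seqnr packed into
	 * the low bits, as rmap_item->address does. */
	unsigned long address = 0x7f1234560000UL | UNSTABLE_FLAG | (42 & SEQNR_MASK);

	printf("unstable=%d stable=%d seqnr=%lu\n",
	       !!(address & UNSTABLE_FLAG), !!(address & STABLE_FLAG),
	       address & SEQNR_MASK);
	/* PAGE_MASK alone strips every flag bit at once. */
	printf("plain addr: %#lx\n", address & PAGE_MASK);
	return 0;
}
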
@@ -461,7 +459,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
  * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
  * in case the application has unmapped and remapped mm,addr meanwhile.
  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
- * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ * mmap of /dev/mem, where we would not want to touch it.
  *
  * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
  * of the process that owns 'vma'.  We also do not want to enforce
@@ -778,12 +776,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                struct page *page;
 
                stable_node = rmap_item->head;
-               page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
+               page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
                if (!page)
                        goto out;
 
                hlist_del(&rmap_item->hlist);
-               unlock_page(page);
                put_page(page);
 
                if (!hlist_empty(&stable_node->hlist))
@@ -794,6 +791,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                stable_node->rmap_hlist_len--;
 
                put_anon_vma(rmap_item->anon_vma);
+               rmap_item->head = NULL;
                rmap_item->address &= PAGE_MASK;
 
        } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -817,8 +815,7 @@ out:
        cond_resched();         /* we're called from many long loops */
 }
 
-static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
-                                      struct rmap_item **rmap_list)
+static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
 {
        while (*rmap_list) {
                struct rmap_item *rmap_item = *rmap_list;
@@ -989,7 +986,7 @@ static int unmerge_and_remove_all_rmap_items(void)
                                goto error;
                }
 
-               remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+               remove_trailing_rmap_items(&mm_slot->rmap_list);
                mmap_read_unlock(mm);
 
                spin_lock(&ksm_mmlist_lock);
@@ -1068,7 +1065,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
                /*
                 * Ok this is tricky, when get_user_pages_fast() run it doesn't
                 * take any lock, therefore the check that we are going to make
-                * with the pagecount against the mapcount is racey and
+                * with the pagecount against the mapcount is racy and
                 * O_DIRECT can happen right after the check.
                 * So we clear the pte and flush the tlb before the check
                 * this assure us that no O_DIRECT can happen after the check
@@ -1438,7 +1435,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
                         */
                        *_stable_node = found;
                        /*
-                        * Just for robustneess as stable_node is
+                        * Just for robustness, as stable_node is
                         * otherwise left as a stable pointer, the
                         * compiler shall optimize it away at build
                         * time.
@@ -1771,7 +1768,6 @@ chain_append:
         * stable_node_dup is the dup to replace.
         */
        if (stable_node_dup == stable_node) {
-               VM_BUG_ON(is_stable_node_chain(stable_node_dup));
                VM_BUG_ON(is_stable_node_dup(stable_node_dup));
                /* chain is missing so create it */
                stable_node = alloc_stable_node_chain(stable_node_dup,
@@ -1785,7 +1781,6 @@ chain_append:
         * of the current nid for this page
         * content.
         */
-       VM_BUG_ON(!is_stable_node_chain(stable_node));
        VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
        VM_BUG_ON(page_node->head != &migrate_nodes);
        list_del(&page_node->list);
@@ -2337,7 +2332,7 @@ next_mm:
         * Nuke all the rmap_items that are above this current rmap:
         * because there were no VM_MERGEABLE vmas with such addresses.
         */
-       remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
+       remove_trailing_rmap_items(ksm_scan.rmap_list);
 
        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -2634,7 +2629,7 @@ again:
                        vma = vmac->vma;
 
                        /* Ignore the stable/unstable/sqnr flags */
-                       addr = rmap_item->address & ~KSM_FLAG_MASK;
+                       addr = rmap_item->address & PAGE_MASK;
 
                        if (addr < vma->vm_start || addr >= vma->vm_end)
                                continue;
index 6f067b6..cd58790 100644 (file)
@@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
                list_add_tail(item, &l->list);
                /* Set shrinker bit if the first element was added */
                if (!l->nr_items++)
-                       memcg_set_shrinker_bit(memcg, nid,
-                                              lru_shrinker_id(lru));
+                       set_shrinker_bit(memcg, nid,
+                                        lru_shrinker_id(lru));
                nlru->nr_items++;
                spin_unlock(&nlru->lock);
                return true;
@@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
 
        if (src->nr_items) {
                dst->nr_items += src->nr_items;
-               memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
+               set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
                src->nr_items = 0;
        }
 
index 01fef79..63e489e 100644 (file)
@@ -799,7 +799,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
                if (end > vma->vm_end) {
                        /*
                         * Don't fail if end > vma->vm_end. If the old
-                        * vma was splitted while the mmap_lock was
+                        * vma was split while the mmap_lock was
                         * released the effect of the concurrent
                         * operation may not cause madvise() to
                         * have an undefined result. There may be an
@@ -1039,7 +1039,7 @@ process_madvise_behavior_valid(int behavior)
  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
  *  MADV_COLD - the application is not expected to use this memory soon,
  *             deactivate pages in this range so that they can be reclaimed
- *             easily if memory pressure hanppens.
+ *             easily if memory pressure happens.
  *  MADV_PAGEOUT - the application is not expected to use this memory soon,
  *             page out the pages in this range immediately.
  *
index c100265..64ada9e 100644 (file)
@@ -215,7 +215,7 @@ enum res_type {
 #define MEMFILE_PRIVATE(x, val)        ((x) << 16 | (val))
 #define MEMFILE_TYPE(val)      ((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)      ((val) & 0xffff)
-/* Used for OOM nofiier */
+/* Used for OOM notifier */
 #define OOM_CONTROL            (0)
 
 /*
@@ -400,130 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 #endif
 
-static int memcg_shrinker_map_size;
-static DEFINE_MUTEX(memcg_shrinker_map_mutex);
-
-static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
-{
-       kvfree(container_of(head, struct memcg_shrinker_map, rcu));
-}
-
-static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
-                                        int size, int old_size)
-{
-       struct memcg_shrinker_map *new, *old;
-       struct mem_cgroup_per_node *pn;
-       int nid;
-
-       lockdep_assert_held(&memcg_shrinker_map_mutex);
-
-       for_each_node(nid) {
-               pn = memcg->nodeinfo[nid];
-               old = rcu_dereference_protected(pn->shrinker_map, true);
-               /* Not yet online memcg */
-               if (!old)
-                       return 0;
-
-               new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
-               if (!new)
-                       return -ENOMEM;
-
-               /* Set all old bits, clear all new bits */
-               memset(new->map, (int)0xff, old_size);
-               memset((void *)new->map + old_size, 0, size - old_size);
-
-               rcu_assign_pointer(pn->shrinker_map, new);
-               call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
-       }
-
-       return 0;
-}
-
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
-{
-       struct mem_cgroup_per_node *pn;
-       struct memcg_shrinker_map *map;
-       int nid;
-
-       if (mem_cgroup_is_root(memcg))
-               return;
-
-       for_each_node(nid) {
-               pn = memcg->nodeinfo[nid];
-               map = rcu_dereference_protected(pn->shrinker_map, true);
-               kvfree(map);
-               rcu_assign_pointer(pn->shrinker_map, NULL);
-       }
-}
-
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-       struct memcg_shrinker_map *map;
-       int nid, size, ret = 0;
-
-       if (mem_cgroup_is_root(memcg))
-               return 0;
-
-       mutex_lock(&memcg_shrinker_map_mutex);
-       size = memcg_shrinker_map_size;
-       for_each_node(nid) {
-               map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
-               if (!map) {
-                       memcg_free_shrinker_maps(memcg);
-                       ret = -ENOMEM;
-                       break;
-               }
-               rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
-       }
-       mutex_unlock(&memcg_shrinker_map_mutex);
-
-       return ret;
-}
-
-int memcg_expand_shrinker_maps(int new_id)
-{
-       int size, old_size, ret = 0;
-       struct mem_cgroup *memcg;
-
-       size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
-       old_size = memcg_shrinker_map_size;
-       if (size <= old_size)
-               return 0;
-
-       mutex_lock(&memcg_shrinker_map_mutex);
-       if (!root_mem_cgroup)
-               goto unlock;
-
-       for_each_mem_cgroup(memcg) {
-               if (mem_cgroup_is_root(memcg))
-                       continue;
-               ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
-               if (ret) {
-                       mem_cgroup_iter_break(NULL, memcg);
-                       goto unlock;
-               }
-       }
-unlock:
-       if (!ret)
-               memcg_shrinker_map_size = size;
-       mutex_unlock(&memcg_shrinker_map_mutex);
-       return ret;
-}
-
-void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
-{
-       if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
-               struct memcg_shrinker_map *map;
-
-               rcu_read_lock();
-               map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
-               /* Pairs with smp mb in shrink_slab() */
-               smp_mb__before_atomic();
-               set_bit(shrinker_id, map->map);
-               rcu_read_unlock();
-       }
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -910,7 +786,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
  * __count_memcg_events - account VM events in a cgroup
  * @memcg: the memory cgroup
  * @idx: the event item
- * @count: the number of events that occured
+ * @count: the number of events that occurred
  */
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                          unsigned long count)
@@ -1028,7 +904,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
        rcu_read_lock();
        do {
                /*
-                * Page cache insertions can happen withou an
+                * Page cache insertions can happen without an
                 * actual mm context, e.g. during disk probing
                 * on boot, loopback IO, acct() writes etc.
                 */
@@ -1836,7 +1712,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
        struct mem_cgroup *iter;
 
        /*
-        * Be careful about under_oom underflows becase a child memcg
+        * Be careful about under_oom underflows because a child memcg
         * could have been added after mem_cgroup_mark_under_oom.
         */
        spin_lock(&memcg_oom_lock);
@@ -2008,7 +1884,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
                /*
                 * There is no guarantee that an OOM-lock contender
                 * sees the wakeups triggered by the OOM kill
-                * uncharges.  Wake any sleepers explicitely.
+                * uncharges.  Wake any sleepers explicitly.
                 */
                memcg_oom_recover(memcg);
        }
@@ -4488,7 +4364,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  * Foreign dirty flushing
  *
  * There's an inherent mismatch between memcg and writeback.  The former
- * trackes ownership per-page while the latter per-inode.  This was a
+ * tracks ownership per-page while the latter per-inode.  This was a
  * deliberate design decision because honoring per-page ownership in the
  * writeback path is complicated, may lead to higher CPU and IO overheads
  * and deemed unnecessary given that write-sharing an inode across
@@ -4503,9 +4379,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  * triggering background writeback.  A will be slowed down without a way to
  * make writeback of the dirty pages happen.
  *
- * Conditions like the above can lead to a cgroup getting repatedly and
+ * Conditions like the above can lead to a cgroup getting repeatedly and
  * severely throttled after making some progress after each
- * dirty_expire_interval while the underyling IO device is almost
+ * dirty_expire_interval while the underlying IO device is almost
  * completely idle.
  *
  * Solving this problem completely requires matching the ownership tracking
@@ -5242,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        /*
-        * A memcg must be visible for memcg_expand_shrinker_maps()
+        * A memcg must be visible for expand_shrinker_info()
         * by the time the maps are allocated. So, we allocate maps
         * here, when for_each_mem_cgroup() can't skip it.
         */
-       if (memcg_alloc_shrinker_maps(memcg)) {
+       if (alloc_shrinker_info(memcg)) {
                mem_cgroup_id_remove(memcg);
                return -ENOMEM;
        }
@@ -5278,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        page_counter_set_low(&memcg->memory, 0);
 
        memcg_offline_kmem(memcg);
+       reparent_shrinker_deferred(memcg);
        wb_memcg_offline(memcg);
 
        drain_all_stock(memcg);
@@ -5310,7 +5187,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
        vmpressure_cleanup(&memcg->vmpressure);
        cancel_work_sync(&memcg->high_work);
        mem_cgroup_remove_from_trees(memcg);
-       memcg_free_shrinker_maps(memcg);
+       free_shrinker_info(memcg);
        memcg_free_kmem(memcg);
        mem_cgroup_free(memcg);
 }
@@ -5897,7 +5774,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
                return 0;
 
        /*
-        * We are now commited to this value whatever it is. Changes in this
+        * We are now committed to this value whatever it is. Changes in this
         * tunable will only affect upcoming migrations, not the current one.
         * So we need to save it, and keep it going.
         */
index bd39454..85ad98c 100644 (file)
@@ -75,7 +75,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
                if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
                        /*
                         * We could fail to take off the target page from buddy
-                        * for example due to racy page allocaiton, but that's
+                        * for example due to racy page allocation, but that's
                         * acceptable because soft-offlined page is not broken
                         * and if someone really want to use it, they should
                         * take it.
index cbdc2cd..730daa0 100644 (file)
@@ -3339,7 +3339,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        }
 
 
-       delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+       delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
        page = lookup_swap_cache(entry, vma, vmf->address);
        swapcache = page;
 
@@ -3388,7 +3388,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                                        vmf->address, &vmf->ptl);
                        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
-                       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+                       delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
                        goto unlock;
                }
 
@@ -3402,13 +3402,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
-               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
                goto out_release;
        }
 
        locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
 
-       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+       delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
        if (!locked) {
                ret |= VM_FAULT_RETRY;
                goto out_release;
@@ -3727,7 +3727,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
                return ret;
 
        /*
-        * Archs like ppc64 need additonal space to store information
+        * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
@@ -4503,7 +4503,7 @@ retry_pud:
 }
 
 /**
- * mm_account_fault - Do page fault accountings
+ * mm_account_fault - Do page fault accounting
  *
  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
  *        of perf event counters, but we'll still do the per-task accounting to
@@ -4512,9 +4512,9 @@ retry_pud:
  * @flags: the fault flags.
  * @ret: the fault retcode.
  *
- * This will take care of most of the page fault accountings.  Meanwhile, it
+ * This will take care of most of the page fault accounting.  Meanwhile, it
  * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
- * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
  * still be in per-arch page fault handlers at the entry of page fault.
  */
 static inline void mm_account_fault(struct pt_regs *regs,
@@ -4848,7 +4848,7 @@ out:
 /**
  * generic_access_phys - generic implementation for iomem mmap access
  * @vma: the vma to access
- * @addr: userspace addres, not relative offset within @vma
+ * @addr: userspace address, not relative offset within @vma
  * @buf: buffer to read/write
  * @len: length of transfer
  * @write: set to FOLL_WRITE when writing, otherwise reading
index 0cdbbfb..70620d0 100644 (file)
 #include "internal.h"
 #include "shuffle.h"
 
+
+/*
+ * memory_hotplug.memmap_on_memory parameter
+ */
+static bool memmap_on_memory __ro_after_init;
+#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
+module_param(memmap_on_memory, bool, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+#endif
+
 /*
  * online_page_callback contains pointer to current page onlining function.
  * Initially it is generic_online_page(). If it is required it could be
@@ -648,9 +658,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
         * decide to not expose all pages to the buddy (e.g., expose them
         * later). We account all pages as being online and belonging to this
         * zone ("present").
+        * When using memmap_on_memory, the range might not be aligned to
+        * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
+        * this and the first chunk to online will be pageblock_nr_pages.
         */
-       for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
-               (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
+       for (pfn = start_pfn; pfn < end_pfn;) {
+               int order = min(MAX_ORDER - 1UL, __ffs(pfn));
+
+               (*online_page_callback)(pfn_to_page(pfn), order);
+               pfn += (1UL << order);
+       }
 
        /* mark all involved sections as online */
        online_mem_sections(start_pfn, end_pfn);
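
The reworked loop onlines, at each step, the largest naturally aligned power-of-two chunk the current pfn allows, so a range that is merely pageblock aligned (as with memmap_on_memory) starts with one pageblock and ramps up to MAX_ORDER - 1 chunks. A small userspace sketch of the order computation, with illustrative numbers (MAX_ORDER = 11, a pageblock-aligned start at pfn 512) and __builtin_ctzl() standing in for the kernel's __ffs():

#include <stdio.h>

#define MAX_ORDER 11	/* typical kernel value; an assumption here */

int main(void)
{
	/* Hypothetical range: pageblock-aligned start (order 9),
	 * end aligned well beyond MAX_ORDER - 1. */
	unsigned long pfn = 512, end_pfn = 4096;

	while (pfn < end_pfn) {
		/* __builtin_ctzl() plays the role of __ffs(): index of the
		 * lowest set bit, i.e. the largest order pfn is aligned to. */
		unsigned long order = (unsigned long)__builtin_ctzl(pfn);

		if (order > MAX_ORDER - 1)
			order = MAX_ORDER - 1;
		printf("online %4lu pages at pfn %4lu (order %lu)\n",
		       1UL << order, pfn, order);
		pfn += 1UL << order;
	}
	return 0;
}
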
@@ -817,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
        return movable_node_enabled ? movable_zone : kernel_zone;
 }
 
-struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
                unsigned long nr_pages)
 {
        if (online_type == MMOP_ONLINE_KERNEL)
@@ -829,24 +846,86 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
        return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }
 
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
-                      int online_type, int nid)
+/*
+ * This function should only be called by memory_block_{online,offline},
+ * and {online,offline}_pages.
+ */
+void adjust_present_page_count(struct zone *zone, long nr_pages)
+{
+       unsigned long flags;
+
+       zone->present_pages += nr_pages;
+       pgdat_resize_lock(zone->zone_pgdat, &flags);
+       zone->zone_pgdat->node_present_pages += nr_pages;
+       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+                             struct zone *zone)
+{
+       unsigned long end_pfn = pfn + nr_pages;
+       int ret;
+
+       ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+       if (ret)
+               return ret;
+
+       move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+
+       /*
+        * It might be that the vmemmap_pages fully span sections. If that is
+        * the case, mark those sections online here as otherwise they will be
+        * left offline.
+        */
+       if (nr_pages >= PAGES_PER_SECTION)
+               online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+       return ret;
+}
+
+void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+{
+       unsigned long end_pfn = pfn + nr_pages;
+
+       /*
+        * It might be that the vmemmap_pages fully span sections. If that is
+        * the case, mark those sections offline here as otherwise they will be
+        * left online.
+        */
+       if (nr_pages >= PAGES_PER_SECTION)
+               offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+       /*
+        * The pages associated with this vmemmap have been offlined, so
+        * we can reset its state here.
+        */
+       remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
+       kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
 {
        unsigned long flags;
-       struct zone *zone;
        int need_zonelists_rebuild = 0;
+       const int nid = zone_to_nid(zone);
        int ret;
        struct memory_notify arg;
 
-       /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+       /*
+        * {on,off}lining is constrained to full memory sections (or more
+        * precisely to memory blocks from the user space POV).
+        * memmap_on_memory is an exception because it reserves initial part
+        * of the physical memory space for vmemmaps. That space is pageblock
+        * aligned.
+        */
        if (WARN_ON_ONCE(!nr_pages ||
-                        !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+                        !IS_ALIGNED(pfn, pageblock_nr_pages) ||
+                        !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
                return -EINVAL;
 
        mem_hotplug_begin();
 
        /* associate pfn range with the zone */
-       zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
        move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
 
        arg.start_pfn = pfn;
@@ -877,11 +956,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
        }
 
        online_pages_range(pfn, nr_pages);
-       zone->present_pages += nr_pages;
-
-       pgdat_resize_lock(zone->zone_pgdat, &flags);
-       zone->zone_pgdat->node_present_pages += nr_pages;
-       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+       adjust_present_page_count(zone, nr_pages);
 
        node_states_set_node(nid, &arg);
        if (need_zonelists_rebuild)
@@ -1064,6 +1139,45 @@ static int online_memory_block(struct memory_block *mem, void *arg)
        return device_online(&mem->dev);
 }
 
+bool mhp_supports_memmap_on_memory(unsigned long size)
+{
+       unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+       unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+       unsigned long remaining_size = size - vmemmap_size;
+
+       /*
+        * Besides having arch support and the feature enabled at runtime, we
+        * need a few more assumptions to hold true:
+        *
+        * a) We span a single memory block: memory onlining/offlining happens
+        *    in memory block granularity. We don't want the vmemmap of online
+        *    memory blocks to reside on offline memory blocks. In the future,
+        *    we might want to support variable-sized memory blocks to make the
+        *    feature more versatile.
+        *
+        * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+        *    to populate memory from the altmap for unrelated parts (i.e.,
+        *    other memory blocks)
+        *
+        * c) The vmemmap pages (and thereby the pages that will be exposed to
+        *    the buddy) have to cover full pageblocks: memory onlining/offlining
+        *    code requires applicable ranges to be page-aligned, for example, to
+        *    set the migratetypes properly.
+        *
+        * TODO: Although we have a check here to make sure that vmemmap pages
+        *       fully populate a PMD, it is not the right place to check for
+        *       this. A much better solution involves improving vmemmap code
+        *       to fallback to base pages when trying to populate vmemmap using
+        *       altmap as an alternative source of memory, and we do not exactly
+        *       populate a single PMD.
+        */
+       return memmap_on_memory &&
+              IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+              size == memory_block_size_bytes() &&
+              IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+              IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+}
+
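
As a worked example of the two IS_ALIGNED() checks, with illustrative x86-64-style numbers (128 MiB memory block, 4 KiB pages, a 64-byte struct page, 2 MiB PMDs and pageblocks; none of these values come from the patch itself): the vmemmap comes to exactly 2 MiB, one full PMD, and the remaining 126 MiB is a whole number of pageblocks, so such a block would qualify:

#include <stdio.h>

int main(void)
{
	/* Hypothetical x86-64-style numbers, for illustration only. */
	unsigned long block = 128UL << 20, page_size = 4096, struct_page = 64;
	unsigned long pmd = 2UL << 20, pageblock = 2UL << 20;

	unsigned long nr_pages = block / page_size;	/* 32768 */
	unsigned long vmemmap = nr_pages * struct_page;	/* 2 MiB */
	unsigned long remaining = block - vmemmap;	/* 126 MiB */

	printf("vmemmap spans full PMDs: %s\n",
	       vmemmap % pmd == 0 ? "yes" : "no");
	printf("remaining covers full pageblocks: %s\n",
	       remaining % pageblock == 0 ? "yes" : "no");
	return 0;
}
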
 /*
  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
  * and online/offline operations (triggered e.g. by sysfs).
@@ -1073,6 +1187,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
 int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
        struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+       struct vmem_altmap mhp_altmap = {};
        u64 start, size;
        bool new_node = false;
        int ret;
@@ -1099,13 +1214,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
                goto error;
        new_node = ret;
 
+       /*
+        * Self hosted memmap array
+        */
+       if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
+               if (!mhp_supports_memmap_on_memory(size)) {
+                       ret = -EINVAL;
+                       goto error;
+               }
+               mhp_altmap.free = PHYS_PFN(size);
+               mhp_altmap.base_pfn = PHYS_PFN(start);
+               params.altmap = &mhp_altmap;
+       }
+
        /* call arch's memory hotadd */
        ret = arch_add_memory(nid, start, size, &params);
        if (ret < 0)
                goto error;
 
        /* create memory block devices after memory was added */
-       ret = create_memory_block_devices(start, size);
+       ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
        if (ret) {
                arch_remove_memory(nid, start, size, NULL);
                goto error;
@@ -1573,9 +1701,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
        int ret, node;
        char *reason;
 
-       /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+       /*
+        * {on,off}lining is constrained to full memory sections (or more
+        * precisely to memory blocks from the user space POV).
+        * memmap_on_memory is an exception because it reserves initial part
+        * of the physical memory space for vmemmaps. That space is pageblock
+        * aligned.
+        */
        if (WARN_ON_ONCE(!nr_pages ||
-                        !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+                        !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
+                        !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
                return -EINVAL;
 
        mem_hotplug_begin();
@@ -1611,6 +1746,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
         * in a way that pages from isolated pageblock are left on pcplists.
         */
        zone_pcp_disable(zone);
+       lru_cache_disable();
 
        /* set above range as isolated */
        ret = start_isolate_page_range(start_pfn, end_pfn,
@@ -1642,7 +1778,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
                        }
 
                        cond_resched();
-                       lru_add_drain_all();
 
                        ret = scan_movable_pages(pfn, end_pfn, &pfn);
                        if (!ret) {
@@ -1687,15 +1822,12 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
        zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
        spin_unlock_irqrestore(&zone->lock, flags);
 
+       lru_cache_enable();
        zone_pcp_enable(zone);
 
        /* removal success */
        adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
-       zone->present_pages -= nr_pages;
-
-       pgdat_resize_lock(zone->zone_pgdat, &flags);
-       zone->zone_pgdat->node_present_pages -= nr_pages;
-       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+       adjust_present_page_count(zone, -nr_pages);
 
        init_per_zone_wmark_min();
 
@@ -1750,6 +1882,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
        return 0;
 }
 
+static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+{
+       /*
+        * If not set, continue with the next block.
+        */
+       return mem->nr_vmemmap_pages;
+}
+
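
try_remove_memory() (below) hands this callback to walk_memory_blocks(), which stops and propagates the first non-zero return value, so a hit both ends the walk and reports the block's vmemmap page count, while 0 means "continue with the next block" as the comment says. A toy sketch of that early-stop walk; the struct and walker here are simplified stand-ins, not the kernel API:

#include <stdio.h>

struct memory_block { unsigned long nr_vmemmap_pages; };

/* Simplified walk: stop at the first callback returning non-zero
 * and propagate that value, as walk_memory_blocks() does. */
static int walk_blocks(struct memory_block *blocks, int n,
		       int (*cb)(struct memory_block *))
{
	int i, ret;

	for (i = 0; i < n; i++) {
		ret = cb(&blocks[i]);
		if (ret)
			return ret;
	}
	return 0;
}

static int get_nr_vmemmap_pages(struct memory_block *mem)
{
	return (int)mem->nr_vmemmap_pages;	/* 0 keeps the walk going */
}

int main(void)
{
	struct memory_block blocks[] = { { 0 }, { 512 }, { 0 } };

	printf("first self-hosted block has %d vmemmap pages\n",
	       walk_blocks(blocks, 3, get_nr_vmemmap_pages));
	return 0;
}
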
 static int check_cpu_on_node(pg_data_t *pgdat)
 {
        int cpu;
@@ -1824,6 +1964,9 @@ EXPORT_SYMBOL(try_offline_node);
 static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
        int rc = 0;
+       struct vmem_altmap mhp_altmap = {};
+       struct vmem_altmap *altmap = NULL;
+       unsigned long nr_vmemmap_pages;
 
        BUG_ON(check_hotplug_memory_range(start, size));
 
@@ -1836,6 +1979,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
        if (rc)
                return rc;
 
+       /*
+        * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
+        * the same granularity it was added - a single memory block.
+        */
+       if (memmap_on_memory) {
+               nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
+                                                     get_nr_vmemmap_pages_cb);
+               if (nr_vmemmap_pages) {
+                       if (size != memory_block_size_bytes()) {
+                               pr_warn("Refuse to remove %#llx - %#llx, "
+                                       "wrong granularity\n",
+                                       start, start + size);
+                               return -EINVAL;
+                       }
+
+                       /*
+                        * Let remove_pmd_table->free_hugepage_table do the
+                        * right thing if we used vmem_altmap when hot-adding
+                        * the range.
+                        */
+                       mhp_altmap.alloc = nr_vmemmap_pages;
+                       altmap = &mhp_altmap;
+               }
+       }
+
        /* remove memmap entry */
        firmware_map_remove(start, start + size, "System RAM");
 
@@ -1847,7 +2015,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 
        mem_hotplug_begin();
 
-       arch_remove_memory(nid, start, size, NULL);
+       arch_remove_memory(nid, start, size, altmap);
 
        if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
                memblock_free(start, size);
index cd02955..d79fa29 100644 (file)
@@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
        else if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
        else {
-               nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+               nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
                                                                *nodes);
                pol->w.cpuset_mems_allowed = *nodes;
        }
@@ -994,7 +994,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                if (flags & MPOL_F_ADDR) {
                        /*
                         * Take a refcount on the mpol, lookup_node()
-                        * wil drop the mmap_lock, so after calling
+                        * will drop the mmap_lock, so after calling
                         * lookup_node() only "pol" remains valid, "vma"
                         * is stale.
                         */
@@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
        int err = 0;
        nodemask_t tmp;
 
-       migrate_prep();
+       lru_cache_disable();
 
        mmap_read_lock(mm);
 
@@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 
        tmp = *from;
        while (!nodes_empty(tmp)) {
-               int s,d;
+               int s, d;
                int source = NUMA_NO_NODE;
                int dest = 0;
 
@@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                        break;
        }
        mmap_read_unlock(mm);
+
+       lru_cache_enable();
        if (err < 0)
                return err;
        return busy;
@@ -1323,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
-               migrate_prep();
+               lru_cache_disable();
        }
        {
                NODEMASK_SCRATCH(scratch);
@@ -1371,6 +1373,8 @@ up_out:
        mmap_write_unlock(mm);
 mpol_out:
        mpol_put(new);
+       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+               lru_cache_enable();
        return err;
 }
 
@@ -1863,7 +1867,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         *
         * policy->v.nodes is intersect with node_states[N_MEMORY].
-        * so if the following test faile, it implies
+        * so if the following test fails, it implies
         * policy->v.nodes has movable memory only.
         */
        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
@@ -2094,7 +2098,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
  *
  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
  * policy.  Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
+ * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
  * policy, always return true since it may allocate elsewhere on fallback.
  *
  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
index fe19d29..a258cf4 100644 (file)
@@ -251,7 +251,7 @@ EXPORT_SYMBOL(mempool_init);
 mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                                mempool_free_t *free_fn, void *pool_data)
 {
-       return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+       return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
                                   GFP_KERNEL, NUMA_NO_NODE);
 }
 EXPORT_SYMBOL(mempool_create);
index 47df0df..b234c3f 100644 (file)
 
 #include "internal.h"
 
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-void migrate_prep(void)
-{
-       /*
-        * Clear the LRU lists so pages can be isolated.
-        * Note that pages may be moved off the LRU after we have
-        * drained them. Those pages will fail to migrate like other
-        * pages that may be busy.
-        */
-       lru_add_drain_all();
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-void migrate_prep_local(void)
-{
-       lru_add_drain();
-}
-
 int isolate_movable_page(struct page *page, isolate_mode_t mode)
 {
        struct address_space *mapping;
@@ -140,15 +118,10 @@ out:
        return -EBUSY;
 }
 
-/* It should be called on page which is PG_movable */
-void putback_movable_page(struct page *page)
+static void putback_movable_page(struct page *page)
 {
        struct address_space *mapping;
 
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       VM_BUG_ON_PAGE(!PageMovable(page), page);
-       VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
        mapping = page_mapping(page);
        mapping->a_ops->putback_page(page);
        __ClearPageIsolated(page);
@@ -1375,7 +1348,7 @@ out_unlock:
 out:
        if (rc == MIGRATEPAGE_SUCCESS)
                putback_active_hugepage(hpage);
-       else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS)
+       else if (rc != -EAGAIN)
                list_move_tail(&hpage->lru, ret);
 
        /*
@@ -1445,6 +1418,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
        int rc, nr_subpages;
        LIST_HEAD(ret_pages);
 
+       trace_mm_migrate_pages_start(mode, reason);
+
        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;
 
@@ -1769,7 +1744,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
        int start, i;
        int err = 0, err1;
 
-       migrate_prep();
+       lru_cache_disable();
 
        for (i = start = 0; i < nr_pages; i++) {
                const void __user *p;
@@ -1838,6 +1813,7 @@ out_flush:
        if (err >= 0)
                err = err1;
 out:
+       lru_cache_enable();
        return err;
 }
 
@@ -2110,17 +2086,6 @@ bool pmd_trans_migrating(pmd_t pmd)
        return PageLocked(page);
 }
 
-static inline bool is_shared_exec_page(struct vm_area_struct *vma,
-                                      struct page *page)
-{
-       if (page_mapcount(page) != 1 &&
-           (page_is_file_lru(page) || vma_is_shmem(vma)) &&
-           (vma->vm_flags & VM_EXEC))
-               return true;
-
-       return false;
-}
-
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -2138,7 +2103,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
         * Don't migrate file pages that are mapped in multiple processes
         * with execute permissions as they are probably shared libraries.
         */
-       if (is_shared_exec_page(vma, page))
+       if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+           (vma->vm_flags & VM_EXEC))
                goto out;
 
        /*
@@ -2193,9 +2159,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        int page_lru = page_is_file_lru(page);
        unsigned long start = address & HPAGE_PMD_MASK;
 
-       if (is_shared_exec_page(vma, page))
-               goto out;
-
        new_page = alloc_pages_node(node,
                (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
                HPAGE_PMD_ORDER);
@@ -2307,7 +2270,6 @@ out_fail:
 
 out_unlock:
        unlock_page(page);
-out:
        put_page(page);
        return 0;
 }
@@ -2316,44 +2278,38 @@ out:
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DEVICE_PRIVATE
-static int migrate_vma_collect_hole(unsigned long start,
+static int migrate_vma_collect_skip(unsigned long start,
                                    unsigned long end,
-                                   __always_unused int depth,
                                    struct mm_walk *walk)
 {
        struct migrate_vma *migrate = walk->private;
        unsigned long addr;
 
-       /* Only allow populating anonymous memory. */
-       if (!vma_is_anonymous(walk->vma)) {
-               for (addr = start; addr < end; addr += PAGE_SIZE) {
-                       migrate->src[migrate->npages] = 0;
-                       migrate->dst[migrate->npages] = 0;
-                       migrate->npages++;
-               }
-               return 0;
-       }
-
        for (addr = start; addr < end; addr += PAGE_SIZE) {
-               migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
                migrate->dst[migrate->npages] = 0;
-               migrate->npages++;
-               migrate->cpages++;
+               migrate->src[migrate->npages++] = 0;
        }
 
        return 0;
 }
 
-static int migrate_vma_collect_skip(unsigned long start,
+static int migrate_vma_collect_hole(unsigned long start,
                                    unsigned long end,
+                                   __always_unused int depth,
                                    struct mm_walk *walk)
 {
        struct migrate_vma *migrate = walk->private;
        unsigned long addr;
 
+       /* Only allow populating anonymous memory. */
+       if (!vma_is_anonymous(walk->vma))
+               return migrate_vma_collect_skip(start, end, walk);
+
        for (addr = start; addr < end; addr += PAGE_SIZE) {
+               migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
                migrate->dst[migrate->npages] = 0;
-               migrate->src[migrate->npages++] = 0;
+               migrate->npages++;
+               migrate->cpages++;
        }
 
        return 0;
@@ -2823,11 +2779,11 @@ restore:
  *
  * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
  * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
- * allowing the caller to allocate device memory for those unback virtual
- * address.  For this the caller simply has to allocate device memory and
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses.  For this the caller simply has to allocate device memory and
  * properly set the destination entry like for regular migration.  Note that
- * this can still fails and thus inside the device driver must check if the
- * migration was successful for those entries after calling migrate_vma_pages()
+ * this can still fail, and thus inside the device driver you must check if the
+ * migration was successful for those entries after calling migrate_vma_pages(),
  * just like for regular migration.
  *
  * After that, the callers must call migrate_vma_pages() to go over each entry
@@ -2973,6 +2929,13 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 
                        swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
                        entry = swp_entry_to_pte(swp_entry);
+               } else {
+                       /*
+                        * For now we only support migrating to un-addressable
+                        * device memory.
+                        */
+                       pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+                       goto abort;
                }
        } else {
                entry = mk_pte(page, vma->vm_page_prot);
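The migrate_vma kernel-doc being cleaned up above describes a multi-phase protocol; a condensed caller sketch for the device-private case (alloc_device_page() is hypothetical, and a single-page range is assumed for brevity):

#include <linux/migrate.h>

static int example_migrate_one(struct migrate_vma *args)
{
	int ret = migrate_vma_setup(args);	/* collect and isolate source */

	if (ret)
		return ret;
	if (args->src[0] & MIGRATE_PFN_MIGRATE) {
		struct page *dpage = alloc_device_page();	/* hypothetical */

		/* copy the source page's contents into dpage here */
		args->dst[0] = migrate_pfn(page_to_pfn(dpage)) |
			       MIGRATE_PFN_LOCKED;
	}
	migrate_vma_pages(args);	/* re-check MIGRATE_PFN_MIGRATE per entry */
	migrate_vma_finalize(args);
	return 0;
}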
index f8f8cc3..df590fd 100644 (file)
@@ -559,7 +559,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
                                vm_flags_t flags)
 {
        unsigned long nstart, end, tmp;
-       struct vm_area_struct * vma, * prev;
+       struct vm_area_struct *vma, *prev;
        int error;
 
        VM_BUG_ON(offset_in_page(start));
@@ -737,7 +737,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
  */
 static int apply_mlockall_flags(int flags)
 {
-       struct vm_area_struct * vma, * prev = NULL;
+       struct vm_area_struct *vma, *prev = NULL;
        vm_flags_t to_add = 0;
 
        current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
index 347ef9b..0584e54 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
        unsigned long nr_pages = 0;
        struct vm_area_struct *vma;
 
-       /* Find first overlaping mapping */
+       /* Find first overlapping mapping */
        vma = find_vma_intersection(mm, addr, end);
        if (!vma)
                return 0;
@@ -2875,7 +2875,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
        if (unlikely(uf)) {
                /*
                 * If userfaultfd_unmap_prep returns an error the vmas
-                * will remain splitted, but userland will get a
+                * will remain split, but userland will get a
                 * highly unexpected error anyway. This is no
                 * different than the case where the first of the two
                 * __split_vma fails, but we don't undo the first
@@ -3029,25 +3029,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 
        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
-       if (vma->vm_flags & VM_LOCKED) {
-               struct vm_area_struct *tmp;
+       if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;
 
-               /* drop PG_Mlocked flag for over-mapped range */
-               for (tmp = vma; tmp->vm_start >= start + size;
-                               tmp = tmp->vm_next) {
-                       /*
-                        * Split pmd and munlock page on the border
-                        * of the range.
-                        */
-                       vma_adjust_trans_huge(tmp, start, start + size, 0);
-
-                       munlock_vma_pages_range(tmp,
-                                       max(tmp->vm_start, start),
-                                       min(tmp->vm_end, start + size));
-               }
-       }
-
        file = get_file(vma->vm_file);
        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, pgoff, &populate, NULL);
index 94188df..e7a4431 100644 (file)
@@ -699,7 +699,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
        mmap_write_unlock(current->mm);
 
        /*
-        * We could provie warnings or errors if any VMA still
+        * We could provide warnings or errors if any VMA still
         * has the pkey set here.
         */
        return ret;
index d22629f..47c255b 100644 (file)
@@ -730,7 +730,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
         * So, to avoid such scenario we can pre-compute if the whole
         * operation has high chances to success map-wise.
         * Worst-scenario case is when both vma's (new_addr and old_addr) get
-        * split in 3 before unmaping it.
+        * split in 3 before unmapping it.
         * That means 2 more maps (1 for each) to the ones we already hold.
         * Check whether current map count plus 2 still leads us to 4 maps below
         * the threshold, otherwise return -ENOMEM here to be more safe.
index 5c9ab79..85a3a68 100644 (file)
@@ -210,16 +210,6 @@ long vread(char *buf, char *addr, unsigned long count)
        return count;
 }
 
-long vwrite(char *buf, char *addr, unsigned long count)
-{
-       /* Don't allow overflow */
-       if ((unsigned long) addr + count < count)
-               count = -(unsigned long) addr;
-
-       memcpy(addr, buf, count);
-       return count;
-}
-
 /*
  *     vmalloc  -  allocate virtually contiguous memory
  *
index fa1cf18..eefd3f5 100644 (file)
@@ -74,7 +74,7 @@ static inline bool is_memcg_oom(struct oom_control *oc)
 
 #ifdef CONFIG_NUMA
 /**
- * oom_cpuset_eligible() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
  * @start: task struct of which task to consider
  * @oc: pointer to struct oom_control
  *
@@ -993,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
        if (oom_group) {
                mem_cgroup_print_oom_group(oom_group);
                mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
-                                     (void*)message);
+                                     (void *)message);
                mem_cgroup_put(oom_group);
        }
 }
index 5e761fb..0062d5c 100644 (file)
@@ -1806,7 +1806,7 @@ pause:
                        break;
 
                /*
-                * In the case of an unresponding NFS server and the NFS dirty
+                * In the case of an unresponsive NFS server and the NFS dirty
                 * pages exceeds dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
@@ -2216,7 +2216,7 @@ int write_cache_pages(struct address_space *mapping,
                         * Page truncated or invalidated. We can freely skip it
                         * then, even for data integrity operations: the page
                         * has disappeared concurrently, so there could be no
-                        * real expectation of this data interity operation
+                        * real expectation of this data integrity operation
                         * even if there is now a new, dirty page at the same
                         * pagecache address.
                         */
index 6b208b1..aaa1655 100644 (file)
@@ -893,7 +893,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
                return false;
 
        /*
-        * Do not let lower order allocations polluate a movable pageblock.
+        * Do not let lower order allocations pollute a movable pageblock.
         * This might let an unmovable request use a reclaimable pageblock
         * and vice-versa but no more than normal fallback logic which can
         * have trouble finding a high-order free page.
@@ -2776,7 +2776,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
                        /*
                         * In page freeing path, migratetype change is racy so
                         * we can counter several free pages in a pageblock
-                        * in this loop althoug we changed the pageblock type
+                        * in this loop although we changed the pageblock type
                         * from highatomic to ac->migratetype. So we should
                         * adjust the count once.
                         */
@@ -3080,7 +3080,7 @@ static void drain_local_pages_wq(struct work_struct *work)
         * drain_all_pages doesn't use proper cpu hotplug protection so
         * we can race with cpu offline when the WQ can move this from
         * a cpu pinned worker to an unbound one. We can operate on a different
-        * cpu which is allright but we also have to make sure to not move to
+        * cpu which is alright but we also have to make sure to not move to
         * a different one.
         */
        preempt_disable();
@@ -3859,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
        return alloc_flags;
 }
 
-static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
-                                       unsigned int alloc_flags)
+/* Must be called after current_gfp_context() which can change gfp_mask */
+static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
+                                                 unsigned int alloc_flags)
 {
 #ifdef CONFIG_CMA
-       unsigned int pflags = current->flags;
-
-       if (!(pflags & PF_MEMALLOC_NOCMA) &&
-                       gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+       if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
-
 #endif
        return alloc_flags;
 }
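Dropping the PF_MEMALLOC_NOCMA test above is safe because, per the new comment, gfp_to_alloc_flags_cma() now runs after current_gfp_context() has already rewritten the mask for pinned scopes; roughly, such a scope looks like this sketch (helper name illustrative):

#include <linux/sched/mm.h>
#include <linux/gfp.h>

static struct page *example_pin_scoped_alloc(void)
{
	unsigned int flags = memalloc_pin_save();
	/*
	 * current_gfp_context() strips __GFP_MOVABLE inside this scope,
	 * so movable zones and CMA pageblocks are not used.
	 */
	struct page *page = alloc_page(GFP_HIGHUSER_MOVABLE);

	memalloc_pin_restore(flags);
	return page;
}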
@@ -3968,7 +3965,7 @@ retry:
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
 
-                       if (node_reclaim_mode == 0 ||
+                       if (!node_reclaim_enabled() ||
                            !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                                continue;
 
@@ -4176,7 +4173,7 @@ out:
 }
 
 /*
- * Maximum number of compaction retries wit a progress before OOM
+ * Maximum number of compaction retries with a progress before OOM
  * killer is consider as the only way to move forward.
  */
 #define MAX_COMPACT_RETRIES 16
@@ -4204,6 +4201,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        memalloc_noreclaim_restore(noreclaim_flag);
        psi_memstall_leave(&pflags);
 
+       if (*compact_result == COMPACT_SKIPPED)
+               return NULL;
        /*
         * At least in one zone compaction wasn't deferred or skipped, so let's
         * count a compaction stall
@@ -4524,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
        } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
-       alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+       alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
 
        return alloc_flags;
 }
@@ -4826,7 +4825,7 @@ retry:
 
        reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
        if (reserve_flags)
-               alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
+               alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
 
        /*
         * Reset the nodemask and zonelist iterators if memory policies can be
@@ -4995,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
        if (should_fail_alloc_page(gfp_mask, order))
                return false;
 
-       *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
+       *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
 
        /* Dirty zone balancing only done in the fast path */
        ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -5178,6 +5177,14 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
        }
 
        gfp &= gfp_allowed_mask;
+       /*
+        * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+        * resp. GFP_NOIO which has to be inherited for all allocation requests
+        * from a particular context which has been marked by
+        * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
+        * movable zones are not used during allocation.
+        */
+       gfp = current_gfp_context(gfp);
        alloc_gfp = gfp;
        if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
                        &alloc_gfp, &alloc_flags))
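The memalloc_no{fs,io} scopes named in the new comment work as below; a minimal sketch of the NOFS case (function name illustrative):

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *example_alloc_in_fs_reclaim_path(size_t size)
{
	unsigned int nofs = memalloc_nofs_save();
	/* GFP_KERNEL is demoted to GFP_NOFS by current_gfp_context(). */
	void *p = kmalloc(size, GFP_KERNEL);

	memalloc_nofs_restore(nofs);
	return p;
}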
@@ -5194,13 +5201,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
        if (likely(page))
                goto out;
 
-       /*
-        * Apply scoped allocation constraints. This is mainly about GFP_NOFS
-        * resp. GFP_NOIO which has to be inherited for all allocation requests
-        * from a particular context which has been marked by
-        * memalloc_no{fs,io}_{save,restore}.
-        */
-       alloc_gfp = current_gfp_context(gfp);
+       alloc_gfp = gfp;
        ac.spread_dirty_pages = false;
 
        /*
@@ -5928,7 +5929,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
 static int __parse_numa_zonelist_order(char *s)
 {
        /*
-        * We used to support different zonlists modes but they turned
+        * We used to support different zonelists modes but they turned
         * out to be just not useful. Let's keep the warning in place
         * if somebody still use the cmd line parameter so that we do
         * not fail it silently
@@ -7669,7 +7670,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
 }
 
 /*
- * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
  * such cases we allow max_zone_pfn sorted in the descending order
  */
 bool __weak arch_has_descending_max_zone_pfns(void)
@@ -8679,7 +8680,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
                .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
        };
 
-       migrate_prep();
+       lru_cache_disable();
 
        while (pfn < end || !list_empty(&cc->migratepages)) {
                if (fatal_signal_pending(current)) {
@@ -8689,14 +8690,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
                if (list_empty(&cc->migratepages)) {
                        cc->nr_migratepages = 0;
-                       pfn = isolate_migratepages_range(cc, pfn, end);
-                       if (!pfn) {
-                               ret = -EINTR;
+                       ret = isolate_migratepages_range(cc, pfn, end);
+                       if (ret && ret != -EAGAIN)
                                break;
-                       }
+                       pfn = cc->migrate_pfn;
                        tries = 0;
                } else if (++tries == 5) {
-                       ret = ret < 0 ? ret : -EBUSY;
+                       ret = -EBUSY;
                        break;
                }
 
@@ -8706,7 +8706,16 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
                ret = migrate_pages(&cc->migratepages, alloc_migration_target,
                                NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+
+               /*
+		 * On -ENOMEM, migrate_pages() bails out right away. It is
+		 * pointless to retry over this error, so do the same here.
+                */
+               if (ret == -ENOMEM)
+                       break;
        }
+
+       lru_cache_enable();
        if (ret < 0) {
                alloc_contig_dump_pages(&cc->migratepages);
                putback_movable_pages(&cc->migratepages);
@@ -8719,7 +8728,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:     start PFN to allocate
  * @end:       one-past-the-last PFN to allocate
- * @migratetype:       migratetype of the underlaying pageblocks (either
+ * @migratetype:       migratetype of the underlying pageblocks (either
  *                     #MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
  *                     in range must have the same migratetype and it must
  *                     be either of the two.
@@ -8799,7 +8808,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
        ret = __alloc_contig_migrate_range(&cc, start, end);
        if (ret && ret != -EBUSY)
                goto done;
-       ret =0;
+       ret = 0;
 
        /*
         * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
@@ -8892,12 +8901,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
 
                if (PageReserved(page))
                        return false;
-
-               if (page_count(page) > 0)
-                       return false;
-
-               if (PageHuge(page))
-                       return false;
        }
        return true;
 }
@@ -8969,9 +8972,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
 }
 #endif /* CONFIG_CONTIG_ALLOC */
 
-void free_contig_range(unsigned long pfn, unsigned int nr_pages)
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
 {
-       unsigned int count = 0;
+       unsigned long count = 0;
 
        for (; nr_pages--; pfn++) {
                struct page *page = pfn_to_page(pfn);
@@ -8979,13 +8982,13 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
                count += page_count(page) != 1;
                __free_page(page);
        }
-       WARN(count != 0, "%d pages are still in use!\n", count);
+       WARN(count != 0, "%lu pages are still in use!\n", count);
 }
 EXPORT_SYMBOL(free_contig_range);
 
 /*
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalulated.
+ * page high values need to be recalculated.
  */
 void __meminit zone_pcp_update(struct zone *zone)
 {
@@ -9017,12 +9020,9 @@ void zone_pcp_enable(struct zone *zone)
 
 void zone_pcp_reset(struct zone *zone)
 {
-       unsigned long flags;
        int cpu;
        struct per_cpu_pageset *pset;
 
-       /* avoid races with drain_pages()  */
-       local_irq_save(flags);
        if (zone->pageset != &boot_pageset) {
                for_each_online_cpu(cpu) {
                        pset = per_cpu_ptr(zone->pageset, cpu);
@@ -9031,7 +9031,6 @@ void zone_pcp_reset(struct zone *zone)
                free_percpu(zone->pageset);
                zone->pageset = &boot_pageset;
        }
-       local_irq_restore(flags);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
index 9661d53..adfabb5 100644 (file)
@@ -233,7 +233,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
        /*
         * We don't clear the bit on the oldpage as it's going to be freed
         * after migration. Until then, the info can be useful in case of
-        * a bug, and the overal stats will be off a bit only temporarily.
+        * a bug, and the overall stats will be off a bit only temporarily.
         * Also, migrate_misplaced_transhuge_page() can still fail the
         * migration and then we want the oldpage to retain the info. But
         * in that case we also don't need to explicitly clear the info from
index 86e3a36..2cf01d9 100644 (file)
@@ -134,7 +134,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
  * regardless of which page table level the page is mapped at. @pvmw->pmd is
  * NULL.
  *
- * Retruns false if there are no more page table entries for the page in
+ * Returns false if there are no more page table entries for the page in
  * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
  *
  * If you need to stop the walk before page_vma_mapped_walk() returned false,
index 095d7ea..ae26b11 100644 (file)
@@ -170,7 +170,7 @@ struct percpu_stats {
        u64 nr_max_alloc;       /* max # of live allocations */
        u32 nr_chunks;          /* current # of live chunks */
        u32 nr_max_chunks;      /* max # of live chunks */
-       size_t min_alloc_size;  /* min allocaiton size */
+       size_t min_alloc_size;  /* min allocation size */
        size_t max_alloc_size;  /* max allocation size */
 };
 
index 2330811..f99e930 100644 (file)
@@ -1862,7 +1862,7 @@ fail:
                        pr_info("limit reached, disable warning\n");
        }
        if (is_atomic) {
-               /* see the flag handling in pcpu_blance_workfn() */
+               /* see the flag handling in pcpu_balance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
index 1dcc865..e9e879d 100644 (file)
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PGALLLC_TRACK_H
-#define _LINUX_PGALLLC_TRACK_H
+#ifndef _LINUX_PGALLOC_TRACK_H
+#define _LINUX_PGALLOC_TRACK_H
 
 #if defined(CONFIG_MMU)
 static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
@@ -48,4 +48,4 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
          (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
                NULL: pte_offset_kernel(pmd, address))
 
-#endif /* _LINUX_PGALLLC_TRACK_H */
+#endif /* _LINUX_PGALLOC_TRACK_H */
index f5fee9c..4bcc119 100644 (file)
@@ -9,7 +9,6 @@
 #include <linux/mm.h>
 #include <linux/uio.h>
 #include <linux/sched.h>
-#include <linux/compat.h>
 #include <linux/sched/mm.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
index b0fc27e..693a610 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -257,7 +257,7 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
  *
- * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
  * anon_vma_fork(). The first three want an exact copy of src, while the last
  * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
  * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
index 162d8f8..a08cede 100644 (file)
@@ -3508,7 +3508,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
                        }
                }
                if (*this_char) {
-                       char *value = strchr(this_char,'=');
+                       char *value = strchr(this_char, '=');
                        size_t len = 0;
                        int err;
 
index df45c43..d0f7256 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -259,7 +259,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
 
 #define BATCHREFILL_LIMIT      16
 /*
- * Optimization question: fewer reaps means less probability for unnessary
+ * Optimization question: fewer reaps means less probability for unnecessary
  * cpucache drain/refill cycles.
  *
  * OTOH the cpuarrays can contain lots of objects,
@@ -2284,7 +2284,7 @@ void __kmem_cache_release(struct kmem_cache *cachep)
  * Because if it is the case, that means we defer the creation of
  * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
  * And we eventually call down to __kmem_cache_create(), which
- * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
+ * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
  * This is a "chicken-and-egg" problem.
  *
  * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
@@ -2381,8 +2381,8 @@ union freelist_init_state {
 };
 
 /*
- * Initialize the state based on the randomization methode available.
- * return true if the pre-computed list is available, false otherwize.
+ * Initialize the state based on the randomization method available.
+ * return true if the pre-computed list is available, false otherwise.
  */
 static bool freelist_state_initialize(union freelist_init_state *state,
                                struct kmem_cache *cachep,
index 68123b2..feda53a 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3391,7 +3391,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
  */
 
 /*
- * Mininum / Maximum order of slab pages. This influences locking overhead
+ * Minimum / Maximum order of slab pages. This influences locking overhead
  * and slab fragmentation. A higher order reduces the number of partial slabs
  * and increases the number of allocations possible without having to
  * take the list_lock.
index 33406ea..b2ada9d 100644 (file)
@@ -257,7 +257,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
        if (unlikely(!mem_section)) {
                unsigned long size, align;
 
-               size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
+               size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
                align = 1 << (INTERNODE_CACHE_SHIFT);
                mem_section = memblock_alloc(size, align);
                if (!mem_section)
@@ -624,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
        }
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 /* Mark all memory sections within the pfn range as offline */
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -645,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
                ms->section_mem_map &= ~SECTION_IS_ONLINE;
        }
 }
-#endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static struct page * __meminit populate_section_memmap(unsigned long pfn,
index 31b844d..dfb48cf 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -36,6 +36,7 @@
 #include <linux/hugetlb.h>
 #include <linux/page_idle.h>
 #include <linux/local_lock.h>
+#include <linux/buffer_head.h>
 
 #include "internal.h"
 
@@ -235,6 +236,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
        }
 }
 
+/* Return true if the pagevec needs to be drained */
+static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
+{
+       bool ret = false;
+
+       if (!pagevec_add(pvec, page) || PageCompound(page) ||
+                       lru_cache_disabled())
+               ret = true;
+
+       return ret;
+}
+
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
@@ -252,7 +265,7 @@ void rotate_reclaimable_page(struct page *page)
                get_page(page);
                local_lock_irqsave(&lru_rotate.lock, flags);
                pvec = this_cpu_ptr(&lru_rotate.pvec);
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
                local_unlock_irqrestore(&lru_rotate.lock, flags);
        }
@@ -343,7 +356,7 @@ static void activate_page(struct page *page)
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.activate_page);
                get_page(page);
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, __activate_page);
                local_unlock(&lru_pvecs.lock);
        }
@@ -458,7 +471,7 @@ void lru_cache_add(struct page *page)
        get_page(page);
        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_add);
-       if (!pagevec_add(pvec, page) || PageCompound(page))
+       if (pagevec_add_and_need_flush(pvec, page))
                __pagevec_lru_add(pvec);
        local_unlock(&lru_pvecs.lock);
 }
@@ -483,7 +496,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
        if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
                int nr_pages = thp_nr_pages(page);
                /*
-                * We use the irq-unsafe __mod_zone_page_stat because this
+                * We use the irq-unsafe __mod_zone_page_state because this
                 * counter is not modified from interrupt context, and the pte
                 * lock is held(spinlock), which implies preemption disabled.
                 */
@@ -629,6 +642,7 @@ void lru_add_drain_cpu(int cpu)
                pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
 
        activate_page_drain(cpu);
+       invalidate_bh_lrus_cpu(cpu);
 }
 
 /**
@@ -654,7 +668,7 @@ void deactivate_file_page(struct page *page)
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
 
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
                local_unlock(&lru_pvecs.lock);
        }
@@ -676,7 +690,7 @@ void deactivate_page(struct page *page)
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
                get_page(page);
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, lru_deactivate_fn);
                local_unlock(&lru_pvecs.lock);
        }
@@ -698,7 +712,7 @@ void mark_page_lazyfree(struct page *page)
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
                get_page(page);
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
                local_unlock(&lru_pvecs.lock);
        }
@@ -735,7 +749,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  * Calling this function with cpu hotplug locks held can actually lead
  * to obscure indirect dependencies via WQ context.
  */
-void lru_add_drain_all(void)
+inline void __lru_add_drain_all(bool force_all_cpus)
 {
        /*
         * lru_drain_gen - Global pages generation number
@@ -780,7 +794,7 @@ void lru_add_drain_all(void)
         * (C) Exit the draining operation if a newer generation, from another
         * lru_add_drain_all(), was already scheduled for draining. Check (A).
         */
-       if (unlikely(this_gen != lru_drain_gen))
+       if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
                goto done;
 
        /*
@@ -794,7 +808,7 @@ void lru_add_drain_all(void)
         * below which drains the page vectors.
         *
         * Let x, y, and z represent some system CPU numbers, where x < y < z.
-        * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+        * Assume CPU #z is in the middle of the for_each_online_cpu loop
         * below and has already reached CPU #y's per-cpu data. CPU #x comes
         * along, adds some pages to its per-cpu vectors, then calls
         * lru_add_drain_all().
@@ -810,12 +824,14 @@ void lru_add_drain_all(void)
        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
 
-               if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+               if (force_all_cpus ||
+                   pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
                    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
-                   need_activate_page_drain(cpu)) {
+                   need_activate_page_drain(cpu) ||
+                   has_bh_in_lru(cpu, NULL)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
                        __cpumask_set_cpu(cpu, &has_work);
@@ -828,6 +844,11 @@ void lru_add_drain_all(void)
 done:
        mutex_unlock(&lock);
 }
+
+void lru_add_drain_all(void)
+{
+       __lru_add_drain_all(false);
+}
 #else
 void lru_add_drain_all(void)
 {
@@ -835,6 +856,34 @@ void lru_add_drain_all(void)
 }
 #endif /* CONFIG_SMP */
 
+atomic_t lru_disable_count = ATOMIC_INIT(0);
+
+/*
+ * lru_cache_disable() needs to be called before we start compiling
+ * a list of pages to be migrated using isolate_lru_page().
+ * It drains the pages on the LRU cache and then disables the cache on
+ * all CPUs until lru_cache_enable() is called.
+ *
+ * Must be paired with a call to lru_cache_enable().
+ */
+void lru_cache_disable(void)
+{
+       atomic_inc(&lru_disable_count);
+#ifdef CONFIG_SMP
+       /*
+	 * lru_add_drain_all() in force mode will schedule draining on
+	 * all online CPUs, so any call of lru_cache_disabled() wrapped
+	 * by local_lock or preemption disabled would be ordered by that.
+	 * The atomic operation doesn't need to have stronger ordering
+	 * requirements because that is enforced by the scheduling
+        * guarantees.
+        */
+       __lru_add_drain_all(true);
+#else
+       lru_add_drain();
+#endif
+}
+
 /**
  * release_pages - batched put_page()
  * @pages: array of pages to release
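Callers converted away from migrate_prep() in this series (do_pages_move(), __alloc_contig_migrate_range()) follow the pattern sketched below; example_migrate(), its pagelist, and the mtc argument are placeholders (struct migration_target_control is mm-internal, from mm/internal.h):

#include <linux/migrate.h>
#include <linux/swap.h>

static int example_migrate(struct list_head *pagelist,
			   struct migration_target_control *mtc)
{
	int ret;

	lru_cache_disable();	/* drain pagevecs and keep them disabled */
	/* ... isolate_lru_page() each candidate onto pagelist ... */
	ret = migrate_pages(pagelist, alloc_migration_target, NULL,
			    (unsigned long)mtc, MIGRATE_SYNC, MR_CONTIG_RANGE);
	lru_cache_enable();	/* must pair with the disable above */
	return ret;
}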
index be9de6d..6248d10 100644 (file)
@@ -16,7 +16,7 @@
  * to local caches without needing to acquire swap_info
  * lock.  We do not reuse the returned slots directly but
  * move them back to the global pool in a batch.  This
- * allows the slots to coaellesce and reduce fragmentation.
+ * allows the slots to coalesce and reduce fragmentation.
  *
  * The swap entry allocated is marked with SWAP_HAS_CACHE
  * flag in map_count that prevents it from being allocated
index fb7efa0..272ea21 100644 (file)
@@ -132,7 +132,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
                        xas_store(&xas, page);
                        xas_next(&xas);
                }
-               address_space->nrexceptional -= nr_shadows;
                address_space->nrpages += nr;
                __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
                __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
@@ -172,8 +171,6 @@ void __delete_from_swap_cache(struct page *page,
                xas_next(&xas);
        }
        ClearPageSwapCache(page);
-       if (shadow)
-               address_space->nrexceptional += nr;
        address_space->nrpages -= nr;
        __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
        __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
@@ -275,7 +272,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
                        xas_store(&xas, NULL);
                        nr_shadows++;
                }
-               address_space->nrexceptional -= nr_shadows;
                xa_unlock_irq(&address_space->i_pages);
 
                /* search the next swapcache until we meet end */
@@ -796,7 +792,7 @@ static void swap_ra_info(struct vm_fault *vmf,
  *
  * Returns the struct page for entry and addr, after queueing swapin.
  *
- * Primitive swap readahead code. We simply read in a few pages whoes
+ * Primitive swap readahead code. We simply read in a few pages whose
  * virtual addresses are around the fault address in the same vma.
  *
  * Caller must hold read mmap_lock if vmf->vma is not NULL.
index 084a5b9..149e774 100644 (file)
@@ -2780,7 +2780,7 @@ static int swap_show(struct seq_file *swap, void *v)
        unsigned int bytes, inuse;
 
        if (si == SEQ_START_TOKEN) {
-               seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
+               seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
                return 0;
        }
 
@@ -3284,7 +3284,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                                         sizeof(long),
                                         GFP_KERNEL);
 
-       if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+       if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
                /*
                 * When discard is enabled for swap with no particular
                 * policy flagged, we set all swap discard flags here in
index 4559442..95af244 100644 (file)
@@ -40,7 +40,6 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
        if (xas_load(&xas) != entry)
                return;
        xas_store(&xas, NULL);
-       mapping->nrexceptional--;
 }
 
 static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
@@ -295,7 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
        pgoff_t         index;
        int             i;
 
-       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+       if (mapping_empty(mapping))
                goto out;
 
        /* Offsets within partial pages */
@@ -440,9 +439,6 @@ EXPORT_SYMBOL(truncate_inode_pages);
  */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-       unsigned long nrexceptional;
-       unsigned long nrpages;
-
        /*
         * Page reclaim can not participate in regular inode lifetime
         * management (can't call iput()) and thus can race with the
@@ -452,16 +448,7 @@ void truncate_inode_pages_final(struct address_space *mapping)
         */
        mapping_set_exiting(mapping);
 
-       /*
-        * When reclaim installs eviction entries, it increases
-        * nrexceptional first, then decreases nrpages.  Make sure we see
-        * this in the right order or we might miss an entry.
-        */
-       nrpages = mapping->nrpages;
-       smp_rmb();
-       nrexceptional = mapping->nrexceptional;
-
-       if (nrpages || nrexceptional) {
+       if (!mapping_empty(mapping)) {
                /*
                 * As truncation uses a lockless tree lookup, cycle
                 * the tree lock to make sure any ongoing tree
@@ -633,7 +620,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
        int ret2 = 0;
        int did_range_unmap = 0;
 
-       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+       if (mapping_empty(mapping))
                goto out;
 
        pagevec_init(&pvec);
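These hunks replace the paired nrpages/nrexceptional checks with mapping_empty(), introduced elsewhere in this series; sketched from its usage here, it reduces to a single xarray test:

static inline bool mapping_empty(struct address_space *mapping)
{
	return xa_empty(&mapping->i_pages);
}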
index 9a3d451..e14b382 100644 (file)
@@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
-                                             bool zeropage)
+                                             enum mcopy_atomic_mode mode)
 {
        int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
         * by THP.  Since we can not reliably insert a zero page, this
         * feature is not supported.
         */
-       if (zeropage) {
+       if (mode == MCOPY_ATOMIC_ZEROPAGE) {
                mmap_read_unlock(dst_mm);
                return -EINVAL;
        }
@@ -273,8 +273,6 @@ retry:
        }
 
        while (src_addr < src_start + len) {
-               pte_t dst_pteval;
-
                BUG_ON(dst_addr >= dst_start + len);
 
                /*
@@ -290,23 +288,23 @@ retry:
                mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
                err = -ENOMEM;
-               dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
+               dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
                if (!dst_pte) {
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        i_mmap_unlock_read(mapping);
                        goto out_unlock;
                }
 
-               err = -EEXIST;
-               dst_pteval = huge_ptep_get(dst_pte);
-               if (!huge_pte_none(dst_pteval)) {
+               if (mode != MCOPY_ATOMIC_CONTINUE &&
+                   !huge_pte_none(huge_ptep_get(dst_pte))) {
+                       err = -EEXIST;
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        i_mmap_unlock_read(mapping);
                        goto out_unlock;
                }
 
                err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-                                               dst_addr, src_addr, &page);
+                                              dst_addr, src_addr, mode, &page);
 
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                i_mmap_unlock_read(mapping);
@@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                      unsigned long dst_start,
                                      unsigned long src_start,
                                      unsigned long len,
-                                     bool zeropage);
+                                     enum mcopy_atomic_mode mode);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
-                                             bool zeropage,
+                                             enum mcopy_atomic_mode mcopy_mode,
                                              bool *mmap_changing,
                                              __u64 mode)
 {
@@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
        long copied;
        struct page *page;
        bool wp_copy;
+       bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
 
        /*
         * Sanitize the command parameters:
@@ -527,10 +526,12 @@ retry:
         */
        if (is_vm_hugetlb_page(dst_vma))
                return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-                                               src_start, len, zeropage);
+                                               src_start, len, mcopy_mode);
 
        if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
                goto out_unlock;
+       if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+               goto out_unlock;
 
        /*
         * Ensure the dst_vma has a anon_vma or this page
@@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                     unsigned long src_start, unsigned long len,
                     bool *mmap_changing, __u64 mode)
 {
-       return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
-                             mmap_changing, mode);
+       return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+                             MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
                       unsigned long len, bool *mmap_changing)
 {
-       return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+       return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+                             mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+                      unsigned long len, bool *mmap_changing)
+{
+       return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+                             mmap_changing, 0);
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
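The tri-state replacing the old bool zeropage is declared in a header outside this excerpt; reconstructed from its use above:

enum mcopy_atomic_mode {
	/* A normal copy_from_user() into the destination range. */
	MCOPY_ATOMIC_NORMAL,
	/* Don't copy; map the destination range to the zero page. */
	MCOPY_ATOMIC_ZEROPAGE,
	/* Just install pte(s) over the existing page(s) in the page cache. */
	MCOPY_ATOMIC_CONTINUE,
};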
index 083c5c4..a8bf17f 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -765,7 +765,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
         * The deviation of sync_overcommit_as could be big with loose policy
         * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
         * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
-        * with the strict "NEVER", and to avoid possible race condtion (even
+        * with the strict "NEVER", and to avoid possible race condition (even
         * though user usually won't too frequently do the switching to policy
         * OVERCOMMIT_NEVER), the switch is done in the following order:
         *      1. changing the batch
@@ -987,22 +987,26 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
  */
 void mem_dump_obj(void *object)
 {
+       const char *type;
+
        if (kmem_valid_obj(object)) {
                kmem_dump_obj(object);
                return;
        }
+
        if (vmalloc_dump_obj(object))
                return;
-       if (!virt_addr_valid(object)) {
-               if (object == NULL)
-                       pr_cont(" NULL pointer.\n");
-               else if (object == ZERO_SIZE_PTR)
-                       pr_cont(" zero-size pointer.\n");
-               else
-                       pr_cont(" non-paged memory.\n");
-               return;
-       }
-       pr_cont(" non-slab/vmalloc memory.\n");
+
+       if (virt_addr_valid(object))
+               type = "non-slab/vmalloc memory";
+       else if (object == NULL)
+               type = "NULL pointer";
+       else if (object == ZERO_SIZE_PTR)
+               type = "zero-size pointer";
+       else
+               type = "non-paged memory";
+
+       pr_cont(" %s\n", type);
 }
 EXPORT_SYMBOL_GPL(mem_dump_obj);
 #endif
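With the restructuring above, every non-slab/non-vmalloc case funnels through one pr_cont(); a usage sketch for a debug path (function name illustrative):

#include <linux/mm.h>
#include <linux/slab.h>

static void example_dump(void)
{
	void *p = kmalloc(32, GFP_KERNEL);

	mem_dump_obj(p);	/* slab object: reported via kmem_dump_obj() */
	mem_dump_obj(NULL);	/* prints " NULL pointer" via the new %s path */
	kfree(p);
}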
index d33894d..a13ac52 100644 (file)
@@ -1583,7 +1583,7 @@ static unsigned long lazy_max_pages(void)
 static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
 
 /*
- * Serialize vmap purging.  There is no actual criticial section protected
+ * Serialize vmap purging.  There is no actual critical section protected
  * by this look, but we want to avoid concurrent calls for performance
  * reasons and to make the pcpu_get_vm_areas more deterministic.
  */
@@ -2628,7 +2628,7 @@ static void __vfree(const void *addr)
  * May sleep if called *not* from interrupt context.
  * Must not be called in NMI context (strictly speaking, it could be
  * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea).
+ * conventions for vfree() arch-dependent would be a really bad idea).
  */
 void vfree(const void *addr)
 {
@@ -3083,7 +3083,7 @@ EXPORT_SYMBOL(vzalloc_node);
  * 64b systems should always have either DMA or DMA32 zones. For others
  * GFP_DMA32 should do the right thing and use the normal zone.
  */
-#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
 #endif
 
 /**
@@ -3141,15 +3141,12 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
                /*
                 * To do safe access to this _mapped_ area, we need
                 * lock. But adding lock here means that we need to add
-                * overhead of vmalloc()/vfree() calles for this _debug_
+                * overhead of vmalloc()/vfree() calls for this _debug_
                 * interface, rarely used. Instead of that, we'll use
                 * kmap() and get small overhead in this access function.
                 */
                if (p) {
-                       /*
-                        * we can expect USER0 is not used (see vread/vwrite's
-                        * function description)
-                        */
+                       /* We can expect USER0 is not used -- see vread() */
                        void *map = kmap_atomic(p);
                        memcpy(buf, map + offset, length);
                        kunmap_atomic(map);
@@ -3164,43 +3161,6 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
        return copied;
 }
 
-static int aligned_vwrite(char *buf, char *addr, unsigned long count)
-{
-       struct page *p;
-       int copied = 0;
-
-       while (count) {
-               unsigned long offset, length;
-
-               offset = offset_in_page(addr);
-               length = PAGE_SIZE - offset;
-               if (length > count)
-                       length = count;
-               p = vmalloc_to_page(addr);
-               /*
-                * To do safe access to this _mapped_ area, we need
-                * lock. But adding lock here means that we need to add
-                * overhead of vmalloc()/vfree() calles for this _debug_
-                * interface, rarely used. Instead of that, we'll use
-                * kmap() and get small overhead in this access function.
-                */
-               if (p) {
-                       /*
-                        * we can expect USER0 is not used (see vread/vwrite's
-                        * function description)
-                        */
-                       void *map = kmap_atomic(p);
-                       memcpy(map + offset, buf, length);
-                       kunmap_atomic(map);
-               }
-               addr += length;
-               buf += length;
-               copied += length;
-               count -= length;
-       }
-       return copied;
-}
-
 /**
  * vread() - read vmalloc area in a safe way.
  * @buf:     buffer for reading data
@@ -3219,7 +3179,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
  * Note: In usual ops, vread() is never necessary because the caller
  * should know vmalloc() area is valid and can use memcpy().
  * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
+ * any information, as /proc/kcore.
  *
  * Return: number of bytes for which addr and buf should be increased
  * (same number as @count) or %0 if [addr...addr+count) doesn't
@@ -3284,80 +3244,6 @@ finished:
 }
 
 /**
- * vwrite() - write vmalloc area in a safe way.
- * @buf:      buffer for source data
- * @addr:     vm address.
- * @count:    number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
- *
- * Return: number of bytes for which addr and buf should be
- * increased (same number as @count) or %0 if [addr...addr+count)
- * doesn't include any intersection with valid vmalloc area
- */
-long vwrite(char *buf, char *addr, unsigned long count)
-{
-       struct vmap_area *va;
-       struct vm_struct *vm;
-       char *vaddr;
-       unsigned long n, buflen;
-       int copied = 0;
-
-       /* Don't allow overflow */
-       if ((unsigned long) addr + count < count)
-               count = -(unsigned long) addr;
-       buflen = count;
-
-       spin_lock(&vmap_area_lock);
-       list_for_each_entry(va, &vmap_area_list, list) {
-               if (!count)
-                       break;
-
-               if (!va->vm)
-                       continue;
-
-               vm = va->vm;
-               vaddr = (char *) vm->addr;
-               if (addr >= vaddr + get_vm_area_size(vm))
-                       continue;
-               while (addr < vaddr) {
-                       if (count == 0)
-                               goto finished;
-                       buf++;
-                       addr++;
-                       count--;
-               }
-               n = vaddr + get_vm_area_size(vm) - addr;
-               if (n > count)
-                       n = count;
-               if (!(vm->flags & VM_IOREMAP)) {
-                       aligned_vwrite(buf, addr, n);
-                       copied++;
-               }
-               buf += n;
-               addr += n;
-               count -= n;
-       }
-finished:
-       spin_unlock(&vmap_area_lock);
-       if (!copied)
-               return 0;
-       return buflen;
-}
-
-/**
  * remap_vmalloc_range_partial - map vmalloc pages to userspace
  * @vma:               vma to cover
  * @uaddr:             target user address to start at
index 562e87c..5199b96 100644 (file)
@@ -185,39 +185,181 @@ static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_MEMCG
-/*
- * We allow subsystems to populate their shrinker-related
- * LRU lists before register_shrinker_prepared() is called
- * for the shrinker, since we don't want to impose
- * restrictions on their internal registration order.
- * In this case shrink_slab_memcg() may find corresponding
- * bit is set in the shrinkers map.
- *
- * This value is used by the function to detect registering
- * shrinkers and to skip do_shrink_slab() calls for them.
- */
-#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+static int shrinker_nr_max;
+
+/* The shrinker_info is expanded in batches of BITS_PER_LONG */
+static inline int shrinker_map_size(int nr_items)
+{
+       return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
+}
+
+static inline int shrinker_defer_size(int nr_items)
+{
+       return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+}
+
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+                                                    int nid)
+{
+       return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+                                        lockdep_is_held(&shrinker_rwsem));
+}
+
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+                                   int map_size, int defer_size,
+                                   int old_map_size, int old_defer_size)
+{
+       struct shrinker_info *new, *old;
+       struct mem_cgroup_per_node *pn;
+       int nid;
+       int size = map_size + defer_size;
+
+       for_each_node(nid) {
+               pn = memcg->nodeinfo[nid];
+               old = shrinker_info_protected(memcg, nid);
+               /* Not yet online memcg */
+               if (!old)
+                       return 0;
+
+               new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+               if (!new)
+                       return -ENOMEM;
+
+               new->nr_deferred = (atomic_long_t *)(new + 1);
+               new->map = (void *)new->nr_deferred + defer_size;
+
+               /* map: set all old bits, clear all new bits */
+               memset(new->map, (int)0xff, old_map_size);
+               memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+               /* nr_deferred: copy old values, clear all new values */
+               memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+               memset((void *)new->nr_deferred + old_defer_size, 0,
+                      defer_size - old_defer_size);
+
+               rcu_assign_pointer(pn->shrinker_info, new);
+               kvfree_rcu(old, rcu);
+       }
+
+       return 0;
+}
+
+void free_shrinker_info(struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_per_node *pn;
+       struct shrinker_info *info;
+       int nid;
+
+       for_each_node(nid) {
+               pn = memcg->nodeinfo[nid];
+               info = rcu_dereference_protected(pn->shrinker_info, true);
+               kvfree(info);
+               rcu_assign_pointer(pn->shrinker_info, NULL);
+       }
+}
+
+int alloc_shrinker_info(struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+       int nid, size, ret = 0;
+       int map_size, defer_size = 0;
+
+       down_write(&shrinker_rwsem);
+       map_size = shrinker_map_size(shrinker_nr_max);
+       defer_size = shrinker_defer_size(shrinker_nr_max);
+       size = map_size + defer_size;
+       for_each_node(nid) {
+               info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
+               if (!info) {
+                       free_shrinker_info(memcg);
+                       ret = -ENOMEM;
+                       break;
+               }
+               info->nr_deferred = (atomic_long_t *)(info + 1);
+               info->map = (void *)info->nr_deferred + defer_size;
+               rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
+       }
+       up_write(&shrinker_rwsem);
+
+       return ret;
+}
+
+static inline bool need_expand(int nr_max)
+{
+       return round_up(nr_max, BITS_PER_LONG) >
+              round_up(shrinker_nr_max, BITS_PER_LONG);
+}
+
+static int expand_shrinker_info(int new_id)
+{
+       int ret = 0;
+       int new_nr_max = new_id + 1;
+       int map_size, defer_size = 0;
+       int old_map_size, old_defer_size = 0;
+       struct mem_cgroup *memcg;
+
+       if (!need_expand(new_nr_max))
+               goto out;
+
+       if (!root_mem_cgroup)
+               goto out;
+
+       lockdep_assert_held(&shrinker_rwsem);
+
+       map_size = shrinker_map_size(new_nr_max);
+       defer_size = shrinker_defer_size(new_nr_max);
+       old_map_size = shrinker_map_size(shrinker_nr_max);
+       old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
+       do {
+               ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+                                              old_map_size, old_defer_size);
+               if (ret) {
+                       mem_cgroup_iter_break(NULL, memcg);
+                       goto out;
+               }
+       } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+out:
+       if (!ret)
+               shrinker_nr_max = new_nr_max;
+
+       return ret;
+}
+
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+       if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+               struct shrinker_info *info;
+
+               rcu_read_lock();
+               info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+               /* Pairs with the smp_mb() in shrink_slab() */
+               smp_mb__before_atomic();
+               set_bit(shrinker_id, info->map);
+               rcu_read_unlock();
+       }
+}
 
 static DEFINE_IDR(shrinker_idr);
-static int shrinker_nr_max;
 
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
        int id, ret = -ENOMEM;
 
+       if (mem_cgroup_disabled())
+               return -ENOSYS;
+
        down_write(&shrinker_rwsem);
        /* This may call shrinker, so it must use down_read_trylock() */
-       id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+       id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
        if (id < 0)
                goto unlock;
 
        if (id >= shrinker_nr_max) {
-               if (memcg_expand_shrinker_maps(id)) {
+               if (expand_shrinker_info(id)) {
                        idr_remove(&shrinker_idr, id);
                        goto unlock;
                }
-
-               shrinker_nr_max = id + 1;
        }
        shrinker->id = id;
        ret = 0;
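
Starting here the old per-memcg shrinker bitmap grows into a shrinker_info that carries both the deferred-work array and the bitmap in a single allocation, so one rcu_assign_pointer() publishes both and one kvfree_rcu() retires them. The pointer fix-ups after the allocation are the interesting part; the same layout in standalone C (calloc standing in for kvzalloc_node(), plain long for atomic_long_t):

#include <limits.h>
#include <stdlib.h>

#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

struct info {
        long *nr_deferred;      /* one counter per shrinker id */
        unsigned long *map;     /* one bit per shrinker id */
        /* both arrays live in this same allocation, after the header */
};

static size_t map_size(int nr)          /* bitmap, rounded up to longs */
{
        return (nr + BITS_PER_LONG - 1) / BITS_PER_LONG * sizeof(unsigned long);
}

static size_t defer_size(int nr)        /* counters, same rounding rule */
{
        return (nr + BITS_PER_LONG - 1) / BITS_PER_LONG * BITS_PER_LONG
                * sizeof(long);
}

static struct info *alloc_info(int nr_items)
{
        size_t ds = defer_size(nr_items), ms = map_size(nr_items);
        struct info *info = calloc(1, sizeof(*info) + ds + ms);

        if (!info)
                return NULL;
        info->nr_deferred = (long *)(info + 1);
        info->map = (unsigned long *)((char *)info->nr_deferred + ds);
        return info;
}

Expansion then only has to copy the two old regions into a bigger block, as expand_one_shrinker_info() does above with its pair of memset()/memcpy() fix-ups.
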
@@ -232,9 +374,51 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 
        BUG_ON(id < 0);
 
-       down_write(&shrinker_rwsem);
+       lockdep_assert_held(&shrinker_rwsem);
+
        idr_remove(&shrinker_idr, id);
-       up_write(&shrinker_rwsem);
+}
+
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+                                  struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+
+       info = shrinker_info_protected(memcg, nid);
+       return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+                                 struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+
+       info = shrinker_info_protected(memcg, nid);
+       return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+       int i, nid;
+       long nr;
+       struct mem_cgroup *parent;
+       struct shrinker_info *child_info, *parent_info;
+
+       parent = parent_mem_cgroup(memcg);
+       if (!parent)
+               parent = root_mem_cgroup;
+
+       /* Prevent concurrent shrinker_info expansion */
+       down_read(&shrinker_rwsem);
+       for_each_node(nid) {
+               child_info = shrinker_info_protected(memcg, nid);
+               parent_info = shrinker_info_protected(parent, nid);
+               for (i = 0; i < shrinker_nr_max; i++) {
+                       nr = atomic_long_read(&child_info->nr_deferred[i]);
+                       atomic_long_add(nr, &parent_info->nr_deferred[i]);
+               }
+       }
+       up_read(&shrinker_rwsem);
 }
 
 static bool cgroup_reclaim(struct scan_control *sc)
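
reparent_shrinker_deferred() keeps deferred work from evaporating when a memcg goes offline: whatever shrink debt the child still carried is folded into its parent (or the root). Taking shrinker_rwsem for read is enough because the loop only has to exclude a concurrent expansion of the arrays, not other readers. Stripped of the locking and per-node iteration it is an element-wise add; a minimal sketch:

/* Fold a dying child's deferred counts into its parent.  In the kernel
 * this runs per NUMA node, under shrinker_rwsem, via atomic_long_add(). */
static void reparent_deferred(long *parent, const long *child, int n)
{
        for (int i = 0; i < n; i++)
                parent[i] += child[i];
}
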
@@ -268,13 +452,25 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 #else
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
-       return 0;
+       return -ENOSYS;
 }
 
 static void unregister_memcg_shrinker(struct shrinker *shrinker)
 {
 }
 
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+                                  struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+                                 struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
 static bool cgroup_reclaim(struct scan_control *sc)
 {
        return false;
@@ -286,6 +482,39 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 }
 #endif
 
+static long xchg_nr_deferred(struct shrinker *shrinker,
+                            struct shrink_control *sc)
+{
+       int nid = sc->nid;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+               nid = 0;
+
+       if (sc->memcg &&
+           (shrinker->flags & SHRINKER_MEMCG_AWARE))
+               return xchg_nr_deferred_memcg(nid, shrinker,
+                                             sc->memcg);
+
+       return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+
+static long add_nr_deferred(long nr, struct shrinker *shrinker,
+                           struct shrink_control *sc)
+{
+       int nid = sc->nid;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+               nid = 0;
+
+       if (sc->memcg &&
+           (shrinker->flags & SHRINKER_MEMCG_AWARE))
+               return add_nr_deferred_memcg(nr, nid, shrinker,
+                                            sc->memcg);
+
+       return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -335,8 +564,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
  */
 int prealloc_shrinker(struct shrinker *shrinker)
 {
-       unsigned int size = sizeof(*shrinker->nr_deferred);
+       unsigned int size;
+       int err;
+
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+               err = prealloc_memcg_shrinker(shrinker);
+               if (err != -ENOSYS)
+                       return err;
+
+               shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+       }
 
+       size = sizeof(*shrinker->nr_deferred);
        if (shrinker->flags & SHRINKER_NUMA_AWARE)
                size *= nr_node_ids;
 
@@ -344,26 +583,17 @@ int prealloc_shrinker(struct shrinker *shrinker)
        if (!shrinker->nr_deferred)
                return -ENOMEM;
 
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
-               if (prealloc_memcg_shrinker(shrinker))
-                       goto free_deferred;
-       }
-
        return 0;
-
-free_deferred:
-       kfree(shrinker->nr_deferred);
-       shrinker->nr_deferred = NULL;
-       return -ENOMEM;
 }
 
 void free_prealloced_shrinker(struct shrinker *shrinker)
 {
-       if (!shrinker->nr_deferred)
-               return;
-
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+               down_write(&shrinker_rwsem);
                unregister_memcg_shrinker(shrinker);
+               up_write(&shrinker_rwsem);
+               return;
+       }
 
        kfree(shrinker->nr_deferred);
        shrinker->nr_deferred = NULL;
@@ -373,10 +603,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 {
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-               idr_replace(&shrinker_idr, shrinker, shrinker->id);
-#endif
+       shrinker->flags |= SHRINKER_REGISTERED;
        up_write(&shrinker_rwsem);
 }
 
@@ -396,13 +623,16 @@ EXPORT_SYMBOL(register_shrinker);
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
-       if (!shrinker->nr_deferred)
+       if (!(shrinker->flags & SHRINKER_REGISTERED))
                return;
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-               unregister_memcg_shrinker(shrinker);
+
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
+       shrinker->flags &= ~SHRINKER_REGISTERED;
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+               unregister_memcg_shrinker(shrinker);
        up_write(&shrinker_rwsem);
+
        kfree(shrinker->nr_deferred);
        shrinker->nr_deferred = NULL;
 }
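
Registration state moves from the old SHRINKER_REGISTERING sentinel pointer in the IDR to a SHRINKER_REGISTERED flag on the shrinker itself: the IDR holds a real shrinker pointer from prealloc time onward, and unregister_shrinker() now clears the flag and drops the memcg id inside one shrinker_rwsem write section before nr_deferred is freed, so no lookup can race with the kfree(). A toy registry showing why a flag beats a magic pointer (values and names hypothetical):

#include <stddef.h>

#define REGISTERED 0x1U

struct obj { unsigned int flags; };

static struct obj *slots[64];           /* toy stand-in for the IDR */

/* A half-registered slot already holds a real pointer; readers just
 * skip it until the flag is set, with no sentinel value to compare. */
static struct obj *lookup(int id)
{
        struct obj *o = slots[id];

        if (!o || !(o->flags & REGISTERED))
                return NULL;
        return o;
}
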
@@ -419,14 +649,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
        long freeable;
        long nr;
        long new_nr;
-       int nid = shrinkctl->nid;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;
        long scanned = 0, next_deferred;
 
-       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
-               nid = 0;
-
        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0 || freeable == SHRINK_EMPTY)
                return freeable;
@@ -436,9 +662,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
-       nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+       nr = xchg_nr_deferred(shrinker, shrinkctl);
 
-       total_scan = nr;
        if (shrinker->seeks) {
                delta = freeable >> priority;
                delta *= 4;
@@ -452,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                delta = freeable / 2;
        }
 
+       total_scan = nr >> priority;
        total_scan += delta;
-       if (total_scan < 0) {
-               pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
-                      shrinker->scan_objects, total_scan);
-               total_scan = freeable;
-               next_deferred = nr;
-       } else
-               next_deferred = total_scan;
-
-       /*
-        * We need to avoid excessive windup on filesystem shrinkers
-        * due to large numbers of GFP_NOFS allocations causing the
-        * shrinkers to return -1 all the time. This results in a large
-        * nr being built up so when a shrink that can do some work
-        * comes along it empties the entire cache due to nr >>>
-        * freeable. This is bad for sustaining a working set in
-        * memory.
-        *
-        * Hence only allow the shrinker to scan the entire cache when
-        * a large delta change is calculated directly.
-        */
-       if (delta < freeable / 4)
-               total_scan = min(total_scan, freeable / 2);
-
-       /*
-        * Avoid risking looping forever due to too large nr value:
-        * never try to free more than twice the estimate number of
-        * freeable entries.
-        */
-       if (total_scan > freeable * 2)
-               total_scan = freeable * 2;
+       total_scan = min(total_scan, (2 * freeable));
 
        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                   freeable, delta, total_scan, priority);
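
The rewritten scan budget folds the old anti-windup special cases into two lines: deferred work is scaled down by the reclaim priority instead of being replayed in full, and the total is clamped to twice the freeable count. A standalone helper plus a worked number (DEFAULT_SEEKS of 2 assumed):

/* total_scan = (deferred >> priority) + delta, clamped to 2 * freeable */
static long scan_budget(long freeable, long deferred, int priority, int seeks)
{
        long delta = (freeable >> priority) * 4 / seeks;
        long total = (deferred >> priority) + delta;

        return total < 2 * freeable ? total : 2 * freeable;
}

/* freeable = 1000, deferred = 10000, priority = 2, seeks = 2:
 * delta = 250 * 4 / 2 = 500; total = 2500 + 500 = 3000; clamped to 2000. */
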
@@ -521,22 +718,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                cond_resched();
        }
 
-       if (next_deferred >= scanned)
-               next_deferred -= scanned;
-       else
-               next_deferred = 0;
+       /*
+        * The deferred work is increased by any new work (delta) that wasn't
+        * done, and decreased by the old deferred work that was done now.
+        *
+        * It is capped at twice the number of freeable items.
+        */
+       next_deferred = max_t(long, (nr + delta - scanned), 0);
+       next_deferred = min(next_deferred, (2 * freeable));
+
        /*
         * move the unused scan count back into the shrinker in a
-        * manner that handles concurrent updates. If we exhausted the
-        * scan, there is no need to do an update.
+        * manner that handles concurrent updates.
         */
-       if (next_deferred > 0)
-               new_nr = atomic_long_add_return(next_deferred,
-                                               &shrinker->nr_deferred[nid]);
-       else
-               new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+       new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
 
-       trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+       trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
        return freed;
 }
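
The same simplification applies on the way out. Instead of the old scanned-versus-next_deferred branching, the leftover debt is simply old deferred plus new delta minus what actually got scanned, floored at zero and capped at twice freeable, exactly as the new comment states. Continuing the numbers from the sketch above:

/* next_deferred = clamp(deferred + delta - scanned, 0, 2 * freeable) */
static long carry_over(long freeable, long deferred, long delta, long scanned)
{
        long next = deferred + delta - scanned;

        if (next < 0)
                next = 0;
        return next < 2 * freeable ? next : 2 * freeable;
}

/* deferred = 10000, delta = 500, scanned = 2000, freeable = 1000:
 * 10000 + 500 - 2000 = 8500, capped at 2000 carried to the next pass. */
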
 
@@ -544,7 +741,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
 {
-       struct memcg_shrinker_map *map;
+       struct shrinker_info *info;
        unsigned long ret, freed = 0;
        int i;
 
@@ -554,12 +751,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
        if (!down_read_trylock(&shrinker_rwsem))
                return 0;
 
-       map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
-                                       true);
-       if (unlikely(!map))
+       info = shrinker_info_protected(memcg, nid);
+       if (unlikely(!info))
                goto unlock;
 
-       for_each_set_bit(i, map->map, shrinker_nr_max) {
+       for_each_set_bit(i, info->map, shrinker_nr_max) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
@@ -568,9 +764,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                struct shrinker *shrinker;
 
                shrinker = idr_find(&shrinker_idr, i);
-               if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+               if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
                        if (!shrinker)
-                               clear_bit(i, map->map);
+                               clear_bit(i, info->map);
                        continue;
                }
 
@@ -581,7 +777,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 
                ret = do_shrink_slab(&sc, shrinker, priority);
                if (ret == SHRINK_EMPTY) {
-                       clear_bit(i, map->map);
+                       clear_bit(i, info->map);
                        /*
                         * After the shrinker reported that it had no objects to
                         * free, but before we cleared the corresponding bit in
@@ -590,7 +786,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                         * case, we invoke the shrinker one more time and reset
                         * the bit if it reports that it is not empty anymore.
                         * The memory barrier here pairs with the barrier in
-                        * memcg_set_shrinker_bit():
+                        * set_shrinker_bit():
                         *
                         * list_lru_add()     shrink_slab_memcg()
                         *   list_add_tail()    clear_bit()
@@ -602,7 +798,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        if (ret == SHRINK_EMPTY)
                                ret = 0;
                        else
-                               memcg_set_shrinker_bit(memcg, nid, i);
+                               set_shrinker_bit(memcg, nid, i);
                }
                freed += ret;
 
@@ -1507,8 +1703,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
        LIST_HEAD(clean_pages);
 
        list_for_each_entry_safe(page, next, page_list, lru) {
-               if (page_is_file_lru(page) && !PageDirty(page) &&
-                   !__PageMovable(page) && !PageUnevictable(page)) {
+               if (!PageHuge(page) && page_is_file_lru(page) &&
+                   !PageDirty(page) && !__PageMovable(page) &&
+                   !PageUnevictable(page)) {
                        ClearPageActive(page);
                        list_move(&page->lru, &clean_pages);
                }
@@ -3862,7 +4059,7 @@ static int kswapd(void *p)
 {
        unsigned int alloc_order, reclaim_order;
        unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
-       pg_data_t *pgdat = (pg_data_t*)p;
+       pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
@@ -4086,14 +4283,6 @@ module_init(kswapd_init)
 int node_reclaim_mode __read_mostly;
 
 /*
- * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
- * ABI.  New bits are OK, but existing bits can never change.
- */
-#define RECLAIM_ZONE  (1<<0)   /* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
-
-/*
  * Priority for NODE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of
  * a zone.
index 74b2c37..cccee36 100644 (file)
@@ -934,7 +934,7 @@ void cpu_vm_stats_fold(int cpu)
 
 /*
  * this is only called if !populated_zone(zone), which implies no other users of
- * pset->vm_stat_diff[] exsist.
+ * pset->vm_stat_diff[] exist.
  */
 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 {
@@ -1313,6 +1313,10 @@ const char * const vmstat_text[] = {
        "htlb_buddy_alloc_success",
        "htlb_buddy_alloc_fail",
 #endif
+#ifdef CONFIG_CMA
+       "cma_alloc_success",
+       "cma_alloc_fail",
+#endif
        "unevictable_pgs_culled",
        "unevictable_pgs_scanned",
        "unevictable_pgs_rescued",
@@ -1365,6 +1369,10 @@ const char * const vmstat_text[] = {
        "swap_ra",
        "swap_ra_hit",
 #endif
+#ifdef CONFIG_X86
+       "direct_map_level2_splits",
+       "direct_map_level3_splits",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
@@ -1854,25 +1862,34 @@ int vmstat_refresh(struct ctl_table *table, int write,
        if (err)
                return err;
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+               /*
+                * Skip checking stats known to go negative occasionally.
+                */
+               switch (i) {
+               case NR_ZONE_WRITE_PENDING:
+               case NR_FREE_CMA_PAGES:
+                       continue;
+               }
                val = atomic_long_read(&vm_zone_stat[i]);
                if (val < 0) {
                        pr_warn("%s: %s %ld\n",
                                __func__, zone_stat_name(i), val);
-                       err = -EINVAL;
                }
        }
-#ifdef CONFIG_NUMA
-       for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
-               val = atomic_long_read(&vm_numa_stat[i]);
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+               /*
+                * Skip checking stats known to go negative occasionally.
+                */
+               switch (i) {
+               case NR_WRITEBACK:
+                       continue;
+               }
+               val = atomic_long_read(&vm_node_stat[i]);
                if (val < 0) {
                        pr_warn("%s: %s %ld\n",
-                               __func__, numa_stat_name(i), val);
-                       err = -EINVAL;
+                               __func__, node_stat_name(i), val);
                }
        }
-#endif
-       if (err)
-               return err;
        if (write)
                *ppos += *lenp;
        else
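
Two behavioural changes hide in this vmstat_refresh() hunk: counters known to dip below zero transiently (NR_ZONE_WRITE_PENDING, NR_FREE_CMA_PAGES, NR_WRITEBACK) are skipped entirely, and a negative value in the others now only warns instead of failing the sysctl read with -EINVAL; the per-node loop also replaces the removed NUMA-stat loop. The skip idiom in isolation (indices invented for the sketch):

#include <stdio.h>

enum { NR_ZONE_WRITE_PENDING = 3, NR_FREE_CMA_PAGES = 5, NR_ITEMS = 8 };

static void check_stats(const long *val)
{
        for (int i = 0; i < NR_ITEMS; i++) {
                switch (i) {
                case NR_ZONE_WRITE_PENDING:     /* known to dip negative */
                case NR_FREE_CMA_PAGES:
                        continue;               /* don't even look */
                }
                if (val[i] < 0)                 /* warn only, no -EINVAL */
                        fprintf(stderr, "stat %d is %ld\n", i, val[i]);
        }
}
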
index cd39902..b7cdeca 100644 (file)
@@ -554,7 +554,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
                goto out_invalid;
        if (WARN_ON_ONCE(node->count != node->nr_values))
                goto out_invalid;
-       mapping->nrexceptional -= node->nr_values;
        xa_delete_node(node, workingset_update_node);
        __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
 
index 9d889ad..7fe7ada 100644 (file)
@@ -391,7 +391,7 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool)
 {
        if (pool->inode)
                iput(pool->inode);
- }
+}
 
 /* Initializes the z3fold header of a newly allocated z3fold page */
 static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
index 5ed7120..6d9ed48 100644 (file)
@@ -336,7 +336,7 @@ int zpool_shrink(struct zpool *zpool, unsigned int pages,
  * This may hold locks, disable interrupts, and/or preemption,
  * and the zpool_unmap_handle() must be called to undo those
  * actions.  The code that uses the mapped handle should complete
- * its operatons on the mapped handle memory quickly and unmap
+ * its operations on the mapped handle memory quickly and unmap
  * as soon as possible.  As the implementation may use per-cpu
  * data, multiple handles should not be mapped concurrently on
  * any cpu.
index 30c358b..19b563b 100644 (file)
@@ -61,7 +61,7 @@
 #define ZSPAGE_MAGIC   0x58
 
 /*
- * This must be power of 2 and greater than of equal to sizeof(link_free).
+ * This must be power of 2 and greater than or equal to sizeof(link_free).
  * These two conditions ensure that any 'struct link_free' itself doesn't
  * span more than 1 page which avoids complex case of mapping 2 pages simply
  * to restore link_free pointer values.
@@ -530,7 +530,7 @@ static void set_zspage_mapping(struct zspage *zspage,
  * class maintains a list of zspages where each zspage is divided
  * into equal sized chunks. Each allocation falls into one of these
  * classes depending on its size. This function returns index of the
- * size class which has chunk size big enough to hold the give size.
+ * size class which has chunk size big enough to hold the given size.
  */
 static int get_size_class_index(int size)
 {
@@ -1227,7 +1227,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
  * zs_map_object - get address of allocated object from handle.
  * @pool: pool from which the object was allocated
  * @handle: handle returned from zs_malloc
- * @mm: maping mode to use
+ * @mm: mapping mode to use
  *
  * Before using an object allocated from zs_malloc, it must be mapped using
  * this function. When done with the object, it must be unmapped using
@@ -1987,8 +1987,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
                head = obj_to_head(page, addr);
                if (head & OBJ_ALLOCATED_TAG) {
                        handle = head & ~OBJ_ALLOCATED_TAG;
-                       if (!testpin_tag(handle))
-                               BUG();
+                       BUG_ON(!testpin_tag(handle));
 
                        old_obj = handle_to_obj(handle);
                        obj_to_location(old_obj, &dummy, &obj_idx);
@@ -2035,8 +2034,7 @@ unpin_objects:
                head = obj_to_head(page, addr);
                if (head & OBJ_ALLOCATED_TAG) {
                        handle = head & ~OBJ_ALLOCATED_TAG;
-                       if (!testpin_tag(handle))
-                               BUG();
+                       BUG_ON(!testpin_tag(handle));
                        unpin_tag(handle);
                }
        }
index 578d9f2..2076326 100644 (file)
@@ -614,7 +614,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
        }
        pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
 
-       strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+       strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
 
        pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
        if (!pool->acomp_ctx) {
index 0456593..e4e6e99 100644 (file)
@@ -103,8 +103,9 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
 
        rcu_read_lock();
        if (netif_is_bridge_port(dev)) {
-               p = br_port_get_rcu(dev);
-               vg = nbp_vlan_group_rcu(p);
+               p = br_port_get_check_rcu(dev);
+               if (p)
+                       vg = nbp_vlan_group_rcu(p);
        } else if (dev->priv_flags & IFF_EBRIDGE) {
                br = netdev_priv(dev);
                vg = br_vlan_group_rcu(br);
index eb261aa..de407e8 100644 (file)
@@ -36,6 +36,20 @@ static int init_protocol(struct ceph_auth_client *ac, int proto)
        }
 }
 
+static void set_global_id(struct ceph_auth_client *ac, u64 global_id)
+{
+       dout("%s global_id %llu\n", __func__, global_id);
+
+       if (!global_id)
+               pr_err("got zero global_id\n");
+
+       if (ac->global_id && global_id != ac->global_id)
+               pr_err("global_id changed from %llu to %llu\n", ac->global_id,
+                      global_id);
+
+       ac->global_id = global_id;
+}
+
 /*
  * setup, teardown.
  */
@@ -222,11 +236,6 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 
        payload_end = payload + payload_len;
 
-       if (global_id && ac->global_id != global_id) {
-               dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
-               ac->global_id = global_id;
-       }
-
        if (ac->negotiating) {
                /* server does not support our protocols? */
                if (!protocol && result < 0) {
@@ -253,11 +262,16 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 
        ret = ac->ops->handle_reply(ac, result, payload, payload_end,
                                    NULL, NULL, NULL, NULL);
-       if (ret == -EAGAIN)
+       if (ret == -EAGAIN) {
                ret = build_request(ac, true, reply_buf, reply_len);
-       else if (ret)
+               goto out;
+       } else if (ret) {
                pr_err("auth protocol '%s' mauth authentication failed: %d\n",
                       ceph_auth_proto_name(ac->protocol), result);
+               goto out;
+       }
+
+       set_global_id(ac, global_id);
 
 out:
        mutex_unlock(&ac->mutex);
@@ -484,15 +498,11 @@ int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
        int ret;
 
        mutex_lock(&ac->mutex);
-       if (global_id && ac->global_id != global_id) {
-               dout("%s global_id %llu -> %llu\n", __func__, ac->global_id,
-                    global_id);
-               ac->global_id = global_id;
-       }
-
        ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
                                    session_key, session_key_len,
                                    con_secret, con_secret_len);
+       if (!ret)
+               set_global_id(ac, global_id);
        mutex_unlock(&ac->mutex);
        return ret;
 }
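
The ceph auth change is a validate-then-commit refactor: both reply paths used to update ac->global_id inline, before knowing whether the reply would even be accepted, and now they run the protocol handler first and call set_global_id() only on success, which also centralizes the sanity warnings (zero id, id changing mid-session). The shape of the fix in miniature (process_reply() is a stand-in):

#include <stdio.h>

static int process_reply(void) { return 0; }    /* stand-in validation */

static void set_id(unsigned long long *cur, unsigned long long new_id)
{
        if (!new_id)
                fprintf(stderr, "got zero global_id\n");
        if (*cur && new_id != *cur)
                fprintf(stderr, "global_id changed from %llu to %llu\n",
                        *cur, new_id);
        *cur = new_id;
}

static int handle_reply(unsigned long long *cur, unsigned long long new_id)
{
        int ret = process_reply();

        if (!ret)
                set_id(cur, new_id);    /* commit only after success */
        return ret;
}
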
index ca44c32..79641c4 100644 (file)
@@ -526,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
                if (ret < 0)
                        return ret;
 
-               auth->struct_v = 2;  /* nautilus+ */
+               auth->struct_v = 3;  /* nautilus+ */
                auth->key = 0;
                for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
                        auth->key ^= *(__le64 *)u;
index b44f765..bc109a1 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/inet.h>
 
 #include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>  /* for ceph_pr_addr() */
 
 static int
 ceph_decode_entity_addr_versioned(void **p, void *end,
@@ -110,6 +111,7 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
        }
 
        ceph_decode_32_safe(p, end, addr_cnt, e_inval);
+       dout("%s addr_cnt %d\n", __func__, addr_cnt);
 
        found = false;
        for (i = 0; i < addr_cnt; i++) {
@@ -117,6 +119,7 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
                if (ret)
                        return ret;
 
+               dout("%s i %d addr %s\n", __func__, i, ceph_pr_addr(&tmp_addr));
                if (tmp_addr.type == my_type) {
                        if (found) {
                                pr_err("another match of type %d in addrvec\n",
@@ -128,13 +131,18 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
                        found = true;
                }
        }
-       if (!found && addr_cnt != 0) {
-               pr_err("no match of type %d in addrvec\n",
-                      le32_to_cpu(my_type));
-               return -ENOENT;
-       }
 
-       return 0;
+       if (found)
+               return 0;
+
+       if (!addr_cnt)
+               return 0;  /* normal -- e.g. unused OSD id/slot */
+
+       if (addr_cnt == 1 && !memchr_inv(&tmp_addr, 0, sizeof(tmp_addr)))
+               return 0;  /* weird but effectively the same as !addr_cnt */
+
+       pr_err("no match of type %d in addrvec\n", le32_to_cpu(my_type));
+       return -ENOENT;
 
 e_inval:
        return -EINVAL;
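
The addrvec decode now tolerates a one-element vector whose single entry is all zeroes, treating it like the already-accepted empty vector (the comments suggest an unused OSD id/slot encoded that way). memchr_inv() is the kernel's "is this whole buffer one byte?" primitive; a portable equivalent of the test used here:

#include <string.h>

/* Nonzero if all @len bytes of @buf equal @c -- the inverse of what
 * memchr_inv() reports (it returns the first mismatch, or NULL). */
static int all_bytes_are(const void *buf, int c, size_t len)
{
        const unsigned char *p = buf;

        return len == 0 || (p[0] == (unsigned char)c &&
                            !memcmp(p, p + 1, len - 1));
}
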
index 290012d..88d8a02 100644 (file)
@@ -387,7 +387,8 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
        int ret;
 
        ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
-                          &ethtool_genl_family, 0, ctx->ops->reply_cmd);
+                          &ethtool_genl_family, NLM_F_MULTI,
+                          ctx->ops->reply_cmd);
        if (!ehdr)
                return -EMSGSIZE;
 
index b218e45..6852e9b 100644 (file)
@@ -520,6 +520,10 @@ static int fill_frame_info(struct hsr_frame_info *frame,
        struct ethhdr *ethhdr;
        __be16 proto;
 
+       /* Check if skb contains hsr_ethhdr */
+       if (skb->mac_len < sizeof(struct hsr_ethhdr))
+               return -EINVAL;
+
        memset(frame, 0, sizeof(*frame));
        frame->is_supervision = is_supervision_frame(port->hsr, skb);
        frame->node_src = hsr_get_node(port, &hsr->node_db, skb,
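
The hsr fix is the classic parse-before-validate bug: fill_frame_info() went on to read an hsr_ethhdr out of the skb without checking that the captured MAC header is actually that long, so a runt frame could walk past the buffer. The guard generalizes to any cast-the-header parser (struct and names invented for the sketch; alignment assumed):

#include <stddef.h>
#include <stdint.h>

struct my_hdr { uint8_t dst[6], src[6]; uint16_t proto, tag; };

/* Never cast a buffer to a header type before checking its length. */
static const struct my_hdr *parse_hdr(const uint8_t *buf, size_t len)
{
        if (len < sizeof(struct my_hdr))
                return NULL;            /* runt frame: refuse to parse */
        return (const struct my_hdr *)buf;
}
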
index cf20316..c53f14b 100644 (file)
@@ -1556,13 +1556,12 @@ out_free:
        return ret;
 }
 
-void arpt_unregister_table_pre_exit(struct net *net, const char *name,
-                                   const struct nf_hook_ops *ops)
+void arpt_unregister_table_pre_exit(struct net *net, const char *name)
 {
        struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
 
        if (table)
-               nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+               nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
 }
 EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
 
index b8f45e9..6922612 100644 (file)
@@ -54,7 +54,7 @@ static int __net_init arptable_filter_table_init(struct net *net)
 
 static void __net_exit arptable_filter_net_pre_exit(struct net *net)
 {
-       arpt_unregister_table_pre_exit(net, "filter", arpfilter_ops);
+       arpt_unregister_table_pre_exit(net, "filter");
 }
 
 static void __net_exit arptable_filter_net_exit(struct net *net)
index e14fd0c..f1c1f9e 100644 (file)
@@ -2039,6 +2039,7 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
                (__kernel_size_t)zc->msg_controllen;
        cmsg_dummy.msg_flags = in_compat_syscall()
                ? MSG_CMSG_COMPAT : 0;
+       cmsg_dummy.msg_control_is_user = true;
        zc->msg_flags = 0;
        if (zc->msg_control == msg_control_addr &&
            zc->msg_controllen == cmsg_dummy.msg_controllen) {
index 563d016..db5831e 100644 (file)
@@ -230,6 +230,10 @@ int tcp_set_default_congestion_control(struct net *net, const char *name)
                ret = -ENOENT;
        } else if (!bpf_try_module_get(ca, ca->owner)) {
                ret = -EBUSY;
+       } else if (!net_eq(net, &init_net) &&
+                       !(ca->flags & TCP_CONG_NON_RESTRICTED)) {
+               /* Only init netns can set default to a restricted algorithm */
+               ret = -EPERM;
        } else {
                prev = xchg(&net->ipv4.tcp_congestion_control, ca);
                if (prev)
index d2f8138..e412817 100644 (file)
@@ -122,9 +122,6 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
        hinfo = seg6_hmac_info_lookup(net, hmackeyid);
 
        if (!slen) {
-               if (!hinfo)
-                       err = -ENOENT;
-
                err = seg6_hmac_info_del(net, hmackeyid);
 
                goto out_unlock;
index bd71408..4ff38cb 100644 (file)
@@ -93,6 +93,35 @@ struct seg6_end_dt_info {
        int hdrlen;
 };
 
+struct pcpu_seg6_local_counters {
+       u64_stats_t packets;
+       u64_stats_t bytes;
+       u64_stats_t errors;
+
+       struct u64_stats_sync syncp;
+};
+
+/* This struct groups all the SRv6 Behavior counters supported so far.
+ *
+ * put_nla_counters() makes use of this data structure to collect all counter
+ * values after the per-CPU counter evaluation has been performed.
+ * Finally, each counter value (in seg6_local_counters) is stored in the
+ * corresponding netlink attribute and sent to user space.
+ *
+ * NB: we don't want to expose this structure to user space!
+ */
+struct seg6_local_counters {
+       __u64 packets;
+       __u64 bytes;
+       __u64 errors;
+};
+
+#define seg6_local_alloc_pcpu_counters(__gfp)                          \
+       __netdev_alloc_pcpu_stats(struct pcpu_seg6_local_counters,      \
+                                 ((__gfp) | __GFP_ZERO))
+
+#define SEG6_F_LOCAL_COUNTERS  SEG6_F_ATTR(SEG6_LOCAL_COUNTERS)
+
 struct seg6_local_lwt {
        int action;
        struct ipv6_sr_hdr *srh;
@@ -105,6 +134,7 @@ struct seg6_local_lwt {
 #ifdef CONFIG_NET_L3_MASTER_DEV
        struct seg6_end_dt_info dt_info;
 #endif
+       struct pcpu_seg6_local_counters __percpu *pcpu_counters;
 
        int headroom;
        struct seg6_action_desc *desc;
@@ -878,36 +908,43 @@ static struct seg6_action_desc seg6_action_table[] = {
        {
                .action         = SEG6_LOCAL_ACTION_END,
                .attrs          = 0,
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_X,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_x,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_T,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_t,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_DX2,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_OIF),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_dx2,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_DX6,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_dx6,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_DX4,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_NH4),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_dx4,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_DT4,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
 #ifdef CONFIG_NET_L3_MASTER_DEV
                .input          = input_action_end_dt4,
                .slwt_ops       = {
@@ -919,30 +956,35 @@ static struct seg6_action_desc seg6_action_table[] = {
                .action         = SEG6_LOCAL_ACTION_END_DT6,
 #ifdef CONFIG_NET_L3_MASTER_DEV
                .attrs          = 0,
-               .optattrs       = SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
+               .optattrs       = SEG6_F_LOCAL_COUNTERS         |
+                                 SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
                                  SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
                .slwt_ops       = {
                                        .build_state = seg6_end_dt6_build,
                                  },
 #else
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
 #endif
                .input          = input_action_end_dt6,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_B6,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_b6,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_B6_ENCAP,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_b6_encap,
                .static_headroom        = sizeof(struct ipv6hdr),
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_BPF,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_BPF),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_bpf,
        },
 
@@ -963,11 +1005,36 @@ static struct seg6_action_desc *__get_action_desc(int action)
        return NULL;
 }
 
+static bool seg6_lwtunnel_counters_enabled(struct seg6_local_lwt *slwt)
+{
+       return slwt->parsed_optattrs & SEG6_F_LOCAL_COUNTERS;
+}
+
+static void seg6_local_update_counters(struct seg6_local_lwt *slwt,
+                                      unsigned int len, int err)
+{
+       struct pcpu_seg6_local_counters *pcounters;
+
+       pcounters = this_cpu_ptr(slwt->pcpu_counters);
+       u64_stats_update_begin(&pcounters->syncp);
+
+       if (likely(!err)) {
+               u64_stats_inc(&pcounters->packets);
+               u64_stats_add(&pcounters->bytes, len);
+       } else {
+               u64_stats_inc(&pcounters->errors);
+       }
+
+       u64_stats_update_end(&pcounters->syncp);
+}
+
 static int seg6_local_input(struct sk_buff *skb)
 {
        struct dst_entry *orig_dst = skb_dst(skb);
        struct seg6_action_desc *desc;
        struct seg6_local_lwt *slwt;
+       unsigned int len = skb->len;
+       int rc;
 
        if (skb->protocol != htons(ETH_P_IPV6)) {
                kfree_skb(skb);
@@ -977,7 +1044,14 @@ static int seg6_local_input(struct sk_buff *skb)
        slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
        desc = slwt->desc;
 
-       return desc->input(skb, slwt);
+       rc = desc->input(skb, slwt);
+
+       if (!seg6_lwtunnel_counters_enabled(slwt))
+               return rc;
+
+       seg6_local_update_counters(slwt, len, rc);
+
+       return rc;
 }
 
 static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
@@ -992,6 +1066,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
        [SEG6_LOCAL_IIF]        = { .type = NLA_U32 },
        [SEG6_LOCAL_OIF]        = { .type = NLA_U32 },
        [SEG6_LOCAL_BPF]        = { .type = NLA_NESTED },
+       [SEG6_LOCAL_COUNTERS]   = { .type = NLA_NESTED },
 };
 
 static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -1296,6 +1371,112 @@ static void destroy_attr_bpf(struct seg6_local_lwt *slwt)
                bpf_prog_put(slwt->bpf.prog);
 }
 
+static const struct
+nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = {
+       [SEG6_LOCAL_CNT_PACKETS]        = { .type = NLA_U64 },
+       [SEG6_LOCAL_CNT_BYTES]          = { .type = NLA_U64 },
+       [SEG6_LOCAL_CNT_ERRORS]         = { .type = NLA_U64 },
+};
+
+static int parse_nla_counters(struct nlattr **attrs,
+                             struct seg6_local_lwt *slwt)
+{
+       struct pcpu_seg6_local_counters __percpu *pcounters;
+       struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1];
+       int ret;
+
+       ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_CNT_MAX,
+                                         attrs[SEG6_LOCAL_COUNTERS],
+                                         seg6_local_counters_policy, NULL);
+       if (ret < 0)
+               return ret;
+
+       /* basic support for SRv6 Behavior counters requires at least:
+        * packets, bytes and errors.
+        */
+       if (!tb[SEG6_LOCAL_CNT_PACKETS] || !tb[SEG6_LOCAL_CNT_BYTES] ||
+           !tb[SEG6_LOCAL_CNT_ERRORS])
+               return -EINVAL;
+
+       /* counters are always zero initialized */
+       pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL);
+       if (!pcounters)
+               return -ENOMEM;
+
+       slwt->pcpu_counters = pcounters;
+
+       return 0;
+}
+
+static int seg6_local_fill_nla_counters(struct sk_buff *skb,
+                                       struct seg6_local_counters *counters)
+{
+       if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_PACKETS, counters->packets,
+                             SEG6_LOCAL_CNT_PAD))
+               return -EMSGSIZE;
+
+       if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_BYTES, counters->bytes,
+                             SEG6_LOCAL_CNT_PAD))
+               return -EMSGSIZE;
+
+       if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_ERRORS, counters->errors,
+                             SEG6_LOCAL_CNT_PAD))
+               return -EMSGSIZE;
+
+       return 0;
+}
+
+static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+       struct seg6_local_counters counters = { 0, 0, 0 };
+       struct nlattr *nest;
+       int rc, i;
+
+       nest = nla_nest_start(skb, SEG6_LOCAL_COUNTERS);
+       if (!nest)
+               return -EMSGSIZE;
+
+       for_each_possible_cpu(i) {
+               struct pcpu_seg6_local_counters *pcounters;
+               u64 packets, bytes, errors;
+               unsigned int start;
+
+               pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
+               do {
+                       start = u64_stats_fetch_begin_irq(&pcounters->syncp);
+
+                       packets = u64_stats_read(&pcounters->packets);
+                       bytes = u64_stats_read(&pcounters->bytes);
+                       errors = u64_stats_read(&pcounters->errors);
+
+               } while (u64_stats_fetch_retry_irq(&pcounters->syncp, start));
+
+               counters.packets += packets;
+               counters.bytes += bytes;
+               counters.errors += errors;
+       }
+
+       rc = seg6_local_fill_nla_counters(skb, &counters);
+       if (rc < 0) {
+               nla_nest_cancel(skb, nest);
+               return rc;
+       }
+
+       return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_counters(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+       /* a and b are equal if both have pcpu_counters set, or neither does */
+       return (!!((unsigned long)a->pcpu_counters)) ^
+               (!!((unsigned long)b->pcpu_counters));
+}
+
+static void destroy_attr_counters(struct seg6_local_lwt *slwt)
+{
+       free_percpu(slwt->pcpu_counters);
+}
+
 struct seg6_action_param {
        int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
        int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
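
The new counters attribute follows the standard per-CPU statistics recipe: the forwarding path bumps its own CPU's packets/bytes/errors inside a u64_stats_update section, and the netlink dump above sums a consistent snapshot from every CPU with the fetch_begin/retry loop, so a 32-bit reader never sees a torn 64-bit value. The aggregation modeled in plain C with the seqcount elided (single-threaded sketch, NCPU invented):

struct cnt { unsigned long long packets, bytes, errors; };

#define NCPU 4
static struct cnt percpu[NCPU];         /* stand-in for alloc_percpu() */

static void update(int cpu, unsigned int len, int err)  /* hot path */
{
        if (!err) {
                percpu[cpu].packets++;
                percpu[cpu].bytes += len;
        } else {
                percpu[cpu].errors++;
        }
}

static struct cnt fold(void)            /* dump path: sum all CPUs */
{
        struct cnt total = { 0, 0, 0 };

        for (int i = 0; i < NCPU; i++) {
                total.packets += percpu[i].packets;
                total.bytes   += percpu[i].bytes;
                total.errors  += percpu[i].errors;
        }
        return total;
}
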
@@ -1343,6 +1524,10 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
                                    .put = put_nla_vrftable,
                                    .cmp = cmp_nla_vrftable },
 
+       [SEG6_LOCAL_COUNTERS]   = { .parse = parse_nla_counters,
+                                   .put = put_nla_counters,
+                                   .cmp = cmp_nla_counters,
+                                   .destroy = destroy_attr_counters },
 };
 
 /* call the destroy() callback (if available) for each set attribute in
@@ -1645,6 +1830,15 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
        if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE))
                nlsize += nla_total_size(4);
 
+       if (attrs & SEG6_F_LOCAL_COUNTERS)
+               nlsize += nla_total_size(0) + /* nest SEG6_LOCAL_COUNTERS */
+                         /* SEG6_LOCAL_CNT_PACKETS */
+                         nla_total_size_64bit(sizeof(__u64)) +
+                         /* SEG6_LOCAL_CNT_BYTES */
+                         nla_total_size_64bit(sizeof(__u64)) +
+                         /* SEG6_LOCAL_CNT_ERRORS */
+                         nla_total_size_64bit(sizeof(__u64));
+
        return nlsize;
 }
 
index 82e91b0..a5ede35 100644 (file)
@@ -546,8 +546,7 @@ static void mptcp_sock_destruct(struct sock *sk)
         * ESTABLISHED state and will not have the SOCK_DEAD flag.
         * Both result in warnings from inet_sock_destruct.
         */
-
-       if (sk->sk_state == TCP_ESTABLISHED) {
+       if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
                sk->sk_state = TCP_CLOSE;
                WARN_ON_ONCE(sk->sk_socket);
                sock_orphan(sk);
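
The mptcp fix widens the state test from one state to a set using the TCPF_* convention: each TCP state owns the flag bit 1 << state, so membership in a set of states is a single AND instead of a chain of comparisons. The idiom, detached from the socket code (state values match the kernel's tcp_states.h):

enum { ST_ESTABLISHED = 1, ST_CLOSE_WAIT = 8 };

#define ST_F(st) (1U << (st))

/* "state is ESTABLISHED or CLOSE_WAIT" as one branch-free test */
static int needs_orphaning(int state)
{
        return (1U << state) & (ST_F(ST_ESTABLISHED) | ST_F(ST_CLOSE_WAIT));
}
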
index b22801f..a414274 100644 (file)
@@ -413,7 +413,10 @@ static int help(struct sk_buff *skb,
 
        spin_lock_bh(&nf_ftp_lock);
        fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
-       BUG_ON(fb_ptr == NULL);
+       if (!fb_ptr) {
+               spin_unlock_bh(&nf_ftp_lock);
+               return NF_ACCEPT;
+       }
 
        ends_in_nl = (fb_ptr[datalen - 1] == '\n');
        seq = ntohl(th->seq) + datalen;
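
This is the first of a run of identical conversions through the conntrack helpers (ftp here, then h323, irc, pptp, the TCP option parsers and sane): skb_header_pointer() can legitimately return NULL on a malformed packet, so asserting with BUG_ON() turned bad input into a crash. The replacement drops whatever lock is held and lets the packet through unexamined (or simply bails out of option parsing). The shape, with illustrative names:

#include <pthread.h>

enum { ACCEPT = 1 };

static pthread_mutex_t helper_lock = PTHREAD_MUTEX_INITIALIZER;

/* On a short or unreadable header, undo state and accept rather than
 * assert; only the error path changes, the parsing stays as it was. */
static int helper(const void *hdr_or_null)
{
        pthread_mutex_lock(&helper_lock);
        if (!hdr_or_null) {
                pthread_mutex_unlock(&helper_lock);     /* BUG_ON() was here */
                return ACCEPT;          /* pass the packet unexamined */
        }
        /* ... normal parsing under the lock ... */
        pthread_mutex_unlock(&helper_lock);
        return ACCEPT;
}
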
index 8ba037b..aafaff0 100644 (file)
@@ -146,7 +146,8 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
                /* Get first TPKT pointer */
                tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
                                          h323_buffer);
-               BUG_ON(tpkt == NULL);
+               if (!tpkt)
+                       goto clear_out;
 
                /* Validate TPKT identifier */
                if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
index e40988a..08ee4e7 100644 (file)
@@ -143,7 +143,10 @@ static int help(struct sk_buff *skb, unsigned int protoff,
        spin_lock_bh(&irc_buffer_lock);
        ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
                                    irc_buffer);
-       BUG_ON(ib_ptr == NULL);
+       if (!ib_ptr) {
+               spin_unlock_bh(&irc_buffer_lock);
+               return NF_ACCEPT;
+       }
 
        data = ib_ptr;
        data_limit = ib_ptr + skb->len - dataoff;
index 5105d42..7d5708b 100644 (file)
@@ -544,7 +544,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
 
        nexthdr_off = protoff;
        tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph);
-       BUG_ON(!tcph);
+       if (!tcph)
+               return NF_ACCEPT;
+
        nexthdr_off += tcph->doff * 4;
        datalen = tcplen - tcph->doff * 4;
 
index 318b8f7..34e2241 100644 (file)
@@ -338,7 +338,8 @@ static void tcp_options(const struct sk_buff *skb,
 
        ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
                                 length, buff);
-       BUG_ON(ptr == NULL);
+       if (!ptr)
+               return;
 
        state->td_scale =
        state->flags = 0;
@@ -394,7 +395,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
 
        ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
                                 length, buff);
-       BUG_ON(ptr == NULL);
+       if (!ptr)
+               return;
 
        /* Fast path for timestamp-only option */
        if (length == TCPOLEN_TSTAMP_ALIGNED
index 1aebd65..fcb33b1 100644 (file)
@@ -95,7 +95,10 @@ static int help(struct sk_buff *skb,
 
        spin_lock_bh(&nf_sane_lock);
        sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
-       BUG_ON(sb_ptr == NULL);
+       if (!sb_ptr) {
+               spin_unlock_bh(&nf_sane_lock);
+               return NF_ACCEPT;
+       }
 
        if (dir == IP_CT_DIR_ORIGINAL) {
                if (datalen != sizeof(struct sane_request))
index 0b7fe0a..d63d2d8 100644 (file)
@@ -4184,6 +4184,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
        unsigned char *udata;
        struct nft_set *set;
        struct nft_ctx ctx;
+       size_t alloc_size;
        u64 timeout;
        char *name;
        int err, i;
@@ -4329,8 +4330,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
        size = 0;
        if (ops->privsize != NULL)
                size = ops->privsize(nla, &desc);
-
-       set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL);
+       alloc_size = sizeof(*set) + size + udlen;
+       if (alloc_size < size)
+               return -ENOMEM;
+       set = kvzalloc(alloc_size, GFP_KERNEL);
        if (!set)
                return -ENOMEM;
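
The nf_tables hunk guards a size computation: sizeof(*set) + privsize + udlen is built from user-supplied attributes, and if the sum wrapped, kvzalloc() would hand back a short buffer that later writes run past. Checking that the sum didn't come out smaller than one of its addends is the cheap unsigned-overflow idiom:

#include <stddef.h>
#include <stdlib.h>

/* Allocate a + b bytes, refusing on size_t wraparound -- the same
 * "sum < addend" test the hunk performs before kvzalloc(). */
static void *alloc_sum(size_t a, size_t b)
{
        size_t total = a + b;

        if (total < a)                  /* unsigned addition wrapped */
                return NULL;
        return calloc(1, total);
}
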
 
@@ -6615,9 +6618,9 @@ err_obj_ht:
        INIT_LIST_HEAD(&obj->list);
        return err;
 err_trans:
-       kfree(obj->key.name);
-err_userdata:
        kfree(obj->udata);
+err_userdata:
+       kfree(obj->key.name);
 err_strdup:
        if (obj->ops->destroy)
                obj->ops->destroy(&ctx, obj);
index d7a9628..e8dbd83 100644 (file)
@@ -295,6 +295,7 @@ replay:
                        nfnl_unlock(subsys_id);
                        break;
                default:
+                       rcu_read_unlock();
                        err = -EINVAL;
                        break;
                }
index e8f8875..0fa2e20 100644 (file)
@@ -186,6 +186,8 @@ static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
 
                ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
                                sizeof(struct tcphdr), ctx->optsize, opts);
+               if (!ctx->optp)
+                       return NULL;
        }
 
        return tcp;
index 58f576a..7b3d0a7 100644 (file)
@@ -412,9 +412,17 @@ static void nft_rhash_destroy(const struct nft_set *set)
                                    (void *)set);
 }
 
+/* Number of buckets is stored in u32, so cap our result to 1U<<31 */
+#define NFT_MAX_BUCKETS (1U << 31)
+
 static u32 nft_hash_buckets(u32 size)
 {
-       return roundup_pow_of_two(size * 4 / 3);
+       u64 val = div_u64((u64)size * 4, 3);
+
+       if (val >= NFT_MAX_BUCKETS)
+               return NFT_MAX_BUCKETS;
+
+       return roundup_pow_of_two(val);
 }
 
 static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features,
@@ -615,7 +623,7 @@ static u64 nft_hash_privsize(const struct nlattr * const nla[],
                             const struct nft_set_desc *desc)
 {
        return sizeof(struct nft_hash) +
-              nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
+              (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
 }
 
 static int nft_hash_init(const struct nft_set *set,
@@ -655,8 +663,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
                return false;
 
        est->size   = sizeof(struct nft_hash) +
-                     nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
-                     desc->size * sizeof(struct nft_hash_elem);
+                     (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+                     (u64)desc->size * sizeof(struct nft_hash_elem);
        est->lookup = NFT_SET_CLASS_O_1;
        est->space  = NFT_SET_CLASS_O_N;
 
@@ -673,8 +681,8 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
                return false;
 
        est->size   = sizeof(struct nft_hash) +
-                     nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
-                     desc->size * sizeof(struct nft_hash_elem);
+                     (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+                     (u64)desc->size * sizeof(struct nft_hash_elem);
        est->lookup = NFT_SET_CLASS_O_1;
        est->space  = NFT_SET_CLASS_O_N;
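
Both problems in the set-hash hunks are 32-bit arithmetic on user-controlled sizes: size * 4 could overflow u32 inside nft_hash_buckets(), and the privsize/estimate products could overflow before being accumulated. The fix promotes to 64 bits first and caps the bucket count at 1U << 31 so it still fits the u32 the hash table expects. In miniature:

#include <stdint.h>

#define MAX_BUCKETS (1U << 31)

static uint32_t round_up_pow2(uint64_t v)       /* smallest pow2 >= v */
{
        uint64_t r = 1;

        while (r < v)
                r <<= 1;
        return (uint32_t)r;
}

static uint32_t hash_buckets(uint32_t size)
{
        uint64_t val = (uint64_t)size * 4 / 3;  /* 64-bit, can't wrap */

        if (val >= MAX_BUCKETS)
                return MAX_BUCKETS;             /* cap to stay in u32 */
        return round_up_pow2(val);
}
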
 
index 75625d1..498a0bf 100644 (file)
@@ -24,10 +24,9 @@ MODULE_ALIAS("ip6t_SECMARK");
 static u8 mode;
 
 static unsigned int
-secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info)
 {
        u32 secmark = 0;
-       const struct xt_secmark_target_info *info = par->targinfo;
 
        switch (mode) {
        case SECMARK_MODE_SEL:
@@ -41,7 +40,7 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
        return XT_CONTINUE;
 }
 
-static int checkentry_lsm(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info_v1 *info)
 {
        int err;
 
@@ -73,15 +72,15 @@ static int checkentry_lsm(struct xt_secmark_target_info *info)
        return 0;
 }
 
-static int secmark_tg_check(const struct xt_tgchk_param *par)
+static int
+secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info)
 {
-       struct xt_secmark_target_info *info = par->targinfo;
        int err;
 
-       if (strcmp(par->table, "mangle") != 0 &&
-           strcmp(par->table, "security") != 0) {
+       if (strcmp(table, "mangle") != 0 &&
+           strcmp(table, "security") != 0) {
                pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n",
-                                   par->table);
+                                   table);
                return -EINVAL;
        }
 
@@ -116,25 +115,76 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
        }
 }
 
-static struct xt_target secmark_tg_reg __read_mostly = {
-       .name       = "SECMARK",
-       .revision   = 0,
-       .family     = NFPROTO_UNSPEC,
-       .checkentry = secmark_tg_check,
-       .destroy    = secmark_tg_destroy,
-       .target     = secmark_tg,
-       .targetsize = sizeof(struct xt_secmark_target_info),
-       .me         = THIS_MODULE,
+static int secmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+       struct xt_secmark_target_info *info = par->targinfo;
+       struct xt_secmark_target_info_v1 newinfo = {
+               .mode   = info->mode,
+       };
+       int ret;
+
+       memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX);
+
+       ret = secmark_tg_check(par->table, &newinfo);
+       info->secid = newinfo.secid;
+
+       return ret;
+}
+
+static unsigned int
+secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+       const struct xt_secmark_target_info *info = par->targinfo;
+       struct xt_secmark_target_info_v1 newinfo = {
+               .secid  = info->secid,
+       };
+
+       return secmark_tg(skb, &newinfo);
+}
+
+static int secmark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+       return secmark_tg_check(par->table, par->targinfo);
+}
+
+static unsigned int
+secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+       return secmark_tg(skb, par->targinfo);
+}
+
+static struct xt_target secmark_tg_reg[] __read_mostly = {
+       {
+               .name           = "SECMARK",
+               .revision       = 0,
+               .family         = NFPROTO_UNSPEC,
+               .checkentry     = secmark_tg_check_v0,
+               .destroy        = secmark_tg_destroy,
+               .target         = secmark_tg_v0,
+               .targetsize     = sizeof(struct xt_secmark_target_info),
+               .me             = THIS_MODULE,
+       },
+       {
+               .name           = "SECMARK",
+               .revision       = 1,
+               .family         = NFPROTO_UNSPEC,
+               .checkentry     = secmark_tg_check_v1,
+               .destroy        = secmark_tg_destroy,
+               .target         = secmark_tg_v1,
+               .targetsize     = sizeof(struct xt_secmark_target_info_v1),
+               .usersize       = offsetof(struct xt_secmark_target_info_v1, secid),
+               .me             = THIS_MODULE,
+       },
 };
 
 static int __init secmark_tg_init(void)
 {
-       return xt_register_target(&secmark_tg_reg);
+       return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
 }
 
 static void __exit secmark_tg_exit(void)
 {
-       xt_unregister_target(&secmark_tg_reg);
+       xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
 }
 
 module_init(secmark_tg_init);
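
The conversion above follows the usual xtables pattern for extending a target's ABI: revision 0 keeps its userspace layout intact and is translated into the revision 1 layout at entry, while the shared logic only ever sees the v1 struct; v1 additionally sets `.usersize` so the kernel-resolved `secid` is excluded from userspace comparisons. A rough standalone illustration of the shim idea (struct layouts invented for the sketch; the real ones live in the xt_SECMARK uapi header):

#include <stdio.h>
#include <string.h>

struct info_v0 { unsigned char mode; unsigned int secid; char secctx[256]; };
struct info_v1 { unsigned char mode; char secctx[256]; unsigned int secid; };

static int check_v1(const char *table, struct info_v1 *info)
{
        (void)table;
        info->secid = 42;       /* stand-in for the LSM secctx -> secid lookup */
        return 0;
}

/* Revision 0 entry point: translate into the v1 layout, run the shared
 * logic, then copy the resolved id back into the old struct. */
static int check_v0(struct info_v0 *old)
{
        struct info_v1 tmp = { .mode = old->mode };
        int ret;

        memcpy(tmp.secctx, old->secctx, sizeof(tmp.secctx));
        ret = check_v1("mangle", &tmp);
        old->secid = tmp.secid;
        return ret;
}

int main(void)
{
        struct info_v0 old = { .mode = 1 };

        check_v0(&old);
        printf("secid resolved to %u\n", old.secid);
        return 0;
}
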
index a3b46f8..53dbe73 100644 (file)
@@ -109,12 +109,14 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
                                          GFP_KERNEL);
        if (!llcp_sock->service_name) {
                nfc_llcp_local_put(llcp_sock->local);
+               llcp_sock->local = NULL;
                ret = -ENOMEM;
                goto put_dev;
        }
        llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock);
        if (llcp_sock->ssap == LLCP_SAP_MAX) {
                nfc_llcp_local_put(llcp_sock->local);
+               llcp_sock->local = NULL;
                kfree(llcp_sock->service_name);
                llcp_sock->service_name = NULL;
                ret = -EADDRINUSE;
@@ -709,6 +711,7 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr,
        llcp_sock->ssap = nfc_llcp_get_local_ssap(local);
        if (llcp_sock->ssap == LLCP_SAP_MAX) {
                nfc_llcp_local_put(llcp_sock->local);
+               llcp_sock->local = NULL;
                ret = -ENOMEM;
                goto put_dev;
        }
@@ -756,6 +759,7 @@ sock_unlink:
 sock_llcp_release:
        nfc_llcp_put_ssap(local, llcp_sock->ssap);
        nfc_llcp_local_put(llcp_sock->local);
+       llcp_sock->local = NULL;
 
 put_dev:
        nfc_put_device(dev);
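
All four hunks in this file apply the same idiom: after dropping the reference taken on an error path, the stale pointer is cleared so a later release path cannot put it a second time. In miniature (standalone sketch):

#include <stdio.h>
#include <stdlib.h>

struct obj { int refcount; };

static void obj_put(struct obj *o)
{
        if (o && --o->refcount == 0) {
                printf("freed\n");
                free(o);
        }
}

struct holder { struct obj *local; };

static void bind_error_path(struct holder *h)
{
        obj_put(h->local);
        h->local = NULL;        /* the fix: the destructor's put is now a no-op */
}

int main(void)
{
        struct holder h = { .local = malloc(sizeof(struct obj)) };

        h.local->refcount = 1;
        bind_error_path(&h);
        obj_put(h.local);       /* destructor path: safe, pointer was cleared */
        return 0;
}
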
index 92a0b67..77d924a 100644 (file)
@@ -827,17 +827,17 @@ static void ovs_fragment(struct net *net, struct vport *vport,
        }
 
        if (key->eth.type == htons(ETH_P_IP)) {
-               struct dst_entry ovs_dst;
+               struct rtable ovs_rt = { 0 };
                unsigned long orig_dst;
 
                prepare_frag(vport, skb, orig_network_offset,
                             ovs_key_mac_proto(key));
-               dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
+               dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
                         DST_OBSOLETE_NONE, DST_NOCOUNT);
-               ovs_dst.dev = vport->dev;
+               ovs_rt.dst.dev = vport->dev;
 
                orig_dst = skb->_skb_refdst;
-               skb_dst_set_noref(skb, &ovs_dst);
+               skb_dst_set_noref(skb, &ovs_rt.dst);
                IPCB(skb)->frag_max_size = mru;
 
                ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
index e1e77d3..8c06381 100644 (file)
@@ -90,16 +90,16 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
        }
 
        if (skb_protocol(skb, true) == htons(ETH_P_IP)) {
-               struct dst_entry sch_frag_dst;
+               struct rtable sch_frag_rt = { 0 };
                unsigned long orig_dst;
 
                sch_frag_prepare_frag(skb, xmit);
-               dst_init(&sch_frag_dst, &sch_frag_dst_ops, NULL, 1,
+               dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1,
                         DST_OBSOLETE_NONE, DST_NOCOUNT);
-               sch_frag_dst.dev = skb->dev;
+               sch_frag_rt.dst.dev = skb->dev;
 
                orig_dst = skb->_skb_refdst;
-               skb_dst_set_noref(skb, &sch_frag_dst);
+               skb_dst_set_noref(skb, &sch_frag_rt.dst);
                IPCB(skb)->frag_max_size = mru;
 
                ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
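
Both fragmentation paths had the same bug: the IP layer treats an skb's dst as the `dst` member embedded in a `struct rtable` (e.g. via skb_rtable()), so handing it a bare on-stack `struct dst_entry` let it read past the end of the object. Embedding the dst in a zeroed on-stack rtable keeps that downcast in bounds. The shape of the problem in standalone form (types invented for the sketch):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct dst { void *dev; };
struct rtable { struct dst dst; int rt_flags; };  /* dst embedded at a known offset */

/* What the IP layer effectively does with the skb's dst. */
static int read_rt_flags(struct dst *d)
{
        struct rtable *rt = container_of(d, struct rtable, dst);
        return rt->rt_flags;    /* out of bounds if d is a bare dst on the stack */
}

int main(void)
{
        struct rtable rt = { 0 };       /* the fix: full container on the stack */

        printf("%d\n", read_rt_flags(&rt.dst));  /* safely reads the zeroed flag */
        return 0;
}
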
index 5f9a7c0..5b44d22 100644 (file)
@@ -858,11 +858,7 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
        struct sctp_chunk *retval;
        __u32 ctsn;
 
-       if (chunk && chunk->asoc)
-               ctsn = sctp_tsnmap_get_ctsn(&chunk->asoc->peer.tsn_map);
-       else
-               ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
-
+       ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
        shut.cum_tsn_ack = htonl(ctsn);
 
        retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
index 0948f14..ce15d59 100644 (file)
@@ -826,28 +826,6 @@ static void sctp_cmd_setup_t2(struct sctp_cmd_seq *cmds,
        asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto;
 }
 
-static void sctp_cmd_assoc_update(struct sctp_cmd_seq *cmds,
-                                 struct sctp_association *asoc,
-                                 struct sctp_association *new)
-{
-       struct net *net = asoc->base.net;
-       struct sctp_chunk *abort;
-
-       if (!sctp_assoc_update(asoc, new))
-               return;
-
-       abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr));
-       if (abort) {
-               sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0);
-               sctp_add_cmd_sf(cmds, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
-       }
-       sctp_add_cmd_sf(cmds, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED));
-       sctp_add_cmd_sf(cmds, SCTP_CMD_ASSOC_FAILED,
-                       SCTP_PERR(SCTP_ERROR_RSRC_LOW));
-       SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
-       SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
-}
-
 /* Helper function to change the state of an association. */
 static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds,
                               struct sctp_association *asoc,
@@ -1301,10 +1279,6 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,
                        sctp_endpoint_add_asoc(ep, asoc);
                        break;
 
-               case SCTP_CMD_UPDATE_ASSOC:
-                      sctp_cmd_assoc_update(commands, asoc, cmd->obj.asoc);
-                      break;
-
                case SCTP_CMD_PURGE_OUTQUEUE:
                       sctp_outq_teardown(&asoc->outqueue);
                       break;
index 7632714..fd1e319 100644 (file)
@@ -1773,6 +1773,30 @@ enum sctp_disposition sctp_sf_do_5_2_3_initack(
                return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
 }
 
+static int sctp_sf_do_assoc_update(struct sctp_association *asoc,
+                                  struct sctp_association *new,
+                                  struct sctp_cmd_seq *cmds)
+{
+       struct net *net = asoc->base.net;
+       struct sctp_chunk *abort;
+
+       if (!sctp_assoc_update(asoc, new))
+               return 0;
+
+       abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr));
+       if (abort) {
+               sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0);
+               sctp_add_cmd_sf(cmds, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+       }
+       sctp_add_cmd_sf(cmds, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED));
+       sctp_add_cmd_sf(cmds, SCTP_CMD_ASSOC_FAILED,
+                       SCTP_PERR(SCTP_ERROR_RSRC_LOW));
+       SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+       SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+
+       return -ENOMEM;
+}
+
 /* Unexpected COOKIE-ECHO handler for peer restart (Table 2, action 'A')
  *
  * Section 5.2.4
@@ -1852,20 +1876,22 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
        sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL());
 
-       repl = sctp_make_cookie_ack(new_asoc, chunk);
+       /* Update the content of current association. */
+       if (sctp_sf_do_assoc_update((struct sctp_association *)asoc, new_asoc, commands))
+               goto nomem;
+
+       repl = sctp_make_cookie_ack(asoc, chunk);
        if (!repl)
                goto nomem;
 
        /* Report association restart to upper layer. */
        ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0,
-                                            new_asoc->c.sinit_num_ostreams,
-                                            new_asoc->c.sinit_max_instreams,
+                                            asoc->c.sinit_num_ostreams,
+                                            asoc->c.sinit_max_instreams,
                                             NULL, GFP_ATOMIC);
        if (!ev)
                goto nomem_ev;
 
-       /* Update the content of current association. */
-       sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
        sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
        if ((sctp_state(asoc, SHUTDOWN_PENDING) ||
             sctp_state(asoc, SHUTDOWN_SENT)) &&
@@ -1925,14 +1951,17 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
        if (!sctp_auth_chunk_verify(net, chunk, new_asoc))
                return SCTP_DISPOSITION_DISCARD;
 
-       /* Update the content of current association.  */
-       sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_ESTABLISHED));
-       SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
+       if (asoc->state < SCTP_STATE_ESTABLISHED)
+               SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
 
-       repl = sctp_make_cookie_ack(new_asoc, chunk);
+       /* Update the content of current association.  */
+       if (sctp_sf_do_assoc_update((struct sctp_association *)asoc, new_asoc, commands))
+               goto nomem;
+
+       repl = sctp_make_cookie_ack(asoc, chunk);
        if (!repl)
                goto nomem;
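
The reordering in both dupcook handlers is the point of the change: the association update can fail, so it now runs as a direct call before the COOKIE ACK is built, and the ACK is built from the updated `asoc` rather than the temporary `new_asoc` that is about to be torn down. Schematically (standalone sketch, names invented):

#include <stdio.h>

struct assoc { int streams; };

static int assoc_update(struct assoc *cur, struct assoc *fresh)
{
        cur->streams = fresh->streams;  /* stand-in for sctp_assoc_update() */
        return 0;                       /* non-zero would mean resource failure */
}

static int send_cookie_ack(struct assoc *cur)
{
        printf("COOKIE ACK for assoc with %d streams\n", cur->streams);
        return 0;
}

/* After the patch: apply the fallible update first, then build the reply
 * from the *current* association so it reflects the merged state. */
static int handle_dup_cookie(struct assoc *cur, struct assoc *fresh)
{
        if (assoc_update(cur, fresh))
                return -1;              /* -ENOMEM path: abort already queued */
        return send_cookie_ack(cur);
}
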
 
index b7b9013..40f9f6c 100644 (file)
@@ -357,6 +357,18 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
        return af;
 }
 
+static void sctp_auto_asconf_init(struct sctp_sock *sp)
+{
+       struct net *net = sock_net(&sp->inet.sk);
+
+       if (net->sctp.default_auto_asconf) {
+               spin_lock(&net->sctp.addr_wq_lock);
+               list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist);
+               spin_unlock(&net->sctp.addr_wq_lock);
+               sp->do_auto_asconf = 1;
+       }
+}
+
 /* Bind a local address either to an endpoint or to an association.  */
 static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
 {
@@ -418,8 +430,10 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
                return -EADDRINUSE;
 
        /* Refresh ephemeral port.  */
-       if (!bp->port)
+       if (!bp->port) {
                bp->port = inet_sk(sk)->inet_num;
+               sctp_auto_asconf_init(sp);
+       }
 
        /* Add the address to the bind address list.
         * Use GFP_ATOMIC since BHs will be disabled.
@@ -1520,9 +1534,11 @@ static void sctp_close(struct sock *sk, long timeout)
 
        /* Supposedly, no process has access to the socket, but
         * the net layers still may.
+        * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
+        * held and that should be grabbed before socket lock.
         */
-       local_bh_disable();
-       bh_lock_sock(sk);
+       spin_lock_bh(&net->sctp.addr_wq_lock);
+       bh_lock_sock_nested(sk);
 
        /* Hold the sock, since sk_common_release() will put sock_put()
         * and we have just a little more cleanup.
@@ -1531,7 +1547,7 @@ static void sctp_close(struct sock *sk, long timeout)
        sk_common_release(sk);
 
        bh_unlock_sock(sk);
-       local_bh_enable();
+       spin_unlock_bh(&net->sctp.addr_wq_lock);
 
        sock_put(sk);
 
@@ -4991,16 +5007,6 @@ static int sctp_init_sock(struct sock *sk)
        sk_sockets_allocated_inc(sk);
        sock_prot_inuse_add(net, sk->sk_prot, 1);
 
-       if (net->sctp.default_auto_asconf) {
-               spin_lock(&sock_net(sk)->sctp.addr_wq_lock);
-               list_add_tail(&sp->auto_asconf_list,
-                   &net->sctp.auto_asconf_splist);
-               sp->do_auto_asconf = 1;
-               spin_unlock(&sock_net(sk)->sctp.addr_wq_lock);
-       } else {
-               sp->do_auto_asconf = 0;
-       }
-
        local_bh_enable();
 
        return 0;
@@ -5025,9 +5031,7 @@ static void sctp_destroy_sock(struct sock *sk)
 
        if (sp->do_auto_asconf) {
                sp->do_auto_asconf = 0;
-               spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
                list_del(&sp->auto_asconf_list);
-               spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
        }
        sctp_endpoint_free(sp->ep);
        local_bh_disable();
@@ -9398,6 +9402,8 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
                        return err;
        }
 
+       sctp_auto_asconf_init(newsp);
+
        /* Move any messages in the old socket's receive queue that are for the
         * peeled off association to the new socket's receive queue.
         */
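
The socket.c changes amount to one race fix: the auto_asconf list linkage now happens when the port is first bound (and again for peeled-off sockets in sctp_sock_migrate), under `addr_wq_lock`, and sctp_close() takes `addr_wq_lock` before the socket lock so that sctp_destroy_sock() can unlink without re-acquiring it. The invariant being enforced is plain lock ordering; a mutex rendering of it (standalone sketch, pthreads standing in for the kernel primitives):

#include <pthread.h>

static pthread_mutex_t addr_wq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sock_lock    = PTHREAD_MUTEX_INITIALIZER;

/* Every path that needs both locks takes them in the same order:
 * addr_wq_lock first, then the socket lock. */
static void close_path(void)
{
        pthread_mutex_lock(&addr_wq_lock);
        pthread_mutex_lock(&sock_lock);
        /* ... destroy_sock() may unlink from the asconf list here;
         * addr_wq_lock is already held ... */
        pthread_mutex_unlock(&sock_lock);
        pthread_mutex_unlock(&addr_wq_lock);
}

int main(void)
{
        close_path();
        return 0;
}
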
index be3e80b..5eff7cc 100644 (file)
@@ -2161,6 +2161,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
        struct smc_sock *smc;
        int val, rc;
 
+       if (level == SOL_TCP && optname == TCP_ULP)
+               return -EOPNOTSUPP;
+
        smc = smc_sk(sk);
 
        /* generic setsockopts reaching us here always apply to the
@@ -2185,7 +2188,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
        if (rc || smc->use_fallback)
                goto out;
        switch (optname) {
-       case TCP_ULP:
        case TCP_FASTOPEN:
        case TCP_FASTOPEN_CONNECT:
        case TCP_FASTOPEN_KEY:
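
Rejecting TCP_ULP outright, rather than forwarding it to the internal fallback TCP socket, closes a hole where userspace could attach another upper-layer protocol underneath an SMC socket. From userspace the effect is simply an early error; a hedged illustration (assumes a kernel with SMC and the kTLS ULP available, and headers new enough to define TCP_ULP):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(43 /* AF_SMC */, SOCK_STREAM, 0);

        if (fd < 0) {
                perror("socket");       /* kernel without SMC support */
                return 1;
        }
        /* With the fix, this fails with EOPNOTSUPP instead of being
         * forwarded to the internal TCP socket. */
        if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", 4) < 0)
                perror("setsockopt(TCP_ULP)");
        return 0;
}
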
index 612f0a6..f555d33 100644 (file)
@@ -1799,7 +1799,6 @@ call_allocate(struct rpc_task *task)
 
        status = xprt->ops->buf_alloc(task);
        trace_rpc_buf_alloc(task, status);
-       xprt_inject_disconnect(xprt);
        if (status == 0)
                return;
        if (status != -ENOMEM) {
@@ -2458,12 +2457,6 @@ call_decode(struct rpc_task *task)
        }
 
        /*
-        * Ensure that we see all writes made by xprt_complete_rqst()
-        * before it changed req->rq_reply_bytes_recvd.
-        */
-       smp_rmb();
-
-       /*
         * Did we ever call xprt_complete_rqst()? If not, we should assume
         * the message is incomplete.
         */
@@ -2471,6 +2464,11 @@ call_decode(struct rpc_task *task)
        if (!req->rq_reply_bytes_recvd)
                goto out;
 
+       /* Ensure that we see all writes made by xprt_complete_rqst()
+        * before it changed req->rq_reply_bytes_recvd.
+        */
+       smp_rmb();
+
        req->rq_rcv_buf.len = req->rq_private_buf.len;
        trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf);
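
The barrier move in call_decode() pairs with the writes done by xprt_complete_rqst(): a barrier issued *before* reading `rq_reply_bytes_recvd` orders nothing useful, so it now sits after the read, ensuring that once the flag is seen non-zero, the subsequent reads of the reply data are ordered behind it. The same acquire/release shape in portable C11 (standalone sketch):

#include <stdatomic.h>
#include <stddef.h>
#include <string.h>

static _Atomic size_t reply_bytes_recvd;
static char reply_buf[512];

/* Writer (xprt_complete_rqst() analogue): publish the data, then the flag. */
static void complete_rqst(const char *data, size_t len)
{
        memcpy(reply_buf, data, len);
        atomic_store_explicit(&reply_bytes_recvd, len, memory_order_release);
}

/* Reader (call_decode() analogue): check the flag first; the acquire load
 * orders all later reads of reply_buf after it. */
static size_t decode(char *out)
{
        size_t len = atomic_load_explicit(&reply_bytes_recvd,
                                          memory_order_acquire);
        if (!len)
                return 0;       /* reply not complete yet */
        memcpy(out, reply_buf, len);
        return len;
}

int main(void)
{
        char out[512];

        complete_rqst("reply", 5);
        return decode(out) == 5 ? 0 : 1;
}
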
 
index 38fe2ce..647b323 100644 (file)
@@ -344,13 +344,15 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename,
                                    const char *hostname,
                                    struct sockaddr *srvaddr, size_t salen,
                                    int proto, u32 version,
-                                   const struct cred *cred)
+                                   const struct cred *cred,
+                                   const struct rpc_timeout *timeo)
 {
        struct rpc_create_args args = {
                .net            = net,
                .protocol       = proto,
                .address        = srvaddr,
                .addrsize       = salen,
+               .timeout        = timeo,
                .servername     = hostname,
                .nodename       = nodename,
                .program        = &rpcb_program,
@@ -705,7 +707,8 @@ void rpcb_getport_async(struct rpc_task *task)
                                clnt->cl_nodename,
                                xprt->servername, sap, salen,
                                xprt->prot, bind_version,
-                               clnt->cl_cred);
+                               clnt->cl_cred,
+                               task->tk_client->cl_timeout);
        if (IS_ERR(rpcb_clnt)) {
                status = PTR_ERR(rpcb_clnt);
                goto bailout_nofree;
index d76dc9d..0de918c 100644 (file)
@@ -846,7 +846,8 @@ void
 svc_rqst_free(struct svc_rqst *rqstp)
 {
        svc_release_buffer(rqstp);
-       put_page(rqstp->rq_scratch_page);
+       if (rqstp->rq_scratch_page)
+               put_page(rqstp->rq_scratch_page);
        kfree(rqstp->rq_resp);
        kfree(rqstp->rq_argp);
        kfree(rqstp->rq_auth_data);
index 9eb5b6b..478f857 100644 (file)
@@ -1174,7 +1174,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
        tcp_sock_set_cork(svsk->sk_sk, true);
        err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
        xdr_free_bvec(xdr);
-       trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
+       trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent);
        if (err < 0 || sent != (xdr->len + sizeof(marker)))
                goto out_close;
        if (atomic_dec_and_test(&svsk->sk_sendqlen))
index 691ccf8..e5b5a96 100644 (file)
@@ -698,9 +698,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
        const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
        int status = 0;
 
-       if (time_before(jiffies, req->rq_minortimeo))
-               return status;
        if (time_before(jiffies, req->rq_majortimeo)) {
+               if (time_before(jiffies, req->rq_minortimeo))
+                       return status;
                if (to->to_exponential)
                        req->rq_timeout <<= 1;
                else
@@ -1352,6 +1352,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
                list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
                INIT_LIST_HEAD(&req->rq_xmit2);
 out:
+               atomic_long_inc(&xprt->xmit_queuelen);
                set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
                spin_unlock(&xprt->queue_lock);
        }
@@ -1381,6 +1382,7 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task)
                }
        } else
                list_del(&req->rq_xmit2);
+       atomic_long_dec(&req->rq_xprt->xmit_queuelen);
 }
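
The new `xmit_queuelen` counter, bumped on enqueue and dropped on dequeue above, gives the transport a cheap count of requests still queued for transmission; the xprtsock.c hunks later in this series read it to decide when to uncork the TCP socket, uncorking only when the request being sent is the last one queued. The counter pattern in C11 atomics (standalone sketch):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_long xmit_queuelen;

static void enqueue_transmit(void) { atomic_fetch_add(&xmit_queuelen, 1); }
static void dequeue_transmit(void) { atomic_fetch_sub(&xmit_queuelen, 1); }

/* Sender side: keep the socket corked while more requests are queued. */
static bool should_uncork(void)
{
        return atomic_load(&xmit_queuelen) == 1;  /* only us left in the queue */
}

int main(void)
{
        enqueue_transmit();
        bool uncork = should_uncork();
        dequeue_transmit();
        return uncork ? 0 : 1;
}
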
 
 /**
@@ -1469,8 +1471,6 @@ bool xprt_prepare_transmit(struct rpc_task *task)
        struct rpc_xprt *xprt = req->rq_xprt;
 
        if (!xprt_lock_write(xprt, task)) {
-               trace_xprt_transmit_queued(xprt, task);
-
                /* Race breaker: someone may have transmitted us */
                if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
                        rpc_wake_up_queued_task_set_status(&xprt->sending,
@@ -1483,7 +1483,10 @@ bool xprt_prepare_transmit(struct rpc_task *task)
 
 void xprt_end_transmit(struct rpc_task *task)
 {
-       xprt_release_write(task->tk_rqstp->rq_xprt, task);
+       struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+
+       xprt_inject_disconnect(xprt);
+       xprt_release_write(xprt, task);
 }
 
 /**
@@ -1537,8 +1540,10 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
                return status;
        }
 
-       if (is_retrans)
+       if (is_retrans) {
                task->tk_client->cl_stats->rpcretrans++;
+               trace_xprt_retransmit(req);
+       }
 
        xprt_inject_disconnect(xprt);
 
@@ -1885,7 +1890,6 @@ void xprt_release(struct rpc_task *task)
        spin_unlock(&xprt->transport_lock);
        if (req->rq_buffer)
                xprt->ops->buf_free(task);
-       xprt_inject_disconnect(xprt);
        xdr_free_bvec(&req->rq_rcv_buf);
        xdr_free_bvec(&req->rq_snd_buf);
        if (req->rq_cred != NULL)
index a249837..1151efd 100644 (file)
@@ -155,9 +155,11 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
 void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
 {
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_rep *rep = req->rl_reply;
        struct rpc_xprt *xprt = rqst->rq_xprt;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
-       rpcrdma_recv_buffer_put(req->rl_reply);
+       rpcrdma_rep_put(&r_xprt->rx_buf, rep);
        req->rl_reply = NULL;
 
        spin_lock(&xprt->bc_pa_lock);
index 766a104..229fcc9 100644 (file)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
-/**
- * frwr_release_mr - Destroy one MR
- * @mr: MR allocated by frwr_mr_init
- *
- */
-void frwr_release_mr(struct rpcrdma_mr *mr)
+static void frwr_cid_init(struct rpcrdma_ep *ep,
+                         struct rpcrdma_mr *mr)
 {
-       int rc;
+       struct rpc_rdma_cid *cid = &mr->mr_cid;
 
-       rc = ib_dereg_mr(mr->frwr.fr_mr);
-       if (rc)
-               trace_xprtrdma_frwr_dereg(mr, rc);
-       kfree(mr->mr_sg);
-       kfree(mr);
+       cid->ci_queue_id = ep->re_attr.send_cq->res.id;
+       cid->ci_completion_id = mr->mr_ibmr->res.id;
 }
 
 static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
@@ -75,20 +68,22 @@ static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
        }
 }
 
-static void frwr_mr_recycle(struct rpcrdma_mr *mr)
+/**
+ * frwr_mr_release - Destroy one MR
+ * @mr: MR allocated by frwr_mr_init
+ *
+ */
+void frwr_mr_release(struct rpcrdma_mr *mr)
 {
-       struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
-       trace_xprtrdma_mr_recycle(mr);
-
-       frwr_mr_unmap(r_xprt, mr);
+       int rc;
 
-       spin_lock(&r_xprt->rx_buf.rb_lock);
-       list_del(&mr->mr_all);
-       r_xprt->rx_stats.mrs_recycled++;
-       spin_unlock(&r_xprt->rx_buf.rb_lock);
+       frwr_mr_unmap(mr->mr_xprt, mr);
 
-       frwr_release_mr(mr);
+       rc = ib_dereg_mr(mr->mr_ibmr);
+       if (rc)
+               trace_xprtrdma_frwr_dereg(mr, rc);
+       kfree(mr->mr_sg);
+       kfree(mr);
 }
 
 static void frwr_mr_put(struct rpcrdma_mr *mr)
@@ -144,10 +139,11 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
                goto out_list_err;
 
        mr->mr_xprt = r_xprt;
-       mr->frwr.fr_mr = frmr;
+       mr->mr_ibmr = frmr;
        mr->mr_device = NULL;
        INIT_LIST_HEAD(&mr->mr_list);
-       init_completion(&mr->frwr.fr_linv_done);
+       init_completion(&mr->mr_linv_done);
+       frwr_cid_init(ep, mr);
 
        sg_init_table(sg, depth);
        mr->mr_sg = sg;
@@ -257,6 +253,7 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
        ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
        ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
        ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+       ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
        ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
 
        ep->re_max_rdma_segs =
@@ -326,7 +323,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
                goto out_dmamap_err;
        mr->mr_device = ep->re_id->device;
 
-       ibmr = mr->frwr.fr_mr;
+       ibmr = mr->mr_ibmr;
        n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
        if (n != dma_nents)
                goto out_mapmr_err;
@@ -336,7 +333,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
        key = (u8)(ibmr->rkey & 0x000000FF);
        ib_update_fast_reg_key(ibmr, ++key);
 
-       reg_wr = &mr->frwr.fr_regwr;
+       reg_wr = &mr->mr_regwr;
        reg_wr->mr = ibmr;
        reg_wr->key = ibmr->rkey;
        reg_wr->access = writing ?
@@ -364,29 +361,19 @@ out_mapmr_err:
  * @cq: completion queue
  * @wc: WCE for a completed FastReg WR
  *
+ * Each flushed MR gets destroyed after the QP has drained.
  */
 static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
-       struct rpcrdma_frwr *frwr =
-               container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+       struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
-       trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid);
-       /* The MR will get recycled when the associated req is retransmitted */
+       trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid);
 
        rpcrdma_flush_disconnect(cq->cq_context, wc);
 }
 
-static void frwr_cid_init(struct rpcrdma_ep *ep,
-                         struct rpcrdma_frwr *frwr)
-{
-       struct rpc_rdma_cid *cid = &frwr->fr_cid;
-
-       cid->ci_queue_id = ep->re_attr.send_cq->res.id;
-       cid->ci_completion_id = frwr->fr_mr->res.id;
-}
-
 /**
  * frwr_send - post Send WRs containing the RPC Call message
  * @r_xprt: controlling transport instance
@@ -403,27 +390,36 @@ static void frwr_cid_init(struct rpcrdma_ep *ep,
  */
 int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
+       struct ib_send_wr *post_wr, *send_wr = &req->rl_wr;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
-       struct ib_send_wr *post_wr;
        struct rpcrdma_mr *mr;
+       unsigned int num_wrs;
 
-       post_wr = &req->rl_wr;
+       num_wrs = 1;
+       post_wr = send_wr;
        list_for_each_entry(mr, &req->rl_registered, mr_list) {
-               struct rpcrdma_frwr *frwr;
-
-               frwr = &mr->frwr;
-
-               frwr->fr_cqe.done = frwr_wc_fastreg;
-               frwr_cid_init(ep, frwr);
-               frwr->fr_regwr.wr.next = post_wr;
-               frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
-               frwr->fr_regwr.wr.num_sge = 0;
-               frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
-               frwr->fr_regwr.wr.send_flags = 0;
+               trace_xprtrdma_mr_fastreg(mr);
+
+               mr->mr_cqe.done = frwr_wc_fastreg;
+               mr->mr_regwr.wr.next = post_wr;
+               mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
+               mr->mr_regwr.wr.num_sge = 0;
+               mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
+               mr->mr_regwr.wr.send_flags = 0;
+               post_wr = &mr->mr_regwr.wr;
+               ++num_wrs;
+       }
 
-               post_wr = &frwr->fr_regwr.wr;
+       if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) {
+               send_wr->send_flags |= IB_SEND_SIGNALED;
+               ep->re_send_count = min_t(unsigned int, ep->re_send_batch,
+                                         num_wrs - ep->re_send_count);
+       } else {
+               send_wr->send_flags &= ~IB_SEND_SIGNALED;
+               ep->re_send_count -= num_wrs;
        }
 
+       trace_xprtrdma_post_send(req);
        return ib_post_send(ep->re_id->qp, post_wr, NULL);
 }
 
@@ -440,6 +436,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
        list_for_each_entry(mr, mrs, mr_list)
                if (mr->mr_handle == rep->rr_inv_rkey) {
                        list_del_init(&mr->mr_list);
+                       trace_xprtrdma_mr_reminv(mr);
                        frwr_mr_put(mr);
                        break;  /* only one invalidated MR per RPC */
                }
@@ -447,9 +444,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
 
 static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
 {
-       if (wc->status != IB_WC_SUCCESS)
-               frwr_mr_recycle(mr);
-       else
+       if (likely(wc->status == IB_WC_SUCCESS))
                frwr_mr_put(mr);
 }
 
@@ -462,12 +457,10 @@ static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
 static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
-       struct rpcrdma_frwr *frwr =
-               container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-       struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+       struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
-       trace_xprtrdma_wc_li(wc, &frwr->fr_cid);
+       trace_xprtrdma_wc_li(wc, &mr->mr_cid);
        frwr_mr_done(wc, mr);
 
        rpcrdma_flush_disconnect(cq->cq_context, wc);
@@ -483,14 +476,12 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
 static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
-       struct rpcrdma_frwr *frwr =
-               container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-       struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+       struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
-       trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid);
+       trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid);
        frwr_mr_done(wc, mr);
-       complete(&frwr->fr_linv_done);
+       complete(&mr->mr_linv_done);
 
        rpcrdma_flush_disconnect(cq->cq_context, wc);
 }
@@ -511,7 +502,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
        struct ib_send_wr *first, **prev, *last;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
        const struct ib_send_wr *bad_wr;
-       struct rpcrdma_frwr *frwr;
        struct rpcrdma_mr *mr;
        int rc;
 
@@ -520,35 +510,34 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * Chain the LOCAL_INV Work Requests and post them with
         * a single ib_post_send() call.
         */
-       frwr = NULL;
        prev = &first;
        while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
 
                trace_xprtrdma_mr_localinv(mr);
                r_xprt->rx_stats.local_inv_needed++;
 
-               frwr = &mr->frwr;
-               frwr->fr_cqe.done = frwr_wc_localinv;
-               frwr_cid_init(ep, frwr);
-               last = &frwr->fr_invwr;
+               last = &mr->mr_invwr;
                last->next = NULL;
-               last->wr_cqe = &frwr->fr_cqe;
+               last->wr_cqe = &mr->mr_cqe;
                last->sg_list = NULL;
                last->num_sge = 0;
                last->opcode = IB_WR_LOCAL_INV;
                last->send_flags = IB_SEND_SIGNALED;
                last->ex.invalidate_rkey = mr->mr_handle;
 
+               last->wr_cqe->done = frwr_wc_localinv;
+
                *prev = last;
                prev = &last->next;
        }
+       mr = container_of(last, struct rpcrdma_mr, mr_invwr);
 
        /* Strong send queue ordering guarantees that when the
         * last WR in the chain completes, all WRs in the chain
         * are complete.
         */
-       frwr->fr_cqe.done = frwr_wc_localinv_wake;
-       reinit_completion(&frwr->fr_linv_done);
+       last->wr_cqe->done = frwr_wc_localinv_wake;
+       reinit_completion(&mr->mr_linv_done);
 
        /* Transport disconnect drains the receive CQ before it
         * replaces the QP. The RPC reply handler won't call us
@@ -562,22 +551,12 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * not happen, so don't wait in that case.
         */
        if (bad_wr != first)
-               wait_for_completion(&frwr->fr_linv_done);
+               wait_for_completion(&mr->mr_linv_done);
        if (!rc)
                return;
 
-       /* Recycle MRs in the LOCAL_INV chain that did not get posted.
-        */
+       /* On error, the MRs get destroyed once the QP has drained. */
        trace_xprtrdma_post_linv_err(req, rc);
-       while (bad_wr) {
-               frwr = container_of(bad_wr, struct rpcrdma_frwr,
-                                   fr_invwr);
-               mr = container_of(frwr, struct rpcrdma_mr, frwr);
-               bad_wr = bad_wr->next;
-
-               list_del_init(&mr->mr_list);
-               frwr_mr_recycle(mr);
-       }
 }
 
 /**
@@ -589,20 +568,24 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
-       struct rpcrdma_frwr *frwr =
-               container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-       struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
-       struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
+       struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
+       struct rpcrdma_rep *rep;
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
-       trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid);
-       frwr_mr_done(wc, mr);
+       trace_xprtrdma_wc_li_done(wc, &mr->mr_cid);
 
-       /* Ensure @rep is generated before frwr_mr_done */
+       /* Ensure that @rep is generated before the MR is released */
+       rep = mr->mr_req->rl_reply;
        smp_rmb();
-       rpcrdma_complete_rqst(rep);
 
-       rpcrdma_flush_disconnect(cq->cq_context, wc);
+       if (wc->status != IB_WC_SUCCESS) {
+               if (rep)
+                       rpcrdma_unpin_rqst(rep);
+               rpcrdma_flush_disconnect(cq->cq_context, wc);
+               return;
+       }
+       frwr_mr_put(mr);
+       rpcrdma_complete_rqst(rep);
 }
 
 /**
@@ -619,33 +602,29 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
        struct ib_send_wr *first, *last, **prev;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
-       const struct ib_send_wr *bad_wr;
-       struct rpcrdma_frwr *frwr;
        struct rpcrdma_mr *mr;
        int rc;
 
        /* Chain the LOCAL_INV Work Requests and post them with
         * a single ib_post_send() call.
         */
-       frwr = NULL;
        prev = &first;
        while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
 
                trace_xprtrdma_mr_localinv(mr);
                r_xprt->rx_stats.local_inv_needed++;
 
-               frwr = &mr->frwr;
-               frwr->fr_cqe.done = frwr_wc_localinv;
-               frwr_cid_init(ep, frwr);
-               last = &frwr->fr_invwr;
+               last = &mr->mr_invwr;
                last->next = NULL;
-               last->wr_cqe = &frwr->fr_cqe;
+               last->wr_cqe = &mr->mr_cqe;
                last->sg_list = NULL;
                last->num_sge = 0;
                last->opcode = IB_WR_LOCAL_INV;
                last->send_flags = IB_SEND_SIGNALED;
                last->ex.invalidate_rkey = mr->mr_handle;
 
+               last->wr_cqe->done = frwr_wc_localinv;
+
                *prev = last;
                prev = &last->next;
        }
@@ -655,31 +634,23 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * are complete. The last completion will wake up the
         * RPC waiter.
         */
-       frwr->fr_cqe.done = frwr_wc_localinv_done;
+       last->wr_cqe->done = frwr_wc_localinv_done;
 
        /* Transport disconnect drains the receive CQ before it
         * replaces the QP. The RPC reply handler won't call us
         * unless re_id->qp is a valid pointer.
         */
-       bad_wr = NULL;
-       rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
+       rc = ib_post_send(ep->re_id->qp, first, NULL);
        if (!rc)
                return;
 
-       /* Recycle MRs in the LOCAL_INV chain that did not get posted.
-        */
+       /* On error, the MRs get destroyed once the QP has drained. */
        trace_xprtrdma_post_linv_err(req, rc);
-       while (bad_wr) {
-               frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
-               mr = container_of(frwr, struct rpcrdma_mr, frwr);
-               bad_wr = bad_wr->next;
-
-               frwr_mr_recycle(mr);
-       }
 
        /* The final LOCAL_INV WR in the chain is supposed to
-        * do the wake. If it was never posted, the wake will
-        * not happen, so wake here in that case.
+        * do the wake. If it was never posted, the wake does
+        * not happen. Unpin the rqst in preparation for its
+        * retransmission.
         */
-       rpcrdma_complete_rqst(req->rl_reply);
+       rpcrdma_unpin_rqst(req->rl_reply);
 }
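
The refactor running through this file folds `struct rpcrdma_frwr` into `struct rpcrdma_mr`, so the completion handlers recover the MR with a single container_of() on the embedded `mr_cqe` instead of the old two-step frwr-then-mr dance. The recovery pattern, standalone:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct ib_cqe { void (*done)(struct ib_cqe *cqe); };

struct mr {
        int handle;
        struct ib_cqe mr_cqe;   /* embedded completion entry */
};

static void wc_done(struct ib_cqe *cqe)
{
        /* One step from the CQE back to its owning MR. */
        struct mr *m = container_of(cqe, struct mr, mr_cqe);

        printf("completed MR %d\n", m->handle);
}

int main(void)
{
        struct mr m = { .handle = 7 };

        m.mr_cqe.done = wc_done;
        m.mr_cqe.done(&m.mr_cqe);
        return 0;
}
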
index 292f066..649f7d8 100644 (file)
@@ -1326,9 +1326,35 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
        return -EIO;
 }
 
-/* Perform XID lookup, reconstruction of the RPC reply, and
- * RPC completion while holding the transport lock to ensure
- * the rep, rqst, and rq_task pointers remain stable.
+/**
+ * rpcrdma_unpin_rqst - Release rqst without completing it
+ * @rep: RPC/RDMA Receive context
+ *
+ * This is done when a connection is lost so that a Reply
+ * can be dropped and its matching Call can be subsequently
+ * retransmitted on a new connection.
+ */
+void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
+{
+       struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt;
+       struct rpc_rqst *rqst = rep->rr_rqst;
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+
+       req->rl_reply = NULL;
+       rep->rr_rqst = NULL;
+
+       spin_lock(&xprt->queue_lock);
+       xprt_unpin_rqst(rqst);
+       spin_unlock(&xprt->queue_lock);
+}
+
+/**
+ * rpcrdma_complete_rqst - Pass completed rqst back to RPC
+ * @rep: RPC/RDMA Receive context
+ *
+ * Reconstruct the RPC reply and complete the transaction
+ * while @rqst is still pinned to ensure the rep, rqst, and
+ * rq_task pointers remain stable.
  */
 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
 {
@@ -1430,13 +1456,14 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
                credits = 1;    /* don't deadlock */
        else if (credits > r_xprt->rx_ep->re_max_requests)
                credits = r_xprt->rx_ep->re_max_requests;
+       rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
+                          false);
        if (buf->rb_credits != credits)
                rpcrdma_update_cwnd(r_xprt, credits);
-       rpcrdma_post_recvs(r_xprt, false);
 
        req = rpcr_to_rdmar(rqst);
        if (unlikely(req->rl_reply))
-               rpcrdma_recv_buffer_put(req->rl_reply);
+               rpcrdma_rep_put(buf, req->rl_reply);
        req->rl_reply = rep;
        rep->rr_rqst = rqst;
 
@@ -1464,5 +1491,5 @@ out_shortreply:
        trace_xprtrdma_reply_short_err(rep);
 
 out:
-       rpcrdma_recv_buffer_put(rep);
+       rpcrdma_rep_put(buf, rep);
 }
index 056452c..d6bbafb 100644 (file)
@@ -921,42 +921,48 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
        __be32 *rdma_argp = rctxt->rc_recv_buf;
        struct svc_rdma_send_ctxt *sctxt;
+       unsigned int rc_size;
        __be32 *p;
        int ret;
 
        ret = -ENOTCONN;
        if (svc_xprt_is_dead(xprt))
-               goto err0;
+               goto drop_connection;
 
        ret = -ENOMEM;
        sctxt = svc_rdma_send_ctxt_get(rdma);
        if (!sctxt)
-               goto err0;
+               goto drop_connection;
 
+       ret = -EMSGSIZE;
        p = xdr_reserve_space(&sctxt->sc_stream,
                              rpcrdma_fixed_maxsz * sizeof(*p));
        if (!p)
-               goto err0;
+               goto put_ctxt;
 
        ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
        if (ret < 0)
-               goto err2;
+               goto reply_chunk;
+       rc_size = ret;
 
        *p++ = *rdma_argp;
        *p++ = *(rdma_argp + 1);
        *p++ = rdma->sc_fc_credits;
        *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg;
 
-       if (svc_rdma_encode_read_list(sctxt) < 0)
-               goto err0;
-       if (svc_rdma_encode_write_list(rctxt, sctxt) < 0)
-               goto err0;
-       if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
-               goto err0;
+       ret = svc_rdma_encode_read_list(sctxt);
+       if (ret < 0)
+               goto put_ctxt;
+       ret = svc_rdma_encode_write_list(rctxt, sctxt);
+       if (ret < 0)
+               goto put_ctxt;
+       ret = svc_rdma_encode_reply_chunk(rctxt, sctxt, rc_size);
+       if (ret < 0)
+               goto put_ctxt;
 
        ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
        if (ret < 0)
-               goto err1;
+               goto put_ctxt;
 
        /* Prevent svc_xprt_release() from releasing the page backing
         * rq_res.head[0].iov_base. It's no longer being accessed by
@@ -964,16 +970,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        rqstp->rq_respages++;
        return 0;
 
- err2:
+reply_chunk:
        if (ret != -E2BIG && ret != -EINVAL)
-               goto err1;
+               goto put_ctxt;
 
        svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
        return 0;
 
- err1:
+put_ctxt:
        svc_rdma_send_ctxt_put(rdma, sctxt);
- err0:
+drop_connection:
        trace_svcrdma_send_err(rqstp, ret);
        svc_xprt_deferred_close(&rdma->sc_xprt);
        return -ENOTCONN;
index 78d29d1..0995359 100644 (file)
@@ -262,8 +262,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
  * xprt_rdma_inject_disconnect - inject a connection fault
  * @xprt: transport context
  *
- * If @xprt is connected, disconnect it to simulate spurious connection
- * loss.
+ * If @xprt is connected, disconnect it to simulate spurious
+ * connection loss. Caller must hold @xprt's send lock to
+ * ensure that data structures and hardware resources are
+ * stable during the rdma_disconnect() call.
  */
 static void
 xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
index ec912cf..1e965a3 100644 (file)
@@ -101,6 +101,12 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
        struct rdma_cm_id *id = ep->re_id;
 
+       /* Wait for rpcrdma_post_recvs() to leave its critical
+        * section.
+        */
+       if (atomic_inc_return(&ep->re_receiving) > 1)
+               wait_for_completion(&ep->re_done);
+
        /* Flush Receives, then wait for deferred Reply work
         * to complete.
         */
@@ -114,22 +120,6 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
        rpcrdma_ep_put(ep);
 }
 
-/**
- * rpcrdma_qp_event_handler - Handle one QP event (error notification)
- * @event: details of the event
- * @context: ep that owns QP where event occurred
- *
- * Called from the RDMA provider (device driver) possibly in an interrupt
- * context. The QP is always destroyed before the ID, so the ID will be
- * reliably available when this handler is invoked.
- */
-static void rpcrdma_qp_event_handler(struct ib_event *event, void *context)
-{
-       struct rpcrdma_ep *ep = context;
-
-       trace_xprtrdma_qp_event(ep, event);
-}
-
 /* Ensure xprt_force_disconnect() is invoked exactly once when a
  * connection is closed or lost. (The important thing is it needs
  * to be invoked "at least" once).
@@ -205,7 +195,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 
 out_flushed:
        rpcrdma_flush_disconnect(r_xprt, wc);
-       rpcrdma_rep_destroy(rep);
+       rpcrdma_rep_put(&r_xprt->rx_buf, rep);
 }
 
 static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
@@ -414,6 +404,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
        __module_get(THIS_MODULE);
        device = id->device;
        ep->re_id = id;
+       reinit_completion(&ep->re_done);
 
        ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
        ep->re_inline_send = xprt_rdma_max_inline_write;
@@ -424,8 +415,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 
        r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
 
-       ep->re_attr.event_handler = rpcrdma_qp_event_handler;
-       ep->re_attr.qp_context = ep;
        ep->re_attr.srq = NULL;
        ep->re_attr.cap.max_inline_data = 0;
        ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -535,7 +524,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
         * outstanding Receives.
         */
        rpcrdma_ep_get(ep);
-       rpcrdma_post_recvs(r_xprt, true);
+       rpcrdma_post_recvs(r_xprt, 1, true);
 
        rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
        if (rc)
@@ -954,13 +943,11 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
                rpcrdma_req_reset(req);
 }
 
-/* No locking needed here. This function is called only by the
- * Receive completion handler.
- */
 static noinline
 struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
                                       bool temp)
 {
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
        struct rpcrdma_rep *rep;
 
        rep = kzalloc(sizeof(*rep), GFP_KERNEL);
@@ -987,7 +974,10 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
        rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
        rep->rr_recv_wr.num_sge = 1;
        rep->rr_temp = temp;
-       list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps);
+
+       spin_lock(&buf->rb_lock);
+       list_add(&rep->rr_all, &buf->rb_all_reps);
+       spin_unlock(&buf->rb_lock);
        return rep;
 
 out_free_regbuf:
@@ -998,16 +988,23 @@ out:
        return NULL;
 }
 
-/* No locking needed here. This function is invoked only by the
- * Receive completion handler, or during transport shutdown.
- */
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+static void rpcrdma_rep_free(struct rpcrdma_rep *rep)
 {
-       list_del(&rep->rr_all);
        rpcrdma_regbuf_free(rep->rr_rdmabuf);
        kfree(rep);
 }
 
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+{
+       struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf;
+
+       spin_lock(&buf->rb_lock);
+       list_del(&rep->rr_all);
+       spin_unlock(&buf->rb_lock);
+
+       rpcrdma_rep_free(rep);
+}
+
 static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
 {
        struct llist_node *node;
@@ -1019,12 +1016,21 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
        return llist_entry(node, struct rpcrdma_rep, rr_node);
 }
 
-static void rpcrdma_rep_put(struct rpcrdma_buffer *buf,
-                           struct rpcrdma_rep *rep)
+/**
+ * rpcrdma_rep_put - Release rpcrdma_rep back to free list
+ * @buf: buffer pool
+ * @rep: rep to release
+ *
+ */
+void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep)
 {
        llist_add(&rep->rr_node, &buf->rb_free_reps);
 }
 
+/* Caller must ensure the QP is quiescent (RQ is drained) before
+ * invoking this function, to guarantee rb_all_reps is not
+ * changing.
+ */
 static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
@@ -1032,7 +1038,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
 
        list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
                rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
-               rep->rr_temp = true;
+               rep->rr_temp = true;    /* Mark this rep for destruction */
        }
 }
 
@@ -1040,8 +1046,18 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
 {
        struct rpcrdma_rep *rep;
 
-       while ((rep = rpcrdma_rep_get_locked(buf)) != NULL)
-               rpcrdma_rep_destroy(rep);
+       spin_lock(&buf->rb_lock);
+       while ((rep = list_first_entry_or_null(&buf->rb_all_reps,
+                                              struct rpcrdma_rep,
+                                              rr_all)) != NULL) {
+               list_del(&rep->rr_all);
+               spin_unlock(&buf->rb_lock);
+
+               rpcrdma_rep_free(rep);
+
+               spin_lock(&buf->rb_lock);
+       }
+       spin_unlock(&buf->rb_lock);
 }
 
 /**
@@ -1104,7 +1120,7 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req)
                list_del(&mr->mr_all);
                spin_unlock(&buf->rb_lock);
 
-               frwr_release_mr(mr);
+               frwr_mr_release(mr);
        }
 
        rpcrdma_regbuf_free(req->rl_recvbuf);
@@ -1135,7 +1151,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
                list_del(&mr->mr_all);
                spin_unlock(&buf->rb_lock);
 
-               frwr_release_mr(mr);
+               frwr_mr_release(mr);
 
                spin_lock(&buf->rb_lock);
        }
@@ -1221,17 +1237,6 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
        spin_unlock(&buffers->rb_lock);
 }
 
-/**
- * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list
- * @rep: rep to release
- *
- * Used after error conditions.
- */
-void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
-{
-       rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep);
-}
-
 /* Returns a pointer to a rpcrdma_regbuf object, or NULL.
  *
  * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
@@ -1342,21 +1347,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
  */
 int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
-       struct ib_send_wr *send_wr = &req->rl_wr;
-       struct rpcrdma_ep *ep = r_xprt->rx_ep;
-       int rc;
-
-       if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) {
-               send_wr->send_flags |= IB_SEND_SIGNALED;
-               ep->re_send_count = ep->re_send_batch;
-       } else {
-               send_wr->send_flags &= ~IB_SEND_SIGNALED;
-               --ep->re_send_count;
-       }
-
-       trace_xprtrdma_post_send(req);
-       rc = frwr_send(r_xprt, req);
-       if (rc)
+       if (frwr_send(r_xprt, req))
                return -ENOTCONN;
        return 0;
 }
@@ -1364,27 +1355,30 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 /**
  * rpcrdma_post_recvs - Refill the Receive Queue
  * @r_xprt: controlling transport instance
- * @temp: mark Receive buffers to be deleted after use
+ * @needed: current credit grant
+ * @temp: mark Receive buffers to be deleted after one use
  *
  */
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
        struct ib_recv_wr *wr, *bad_wr;
        struct rpcrdma_rep *rep;
-       int needed, count, rc;
+       int count, rc;
 
        rc = 0;
        count = 0;
 
-       needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
        if (likely(ep->re_receive_count > needed))
                goto out;
        needed -= ep->re_receive_count;
        if (!temp)
                needed += RPCRDMA_MAX_RECV_BATCH;
 
+       if (atomic_inc_return(&ep->re_receiving) > 1)
+               goto out;
+
        /* fast path: all needed reps can be found on the free list */
        wr = NULL;
        while (needed) {
@@ -1410,6 +1404,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 
        rc = ib_post_recv(ep->re_id->qp, wr,
                          (const struct ib_recv_wr **)&bad_wr);
+       if (atomic_dec_return(&ep->re_receiving) > 0)
+               complete(&ep->re_done);
+
 out:
        trace_xprtrdma_post_recvs(r_xprt, count, rc);
        if (rc) {
@@ -1418,7 +1415,7 @@ out:
 
                        rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
                        wr = wr->next;
-                       rpcrdma_recv_buffer_put(rep);
+                       rpcrdma_rep_put(buf, rep);
                        --count;
                }
        }
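
The new `re_receiving` atomic plus the `re_done` completion is a lightweight exclusion protocol: only one thread posts receives at a time, and the drain path waits until any in-flight poster has left its critical section before flushing the receive queue. The kernel version is lock-free; a mutex/condvar rendering of the same handshake (standalone sketch):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static int posting;             /* analogue of ep->re_receiving */

static void post_recvs(void)
{
        pthread_mutex_lock(&lock);
        if (posting) {          /* someone else is already posting */
                pthread_mutex_unlock(&lock);
                return;
        }
        posting = 1;
        pthread_mutex_unlock(&lock);

        /* ... post receive WRs; the drainer will not flush under us ... */

        pthread_mutex_lock(&lock);
        posting = 0;
        pthread_cond_broadcast(&idle);  /* analogue of complete(&ep->re_done) */
        pthread_mutex_unlock(&lock);
}

static void drain(void)
{
        pthread_mutex_lock(&lock);
        while (posting)         /* analogue of wait_for_completion() */
                pthread_cond_wait(&idle, &lock);
        pthread_mutex_unlock(&lock);
        /* ... safe to flush the receive queue now ... */
}

int main(void)
{
        post_recvs();
        drain();
        return 0;
}
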
index fe3be98..436ad73 100644 (file)
@@ -83,6 +83,7 @@ struct rpcrdma_ep {
        unsigned int            re_max_inline_recv;
        int                     re_async_rc;
        int                     re_connect_status;
+       atomic_t                re_receiving;
        atomic_t                re_force_disconnect;
        struct ib_qp_init_attr  re_attr;
        wait_queue_head_t       re_connect_wait;
@@ -228,31 +229,28 @@ struct rpcrdma_sendctx {
  * An external memory region is any buffer or page that is registered
  * on the fly (ie, not pre-registered).
  */
-struct rpcrdma_frwr {
-       struct ib_mr                    *fr_mr;
-       struct ib_cqe                   fr_cqe;
-       struct rpc_rdma_cid             fr_cid;
-       struct completion               fr_linv_done;
-       union {
-               struct ib_reg_wr        fr_regwr;
-               struct ib_send_wr       fr_invwr;
-       };
-};
-
 struct rpcrdma_req;
 struct rpcrdma_mr {
        struct list_head        mr_list;
        struct rpcrdma_req      *mr_req;
+
+       struct ib_mr            *mr_ibmr;
        struct ib_device        *mr_device;
        struct scatterlist      *mr_sg;
        int                     mr_nents;
        enum dma_data_direction mr_dir;
-       struct rpcrdma_frwr     frwr;
+       struct ib_cqe           mr_cqe;
+       struct completion       mr_linv_done;
+       union {
+               struct ib_reg_wr        mr_regwr;
+               struct ib_send_wr       mr_invwr;
+       };
        struct rpcrdma_xprt     *mr_xprt;
        u32                     mr_handle;
        u32                     mr_length;
        u64                     mr_offset;
        struct list_head        mr_all;
+       struct rpc_rdma_cid     mr_cid;
 };
 
 /*
@@ -461,7 +459,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
 
 int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
 
 /*
  * Buffer calls - xprtrdma/verbs.c
@@ -480,7 +478,7 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
 void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
                        struct rpcrdma_req *req);
-void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep);
 
 bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size,
                            gfp_t flags);
@@ -527,7 +525,7 @@ rpcrdma_data_dir(bool writing)
 void frwr_reset(struct rpcrdma_req *req);
 int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
 int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
-void frwr_release_mr(struct rpcrdma_mr *mr);
+void frwr_mr_release(struct rpcrdma_mr *mr);
 struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
                                struct rpcrdma_mr_seg *seg,
                                int nsegs, bool writing, __be32 xid,
@@ -560,6 +558,7 @@ int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
 void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
 void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
+void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep);
 void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
 
 static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
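Editor's note: taken together, these header changes fold struct rpcrdma_frwr into struct rpcrdma_mr (fr_mr becomes mr_ibmr, fr_cqe becomes mr_cqe, fr_linv_done becomes mr_linv_done, and so on), so completion handlers can container_of() straight to the MR without an intermediate structure. The exported helpers are renamed to match (rpcrdma_rep_put(), frwr_mr_release()), and rpcrdma_post_recvs() now takes an explicit needed count from the caller rather than deriving one itself.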
index e35760f..47aa47a 100644 (file)
@@ -558,6 +558,10 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
        struct rpc_rqst *req;
        ssize_t ret;
 
+       /* Is this transport associated with the backchannel? */
+       if (!xprt->bc_serv)
+               return -ESHUTDOWN;
+
        /* Look up and lock the request corresponding to the given XID */
        req = xprt_lookup_bc_request(xprt, transport->recv.xid);
        if (!req) {
@@ -1018,6 +1022,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
         * to cope with writespace callbacks arriving _after_ we have
         * called sendmsg(). */
        req->rq_xtime = ktime_get();
+       tcp_sock_set_cork(transport->inet, true);
        while (1) {
                status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
                                           transport->xmit.offset, rm, &sent);
@@ -1032,6 +1037,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
                if (likely(req->rq_bytes_sent >= msglen)) {
                        req->rq_xmit_bytes_sent += transport->xmit.offset;
                        transport->xmit.offset = 0;
+                       if (atomic_long_read(&xprt->xmit_queuelen) == 1)
+                               tcp_sock_set_cork(transport->inet, false);
                        return 0;
                }
 
@@ -2163,6 +2170,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                }
 
                xs_tcp_set_socket_timeouts(xprt, sock);
+               tcp_sock_set_nodelay(sk);
 
                write_lock_bh(&sk->sk_callback_lock);
 
@@ -2177,7 +2185,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
                /* socket options */
                sock_reset_flag(sk, SOCK_LINGER);
-               tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
                xprt_clear_connected(xprt);
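Editor's note: the xprtsock changes switch from permanently disabling Nagle to request-scoped corking. TCP_NODELAY is set once at connect time, each RPC send is bracketed by a cork, and the cork is released once the transmit queue drains to a single request so the final segment flushes immediately. A hedged userspace analogue of the cork/uncork bracket (the kernel calls tcp_sock_set_cork() on its own struct sock directly):

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Sketch only: batch several buffers into one segment train, then flush. */
    static void send_record(int fd, const struct iovec *iov, int iovcnt)
    {
            int on = 1, off = 0;

            setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
            writev(fd, iov, iovcnt);   /* real code would loop on short writes */
            setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off)); /* flush */
    }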
 
index 1c9ecb1..c99bc4c 100644 (file)
@@ -944,8 +944,6 @@ static int vmci_transport_recv_listen(struct sock *sk,
        bool old_request = false;
        bool old_pkt_proto = false;
 
-       err = 0;
-
        /* Because we are in the listen state, we could be receiving a packet
         * for ourself or any previous connection requests that we received.
         * If it's the latter, we try to find a socket in our list of pending
index 2ac3802..9d2a89d 100644 (file)
@@ -128,13 +128,12 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
 static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
                                            struct xdp_desc *desc)
 {
-       u64 chunk, chunk_end;
+       u64 chunk;
 
-       chunk = xp_aligned_extract_addr(pool, desc->addr);
-       chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len);
-       if (chunk != chunk_end)
+       if (desc->len > pool->chunk_size)
                return false;
 
+       chunk = xp_aligned_extract_addr(pool, desc->addr);
        if (chunk >= pool->addrs_cnt)
                return false;
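Editor's note: the old validation compared the chunk containing addr with the chunk containing addr + len, which wrongly rejects a packet that exactly fills its chunk, because the end address lands on the next chunk boundary. The rewrite only requires the length to fit one chunk. A small standalone demonstration of the boundary arithmetic, with a hypothetical 2048-byte chunk size:

    #include <assert.h>
    #include <stdint.h>

    #define CHUNK_SIZE 2048u                        /* hypothetical chunk size */

    static uint64_t chunk_of(uint64_t addr) { return addr / CHUNK_SIZE; }

    int main(void)
    {
            uint64_t addr = 2048, len = 2048;       /* packet fills chunk #1 */

            /* old test: end address is the next chunk boundary, the chunks
             * differ, so a valid full-chunk descriptor was rejected */
            assert(chunk_of(addr) != chunk_of(addr + len));

            /* new test: the length merely has to fit within one chunk */
            assert(len <= CHUNK_SIZE);
            return 0;
    }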
 
index 2ed744c..d023816 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-cfag12864b-example
+/cfag12864b-example
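Editor's note: this and the following .gitignore updates all apply the same fix. A pattern without a slash matches at any depth below the directory holding the .gitignore, so `cfag12864b-example` would also hide a same-named file in any subdirectory, whereas the anchored `/cfag12864b-example` matches only the build artifact sitting next to the ignore file.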
index eb60241..8fa415a 100644 (file)
@@ -1 +1,2 @@
-binderfs_example
+# SPDX-License-Identifier: GPL-2.0
+/binderfs_example
index f9008be..37a657b 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * vim: noexpandtab ts=8 sts=0 sw=8:
- *
  * configfs_example_macros.c - This file is a demonstration module
  *      containing a number of configfs subsystems.  It uses the helper
  *      macros defined by configfs.h
index d86f2ff..0e26039 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-ucon
+/ucon
index d7a6074..5233ab6 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-hid-example
+/hid-example
index 331dcf1..c495664 100644 (file)
@@ -47,6 +47,10 @@ static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
        pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, cpsr = 0x%lx\n",
                p->symbol_name, p->addr, (long)regs->ARM_pc, (long)regs->ARM_cpsr);
 #endif
+#ifdef CONFIG_RISCV
+       pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, status = 0x%lx\n",
+               p->symbol_name, p->addr, regs->epc, regs->status);
+#endif
 #ifdef CONFIG_S390
        pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
                p->symbol_name, p->addr, regs->psw.addr, regs->flags);
@@ -80,6 +84,10 @@ static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs,
        pr_info("<%s> post_handler: p->addr = 0x%p, cpsr = 0x%lx\n",
                p->symbol_name, p->addr, (long)regs->ARM_cpsr);
 #endif
+#ifdef CONFIG_RISCV
+       pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
+               p->symbol_name, p->addr, regs->status);
+#endif
 #ifdef CONFIG_S390
        pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n",
                p->symbol_name, p->addr, regs->flags);
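Editor's note: the hunks above extend the kprobe example's pre/post handlers with a RISC-V branch that prints epc and status from struct pt_regs. For context, a trimmed-down sketch of how such handlers are wired up, modeled on samples/kprobes/kprobe_example.c; the probed symbol here is an assumption:

    #include <linux/kprobes.h>
    #include <linux/module.h>

    static struct kprobe kp = {
            .symbol_name = "kernel_clone",  /* assumed probe target */
    };

    static int handler_pre(struct kprobe *p, struct pt_regs *regs)
    {
            pr_info("pre: %s hit\n", p->symbol_name);
            return 0;
    }

    static void handler_post(struct kprobe *p, struct pt_regs *regs,
                             unsigned long flags)
    {
            pr_info("post: %s done\n", p->symbol_name);
    }

    static int __init kp_init(void)
    {
            kp.pre_handler = handler_pre;
            kp.post_handler = handler_post;
            return register_kprobe(&kp);
    }

    static void __exit kp_exit(void)
    {
            unregister_kprobe(&kp);
    }

    module_init(kp_init);
    module_exit(kp_exit);
    MODULE_LICENSE("GPL");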
index db5e802..fe894bc 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-mei-amt-version
+/mei-amt-version
index 8279341..6a718ee 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
-ne_ioctl_sample
+/ne_ioctl_sample
index eea857f..d4cfa31 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-pidfd-metadata
+/pidfd-metadata
index 4a5a5b7..a6df0da 100644 (file)
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-bpf-direct
-bpf-fancy
-dropper
-user-trap
+/bpf-direct
+/bpf-fancy
+/dropper
+/user-trap
index 40510c3..cd9ff7b 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-hpet_example
+/hpet_example
index 861c769..881ef9a 100644 (file)
@@ -513,8 +513,6 @@ static int mbochs_create(struct mdev_device *mdev)
        struct device *dev = mdev_dev(mdev);
        struct mdev_state *mdev_state;
 
-       if (!type)
-               type = &mbochs_types[0];
        if (type->mbytes + mbochs_used_mbytes > max_mbytes)
                return -ENOMEM;
 
index f0c0e72..e889c1c 100644 (file)
@@ -667,8 +667,7 @@ static ssize_t description_show(struct mdev_type *mtype,
                &mdpy_types[mtype_get_type_group_id(mtype)];
 
        return sprintf(buf, "virtual display, %dx%d framebuffer\n",
-                      type ? type->width  : 0,
-                      type ? type->height : 0);
+                      type->width, type->height);
 }
 static MDEV_TYPE_ATTR_RO(description);
 
index 8fdabf7..79212d9 100644 (file)
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-test-fsmount
-test-statx
+/test-fsmount
+/test-statx
index 2aa3c7e..823b351 100644 (file)
@@ -1 +1,2 @@
-watch_test
+# SPDX-License-Identifier: GPL-2.0-only
+/watch_test
index 74153b8..a70a015 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-watchdog-simple
+/watchdog-simple
index a6c1131..e83c620 100644 (file)
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0-only
-bin2c
-kallsyms
-unifdef
-recordmcount
-sorttable
-asn1_compiler
-extract-cert
-sign-file
-insert-sys-cert
+/asn1_compiler
+/bin2c
+/extract-cert
+/insert-sys-cert
+/kallsyms
 /module.lds
+/recordmcount
+/sign-file
+/sorttable
+/unifdef
index 5e39b05..949f723 100644 (file)
@@ -354,7 +354,7 @@ $(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE
 
 targets += $(filter-out $(subdir-builtin), $(real-obj-y))
 targets += $(filter-out $(subdir-modorder), $(real-obj-m))
-targets += $(lib-y) $(always-y) $(MAKECMDGOALS)
+targets += $(real-dtb-y) $(lib-y) $(always-y) $(MAKECMDGOALS)
 
 # Linker scripts preprocessor (.lds.S -> .lds)
 # ---------------------------------------------------------------------------
index 64daf37..1095055 100644 (file)
@@ -44,19 +44,22 @@ else
 obj-y          := $(filter-out %/, $(obj-y))
 endif
 
-# Expand $(foo-objs) $(foo-y) by calling $(call suffix-search,foo.o,-objs -y)
-suffix-search = $(strip $(foreach s, $2, $($(1:.o=$s))))
+# Expand $(foo-objs) $(foo-y) etc. by replacing their individuals
+suffix-search = $(strip $(foreach s, $3, $($(1:%$(strip $2)=%$s))))
+# List composite targets that are constructed by combining other targets
+multi-search = $(sort $(foreach m, $1, $(if $(call suffix-search, $m, $2, $3 -), $m)))
+# List primitive targets that are compiled from source files
+real-search = $(foreach m, $1, $(if $(call suffix-search, $m, $2, $3 -), $(call suffix-search, $m, $2, $3), $m))
+
 # If $(foo-objs), $(foo-y), $(foo-m), or $(foo-) exists, foo.o is a composite object
-multi-search = $(sort $(foreach m, $1, $(if $(call suffix-search, $m, $2 -), $m)))
-multi-obj-y := $(call multi-search,$(obj-y),-objs -y)
-multi-obj-m := $(call multi-search,$(obj-m),-objs -y -m)
+multi-obj-y := $(call multi-search, $(obj-y), .o, -objs -y)
+multi-obj-m := $(call multi-search, $(obj-m), .o, -objs -y -m)
 multi-obj-ym := $(multi-obj-y) $(multi-obj-m)
 
 # Replace multi-part objects by their individual parts,
 # including built-in.a from subdirectories
-real-search = $(foreach m, $1, $(if $(call suffix-search, $m, $2 -), $(call suffix-search, $m, $2), $m))
-real-obj-y := $(call real-search, $(obj-y),-objs -y)
-real-obj-m := $(call real-search, $(obj-m),-objs -y -m)
+real-obj-y := $(call real-search, $(obj-y), .o, -objs -y)
+real-obj-m := $(call real-search, $(obj-m), .o, -objs -y -m)
 
 always-y += $(always-m)
 
@@ -75,24 +78,18 @@ always-y += $(userprogs-always-y) $(userprogs-always-m)
 # If CONFIG_OF_ALL_DTBS is enabled, all DT blobs are built
 dtb-$(CONFIG_OF_ALL_DTBS)       += $(dtb-)
 
-# List all dtbs to be generated by fdtoverlay
-overlay-y := $(foreach m,$(dtb-y), $(if $(strip $($(m:.dtb=-dtbs))),$(m),))
-
-# Generate symbols for the base files so overlays can be applied to them.
-$(foreach m,$(overlay-y), $(eval DTC_FLAGS_$(basename $(firstword $($(m:.dtb=-dtbs)))) += -@))
-
-# Add base dtb and overlay dtbo
-dtb-y += $(foreach m,$(overlay-y), $($(m:.dtb=-dtbs)))
+# Composite DTB (i.e. DTB constructed by overlay)
+multi-dtb-y := $(call multi-search, $(dtb-y), .dtb, -dtbs)
+# Primitive DTB compiled from *.dts
+real-dtb-y := $(call real-search, $(dtb-y), .dtb, -dtbs)
+# Base DTB that overlay is applied onto (each first word of $(*-dtbs) expansion)
+base-dtb-y := $(foreach m, $(multi-dtb-y), $(firstword $(call suffix-search, $m, .dtb, -dtbs)))
 
 always-y                       += $(dtb-y)
 
 ifneq ($(CHECK_DTBS),)
-# Don't run schema checks for dtbs created by fdtoverlay as they don't
-# have corresponding dts files.
-dt-yaml-y := $(filter-out $(overlay-y),$(dtb-y))
-
-always-y += $(patsubst %.dtb,%.dt.yaml, $(dt-yaml-y))
-always-y += $(patsubst %.dtbo,%.dt.yaml, $(dt-yaml-y))
+always-y += $(patsubst %.dtb,%.dt.yaml, $(real-dtb-y))
+always-y += $(patsubst %.dtbo,%.dt.yaml, $(real-dtb-y))
 endif
 
 # Add subdir path
@@ -105,12 +102,14 @@ lib-y             := $(addprefix $(obj)/,$(lib-y))
 real-obj-y     := $(addprefix $(obj)/,$(real-obj-y))
 real-obj-m     := $(addprefix $(obj)/,$(real-obj-m))
 multi-obj-m    := $(addprefix $(obj)/, $(multi-obj-m))
+multi-dtb-y    := $(addprefix $(obj)/, $(multi-dtb-y))
+real-dtb-y     := $(addprefix $(obj)/, $(real-dtb-y))
 subdir-ym      := $(addprefix $(obj)/,$(subdir-ym))
 
 # Finds the multi-part object the current object will be linked into.
 # If the object belongs to two or more multi-part objects, list them all.
 modname-multi = $(sort $(foreach m,$(multi-obj-ym),\
-               $(if $(filter $*.o, $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m))),$(m:.o=))))
+               $(if $(filter $*.o, $(call suffix-search, $m, .o, -objs -y -m)),$(m:.o=))))
 
 __modname = $(if $(modname-multi),$(modname-multi),$(basetarget))
 
@@ -252,6 +251,9 @@ quiet_cmd_copy = COPY    $@
 
 # Shipped files
 # ===========================================================================
+# 'cp' preserves permissions. If you use it to copy a file in read-only srctree,
+# the copy would be read-only as well, leading to an error when executing the
+# rule next time. Use 'cat' instead in order to generate a writable file.
 
 quiet_cmd_shipped = SHIPPED $@
 cmd_shipped = cat $< > $@
@@ -319,6 +321,9 @@ endif
 
 DTC_FLAGS += $(DTC_FLAGS_$(basetarget))
 
+# Set -@ if the target is a base DTB that overlay is applied onto
+DTC_FLAGS += $(if $(filter $(patsubst $(obj)/%,%,$@), $(base-dtb-y)), -@)
+
 # Generate an assembly file to wrap the output of the device tree compiler
 quiet_cmd_dt_S_dtb= DTB     $@
 cmd_dt_S_dtb=                                          \
@@ -350,14 +355,12 @@ $(obj)/%.dtb: $(src)/%.dts $(DTC) FORCE
 $(obj)/%.dtbo: $(src)/%.dts $(DTC) FORCE
        $(call if_changed_dep,dtc)
 
-overlay-y := $(addprefix $(obj)/, $(overlay-y))
-
 quiet_cmd_fdtoverlay = DTOVL   $@
       cmd_fdtoverlay = $(objtree)/scripts/dtc/fdtoverlay -o $@ -i $(real-prereqs)
 
-$(overlay-y): FORCE
+$(multi-dtb-y): FORCE
        $(call if_changed,fdtoverlay)
-$(call multi_depend, $(overlay-y), .dtb, -dtbs)
+$(call multi_depend, $(multi-dtb-y), .dtb, -dtbs)
 
 DT_CHECKER ?= dt-validate
 DT_CHECKER_FLAGS ?= $(if $(DT_SCHEMA_FILES),,-m)
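Editor's note: suffix-search previously hard-coded the `.o` suffix; it now takes the suffix as an explicit argument, so `$(call suffix-search, foo.o, .o, -objs -y)` expands $(foo-objs) $(foo-y) while `$(call suffix-search, bar.dtb, .dtb, -dtbs)` expands $(bar-dtbs). That lets the same multi-search/real-search machinery classify composite objects and overlay-built DTBs alike: multi-dtb-y lists DTBs assembled by fdtoverlay, real-dtb-y lists those compiled from a .dts, and base DTBs (the first word of each $(*-dtbs) list) get the dtc -@ flag so overlays can later be applied to them.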
index 98ae1f5..961c91c 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-fixdep
+/fixdep
index ccb412a..23697a6 100755 (executable)
@@ -5829,7 +5829,7 @@ sub process {
                                next if ($arg =~ /\.\.\./);
                                next if ($arg =~ /^type$/i);
                                my $tmp_stmt = $define_stmt;
-                               $tmp_stmt =~ s/\b(sizeof|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g;
+                               $tmp_stmt =~ s/\b(__must_be_array|offsetof|sizeof|sizeof_field|__stringify|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g;
                                $tmp_stmt =~ s/\#+\s*$arg\b//g;
                                $tmp_stmt =~ s/\b$arg\s*\#\#//g;
                                my $use_cnt = () = $tmp_stmt =~ /\b$arg\b/g;
@@ -7006,7 +7006,7 @@ sub process {
                }
 
 # check for alloc argument mismatch
-               if ($line =~ /\b(kcalloc|kmalloc_array)\s*\(\s*sizeof\b/) {
+               if ($line =~ /\b((?:devm_)?(?:kcalloc|kmalloc_array))\s*\(\s*sizeof\b/) {
                        WARN("ALLOC_ARRAY_ARGS",
                             "$1 uses number as first arg, sizeof is generally wrong\n" . $herecurr);
                }
@@ -7198,6 +7198,17 @@ sub process {
                             "Using $1 should generally have parentheses around the comparison\n" . $herecurr);
                }
 
+# return sysfs_emit(foo, fmt, ...) fmt without newline
+               if ($line =~ /\breturn\s+sysfs_emit\s*\(\s*$FuncArg\s*,\s*($String)/ &&
+                   substr($rawline, $-[6], $+[6] - $-[6]) !~ /\\n"$/) {
+                       my $offset = $+[6] - 1;
+                       if (WARN("SYSFS_EMIT",
+                                "return sysfs_emit(...) formats should include a terminating newline\n" . $herecurr) &&
+                           $fix) {
+                               substr($fixed[$fixlinenr], $offset, 0) = '\\n';
+                       }
+               }
+
 # nested likely/unlikely calls
                if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) {
                        WARN("LIKELY_MISUSE",
index 8a8b62b..e0b5c1d 100644 (file)
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-dtc
-fdtoverlay
+/dtc
+/fdtoverlay
index b04e0f0..5cc385b 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-randomize_layout_seed.h
+/randomize_layout_seed.h
index 008e62f..15fc462 100644 (file)
@@ -16,6 +16,9 @@ import gdb
 from linux import tasks, utils
 
 
+task_type = utils.CachedType("struct task_struct")
+
+
 MAX_CPUS = 4096
 
 
@@ -156,6 +159,23 @@ Note that VAR has to be quoted as string."""
 
 PerCpu()
 
+def get_current_task(cpu):
+    task_ptr_type = task_type.get_type().pointer()
+
+    if utils.is_target_arch("x86"):
+         var_ptr = gdb.parse_and_eval("&current_task")
+         return per_cpu(var_ptr, cpu).dereference()
+    elif utils.is_target_arch("aarch64"):
+         current_task_addr = gdb.parse_and_eval("$SP_EL0")
+         if((current_task_addr >> 63) != 0):
+             current_task = current_task_addr.cast(task_ptr_type)
+             return current_task.dereference()
+         else:
+             raise gdb.GdbError("Sorry, obtaining the current task is not allowed "
+                                "while running in userspace(EL0)")
+    else:
+        raise gdb.GdbError("Sorry, obtaining the current task is not yet "
+                           "supported with this arch")
 
 class LxCurrentFunc(gdb.Function):
     """Return current task.
@@ -167,8 +187,7 @@ number. If CPU is omitted, the CPU of the current context is used."""
         super(LxCurrentFunc, self).__init__("lx_current")
 
     def invoke(self, cpu=-1):
-        var_ptr = gdb.parse_and_eval("&current_task")
-        return per_cpu(var_ptr, cpu).dereference()
+        return get_current_task(cpu)
 
 
 LxCurrentFunc()
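Editor's note: the aarch64 branch works because the arm64 kernel keeps the current task_struct pointer in SP_EL0 while running at EL1, and kernel virtual addresses occupy the upper half of the address space. Bit 63 being set therefore distinguishes a valid task pointer from a leftover userspace SP_EL0 value, in which case lx_current refuses rather than dereferencing garbage.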
index 1be9763..08d264a 100644 (file)
@@ -164,7 +164,8 @@ lx-symbols command."""
             saved_state['breakpoint'].enabled = saved_state['enabled']
 
     def invoke(self, arg, from_tty):
-        self.module_paths = [os.path.expanduser(p) for p in arg.split()]
+        self.module_paths = [os.path.abspath(os.path.expanduser(p))
+                             for p in arg.split()]
         self.module_paths.append(os.getcwd())
 
         # enforce update
index 999af71..0b275ab 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-genksyms
+/genksyms
index ce4f999..d6a422a 100644 (file)
@@ -22,7 +22,7 @@ $(obj)/pars%.tab.c $(obj)/pars%.tab.h: $(src)/pars%.y FORCE
 
 endif
 
-# -I needed for generated C source (shipped source)
+# -I needed for generated C source to include headers in source tree
 HOSTCFLAGS_parse.tab.o := -I $(srctree)/$(src)
 HOSTCFLAGS_lex.lex.o := -I $(srctree)/$(src)
 
index 2a85d34..4840e74 100755 (executable)
@@ -1777,6 +1777,7 @@ sub dump_function($$) {
     $prototype =~ s/^noinline +//;
     $prototype =~ s/__init +//;
     $prototype =~ s/__init_or_module +//;
+    $prototype =~ s/__deprecated +//;
     $prototype =~ s/__flatten +//;
     $prototype =~ s/__meminit +//;
     $prototype =~ s/__must_check +//;
index 7d11268..f4de4c9 100755 (executable)
@@ -320,20 +320,6 @@ cleanup()
        rm -f .vmlinux.d
 }
 
-on_exit()
-{
-       if [ $? -ne 0 ]; then
-               cleanup
-       fi
-}
-trap on_exit EXIT
-
-on_signals()
-{
-       exit 1
-}
-trap on_signals HUP INT QUIT TERM
-
 # Use "make V=1" to debug this script
 case "${KBUILD_VERBOSE}" in
 *1*)
index 07e4a39..0465ec3 100644 (file)
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-elfconfig.h
-mk_elfconfig
-modpost
-devicetable-offsets.h
+/devicetable-offsets.h
+/elfconfig.h
+/mk_elfconfig
+/modpost
index e8ce2a4..04c4b96 100644 (file)
@@ -44,7 +44,7 @@ generate_deps() {
                for source_file in $mod_source_files; do
                        sed '/MODULE_IMPORT_NS/Q' $source_file > ${source_file}.tmp
                        offset=$(wc -l ${source_file}.tmp | awk '{print $1;}')
-                       cat $source_file | grep MODULE_IMPORT_NS | LANG=C sort -u >> ${source_file}.tmp
+                       cat $source_file | grep MODULE_IMPORT_NS | LC_ALL=C sort -u >> ${source_file}.tmp
                        tail -n +$((offset +1)) ${source_file} | grep -v MODULE_IMPORT_NS >> ${source_file}.tmp
                        if ! diff -q ${source_file} ${source_file}.tmp; then
                                mv ${source_file}.tmp ${source_file}
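Editor's note: this is one of several LANG=C to LC_ALL=C conversions in this merge (see also recordmcount.pl, setlocalversion, tags.sh, and the mptcp selftest below). LC_ALL overrides both LANG and every individual LC_* variable, so `LC_ALL=C sort -u` reliably forces bytewise collation even when the user's environment exports, say, LC_COLLATE; `LANG=C` alone gives no such guarantee.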
index 936198a..221aa7d 100755 (executable)
@@ -125,6 +125,14 @@ case "${ARCH}" in
                        fi
                done
                ;;
+       riscv)
+               for i in Image.bz2 Image.gz Image; do
+                       if [ -f "${objtree}/arch/riscv/boot/${i}" ] ; then
+                               cp -v -- "${objtree}/arch/riscv/boot/${i}" "${tmpdir}/boot/vmlinux-${KERNELRELEASE}"
+                               break
+                       fi
+               done
+               ;;
        *)
                [ -f "${KBUILD_IMAGE}" ] && cp -v -- "${KBUILD_IMAGE}" "${tmpdir}/boot/vmlinux-kbuild-${KERNELRELEASE}"
                echo "" >&2
index a5429b3..c17e480 100755 (executable)
@@ -392,7 +392,7 @@ if ($arch eq "x86_64") {
     $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$";
 } elsif ($arch eq "riscv") {
     $function_regex = "^([0-9a-fA-F]+)\\s+<([^.0-9][0-9a-zA-Z_\\.]+)>:";
-    $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL\\s_mcount\$";
+    $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_?mcount\$";
     $type = ".quad";
     $alignment = 2;
 } elsif ($arch eq "nds32") {
@@ -497,7 +497,7 @@ sub update_funcs
 #
 # Step 2: find the sections and mcount call sites
 #
-open(IN, "LANG=C $objdump -hdr $inputfile|") || die "error running $objdump";
+open(IN, "LC_ALL=C $objdump -hdr $inputfile|") || die "error running $objdump";
 
 my $text;
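Editor's note: the widened RISC-V regex accepts both relocation spellings, since toolchains emit R_RISCV_CALL_PLT rather than R_RISCV_CALL for calls that may go through the PLT, and it now tolerates the mcount symbol with or without the leading underscore.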
 
diff --git a/scripts/remove-stale-files b/scripts/remove-stale-files
new file mode 100755 (executable)
index 0000000..c3eb81c
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+set -e
+
+# When you move, remove or rename generated files, you probably also update
+# .gitignore and cleaning rules in the Makefile. This is the right thing
+# to do. However, people usually do 'git pull', 'git bisect', etc. without
+# running 'make clean'. Then, the stale generated files are left over, often
+# causing build issues.
+#
+# Also, 'git status' shows such stale build artifacts as untracked files.
+# What is worse, some people send a wrong patch to get them back to .gitignore
+# without checking the commit history.
+#
+# So, when you (re)move generated files, please move the cleaning rules from
+# the Makefile to this script. This is run before Kbuild starts building
+# anything, so people will not be annoyed by such garbage files.
+#
+# This script is not intended to grow endlessly. Rather, it is a temporary scrap
+# yard. Stale files stay in this file for a while (for some release cycles?),
+# then will be really dead and removed from the code base entirely.
+
+# These were previously generated source files. When you are building the kernel
+# with O=, make sure to remove the stale files in the output tree. Otherwise,
+# the build system wrongly compiles the stale ones.
+if [ -n "${building_out_of_srctree}" ]; then
+       for f in fdt_rw.c fdt_ro.c fdt_wip.c fdt.c
+       do
+               rm -f arch/arm/boot/compressed/${f}
+       done
+fi
index bb709ed..db941f6 100755 (executable)
@@ -126,7 +126,7 @@ scm_version()
        fi
 
        # Check for svn and a svn repo.
-       if rev=$(LANG= LC_ALL= LC_MESSAGES=C svn info 2>/dev/null | grep '^Last Changed Rev'); then
+       if rev=$(LC_ALL=C svn info 2>/dev/null | grep '^Last Changed Rev'); then
                rev=$(echo $rev | awk '{print $NF}')
                printf -- '-svn%s' "$rev"
 
index 7beb426..7b6a012 100644 (file)
@@ -480,6 +480,7 @@ devided||divided
 deviece||device
 devision||division
 diable||disable
+diabled||disabled
 dicline||decline
 dictionnary||dictionary
 didnt||didn't
@@ -1027,6 +1028,8 @@ oustanding||outstanding
 overaall||overall
 overhread||overhead
 overlaping||overlapping
+overflw||overflow
+overlfow||overflow
 overide||override
 overrided||overridden
 overriden||overridden
index fd96734..db8ba41 100755 (executable)
@@ -326,5 +326,5 @@ esac
 
 # Remove structure forward declarations.
 if [ -n "$remove_structs" ]; then
-    LANG=C sed -i -e '/^\([a-zA-Z_][a-zA-Z0-9_]*\)\t.*\t\/\^struct \1;.*\$\/;"\tx$/d' $1
+    LC_ALL=C sed -i -e '/^\([a-zA-Z_][a-zA-Z0-9_]*\)\t.*\t\/\^struct \1;.*\$\/;"\tx$/d' $1
 fi
index a92acc7..1a8ee4f 100755 (executable)
@@ -47,7 +47,6 @@ BEGIN {
        printversion("Net-tools", version("ifconfig --version"))
        printversion("Kbd", version("loadkeys -V"))
        printversion("Console-tools", version("loadkeys -V"))
-       printversion("Oprofile", version("oprofiled --version"))
        printversion("Sh-utils", version("expr --v"))
        printversion("Udev", version("udevadm --version"))
        printversion("Wireless-tools", version("iwconfig --version"))
index 3998e17..b638fc2 100644 (file)
@@ -1204,11 +1204,17 @@ static const char *get_line_out_pfx(struct hda_codec *codec, int ch,
                *index = ch;
                return "Headphone";
        case AUTO_PIN_LINE_OUT:
-               /* This deals with the case where we have two DACs and
-                * one LO, one HP and one Speaker */
-               if (!ch && cfg->speaker_outs && cfg->hp_outs) {
-                       bool hp_lo_shared = !path_has_mixer(codec, spec->hp_paths[0], ctl_type);
-                       bool spk_lo_shared = !path_has_mixer(codec, spec->speaker_paths[0], ctl_type);
+               /* This deals with the case where one HP or one Speaker or
+                * one HP + one Speaker need to share the DAC with LO
+                */
+               if (!ch) {
+                       bool hp_lo_shared = false, spk_lo_shared = false;
+
+                       if (cfg->speaker_outs)
+                               spk_lo_shared = !path_has_mixer(codec,
+                                                               spec->speaker_paths[0], ctl_type);
+                       if (cfg->hp_outs)
+                               hp_lo_shared = !path_has_mixer(codec, spec->hp_paths[0], ctl_type);
                        if (hp_lo_shared && spk_lo_shared)
                                return spec->vmaster_mute.hook ? "PCM" : "Master";
                        if (hp_lo_shared)
index bd7bfd7..6d58f24 100644 (file)
@@ -4338,6 +4338,35 @@ static void alc245_fixup_hp_x360_amp(struct hda_codec *codec,
        }
 }
 
+/* toggle GPIO2 each time a stream is started; we use the PREPARE state for this */
+static void alc274_hp_envy_pcm_hook(struct hda_pcm_stream *hinfo,
+                                   struct hda_codec *codec,
+                                   struct snd_pcm_substream *substream,
+                                   int action)
+{
+       switch (action) {
+       case HDA_GEN_PCM_ACT_PREPARE:
+               alc_update_gpio_data(codec, 0x04, true);
+               break;
+       case HDA_GEN_PCM_ACT_CLEANUP:
+               alc_update_gpio_data(codec, 0x04, false);
+               break;
+       }
+}
+
+static void alc274_fixup_hp_envy_gpio(struct hda_codec *codec,
+                                     const struct hda_fixup *fix,
+                                     int action)
+{
+       struct alc_spec *spec = codec->spec;
+
+       if (action == HDA_FIXUP_ACT_PROBE) {
+               spec->gpio_mask |= 0x04;
+               spec->gpio_dir |= 0x04;
+               spec->gen.pcm_playback_hook = alc274_hp_envy_pcm_hook;
+       }
+}
+
 static void alc_update_coef_led(struct hda_codec *codec,
                                struct alc_coef_led *led,
                                bool polarity, bool on)
@@ -5695,6 +5724,18 @@ static void alc_fixup_tpt470_dacs(struct hda_codec *codec,
                spec->gen.preferred_dacs = preferred_pairs;
 }
 
+static void alc295_fixup_asus_dacs(struct hda_codec *codec,
+                                  const struct hda_fixup *fix, int action)
+{
+       static const hda_nid_t preferred_pairs[] = {
+               0x17, 0x02, 0x21, 0x03, 0
+       };
+       struct alc_spec *spec = codec->spec;
+
+       if (action == HDA_FIXUP_ACT_PRE_PROBE)
+               spec->gen.preferred_dacs = preferred_pairs;
+}
+
 static void alc_shutup_dell_xps13(struct hda_codec *codec)
 {
        struct alc_spec *spec = codec->spec;
@@ -6453,6 +6494,7 @@ enum {
        ALC255_FIXUP_XIAOMI_HEADSET_MIC,
        ALC274_FIXUP_HP_MIC,
        ALC274_FIXUP_HP_HEADSET_MIC,
+       ALC274_FIXUP_HP_ENVY_GPIO,
        ALC256_FIXUP_ASUS_HPE,
        ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK,
        ALC287_FIXUP_HP_GPIO_LED,
@@ -6463,6 +6505,8 @@ enum {
        ALC256_FIXUP_ACER_HEADSET_MIC,
        ALC285_FIXUP_IDEAPAD_S740_COEF,
        ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST,
+       ALC295_FIXUP_ASUS_DACS,
+       ALC295_FIXUP_HP_OMEN,
 };
 
 static const struct hda_fixup alc269_fixups[] = {
@@ -7894,6 +7938,10 @@ static const struct hda_fixup alc269_fixups[] = {
                .chained = true,
                .chain_id = ALC274_FIXUP_HP_MIC
        },
+       [ALC274_FIXUP_HP_ENVY_GPIO] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc274_fixup_hp_envy_gpio,
+       },
        [ALC256_FIXUP_ASUS_HPE] = {
                .type = HDA_FIXUP_VERBS,
                .v.verbs = (const struct hda_verb[]) {
@@ -7963,6 +8011,30 @@ static const struct hda_fixup alc269_fixups[] = {
                .chained = true,
                .chain_id = ALC285_FIXUP_HP_MUTE_LED,
        },
+       [ALC295_FIXUP_ASUS_DACS] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc295_fixup_asus_dacs,
+       },
+       [ALC295_FIXUP_HP_OMEN] = {
+               .type = HDA_FIXUP_PINS,
+               .v.pins = (const struct hda_pintbl[]) {
+                       { 0x12, 0xb7a60130 },
+                       { 0x13, 0x40000000 },
+                       { 0x14, 0x411111f0 },
+                       { 0x16, 0x411111f0 },
+                       { 0x17, 0x90170110 },
+                       { 0x18, 0x411111f0 },
+                       { 0x19, 0x02a11030 },
+                       { 0x1a, 0x411111f0 },
+                       { 0x1b, 0x04a19030 },
+                       { 0x1d, 0x40600001 },
+                       { 0x1e, 0x411111f0 },
+                       { 0x21, 0x03211020 },
+                       {}
+               },
+               .chained = true,
+               .chain_id = ALC269_FIXUP_HP_LINE1_MIC1_LED,
+       },
 };
 
 static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -8121,8 +8193,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x103c, 0x82c0, "HP G3 mini premium", ALC221_FIXUP_HP_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x103c, 0x83b9, "HP Spectre x360", ALC269_FIXUP_HP_MUTE_LED_MIC3),
        SND_PCI_QUIRK(0x103c, 0x8497, "HP Envy x360", ALC269_FIXUP_HP_MUTE_LED_MIC3),
+       SND_PCI_QUIRK(0x103c, 0x84da, "HP OMEN dc0019-ur", ALC295_FIXUP_HP_OMEN),
        SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3),
        SND_PCI_QUIRK(0x103c, 0x869d, "HP", ALC236_FIXUP_HP_MUTE_LED),
+       SND_PCI_QUIRK(0x103c, 0x86c7, "HP Envy AiO 32", ALC274_FIXUP_HP_ENVY_GPIO),
        SND_PCI_QUIRK(0x103c, 0x8724, "HP EliteBook 850 G7", ALC285_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8729, "HP", ALC285_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8730, "HP ProBook 445 G7", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
@@ -8161,6 +8235,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK),
        SND_PCI_QUIRK(0x1043, 0x1517, "Asus Zenbook UX31A", ALC269VB_FIXUP_ASUS_ZENBOOK_UX31A),
        SND_PCI_QUIRK(0x1043, 0x16e3, "ASUS UX50", ALC269_FIXUP_STEREO_DMIC),
+       SND_PCI_QUIRK(0x1043, 0x1740, "ASUS UX430UA", ALC295_FIXUP_ASUS_DACS),
        SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_DUAL_SPK),
        SND_PCI_QUIRK(0x1043, 0x1881, "ASUS Zephyrus S/M", ALC294_FIXUP_ASUS_GX502_PINS),
        SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC),
@@ -8524,6 +8599,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = {
        {.id = ALC255_FIXUP_XIAOMI_HEADSET_MIC, .name = "alc255-xiaomi-headset"},
        {.id = ALC274_FIXUP_HP_MIC, .name = "alc274-hp-mic-detect"},
        {.id = ALC245_FIXUP_HP_X360_AMP, .name = "alc245-hp-x360-amp"},
+       {.id = ALC295_FIXUP_HP_OMEN, .name = "alc295-hp-omen"},
        {}
 };
 #define ALC225_STANDARD_PINS \
@@ -8801,6 +8877,16 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
                {0x19, 0x03a11020},
                {0x21, 0x0321101f}),
        SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE,
+               {0x12, 0x90a60130},
+               {0x14, 0x90170110},
+               {0x19, 0x04a11040},
+               {0x21, 0x04211020}),
+       SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE,
+               {0x14, 0x90170110},
+               {0x19, 0x04a11040},
+               {0x1d, 0x40600001},
+               {0x21, 0x04211020}),
+       SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK,
                {0x14, 0x90170110},
                {0x19, 0x04a11040},
                {0x21, 0x04211020}),
@@ -8971,10 +9057,6 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = {
        SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB,
                {0x19, 0x40000000},
                {0x1a, 0x40000000}),
-       SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK,
-               {0x14, 0x90170110},
-               {0x19, 0x04a11040},
-               {0x21, 0x04211020}),
        {}
 };
 
index 646deb6..c5794e8 100644 (file)
@@ -337,6 +337,13 @@ static const struct usbmix_name_map bose_companion5_map[] = {
        { 0 }   /* terminator */
 };
 
+/* Sennheiser Communications Headset [PC 8]: the maximum dB value is erroneously reported as -6 */
+static const struct usbmix_dB_map sennheiser_pc8_dB = {-9500, 0};
+static const struct usbmix_name_map sennheiser_pc8_map[] = {
+       { 9, NULL, .dB = &sennheiser_pc8_dB },
+       { 0 }   /* terminator */
+};
+
 /*
  * Dell usb dock with ALC4020 codec had a firmware problem where it got
  * screwed up when zero volume is passed; just skip it as a workaround
@@ -593,6 +600,11 @@ static const struct usbmix_ctl_map usbmix_ctl_maps[] = {
                .id = USB_ID(0x17aa, 0x1046),
                .map = lenovo_p620_rear_map,
        },
+       {
+               /* Sennheiser Communications Headset [PC 8] */
+               .id = USB_ID(0x1395, 0x0025),
+               .map = sennheiser_pc8_map,
+       },
        { 0 } /* terminator */
 };
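Editor's note: usbmix_dB_map values are in the 1/100 dB units used by ALSA TLV data, so {-9500, 0} pins the volume range of control id 9 to -95..0 dB, overriding the bogus -6 dB maximum the device reports in its descriptors.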
 
index 16ed198..6481fd1 100644 (file)
@@ -2,6 +2,13 @@
 #ifndef _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_
 #define _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_
 
+extern unsigned long _find_next_bit(const unsigned long *addr1,
+               const unsigned long *addr2, unsigned long nbits,
+               unsigned long start, unsigned long invert, unsigned long le);
+extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
+
 #ifndef find_next_bit
 /**
  * find_next_bit - find the next set bit in a memory region
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
-               size, unsigned long offset);
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+                           unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+}
 #endif
 
 #ifndef find_next_and_bit
@@ -27,13 +48,26 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-extern unsigned long find_next_and_bit(const unsigned long *addr1,
+static inline
+unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
-               unsigned long offset);
+               unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr1 & *addr2 & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+}
 #endif
 
 #ifndef find_next_zero_bit
-
 /**
  * find_next_zero_bit - find the next cleared bit in a memory region
  * @addr: The address to base the search on
@@ -43,8 +77,22 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1,
  * Returns the bit number of the next zero bit
  * If no bits are zero, returns @size.
  */
+static inline
 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
-                                unsigned long offset);
+                                unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr | ~GENMASK(size - 1, offset);
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+}
 #endif
 
 #ifndef find_first_bit
@@ -57,8 +105,17 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
  * Returns the bit number of the first set bit.
  * If no bits are set, returns @size.
  */
-extern unsigned long find_first_bit(const unsigned long *addr,
-                                   unsigned long size);
+static inline
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr & GENMASK(size - 1, 0);
+
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_first_bit(addr, size);
+}
 
 #endif /* find_first_bit */
 
@@ -72,7 +129,17 @@ extern unsigned long find_first_bit(const unsigned long *addr,
  * Returns the bit number of the first cleared bit.
  * If no bits are zero, returns @size.
  */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size);
+static inline
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr | ~GENMASK(size - 1, 0);
+
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_first_zero_bit(addr, size);
+}
 #endif
 
 #endif /*_TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ */
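Editor's note: with the find_*_bit() entry points now inline, a compile-time size of at most BITS_PER_LONG reduces the whole search to a mask plus __ffs()/ffz() on a single word, with no out-of-line call. A hedged standalone sketch of that single-word fast path, with GENMASK() and __ffs() replaced by portable equivalents:

    #include <stdio.h>

    #define BITS_PER_LONG (8 * sizeof(long))

    /* portable stand-ins for GENMASK(size - 1, 0) and __ffs() */
    static unsigned long first_set(unsigned long word, unsigned long size)
    {
            unsigned long val = word & (~0UL >> (BITS_PER_LONG - size));

            return val ? (unsigned long)__builtin_ctzl(val) : size;
    }

    int main(void)
    {
            unsigned long map = 0x30;               /* bits 4 and 5 set */

            printf("first bit: %lu\n", first_set(map, 16)); /* prints 4 */
            return 0;
    }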
index 8f22830..2093d56 100644 (file)
@@ -18,4 +18,7 @@
 #define BITS_PER_LONG_LONG 64
 #endif
 
+#define small_const_nbits(nbits) \
+       (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
+
 #endif /* __ASM_GENERIC_BITS_PER_LONG */
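Editor's note: the extra `(nbits) > 0` condition matches the kernel's definition of small_const_nbits(). A zero-bit bitmap must not take the single-word fast path, since that path unconditionally reads or writes one word that the caller may never have allocated.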
index 477a1ca..330dbf7 100644 (file)
@@ -20,17 +20,9 @@ int __bitmap_equal(const unsigned long *bitmap1,
 void bitmap_clear(unsigned long *map, unsigned int start, int len);
 
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
+#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
-#define BITMAP_LAST_WORD_MASK(nbits)                                   \
-(                                                                      \
-       ((nbits) % BITS_PER_LONG) ?                                     \
-               (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL               \
-)
-
-#define small_const_nbits(nbits) \
-       (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
-
-static inline void bitmap_zero(unsigned long *dst, int nbits)
+static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
        if (small_const_nbits(nbits))
                *dst = 0UL;
@@ -66,7 +58,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
        return find_first_zero_bit(src, nbits) == nbits;
 }
 
-static inline int bitmap_weight(const unsigned long *src, int nbits)
+static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
 {
        if (small_const_nbits(nbits))
                return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
@@ -74,7 +66,7 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
 }
 
 static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
-                            const unsigned long *src2, int nbits)
+                            const unsigned long *src2, unsigned int nbits)
 {
        if (small_const_nbits(nbits))
                *dst = *src1 | *src2;
@@ -141,7 +133,7 @@ static inline void bitmap_free(unsigned long *bitmap)
  * @buf: buffer to store output
  * @size: size of @buf
  */
-size_t bitmap_scnprintf(unsigned long *bitmap, int nbits,
+size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits,
                        char *buf, size_t size);
 
 /**
index 5043747..f4e9147 100644 (file)
@@ -28,11 +28,11 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                dst[k] = bitmap1[k] | bitmap2[k];
 }
 
-size_t bitmap_scnprintf(unsigned long *bitmap, int nbits,
+size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits,
                        char *buf, size_t size)
 {
        /* current bit is 'cur', most recently seen range is [rbot, rtop] */
-       int cur, rbot, rtop;
+       unsigned int cur, rbot, rtop;
        bool first = true;
        size_t ret = 0;
 
index e7a8d84..1d80ad4 100644 (file)
@@ -202,9 +202,11 @@ static inline int roundup_len(__u32 len)
        return (len + 7) / 8 * 8;
 }
 
-static int ringbuf_process_ring(struct ring* r)
+static int64_t ringbuf_process_ring(struct ring* r)
 {
-       int *len_ptr, len, err, cnt = 0;
+       int *len_ptr, len, err;
+       /* 64-bit to avoid overflow in case of extreme application behavior */
+       int64_t cnt = 0;
        unsigned long cons_pos, prod_pos;
        bool got_new_data;
        void *sample;
@@ -244,12 +246,14 @@ done:
 }
 
 /* Consume available ring buffer(s) data without event polling.
- * Returns number of records consumed across all registered ring buffers, or
- * negative number if any of the callbacks return error.
+ * Returns number of records consumed across all registered ring buffers (or
+ * INT_MAX, whichever is less), or negative number if any of the callbacks
+ * return error.
  */
 int ring_buffer__consume(struct ring_buffer *rb)
 {
-       int i, err, res = 0;
+       int64_t err, res = 0;
+       int i;
 
        for (i = 0; i < rb->ring_cnt; i++) {
                struct ring *ring = &rb->rings[i];
@@ -259,18 +263,24 @@ int ring_buffer__consume(struct ring_buffer *rb)
                        return err;
                res += err;
        }
+       if (res > INT_MAX)
+               return INT_MAX;
        return res;
 }
 
 /* Poll for available data and consume records, if any are available.
- * Returns number of records consumed, or negative number, if any of the
- * registered callbacks returned error.
+ * Returns number of records consumed (or INT_MAX, whichever is less), or
+ * negative number, if any of the registered callbacks returned error.
  */
 int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
 {
-       int i, cnt, err, res = 0;
+       int i, cnt;
+       int64_t err, res = 0;
 
        cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms);
+       if (cnt < 0)
+               return -errno;
+
        for (i = 0; i < cnt; i++) {
                __u32 ring_id = rb->events[i].data.fd;
                struct ring *ring = &rb->rings[ring_id];
@@ -280,7 +290,9 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
                        return err;
                res += err;
        }
-       return cnt < 0 ? -errno : res;
+       if (res > INT_MAX)
+               return INT_MAX;
+       return res;
 }
 
 /* Get an fd that can be used to sleep until data is available in the ring(s) */
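Editor's note: a hedged usage sketch of the consumer-side libbpf API touched above; map_fd is assumed to be the fd of a BPF_MAP_TYPE_RINGBUF map. After this change a saturated INT_MAX return means "at least that many records", not an error, and epoll_wait() failures surface as -errno before any ring is touched:

    #include <limits.h>
    #include <stdio.h>
    #include <bpf/libbpf.h>

    static int handle(void *ctx, void *data, size_t len)
    {
            return 0;   /* a negative return aborts consumption with an error */
    }

    static int drain(int map_fd)  /* map_fd: fd of a BPF_MAP_TYPE_RINGBUF map */
    {
            struct ring_buffer *rb = ring_buffer__new(map_fd, handle, NULL, NULL);
            int n;

            if (!rb)
                    return -1;

            n = ring_buffer__poll(rb, 100 /* ms */);
            if (n == INT_MAX)           /* count saturated; not an error */
                    fprintf(stderr, "consumed at least INT_MAX records\n");

            ring_buffer__free(rb);
            return n;
    }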
index ac37022..109aa7f 100644 (file)
  *    searching it for one bits.
  *  - The optional "addr2", which is anded with "addr1" if present.
  */
-static inline unsigned long _find_next_bit(const unsigned long *addr1,
+unsigned long _find_next_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long nbits,
-               unsigned long start, unsigned long invert)
+               unsigned long start, unsigned long invert, unsigned long le)
 {
-       unsigned long tmp;
+       unsigned long tmp, mask;
+       (void) le;
 
        if (unlikely(start >= nbits))
                return nbits;
@@ -43,7 +44,19 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1,
        tmp ^= invert;
 
        /* Handle 1st word. */
-       tmp &= BITMAP_FIRST_WORD_MASK(start);
+       mask = BITMAP_FIRST_WORD_MASK(start);
+
+       /*
+        * tools has no swab() helper and this copy does not need
+        * little-endian search, so the byte-swapping path is compiled out
+        */
+#if (0)
+       if (le)
+               mask = swab(mask);
+#endif
+
+       tmp &= mask;
+
        start = round_down(start, BITS_PER_LONG);
 
        while (!tmp) {
@@ -57,18 +70,12 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1,
                tmp ^= invert;
        }
 
-       return min(start + __ffs(tmp), nbits);
-}
+#if (0)
+       if (le)
+               tmp = swab(tmp);
 #endif
 
-#ifndef find_next_bit
-/*
- * Find the next set bit in a memory region.
- */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-                           unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, 0UL);
+       return min(start + __ffs(tmp), nbits);
 }
 #endif
 
@@ -76,7 +83,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 /*
  * Find the first set bit in a memory region.
  */
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
 {
        unsigned long idx;
 
@@ -93,7 +100,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
 /*
  * Find the first cleared bit in a memory region.
  */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
 {
        unsigned long idx;
 
@@ -105,20 +112,3 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
        return size;
 }
 #endif
-
-#ifndef find_next_zero_bit
-unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
-                                unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, ~0UL);
-}
-#endif
-
-#ifndef find_next_and_bit
-unsigned long find_next_and_bit(const unsigned long *addr1,
-               const unsigned long *addr2, unsigned long size,
-               unsigned long offset)
-{
-       return _find_next_bit(addr1, addr2, size, offset, 0UL);
-}
-#endif
index 25adfec..f9271f3 100644 (file)
@@ -38,6 +38,7 @@ EXTRA_WARNINGS += -Wswitch-enum
 EXTRA_WARNINGS += -Wundef
 EXTRA_WARNINGS += -Wwrite-strings
 EXTRA_WARNINGS += -Wformat
+EXTRA_WARNINGS += -Wno-type-limits
 
 # Makefiles suck: This macro sets a default value of $(2) for the
 # variable named by $(1), unless the variable has been set by
index a958c22..dffbcaa 100644 (file)
@@ -43,6 +43,8 @@ void test_snprintf_positive(void)
        if (!ASSERT_OK_PTR(skel, "skel_open"))
                return;
 
+       skel->bss->pid = getpid();
+
        if (!ASSERT_OK(test_snprintf__attach(skel), "skel_attach"))
                goto cleanup;
 
index 951a030..e35129b 100644 (file)
@@ -5,6 +5,8 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
+__u32 pid = 0;
+
 char num_out[64] = {};
 long num_ret = 0;
 
@@ -42,6 +44,9 @@ int handler(const void *ctx)
        static const char str1[] = "str1";
        static const char longstr[] = "longstr";
 
+       if ((int)bpf_get_current_pid_tgid() != pid)
+               return 0;
+
        /* Integer types */
        num_ret  = BPF_SNPRINTF(num_out, sizeof(num_out),
                                "%d %u %x %li %llu %lX",
index ad7fabd..65ede50 100644 (file)
@@ -3449,4 +3449,48 @@ TEST(epoll63)
        close(sfd[1]);
 }
 
+/*
+ *        t0    t1
+ *     (ew) \  / (ew)
+ *           e0
+ *            | (lt)
+ *           s0
+ */
+TEST(epoll64)
+{
+       pthread_t waiter[2];
+       struct epoll_event e;
+       struct epoll_mtcontext ctx = { 0 };
+
+       signal(SIGUSR1, signal_handler);
+
+       ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+       ctx.efd[0] = epoll_create(1);
+       ASSERT_GE(ctx.efd[0], 0);
+
+       e.events = EPOLLIN;
+       ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+       /*
+        * main will act as the emitter once both waiter threads are
+        * blocked and expects to both be awoken upon the ready event.
+        */
+       ctx.main = pthread_self();
+       ASSERT_EQ(pthread_create(&waiter[0], NULL, waiter_entry1a, &ctx), 0);
+       ASSERT_EQ(pthread_create(&waiter[1], NULL, waiter_entry1a, &ctx), 0);
+
+       usleep(100000);
+       ASSERT_EQ(write(ctx.sfd[1], "w", 1), 1);
+
+       ASSERT_EQ(pthread_join(waiter[0], NULL), 0);
+       ASSERT_EQ(pthread_join(waiter[1], NULL), 0);
+
+       EXPECT_EQ(ctx.count, 2);
+
+       close(ctx.efd[0]);
+       close(ctx.sfd[0]);
+       close(ctx.sfd[1]);
+}
+
 TEST_HARNESS_MAIN
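Editor's note: because the epoll instance here is level-triggered (the "lt" edge in the diagram), the single byte written to s0 must wake both blocked waiter threads, which is why the test asserts ctx.count == 2 rather than 1.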
index 031ba3c..a0d0c83 100644 (file)
@@ -1890,7 +1890,6 @@ void sparsebit_validate_internal(struct sparsebit *s)
  */
 
 #include <stdlib.h>
-#include <assert.h>
 
 struct range {
        sparsebit_idx_t first, last;
index 5a1e85f..e541066 100644 (file)
@@ -14,7 +14,6 @@
 #include <sys/mman.h>
 #include <string.h>
 #include <fcntl.h>
-#include <string.h>
 
 #include "../kselftest.h"
 #include "../kselftest_harness.h"
index 9236609..3c4cb72 100755 (executable)
@@ -274,7 +274,7 @@ check_mptcp_disabled()
        ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0
 
        local err=0
-       LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \
+       LC_ALL=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \
                grep -q "^socket: Protocol not available$" && err=1
        ip netns delete ${disabled_ns}
 
index f85a093..48344a7 100644 (file)
@@ -33,7 +33,6 @@
 #include <sched.h>
 #include <time.h>
 #include <stdarg.h>
-#include <sched.h>
 #include <pthread.h>
 #include <signal.h>
 #include <sys/prctl.h>
index 8be8a03..1054e40 100644 (file)
@@ -12,6 +12,7 @@ TEST_GEN_PROGS += proc-self-map-files-001
 TEST_GEN_PROGS += proc-self-map-files-002
 TEST_GEN_PROGS += proc-self-syscall
 TEST_GEN_PROGS += proc-self-wchan
+TEST_GEN_PROGS += proc-subset-pid
 TEST_GEN_PROGS += proc-uptime-001
 TEST_GEN_PROGS += proc-uptime-002
 TEST_GEN_PROGS += read
diff --git a/tools/testing/selftests/proc/proc-subset-pid.c b/tools/testing/selftests/proc/proc-subset-pid.c
new file mode 100644 (file)
index 0000000..d1052bc
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2021 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that "mount -t proc -o subset=pid" hides everything but pids,
+ * /proc/self and /proc/thread-self.
+ */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdio.h>
+
+static inline bool streq(const char *a, const char *b)
+{
+       return strcmp(a, b) == 0;
+}
+
+static void make_private_proc(void)
+{
+       if (unshare(CLONE_NEWNS) == -1) {
+               if (errno == ENOSYS || errno == EPERM) {
+                       exit(4);
+               }
+               exit(1);
+       }
+       if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
+               exit(1);
+       }
+       if (mount(NULL, "/proc", "proc", 0, "subset=pid") == -1) {
+               exit(1);
+       }
+}
+
+static bool string_is_pid(const char *s)
+{
+       while (1) {
+               switch (*s++) {
+               case '0':case '1':case '2':case '3':case '4':
+               case '5':case '6':case '7':case '8':case '9':
+                       continue;
+
+               case '\0':
+                       return true;
+
+               default:
+                       return false;
+               }
+       }
+}
+
+int main(void)
+{
+       make_private_proc();
+
+       DIR *d = opendir("/proc");
+       assert(d);
+
+       struct dirent *de;
+
+       bool dot = false;
+       bool dot_dot = false;
+       bool self = false;
+       bool thread_self = false;
+
+       while ((de = readdir(d))) {
+               if (streq(de->d_name, ".")) {
+                       assert(!dot);
+                       dot = true;
+                       assert(de->d_type == DT_DIR);
+               } else if (streq(de->d_name, "..")) {
+                       assert(!dot_dot);
+                       dot_dot = true;
+                       assert(de->d_type == DT_DIR);
+               } else if (streq(de->d_name, "self")) {
+                       assert(!self);
+                       self = true;
+                       assert(de->d_type == DT_LNK);
+               } else if (streq(de->d_name, "thread-self")) {
+                       assert(!thread_self);
+                       thread_self = true;
+                       assert(de->d_type == DT_LNK);
+               } else {
+                       if (!string_is_pid(de->d_name)) {
+                               fprintf(stderr, "d_name '%s'\n", de->d_name);
+                               assert(0);
+                       }
+                       assert(de->d_type == DT_DIR);
+               }
+       }
+
+       char c;
+       int rv = readlink("/proc/cpuinfo", &c, 1);
+       assert(rv == -1 && errno == ENOENT);
+
+       int fd = open("/proc/cpuinfo", O_RDONLY);
+       assert(fd == -1 && errno == ENOENT);
+
+       return 0;
+}
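Editor's note: the test unshares a mount namespace and remounts with the equivalent of `mount -t proc -o subset=pid proc /proc`. Under that option only the numeric PID directories plus the self and thread-self symlinks remain visible, so non-process entries such as /proc/cpuinfo must fail with ENOENT for both open() and readlink().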
index b3ef9e1..35ee78d 100644 (file)
@@ -14,7 +14,7 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 // Test
-// 1) read of every file in /proc
+// 1) read and lseek on every file in /proc
 // 2) readlink of every symlink in /proc
 // 3) recursively (1) + (2) for every directory in /proc
 // 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs
@@ -45,6 +45,8 @@ static void f_reg(DIR *d, const char *filename)
        fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK);
        if (fd == -1)
                return;
+       /* struct proc_ops::proc_lseek is mandatory if file is seekable. */
+       (void)lseek(fd, 0, SEEK_SET);
        rv = read(fd, buf, sizeof(buf));
        assert((0 <= rv && rv <= sizeof(buf)) || rv == -1);
        close(fd);
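The new lseek() pins down a kernel-side rule: any seekable /proc file must wire up proc_ops::proc_lseek. Reduced to a single file, the user-visible invariant looks roughly like this sketch (illustrative; /proc/version is an arbitrary seekable example, not from the patch):

/* Sketch of the invariant: lseek on a /proc file must either succeed
 * or fail cleanly (e.g. -ESPIPE), never oops the kernel. */
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/version", O_RDONLY);

        if (fd >= 0) {
                off_t off = lseek(fd, 0, SEEK_SET);

                assert(off == 0 || (off == -1 && errno == ESPIPE));
                close(fd);
        }
        return 0;
}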
index 9a35c3f..1f651e8 100644 (file)
@@ -22,3 +22,4 @@ map_fixed_noreplace
 write_to_hugetlbfs
 hmm-tests
 local_config.*
+split_huge_page_test
index 8b0cd42..73e1cc9 100644 (file)
@@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += split_huge_page_test
 
 ifeq ($(MACHINE),x86_64)
 CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
index 6c6336d..1e662d5 100644 (file)
@@ -13,6 +13,7 @@
 
 /* Just the flags we need, copied from mm.h: */
 #define FOLL_WRITE     0x01    /* check pte is writable */
+#define FOLL_TOUCH     0x02    /* mark page accessed */
 
 static char *cmd_to_str(unsigned long cmd)
 {
@@ -37,13 +38,13 @@ int main(int argc, char **argv)
 {
        struct gup_test gup = { 0 };
        unsigned long size = 128 * MB;
-       int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
+       int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1;
        unsigned long cmd = GUP_FAST_BENCHMARK;
-       int flags = MAP_PRIVATE;
+       int flags = MAP_PRIVATE, touch = 0;
        char *file = "/dev/zero";
        char *p;
 
-       while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwSH")) != -1) {
+       while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHpz")) != -1) {
                switch (opt) {
                case 'a':
                        cmd = PIN_FAST_BENCHMARK;
@@ -65,9 +66,13 @@ int main(int argc, char **argv)
                         */
                        gup.which_pages[0] = 1;
                        break;
+               case 'p':
+                       /* works only with DUMP_USER_PAGES_TEST */
+                       gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
+                       break;
                case 'F':
                        /* strtol, so you can pass flags in hex form */
-                       gup.flags = strtol(optarg, 0, 0);
+                       gup.gup_flags = strtol(optarg, 0, 0);
                        break;
                case 'm':
                        size = atoi(optarg) * MB;
@@ -93,6 +98,9 @@ int main(int argc, char **argv)
                case 'w':
                        write = 1;
                        break;
+               case 'W':
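+                       /* force the read-only path (write defaults to 1 above) */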
+                       write = 0;
+                       break;
                case 'f':
                        file = optarg;
                        break;
@@ -103,6 +111,10 @@ int main(int argc, char **argv)
                case 'H':
                        flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
                        break;
+               case 'z':
+                       /* fault pages in from the kernel (gup), not from userland */
+                       touch = 1;
+                       break;
                default:
                        return -1;
                }
@@ -140,7 +152,7 @@ int main(int argc, char **argv)
 
        gup.nr_pages_per_call = nr_pages;
        if (write)
-               gup.flags |= FOLL_WRITE;
+               gup.gup_flags |= FOLL_WRITE;
 
        fd = open("/sys/kernel/debug/gup_test", O_RDWR);
        if (fd == -1) {
@@ -160,8 +172,18 @@ int main(int argc, char **argv)
        else if (thp == 0)
                madvise(p, size, MADV_NOHUGEPAGE);
 
-       for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
-               p[0] = 0;
+       /*
+        * FOLL_TOUCH, in gup_test, is used as an either/or case: either
+        * fault pages in from the kernel via FOLL_TOUCH, or fault them
+        * in here, from user space. This allows comparison of performance
+        * between those two cases.
+        */
+       if (touch) {
+               gup.gup_flags |= FOLL_TOUCH;
+       } else {
+               for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+                       p[0] = 0;
+       }
 
        /* Only report timing information on the *_BENCHMARK commands: */
        if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
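For orientation, every option parsed above lands in one struct gup_test that is handed to the kernel by a single ioctl() on the debugfs file. The sketch below shows that submission path in isolation; it is illustrative, not patch content, and assumes compilation from the gup_test.c directory so the relative gup_test.h include resolves:

/* Reduced sketch of the gup_test submission path. The flag values are
 * the FOLL_WRITE/FOLL_TOUCH constants copied above from mm.h. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include "../../../../mm/gup_test.h"

int main(void)
{
        struct gup_test gup = { .nr_pages_per_call = 1 };
        char *p;
        int fd;

        gup.size = 1UL << 21;                   /* one PMD-sized region */
        gup.gup_flags = 0x01 | 0x02;            /* FOLL_WRITE | FOLL_TOUCH */

        p = mmap(NULL, gup.size, PROT_READ | PROT_WRITE,
                 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (p == MAP_FAILED)
                return 1;
        gup.addr = (unsigned long)p;

        fd = open("/sys/kernel/debug/gup_test", O_RDWR);
        if (fd == -1 || ioctl(fd, GUP_FAST_BENCHMARK, &gup) == -1) {
                perror("gup_test");
                return 1;
        }
        return 0;
}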
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
new file mode 100644 (file)
index 0000000..1af16d2
--- /dev/null
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
+ * address range in a process via <debugfs>/split_huge_pages interface.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <malloc.h>
+#include <stdbool.h>
+
+uint64_t pagesize;
+unsigned int pageshift;
+uint64_t pmd_pagesize;
+
+#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
+#define SMAP_PATH "/proc/self/smaps"
+#define INPUT_MAX 80
+
+#define PID_FMT "%d,0x%lx,0x%lx"
+#define PATH_FMT "%s,0x%lx,0x%lx"
+
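+/*
+ * PID_FMT and PATH_FMT mirror the two input forms the debugfs knob accepts:
+ * "<pid>,<vaddr_start>,<vaddr_end>" and "<path>,<pgoff_start>,<pgoff_end>".
+ * pagemap entries keep the PFN in bits 0-54, and bit 22 of /proc/kpageflags
+ * marks a THP-backed page, hence the two masks below.
+ */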
+#define PFN_MASK     ((1UL<<55)-1)
+#define KPF_THP      (1UL<<22)
+
+int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+{
+       uint64_t paddr;
+       uint64_t page_flags;
+
+       if (pagemap_file) {
+               pread(pagemap_file, &paddr, sizeof(paddr),
+                       ((long)vaddr >> pageshift) * sizeof(paddr));
+
+               if (kpageflags_file) {
+                       pread(kpageflags_file, &page_flags, sizeof(page_flags),
+                               (paddr & PFN_MASK) * sizeof(page_flags));
+
+                       return !!(page_flags & KPF_THP);
+               }
+       }
+       return 0;
+}
+
+static uint64_t read_pmd_pagesize(void)
+{
+       int fd;
+       char buf[20];
+       ssize_t num_read;
+
+       fd = open(PMD_SIZE_PATH, O_RDONLY);
+       if (fd == -1) {
+               perror("Open hpage_pmd_size failed");
+               exit(EXIT_FAILURE);
+       }
+       num_read = read(fd, buf, 19);
+       if (num_read < 1) {
+               perror("Read hpage_pmd_size failed");
+               close(fd);
+               exit(EXIT_FAILURE);
+       }
+       buf[num_read] = '\0';
+       close(fd);
+
+       return strtoul(buf, NULL, 10);
+}
+
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+       int fd;
+       ssize_t numwritten;
+
+       fd = open(path, O_WRONLY);
+       if (fd == -1)
+               return 0;
+
+       numwritten = write(fd, buf, buflen - 1);
+       close(fd);
+       if (numwritten < 1)
+               return 0;
+
+       return (unsigned int) numwritten;
+}
+
+static void write_debugfs(const char *fmt, ...)
+{
+       char input[INPUT_MAX];
+       int ret;
+       va_list argp;
+
+       va_start(argp, fmt);
+       ret = vsnprintf(input, INPUT_MAX, fmt, argp);
+       va_end(argp);
+
+       if (ret >= INPUT_MAX) {
+               printf("%s: Debugfs input is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+       if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
+               perror(SPLIT_DEBUGFS);
+               exit(EXIT_FAILURE);
+       }
+}
+
+#define MAX_LINE_LENGTH 500
+
+static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
+{
+       while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
+               if (!strncmp(buf, pattern, strlen(pattern)))
+                       return true;
+       }
+       return false;
+}
+
+static uint64_t check_huge(void *addr)
+{
+       uint64_t thp = 0;
+       int ret;
+       FILE *fp;
+       char buffer[MAX_LINE_LENGTH];
+       char addr_pattern[MAX_LINE_LENGTH];
+
+       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+                      (unsigned long) addr);
+       if (ret >= MAX_LINE_LENGTH) {
+               printf("%s: Pattern is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+       fp = fopen(SMAP_PATH, "r");
+       if (!fp) {
+               printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
+               exit(EXIT_FAILURE);
+       }
+       if (!check_for_pattern(fp, addr_pattern, buffer))
+               goto err_out;
+
+       /*
+        * Fetch the AnonHugePages: in the same block and check the number of
+        * hugepages.
+        */
+       if (!check_for_pattern(fp, "AnonHugePages:", buffer))
+               goto err_out;
+
+       if (sscanf(buffer, "AnonHugePages:%10" SCNu64 " kB", &thp) != 1) {
+               printf("Failed to parse smaps\n");
+               exit(EXIT_FAILURE);
+       }
+
+err_out:
+       fclose(fp);
+       return thp;
+}
+
+void split_pmd_thp(void)
+{
+       char *one_page;
+       size_t len = 4 * pmd_pagesize;
+       uint64_t thp_size;
+       size_t i;
+
+       one_page = memalign(pmd_pagesize, len);
+
+       if (!one_page) {
+               printf("Fail to allocate memory\n");
+               exit(EXIT_FAILURE);
+       }
+
+       madvise(one_page, len, MADV_HUGEPAGE);
+
+       for (i = 0; i < len; i++)
+               one_page[i] = (char)i;
+
+       thp_size = check_huge(one_page);
+       if (!thp_size) {
+               printf("No THP is allocated\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /* split all THPs */
+       write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+               (uint64_t)one_page + len);
+
+       for (i = 0; i < len; i++)
+               if (one_page[i] != (char)i) {
+                       printf("%ld byte corrupted\n", i);
+                       exit(EXIT_FAILURE);
+               }
+
+       thp_size = check_huge(one_page);
+       if (thp_size) {
+               printf("Still %ld kB AnonHugePages not split\n", thp_size);
+               exit(EXIT_FAILURE);
+       }
+
+       printf("Split huge pages successful\n");
+       free(one_page);
+}
+
+void split_pte_mapped_thp(void)
+{
+       char *one_page, *pte_mapped, *pte_mapped2;
+       size_t len = 4 * pmd_pagesize;
+       uint64_t thp_size;
+       size_t i;
+       const char *pagemap_template = "/proc/%d/pagemap";
+       const char *kpageflags_proc = "/proc/kpageflags";
+       char pagemap_proc[255];
+       int pagemap_fd;
+       int kpageflags_fd;
+
+       if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
+               perror("get pagemap proc error");
+               exit(EXIT_FAILURE);
+       }
+       pagemap_fd = open(pagemap_proc, O_RDONLY);
+
+       if (pagemap_fd == -1) {
+               perror("read pagemap:");
+               exit(EXIT_FAILURE);
+       }
+
+       kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+
+       if (kpageflags_fd == -1) {
+               perror("read kpageflags:");
+               exit(EXIT_FAILURE);
+       }
+
+       one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
+                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+       if (one_page == MAP_FAILED) {
+               perror("mmap");
+               exit(EXIT_FAILURE);
+       }
+
+       madvise(one_page, len, MADV_HUGEPAGE);
+
+       for (i = 0; i < len; i++)
+               one_page[i] = (char)i;
+
+       thp_size = check_huge(one_page);
+       if (!thp_size) {
+               printf("No THP is allocated\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /* remap the first pagesize of first THP */
+       pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
+       if (pte_mapped == MAP_FAILED) {
+               perror("mremap failed");
+               exit(EXIT_FAILURE);
+       }
+
+       /* remap the Nth pagesize of Nth THP */
+       for (i = 1; i < 4; i++) {
+               pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
+                                    pagesize, pagesize,
+                                    MREMAP_MAYMOVE|MREMAP_FIXED,
+                                    pte_mapped + pagesize * i);
+               if (pte_mapped2 == MAP_FAILED) {
+                       perror("mremap failed");
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       /* smap does not show THPs after mremap, use kpageflags instead */
+       thp_size = 0;
+       for (i = 0; i < pagesize * 4; i++)
+               if (i % pagesize == 0 &&
+                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+                       thp_size++;
+
+       if (thp_size != 4) {
+               printf("Some THPs are missing during mremap\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /* split all remapped THPs */
+       write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
+                     (uint64_t)pte_mapped + pagesize * 4);
+
+       /* smap does not show THPs after mremap, use kpageflags instead */
+       thp_size = 0;
+       for (i = 0; i < pagesize * 4; i++) {
+               if (pte_mapped[i] != (char)i) {
+                       printf("%ld byte corrupted\n", i);
+                       exit(EXIT_FAILURE);
+               }
+               if (i % pagesize == 0 &&
+                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+                       thp_size++;
+       }
+
+       if (thp_size) {
+               printf("Still %ld THPs not split\n", thp_size);
+               exit(EXIT_FAILURE);
+       }
+
+       printf("Split PTE-mapped huge pages successful\n");
+       munmap(one_page, len);
+       close(pagemap_fd);
+       close(kpageflags_fd);
+}
+
+void split_file_backed_thp(void)
+{
+       int status;
+       int fd;
+       ssize_t num_written;
+       char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
+       const char *tmpfs_loc = mkdtemp(tmpfs_template);
+       char testfile[INPUT_MAX];
+       uint64_t pgoff_start = 0, pgoff_end = 1024;
+
+       printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n");
+
+       status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
+
+       if (status) {
+               printf("Unable to create a tmpfs for testing\n");
+               exit(EXIT_FAILURE);
+       }
+
+       status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
+       if (status >= INPUT_MAX) {
+               printf("Fail to create file-backed THP split testing file\n");
+               goto cleanup;
+       }
+
+       fd = open(testfile, O_CREAT|O_WRONLY, 0664);
+       if (fd == -1) {
+               perror("Cannot open testing file");
+               goto cleanup;
+       }
+
+       /* write something to the file, so a file-backed THP can be allocated */
+       num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1);
+       close(fd);
+
+       if (num_written < 1) {
+               printf("Fail to write data to testing file\n");
+               goto cleanup;
+       }
+
+       /* split the file-backed THP */
+       write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end);
+
+       status = unlink(testfile);
+       if (status)
+               perror("Cannot remove testing file\n");
+
+cleanup:
+       status = umount(tmpfs_loc);
+       if (status) {
+               printf("Unable to umount %s\n", tmpfs_loc);
+               exit(EXIT_FAILURE);
+       }
+       status = rmdir(tmpfs_loc);
+       if (status) {
+               perror("cannot remove tmp dir");
+               exit(EXIT_FAILURE);
+       }
+
+       printf("file-backed THP split test done, please check dmesg for more information\n");
+}
+
+int main(int argc, char **argv)
+{
+       if (geteuid() != 0) {
+               printf("Please run the benchmark as root\n");
+               exit(EXIT_FAILURE);
+       }
+
+       pagesize = getpagesize();
+       pageshift = ffs(pagesize) - 1;
+       pmd_pagesize = read_pmd_pagesize();
+
+       split_pmd_thp();
+       split_pte_mapped_thp();
+       split_file_backed_thp();
+
+       return 0;
+}
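The same debugfs knob can be driven without the harness; a minimal sketch using the PID_FMT form from above (illustrative only; the range here is a placeholder and should point at a real THP-backed mapping of the calling process):

/* Illustration only: split any THPs in [start, end) of this process by
 * hand-writing the "<pid>,<start>,<end>" triple shown above. */
#include <stdio.h>
#include <unistd.h>

static int split_range(unsigned long start, unsigned long end)
{
        FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");
        int ret;

        if (!f)
                return -1;
        ret = fprintf(f, "%d,0x%lx,0x%lx", getpid(), start, end);
        if (fclose(f) || ret < 0)
                return -1;
        return 0;
}

int main(void)
{
        /* placeholder range; substitute a real THP-backed region */
        return split_range(1UL << 30, (1UL << 30) + (2UL << 20)) ? 1 : 0;
}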
index 92b8ec4..f5ab5e0 100644 (file)
@@ -81,6 +81,8 @@ static volatile bool test_uffdio_copy_eexist = true;
 static volatile bool test_uffdio_zeropage_eexist = true;
 /* Whether to test uffd write-protection */
 static bool test_uffdio_wp = false;
+/* Whether to test uffd minor faults */
+static bool test_uffdio_minor = false;
 
 static bool map_shared;
 static int huge_fd;
@@ -96,6 +98,7 @@ struct uffd_stats {
        int cpu;
        unsigned long missing_faults;
        unsigned long wp_faults;
+       unsigned long minor_faults;
 };
 
 /* pthread_mutex_t starts at page offset 0 */
@@ -153,17 +156,19 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats,
                uffd_stats[i].cpu = i;
                uffd_stats[i].missing_faults = 0;
                uffd_stats[i].wp_faults = 0;
+               uffd_stats[i].minor_faults = 0;
        }
 }
 
 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
 {
        int i;
-       unsigned long long miss_total = 0, wp_total = 0;
+       unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
 
        for (i = 0; i < n_cpus; i++) {
                miss_total += stats[i].missing_faults;
                wp_total += stats[i].wp_faults;
+               minor_total += stats[i].minor_faults;
        }
 
        printf("userfaults: %llu missing (", miss_total);
@@ -172,6 +177,9 @@ static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
        printf("\b), %llu wp (", wp_total);
        for (i = 0; i < n_cpus; i++)
                printf("%lu+", stats[i].wp_faults);
+       printf("\b), %llu minor (", minor_total);
+       for (i = 0; i < n_cpus; i++)
+               printf("%lu+", stats[i].minor_faults);
        printf("\b)\n");
 }
 
@@ -328,7 +336,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = {
 };
 
 static struct uffd_test_ops hugetlb_uffd_test_ops = {
-       .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+       .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
        .allocate_area  = hugetlb_allocate_area,
        .release_pages  = hugetlb_release_pages,
        .alias_mapping = hugetlb_alias_mapping,
@@ -362,6 +370,22 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
        }
 }
 
+static void continue_range(int ufd, __u64 start, __u64 len)
+{
+       struct uffdio_continue req;
+
+       req.range.start = start;
+       req.range.len = len;
+       req.mode = 0;
+
+       if (ioctl(ufd, UFFDIO_CONTINUE, &req)) {
+               fprintf(stderr,
+                       "UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n",
+                       (uint64_t)start);
+               exit(1);
+       }
+}
+
 static void *locking_thread(void *arg)
 {
        unsigned long cpu = (unsigned long) arg;
@@ -569,8 +593,32 @@ static void uffd_handle_page_fault(struct uffd_msg *msg,
        }
 
        if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+               /* Write protect page faults */
                wp_range(uffd, msg->arg.pagefault.address, page_size, false);
                stats->wp_faults++;
+       } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
+               uint8_t *area;
+               int b;
+
+               /*
+                * Minor page faults
+                *
+                * To prove we can modify the original range for testing
+                * purposes, we're going to bit flip this range before
+                * continuing.
+                *
+                * Note that this requires all minor page fault tests operate on
+                * area_dst (non-UFFD-registered) and area_dst_alias
+                * (UFFD-registered).
+                */
+
+               area = (uint8_t *)(area_dst +
+                                  ((char *)msg->arg.pagefault.address -
+                                   area_dst_alias));
+               for (b = 0; b < page_size; ++b)
+                       area[b] = ~area[b];
+               continue_range(uffd, msg->arg.pagefault.address, page_size);
+               stats->minor_faults++;
        } else {
                /* Missing page faults */
                if (bounces & BOUNCE_VERIFY &&
@@ -779,7 +827,7 @@ static int stress(struct uffd_stats *uffd_stats)
        return 0;
 }
 
-static int userfaultfd_open(int features)
+static int userfaultfd_open_ext(uint64_t *features)
 {
        struct uffdio_api uffdio_api;
 
@@ -792,7 +840,7 @@ static int userfaultfd_open(int features)
        uffd_flags = fcntl(uffd, F_GETFD, NULL);
 
        uffdio_api.api = UFFD_API;
-       uffdio_api.features = features;
+       uffdio_api.features = *features;
        if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
                fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
                        "run with either root or ptrace capability.\n");
@@ -804,9 +852,15 @@ static int userfaultfd_open(int features)
                return 1;
        }
 
+       *features = uffdio_api.features;
        return 0;
 }
 
+static int userfaultfd_open(uint64_t features)
+{
+       return userfaultfd_open_ext(&features);
+}
+
 sigjmp_buf jbuf, *sigbuf;
 
 static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
@@ -1112,7 +1166,7 @@ static int userfaultfd_events_test(void)
        }
 
        if (!pid)
-               return faulting_process(0);
+               exit(faulting_process(0));
 
        waitpid(pid, &err, 0);
        if (err) {
@@ -1215,6 +1269,102 @@ static int userfaultfd_sig_test(void)
        return userfaults != 0;
 }
 
+static int userfaultfd_minor_test(void)
+{
+       struct uffdio_register uffdio_register;
+       unsigned long expected_ioctls;
+       unsigned long p;
+       pthread_t uffd_mon;
+       uint8_t expected_byte;
+       void *expected_page;
+       char c;
+       struct uffd_stats stats = { 0 };
+       uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS;
+
+       if (!test_uffdio_minor)
+               return 0;
+
+       printf("testing minor faults: ");
+       fflush(stdout);
+
+       if (uffd_test_ops->release_pages(area_dst))
+               return 1;
+
+       if (userfaultfd_open_ext(&features))
+               return 1;
+       /* If kernel reports the feature isn't supported, skip the test. */
+       if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
+               printf("skipping test due to lack of feature support\n");
+               fflush(stdout);
+               return 0;
+       }
+
+       uffdio_register.range.start = (unsigned long)area_dst_alias;
+       uffdio_register.range.len = nr_pages * page_size;
+       uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
+       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+               fprintf(stderr, "register failure\n");
+               exit(1);
+       }
+
+       expected_ioctls = uffd_test_ops->expected_ioctls;
+       expected_ioctls |= 1 << _UFFDIO_CONTINUE;
+       if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
+               fprintf(stderr, "unexpected missing ioctl(s)\n");
+               exit(1);
+       }
+
+       /*
+        * After registering with UFFD, populate the non-UFFD-registered side of
+        * the shared mapping. This should *not* trigger any UFFD minor faults.
+        */
+       for (p = 0; p < nr_pages; ++p) {
+               memset(area_dst + (p * page_size), p % ((uint8_t)-1),
+                      page_size);
+       }
+
+       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
+               perror("uffd_poll_thread create");
+               exit(1);
+       }
+
+       /*
+        * Read each of the pages back using the UFFD-registered mapping. We
+        * expect that the first time we touch a page, it will result in a minor
+        * fault. uffd_poll_thread will resolve the fault by bit-flipping the
+        * page's contents, and then issuing a CONTINUE ioctl.
+        */
+
+       if (posix_memalign(&expected_page, page_size, page_size)) {
+               fprintf(stderr, "out of memory\n");
+               return 1;
+       }
+
+       for (p = 0; p < nr_pages; ++p) {
+               expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
+               memset(expected_page, expected_byte, page_size);
+               if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
+                           page_size)) {
+                       fprintf(stderr,
+                               "unexpected page contents after minor fault\n");
+                       exit(1);
+               }
+       }
+
+       if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
+               perror("pipe write");
+               exit(1);
+       }
+       if (pthread_join(uffd_mon, NULL))
+               return 1;
+
+       close(uffd);
+
+       uffd_stats_report(&stats, 1);
+
+       return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
+}
+
 static int userfaultfd_stress(void)
 {
        void *area;
@@ -1413,7 +1563,7 @@ static int userfaultfd_stress(void)
 
        close(uffd);
        return userfaultfd_zeropage_test() || userfaultfd_sig_test()
-               || userfaultfd_events_test();
+               || userfaultfd_events_test() || userfaultfd_minor_test();
 }
 
 /*
@@ -1454,6 +1604,8 @@ static void set_test_type(const char *type)
                map_shared = true;
                test_type = TEST_HUGETLB;
                uffd_test_ops = &hugetlb_uffd_test_ops;
+               /* Minor faults require shared hugetlb; only enable here. */
+               test_uffdio_minor = true;
        } else if (!strcmp(type, "shmem")) {
                map_shared = true;
                test_type = TEST_SHMEM;
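Condensed, the minor-fault round trip added above is: register the alias mapping with UFFDIO_REGISTER_MODE_MINOR, catch the UFFD_PAGEFAULT_FLAG_MINOR event, repair the page contents through the non-registered mapping, then issue UFFDIO_CONTINUE. A skeletal sketch of just the resolution step (a fragment, not patch code; uffd and addr are assumed to come from reading a uffd_msg off the descriptor):

/* Fragment: resolve one minor fault once the page contents have been
 * fixed up via the non-registered mapping. Assumes uapi headers new
 * enough to define UFFDIO_CONTINUE. */
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static void resolve_minor_fault(int uffd, unsigned long addr,
                                unsigned long page_size)
{
        struct uffdio_continue req;

        req.range.start = addr & ~(page_size - 1);
        req.range.len = page_size;
        req.mode = 0;

        if (ioctl(uffd, UFFDIO_CONTINUE, &req)) {
                perror("UFFDIO_CONTINUE");
                exit(1);
        }
}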
index e8cad6a..73f914d 100644 (file)
@@ -272,5 +272,3 @@ do
        echo ''
     done
 done
-
-# vim: sw=4
index 935442e..8996e7a 100644 (file)
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-gen_init_cpio
-initramfs_data.cpio
+/gen_init_cpio
+/initramfs_data.cpio
 /initramfs_inc_data
index 8ae8316..63476bb 100755 (executable)
@@ -147,7 +147,7 @@ dir_filelist() {
        header "$1"
 
        srcdir=$(echo "$1" | sed -e 's://*:/:g')
-       dirlist=$(find "${srcdir}" -printf "%p %m %U %G\n" | LANG=C sort)
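+       # LC_ALL=C overrides LANG and every LC_* category, giving a reproducible sort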
+       dirlist=$(find "${srcdir}" -printf "%p %m %U %G\n" | LC_ALL=C sort)
 
        # If $dirlist is only one line, then the directory is empty
        if [  "$(echo "${dirlist}" | wc -l)" -gt 1 ]; then
index d2fab78..17b0ba1 100644 (file)
@@ -1,4 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-*
-!.gitignore
-!Makefile
+/*/