Merge tag 'drm-ast-2500-for-v4.11' of git://people.freedesktop.org/~airlied/linux
author    Linus Torvalds <torvalds@linux-foundation.org>
          Wed, 1 Mar 2017 17:42:42 +0000 (09:42 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Wed, 1 Mar 2017 17:42:42 +0000 (09:42 -0800)
Pull drm AST2500 support from Dave Airlie:
 "This is a set of changes to enable the AST2500 BMC hardware, and also
  fix some bugs interacting with the older AST hardware.

  Some of the bug fixes are cc'ed to stable"

* tag 'drm-ast-2500-for-v4.11' of git://people.freedesktop.org/~airlied/linux:
  drm/ast: Call open_key before enable_mmio in POST code
  drm/ast: Fix test for VGA enabled
  drm/ast: POST code for the new AST2500
  drm/ast: Rename ast_init_dram_2300 to ast_post_chip_2300
  drm/ast: Factor mmc_test code in POST code
  drm/ast: Fixed vram size incorrect issue on POWER
  drm/ast: Base support for AST2500
  drm/ast: Fix calculation of MCLK
  drm/ast: Remove spurious include
  drm/ast: const'ify mode setting tables
  drm/ast: Handle configuration without P2A bridge
  drm/ast: Fix AST2400 POST failure without BMC FW or VBIOS

843 files changed:
Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k [new file with mode: 0644]
Documentation/DocBook/libata.tmpl
Documentation/IPMI.txt
Documentation/acpi/method-customizing.txt
Documentation/acpi/method-tracing.txt
Documentation/admin-guide/ras.rst
Documentation/blockdev/mflash.txt
Documentation/cgroup-v1/rdma.txt [new file with mode: 0644]
Documentation/cgroup-v2.txt
Documentation/device-mapper/dm-raid.txt
Documentation/devicetree/bindings/mfd/qcom-rpm.txt
Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt
Documentation/devicetree/bindings/opp/opp.txt
Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
Documentation/devicetree/bindings/power/pd-samsung.txt
Documentation/devicetree/bindings/rtc/armada-380-rtc.txt
Documentation/devicetree/bindings/rtc/cortina,gemini.txt [new file with mode: 0644]
Documentation/devicetree/bindings/rtc/imxdi-rtc.txt
Documentation/devicetree/bindings/rtc/maxim,ds3231.txt
Documentation/devicetree/bindings/rtc/pcf8563.txt
Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
Documentation/devicetree/bindings/soc/rockchip/grf.txt
Documentation/devicetree/bindings/sound/rockchip-i2s.txt
Documentation/devicetree/bindings/sound/sun4i-codec.txt
Documentation/devicetree/bindings/sound/sun4i-i2s.txt
Documentation/filesystems/autofs4-mount-control.txt
Documentation/filesystems/autofs4.txt
Documentation/filesystems/ceph.txt
Documentation/filesystems/quota.txt
Documentation/kselftest.txt
Documentation/media/dvb-drivers/ci.rst
Documentation/media/uapi/dvb/dvb-frontend-parameters.rst
Documentation/memory-hotplug.txt
Documentation/networking/cdc_mbim.txt
Documentation/scsi/ChangeLog.megaraid_sas
Documentation/sound/hd-audio/notes.rst
Documentation/vm/userfaultfd.txt
Documentation/x86/intel_rdt_ui.txt
MAINTAINERS
Makefile
arch/Kconfig
arch/alpha/include/asm/Kbuild
arch/alpha/kernel/smp.c
arch/arc/include/asm/kprobes.h
arch/arc/kernel/smp.c
arch/arc/kernel/unwind.c
arch/arm/Kconfig
arch/arm/Kconfig-nommu
arch/arm/boot/compressed/decompress.c
arch/arm/boot/compressed/head.S
arch/arm/common/mcpm_entry.c
arch/arm/include/asm/hardware/cache-uniphier.h
arch/arm/include/asm/kprobes.h
arch/arm/include/asm/mach/flash.h
arch/arm/include/asm/memory.h
arch/arm/include/asm/pgtable-nommu.h
arch/arm/kernel/head-nommu.S
arch/arm/kernel/module.c
arch/arm/kernel/setup.c
arch/arm/kernel/smp.c
arch/arm/mach-alpine/platsmp.c
arch/arm/mach-axxia/platsmp.c
arch/arm/mach-bcm/bcm63xx_smp.c
arch/arm/mach-bcm/platsmp-brcmstb.c
arch/arm/mach-bcm/platsmp.c
arch/arm/mach-berlin/platsmp.c
arch/arm/mach-ep93xx/ts72xx.c
arch/arm/mach-ep93xx/ts72xx.h
arch/arm/mach-exynos/firmware.c
arch/arm/mach-exynos/mcpm-exynos.c
arch/arm/mach-exynos/platsmp.c
arch/arm/mach-exynos/pm.c
arch/arm/mach-exynos/suspend.c
arch/arm/mach-hisi/platmcpm.c
arch/arm/mach-hisi/platsmp.c
arch/arm/mach-imx/platsmp.c
arch/arm/mach-imx/pm-imx6.c
arch/arm/mach-imx/src.c
arch/arm/mach-mediatek/platsmp.c
arch/arm/mach-mvebu/pm.c
arch/arm/mach-mvebu/pmsu.c
arch/arm/mach-mvebu/system-controller.c
arch/arm/mach-omap2/control.c
arch/arm/mach-omap2/omap-mpuss-lowpower.c
arch/arm/mach-omap2/omap-smp.c
arch/arm/mach-omap2/omap_twl.c
arch/arm/mach-orion5x/ts78xx-setup.c
arch/arm/mach-prima2/platsmp.c
arch/arm/mach-prima2/pm.c
arch/arm/mach-pxa/palmz72.c
arch/arm/mach-pxa/pxa25x.c
arch/arm/mach-pxa/pxa27x.c
arch/arm/mach-pxa/pxa3xx.c
arch/arm/mach-realview/platsmp-dt.c
arch/arm/mach-rockchip/platsmp.c
arch/arm/mach-rockchip/pm.c
arch/arm/mach-s3c24xx/mach-jive.c
arch/arm/mach-s3c24xx/pm-s3c2410.c
arch/arm/mach-s3c24xx/pm-s3c2416.c
arch/arm/mach-s3c64xx/pm.c
arch/arm/mach-s5pv210/pm.c
arch/arm/mach-sa1100/pm.c
arch/arm/mach-shmobile/platsmp-apmu.c
arch/arm/mach-shmobile/platsmp-scu.c
arch/arm/mach-socfpga/platsmp.c
arch/arm/mach-spear/platsmp.c
arch/arm/mach-sti/platsmp.c
arch/arm/mach-sunxi/platsmp.c
arch/arm/mach-tango/platsmp.c
arch/arm/mach-tango/pm.c
arch/arm/mach-tegra/reset.c
arch/arm/mach-ux500/platsmp.c
arch/arm/mach-vexpress/dcscb.c
arch/arm/mach-vexpress/platsmp.c
arch/arm/mach-vexpress/tc2_pm.c
arch/arm/mach-zx/platsmp.c
arch/arm/mach-zynq/platsmp.c
arch/arm/mm/Kconfig
arch/arm/mm/Makefile
arch/arm/mm/cache-uniphier.c
arch/arm/mm/cache-v7.S
arch/arm/mm/cache-v7m.S
arch/arm/mm/dma-mapping.c
arch/arm/mm/dump.c
arch/arm/mm/flush.c
arch/arm/mm/init.c
arch/arm/mm/mmu.c
arch/arm/mm/nommu.c
arch/arm/mm/physaddr.c [new file with mode: 0644]
arch/arm/probes/decode.h
arch/arm64/include/asm/kprobes.h
arch/arm64/kernel/armv8_deprecated.c
arch/arm64/kernel/insn.c
arch/arm64/kernel/probes/decode-insn.h
arch/arm64/kernel/smp.c
arch/arm64/lib/copy_template.S
arch/avr32/include/asm/kprobes.h
arch/blackfin/include/asm/Kbuild
arch/blackfin/mach-common/smp.c
arch/c6x/include/asm/Kbuild
arch/cris/include/asm/Kbuild
arch/frv/include/asm/Kbuild
arch/frv/mm/mmu-context.c
arch/h8300/include/asm/Kbuild
arch/hexagon/include/asm/Kbuild
arch/hexagon/kernel/smp.c
arch/ia64/include/asm/kprobes.h
arch/ia64/kernel/setup.c
arch/ia64/sn/kernel/sn2/sn_hwperf.c
arch/m32r/include/asm/Kbuild
arch/m32r/kernel/setup.c
arch/m68k/configs/amcore_defconfig
arch/m68k/ifpsp060/src/isp.S
arch/m68k/include/asm/Kbuild
arch/metag/include/asm/Kbuild
arch/metag/kernel/smp.c
arch/microblaze/include/asm/Kbuild
arch/mips/include/asm/kprobes.h
arch/mips/kernel/traps.c
arch/mn10300/include/asm/kprobes.h
arch/mn10300/kernel/smp.c
arch/nios2/include/asm/Kbuild
arch/openrisc/include/asm/Kbuild
arch/openrisc/kernel/entry.S
arch/openrisc/kernel/head.S
arch/openrisc/kernel/vmlinux.lds.S
arch/parisc/include/asm/Kbuild
arch/parisc/kernel/smp.c
arch/powerpc/boot/dts/fsl/mpc8569mds.dts
arch/powerpc/include/asm/book3s/64/mmu.h
arch/powerpc/include/asm/fsl_hcalls.h
arch/powerpc/include/asm/kprobes.h
arch/powerpc/kernel/smp.c
arch/powerpc/lib/code-patching.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/pseries/iommu.c
arch/powerpc/xmon/ppc-opc.c
arch/s390/Kconfig
arch/s390/configs/default_defconfig
arch/s390/configs/performance_defconfig
arch/s390/crypto/Makefile
arch/s390/crypto/paes_s390.c [new file with mode: 0644]
arch/s390/defconfig
arch/s390/include/asm/cpacf.h
arch/s390/include/asm/kprobes.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/pkey.h [new file with mode: 0644]
arch/s390/include/asm/processor.h
arch/s390/include/asm/uaccess.h
arch/s390/include/uapi/asm/Kbuild
arch/s390/include/uapi/asm/pkey.h [new file with mode: 0644]
arch/s390/kernel/entry.S
arch/s390/kernel/entry.h
arch/s390/kernel/nmi.c
arch/s390/kernel/process.c
arch/s390/kernel/processor.c
arch/s390/kernel/vtime.c
arch/s390/mm/gmap.c
arch/s390/mm/hugetlbpage.c
arch/score/include/asm/Kbuild
arch/score/kernel/traps.c
arch/sh/include/asm/kprobes.h
arch/sh/kernel/irq.c
arch/sh/kernel/smp.c
arch/sparc/include/asm/kprobes.h
arch/sparc/include/asm/switch_to_32.h
arch/sparc/kernel/leon_smp.c
arch/sparc/kernel/smp_64.c
arch/sparc/kernel/sun4d_smp.c
arch/sparc/kernel/sun4m_smp.c
arch/sparc/kernel/traps_32.c
arch/sparc/kernel/traps_64.c
arch/sparc/kernel/visemul.c
arch/tile/include/asm/kprobes.h
arch/tile/kernel/smpboot.c
arch/um/include/asm/Kbuild
arch/unicore32/include/asm/Kbuild
arch/x86/Kconfig.debug
arch/x86/include/asm/cacheflush.h
arch/x86/include/asm/desc_defs.h
arch/x86/include/asm/kprobes.h
arch/x86/include/asm/pgtable-3level.h
arch/x86/kernel/Makefile
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/vector.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/cpu/mcheck/threshold.c
arch/x86/kernel/irq.c
arch/x86/kernel/irq_work.c
arch/x86/kernel/setup.c
arch/x86/kernel/smp.c
arch/x86/kernel/test_rodata.c [deleted file]
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/mmu.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/xtensa/include/asm/Kbuild
arch/xtensa/kernel/smp.c
block/blk-throttle.c
block/bsg.c
drivers/acpi/acpi_ipmi.c
drivers/acpi/acpica/dbconvert.c
drivers/acpi/acpica/nspredef.c
drivers/acpi/acpica/nsxfeval.c
drivers/acpi/resource.c
drivers/acpi/spcr.c
drivers/ata/libata-eh.c
drivers/atm/ambassador.c
drivers/atm/eni.c
drivers/atm/firestream.c
drivers/atm/horizon.c
drivers/atm/iphase.c
drivers/atm/iphase.h
drivers/atm/lanai.c
drivers/atm/nicstar.c
drivers/block/drbd/drbd_main.c
drivers/block/loop.c
drivers/block/rbd.c
drivers/block/rbd_types.h
drivers/char/ipmi/Kconfig
drivers/char/ipmi/bt-bmc.c
drivers/char/ipmi/ipmi_devintf.c
drivers/char/ipmi/ipmi_msghandler.c
drivers/char/ipmi/ipmi_powernv.c
drivers/char/ipmi/ipmi_watchdog.c
drivers/char/pcmcia/cm4000_cs.c
drivers/char/pcmcia/cm4040_cs.c
drivers/char/sonypi.c
drivers/crypto/Kconfig
drivers/crypto/caam/ctrl.c
drivers/devfreq/devfreq.c
drivers/dma-buf/dma-buf.c
drivers/extcon/extcon-rt8973a.c
drivers/firewire/core-cdev.c
drivers/firewire/core-device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/amd/include/atombios.h
drivers/gpu/drm/amd/powerplay/inc/hardwaremanager.h
drivers/gpu/drm/drm_probe_helper.c
drivers/gpu/drm/i915/i915_gem_render_state.c
drivers/gpu/drm/i915/i915_gem_userptr.c
drivers/gpu/drm/mga/mga_drv.h
drivers/gpu/drm/radeon/atombios.h
drivers/gpu/drm/rockchip/Kconfig
drivers/gpu/drm/rockchip/cdn-dp-core.c
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
drivers/gpu/drm/zte/zx_plane.c
drivers/hid/hid-kye.c
drivers/hwmon/g762.c
drivers/ide/ide-acpi.c
drivers/ide/ide-tape.c
drivers/ide/palm_bk3710.c
drivers/infiniband/core/Makefile
drivers/infiniband/core/cgroup.c [new file with mode: 0644]
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/device.c
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/hw/hfi1/file_ops.c
drivers/infiniband/hw/qib/qib_iba6120.c
drivers/infiniband/hw/qib/qib_iba7220.c
drivers/infiniband/hw/qib/qib_iba7322.c
drivers/infiniband/sw/rdmavt/mad.c
drivers/input/touchscreen/cyttsp4_core.c
drivers/iommu/amd_iommu_init.c
drivers/iommu/amd_iommu_types.h
drivers/iommu/intel-iommu.c
drivers/iommu/intel-svm.c
drivers/isdn/hardware/eicon/debug.c
drivers/isdn/hardware/mISDN/mISDNipac.c
drivers/isdn/mISDN/dsp_core.c
drivers/media/dvb-core/dvb_ringbuffer.h
drivers/media/dvb-frontends/drx39xyj/drx_driver.h
drivers/media/dvb-frontends/drx39xyj/drxj.c
drivers/media/dvb-frontends/drx39xyj/drxj.h
drivers/media/dvb-frontends/drxk_hard.c
drivers/media/dvb-frontends/helene.c
drivers/media/dvb-frontends/or51132.c
drivers/media/dvb-frontends/tda10048.c
drivers/media/i2c/adv7183_regs.h
drivers/media/pci/saa7164/saa7164-fw.c
drivers/media/platform/exynos4-is/fimc-core.h
drivers/media/tuners/xc5000.c
drivers/media/usb/dvb-usb/dib0700_devices.c
drivers/media/usb/gspca/t613.c
drivers/media/usb/tm6000/tm6000-input.c
drivers/media/v4l2-core/tuner-core.c
drivers/misc/vmw_vmci/vmci_context.c
drivers/misc/vmw_vmci/vmci_queue_pair.c
drivers/mmc/host/mmci_qcom_dml.c
drivers/mtd/devices/lart.c
drivers/net/arcnet/arcnet.c
drivers/net/ethernet/adi/bfin_mac.c
drivers/net/ethernet/apm/xgene/xgene_enet_main.c
drivers/net/ethernet/broadcom/bcm63xx_enet.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
drivers/net/ethernet/cadence/macb.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
drivers/net/ethernet/intel/igb/e1000_phy.c
drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
drivers/net/ethernet/mellanox/mlx4/en_clock.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
drivers/net/ethernet/micrel/ksz884x.c
drivers/net/ethernet/neterion/s2io.c
drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
drivers/net/ethernet/qlogic/qed/qed.h
drivers/net/ethernet/qlogic/qed/qed_dev.c
drivers/net/ethernet/qlogic/qed/qed_mcp.c
drivers/net/ethernet/qlogic/qed/qed_sriov.c
drivers/net/ethernet/qlogic/qed/qed_sriov.h
drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
drivers/net/ethernet/sfc/mcdi_pcol.h
drivers/net/ethernet/sis/sis900.c
drivers/net/ethernet/stmicro/stmmac/common.h
drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/gtp.c
drivers/net/phy/phy.c
drivers/net/usb/kalmia.c
drivers/net/usb/rndis_host.c
drivers/net/usb/sierra_net.c
drivers/net/vxlan.c
drivers/net/wimax/i2400m/usb-fw.c
drivers/net/wireless/ath/ath9k/ani.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
drivers/net/wireless/intel/ipw2x00/ipw2100.c
drivers/net/wireless/intel/ipw2x00/ipw2200.c
drivers/net/wireless/intel/iwlegacy/4965-mac.c
drivers/net/wireless/intel/iwlwifi/iwl-drv.c
drivers/net/wireless/marvell/mwifiex/txrx.c
drivers/net/wireless/marvell/mwifiex/wmm.c
drivers/net/wireless/realtek/rtlwifi/rtl8192se/fw.c
drivers/net/wireless/rsi/rsi_91x_usb.c
drivers/net/wireless/ti/wl18xx/main.c
drivers/net/wireless/ti/wlcore/init.c
drivers/nfc/pn533/pn533.c
drivers/nvme/host/rdma.c
drivers/parport/ieee1284_ops.c
drivers/parport/parport_pc.c
drivers/pci/quirks.c
drivers/pinctrl/bcm/Kconfig
drivers/power/avs/smartreflex.c
drivers/rapidio/devices/rio_mport_cdev.c
drivers/rtc/Kconfig
drivers/rtc/Makefile
drivers/rtc/rtc-armada38x.c
drivers/rtc/rtc-au1xxx.c
drivers/rtc/rtc-bfin.c
drivers/rtc/rtc-bq32k.c
drivers/rtc/rtc-dm355evm.c
drivers/rtc/rtc-ds3232.c
drivers/rtc/rtc-gemini.c
drivers/rtc/rtc-imxdi.c
drivers/rtc/rtc-ls1x.c
drivers/rtc/rtc-m48t86.c
drivers/rtc/rtc-mcp795.c
drivers/rtc/rtc-mxc.c
drivers/rtc/rtc-pcf2127.c
drivers/rtc/rtc-rx8010.c
drivers/rtc/rtc-sh.c
drivers/rtc/rtc-snvs.c
drivers/rtc/rtc-stm32.c [new file with mode: 0644]
drivers/rtc/rtc-sun6i.c
drivers/rtc/rtc-tegra.c
drivers/rtc/rtc-tps65910.c
drivers/s390/block/dasd_eckd.c
drivers/s390/cio/ioasm.c
drivers/s390/crypto/Makefile
drivers/s390/crypto/ap_bus.c
drivers/s390/crypto/ap_card.c
drivers/s390/crypto/ap_queue.c
drivers/s390/crypto/pkey_api.c [new file with mode: 0644]
drivers/s390/crypto/zcrypt_api.c
drivers/s390/crypto/zcrypt_api.h
drivers/scsi/aacraid/linit.c
drivers/scsi/bfa/bfi_ms.h
drivers/scsi/fcoe/fcoe_ctlr.c
drivers/scsi/ipr.c
drivers/scsi/lpfc/lpfc_attr.c
drivers/scsi/lpfc/lpfc_sli.c
drivers/scsi/mpt3sas/mpt3sas_ctl.c
drivers/scsi/mpt3sas/mpt3sas_ctl.h
drivers/scsi/osd/osd_initiator.c
drivers/scsi/osst.c
drivers/scsi/qla2xxx/qla_init.c
drivers/scsi/scsi_transport_sas.c
drivers/scsi/storvsc_drv.c
drivers/staging/gs_fpgaboot/gs_fpgaboot.h
drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
drivers/staging/rtl8192u/ieee80211/ieee80211.h
drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
drivers/staging/wilc1000/linux_wlan.c
drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
drivers/target/target_core_user.c
drivers/tty/n_hdlc.c
drivers/tty/serial/ioc4_serial.c
drivers/usb/core/devio.c
drivers/usb/gadget/legacy/inode.c
drivers/usb/gadget/udc/fsl_udc_core.c
drivers/usb/gadget/udc/renesas_usb3.c
drivers/usb/host/ehci-hcd.c
drivers/usb/host/fotg210-hcd.c
drivers/usb/host/ohci-hcd.c
drivers/usb/misc/adutux.c
drivers/usb/misc/legousbtower.c
drivers/usb/misc/uss720.c
drivers/usb/usbip/usbip_common.c
drivers/video/fbdev/aty/radeon_monitor.c
drivers/video/fbdev/metronomefb.c
drivers/watchdog/bcm2835_wdt.c
fs/affs/affs.h
fs/affs/amigaffs.c
fs/affs/inode.c
fs/affs/namei.c
fs/affs/super.c
fs/afs/dir.c
fs/autofs4/dev-ioctl.c
fs/autofs4/root.c
fs/block_dev.c
fs/btrfs/file.c
fs/buffer.c
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/super.c
fs/ceph/super.h
fs/dax.c
fs/direct-io.c
fs/ecryptfs/kthread.c
fs/eventpoll.c
fs/ext4/extents_status.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/move_extent.c
fs/hfs/mdb.c
fs/hfsplus/wrapper.c
fs/iomap.c
fs/jfs/super.c
fs/kernfs/dir.c
fs/kernfs/file.c
fs/kernfs/kernfs-internal.h
fs/lockd/svc.c
fs/mpage.c
fs/ncpfs/sock.c
fs/nfs/blocklayout/blocklayout.c
fs/nfs/callback_xdr.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/objlayout/objlayout.c
fs/nfsd/blocklayout.c
fs/nfsd/export.c
fs/nfsd/nfs2acl.c
fs/nfsd/nfs3acl.c
fs/nfsd/nfs3proc.c
fs/nfsd/nfs4callback.c
fs/nfsd/nfs4idmap.c
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfscache.c
fs/nfsd/nfsctl.c
fs/nfsd/nfsd.h
fs/nfsd/nfsproc.c
fs/nfsd/nfssvc.c
fs/nfsd/state.h
fs/nfsd/vfs.c
fs/nfsd/vfs.h
fs/nilfs2/alloc.c
fs/nilfs2/btnode.c
fs/nilfs2/btree.c
fs/nilfs2/inode.c
fs/nilfs2/mdt.c
fs/nilfs2/segment.c
fs/ocfs2/aops.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/file.c
fs/orangefs/orangefs-utils.c
fs/proc/base.c
fs/proc/kcore.c
fs/proc/task_mmu.c
fs/proc/task_nommu.c
fs/reiserfs/file.c
fs/reiserfs/inode.c
fs/reiserfs/super.c
fs/stat.c
fs/udf/inode.c
fs/userfaultfd.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_file.c
include/asm-generic/kprobes.h [new file with mode: 0644]
include/linux/ceph/osd_client.h
include/linux/ceph/osdmap.h
include/linux/ceph/rados.h
include/linux/cgroup-defs.h
include/linux/cgroup.h
include/linux/cgroup_rdma.h [new file with mode: 0644]
include/linux/cgroup_subsys.h
include/linux/compat.h
include/linux/compiler-gcc.h
include/linux/compiler.h
include/linux/crush/crush.h
include/linux/crush/mapper.h
include/linux/dcache.h
include/linux/fs.h
include/linux/idr.h
include/linux/ipmi.h
include/linux/kconfig.h
include/linux/kernfs.h
include/linux/kprobes.h
include/linux/mfd/tps65910.h
include/linux/mm_types.h
include/linux/mtd/qinfo.h
include/linux/pid.h
include/linux/platform_data/rtc-m48t86.h [deleted file]
include/linux/radix-tree.h
include/linux/refcount.h
include/linux/rodata_test.h [new file with mode: 0644]
include/linux/sched.h
include/linux/sem.h
include/linux/spi/flash.h
include/linux/sunrpc/cache.h
include/linux/sunrpc/rpc_rdma.h
include/linux/sunrpc/svc.h
include/linux/sunrpc/svc_rdma.h
include/linux/sunrpc/svc_xprt.h
include/linux/workqueue.h
include/media/v4l2-ctrls.h
include/net/cfg80211.h
include/net/mac80211.h
include/rdma/ib_verbs.h
include/uapi/linux/auto_dev-ioctl.h
include/uapi/linux/auto_fs.h
include/uapi/linux/auto_fs4.h
include/uapi/linux/netfilter.h
include/uapi/linux/netfilter/xt_hashlimit.h
include/uapi/linux/nfsd/export.h
include/xen/interface/grant_table.h
init/Kconfig
init/initramfs.c
init/main.c
ipc/mqueue.c
ipc/sem.c
ipc/shm.c
kernel/Makefile
kernel/cgroup/Makefile [new file with mode: 0644]
kernel/cgroup/cgroup-internal.h [new file with mode: 0644]
kernel/cgroup/cgroup-v1.c [new file with mode: 0644]
kernel/cgroup/cgroup.c [moved from kernel/cgroup.c with 72% similarity]
kernel/cgroup/cpuset.c [moved from kernel/cpuset.c with 100% similarity]
kernel/cgroup/freezer.c [moved from kernel/cgroup_freezer.c with 100% similarity]
kernel/cgroup/namespace.c [new file with mode: 0644]
kernel/cgroup/pids.c [moved from kernel/cgroup_pids.c with 100% similarity]
kernel/cgroup/rdma.c [new file with mode: 0644]
kernel/configs/android-base.config
kernel/configs/android-recommended.config
kernel/events/core.c
kernel/events/uprobes.c
kernel/exit.c
kernel/fork.c
kernel/futex.c
kernel/irq/manage.c
kernel/relay.c
kernel/sched/core.c
kernel/signal.c
kernel/torture.c
lib/Kconfig
lib/Kconfig.debug
lib/Makefile
lib/fonts/Kconfig
lib/idr.c
lib/percpu_counter.c
lib/radix-tree.c
lib/refcount.c [new file with mode: 0644]
lib/rhashtable.c
lib/scatterlist.c
lib/test_parman.c
lib/vsprintf.c
mm/Kconfig.debug
mm/Makefile
mm/dmapool.c
mm/khugepaged.c
mm/ksm.c
mm/mmu_context.c
mm/mmu_notifier.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/percpu.c
mm/rodata_test.c [new file with mode: 0644]
mm/swapfile.c
mm/truncate.c
mm/workingset.c
mm/zswap.c
net/appletalk/ddp.c
net/atm/mpc.c
net/bluetooth/hci_sock.c
net/bridge/netfilter/ebt_among.c
net/ceph/cls_lock_client.c
net/ceph/crush/crush.c
net/ceph/crush/mapper.c
net/ceph/crypto.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/ceph/snapshot.c
net/ieee802154/socket.c
net/ipv4/fib_frontend.c
net/ipv4/fib_trie.c
net/ipv4/ipmr.c
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
net/ipv4/netfilter/nf_log_arp.c
net/ipv4/route.c
net/ipv6/ip6_vti.c
net/ipv6/ip6mr.c
net/ipv6/netfilter/nf_log_ipv6.c
net/irda/irnet/irnet_ppp.c
net/l2tp/l2tp_core.c
net/l2tp/l2tp_ip.c
net/mac80211/mesh.c
net/mac80211/status.c
net/netfilter/ipvs/ip_vs_conn.c
net/netfilter/ipvs/ip_vs_dh.c
net/netfilter/ipvs/ip_vs_lblc.c
net/netfilter/ipvs/ip_vs_lblcr.c
net/netfilter/ipvs/ip_vs_sh.c
net/netfilter/ipvs/ip_vs_sync.c
net/netfilter/nf_conntrack_expect.c
net/netfilter/nf_conntrack_ftp.c
net/netfilter/nfnetlink_cthelper.c
net/netfilter/nft_ct.c
net/netfilter/nft_set_bitmap.c
net/netfilter/x_tables.c
net/rds/ib.c
net/rds/tcp.c
net/rxrpc/key.c
net/rxrpc/recvmsg.c
net/sched/act_api.c
net/sctp/output.c
net/sctp/protocol.c
net/sctp/socket.c
net/sctp/transport.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/cache.c
net/sunrpc/svc.c
net/sunrpc/svcsock.c
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
net/sunrpc/xprtrdma/svc_rdma_marshal.c
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/svc_rdma_transport.c
net/sunrpc/xprtsock.c
net/tipc/node.c
net/xfrm/xfrm_policy.c
scripts/checkpatch.pl
scripts/recordmcount.pl
scripts/spelling.txt
security/selinux/ss/ebitmap.c
security/selinux/ss/policydb.c
sound/pci/ac97/ac97_patch.c
sound/pci/cs46xx/cs46xx_dsp_task_types.h
sound/pci/hda/patch_ca0132.c
sound/pci/ice1712/wm8766.c
sound/pci/ice1712/wm8776.c
sound/pci/korg1212/korg1212.c
sound/pci/pcxhr/pcxhr_hwdep.c
sound/pcmcia/vx/vxp_ops.c
sound/ppc/snd_ps3.c
sound/soc/amd/acp-pcm-dma.c
sound/soc/codecs/wm_hubs.c
sound/soc/fsl/fsl_asrc.c
sound/soc/qcom/lpass.h
sound/soc/soc-core.c
sound/soc/soc-topology.c
tools/build/Makefile
tools/build/Makefile.include
tools/include/asm-generic/bitops/atomic.h
tools/include/asm/bug.h
tools/include/linux/bitmap.h
tools/include/linux/bitops.h
tools/include/linux/compiler.h
tools/include/linux/spinlock.h [new file with mode: 0644]
tools/lib/bpf/bpf.c
tools/lib/traceevent/event-parse.c
tools/lib/traceevent/event-parse.h
tools/objtool/arch.h
tools/objtool/arch/x86/decode.c
tools/objtool/builtin-check.c
tools/perf/Documentation/perf-annotate.txt
tools/perf/Documentation/perf-diff.txt
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-stat.txt
tools/perf/Documentation/tips.txt
tools/perf/Makefile.config
tools/perf/Makefile.perf
tools/perf/builtin-annotate.c
tools/perf/builtin-diff.c
tools/perf/builtin-mem.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-stat.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/pmu-events/json.c
tools/perf/tests/attr.c
tools/perf/tests/builtin-test.c
tools/perf/tests/code-reading.c
tools/perf/tests/fdarray.c
tools/perf/tests/llvm.c
tools/perf/tests/parse-events.c
tools/perf/tests/perf-record.c
tools/perf/tests/python-use.c
tools/perf/tests/thread-map.c
tools/perf/tests/topology.c
tools/perf/tests/vmlinux-kallsyms.c
tools/perf/ui/browsers/map.c
tools/perf/ui/hist.c
tools/perf/util/annotate.c
tools/perf/util/cgroup.c
tools/perf/util/cpumap.c
tools/perf/util/cpumap.h
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/dso.c
tools/perf/util/env.c
tools/perf/util/header.c
tools/perf/util/hist.c
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-events.y
tools/perf/util/pmu.c
tools/perf/util/probe-event.c
tools/perf/util/probe-finder.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/setup.py
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/stat.c
tools/perf/util/symbol-elf.c
tools/testing/ktest/ktest.pl
tools/testing/radix-tree/.gitignore
tools/testing/radix-tree/Makefile
tools/testing/radix-tree/benchmark.c
tools/testing/radix-tree/generated/autoconf.h
tools/testing/radix-tree/idr-test.c [new file with mode: 0644]
tools/testing/radix-tree/iteration_check.c
tools/testing/radix-tree/linux.c
tools/testing/radix-tree/linux/bitops.h [deleted file]
tools/testing/radix-tree/linux/bitops/__ffs.h [deleted file]
tools/testing/radix-tree/linux/bitops/ffs.h [deleted file]
tools/testing/radix-tree/linux/bitops/ffz.h [deleted file]
tools/testing/radix-tree/linux/bitops/find.h [deleted file]
tools/testing/radix-tree/linux/bitops/fls.h [deleted file]
tools/testing/radix-tree/linux/bitops/fls64.h [deleted file]
tools/testing/radix-tree/linux/bitops/hweight.h [deleted file]
tools/testing/radix-tree/linux/bitops/le.h [deleted file]
tools/testing/radix-tree/linux/bitops/non-atomic.h [deleted file]
tools/testing/radix-tree/linux/export.h [deleted file]
tools/testing/radix-tree/linux/gfp.h
tools/testing/radix-tree/linux/idr.h [new file with mode: 0644]
tools/testing/radix-tree/linux/init.h
tools/testing/radix-tree/linux/kernel.h
tools/testing/radix-tree/linux/mempool.h [deleted file]
tools/testing/radix-tree/linux/percpu.h
tools/testing/radix-tree/linux/preempt.h
tools/testing/radix-tree/linux/radix-tree.h
tools/testing/radix-tree/linux/types.h [deleted file]
tools/testing/radix-tree/main.c
tools/testing/radix-tree/multiorder.c
tools/testing/radix-tree/regression1.c
tools/testing/radix-tree/regression2.c
tools/testing/radix-tree/regression3.c
tools/testing/radix-tree/tag_check.c
tools/testing/radix-tree/test.c
tools/testing/radix-tree/test.h
tools/testing/selftests/sigaltstack/sas.c
virt/kvm/async_pf.c
virt/kvm/kvm_main.c

diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k b/Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k
new file mode 100644 (file)
index 0000000..398b258
--- /dev/null
@@ -0,0 +1,7 @@
+What:          /sys/bus/i2c/devices/.../trickle_charge_bypass
+Date:          Jan 2017
+KernelVersion: 4.11
+Contact:       Enric Balletbo i Serra <eballetbo@gmail.com>
+Description:   Attribute for enabling/disabling the trickle charge bypass.
+               The trickle_charge_bypass attribute allows userspace to
+               enable/disable the trickle charge FET bypass.
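
For illustration only, a minimal userspace sketch of toggling this attribute. The I2C device path ("1-0068") is hypothetical and depends on the bus and address of the bq32000 on the system, and the attribute is assumed to take the usual sysfs boolean values:

#include <stdio.h>

/* Hedged sketch: enable the trickle charge FET bypass via sysfs.
 * The device path "1-0068" is hypothetical; substitute the real
 * bus/address pair.  Assumes the attribute accepts "0"/"1", as is
 * typical for boolean sysfs attributes.
 */
int main(void)
{
        FILE *f = fopen("/sys/bus/i2c/devices/1-0068/trickle_charge_bypass",
                        "w");

        if (!f) {
                perror("trickle_charge_bypass");
                return 1;
        }
        fputs("1\n", f);        /* "0" would disable the bypass again */
        return fclose(f) ? 1 : 0;
}
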
index d7fcdc5..0320910 100644 (file)
@@ -1020,7 +1020,7 @@ and other resources, etc.
        </itemizedlist>
 
        <para>
-       Of errors detected as above, the followings are not ATA/ATAPI
+       Of errors detected as above, the following are not ATA/ATAPI
        device errors but ATA bus errors and should be handled
        according to <xref linkend="excatATAbusErr"/>.
        </para>
index 7229230..6962cab 100644 (file)
@@ -257,7 +257,7 @@ and tell you when they come and go.
 
 Creating the User
 
-To user the message handler, you must first create a user using
+To use the message handler, you must first create a user using
 ipmi_create_user.  The interface number specifies which SMI you want
 to connect to, and you must supply callback functions to be called
 when data comes in.  The callback function can run at interrupt level,
index 5f55373..a3f598e 100644 (file)
@@ -57,7 +57,7 @@ Note: To get the ACPI debug object output (Store (AAAA, Debug)),
 3. undo your changes
    The "undo" operation is not supported for a new inserted method
    right now, i.e. we can not remove a method currently.
-   For an overrided method, in order to undo your changes, please
+   For an overridden method, in order to undo your changes, please
    save a copy of the method original ASL code in step c) section 1,
    and redo step c) ~ g) to override the method with the original one.
 
index c2505ee..0aba14c 100644 (file)
@@ -152,7 +152,7 @@ tracing facility.
        Users can enable/disable this debug tracing feature by executing
        the following command:
            # echo string > /sys/module/acpi/parameters/trace_state
-       Where "string" should be one of the followings:
+       Where "string" should be one of the following:
        "disable"
            Disable the method tracing feature.
        "enable"
index 9939348..1b90c6f 100644 (file)
@@ -81,7 +81,7 @@ That defines some categories of errors:
   still run, eventually replacing the affected hardware by a hot spare,
   if available.
 
-  Also, when an error happens on an userspace process, it is also possible to
+  Also, when an error happens on a userspace process, it is also possible to
   kill such process and let userspace restart it.
 
 The mechanism for handling non-fatal errors is usually complex and may
index 1f610ec..f7e0505 100644 (file)
@@ -17,7 +17,7 @@ driver and currently works well under standard IDE subsystem. Actually it's
 one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
 IDE interface.
 
-Followings are brief descriptions about IO mode.
+The following are brief descriptions of IO mode.
 A. IO mode based on ATA protocol and uses some custom command. (read confirm,
 write confirm)
 B. IO mode uses SRAM bus interface.
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
new file mode 100644 (file)
index 0000000..af61817
--- /dev/null
@@ -0,0 +1,109 @@
+                               RDMA Controller
+                               ----------------
+
+Contents
+--------
+
+1. Overview
+  1-1. What is the RDMA controller?
+  1-2. Why is the RDMA controller needed?
+  1-3. How is the RDMA controller implemented?
+2. Usage Examples
+
+1. Overview
+
+1-1. What is the RDMA controller?
+---------------------------------
+
+The RDMA controller allows the user to limit RDMA/IB specific resources that
+a given set of processes can use. Processes are grouped using the controller.
+
+The RDMA controller defines two resources which can be limited for processes
+of a cgroup.
+
+1-2. Why is the RDMA controller needed?
+---------------------------------------
+
+Currently user space applications can easily take away all the RDMA verb
+specific resources such as AH, CQ, QP, MR etc., so that other applications
+in other cgroups or kernel space ULPs may not even get a chance to allocate
+any RDMA resources. This can lead to service unavailability.
+
+Therefore an RDMA controller is needed, through which the resource
+consumption of processes can be limited and different RDMA resources can
+be accounted.
+
+1-3. How is the RDMA controller implemented?
+--------------------------------------------
+
+The RDMA cgroup allows limit configuration of resources, and maintains
+resource accounting per cgroup, per device, using a resource pool structure.
+Each such resource pool is limited to 64 resources by the rdma cgroup, which
+can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases,
+but nothing prevents having more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally; however, there is no
+known use case or requirement for such a configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which share the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+cgroup to another without the major complexity of transferring resource
+ownership, because such ownership is not really present due to the shared
+nature of rdma resources. Linking resources to the css also ensures that
+cgroups can be deleted after processes have migrated. This allows migration
+with active resources as well, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, the owner rdma cgroup is returned to
+the caller. The same rdma cgroup should be passed while uncharging the
+resource. This also allows a process migrated with active RDMA resources to
+charge new resources to its new owner cgroup, and to uncharge the resources
+of a migrated process from the cgroup that was previously charged, even
+though that is not a primary use case.
+
+A resource pool object is created in the following situations.
+(a) The user sets the limit and no previous resource pool exists for the
+device of interest for the cgroup.
+(b) No resource limits were configured, but the IB/RDMA stack tries to
+charge the resource. This ensures resources are correctly uncharged when
+applications run without limits and limits are enforced later during
+uncharging; otherwise the usage count would drop below zero.
+
+A resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+Users should set all the limits to the max value if they intend to
+remove/unconfigure the resource pool for a particular device.
+
+The IB stack honors the limits enforced by the rdma controller. When an
+application queries the maximum resource limits of an IB device, the
+minimum of what is configured by the user for the given cgroup and what is
+supported by the IB device is returned.
+
+The following resources can be accounted by the rdma controller.
+  hca_handle   Maximum number of HCA Handles
+  hca_object   Maximum number of HCA Objects
+
+2. Usage Examples
+-----------------
+
+(a) Configure resource limit:
+echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit:
+cat /sys/fs/cgroup/rdma/2/rdma.max
+#Output:
+mlx4_0 hca_handle=2 hca_object=2000
+ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage:
+cat /sys/fs/cgroup/rdma/2/rdma.current
+#Output:
+mlx4_0 hca_handle=1 hca_object=20
+ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit:
+echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
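
The same limits can also be driven programmatically; below is a minimal C sketch mirroring the echo/cat examples above. It assumes the v1 rdma controller is mounted at /sys/fs/cgroup/rdma and that the child cgroup "1" already exists:

/* Hedged sketch: set and read back an rdma cgroup limit, mirroring the
 * echo/cat shell examples above.  Paths and the mlx4_0 device name are
 * taken from those examples and may differ on a real system.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *limit = "mlx4_0 hca_handle=2 hca_object=2000\n";
        char buf[256];
        ssize_t n;
        int fd;

        fd = open("/sys/fs/cgroup/rdma/1/rdma.max", O_WRONLY);
        if (fd < 0 || write(fd, limit, strlen(limit)) < 0) {
                perror("rdma.max");
                return 1;
        }
        close(fd);

        fd = open("/sys/fs/cgroup/rdma/1/rdma.current", O_RDONLY);
        if (fd < 0) {
                perror("rdma.current");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                fputs(buf, stdout);     /* e.g. "mlx4_0 hca_handle=1 ..." */
        }
        close(fd);
        return 0;
}
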
index 4cc07ce..3b8449f 100644 (file)
@@ -47,6 +47,12 @@ CONTENTS
   5-3. IO
     5-3-1. IO Interface Files
     5-3-2. Writeback
+  5-4. PID
+    5-4-1. PID Interface Files
+  5-5. RDMA
+    5-5-1. RDMA Interface Files
+  5-6. Misc
+    5-6-1. perf_event
 6. Namespace
   6-1. Basics
   6-2. The Root and Views
@@ -328,14 +334,12 @@ a process with a non-root euid to migrate a target process into a
 cgroup by writing its PID to the "cgroup.procs" file, the following
 conditions must be met.
 
-- The writer's euid must match either uid or suid of the target process.
-
 - The writer must have write access to the "cgroup.procs" file.
 
 - The writer must have write access to the "cgroup.procs" file of the
   common ancestor of the source and destination cgroups.
 
-The above three constraints ensure that while a delegatee may migrate
+The above two constraints ensure that while a delegatee may migrate
 processes around freely in the delegated sub-hierarchy it can't pull
 in from or push out to outside the sub-hierarchy.
 
@@ -350,10 +354,10 @@ all processes under C0 and C1 belong to U0.
 
 Let's also say U0 wants to write the PID of a process which is
 currently in C10 into "C00/cgroup.procs".  U0 has write access to the
-file and uid match on the process; however, the common ancestor of the
-source cgroup C10 and the destination cgroup C00 is above the points
-of delegation and U0 would not have write access to its "cgroup.procs"
-files and thus the write will be denied with -EACCES.
+file; however, the common ancestor of the source cgroup C10 and the
+destination cgroup C00 is above the points of delegation and U0 would
+not have write access to its "cgroup.procs" files and thus the write
+will be denied with -EACCES.
 
 
 2-6. Guidelines
@@ -1119,6 +1123,91 @@ writeback as follows.
        vm.dirty[_background]_ratio.
 
 
+5-4. PID
+
+The process number controller is used to allow a cgroup to stop any
+new tasks from being fork()'d or clone()'d after a specified limit is
+reached.
+
+The number of tasks in a cgroup can be exhausted in ways which other
+controllers cannot prevent, thus warranting its own controller.  For
+example, a fork bomb is likely to exhaust the number of tasks before
+hitting memory restrictions.
+
+Note that PIDs used in this controller refer to TIDs, process IDs as
+used by the kernel.
+
+
+5-4-1. PID Interface Files
+
+  pids.max
+
+ A read-write single value file which exists on non-root cgroups.  The
+ default is "max".
+
+ Hard limit of number of processes.
+
+  pids.current
+
+ A read-only single value file which exists on all cgroups.
+
+ The number of processes currently in the cgroup and its descendants.
+
+Organisational operations are not blocked by cgroup policies, so it is
+possible to have pids.current > pids.max.  This can be done by either
+setting the limit to be smaller than pids.current, or attaching enough
+processes to the cgroup such that pids.current is larger than
+pids.max.  However, it is not possible to violate a cgroup PID policy
+through fork() or clone(). These will return -EAGAIN if the creation
+of a new process would cause a cgroup policy to be violated.
+
+
+5-5. RDMA
+
+The "rdma" controller regulates the distribution and accounting of
+of RDMA resources.
+
+5-5-1. RDMA Interface Files
+
+  rdma.max
+       A read-write nested-keyed file that exists for all the cgroups
+       except root, and describes the currently configured resource limit
+       for an RDMA/IB device.
+
+       Lines are keyed by device name and are not ordered.
+       Each line contains space separated resource name and its configured
+       limit that can be distributed.
+
+       The following nested keys are defined.
+
+         hca_handle    Maximum number of HCA Handles
+         hca_object    Maximum number of HCA Objects
+
+       An example for mlx4 and ocrdma device follows.
+
+         mlx4_0 hca_handle=2 hca_object=2000
+         ocrdma1 hca_handle=3 hca_object=max
+
+  rdma.current
+       A read-only file that describes current resource usage.
+       It exists for all the cgroups except root.
+
+       An example for mlx4 and ocrdma device follows.
+
+         mlx4_0 hca_handle=1 hca_object=20
+         ocrdma1 hca_handle=1 hca_object=23
+
+
+5-6. Misc
+
+5-6-1. perf_event
+
+perf_event controller, if not mounted on a legacy hierarchy, is
+automatically enabled on the v2 hierarchy so that perf events can
+always be filtered by cgroup v2 path.  The controller can still be
+moved to a legacy hierarchy after v2 hierarchy is populated.
+
+
 6. Namespace
 
 6-1. Basics
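
To make the PID-controller semantics in section 5-4 concrete, here is a small hedged sketch of a service that tolerates hitting pids.max; it assumes the process has already been attached to a pids-limited cgroup:

/* Sketch: fork() inside a pids-limited cgroup.  When pids.max would be
 * exceeded, fork() fails with EAGAIN rather than violating the policy,
 * so a well-behaved service can back off and retry.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t pid;

        for (;;) {
                pid = fork();
                if (pid >= 0)
                        break;
                if (errno != EAGAIN) {
                        perror("fork");
                        return 1;
                }
                /* cgroup PID limit reached: back off and retry */
                sleep(1);
        }
        if (pid == 0)
                _exit(0);       /* child */
        waitpid(pid, NULL, 0);
        return 0;
}
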
index 0d19935..cd2cb2f 100644 (file)
@@ -319,7 +319,7 @@ Version History
 1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
 1.6.0   Add discard support (and devices_handle_discard_safely module param).
 1.7.0   Add support for MD RAID0 mappings.
-1.8.0   Explictely check for compatible flags in the superblock metadata
+1.8.0   Explicitly check for compatible flags in the superblock metadata
        and reject to start the raid set if any are set by a newer
        target version, thus avoiding data corruption on a raid set
        with a reshape in progress.
index 485bc59..3c91ad4 100644 (file)
@@ -234,7 +234,7 @@ see regulator.txt - with additional custom properties described below:
 - qcom,switch-mode-frequency:
        Usage: required
        Value type: <u32>
-       Definition: Frequency (Hz) of the swith mode power supply;
+       Definition: Frequency (Hz) of the switch mode power supply;
                    must be one of:
                    19200000, 9600000, 6400000, 4800000, 3840000, 3200000,
                    2740000, 2400000, 2130000, 1920000, 1750000, 1600000,
index 7aa840c..ae4234c 100644 (file)
@@ -1,7 +1,7 @@
 * Marvell Armada 370 / Armada XP / Armada 3700 Ethernet Controller (NETA)
 
 Required properties:
-- compatible: could be one of the followings
+- compatible: could be one of the following:
        "marvell,armada-370-neta"
        "marvell,armada-xp-neta"
        "marvell,armada-3700-neta"
index 9f5ca44..ecdcfb7 100644 (file)
@@ -136,7 +136,7 @@ Optional properties:
   larger OPP table, based on what version of the hardware we are running on. We
   still can't have multiple nodes with the same opp-hz value in OPP table.
 
-  It's an user defined array containing a hierarchy of hardware version numbers,
+  It's a user defined array containing a hierarchy of hardware version numbers,
   supported by the OPP. For example: a platform with hierarchy of three levels
   of versions (A, B and C), this field should be like <X Y Z>, where X
   corresponds to Version hierarchy A, Y corresponds to version hierarchy B and Z
index 7c85dca..2fd688c 100644 (file)
@@ -6,7 +6,7 @@ the first two functions being GPIO in and out. The configuration on
 the pins includes drive strength and pull-up.
 
 Required properties:
-- compatible: Should be one of the followings (depending on you SoC):
+- compatible: Should be one of the following (depending on your SoC):
   "allwinner,sun4i-a10-pinctrl"
   "allwinner,sun5i-a10s-pinctrl"
   "allwinner,sun5i-a13-pinctrl"
index 7eb9674..549f7de 100644 (file)
@@ -23,7 +23,7 @@ Optional Properties:
 - clock-names: The following clocks can be specified:
        - oscclk: Oscillator clock.
        - clkN: Input clocks to the devices in this power domain. These clocks
-               will be reparented to oscclk before swithing power domain off.
+               will be reparented to oscclk before switching power domain off.
                Their original parent will be brought back after turning on
                the domain. Maximum of 4 clocks (N = 0 to 3) are supported.
        - asbN: Clocks required by asynchronous bridges (ASB) present in
index 2eb9d4e..c3c9a12 100644 (file)
@@ -1,9 +1,11 @@
-* Real Time Clock of the Armada 38x SoCs
+* Real Time Clock of the Armada 38x/7K/8K SoCs
 
-RTC controller for the Armada 38x SoCs
+RTC controller for the Armada 38x, 7K and 8K SoCs
 
 Required properties:
-- compatible : Should be "marvell,armada-380-rtc"
+- compatible : Should be one of the following:
+       "marvell,armada-380-rtc" for Armada 38x SoCs
+       "marvell,armada-8k-rtc" for Armada 7K/8K SoCs
 - reg: a list of base address and size pairs, one for each entry in
   reg-names
 - reg names: should contain:
diff --git a/Documentation/devicetree/bindings/rtc/cortina,gemini.txt b/Documentation/devicetree/bindings/rtc/cortina,gemini.txt
new file mode 100644 (file)
index 0000000..4ce4e79
--- /dev/null
@@ -0,0 +1,14 @@
+* Cortina Systems Gemini RTC
+
+Gemini SoC real-time clock.
+
+Required properties:
+- compatible : Should be "cortina,gemini-rtc"
+
+Examples:
+
+rtc@45000000 {
+       compatible = "cortina,gemini-rtc";
+       reg = <0x45000000 0x100>;
+       interrupts = <17 IRQ_TYPE_LEVEL_HIGH>;
+};
index c9d80d7..323cf26 100644 (file)
@@ -8,10 +8,13 @@ Required properties:
   region.
 - interrupts: rtc alarm interrupt
 
+Optional properties:
+- interrupts: dryice security violation interrupt
+
 Example:
 
 rtc@80056000 {
        compatible = "fsl,imx53-rtc", "fsl,imx25-rtc";
        reg = <0x80056000 2000>;
-       interrupts = <29>;
+       interrupts = <29 56>;
 };
index 1ad4c1c..85be53a 100644 (file)
@@ -1,7 +1,8 @@
 * Maxim DS3231 Real Time Clock
 
 Required properties:
-see: Documentation/devicetree/bindings/i2c/trivial-admin-guide/devices.rst
+- compatible: Should contain "maxim,ds3231".
+- reg: I2C address for chip.
 
 Optional property:
 - #clock-cells: Should be 1.
index 086c998..36984ac 100644 (file)
@@ -3,7 +3,8 @@
 Philips PCF8563/Epson RTC8564 Real Time Clock
 
 Required properties:
-see: Documentation/devicetree/bindings/i2c/trivial-admin-guide/devices.rst
+- compatible: Should contain "nxp,pcf8563".
+- reg: I2C address for chip.
 
 Optional property:
 - #clock-cells: Should be 0.
diff --git a/Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt
new file mode 100644 (file)
index 0000000..e2837b9
--- /dev/null
@@ -0,0 +1,27 @@
+STM32 Real Time Clock
+
+Required properties:
+- compatible: "st,stm32-rtc".
+- reg: address range of rtc register set.
+- clocks: reference to the clock entry ck_rtc.
+- interrupt-parent: phandle for the interrupt controller.
+- interrupts: rtc alarm interrupt.
+- st,syscfg: phandle for pwrcfg, mandatory to disable/enable backup domain
+  (RTC registers) write protection.
+
+Optional properties (to override default ck_rtc parent clock):
+- assigned-clocks: reference to the ck_rtc clock entry.
+- assigned-clock-parents: phandle of the new parent clock of ck_rtc.
+
+Example:
+
+       rtc: rtc@40002800 {
+               compatible = "st,stm32-rtc";
+               reg = <0x40002800 0x400>;
+               clocks = <&rcc 1 CLK_RTC>;
+               assigned-clocks = <&rcc 1 CLK_RTC>;
+               assigned-clock-parents = <&rcc 1 CLK_LSE>;
+               interrupt-parent = <&exti>;
+               interrupts = <17 1>;
+               st,syscfg = <&pwrcfg>;
+       };
index f007e42..9459349 100644 (file)
@@ -8,10 +8,20 @@ Required properties:
                  memory mapped region.
 - interrupts   : IRQ lines for the RTC alarm 0 and alarm 1, in that order.
 
+Required properties for new device trees:
+- clocks       : phandle to the 32kHz external oscillator
+- clock-output-names : name of the LOSC clock created
+- #clock-cells : must be equal to 1. The RTC provides two clocks: the
+                 LOSC and its external output, with index 0 and 1
+                 respectively.
+
 Example:
 
 rtc: rtc@01f00000 {
        compatible = "allwinner,sun6i-a31-rtc";
        reg = <0x01f00000 0x54>;
        interrupts = <0 40 4>, <0 41 4>;
+       clock-output-names = "osc32k";
+       clocks = <&ext_osc32k>;
+       #clock-cells = <1>;
 };
index c6e62cb..a0685c2 100644 (file)
@@ -10,7 +10,7 @@ From RK3368 SoCs, the GRF is divided into two sections,
 
 Required Properties:
 
-- compatible: GRF should be one of the followings
+- compatible: GRF should be one of the following:
    - "rockchip,rk3036-grf", "syscon": for rk3036
    - "rockchip,rk3066-grf", "syscon": for rk3066
    - "rockchip,rk3188-grf", "syscon": for rk3188
@@ -18,7 +18,7 @@ Required Properties:
    - "rockchip,rk3288-grf", "syscon": for rk3288
    - "rockchip,rk3368-grf", "syscon": for rk3368
    - "rockchip,rk3399-grf", "syscon": for rk3399
-- compatible: PMUGRF should be one of the followings
+- compatible: PMUGRF should be one of the following:
    - "rockchip,rk3368-pmugrf", "syscon": for rk3368
    - "rockchip,rk3399-pmugrf", "syscon": for rk3399
 - compatible: SGRF should be one of the following
index 4ea29aa..a6600f6 100644 (file)
@@ -5,7 +5,7 @@ audio data transfer between devices in the system.
 
 Required properties:
 
-- compatible: should be one of the followings
+- compatible: should be one of the following:
    - "rockchip,rk3066-i2s": for rk3066
    - "rockchip,rk3188-i2s", "rockchip,rk3066-i2s": for rk3188
    - "rockchip,rk3288-i2s", "rockchip,rk3066-i2s": for rk3288
@@ -17,7 +17,7 @@ Required properties:
        Documentation/devicetree/bindings/dma/dma.txt
 - dma-names: should include "tx" and "rx".
 - clocks: a list of phandle + clock-specifer pairs, one for each entry in clock-names.
-- clock-names: should contain followings:
+- clock-names: should contain the following:
    - "i2s_hclk": clock for I2S BUS
    - "i2s_clk" : clock for I2S controller
 - rockchip,playback-channels: max playback channels, if not set, 8 channels default.
index 3033bd8..3863531 100644 (file)
@@ -14,7 +14,7 @@ Required properties:
 - dma-names: should include "tx" and "rx".
 - clocks: a list of phandle + clock-specifer pairs, one for each entry
   in clock-names.
-- clock-names: should contain followings:
+- clock-names: should contain the following:
    - "apb": the parent APB clock for this controller
    - "codec": the parent module clock
 
index f4adc58..ee21da8 100644 (file)
@@ -5,7 +5,7 @@ audio data transfer between devices in the system.
 
 Required properties:
 
-- compatible: should be one of the followings
+- compatible: should be one of the following:
    - "allwinner,sun4i-a10-i2s"
    - "allwinner,sun6i-a31-i2s"
 - reg: physical base address of the controller and length of memory mapped
@@ -15,7 +15,7 @@ Required properties:
        Documentation/devicetree/bindings/dma/dma.txt
 - dma-names: should include "tx" and "rx".
 - clocks: a list of phandle + clock-specifer pairs, one for each entry in clock-names.
-- clock-names: should contain followings:
+- clock-names: should contain the following:
    - "apb" : clock for the I2S bus interface
    - "mod" : module clock for the I2S controller
 - #sound-dai-cells : Must be equal to 0
index 50a3e01..e5177cb 100644 (file)
@@ -179,6 +179,7 @@ struct autofs_dev_ioctl {
                                 * including this struct */
        __s32 ioctlfd;          /* automount command fd */
 
+       /* Command parameters */
        union {
                struct args_protover            protover;
                struct args_protosubver         protosubver;
index 8fac3fe..f10dd59 100644 (file)
@@ -65,7 +65,7 @@ directory is a mount trap only if the filesystem is mounted *direct*
 and the root is empty.
 
 Directories created in the root directory are mount traps only if the
-filesystem is mounted  *indirect* and they are empty.
+filesystem is mounted *indirect* and they are empty.
 
 Directories further down the tree depend on the *maxproto* mount
 option and particularly whether it is less than five or not.
@@ -352,7 +352,7 @@ Communicating with autofs: root directory ioctls
 ------------------------------------------------
 
 The root directory of an autofs filesystem will respond to a number of
-ioctls.   The process issuing the ioctl must have the CAP_SYS_ADMIN
+ioctls.  The process issuing the ioctl must have the CAP_SYS_ADMIN
 capability, or must be the automount daemon.
 
 The available ioctl commands are:
@@ -425,8 +425,20 @@ Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure:
                                          * including this struct */
                 __s32 ioctlfd;          /* automount command fd */
 
-                __u32 arg1;             /* Command parameters */
-                __u32 arg2;
+               /* Command parameters */
+               union {
+                       struct args_protover            protover;
+                       struct args_protosubver         protosubver;
+                       struct args_openmount           openmount;
+                       struct args_ready               ready;
+                       struct args_fail                fail;
+                       struct args_setpipefd           setpipefd;
+                       struct args_timeout             timeout;
+                       struct args_requester           requester;
+                       struct args_expire              expire;
+                       struct args_askumount           askumount;
+                       struct args_ismountpoint        ismountpoint;
+               };
 
                 char path[0];
         };
@@ -446,25 +458,22 @@ Commands are:
     set version numbers.
 - **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD**: return an open file descriptor
     on the root of an autofs filesystem.  The filesystem is identified
-    by name and device number, which is stored in `arg1`.  Device
-    numbers for existing filesystems can be found in
+    by name and device number, which is stored in `openmount.devid`.
+    Device numbers for existing filesystems can be found in
     `/proc/self/mountinfo`.
 - **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`.
 - **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in
     catatonic mode, this can provide the write end of a new pipe
-    in `arg1` to re-establish communication with a daemon.  The
-    process group of the calling process is used to identify the
+    in `setpipefd.pipefd` to re-establish communication with a daemon.
+    The process group of the calling process is used to identify the
     daemon.
 - **AUTOFS_DEV_IOCTL_REQUESTER_CMD**: `path` should be a
     name within the filesystem that has been auto-mounted on.
-    arg1 is the dev number of the underlying autofs.  On successful
-    return, `arg1` and `arg2` will be the UID and GID of the process
-    which triggered that mount.
-
+    On successful return, `requester.uid` and `requester.gid` will be
+    the UID and GID of the process which triggered that mount.
 - **AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD**: Check if path is a
     mountpoint of a particular type - see separate documentation for
     details.
-
 - **AUTOFS_DEV_IOCTL_PROTOVER_CMD**:
 - **AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD**:
 - **AUTOFS_DEV_IOCTL_READY_CMD**:
@@ -474,7 +483,7 @@ Commands are:
 - **AUTOFS_DEV_IOCTL_EXPIRE_CMD**:
 - **AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD**:  These all have the same
     function as the similarly named **AUTOFS_IOC** ioctls, except
-    that **FAIL** can be given an explicit error number in `arg1`
+    that **FAIL** can be given an explicit error number in `fail.status`
     instead of assuming `ENOENT`, and this **EXPIRE** command
     corresponds to **AUTOFS_IOC_EXPIRE_MULTI**.
 
@@ -512,7 +521,7 @@ always be mounted "shared". e.g.
 
 > `mount --make-shared /autofs/mount/point`
 
-The automount daemon is only able to mange a single mount location for
+The automount daemon is only able to manage a single mount location for
 an autofs filesystem and if mounts on that are not 'shared', other
 locations will not behave as expected.  In particular access to those
 other locations will likely result in the `ELOOP` error
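
As a hedged illustration of the arg1 -> named-union transition described above, a sketch of an OPENMOUNT call follows. The device number and mount path are hypothetical (real callers take them from /proc/self/mountinfo), and it assumes the updated <linux/auto_dev-ioctl.h> from this series is installed:

/* Sketch: AUTOFS_DEV_IOCTL_OPENMOUNT_CMD with the new union layout.
 * Device number 42 and the mount path are hypothetical.  On success the
 * open fd on the autofs root is returned in param->ioctlfd.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

int main(void)
{
        const char *path = "/mnt/autofs";       /* hypothetical mount */
        struct autofs_dev_ioctl *param;
        size_t sz = sizeof(*param) + strlen(path) + 1;
        int ctl = open("/dev/autofs", O_RDONLY);

        if (ctl < 0) {
                perror("/dev/autofs");
                return 1;
        }
        param = calloc(1, sz);
        if (!param)
                return 1;
        param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param->size = sz;
        param->ioctlfd = -1;
        param->openmount.devid = 42;    /* was "arg1" before this change */
        strcpy(param->path, path);

        if (ioctl(ctl, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0)
                perror("AUTOFS_DEV_IOCTL_OPENMOUNT");
        else
                printf("ioctlfd = %d\n", param->ioctlfd);
        free(param);
        return 0;
}
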
index f5306ee..0b302a1 100644 (file)
@@ -98,11 +98,10 @@ Mount Options
        size.
 
   rsize=X
-       Specify the maximum read size in bytes.  By default there is no
-       maximum.
+       Specify the maximum read size in bytes.  Default: 64 MB.
 
   rasize=X
-       Specify the maximum readahead.
+       Specify the maximum readahead.  Default: 8 MB.
 
   mount_timeout=X
        Specify the timeout value for mount (in seconds), in the case
index 29fc015..32874b0 100644 (file)
@@ -6,7 +6,7 @@ Quota subsystem allows system administrator to set limits on used space and
 number of used inodes (inode is a filesystem structure which is associated with
 each file or directory) for users and/or groups. For both used space and number
 of used inodes there are actually two limits. The first one is called softlimit
-and the second one hardlimit.  An user can never exceed a hardlimit for any
+and the second one hardlimit.  A user can never exceed a hardlimit for any
 resource (unless he has CAP_SYS_RESOURCE capability). User is allowed to exceed
 softlimit but only for limited period of time. This period is called "grace
 period" or "grace time". When grace time is over, user is not able to allocate
index d431dc8..5bd5903 100644 (file)
@@ -59,14 +59,14 @@ Install selftests
 =================
 
 You can use kselftest_install.sh tool installs selftests in default
-location which is tools/testing/selftests/kselftest or an user specified
+location which is tools/testing/selftests/kselftest or a user specified
 location.
 
 To install selftests in default location:
    $ cd tools/testing/selftests
    $ ./kselftest_install.sh
 
-To install selftests in an user specified location:
+To install selftests in a user specified location:
    $ cd tools/testing/selftests
    $ ./kselftest_install.sh install_dir
 
index 8124bf5..69b07e9 100644 (file)
@@ -20,7 +20,7 @@ existing low level CI API.
 ca_zap
 ~~~~~~
 
-An userspace application, like ``ca_zap`` is required to handle encrypted
+A userspace application, like ``ca_zap``, is required to handle encrypted
 MPEG-TS streams.
 
 The ``ca_zap`` userland application is in charge of sending the
index bf31411..899fd5c 100644 (file)
@@ -9,7 +9,7 @@ frontend parameters
 The kind of parameters passed to the frontend device for tuning depend
 on the kind of hardware you are using.
 
-The struct ``dvb_frontend_parameters`` uses an union with specific
+The struct ``dvb_frontend_parameters`` uses a union with specific
 per-system parameters. However, as newer delivery systems required more
 data, the structure size wasn't enough to fit them, and just extending its
 size would break the existing applications. So, those parameters were
@@ -23,7 +23,7 @@ So, newer applications should use
 instead, in order to be able to support the newer System Delivery like
 DVB-S2, DVB-T2, DVB-C2, ISDB, etc.
 
-All kinds of parameters are combined as an union in the
+All kinds of parameters are combined as a union in the
 FrontendParameters structure:
 
 
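For reference, the legacy DVBv3 layout from the uapi header pairs the
common fields with a per-delivery-system union (abbreviated here; the
field comments are editorial):

    struct dvb_frontend_parameters {
            __u32 frequency;  /* absolute Hz; intermediate kHz for DVB-S */
            fe_spectral_inversion_t inversion;
            union {
                    struct dvb_qpsk_parameters qpsk;  /* DVB-S */
                    struct dvb_qam_parameters  qam;   /* DVB-C */
                    struct dvb_ofdm_parameters ofdm;  /* DVB-T */
                    struct dvb_vsb_parameters  vsb;   /* ATSC  */
            } u;
    };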
index 5de846d..670f3de 100644 (file)
@@ -114,11 +114,11 @@ config options.
     Memory model -> Sparse Memory  (CONFIG_SPARSEMEM)
     Allow for memory hot-add       (CONFIG_MEMORY_HOTPLUG)
 
-- To enable memory removal, the followings are also necessary
+- To enable memory removal, the following are also necessary
     Allow for memory hot remove    (CONFIG_MEMORY_HOTREMOVE)
     Page Migration                 (CONFIG_MIGRATION)
 
-- For ACPI memory hotplug, the followings are also necessary
+- For ACPI memory hotplug, the following are also necessary
     Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY)
     This option can be built as a kernel module.
 
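Taken together, a .config fragment enabling hot-add, removal and the ACPI
driver might look like the following (illustrative only; availability of
these options depends on the architecture):

    CONFIG_SPARSEMEM=y
    CONFIG_MEMORY_HOTPLUG=y
    CONFIG_MEMORY_HOTREMOVE=y
    CONFIG_MIGRATION=y
    CONFIG_ACPI_HOTPLUG_MEMORY=m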
index a15ea60..b9482ca 100644 (file)
@@ -38,7 +38,7 @@ Basic usage
 ===========
 
 MBIM functions are inactive when unmanaged. The cdc_mbim driver only
-provides an userspace interface to the MBIM control channel, and will
+provides a userspace interface to the MBIM control channel, and will
 not participate in the management of the function. This implies that a
 userspace MBIM management application always is required to enable a
 MBIM function.
@@ -200,7 +200,7 @@ structure described in section 10.5.29 of [1].
 The DSS VLAN subdevices are used as a practical interface between the
 shared MBIM data channel and a MBIM DSS aware userspace application.
 It is not intended to be presented as-is to an end user. The
-assumption is that an userspace application initiating a DSS session
+assumption is that a userspace application initiating a DSS session
 also takes care of the necessary framing of the DSS data, presenting
 the stream to the end user in an appropriate way for the stream type.
 
index 00ffdf1..234ddab 100644 (file)
@@ -549,7 +549,7 @@ ii. Reduced by 1 max cmds sent to FW from Driver to make the reply_q_sz same
 3 Older Version   : 00.00.03.02
 
 i.     Send stop adapter to FW & Dump pending FW cmds before declaring adapter dead.
-       New varible added to set dbg level.
+       New variable added to set dbg level.
 ii.    Disable interrupt made as fn pointer as they are different for 1068 / 1078
 iii.   Frame count optimization. Main frame can contain 2 SGE for 64 bit SGLs and
        3 SGE for 32 bit SGL
index 168d0cf..9eeb9b4 100644 (file)
@@ -697,7 +697,7 @@ If it's a regression, at best, send alsa-info outputs of both working
 and non-working kernels.  This is really helpful because we can
 compare the codec registers directly.
 
-Send a bug report either the followings:
+Send a bug report to either of the following:
 
 kernel-bugzilla
     https://bugzilla.kernel.org/
index fe51a5a..0e5543a 100644 (file)
@@ -149,7 +149,7 @@ migration thread in the QEMU running in the destination node will
 receive the page that triggered the userfault and it'll map it as
 usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
 was spontaneously sent by the source or if it was an urgent page
-requested through an userfault).
+requested through a userfault).
 
 By the time the userfaults start, the QEMU in the destination node
 doesn't need to keep any per-page state bitmap relative to the live
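As a sketch of the receive side described above, resolving a single
userfault with UFFDIO_COPY; the uffd, destination address, source buffer
and page size are assumed to come from the migration code:

    /* Hedged sketch: copy a just-received page into the faulting range
     * and wake the faulting thread. */
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    static int resolve_fault(int uffd, unsigned long dst_addr,
                             void *src_page, unsigned long page_size)
    {
            struct uffdio_copy copy;

            memset(&copy, 0, sizeof(copy));
            copy.dst = dst_addr;                 /* page-aligned fault address */
            copy.src = (unsigned long)src_page;  /* buffer with page contents */
            copy.len = page_size;
            copy.mode = 0;                       /* 0 => wake the waiter */

            return ioctl(uffd, UFFDIO_COPY, &copy);
    }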
index d918d26..51cf6fa 100644 (file)
@@ -212,3 +212,117 @@ Finally we move core 4-7 over to the new group and make sure that the
 kernel and the tasks running there get 50% of the cache.
 
 # echo C0 > p0/cpus
+
+4) Locking between applications
+
+Certain operations on the resctrl filesystem, composed of read/writes
+to/from multiple files, must be atomic.
+
+As an example, the allocation of an exclusive reservation of L3 cache
+involves:
+
+  1. Read the cbmmasks from each directory
+  2. Find a contiguous set of bits in the global CBM bitmask that is not
+     set in any of the directory cbmmasks
+  3. Create a new directory
+  4. Set the bits found in step 2 to the new directory "schemata" file
+
+If two applications attempt to allocate space concurrently then they can
+end up allocating the same bits so the reservations are shared instead of
+exclusive.
+
+To coordinate atomic operations on the resctrlfs and to avoid the problem
+above, the following locking procedure is recommended:
+
+Locking is based on flock, which is available in libc and also as a shell
+script command.
+
+Write lock:
+
+ A) Take flock(LOCK_EX) on /sys/fs/resctrl
+ B) Read/write the directory structure.
+ C) Release the lock: flock(LOCK_UN)
+
+Read lock:
+
+ A) Take flock(LOCK_SH) on /sys/fs/resctrl
+ B) On success, read the directory structure.
+ C) Release the lock: flock(LOCK_UN)
+
+Example with bash:
+
+# Atomically read directory structure
+$ flock -s /sys/fs/resctrl/ find /sys/fs/resctrl
+
+# Read directory contents and create new subdirectory
+
+$ cat create-dir.sh
+find /sys/fs/resctrl/ > output.txt
+mask=$(function-of output.txt)   # compute the new CBM; pseudocode
+mkdir /sys/fs/resctrl/newres/
+echo $mask > /sys/fs/resctrl/newres/schemata
+
+$ flock /sys/fs/resctrl/ ./create-dir.sh
+
+Example with C:
+
+/*
+ * Example code to take advisory locks
+ * before accessing resctrl filesystem
+ */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/file.h>
+
+void resctrl_take_shared_lock(int fd)
+{
+       int ret;
+
+       /* take shared lock on resctrl filesystem */
+       ret = flock(fd, LOCK_SH);
+       if (ret) {
+               perror("flock");
+               exit(-1);
+       }
+}
+
+void resctrl_take_exclusive_lock(int fd)
+{
+       int ret;
+
+       /* take exclusive lock on resctrl filesystem */
+       ret = flock(fd, LOCK_EX);
+       if (ret) {
+               perror("flock");
+               exit(-1);
+       }
+}
+
+void resctrl_release_lock(int fd)
+{
+       int ret;
+
+       /* release lock on resctrl filesystem */
+       ret = flock(fd, LOCK_UN);
+       if (ret) {
+               perror("flock");
+               exit(-1);
+       }
+}
+
+int main(void)
+{
+       int fd;
+
+       fd = open("/sys/fs/resctrl", O_DIRECTORY);
+       if (fd == -1) {
+               perror("open");
+               exit(-1);
+       }
+       resctrl_take_shared_lock(fd);
+       /* code to read directory contents */
+       resctrl_release_lock(fd);
+
+       resctrl_take_exclusive_lock(fd);
+       /* code to read and write directory contents */
+       resctrl_release_lock(fd);
+       return 0;
+}
index 6cd8945..846f97a 100644 (file)
@@ -7286,6 +7286,7 @@ M:        Masami Hiramatsu <mhiramat@kernel.org>
 S:     Maintained
 F:     Documentation/kprobes.txt
 F:     include/linux/kprobes.h
+F:     include/asm-generic/kprobes.h
 F:     kernel/kprobes.c
 
 KS0108 LCD CONTROLLER DRIVER
index b83109b..4cb6b0a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -910,6 +910,18 @@ mod_sign_cmd = true
 endif
 export mod_sign_cmd
 
+ifdef CONFIG_STACK_VALIDATION
+  has_libelf := $(call try-run,\
+               echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0)
+  ifeq ($(has_libelf),1)
+    objtool_target := tools/objtool FORCE
+  else
+    $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+    SKIP_STACK_VALIDATION := 1
+    export SKIP_STACK_VALIDATION
+  endif
+endif
+
 
 ifeq ($(KBUILD_EXTMOD),)
 core-y         += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
@@ -1037,18 +1049,6 @@ prepare0: archprepare gcc-plugins
 # All the preparing..
 prepare: prepare0 prepare-objtool
 
-ifdef CONFIG_STACK_VALIDATION
-  has_libelf := $(call try-run,\
-               echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0)
-  ifeq ($(has_libelf),1)
-    objtool_target := tools/objtool FORCE
-  else
-    $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
-    SKIP_STACK_VALIDATION := 1
-    export SKIP_STACK_VALIDATION
-  endif
-endif
-
 PHONY += prepare-objtool
 prepare-objtool: $(objtool_target)
 
index d0012ad..cd211a1 100644 (file)
@@ -29,7 +29,7 @@ config OPROFILE_EVENT_MULTIPLEX
          The number of hardware counters is limited. The multiplexing
          feature enables OProfile to gather more events than counters
          are provided by the hardware. This is realized by switching
-         between events at an user specified time interval.
+         between events at a user specified time interval.
 
          If unsure, say N.
 
index 46e47c0..d103db5 100644 (file)
@@ -10,3 +10,4 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += trace_clock.h
 generic-y += current.h
+generic-y += kprobes.h
index 46bf263..acb4b14 100644 (file)
@@ -144,7 +144,7 @@ smp_callin(void)
                alpha_mv.smp_callin();
 
        /* All kernel threads share the same mm context.  */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 
        /* inform the notifiers about the new cpu */
index 944dbed..00bdbe1 100644 (file)
@@ -9,6 +9,8 @@
 #ifndef _ARC_KPROBES_H
 #define _ARC_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
 #ifdef CONFIG_KPROBES
 
 typedef u16 kprobe_opcode_t;
@@ -55,6 +57,6 @@ void trap_is_kprobe(unsigned long address, struct pt_regs *regs);
 static void trap_is_kprobe(unsigned long address, struct pt_regs *regs)
 {
 }
-#endif
+#endif /* CONFIG_KPROBES */
 
-#endif
+#endif /* _ARC_KPROBES_H */
index 2afbafa..b8e8d39 100644 (file)
@@ -139,8 +139,8 @@ void start_kernel_secondary(void)
        /* MMU, Caches, Vector Table, Interrupts etc */
        setup_processor();
 
-       atomic_inc(&mm->mm_users);
-       atomic_inc(&mm->mm_count);
+       mmget(mm);
+       mmgrab(mm);
        current->active_mm = mm;
        cpumask_set_cpu(cpu, mm_cpumask(mm));
 
index 61fd1ce..b6e4f7a 100644 (file)
@@ -1051,9 +1051,9 @@ int arc_unwind(struct unwind_frame_info *frame)
                ++ptr;
        }
        if (cie != NULL) {
-               /* get code aligment factor */
+               /* get code alignment factor */
                state.codeAlign = get_uleb128(&ptr, end);
-               /* get data aligment factor */
+               /* get data alignment factor */
                state.dataAlign = get_sleb128(&ptr, end);
                if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
                        cie = NULL;
index fda6a46..0d4e71b 100644 (file)
@@ -2,6 +2,7 @@ config ARM
        bool
        default y
        select ARCH_CLOCKSOURCE_DATA
+       select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_SET_MEMORY
index aed66d5..b757634 100644 (file)
@@ -34,8 +34,7 @@ config PROCESSOR_ID
          used instead of the auto-probing which utilizes the register.
 
 config REMAP_VECTORS_TO_RAM
-       bool 'Install vectors to the beginning of RAM' if DRAM_BASE
-       depends on DRAM_BASE
+       bool 'Install vectors to the beginning of RAM'
        help
          The kernel needs to change the hardware exception vectors.
          In nommu mode, the hardware exception vectors are normally
index a0765e7..ea78327 100644 (file)
@@ -32,6 +32,7 @@ extern void error(char *);
 
 /* Not needed, but used in some headers pulled in by decompressors */
 extern char * strstr(const char * s1, const char *s2);
+extern size_t strlen(const char *s);
 
 #ifdef CONFIG_KERNEL_GZIP
 #include "../../../../lib/decompress_inflate.c"
index fc6d541..9150f97 100644 (file)
@@ -1196,7 +1196,7 @@ skip:
                bgt     loop1
 finished:
                ldmfd   sp!, {r0-r7, r9-r11}
-               mov     r10, #0                 @ swith back to cache level 0
+               mov     r10, #0                 @ switch back to cache level 0
                mcr     p15, 2, r10, c0, c0, 0  @ select current cache level in cssr
 iflush:
                mcr     p15, 0, r10, c7, c10, 4 @ DSB
index a923524..cf06247 100644 (file)
@@ -144,7 +144,7 @@ extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER];
 
 void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
 {
-       unsigned long val = ptr ? virt_to_phys(ptr) : 0;
+       unsigned long val = ptr ? __pa_symbol(ptr) : 0;
        mcpm_entry_vectors[cluster][cpu] = val;
        sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
 }
@@ -299,8 +299,8 @@ void mcpm_cpu_power_down(void)
         * the kernel as if the power_up method just had deasserted reset
         * on the CPU.
         */
-       phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
-       phys_reset(virt_to_phys(mcpm_entry_point));
+       phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);
+       phys_reset(__pa_symbol(mcpm_entry_point));
 
        /* should never get here */
        BUG();
@@ -388,8 +388,8 @@ static int __init nocache_trampoline(unsigned long _arg)
        __mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
        __mcpm_cpu_down(cpu, cluster);
 
-       phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
-       phys_reset(virt_to_phys(mcpm_entry_point));
+       phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);
+       phys_reset(__pa_symbol(mcpm_entry_point));
        BUG();
 }
 
@@ -449,7 +449,7 @@ int __init mcpm_sync_init(
        sync_cache_w(&mcpm_sync);
 
        if (power_up_setup) {
-               mcpm_power_up_setup_phys = virt_to_phys(power_up_setup);
+               mcpm_power_up_setup_phys = __pa_symbol(power_up_setup);
                sync_cache_w(&mcpm_power_up_setup_phys);
        }
 
index eaa60da..0ef42ae 100644 (file)
@@ -16,7 +16,7 @@
 #ifndef __CACHE_UNIPHIER_H
 #define __CACHE_UNIPHIER_H
 
-#include <linux/types.h>
+#include <linux/errno.h>
 
 #ifdef CONFIG_CACHE_UNIPHIER
 int uniphier_cache_init(void);
index 3ea9be5..5965545 100644 (file)
@@ -16,6 +16,9 @@
 #ifndef _ARM_KPROBES_H
 #define _ARM_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/notifier.h>
@@ -83,4 +86,5 @@ struct arch_optimized_insn {
         */
 };
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ARM_KPROBES_H */
index 4ca69fe..bada3f8 100644 (file)
@@ -22,7 +22,7 @@ struct mtd_info;
  * set_vpp:    method called to enable or disable VPP
  * mmcontrol:  method called to enable or disable Sync. Burst Read in OneNAND
  * parts:      optional array of mtd_partitions for static partitioning
- * nr_parts:   number of mtd_partitions for static partitoning
+ * nr_parts:   number of mtd_partitions for static partitioning
  */
 struct flash_platform_data {
        const char      *map_name;
index 76cbd9c..1f54e4e 100644 (file)
 #define IOREMAP_MAX_ORDER      24
 #endif
 
+#define VECTORS_BASE           UL(0xffff0000)
+
 #else /* CONFIG_MMU */
 
+#ifndef __ASSEMBLY__
+extern unsigned long vectors_base;
+#define VECTORS_BASE           vectors_base
+#endif
+
 /*
  * The limitation of user task size can grow up to the end of free ram region.
  * It is difficult to define and perhaps will never meet the original meaning
 
 #endif /* !CONFIG_MMU */
 
+#ifdef CONFIG_XIP_KERNEL
+#define KERNEL_START           _sdata
+#else
+#define KERNEL_START           _stext
+#endif
+#define KERNEL_END             _end
+
 /*
  * We fix the TCM memories max 32 KiB ITCM resp DTCM at these
  * locations
@@ -206,7 +220,7 @@ extern const void *__pv_table_begin, *__pv_table_end;
        : "r" (x), "I" (__PV_BITS_31_24)                \
        : "cc")
 
-static inline phys_addr_t __virt_to_phys(unsigned long x)
+static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
 {
        phys_addr_t t;
 
@@ -238,7 +252,7 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
 #define PHYS_OFFSET    PLAT_PHYS_OFFSET
 #define PHYS_PFN_OFFSET        ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))
 
-static inline phys_addr_t __virt_to_phys(unsigned long x)
+static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
 {
        return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
 }
@@ -254,6 +268,16 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
        ((((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \
         PHYS_PFN_OFFSET)
 
+#define __pa_symbol_nodebug(x) __virt_to_phys_nodebug((x))
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern phys_addr_t __virt_to_phys(unsigned long x);
+extern phys_addr_t __phys_addr_symbol(unsigned long x);
+#else
+#define __virt_to_phys(x)      __virt_to_phys_nodebug(x)
+#define __phys_addr_symbol(x)  __pa_symbol_nodebug(x)
+#endif
+
 /*
  * These are *only* valid on the kernel direct mapped RAM memory.
  * Note: Drivers should NOT use these.  They are the wrong
@@ -276,6 +300,7 @@ static inline void *phys_to_virt(phys_addr_t x)
  * Drivers should NOT use these either.
  */
 #define __pa(x)                        __virt_to_phys((unsigned long)(x))
+#define __pa_symbol(x)         __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0))
 #define __va(x)                        ((void *)__phys_to_virt((phys_addr_t)(x)))
 #define pfn_to_kaddr(pfn)      __va((phys_addr_t)(pfn) << PAGE_SHIFT)
 
index add094d..302240c 100644 (file)
@@ -63,9 +63,9 @@ typedef pte_t *pte_addr_t;
 /*
  * Mark the prot value as uncacheable and unbufferable.
  */
-#define pgprot_noncached(prot) __pgprot(0)
-#define pgprot_writecombine(prot) __pgprot(0)
-#define pgprot_dmacoherent(prot) __pgprot(0)
+#define pgprot_noncached(prot) (prot)
+#define pgprot_writecombine(prot) (prot)
+#define pgprot_dmacoherent(prot) (prot)
 
 
 /*
index 6b4eb27..2e21e08 100644 (file)
@@ -152,11 +152,6 @@ __after_proc_init:
 #ifdef CONFIG_CPU_ICACHE_DISABLE
        bic     r0, r0, #CR_I
 #endif
-#ifdef CONFIG_CPU_HIGH_VECTOR
-       orr     r0, r0, #CR_V
-#else
-       bic     r0, r0, #CR_V
-#endif
        mcr     p15, 0, r0, c1, c0, 0           @ write control reg
 #elif defined (CONFIG_CPU_V7M)
        /* For V7M systems we want to modify the CCR similarly to the SCTLR */
index 4f14b5c..80254b4 100644 (file)
@@ -155,8 +155,17 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                       break;
 
                case R_ARM_PREL31:
-                       offset = *(u32 *)loc + sym->st_value - loc;
-                       *(u32 *)loc = offset & 0x7fffffff;
+                       offset = (*(s32 *)loc << 1) >> 1; /* sign extend */
+                       offset += sym->st_value - loc;
+                       if (offset >= 0x40000000 || offset < -0x40000000) {
+                               pr_err("%s: section %u reloc %u sym '%s': relocation %u out of range (%#lx -> %#x)\n",
+                                      module->name, relindex, i, symname,
+                                      ELF32_R_TYPE(rel->r_info), loc,
+                                      sym->st_value);
+                               return -ENOEXEC;
+                       }
+                       *(u32 *)loc &= 0x80000000;
+                       *(u32 *)loc |= offset & 0x7fffffff;
                        break;
 
                case R_ARM_MOVW_ABS_NC:
index 34e3f3c..f4e5450 100644 (file)
@@ -81,7 +81,7 @@ __setup("fpe=", fpe_setup);
 extern void init_default_cache_policy(unsigned long);
 extern void paging_init(const struct machine_desc *desc);
 extern void early_paging_init(const struct machine_desc *);
-extern void sanity_check_meminfo(void);
+extern void adjust_lowmem_bounds(void);
 extern enum reboot_mode reboot_mode;
 extern void setup_dma_zone(const struct machine_desc *desc);
 
@@ -1093,8 +1093,14 @@ void __init setup_arch(char **cmdline_p)
        setup_dma_zone(mdesc);
        xen_early_init();
        efi_init();
-       sanity_check_meminfo();
+       /*
+        * Make sure the calculation for lowmem/highmem is set appropriately
+        * before reserving/allocating any mmeory
+        */
+       adjust_lowmem_bounds();
        arm_memblock_init(mdesc);
+       /* Memory may have been removed so recalculate the bounds. */
+       adjust_lowmem_bounds();
 
        early_ioremap_reset();
 
index 7dd14e8..5a07c5a 100644 (file)
@@ -251,7 +251,7 @@ void __cpu_die(unsigned int cpu)
                pr_err("CPU%u: cpu didn't die\n", cpu);
                return;
        }
-       pr_notice("CPU%u: shutdown\n", cpu);
+       pr_debug("CPU%u: shutdown\n", cpu);
 
        /*
         * platform_cpu_kill() is generally expected to do the powering off
@@ -371,7 +371,7 @@ asmlinkage void secondary_start_kernel(void)
         * reference and switch to it.
         */
        cpu = smp_processor_id();
-       atomic_inc(&mm->mm_count);
+       mmgrab(mm);
        current->active_mm = mm;
        cpumask_set_cpu(cpu, mm_cpumask(mm));
 
index dd77ea2..6dc6d49 100644 (file)
@@ -27,7 +27,7 @@ static int alpine_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
        phys_addr_t addr;
 
-       addr = virt_to_phys(secondary_startup);
+       addr = __pa_symbol(secondary_startup);
 
        if (addr > (phys_addr_t)(uint32_t)(-1)) {
                pr_err("FAIL: resume address over 32bit (%pa)", &addr);
index ffbd71d..502e3df 100644 (file)
@@ -25,7 +25,7 @@
 static void write_release_addr(u32 release_phys)
 {
        u32 *virt = (u32 *) phys_to_virt(release_phys);
-       writel_relaxed(virt_to_phys(secondary_startup), virt);
+       writel_relaxed(__pa_symbol(secondary_startup), virt);
        /* Make sure this store is visible to other CPUs */
        smp_wmb();
        __cpuc_flush_dcache_area(virt, sizeof(u32));
index 9b6727e..f5fb10b 100644 (file)
@@ -135,7 +135,7 @@ static int bcm63138_smp_boot_secondary(unsigned int cpu,
        }
 
        /* Write the secondary init routine to the BootLUT reset vector */
-       val = virt_to_phys(secondary_startup);
+       val = __pa_symbol(secondary_startup);
        writel_relaxed(val, bootlut_base + BOOTLUT_RESET_VECT);
 
        /* Power up the core, will jump straight to its reset vector when we
index 40dc844..1237996 100644 (file)
@@ -151,7 +151,7 @@ static void brcmstb_cpu_boot(u32 cpu)
         * Set the reset vector to point to the secondary_startup
         * routine
         */
-       cpu_set_boot_addr(cpu, virt_to_phys(secondary_startup));
+       cpu_set_boot_addr(cpu, __pa_symbol(secondary_startup));
 
        /* Unhalt the cpu */
        cpu_rst_cfg_set(cpu, 0);
index 3ac3a9b..582886d 100644 (file)
@@ -116,7 +116,7 @@ static int nsp_write_lut(unsigned int cpu)
                return -ENOMEM;
        }
 
-       secondary_startup_phy = virt_to_phys(secondary_startup);
+       secondary_startup_phy = __pa_symbol(secondary_startup);
        BUG_ON(secondary_startup_phy > (phys_addr_t)U32_MAX);
 
        writel_relaxed(secondary_startup_phy, sku_rom_lut);
@@ -189,7 +189,7 @@ static int kona_boot_secondary(unsigned int cpu, struct task_struct *idle)
         * Secondary cores will start in secondary_startup(),
         * defined in "arch/arm/kernel/head.S"
         */
-       boot_func = virt_to_phys(secondary_startup);
+       boot_func = __pa_symbol(secondary_startup);
        BUG_ON(boot_func & BOOT_ADDR_CPUID_MASK);
        BUG_ON(boot_func > (phys_addr_t)U32_MAX);
 
index 93f9068..7586b7a 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/cp15.h>
+#include <asm/memory.h>
 #include <asm/smp_plat.h>
 #include <asm/smp_scu.h>
 
@@ -75,7 +76,7 @@ static void __init berlin_smp_prepare_cpus(unsigned int max_cpus)
        if (!cpu_ctrl)
                goto unmap_scu;
 
-       vectors_base = ioremap(CONFIG_VECTORS_BASE, SZ_32K);
+       vectors_base = ioremap(VECTORS_BASE, SZ_32K);
        if (!vectors_base)
                goto unmap_scu;
 
@@ -92,7 +93,7 @@ static void __init berlin_smp_prepare_cpus(unsigned int max_cpus)
         * Write the secondary startup address into the SW reset address
         * vector. This is used by boot_inst.
         */
-       writel(virt_to_phys(secondary_startup), vectors_base + SW_RESET_ADDR);
+       writel(__pa_symbol(secondary_startup), vectors_base + SW_RESET_ADDR);
 
        iounmap(vectors_base);
 unmap_scu:
index 3b39ea3..8a5b6f0 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
-#include <linux/platform_data/rtc-m48t86.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
 
@@ -45,16 +44,6 @@ static struct map_desc ts72xx_io_desc[] __initdata = {
                .pfn            = __phys_to_pfn(TS72XX_OPTIONS2_PHYS_BASE),
                .length         = TS72XX_OPTIONS2_SIZE,
                .type           = MT_DEVICE,
-       }, {
-               .virtual        = (unsigned long)TS72XX_RTC_INDEX_VIRT_BASE,
-               .pfn            = __phys_to_pfn(TS72XX_RTC_INDEX_PHYS_BASE),
-               .length         = TS72XX_RTC_INDEX_SIZE,
-               .type           = MT_DEVICE,
-       }, {
-               .virtual        = (unsigned long)TS72XX_RTC_DATA_VIRT_BASE,
-               .pfn            = __phys_to_pfn(TS72XX_RTC_DATA_PHYS_BASE),
-               .length         = TS72XX_RTC_DATA_SIZE,
-               .type           = MT_DEVICE,
        }
 };
 
@@ -179,31 +168,22 @@ static void __init ts72xx_register_flash(void)
        }
 }
 
+/*************************************************************************
+ * RTC M48T86
+ *************************************************************************/
+#define TS72XX_RTC_INDEX_PHYS_BASE     (EP93XX_CS1_PHYS_BASE + 0x00800000)
+#define TS72XX_RTC_DATA_PHYS_BASE      (EP93XX_CS1_PHYS_BASE + 0x01700000)
 
-static unsigned char ts72xx_rtc_readbyte(unsigned long addr)
-{
-       __raw_writeb(addr, TS72XX_RTC_INDEX_VIRT_BASE);
-       return __raw_readb(TS72XX_RTC_DATA_VIRT_BASE);
-}
-
-static void ts72xx_rtc_writebyte(unsigned char value, unsigned long addr)
-{
-       __raw_writeb(addr, TS72XX_RTC_INDEX_VIRT_BASE);
-       __raw_writeb(value, TS72XX_RTC_DATA_VIRT_BASE);
-}
-
-static struct m48t86_ops ts72xx_rtc_ops = {
-       .readbyte       = ts72xx_rtc_readbyte,
-       .writebyte      = ts72xx_rtc_writebyte,
+static struct resource ts72xx_rtc_resources[] = {
+       DEFINE_RES_MEM(TS72XX_RTC_INDEX_PHYS_BASE, 0x01),
+       DEFINE_RES_MEM(TS72XX_RTC_DATA_PHYS_BASE, 0x01),
 };
 
 static struct platform_device ts72xx_rtc_device = {
        .name           = "rtc-m48t86",
        .id             = -1,
-       .dev            = {
-               .platform_data  = &ts72xx_rtc_ops,
-       },
-       .num_resources  = 0,
+       .resource       = ts72xx_rtc_resources,
+       .num_resources  = ARRAY_SIZE(ts72xx_rtc_resources),
 };
 
 static struct resource ts72xx_wdt_resources[] = {
index 071feaa..2255ba2 100644 (file)
@@ -9,8 +9,6 @@
  * febff000    22000000        4K      model number register (bits 0-2)
  * febfe000    22400000        4K      options register
  * febfd000    22800000        4K      options register #2
- * febf9000    10800000        4K      TS-5620 RTC index register
- * febf8000    11700000        4K      TS-5620 RTC data register
  */
 
 #define TS72XX_MODEL_PHYS_BASE         0x22000000
 #define TS72XX_OPTIONS2_TS9420         0x04
 #define TS72XX_OPTIONS2_TS9420_BOOT    0x02
 
-
-#define TS72XX_RTC_INDEX_VIRT_BASE     IOMEM(0xfebf9000)
-#define TS72XX_RTC_INDEX_PHYS_BASE     0x10800000
-#define TS72XX_RTC_INDEX_SIZE          0x00001000
-
-#define TS72XX_RTC_DATA_VIRT_BASE      IOMEM(0xfebf8000)
-#define TS72XX_RTC_DATA_PHYS_BASE      0x11700000
-#define TS72XX_RTC_DATA_SIZE           0x00001000
-
 #define TS72XX_WDT_CONTROL_PHYS_BASE   0x23800000
 #define TS72XX_WDT_FEED_PHYS_BASE      0x23c00000
 
index fd6da54..e81a78b 100644 (file)
@@ -41,7 +41,7 @@ static int exynos_do_idle(unsigned long mode)
        case FW_DO_IDLE_AFTR:
                if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
                        exynos_save_cp15();
-               writel_relaxed(virt_to_phys(exynos_cpu_resume_ns),
+               writel_relaxed(__pa_symbol(exynos_cpu_resume_ns),
                               sysram_ns_base_addr + 0x24);
                writel_relaxed(EXYNOS_AFTR_MAGIC, sysram_ns_base_addr + 0x20);
                if (soc_is_exynos3250()) {
@@ -135,7 +135,7 @@ static int exynos_suspend(void)
                exynos_save_cp15();
 
        writel(EXYNOS_SLEEP_MAGIC, sysram_ns_base_addr + EXYNOS_BOOT_FLAG);
-       writel(virt_to_phys(exynos_cpu_resume_ns),
+       writel(__pa_symbol(exynos_cpu_resume_ns),
                sysram_ns_base_addr + EXYNOS_BOOT_ADDR);
 
        return cpu_suspend(0, exynos_cpu_suspend);
index 038fd8c..b426225 100644 (file)
@@ -221,7 +221,7 @@ static void exynos_mcpm_setup_entry_point(void)
         */
        __raw_writel(0xe59f0000, ns_sram_base_addr);     /* ldr r0, [pc, #0] */
        __raw_writel(0xe12fff10, ns_sram_base_addr + 4); /* bx  r0 */
-       __raw_writel(virt_to_phys(mcpm_entry_point), ns_sram_base_addr + 8);
+       __raw_writel(__pa_symbol(mcpm_entry_point), ns_sram_base_addr + 8);
 }
 
 static struct syscore_ops exynos_mcpm_syscore_ops = {
index a5d6841..5a03bff 100644 (file)
@@ -353,7 +353,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
 
                smp_rmb();
 
-               boot_addr = virt_to_phys(exynos4_secondary_startup);
+               boot_addr = __pa_symbol(exynos4_secondary_startup);
 
                ret = exynos_set_boot_addr(core_id, boot_addr);
                if (ret)
@@ -413,7 +413,7 @@ static void __init exynos_smp_prepare_cpus(unsigned int max_cpus)
 
                mpidr = cpu_logical_map(i);
                core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-               boot_addr = virt_to_phys(exynos4_secondary_startup);
+               boot_addr = __pa_symbol(exynos4_secondary_startup);
 
                ret = exynos_set_boot_addr(core_id, boot_addr);
                if (ret)
index 487295f..1a7e5b5 100644 (file)
@@ -132,7 +132,7 @@ static void exynos_set_wakeupmask(long mask)
 
 static void exynos_cpu_set_boot_vector(long flags)
 {
-       writel_relaxed(virt_to_phys(exynos_cpu_resume),
+       writel_relaxed(__pa_symbol(exynos_cpu_resume),
                       exynos_boot_vector_addr());
        writel_relaxed(flags, exynos_boot_vector_flag());
 }
@@ -238,7 +238,7 @@ static int exynos_cpu0_enter_aftr(void)
 
 abort:
        if (cpu_online(1)) {
-               unsigned long boot_addr = virt_to_phys(exynos_cpu_resume);
+               unsigned long boot_addr = __pa_symbol(exynos_cpu_resume);
 
                /*
                 * Set the boot vector to something non-zero
@@ -330,7 +330,7 @@ cpu1_aborted:
 
 static void exynos_pre_enter_aftr(void)
 {
-       unsigned long boot_addr = virt_to_phys(exynos_cpu_resume);
+       unsigned long boot_addr = __pa_symbol(exynos_cpu_resume);
 
        (void)exynos_set_boot_addr(1, boot_addr);
 }
index adf4e8f..748cfb8 100644 (file)
@@ -301,7 +301,7 @@ static void exynos_pm_prepare(void)
        exynos_pm_enter_sleep_mode();
 
        /* ensure at least INFORM0 has the resume address */
-       pmu_raw_writel(virt_to_phys(exynos_cpu_resume), S5P_INFORM0);
+       pmu_raw_writel(__pa_symbol(exynos_cpu_resume), S5P_INFORM0);
 }
 
 static void exynos3250_pm_prepare(void)
@@ -318,7 +318,7 @@ static void exynos3250_pm_prepare(void)
        exynos_pm_enter_sleep_mode();
 
        /* ensure at least INFORM0 has the resume address */
-       pmu_raw_writel(virt_to_phys(exynos_cpu_resume), S5P_INFORM0);
+       pmu_raw_writel(__pa_symbol(exynos_cpu_resume), S5P_INFORM0);
 }
 
 static void exynos5420_pm_prepare(void)
@@ -343,7 +343,7 @@ static void exynos5420_pm_prepare(void)
 
        /* ensure at least INFORM0 has the resume address */
        if (IS_ENABLED(CONFIG_EXYNOS5420_MCPM))
-               pmu_raw_writel(virt_to_phys(mcpm_entry_point), S5P_INFORM0);
+               pmu_raw_writel(__pa_symbol(mcpm_entry_point), S5P_INFORM0);
 
        tmp = pmu_raw_readl(EXYNOS_L2_OPTION(0));
        tmp &= ~EXYNOS_L2_USE_RETENTION;
index 4b653a8..a6c1176 100644 (file)
@@ -327,7 +327,7 @@ static int __init hip04_smp_init(void)
         */
        writel_relaxed(hip04_boot_method[0], relocation);
        writel_relaxed(0xa5a5a5a5, relocation + 4);     /* magic number */
-       writel_relaxed(virt_to_phys(secondary_startup), relocation + 8);
+       writel_relaxed(__pa_symbol(secondary_startup), relocation + 8);
        writel_relaxed(0, relocation + 12);
        iounmap(relocation);
 
index e1d6764..91bb02d 100644 (file)
@@ -28,7 +28,7 @@ void hi3xxx_set_cpu_jump(int cpu, void *jump_addr)
        cpu = cpu_logical_map(cpu);
        if (!cpu || !ctrl_base)
                return;
-       writel_relaxed(virt_to_phys(jump_addr), ctrl_base + ((cpu - 1) << 2));
+       writel_relaxed(__pa_symbol(jump_addr), ctrl_base + ((cpu - 1) << 2));
 }
 
 int hi3xxx_get_cpu_jump(int cpu)
@@ -118,7 +118,7 @@ static int hix5hd2_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
        phys_addr_t jumpaddr;
 
-       jumpaddr = virt_to_phys(secondary_startup);
+       jumpaddr = __pa_symbol(secondary_startup);
        hix5hd2_set_scu_boot_addr(HIX5HD2_BOOT_ADDRESS, jumpaddr);
        hix5hd2_set_cpu(cpu, true);
        arch_send_wakeup_ipi_mask(cpumask_of(cpu));
@@ -156,7 +156,7 @@ static int hip01_boot_secondary(unsigned int cpu, struct task_struct *idle)
        struct device_node *node;
 
 
-       jumpaddr = virt_to_phys(secondary_startup);
+       jumpaddr = __pa_symbol(secondary_startup);
        hip01_set_boot_addr(HIP01_BOOT_ADDRESS, jumpaddr);
 
        node = of_find_compatible_node(NULL, NULL, "hisilicon,hip01-sysctrl");
index 711dbbd..c2d1b32 100644 (file)
@@ -117,7 +117,7 @@ static void __init ls1021a_smp_prepare_cpus(unsigned int max_cpus)
        dcfg_base = of_iomap(np, 0);
        BUG_ON(!dcfg_base);
 
-       paddr = virt_to_phys(secondary_startup);
+       paddr = __pa_symbol(secondary_startup);
        writel_relaxed(cpu_to_be32(paddr), dcfg_base + DCFG_CCSR_SCRATCHRW1);
 
        iounmap(dcfg_base);
index 1515e49..e61b1d1 100644 (file)
@@ -499,7 +499,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
        memset(suspend_ocram_base, 0, sizeof(*pm_info));
        pm_info = suspend_ocram_base;
        pm_info->pbase = ocram_pbase;
-       pm_info->resume_addr = virt_to_phys(v7_cpu_resume);
+       pm_info->resume_addr = __pa_symbol(v7_cpu_resume);
        pm_info->pm_info_size = sizeof(*pm_info);
 
        /*
index 70b083f..495d85d 100644 (file)
@@ -99,7 +99,7 @@ void imx_enable_cpu(int cpu, bool enable)
 void imx_set_cpu_jump(int cpu, void *jump_addr)
 {
        cpu = cpu_logical_map(cpu);
-       writel_relaxed(virt_to_phys(jump_addr),
+       writel_relaxed(__pa_symbol(jump_addr),
                       src_base + SRC_GPR1 + cpu * 8);
 }
 
index b821e34..726eb69 100644 (file)
@@ -122,7 +122,7 @@ static void __init __mtk_smp_prepare_cpus(unsigned int max_cpus, int trustzone)
         * write the address of slave startup address into the system-wide
         * jump register
         */
-       writel_relaxed(virt_to_phys(secondary_startup_arm),
+       writel_relaxed(__pa_symbol(secondary_startup_arm),
                        mtk_smp_base + mtk_smp_info->jump_reg);
 }
 
index 2990c52..c487be6 100644 (file)
@@ -110,7 +110,7 @@ static void mvebu_pm_store_armadaxp_bootinfo(u32 *store_addr)
 {
        phys_addr_t resume_pc;
 
-       resume_pc = virt_to_phys(armada_370_xp_cpu_resume);
+       resume_pc = __pa_symbol(armada_370_xp_cpu_resume);
 
        /*
         * The bootloader expects the first two words to be a magic
index f39bd51..27a78c8 100644 (file)
@@ -112,7 +112,7 @@ static const struct of_device_id of_pmsu_table[] = {
 
 void mvebu_pmsu_set_cpu_boot_addr(int hw_cpu, void *boot_addr)
 {
-       writel(virt_to_phys(boot_addr), pmsu_mp_base +
+       writel(__pa_symbol(boot_addr), pmsu_mp_base +
                PMSU_BOOT_ADDR_REDIRECT_OFFSET(hw_cpu));
 }
 
index 76cbc82..04d9ebe 100644 (file)
@@ -153,7 +153,7 @@ void mvebu_system_controller_set_cpu_boot_addr(void *boot_addr)
        if (of_machine_is_compatible("marvell,armada375"))
                mvebu_armada375_smp_wa_init();
 
-       writel(virt_to_phys(boot_addr), system_controller_base +
+       writel(__pa_symbol(boot_addr), system_controller_base +
               mvebu_sc->resume_boot_addr);
 }
 #endif
index 1662071..bd8089f 100644 (file)
@@ -315,15 +315,15 @@ void omap3_save_scratchpad_contents(void)
        scratchpad_contents.boot_config_ptr = 0x0;
        if (cpu_is_omap3630())
                scratchpad_contents.public_restore_ptr =
-                       virt_to_phys(omap3_restore_3630);
+                       __pa_symbol(omap3_restore_3630);
        else if (omap_rev() != OMAP3430_REV_ES3_0 &&
                                        omap_rev() != OMAP3430_REV_ES3_1 &&
                                        omap_rev() != OMAP3430_REV_ES3_1_2)
                scratchpad_contents.public_restore_ptr =
-                       virt_to_phys(omap3_restore);
+                       __pa_symbol(omap3_restore);
        else
                scratchpad_contents.public_restore_ptr =
-                       virt_to_phys(omap3_restore_es3);
+                       __pa_symbol(omap3_restore_es3);
 
        if (omap_type() == OMAP2_DEVICE_TYPE_GP)
                scratchpad_contents.secure_ram_restore_ptr = 0x0;
@@ -395,7 +395,7 @@ void omap3_save_scratchpad_contents(void)
        sdrc_block_contents.flags = 0x0;
        sdrc_block_contents.block_size = 0x0;
 
-       arm_context_addr = virt_to_phys(omap3_arm_context);
+       arm_context_addr = __pa_symbol(omap3_arm_context);
 
        /* Copy all the contents to the scratchpad location */
        scratchpad_address = OMAP2_L4_IO_ADDRESS(OMAP343X_SCRATCHPAD);
index 7d62ad4..113ab2d 100644 (file)
@@ -273,7 +273,7 @@ int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state)
        cpu_clear_prev_logic_pwrst(cpu);
        pwrdm_set_next_pwrst(pm_info->pwrdm, power_state);
        pwrdm_set_logic_retst(pm_info->pwrdm, cpu_logic_state);
-       set_cpu_wakeup_addr(cpu, virt_to_phys(omap_pm_ops.resume));
+       set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.resume));
        omap_pm_ops.scu_prepare(cpu, power_state);
        l2x0_pwrst_prepare(cpu, save_state);
 
@@ -325,7 +325,7 @@ int omap4_hotplug_cpu(unsigned int cpu, unsigned int power_state)
 
        pwrdm_clear_all_prev_pwrst(pm_info->pwrdm);
        pwrdm_set_next_pwrst(pm_info->pwrdm, power_state);
-       set_cpu_wakeup_addr(cpu, virt_to_phys(omap_pm_ops.hotplug_restart));
+       set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.hotplug_restart));
        omap_pm_ops.scu_prepare(cpu, power_state);
 
        /*
@@ -467,13 +467,13 @@ void __init omap4_mpuss_early_init(void)
        sar_base = omap4_get_sar_ram_base();
 
        if (cpu_is_omap443x())
-               startup_pa = virt_to_phys(omap4_secondary_startup);
+               startup_pa = __pa_symbol(omap4_secondary_startup);
        else if (cpu_is_omap446x())
-               startup_pa = virt_to_phys(omap4460_secondary_startup);
+               startup_pa = __pa_symbol(omap4460_secondary_startup);
        else if ((__boot_cpu_mode & MODE_MASK) == HYP_MODE)
-               startup_pa = virt_to_phys(omap5_secondary_hyp_startup);
+               startup_pa = __pa_symbol(omap5_secondary_hyp_startup);
        else
-               startup_pa = virt_to_phys(omap5_secondary_startup);
+               startup_pa = __pa_symbol(omap5_secondary_startup);
 
        if (cpu_is_omap44xx())
                writel_relaxed(startup_pa, sar_base +
index b4de3da..003353b 100644 (file)
@@ -316,9 +316,9 @@ static void __init omap4_smp_prepare_cpus(unsigned int max_cpus)
         * A barrier is added to ensure that write buffer is drained
         */
        if (omap_secure_apis_support())
-               omap_auxcoreboot_addr(virt_to_phys(cfg.startup_addr));
+               omap_auxcoreboot_addr(__pa_symbol(cfg.startup_addr));
        else
-               writel_relaxed(virt_to_phys(cfg.startup_addr),
+               writel_relaxed(__pa_symbol(cfg.startup_addr),
                               base + OMAP_AUX_CORE_BOOT_1);
 }
 
index 6bf6267..1346b3a 100644 (file)
@@ -1,5 +1,5 @@
 /**
- * OMAP and TWL PMIC specific intializations.
+ * OMAP and TWL PMIC specific initializations.
  *
  * Copyright (C) 2010 Texas Instruments Incorporated.
  * Thara Gopinath
index 8d59726..7ef80a8 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/platform_device.h>
 #include <linux/mv643xx_eth.h>
 #include <linux/ata_platform.h>
-#include <linux/platform_data/rtc-m48t86.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
 #include <linux/timeriomem-rng.h>
@@ -80,79 +79,38 @@ static struct mv_sata_platform_data ts78xx_sata_data = {
 /*****************************************************************************
  * RTC M48T86 - nicked^Wborrowed from arch/arm/mach-ep93xx/ts72xx.c
  ****************************************************************************/
-#define TS_RTC_CTRL    (TS78XX_FPGA_REGS_VIRT_BASE + 0x808)
-#define TS_RTC_DATA    (TS78XX_FPGA_REGS_VIRT_BASE + 0x80c)
+#define TS_RTC_CTRL    (TS78XX_FPGA_REGS_PHYS_BASE + 0x808)
+#define TS_RTC_DATA    (TS78XX_FPGA_REGS_PHYS_BASE + 0x80c)
 
-static unsigned char ts78xx_ts_rtc_readbyte(unsigned long addr)
-{
-       writeb(addr, TS_RTC_CTRL);
-       return readb(TS_RTC_DATA);
-}
-
-static void ts78xx_ts_rtc_writebyte(unsigned char value, unsigned long addr)
-{
-       writeb(addr, TS_RTC_CTRL);
-       writeb(value, TS_RTC_DATA);
-}
-
-static struct m48t86_ops ts78xx_ts_rtc_ops = {
-       .readbyte       = ts78xx_ts_rtc_readbyte,
-       .writebyte      = ts78xx_ts_rtc_writebyte,
+static struct resource ts78xx_ts_rtc_resources[] = {
+       DEFINE_RES_MEM(TS_RTC_CTRL, 0x01),
+       DEFINE_RES_MEM(TS_RTC_DATA, 0x01),
 };
 
 static struct platform_device ts78xx_ts_rtc_device = {
        .name           = "rtc-m48t86",
        .id             = -1,
-       .dev            = {
-               .platform_data  = &ts78xx_ts_rtc_ops,
-       },
-       .num_resources  = 0,
+       .resource       = ts78xx_ts_rtc_resources,
+       .num_resources  = ARRAY_SIZE(ts78xx_ts_rtc_resources),
 };
 
-/*
- * TS uses some of the user storage space on the RTC chip so see if it is
- * present; as it's an optional feature at purchase time and not all boards
- * will have it present
- *
- * I've used the method TS use in their rtc7800.c example for the detection
- *
- * TODO: track down a guinea pig without an RTC to see if we can work out a
- *             better RTC detection routine
- */
 static int ts78xx_ts_rtc_load(void)
 {
        int rc;
-       unsigned char tmp_rtc0, tmp_rtc1;
-
-       tmp_rtc0 = ts78xx_ts_rtc_readbyte(126);
-       tmp_rtc1 = ts78xx_ts_rtc_readbyte(127);
-
-       ts78xx_ts_rtc_writebyte(0x00, 126);
-       ts78xx_ts_rtc_writebyte(0x55, 127);
-       if (ts78xx_ts_rtc_readbyte(127) == 0x55) {
-               ts78xx_ts_rtc_writebyte(0xaa, 127);
-               if (ts78xx_ts_rtc_readbyte(127) == 0xaa
-                               && ts78xx_ts_rtc_readbyte(126) == 0x00) {
-                       ts78xx_ts_rtc_writebyte(tmp_rtc0, 126);
-                       ts78xx_ts_rtc_writebyte(tmp_rtc1, 127);
-
-                       if (ts78xx_fpga.supports.ts_rtc.init == 0) {
-                               rc = platform_device_register(&ts78xx_ts_rtc_device);
-                               if (!rc)
-                                       ts78xx_fpga.supports.ts_rtc.init = 1;
-                       } else
-                               rc = platform_device_add(&ts78xx_ts_rtc_device);
-
-                       if (rc)
-                               pr_info("RTC could not be registered: %d\n",
-                                       rc);
-                       return rc;
-               }
+
+       if (ts78xx_fpga.supports.ts_rtc.init == 0) {
+               rc = platform_device_register(&ts78xx_ts_rtc_device);
+               if (!rc)
+                       ts78xx_fpga.supports.ts_rtc.init = 1;
+       } else {
+               rc = platform_device_add(&ts78xx_ts_rtc_device);
        }
 
-       pr_info("RTC not found\n");
-       return -ENODEV;
-};
+       if (rc)
+               pr_info("RTC could not be registered: %d\n", rc);
+
+       return rc;
+}
 
 static void ts78xx_ts_rtc_unload(void)
 {
index 0875b99..75ef5d4 100644 (file)
@@ -65,7 +65,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
         * waiting for. This would wake up the secondary core from WFE
         */
 #define SIRFSOC_CPU1_JUMPADDR_OFFSET 0x2bc
-       __raw_writel(virt_to_phys(sirfsoc_secondary_startup),
+       __raw_writel(__pa_symbol(sirfsoc_secondary_startup),
                clk_base + SIRFSOC_CPU1_JUMPADDR_OFFSET);
 
 #define SIRFSOC_CPU1_WAKEMAGIC_OFFSET 0x2b8
index 83e94c9..b0bcf1f 100644 (file)
@@ -54,7 +54,7 @@ static void sirfsoc_set_sleep_mode(u32 mode)
 
 static int sirfsoc_pre_suspend_power_off(void)
 {
-       u32 wakeup_entry = virt_to_phys(cpu_resume);
+       u32 wakeup_entry = __pa_symbol(cpu_resume);
 
        sirfsoc_rtc_iobrg_writel(wakeup_entry, sirfsoc_pwrc_base +
                SIRFSOC_PWRC_SCRATCH_PAD1);
index 9c308de..2963006 100644 (file)
@@ -249,7 +249,7 @@ static int palmz72_pm_suspend(void)
        store_ptr = *PALMZ72_SAVE_DWORD;
 
        /* Setting PSPR to a proper value */
-       PSPR = virt_to_phys(&palmz72_resume_info);
+       PSPR = __pa_symbol(&palmz72_resume_info);
 
        return 0;
 }
index c725baf..ba431fa 100644 (file)
@@ -85,7 +85,7 @@ static void pxa25x_cpu_pm_enter(suspend_state_t state)
 static int pxa25x_cpu_pm_prepare(void)
 {
        /* set resume return address */
-       PSPR = virt_to_phys(cpu_resume);
+       PSPR = __pa_symbol(cpu_resume);
        return 0;
 }
 
index c0185c5..9b69be4 100644 (file)
@@ -168,7 +168,7 @@ static int pxa27x_cpu_pm_valid(suspend_state_t state)
 static int pxa27x_cpu_pm_prepare(void)
 {
        /* set resume return address */
-       PSPR = virt_to_phys(cpu_resume);
+       PSPR = __pa_symbol(cpu_resume);
        return 0;
 }
 
index 87acc96..0cc9f12 100644 (file)
@@ -123,7 +123,7 @@ static void pxa3xx_cpu_pm_suspend(void)
        PSPR = 0x5c014000;
 
        /* overwrite with the resume address */
-       *p = virt_to_phys(cpu_resume);
+       *p = __pa_symbol(cpu_resume);
 
        cpu_suspend(0, pxa3xx_finish_suspend);
 
index 70ca99e..c242423 100644 (file)
@@ -76,7 +76,7 @@ static void __init realview_smp_prepare_cpus(unsigned int max_cpus)
        }
        /* Put the boot address in this magic register */
        regmap_write(map, REALVIEW_SYS_FLAGSSET_OFFSET,
-                    virt_to_phys(versatile_secondary_startup));
+                    __pa_symbol(versatile_secondary_startup));
 }
 
 static const struct smp_operations realview_dt_smp_ops __initconst = {
index 4d827a0..3abafdb 100644 (file)
@@ -156,7 +156,7 @@ static int rockchip_boot_secondary(unsigned int cpu, struct task_struct *idle)
                 */
                mdelay(1); /* ensure the cpus other than cpu0 to startup */
 
-               writel(virt_to_phys(secondary_startup), sram_base_addr + 8);
+               writel(__pa_symbol(secondary_startup), sram_base_addr + 8);
                writel(0xDEADBEAF, sram_base_addr + 4);
                dsb_sev();
        }
@@ -195,7 +195,7 @@ static int __init rockchip_smp_prepare_sram(struct device_node *node)
        }
 
        /* set the boot function for the sram code */
-       rockchip_boot_fn = virt_to_phys(secondary_startup);
+       rockchip_boot_fn = __pa_symbol(secondary_startup);
 
        /* copy the trampoline to sram, that runs during startup of the core */
        memcpy(sram_base_addr, &rockchip_secondary_trampoline, trampoline_sz);
index bee8c80..0592534 100644 (file)
@@ -62,7 +62,7 @@ static inline u32 rk3288_l2_config(void)
 static void rk3288_config_bootdata(void)
 {
        rkpm_bootdata_cpusp = rk3288_bootram_phy + (SZ_4K - 8);
-       rkpm_bootdata_cpu_code = virt_to_phys(cpu_resume);
+       rkpm_bootdata_cpu_code = __pa_symbol(cpu_resume);
 
        rkpm_bootdata_l2ctlr_f  = 1;
        rkpm_bootdata_l2ctlr = rk3288_l2_config();
index 895aca2..f5b5c49 100644 (file)
@@ -484,7 +484,7 @@ static int jive_pm_suspend(void)
         * correct address to resume from. */
 
        __raw_writel(0x2BED, S3C2412_INFORM0);
-       __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2412_INFORM1);
+       __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2412_INFORM1);
 
        return 0;
 }
index 20e481d..a4588da 100644 (file)
@@ -45,7 +45,7 @@ static void s3c2410_pm_prepare(void)
 {
        /* ensure at least GSTATUS3 has the resume address */
 
-       __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2410_GSTATUS3);
+       __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2410_GSTATUS3);
 
        S3C_PMDBG("GSTATUS3 0x%08x\n", __raw_readl(S3C2410_GSTATUS3));
        S3C_PMDBG("GSTATUS4 0x%08x\n", __raw_readl(S3C2410_GSTATUS4));
index c0e328e..b5bbf0d 100644 (file)
@@ -48,7 +48,7 @@ static void s3c2416_pm_prepare(void)
         * correct address to resume from.
         */
        __raw_writel(0x2BED, S3C2412_INFORM0);
-       __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2412_INFORM1);
+       __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2412_INFORM1);
 }
 
 static int s3c2416_pm_add(struct device *dev, struct subsys_interface *sif)
index b0be382..2f579be 100644 (file)
@@ -304,7 +304,7 @@ static void s3c64xx_pm_prepare(void)
                              wake_irqs, ARRAY_SIZE(wake_irqs));
 
        /* store address of resume. */
-       __raw_writel(virt_to_phys(s3c_cpu_resume), S3C64XX_INFORM0);
+       __raw_writel(__pa_symbol(s3c_cpu_resume), S3C64XX_INFORM0);
 
        /* ensure previous wakeup state is cleared before sleeping */
        __raw_writel(__raw_readl(S3C64XX_WAKEUP_STAT), S3C64XX_WAKEUP_STAT);
index 7d69666..07cee14 100644 (file)
@@ -69,7 +69,7 @@ static void s5pv210_pm_prepare(void)
        __raw_writel(s5pv210_irqwake_intmask, S5P_WAKEUP_MASK);
 
        /* ensure at least INFORM0 has the resume address */
-       __raw_writel(virt_to_phys(s5pv210_cpu_resume), S5P_INFORM0);
+       __raw_writel(__pa_symbol(s5pv210_cpu_resume), S5P_INFORM0);
 
        tmp = __raw_readl(S5P_SLEEP_CFG);
        tmp &= ~(S5P_SLEEP_CFG_OSC_EN | S5P_SLEEP_CFG_USBOSC_EN);
index 34853d5..9a7079f 100644 (file)
@@ -73,7 +73,7 @@ static int sa11x0_pm_enter(suspend_state_t state)
        RCSR = RCSR_HWR | RCSR_SWR | RCSR_WDR | RCSR_SMR;
 
        /* set resume return address */
-       PSPR = virt_to_phys(cpu_resume);
+       PSPR = __pa_symbol(cpu_resume);
 
        /* go zzz */
        cpu_suspend(0, sa1100_finish_suspend);
index e192668..3ca2c13 100644 (file)
@@ -190,7 +190,7 @@ static void apmu_parse_dt(void (*fn)(struct resource *res, int cpu, int bit))
 static void __init shmobile_smp_apmu_setup_boot(void)
 {
        /* install boot code shared by all CPUs */
-       shmobile_boot_fn = virt_to_phys(shmobile_smp_boot);
+       shmobile_boot_fn = __pa_symbol(shmobile_smp_boot);
 }
 
 void __init shmobile_smp_apmu_prepare_cpus(unsigned int max_cpus,
@@ -204,7 +204,7 @@ void __init shmobile_smp_apmu_prepare_cpus(unsigned int max_cpus,
 int shmobile_smp_apmu_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
        /* For this particular CPU register boot vector */
-       shmobile_smp_hook(cpu, virt_to_phys(secondary_startup), 0);
+       shmobile_smp_hook(cpu, __pa_symbol(secondary_startup), 0);
 
        return apmu_wrap(cpu, apmu_power_on);
 }
@@ -308,7 +308,7 @@ int shmobile_smp_apmu_cpu_kill(unsigned int cpu)
 #if defined(CONFIG_SUSPEND)
 static int shmobile_smp_apmu_do_suspend(unsigned long cpu)
 {
-       shmobile_smp_hook(cpu, virt_to_phys(cpu_resume), 0);
+       shmobile_smp_hook(cpu, __pa_symbol(cpu_resume), 0);
        shmobile_smp_apmu_cpu_shutdown(cpu);
        cpu_do_idle(); /* WFI selects Core Standby */
        return 1;
index d1ecaf3..f1a1efd 100644 (file)
@@ -24,7 +24,7 @@ static void __iomem *shmobile_scu_base;
 static int shmobile_scu_cpu_prepare(unsigned int cpu)
 {
        /* For this particular CPU register SCU SMP boot vector */
-       shmobile_smp_hook(cpu, virt_to_phys(shmobile_boot_scu),
+       shmobile_smp_hook(cpu, __pa_symbol(shmobile_boot_scu),
                          shmobile_scu_base_phys);
        return 0;
 }
@@ -33,7 +33,7 @@ void __init shmobile_smp_scu_prepare_cpus(phys_addr_t scu_base_phys,
                                          unsigned int max_cpus)
 {
        /* install boot code shared by all CPUs */
-       shmobile_boot_fn = virt_to_phys(shmobile_smp_boot);
+       shmobile_boot_fn = __pa_symbol(shmobile_smp_boot);
 
        /* enable SCU and cache coherency on booting CPU */
        shmobile_scu_base_phys = scu_base_phys;
index 0794574..0ee7677 100644 (file)
@@ -40,7 +40,7 @@ static int socfpga_boot_secondary(unsigned int cpu, struct task_struct *idle)
 
                memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size);
 
-               writel(virt_to_phys(secondary_startup),
+               writel(__pa_symbol(secondary_startup),
                       sys_manager_base_addr + (socfpga_cpu1start_addr & 0x000000ff));
 
                flush_cache_all();
@@ -63,7 +63,7 @@ static int socfpga_a10_boot_secondary(unsigned int cpu, struct task_struct *idle
                       SOCFPGA_A10_RSTMGR_MODMPURST);
                memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size);
 
-               writel(virt_to_phys(secondary_startup),
+               writel(__pa_symbol(secondary_startup),
                       sys_manager_base_addr + (socfpga_cpu1start_addr & 0x00000fff));
 
                flush_cache_all();
index 8d1e2d5..39038a0 100644 (file)
@@ -117,7 +117,7 @@ static void __init spear13xx_smp_prepare_cpus(unsigned int max_cpus)
         * (presently it is in SRAM). The BootMonitor waits until it receives a
         * soft interrupt, and then the secondary CPU branches to this address.
         */
-       __raw_writel(virt_to_phys(spear13xx_secondary_startup), SYS_LOCATION);
+       __raw_writel(__pa_symbol(spear13xx_secondary_startup), SYS_LOCATION);
 }
 
 const struct smp_operations spear13xx_smp_ops __initconst = {
index ea5a227..231f19e 100644 (file)
@@ -103,7 +103,7 @@ static void __init sti_smp_prepare_cpus(unsigned int max_cpus)
        u32 __iomem *cpu_strt_ptr;
        u32 release_phys;
        int cpu;
-       unsigned long entry_pa = virt_to_phys(sti_secondary_startup);
+       unsigned long entry_pa = __pa_symbol(sti_secondary_startup);
 
        np = of_find_compatible_node(NULL, NULL, "arm,cortex-a9-scu");
 
index 6642267..8fb5088 100644 (file)
@@ -80,7 +80,7 @@ static int sun6i_smp_boot_secondary(unsigned int cpu,
        spin_lock(&cpu_lock);
 
        /* Set CPU boot address */
-       writel(virt_to_phys(secondary_startup),
+       writel(__pa_symbol(secondary_startup),
               cpucfg_membase + CPUCFG_PRIVATE0_REG);
 
        /* Assert the CPU core in reset */
@@ -162,7 +162,7 @@ static int sun8i_smp_boot_secondary(unsigned int cpu,
        spin_lock(&cpu_lock);
 
        /* Set CPU boot address */
-       writel(virt_to_phys(secondary_startup),
+       writel(__pa_symbol(secondary_startup),
               cpucfg_membase + CPUCFG_PRIVATE0_REG);
 
        /* Assert the CPU core in reset */
index 98c62a4..2f0c6c0 100644 (file)
@@ -5,7 +5,7 @@
 
 static int tango_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
-       tango_set_aux_boot_addr(virt_to_phys(secondary_startup));
+       tango_set_aux_boot_addr(__pa_symbol(secondary_startup));
        tango_start_aux_core(cpu);
        return 0;
 }
index b05c6d6..406c081 100644 (file)
@@ -5,7 +5,7 @@
 
 static int tango_pm_powerdown(unsigned long arg)
 {
-       tango_suspend(virt_to_phys(cpu_resume));
+       tango_suspend(__pa_symbol(cpu_resume));
 
        return -EIO; /* tango_suspend has failed */
 }
index 6fd9db5..dc55889 100644 (file)
@@ -94,14 +94,14 @@ void __init tegra_cpu_reset_handler_init(void)
        __tegra_cpu_reset_handler_data[TEGRA_RESET_MASK_PRESENT] =
                *((u32 *)cpu_possible_mask);
        __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_SECONDARY] =
-               virt_to_phys((void *)secondary_startup);
+               __pa_symbol((void *)secondary_startup);
 #endif
 
 #ifdef CONFIG_PM_SLEEP
        __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_LP1] =
                TEGRA_IRAM_LPx_RESUME_AREA;
        __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_LP2] =
-               virt_to_phys((void *)tegra_resume);
+               __pa_symbol((void *)tegra_resume);
 #endif
 
        tegra_cpu_reset_handler_enable();
index e0ee139..9b124c2 100644 (file)
@@ -79,7 +79,7 @@ static int ux500_boot_secondary(unsigned int cpu, struct task_struct *idle)
         * backup ram register at offset 0x1FF0, which is what boot rom code
         * is waiting for. This will wake up the secondary core from WFE.
         */
-       writel(virt_to_phys(secondary_startup),
+       writel(__pa_symbol(secondary_startup),
               backupram + UX500_CPU1_JUMPADDR_OFFSET);
        writel(0xA1FEED01,
               backupram + UX500_CPU1_WAKEMAGIC_OFFSET);
index 5cedcf5..ee2a0fa 100644 (file)
@@ -166,7 +166,7 @@ static int __init dcscb_init(void)
         * Future entries into the kernel can now go
         * through the cluster entry vectors.
         */
-       vexpress_flags_set(virt_to_phys(mcpm_entry_point));
+       vexpress_flags_set(__pa_symbol(mcpm_entry_point));
 
        return 0;
 }
index 98e29de..742499b 100644 (file)
@@ -79,7 +79,7 @@ static void __init vexpress_smp_dt_prepare_cpus(unsigned int max_cpus)
         * until it receives a soft interrupt, and then the
         * secondary CPU branches to this address.
         */
-       vexpress_flags_set(virt_to_phys(versatile_secondary_startup));
+       vexpress_flags_set(__pa_symbol(versatile_secondary_startup));
 }
 
 const struct smp_operations vexpress_smp_dt_ops __initconst = {
index 1aa4cce..9b5f3c4 100644 (file)
@@ -54,7 +54,7 @@ static int tc2_pm_cpu_powerup(unsigned int cpu, unsigned int cluster)
        if (cluster >= TC2_CLUSTERS || cpu >= tc2_nr_cpus[cluster])
                return -EINVAL;
        ve_spc_set_resume_addr(cluster, cpu,
-                              virt_to_phys(mcpm_entry_point));
+                              __pa_symbol(mcpm_entry_point));
        ve_spc_cpu_wakeup_irq(cluster, cpu, true);
        return 0;
 }
@@ -159,7 +159,7 @@ static int tc2_pm_wait_for_powerdown(unsigned int cpu, unsigned int cluster)
 
 static void tc2_pm_cpu_suspend_prepare(unsigned int cpu, unsigned int cluster)
 {
-       ve_spc_set_resume_addr(cluster, cpu, virt_to_phys(mcpm_entry_point));
+       ve_spc_set_resume_addr(cluster, cpu, __pa_symbol(mcpm_entry_point));
 }
 
 static void tc2_pm_cpu_is_up(unsigned int cpu, unsigned int cluster)
index 0297f92..afb9a82 100644 (file)
@@ -76,7 +76,7 @@ void __init zx_smp_prepare_cpus(unsigned int max_cpus)
         * until it receives a soft interrupt, and then the
         * secondary CPU branches to this address.
         */
-       __raw_writel(virt_to_phys(zx_secondary_startup),
+       __raw_writel(__pa_symbol(zx_secondary_startup),
                     aonsysctrl_base + AON_SYS_CTRL_RESERVED1);
 
        iounmap(aonsysctrl_base);
@@ -94,7 +94,7 @@ void __init zx_smp_prepare_cpus(unsigned int max_cpus)
 
        /* Map the first 4 KB IRAM for suspend usage */
        sys_iram = __arm_ioremap_exec(ZX_IRAM_BASE, PAGE_SIZE, false);
-       zx_secondary_startup_pa = virt_to_phys(zx_secondary_startup);
+       zx_secondary_startup_pa = __pa_symbol(zx_secondary_startup);
        fncpy(sys_iram, &zx_resume_jump, zx_suspend_iram_sz);
 }
 
index 7cd9865..caa6d5f 100644 (file)
@@ -89,7 +89,7 @@ EXPORT_SYMBOL(zynq_cpun_start);
 
 static int zynq_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
-       return zynq_cpun_start(virt_to_phys(secondary_startup), cpu);
+       return zynq_cpun_start(__pa_symbol(secondary_startup), cpu);
 }
 
 /*
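
The hunks above repeat a single conversion: physical addresses of kernel-image symbols (secondary_startup, cpu_resume, mcpm_entry_point, ...) are now taken with __pa_symbol() rather than virt_to_phys(). A minimal sketch of the distinction, assuming a hypothetical platform with a secondary-boot register already mapped at boot_reg:

#include <linux/io.h>
#include <asm/memory.h>

extern void secondary_startup(void);	/* lives in the kernel image */

static void __init example_wake_cpu(void __iomem *boot_reg, void *lowmem_buf)
{
	phys_addr_t pa;

	/*
	 * Symbol addresses go through __pa_symbol(); with
	 * CONFIG_DEBUG_VIRTUAL=y the translation is bounds-checked
	 * against the kernel image (see the new physaddr.c later in
	 * this series).
	 */
	writel(__pa_symbol(secondary_startup), boot_reg);

	/* virt_to_phys() remains correct for linear-map (lowmem) data. */
	pa = virt_to_phys(lowmem_buf);
	(void)pa;
}
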
index 35e3a56..c6c4c9c 100644 (file)
@@ -29,6 +29,7 @@ config CPU_ARM720T
        select CPU_COPY_V4WT if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WT if MMU
        help
          A 32-bit RISC processor with 8kByte Cache, Write Buffer and
@@ -46,6 +47,7 @@ config CPU_ARM740T
        select CPU_CACHE_V4
        select CPU_CP15_MPU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        help
          A 32-bit RISC processor with 8KB cache or 4KB variants,
          write buffer and MPU(Protection Unit) built around
@@ -79,6 +81,7 @@ config CPU_ARM920T
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM920T is licensed to be produced by numerous vendors,
@@ -97,6 +100,7 @@ config CPU_ARM922T
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM922T is a version of the ARM920T, but with smaller
@@ -116,6 +120,7 @@ config CPU_ARM925T
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM925T is a mix between the ARM920T and ARM926T, but with
@@ -134,6 +139,7 @@ config CPU_ARM926T
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          This is a variant of the ARM920.  It has slightly different
@@ -170,6 +176,7 @@ config CPU_ARM940T
        select CPU_CACHE_VIVT
        select CPU_CP15_MPU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        help
          ARM940T is a member of the ARM9TDMI family of general-
          purpose microprocessors with MPU and separate 4KB
@@ -188,6 +195,7 @@ config CPU_ARM946E
        select CPU_CACHE_VIVT
        select CPU_CP15_MPU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        help
          ARM946E-S is a member of the ARM9E-S family of high-
          performance, 32-bit system-on-chip processor solutions.
@@ -206,6 +214,7 @@ config CPU_ARM1020
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM1020 is the 32K cached version of the ARM10 processor,
@@ -225,6 +234,7 @@ config CPU_ARM1020E
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
 
 # ARM1022E
@@ -236,6 +246,7 @@ config CPU_ARM1022
        select CPU_COPY_V4WB if MMU # can probably do better
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM1022E is an implementation of the ARMv5TE architecture
@@ -254,6 +265,7 @@ config CPU_ARM1026
        select CPU_COPY_V4WB if MMU # can probably do better
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        help
          The ARM1026EJ-S is an implementation of the ARMv5TEJ architecture
@@ -302,6 +314,7 @@ config CPU_XSCALE
        select CPU_CACHE_VIVT
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
 
 # XScale Core Version 3
@@ -312,6 +325,7 @@ config CPU_XSC3
        select CPU_CACHE_VIVT
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
        select IO_36
 
@@ -324,6 +338,7 @@ config CPU_MOHAWK
        select CPU_COPY_V4WB if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V4WBI if MMU
 
 # Feroceon
@@ -335,6 +350,7 @@ config CPU_FEROCEON
        select CPU_COPY_FEROCEON if MMU
        select CPU_CP15_MMU
        select CPU_PABRT_LEGACY
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_FEROCEON if MMU
 
 config CPU_FEROCEON_OLD_ID
@@ -367,6 +383,7 @@ config CPU_V6
        select CPU_CP15_MMU
        select CPU_HAS_ASID if MMU
        select CPU_PABRT_V6
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V6 if MMU
 
 # ARMv6k
@@ -381,6 +398,7 @@ config CPU_V6K
        select CPU_CP15_MMU
        select CPU_HAS_ASID if MMU
        select CPU_PABRT_V6
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V6 if MMU
 
 # ARMv7
@@ -396,6 +414,7 @@ config CPU_V7
        select CPU_CP15_MPU if !MMU
        select CPU_HAS_ASID if MMU
        select CPU_PABRT_V7
+       select CPU_THUMB_CAPABLE
        select CPU_TLB_V7 if MMU
 
 # ARMv7M
@@ -410,11 +429,17 @@ config CPU_V7M
 
 config CPU_THUMBONLY
        bool
+       select CPU_THUMB_CAPABLE
        # There are no CPUs available with MMU that don't implement an ARM ISA:
        depends on !MMU
        help
          Select this if your CPU doesn't support the 32 bit ARM instructions.
 
+config CPU_THUMB_CAPABLE
+       bool
+       help
+         Select this if your CPU can support Thumb mode.
+
 # Figure out what processor architecture version we should be using.
 # This defines the compiler instruction set which depends on the machine type.
 config CPU_32v3
@@ -655,11 +680,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config ARM_THUMB
        bool "Support Thumb user binaries" if !CPU_THUMBONLY
-       depends on CPU_ARM720T || CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || \
-               CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || \
-               CPU_ARM1020 || CPU_ARM1020E || CPU_ARM1022 || CPU_ARM1026 || \
-               CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_V6 || CPU_V6K || \
-               CPU_V7 || CPU_FEROCEON || CPU_V7M
+       depends on CPU_THUMB_CAPABLE
        default y
        help
          Say Y if you want to include kernel support for running user space
index e869824..b3dea80 100644 (file)
@@ -14,6 +14,7 @@ endif
 
 obj-$(CONFIG_ARM_PTDUMP)       += dump.o
 obj-$(CONFIG_MODULES)          += proc-syms.o
+obj-$(CONFIG_DEBUG_VIRTUAL)    += physaddr.o
 
 obj-$(CONFIG_ALIGNMENT_TRAP)   += alignment.o
 obj-$(CONFIG_HIGHMEM)          += highmem.o
index dfe97b4..f57b080 100644 (file)
@@ -15,6 +15,7 @@
 
 #define pr_fmt(fmt)            "uniphier: " fmt
 
+#include <linux/bitops.h>
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/log2.h>
@@ -71,8 +72,7 @@
  * @ctrl_base: virtual base address of control registers
  * @rev_base: virtual base address of revision registers
  * @op_base: virtual base address of operation registers
- * @way_present_mask: each bit specifies if the way is present
- * @way_locked_mask: each bit specifies if the way is locked
+ * @way_mask: each bit specifies if the way is present
  * @nsets: number of associativity sets
  * @line_size: line size in bytes
  * @range_op_max_size: max size that can be handled by a single range operation
@@ -83,8 +83,7 @@ struct uniphier_cache_data {
        void __iomem *rev_base;
        void __iomem *op_base;
        void __iomem *way_ctrl_base;
-       u32 way_present_mask;
-       u32 way_locked_mask;
+       u32 way_mask;
        u32 nsets;
        u32 line_size;
        u32 range_op_max_size;
@@ -234,17 +233,13 @@ static void __uniphier_cache_enable(struct uniphier_cache_data *data, bool on)
        writel_relaxed(val, data->ctrl_base + UNIPHIER_SSCC);
 }
 
-static void __init __uniphier_cache_set_locked_ways(
-                                       struct uniphier_cache_data *data,
-                                       u32 way_mask)
+static void __init __uniphier_cache_set_active_ways(
+                                       struct uniphier_cache_data *data)
 {
        unsigned int cpu;
 
-       data->way_locked_mask = way_mask & data->way_present_mask;
-
        for_each_possible_cpu(cpu)
-               writel_relaxed(~data->way_locked_mask & data->way_present_mask,
-                              data->way_ctrl_base + 4 * cpu);
+               writel_relaxed(data->way_mask, data->way_ctrl_base + 4 * cpu);
 }
 
 static void uniphier_cache_maint_range(unsigned long start, unsigned long end,
@@ -307,7 +302,7 @@ static void __init uniphier_cache_enable(void)
 
        list_for_each_entry(data, &uniphier_cache_list, list) {
                __uniphier_cache_enable(data, true);
-               __uniphier_cache_set_locked_ways(data, 0);
+               __uniphier_cache_set_active_ways(data);
        }
 }
 
@@ -382,8 +377,8 @@ static int __init __uniphier_cache_init(struct device_node *np,
                goto err;
        }
 
-       data->way_present_mask =
-               ((u32)1 << cache_size / data->nsets / data->line_size) - 1;
+       data->way_mask = GENMASK(cache_size / data->nsets / data->line_size - 1,
+                                0);
 
        data->ctrl_base = of_iomap(np, 0);
        if (!data->ctrl_base) {
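
The way_mask computation above replaces open-coded shift arithmetic with GENMASK(). A short sketch of the equivalence (helper name hypothetical):

#include <linux/bitops.h>
#include <linux/types.h>

/*
 * One bit per cache way: GENMASK(h, l) sets bits l..h inclusive, so for
 * four ways both GENMASK(3, 0) and ((1U << 4) - 1) give 0xf, but the
 * GENMASK form also sidesteps the undefined shift when the way count
 * equals the word width.
 */
static u32 example_way_mask(u32 cache_size, u32 nsets, u32 line_size)
{
	u32 nways = cache_size / nsets / line_size;

	return GENMASK(nways - 1, 0);
}
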
index a134d8a..de78109 100644 (file)
@@ -164,7 +164,7 @@ skip:
        cmp     r3, r10
        bgt     flush_levels
 finished:
-       mov     r10, #0                         @ swith back to cache level 0
+       mov     r10, #0                         @ switch back to cache level 0
        mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
        dsb     st
        isb
index 816a7e4..788486e 100644 (file)
@@ -217,7 +217,7 @@ skip:
        cmp     r3, r10
        bgt     flush_levels
 finished:
-       mov     r10, #0                         @ swith back to cache level 0
+       mov     r10, #0                         @ switch back to cache level 0
        write_csselr r10, r3                    @ select current cache level in cssr
        dsb     st
        isb
index e309a5e..63eabb0 100644 (file)
@@ -870,6 +870,9 @@ static int __arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,
                                      vma->vm_end - vma->vm_start,
                                      vma->vm_page_prot);
        }
+#else
+       ret = vm_iomap_memory(vma, vma->vm_start,
+                             (vma->vm_end - vma->vm_start));
 #endif /* CONFIG_MMU */
 
        return ret;
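
The new !MMU branch funnels the nommu case through vm_iomap_memory(), where kernel and bus addresses coincide and vma->vm_start can double as the physical start. A hedged sketch of the same call from a driver mmap hook (names hypothetical):

#include <linux/fs.h>
#include <linux/mm.h>

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* vm_iomap_memory() still applies the usual range checks even
	 * though the nommu "remap" is the identity. */
	return vm_iomap_memory(vma, vma->vm_start,
			       vma->vm_end - vma->vm_start);
}
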
index 9fe8e24..21192d6 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/seq_file.h>
 
 #include <asm/fixmap.h>
+#include <asm/memory.h>
 #include <asm/pgtable.h>
 
 struct addr_marker {
@@ -31,8 +32,8 @@ static struct addr_marker address_markers[] = {
        { 0,                    "vmalloc() Area" },
        { VMALLOC_END,          "vmalloc() End" },
        { FIXADDR_START,        "Fixmap Area" },
-       { CONFIG_VECTORS_BASE,  "Vectors" },
-       { CONFIG_VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" },
+       { VECTORS_BASE, "Vectors" },
+       { VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" },
        { -1,                   NULL },
 };
 
index 3cced84..f1e6190 100644 (file)
@@ -327,6 +327,12 @@ void flush_dcache_page(struct page *page)
        if (page == ZERO_PAGE(0))
                return;
 
+       if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) {
+               if (test_bit(PG_dcache_clean, &page->flags))
+                       clear_bit(PG_dcache_clean, &page->flags);
+               return;
+       }
+
        mapping = page_mapping(page);
 
        if (!cache_ops_need_broadcast() &&
index 4be0bee..bf4d3bc 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/cp15.h>
 #include <asm/mach-types.h>
 #include <asm/memblock.h>
+#include <asm/memory.h>
 #include <asm/prom.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
@@ -227,41 +228,59 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align)
        return phys;
 }
 
-void __init arm_memblock_init(const struct machine_desc *mdesc)
+static void __init arm_initrd_init(void)
 {
-       /* Register the kernel text, kernel data and initrd with memblock. */
-#ifdef CONFIG_XIP_KERNEL
-       memblock_reserve(__pa(_sdata), _end - _sdata);
-#else
-       memblock_reserve(__pa(_stext), _end - _stext);
-#endif
 #ifdef CONFIG_BLK_DEV_INITRD
+       phys_addr_t start;
+       unsigned long size;
+
        /* FDT scan will populate initrd_start */
        if (initrd_start && !phys_initrd_size) {
                phys_initrd_start = __virt_to_phys(initrd_start);
                phys_initrd_size = initrd_end - initrd_start;
        }
+
        initrd_start = initrd_end = 0;
-       if (phys_initrd_size &&
-           !memblock_is_region_memory(phys_initrd_start, phys_initrd_size)) {
+
+       if (!phys_initrd_size)
+               return;
+
+       /*
+        * Round the memory region to page boundaries as per free_initrd_mem()
+        * This allows us to detect whether the pages overlapping the initrd
+        * are in use, but more importantly, reserves the entire set of pages
+        * as we don't want these pages allocated for other purposes.
+        */
+       start = round_down(phys_initrd_start, PAGE_SIZE);
+       size = phys_initrd_size + (phys_initrd_start - start);
+       size = round_up(size, PAGE_SIZE);
+
+       if (!memblock_is_region_memory(start, size)) {
                pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region - disabling initrd\n",
-                      (u64)phys_initrd_start, phys_initrd_size);
-               phys_initrd_start = phys_initrd_size = 0;
+                      (u64)start, size);
+               return;
        }
-       if (phys_initrd_size &&
-           memblock_is_region_reserved(phys_initrd_start, phys_initrd_size)) {
+
+       if (memblock_is_region_reserved(start, size)) {
                pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region - disabling initrd\n",
-                      (u64)phys_initrd_start, phys_initrd_size);
-               phys_initrd_start = phys_initrd_size = 0;
+                      (u64)start, size);
+               return;
        }
-       if (phys_initrd_size) {
-               memblock_reserve(phys_initrd_start, phys_initrd_size);
 
-               /* Now convert initrd to virtual addresses */
-               initrd_start = __phys_to_virt(phys_initrd_start);
-               initrd_end = initrd_start + phys_initrd_size;
-       }
+       memblock_reserve(start, size);
+
+       /* Now convert initrd to virtual addresses */
+       initrd_start = __phys_to_virt(phys_initrd_start);
+       initrd_end = initrd_start + phys_initrd_size;
 #endif
+}
+
+void __init arm_memblock_init(const struct machine_desc *mdesc)
+{
+       /* Register the kernel text, kernel data and initrd with memblock. */
+       memblock_reserve(__pa(KERNEL_START), KERNEL_END - KERNEL_START);
+
+       arm_initrd_init();
 
        arm_mm_memblock_reserve();
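
The page rounding in arm_initrd_init() deserves a worked example; a sketch with concrete numbers (function name hypothetical, 4 KiB pages assumed):

#include <linux/kernel.h>
#include <linux/memblock.h>
#include <asm/page.h>

static void __init example_reserve_initrd(phys_addr_t pstart,
					  unsigned long psize)
{
	/*
	 * e.g. pstart = 0x80001200, psize = 0x2d00: start = 0x80001000
	 * and size = round_up(0x2f00) = 0x3000, so the partially covered
	 * pages at both ends stay reserved and cannot be handed out to
	 * other allocations while the initrd is live.
	 */
	phys_addr_t start = round_down(pstart, PAGE_SIZE);
	unsigned long size = round_up(psize + (pstart - start), PAGE_SIZE);

	memblock_reserve(start, size);
}
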
 
@@ -521,8 +540,7 @@ void __init mem_init(void)
                        "      .data : 0x%p" " - 0x%p" "   (%4td kB)\n"
                        "       .bss : 0x%p" " - 0x%p" "   (%4td kB)\n",
 
-                       MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
-                               (PAGE_SIZE)),
+                       MLK(VECTORS_BASE, VECTORS_BASE + PAGE_SIZE),
 #ifdef CONFIG_HAVE_TCM
                        MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
                        MLK(ITCM_OFFSET, (unsigned long) itcm_end),
index 4001dd1..4e016d7 100644 (file)
@@ -1152,13 +1152,12 @@ early_param("vmalloc", early_vmalloc);
 
 phys_addr_t arm_lowmem_limit __initdata = 0;
 
-void __init sanity_check_meminfo(void)
+void __init adjust_lowmem_bounds(void)
 {
        phys_addr_t memblock_limit = 0;
-       int highmem = 0;
        u64 vmalloc_limit;
        struct memblock_region *reg;
-       bool should_use_highmem = false;
+       phys_addr_t lowmem_limit = 0;
 
        /*
         * Let's use our own (unoptimized) equivalent of __pa() that is
@@ -1172,43 +1171,18 @@ void __init sanity_check_meminfo(void)
        for_each_memblock(memory, reg) {
                phys_addr_t block_start = reg->base;
                phys_addr_t block_end = reg->base + reg->size;
-               phys_addr_t size_limit = reg->size;
 
-               if (reg->base >= vmalloc_limit)
-                       highmem = 1;
-               else
-                       size_limit = vmalloc_limit - reg->base;
-
-
-               if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
-
-                       if (highmem) {
-                               pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n",
-                                         &block_start, &block_end);
-                               memblock_remove(reg->base, reg->size);
-                               should_use_highmem = true;
-                               continue;
-                       }
-
-                       if (reg->size > size_limit) {
-                               phys_addr_t overlap_size = reg->size - size_limit;
-
-                               pr_notice("Truncating RAM at %pa-%pa",
-                                         &block_start, &block_end);
-                               block_end = vmalloc_limit;
-                               pr_cont(" to -%pa", &block_end);
-                               memblock_remove(vmalloc_limit, overlap_size);
-                               should_use_highmem = true;
-                       }
-               }
-
-               if (!highmem) {
-                       if (block_end > arm_lowmem_limit) {
-                               if (reg->size > size_limit)
-                                       arm_lowmem_limit = vmalloc_limit;
-                               else
-                                       arm_lowmem_limit = block_end;
-                       }
+               if (reg->base < vmalloc_limit) {
+                       if (block_end > lowmem_limit)
+                               /*
+                                * Compare as u64 to ensure vmalloc_limit does
+                                * not get truncated. block_end should always
+                                * fit in phys_addr_t so there should be no
+                                * issue with assignment.
+                                */
+                               lowmem_limit = min_t(u64,
+                                                        vmalloc_limit,
+                                                        block_end);
 
                        /*
                         * Find the first non-pmd-aligned page, and point
@@ -1227,14 +1201,13 @@ void __init sanity_check_meminfo(void)
                                if (!IS_ALIGNED(block_start, PMD_SIZE))
                                        memblock_limit = block_start;
                                else if (!IS_ALIGNED(block_end, PMD_SIZE))
-                                       memblock_limit = arm_lowmem_limit;
+                                       memblock_limit = lowmem_limit;
                        }
 
                }
        }
 
-       if (should_use_highmem)
-               pr_notice("Consider using a HIGHMEM enabled kernel.\n");
+       arm_lowmem_limit = lowmem_limit;
 
        high_memory = __va(arm_lowmem_limit - 1) + 1;
 
@@ -1248,6 +1221,18 @@ void __init sanity_check_meminfo(void)
        if (!memblock_limit)
                memblock_limit = arm_lowmem_limit;
 
+       if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
+               if (memblock_end_of_DRAM() > arm_lowmem_limit) {
+                       phys_addr_t end = memblock_end_of_DRAM();
+
+                       pr_notice("Ignoring RAM at %pa-%pa\n",
+                                 &memblock_limit, &end);
+                       pr_notice("Consider using a HIGHMEM enabled kernel.\n");
+
+                       memblock_remove(memblock_limit, end - memblock_limit);
+               }
+       }
+
        memblock_set_current_limit(memblock_limit);
 }
 
@@ -1437,11 +1422,7 @@ static void __init kmap_init(void)
 static void __init map_lowmem(void)
 {
        struct memblock_region *reg;
-#ifdef CONFIG_XIP_KERNEL
-       phys_addr_t kernel_x_start = round_down(__pa(_sdata), SECTION_SIZE);
-#else
-       phys_addr_t kernel_x_start = round_down(__pa(_stext), SECTION_SIZE);
-#endif
+       phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE);
        phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
 
        /* Map all the lowmem memory banks. */
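
The rewritten loop in adjust_lowmem_bounds() collapses the old highmem special cases into one clamp, and the min_t(u64, ...) spelling is deliberate: vmalloc_limit is a u64 while phys_addr_t may be 32 bits without LPAE, so comparing in the narrower type could truncate. A reduced sketch of the clamp (helper name hypothetical):

#include <linux/kernel.h>
#include <linux/types.h>

/* Raise the lowmem limit toward block_end without crossing the vmalloc
 * limit, comparing in u64 to avoid truncation (sketch of the hunk above). */
static phys_addr_t example_clamp_lowmem(phys_addr_t block_end,
					u64 vmalloc_limit,
					phys_addr_t lowmem_limit)
{
	if (block_end > lowmem_limit)
		lowmem_limit = min_t(u64, vmalloc_limit, block_end);

	return lowmem_limit;
}
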
index 2740967..3b5c7aa 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
 #include <asm/sections.h>
 #include <asm/page.h>
 #include <asm/setup.h>
@@ -22,6 +23,8 @@
 
 #include "mm.h"
 
+unsigned long vectors_base;
+
 #ifdef CONFIG_ARM_MPU
 struct mpu_rgn_info mpu_rgn_info;
 
@@ -85,7 +88,7 @@ static unsigned long irbar_read(void)
 }
 
 /* MPU initialisation functions */
-void __init sanity_check_meminfo_mpu(void)
+void __init adjust_lowmem_bounds_mpu(void)
 {
        phys_addr_t phys_offset = PHYS_OFFSET;
        phys_addr_t aligned_region_size, specified_mem_size, rounded_mem_size;
@@ -274,19 +277,64 @@ void __init mpu_setup(void)
        }
 }
 #else
-static void sanity_check_meminfo_mpu(void) {}
+static void adjust_lowmem_bounds_mpu(void) {}
 static void __init mpu_setup(void) {}
 #endif /* CONFIG_ARM_MPU */
 
+#ifdef CONFIG_CPU_CP15
+#ifdef CONFIG_CPU_HIGH_VECTOR
+static unsigned long __init setup_vectors_base(void)
+{
+       unsigned long reg = get_cr();
+
+       set_cr(reg | CR_V);
+       return 0xffff0000;
+}
+#else /* CONFIG_CPU_HIGH_VECTOR */
+/* Write exception base address to VBAR */
+static inline void set_vbar(unsigned long val)
+{
+       asm("mcr p15, 0, %0, c12, c0, 0" : : "r" (val) : "cc");
+}
+
+/*
+ * Security extensions, bits[7:4], permitted values,
+ * 0b0000 - not implemented, 0b0001/0b0010 - implemented
+ */
+static inline bool security_extensions_enabled(void)
+{
+       return !!cpuid_feature_extract(CPUID_EXT_PFR1, 4);
+}
+
+static unsigned long __init setup_vectors_base(void)
+{
+       unsigned long base = 0, reg = get_cr();
+
+       set_cr(reg & ~CR_V);
+       if (security_extensions_enabled()) {
+               if (IS_ENABLED(CONFIG_REMAP_VECTORS_TO_RAM))
+                       base = CONFIG_DRAM_BASE;
+               set_vbar(base);
+       } else if (IS_ENABLED(CONFIG_REMAP_VECTORS_TO_RAM)) {
+               if (CONFIG_DRAM_BASE != 0)
+                       pr_err("Security extensions not enabled, vectors cannot be remapped to RAM, vectors base will be 0x00000000\n");
+       }
+
+       return base;
+}
+#endif /* CONFIG_CPU_HIGH_VECTOR */
+#endif /* CONFIG_CPU_CP15 */
+
 void __init arm_mm_memblock_reserve(void)
 {
 #ifndef CONFIG_CPU_V7M
+       vectors_base = IS_ENABLED(CONFIG_CPU_CP15) ? setup_vectors_base() : 0;
        /*
         * Register the exception vector page.
         * some architectures which the DRAM is the exception vector to trap,
         * alloc_page breaks with error, although it is not NULL, but "0."
         */
-       memblock_reserve(CONFIG_VECTORS_BASE, 2 * PAGE_SIZE);
+       memblock_reserve(vectors_base, 2 * PAGE_SIZE);
 #else /* ifndef CONFIG_CPU_V7M */
        /*
         * There is no dedicated vector page on V7-M. So nothing needs to be
@@ -295,10 +343,10 @@ void __init arm_mm_memblock_reserve(void)
 #endif
 }
 
-void __init sanity_check_meminfo(void)
+void __init adjust_lowmem_bounds(void)
 {
        phys_addr_t end;
-       sanity_check_meminfo_mpu();
+       adjust_lowmem_bounds_mpu();
        end = memblock_end_of_DRAM();
        high_memory = __va(end - 1) + 1;
        memblock_set_current_limit(end);
@@ -310,7 +358,7 @@ void __init sanity_check_meminfo(void)
  */
 void __init paging_init(const struct machine_desc *mdesc)
 {
-       early_trap_init((void *)CONFIG_VECTORS_BASE);
+       early_trap_init((void *)vectors_base);
        mpu_setup();
        bootmem_init();
 }
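
setup_vectors_base() above keys off the Security extensions field in ID_PFR1, bits [7:4]. A standalone sketch of that feature-field test (helper name hypothetical):

#include <linux/types.h>

/*
 * ID_PFR1[7:4]: 0b0000 means the Security extensions are not
 * implemented; 0b0001/0b0010 mean VBAR exists, so the exception base
 * can be programmed instead of being fixed at 0x00000000 or 0xffff0000.
 */
static bool example_have_security_ext(u32 id_pfr1)
{
	return ((id_pfr1 >> 4) & 0xf) != 0;
}
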
diff --git a/arch/arm/mm/physaddr.c b/arch/arm/mm/physaddr.c
new file mode 100644 (file)
new file mode 100644 (file)
index 0000000..02e60f4
--- /dev/null
@@ -0,0 +1,57 @@
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/mmdebug.h>
+#include <linux/mm.h>
+
+#include <asm/sections.h>
+#include <asm/memory.h>
+#include <asm/fixmap.h>
+#include <asm/dma.h>
+
+#include "mm.h"
+
+static inline bool __virt_addr_valid(unsigned long x)
+{
+       /*
+        * high_memory does not get immediately defined, and there
+        * are early callers of __pa() against PAGE_OFFSET
+        */
+       if (!high_memory && x >= PAGE_OFFSET)
+               return true;
+
+       if (high_memory && x >= PAGE_OFFSET && x < (unsigned long)high_memory)
+               return true;
+
+       /*
+        * MAX_DMA_ADDRESS is a virtual address that may not correspond to an
+        * actual physical address. Enough code relies on __pa(MAX_DMA_ADDRESS)
+        * that we just need to work around it and always return true.
+        */
+       if (x == MAX_DMA_ADDRESS)
+               return true;
+
+       return false;
+}
+
+phys_addr_t __virt_to_phys(unsigned long x)
+{
+       WARN(!__virt_addr_valid(x),
+            "virt_to_phys used for non-linear address: %pK (%pS)\n",
+            (void *)x, (void *)x);
+
+       return __virt_to_phys_nodebug(x);
+}
+EXPORT_SYMBOL(__virt_to_phys);
+
+phys_addr_t __phys_addr_symbol(unsigned long x)
+{
+       /* This is bounds checking against the kernel image only.
+        * __pa_symbol should only be used on kernel symbol addresses.
+        */
+       VIRTUAL_BUG_ON(x < (unsigned long)KERNEL_START ||
+                      x > (unsigned long)KERNEL_END);
+
+       return __pa_symbol_nodebug(x);
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
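
With physaddr.c built in via CONFIG_DEBUG_VIRTUAL (see the arm/mm Makefile hunk earlier), a stray translation now warns at runtime instead of silently producing a bogus address. A hypothetical snippet that would trip the check:

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/io.h>

static int __init example_init(void)
{
	void *v = vmalloc(PAGE_SIZE);	/* vmalloc area, not the linear map */
	phys_addr_t pa;

	if (!v)
		return -ENOMEM;

	pa = virt_to_phys(v);	/* __virt_addr_valid() fails, WARN fires */
	pr_info("bogus pa=%pa\n", &pa);
	vfree(v);
	return 0;
}
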
index f9b08ba..548d622 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <asm/probes.h>
+#include <asm/kprobes.h>
 
 void __init arm_probes_decode_init(void);
 
index 1737aec..6deb8d7 100644 (file)
@@ -16,6 +16,9 @@
 #ifndef _ARM_KPROBES_H
 #define _ARM_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
@@ -57,4 +60,5 @@ int kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr);
 void kretprobe_trampoline(void);
 void __kprobes *trampoline_probe_handler(struct pt_regs *regs);
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ARM_KPROBES_H */
index 86032a0..657977e 100644 (file)
@@ -19,6 +19,7 @@
 #include <asm/sysreg.h>
 #include <asm/system_misc.h>
 #include <asm/traps.h>
+#include <asm/kprobes.h>
 #include <linux/uaccess.h>
 #include <asm/cpufeature.h>
 
index b6badff..3a63954 100644 (file)
@@ -31,6 +31,7 @@
 #include <asm/debug-monitors.h>
 #include <asm/fixmap.h>
 #include <asm/insn.h>
+#include <asm/kprobes.h>
 
 #define AARCH64_INSN_SF_BIT    BIT(31)
 #define AARCH64_INSN_N_BIT     BIT(22)
index 76d3f31..192ab00 100644 (file)
@@ -16,6 +16,8 @@
 #ifndef _ARM_KERNEL_KPROBES_ARM64_H
 #define _ARM_KERNEL_KPROBES_ARM64_H
 
+#include <asm/kprobes.h>
+
 /*
  * ARM strongly recommends a limit of 128 bytes between LoadExcl and
  * StoreExcl instructions in a single thread of execution. So keep the
index a8ec5da..827d52d 100644 (file)
@@ -222,7 +222,7 @@ asmlinkage void secondary_start_kernel(void)
         * All kernel threads share the same mm context; grab a
         * reference and switch to it.
         */
-       atomic_inc(&mm->mm_count);
+       mmgrab(mm);
        current->active_mm = mm;
 
        /*
index 410fbdb..f5b9210 100644 (file)
@@ -62,7 +62,7 @@ D_h   .req    x14
        sub     count, count, tmp2
        /*
        * Copy the leading memory data from src to dst in an increasing
-       * address order.By this way,the risk of overwritting the source
+       * address order.By this way,the risk of overwriting the source
        * memory data is eliminated when the distance between src and
        * dst is less than 16. The memory accesses here are alignment.
        */
index 45f563e..28dfc61 100644 (file)
 #ifndef __ASM_AVR32_KPROBES_H
 #define __ASM_AVR32_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION 0xd673  /* breakpoint */
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 
 typedef u16    kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION 0xd673  /* breakpoint */
 #define MAX_INSN_SIZE          2
 #define MAX_STACK_SIZE         64      /* 32 would probably be OK */
 
@@ -46,4 +50,5 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
 
 #define flush_insn_slot(p)     do { } while (0)
 
+#endif /* CONFIG_KPROBES */
 #endif /* __ASM_AVR32_KPROBES_H */
index d6fa60b..625db8a 100644 (file)
@@ -46,3 +46,4 @@ generic-y += unaligned.h
 generic-y += user.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index 23c4ef5..a2e6db2 100644 (file)
@@ -307,8 +307,8 @@ void secondary_start_kernel(void)
        local_irq_disable();
 
        /* Attach the new idle task to the global mm. */
-       atomic_inc(&mm->mm_users);
-       atomic_inc(&mm->mm_count);
+       mmget(mm);
+       mmgrab(mm);
        current->active_mm = mm;
 
        preempt_disable();
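
The mmget()/mmgrab() conversions that recur across architectures in this pull are mechanical renames of the two refcount bumps. Simplified equivalents of the new helpers (not the exact <linux/sched.h> bodies):

#include <linux/sched.h>

/* mmgrab() pins the struct mm_struct itself (mm_count); mmget() also
 * pins the address space (mm_users), matching the old atomic_inc() pairs. */
static inline void example_mmgrab(struct mm_struct *mm)
{
	atomic_inc(&mm->mm_count);
}

static inline void example_mmget(struct mm_struct *mm)
{
	atomic_inc(&mm->mm_users);
}
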
index 4e9f574..82619c3 100644 (file)
@@ -61,3 +61,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index 8e4ef32..0f5132b 100644 (file)
@@ -45,3 +45,4 @@ generic-y += types.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index 0f5b0d5..c33b467 100644 (file)
@@ -7,3 +7,4 @@ generic-y += mm-arch-hooks.h
 generic-y += preempt.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
+generic-y += kprobes.h
index 81757d5..3473bde 100644 (file)
@@ -188,7 +188,7 @@ int cxn_pin_by_pid(pid_t pid)
                task_lock(tsk);
                if (tsk->mm) {
                        mm = tsk->mm;
-                       atomic_inc(&mm->mm_users);
+                       mmget(mm);
                        ret = 0;
                }
                task_unlock(tsk);
index 5efd0c8..341740c 100644 (file)
@@ -74,3 +74,4 @@ generic-y += unaligned.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index a43a7c9..797b64a 100644 (file)
@@ -59,3 +59,4 @@ generic-y += unaligned.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index 983bae7..c02a645 100644 (file)
@@ -162,7 +162,7 @@ void start_secondary(void)
        );
 
        /*  Set the memory struct  */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 
        cpu = smp_processor_id();
index d5505d6..0302b36 100644 (file)
  * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
  *              <anil.s.keshavamurthy@intel.com> adapted from i386
  */
+#include <asm-generic/kprobes.h>
+#include <asm/break.h>
+
+#define BREAK_INST     (long)(__IA64_BREAK_KPROBE << 6)
+
+#ifdef CONFIG_KPROBES
+
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
-#include <asm/break.h>
 
 #define __ARCH_WANT_KPROBES_INSN_SLOT
 #define MAX_INSN_SIZE   2      /* last half is for kprobe-booster */
-#define BREAK_INST     (long)(__IA64_BREAK_KPROBE << 6)
 #define NOP_M_INST     (long)(1<<27)
 #define BRL_INST(i1, i2) ((long)((0xcL << 37) |        /* brl */ \
                                (0x1L << 12) |  /* many */ \
@@ -124,4 +129,5 @@ extern void invalidate_stacked_regs(void);
 extern void flush_register_stack(void);
 extern void arch_remove_kprobe(struct kprobe *p);
 
-#endif                         /* _ASM_KPROBES_H */
+#endif /* CONFIG_KPROBES */
+#endif /* _ASM_KPROBES_H */
index c483ece..d683229 100644 (file)
@@ -994,7 +994,7 @@ cpu_init (void)
         */
        ia64_setreg(_IA64_REG_CR_DCR,  (  IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR
                                        | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
        BUG_ON(current->mm);
 
index 4c3b84d..52704f1 100644 (file)
@@ -525,7 +525,7 @@ static int sn_topology_show(struct seq_file *s, void *d)
                                /* both ends local to this partition */
                                seq_puts(s, " local");
                        else if (SN_HWPERF_FOREIGN(p))
-                               /* both ends of the link in foreign partiton */
+                               /* both ends of the link in foreign partition */
                                seq_puts(s, " foreign");
                        else
                                /* link straddles a partition */
index 8c24c5e..deb2987 100644 (file)
@@ -11,3 +11,4 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
+generic-y += kprobes.h
index 136c69f..b18bc0b 100644 (file)
@@ -403,7 +403,7 @@ void __init cpu_init (void)
        printk(KERN_INFO "Initializing CPU#%d\n", cpu_id);
 
        /* Set up and load the per-CPU TSS and LDT */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
        if (current->mm)
                BUG();
index f108dd1..131b410 100644 (file)
@@ -1,19 +1,20 @@
-CONFIG_LOCALVERSION="amcore-001"
+CONFIG_LOCALVERSION="amcore-002"
 CONFIG_DEFAULT_HOSTNAME="amcore"
 CONFIG_SYSVIPC=y
 # CONFIG_FHANDLE is not set
 # CONFIG_USELIB is not set
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_NAMESPACES=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 # CONFIG_AIO is not set
 # CONFIG_ADVISE_SYSCALLS is not set
 # CONFIG_MEMBARRIER is not set
 CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
+# CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 # CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_CFQ is not set
 # CONFIG_MMU is not set
 CONFIG_M5307=y
 CONFIG_AMCORE=y
@@ -27,13 +28,14 @@ CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
+CONFIG_SYN_COOKIES=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
-CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y
+# CONFIG_FW_LOADER is not set
 # CONFIG_ALLOW_DEV_COREDUMP is not set
 CONFIG_CONNECTOR=y
 CONFIG_MTD=y
@@ -53,6 +55,7 @@ CONFIG_MTD_UCLINUX=y
 CONFIG_MTD_PLATRAM=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_NETDEVICES=y
+# CONFIG_NET_VENDOR_AMAZON is not set
 # CONFIG_NET_VENDOR_ARC is not set
 # CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
@@ -89,14 +92,12 @@ CONFIG_I2C=y
 CONFIG_I2C_CHARDEV=y
 # CONFIG_I2C_HELPER_AUTO is not set
 CONFIG_I2C_IMX=y
-CONFIG_PPS=y
+CONFIG_GPIO_SYSFS=y
 # CONFIG_HWMON is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 # CONFIG_RTC_SYSTOHC is not set
 CONFIG_RTC_DRV_DS1307=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
 # CONFIG_FILE_LOCKING is not set
 # CONFIG_DNOTIFY is not set
 # CONFIG_INOTIFY_USER is not set
@@ -108,6 +109,7 @@ CONFIG_ROMFS_BACKED_BY_BOTH=y
 # CONFIG_NETWORK_FILESYSTEMS is not set
 CONFIG_PRINTK_TIME=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
 # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
 CONFIG_PANIC_ON_OOPS=y
 # CONFIG_SCHED_DEBUG is not set
index 6dccda7..b865c1a 100644 (file)
@@ -3814,7 +3814,7 @@ CAS2W2_FILLER:
 #      (3) Save current DFC/SFC (ASSUMED TO BE EQUAL!!!); Then set     #
 #          SFC/DFC according to whether exception occurred in user or  #
 #          supervisor mode.                                            #
-#      (4) Use "plpaw" instruction to pre-load ATC with efective       #
+#      (4) Use "plpaw" instruction to pre-load ATC with effective      #
 #          address page(s). THIS SHOULD NOT FAULT!!! The relevant      #
 #          page(s) should have been made resident prior to entering    #
 #          this routine.                                               #
index 6c76d6c..d4f9ccb 100644 (file)
@@ -33,3 +33,4 @@ generic-y += trace_clock.h
 generic-y += types.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index d3731f0..f9b9df5 100644 (file)
@@ -54,3 +54,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index bad1323..c622293 100644 (file)
@@ -344,8 +344,8 @@ asmlinkage void secondary_start_kernel(void)
         * All kernel threads share the same mm context; grab a
         * reference and switch to it.
         */
-       atomic_inc(&mm->mm_users);
-       atomic_inc(&mm->mm_count);
+       mmget(mm);
+       mmgrab(mm);
        current->active_mm = mm;
        cpumask_set_cpu(cpu, mm_cpumask(mm));
        enter_lazy_tlb(mm, current);
index 6275eb0..1732ec1 100644 (file)
@@ -10,3 +10,4 @@ generic-y += preempt.h
 generic-y += syscalls.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
+generic-y += kprobes.h
index daba1f9..291846d 100644 (file)
@@ -22,6 +22,9 @@
 #ifndef _ASM_KPROBES_H
 #define _ASM_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
 #include <linux/ptrace.h>
 #include <linux/types.h>
 
@@ -94,4 +97,5 @@ struct kprobe_ctlblk {
 extern int kprobe_exceptions_notify(struct notifier_block *self,
                                    unsigned long val, void *data);
 
-#endif                         /* _ASM_KPROBES_H */
+#endif /* CONFIG_KPROBES */
+#endif /* _ASM_KPROBES_H */
index cb479be..49c6df2 100644 (file)
@@ -2232,7 +2232,7 @@ void per_cpu_trap_init(bool is_boot_cpu)
        if (!cpu_data[cpu].asid_cache)
                cpu_data[cpu].asid_cache = asid_first_version(cpu);
 
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
        BUG_ON(current->mm);
        enter_lazy_tlb(&init_mm, current);
index c800b59..7abea0b 100644 (file)
 #ifndef _ASM_KPROBES_H
 #define _ASM_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION 0xff
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 
 struct kprobe;
 
 typedef unsigned char kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION 0xff
 #define MAX_INSN_SIZE 8
 #define MAX_STACK_SIZE 128
 
@@ -47,4 +51,5 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
 
 extern void arch_remove_kprobe(struct kprobe *p);
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ASM_KPROBES_H */
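
The asm/kprobes.h edits above (avr32, ia64, mips, mn10300, plus the generic-y additions in the Kbuild hunks) all share one shape: include the asm-generic fallback, keep the breakpoint encoding visible unconditionally, and compile the kprobes-only machinery out when CONFIG_KPROBES is off. A condensed sketch of the resulting header layout (constants illustrative):

/* asm/kprobes.h after this series (sketch) */
#include <asm-generic/kprobes.h>

#define BREAKPOINT_INSTRUCTION	0xff	/* trap handlers need this even
					 * when kprobes is disabled */

#ifdef CONFIG_KPROBES
#include <linux/types.h>

typedef unsigned char kprobe_opcode_t;	/* kprobes-only definitions */
#define MAX_INSN_SIZE 8
#endif /* CONFIG_KPROBES */
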
index 426173c..e65b5cc 100644 (file)
@@ -589,7 +589,7 @@ static void __init smp_cpu_init(void)
        }
        printk(KERN_INFO "Initializing CPU#%d\n", cpu_id);
 
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
        BUG_ON(current->mm);
 
index 35b0e88..aaa3c21 100644 (file)
@@ -62,3 +62,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index fb24175..fb01873 100644 (file)
@@ -67,3 +67,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index bc65008..1b7160c 100644 (file)
@@ -319,7 +319,7 @@ EXCEPTION_ENTRY(_timer_handler)
        l.j    _ret_from_intr
         l.nop
 
-/* ---[ 0x600: Aligment exception ]-------------------------------------- */
+/* ---[ 0x600: Alignment exception ]-------------------------------------- */
 
 EXCEPTION_ENTRY(_alignment_handler)
        CLEAR_LWA_FLAG(r3)
@@ -331,8 +331,8 @@ EXCEPTION_ENTRY(_alignment_handler)
         l.nop
 
 #if 0
-EXCEPTION_ENTRY(_aligment_handler)
-//        l.mfspr r2,r0,SPR_EEAR_BASE     /* Load the efective addres */
+EXCEPTION_ENTRY(_alignment_handler)
+//        l.mfspr r2,r0,SPR_EEAR_BASE     /* Load the effective address */
        l.addi  r2,r4,0
 //        l.mfspr r5,r0,SPR_EPCR_BASE     /* Load the insn address */
        l.lwz   r5,PT_PC(r1)
index d01b82e..1e87913 100644 (file)
@@ -325,7 +325,7 @@ _dispatch_do_ipage_fault:
     .org 0x500
        EXCEPTION_HANDLE(_timer_handler)
 
-/* ---[ 0x600: Aligment exception ]-------------------------------------- */
+/* ---[ 0x600: Alignment exception ]-------------------------------------- */
     .org 0x600
        EXCEPTION_HANDLE(_alignment_handler)
 
@@ -640,8 +640,8 @@ _flush_tlb:
 
 /* ========================================[ cache ]=== */
 
-       /* aligment here so we don't change memory offsets with
-        * memory controler defined
+       /* alignment here so we don't change memory offsets with
+        * memory controller defined
         */
        .align 0x2000
 
index 5525446..00ddb78 100644 (file)
@@ -19,8 +19,8 @@
 
 /* TODO
  *             - clean up __offset & stuff
- *             - change all 8192 aligment to PAGE !!!
- *             - recheck if all aligments are really needed
+ *             - change all 8192 alignment to PAGE !!!
+ *             - recheck if all alignments are really needed
  */
 
 #  define LOAD_OFFSET  PAGE_OFFSET
index cc70b41..a9909c2 100644 (file)
@@ -28,3 +28,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index 75dab28..67b452b 100644 (file)
@@ -279,7 +279,7 @@ smp_cpu_init(int cpunum)
        set_cpu_online(cpunum, true);
 
        /* Initialise the idle task for this CPU */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
        BUG_ON(current->mm);
        enter_lazy_tlb(&init_mm, current);
index 8e94448..76b2bd6 100644 (file)
@@ -55,7 +55,7 @@
                                label = "kernel";
                                reg = <0x01c00000 0x002e0000>;
                        };
-                       partiton@1ee0000 {
+                       partition@1ee0000 {
                                label = "dtb";
                                reg = <0x01ee0000 0x00020000>;
                        };
index d73e9df..1145dc8 100644 (file)
@@ -30,7 +30,7 @@ extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 
 #ifndef __ASSEMBLY__
 /*
- * ISA 3.0 partiton and process table entry format
+ * ISA 3.0 partition and process table entry format
  */
 struct prtb_entry {
        __be64 prtb0;
index 3abb583..b889d13 100644 (file)
@@ -109,7 +109,7 @@ static inline unsigned int fh_send_nmi(unsigned int vcpu_mask)
 #define FH_DTPROP_MAX_PROPLEN 32768
 
 /**
- * fh_partiton_get_dtprop - get a property from a guest device tree.
+ * fh_partition_get_dtprop - get a property from a guest device tree.
  * @handle: handle of partition whose device tree is to be accessed
  * @dtpath_addr: physical address of device tree path to access
  * @propname_addr: physical address of name of property
index d821835..0503c98 100644 (file)
@@ -1,5 +1,8 @@
 #ifndef _ASM_POWERPC_KPROBES_H
 #define _ASM_POWERPC_KPROBES_H
+
+#include <asm-generic/kprobes.h>
+
 #ifdef __KERNEL__
 /*
  *  Kernel Probes (KProbes)
index 893bd7f..573fb3a 100644 (file)
@@ -707,7 +707,7 @@ void start_secondary(void *unused)
        unsigned int cpu = smp_processor_id();
        int i, base;
 
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 
        smp_store_cpu_info(cpu);
index 0899315..0d3002b 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/page.h>
 #include <asm/code-patching.h>
 #include <linux/uaccess.h>
+#include <linux/kprobes.h>
 
 
 int patch_instruction(unsigned int *addr, unsigned int instr)
index 8278f43..e0f83c2 100644 (file)
@@ -3034,7 +3034,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
 /*
  * This function is supposed to be called on basis of PE from top
  * to bottom style. So the the I/O or MMIO segment assigned to
- * parent PE could be overrided by its child PEs if necessary.
+ * parent PE could be overridden by its child PEs if necessary.
  */
 static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
 {
index 0024e45..4d757ea 100644 (file)
@@ -1020,7 +1020,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
        /* check largest block * page size > max memory hotplug addr */
        max_addr = memory_hotplug_max();
        if (query.largest_available_block < (max_addr >> page_shift)) {
-               dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "
+               dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
                          "%llu-sized pages\n", max_addr,  query.largest_available_block,
                          1ULL << page_shift);
                goto out_failed;
index 6845e91..954dbf8 100644 (file)
@@ -1587,7 +1587,7 @@ extract_tbr (unsigned long insn,
 #define CTX(op, xop)   (OP (op) | (((unsigned long)(xop)) & 0x7))
 #define CTX_MASK CTX(0x3f, 0x7)
 
-/* An User Context form instruction.  */
+/* A User Context form instruction.  */
 #define UCTX(op, xop)  (OP (op) | (((unsigned long)(xop)) & 0x1f))
 #define UCTX_MASK UCTX(0x3f, 0x1f)
 
index d5c1073..a2dcef0 100644 (file)
@@ -134,6 +134,7 @@ config S390
        select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
+       select HAVE_COPY_THREAD_TLS
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DMA_API_DEBUG
        select HAVE_DMA_CONTIGUOUS
index e009753..143b1e0 100644 (file)
@@ -678,6 +678,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
 CONFIG_CRYPTO_SHA1_S390=m
 CONFIG_CRYPTO_SHA256_S390=m
 CONFIG_CRYPTO_SHA512_S390=m
index 2cf8734..2358bf3 100644 (file)
@@ -628,6 +628,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
 CONFIG_CRYPTO_SHA1_S390=m
 CONFIG_CRYPTO_SHA256_S390=m
 CONFIG_CRYPTO_SHA512_S390=m
index d1033de..402c530 100644 (file)
@@ -6,7 +6,7 @@ obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
-obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
+obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o paes_s390.o
 obj-$(CONFIG_S390_PRNG) += prng.o
 obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
 obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
new file mode 100644 (file)
new file mode 100644 (file)
index 0000000..d69ea49
--- /dev/null
@@ -0,0 +1,619 @@
+/*
+ * Cryptographic API.
+ *
+ * s390 implementation of the AES Cipher Algorithm with protected keys.
+ *
+ * s390 Version:
+ *   Copyright IBM Corp. 2017
+ *   Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *             Harald Freudenberger <freude@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ */
+
+#define KMSG_COMPONENT "paes_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <crypto/xts.h>
+#include <asm/cpacf.h>
+#include <asm/pkey.h>
+
+static u8 *ctrblk;
+static DEFINE_SPINLOCK(ctrblk_lock);
+
+static cpacf_mask_t km_functions, kmc_functions, kmctr_functions;
+
+struct s390_paes_ctx {
+       struct pkey_seckey sk;
+       struct pkey_protkey pk;
+       unsigned long fc;
+};
+
+struct s390_pxts_ctx {
+       struct pkey_seckey sk[2];
+       struct pkey_protkey pk[2];
+       unsigned long fc;
+};
+
+static inline int __paes_convert_key(struct pkey_seckey *sk,
+                                    struct pkey_protkey *pk)
+{
+       int i, ret;
+
+       /* try three times in case of failure */
+       for (i = 0; i < 3; i++) {
+               ret = pkey_skey2pkey(sk, pk);
+               if (ret == 0)
+                       break;
+       }
+
+       return ret;
+}
+
+static int __paes_set_key(struct s390_paes_ctx *ctx)
+{
+       unsigned long fc;
+
+       if (__paes_convert_key(&ctx->sk, &ctx->pk))
+               return -EINVAL;
+
+       /* Pick the correct function code based on the protected key type */
+       fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 :
+               (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 :
+               (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0;
+
+       /* Check if the function code is available */
+       ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+
+       return ctx->fc ? 0 : -EINVAL;
+}
+
+static int ecb_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                           unsigned int key_len)
+{
+       struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       if (key_len != SECKEYBLOBSIZE)
+               return -EINVAL;
+
+       memcpy(ctx->sk.seckey, in_key, SECKEYBLOBSIZE);
+       if (__paes_set_key(ctx)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int ecb_paes_crypt(struct blkcipher_desc *desc,
+                         unsigned long modifier,
+                         struct blkcipher_walk *walk)
+{
+       struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       unsigned int nbytes, n, k;
+       int ret;
+
+       ret = blkcipher_walk_virt(desc, walk);
+       while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+               /* only use complete blocks */
+               n = nbytes & ~(AES_BLOCK_SIZE - 1);
+               k = cpacf_km(ctx->fc | modifier, ctx->pk.protkey,
+                            walk->dst.virt.addr, walk->src.virt.addr, n);
+               if (k)
+                       ret = blkcipher_walk_done(desc, walk, nbytes - k);
+               if (k < n) {
+                       if (__paes_set_key(ctx) != 0)
+                               return blkcipher_walk_done(desc, walk, -EIO);
+               }
+       }
+       return ret;
+}
+
+static int ecb_paes_encrypt(struct blkcipher_desc *desc,
+                           struct scatterlist *dst, struct scatterlist *src,
+                           unsigned int nbytes)
+{
+       struct blkcipher_walk walk;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       return ecb_paes_crypt(desc, CPACF_ENCRYPT, &walk);
+}
+
+static int ecb_paes_decrypt(struct blkcipher_desc *desc,
+                           struct scatterlist *dst, struct scatterlist *src,
+                           unsigned int nbytes)
+{
+       struct blkcipher_walk walk;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       return ecb_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg ecb_paes_alg = {
+       .cra_name               =       "ecb(paes)",
+       .cra_driver_name        =       "ecb-paes-s390",
+       .cra_priority           =       400,    /* combo: aes + ecb */
+       .cra_flags              =       CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          =       AES_BLOCK_SIZE,
+       .cra_ctxsize            =       sizeof(struct s390_paes_ctx),
+       .cra_type               =       &crypto_blkcipher_type,
+       .cra_module             =       THIS_MODULE,
+       .cra_list               =       LIST_HEAD_INIT(ecb_paes_alg.cra_list),
+       .cra_u                  =       {
+               .blkcipher = {
+                       .min_keysize            =       SECKEYBLOBSIZE,
+                       .max_keysize            =       SECKEYBLOBSIZE,
+                       .setkey                 =       ecb_paes_set_key,
+                       .encrypt                =       ecb_paes_encrypt,
+                       .decrypt                =       ecb_paes_decrypt,
+               }
+       }
+};
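/*
 * Illustrative sketch, not part of this file: how a kernel caller might
 * exercise the new "ecb(paes)" algorithm through the synchronous
 * blkcipher API current in this release. The caller and buffer setup
 * are assumed; the key is a CCA secure-key blob, never clear key bits.
 */
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <asm/pkey.h>

static int example_use_paes(const u8 *seckey_blob,	/* SECKEYBLOBSIZE bytes */
			    struct scatterlist *dst,
			    struct scatterlist *src, unsigned int nbytes)
{
	struct crypto_blkcipher *tfm;
	struct blkcipher_desc desc;
	int ret;

	tfm = crypto_alloc_blkcipher("ecb(paes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	ret = crypto_blkcipher_setkey(tfm, seckey_blob, SECKEYBLOBSIZE);
	if (!ret) {
		desc.tfm = tfm;
		desc.flags = 0;
		ret = crypto_blkcipher_encrypt(&desc, dst, src, nbytes);
	}

	crypto_free_blkcipher(tfm);
	return ret;
}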
+
+static int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
+{
+       unsigned long fc;
+
+       if (__paes_convert_key(&ctx->sk, &ctx->pk))
+               return -EINVAL;
+
+       /* Pick the correct function code based on the protected key type */
+       fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 :
+               (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMC_PAES_192 :
+               (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KMC_PAES_256 : 0;
+
+       /* Check if the function code is available */
+       ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0;
+
+       return ctx->fc ? 0 : -EINVAL;
+}
+
+static int cbc_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                           unsigned int key_len)
+{
+       struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       memcpy(ctx->sk.seckey, in_key, SECKEYBLOBSIZE);
+       if (__cbc_paes_set_key(ctx)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int cbc_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+                         struct blkcipher_walk *walk)
+{
+       struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       unsigned int nbytes, n, k;
+       int ret;
+       struct {
+               u8 iv[AES_BLOCK_SIZE];
+               u8 key[MAXPROTKEYSIZE];
+       } param;
+
+       ret = blkcipher_walk_virt(desc, walk);
+       memcpy(param.iv, walk->iv, AES_BLOCK_SIZE);
+       memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+       while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+               /* only use complete blocks */
+               n = nbytes & ~(AES_BLOCK_SIZE - 1);
+               k = cpacf_kmc(ctx->fc | modifier, &param,
+                             walk->dst.virt.addr, walk->src.virt.addr, n);
+               if (k)
+                       ret = blkcipher_walk_done(desc, walk, nbytes - k);
+               if (k < n) {
+                       if (__cbc_paes_set_key(ctx) != 0)
+                               return blkcipher_walk_done(desc, walk, -EIO);
+                       memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+               }
+       }
+       memcpy(walk->iv, param.iv, AES_BLOCK_SIZE);
+       return ret;
+}
+
+static int cbc_paes_encrypt(struct blkcipher_desc *desc,
+                           struct scatterlist *dst, struct scatterlist *src,
+                           unsigned int nbytes)
+{
+       struct blkcipher_walk walk;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       return cbc_paes_crypt(desc, 0, &walk);
+}
+
+static int cbc_paes_decrypt(struct blkcipher_desc *desc,
+                           struct scatterlist *dst, struct scatterlist *src,
+                           unsigned int nbytes)
+{
+       struct blkcipher_walk walk;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       return cbc_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg cbc_paes_alg = {
+       .cra_name               =       "cbc(paes)",
+       .cra_driver_name        =       "cbc-paes-s390",
+       .cra_priority           =       400,    /* combo: aes + cbc */
+       .cra_flags              =       CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          =       AES_BLOCK_SIZE,
+       .cra_ctxsize            =       sizeof(struct s390_paes_ctx),
+       .cra_type               =       &crypto_blkcipher_type,
+       .cra_module             =       THIS_MODULE,
+       .cra_list               =       LIST_HEAD_INIT(cbc_paes_alg.cra_list),
+       .cra_u                  =       {
+               .blkcipher = {
+                       .min_keysize            =       SECKEYBLOBSIZE,
+                       .max_keysize            =       SECKEYBLOBSIZE,
+                       .ivsize                 =       AES_BLOCK_SIZE,
+                       .setkey                 =       cbc_paes_set_key,
+                       .encrypt                =       cbc_paes_encrypt,
+                       .decrypt                =       cbc_paes_decrypt,
+               }
+       }
+};
+
+static int __xts_paes_set_key(struct s390_pxts_ctx *ctx)
+{
+       unsigned long fc;
+
+       if (__paes_convert_key(&ctx->sk[0], &ctx->pk[0]) ||
+           __paes_convert_key(&ctx->sk[1], &ctx->pk[1]))
+               return -EINVAL;
+
+       if (ctx->pk[0].type != ctx->pk[1].type)
+               return -EINVAL;
+
+       /* Pick the correct function code based on the protected key type */
+       fc = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PXTS_128 :
+               (ctx->pk[0].type == PKEY_KEYTYPE_AES_256) ?
+               CPACF_KM_PXTS_256 : 0;
+
+       /* Check if the function code is available */
+       ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+
+       return ctx->fc ? 0 : -EINVAL;
+}
+
+static int xts_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                           unsigned int key_len)
+{
+       struct s390_pxts_ctx *ctx = crypto_tfm_ctx(tfm);
+       u8 ckey[2 * AES_MAX_KEY_SIZE];
+       unsigned int ckey_len;
+
+       memcpy(ctx->sk[0].seckey, in_key, SECKEYBLOBSIZE);
+       memcpy(ctx->sk[1].seckey, in_key + SECKEYBLOBSIZE, SECKEYBLOBSIZE);
+       if (__xts_paes_set_key(ctx)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+
+       /*
+        * xts_check_key verifies the key length is not odd and makes
+        * sure that the two keys are not the same. This can be done
+        * on the two protected keys as well.
+        */
+       ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ?
+               AES_KEYSIZE_128 : AES_KEYSIZE_256;
+       memcpy(ckey, ctx->pk[0].protkey, ckey_len);
+       memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len);
+       return xts_check_key(tfm, ckey, 2*ckey_len);
+}
+
+static int xts_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+                         struct blkcipher_walk *walk)
+{
+       struct s390_pxts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       unsigned int keylen, offset, nbytes, n, k;
+       int ret;
+       struct {
+               u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */
+               u8 tweak[16];
+               u8 block[16];
+               u8 bit[16];
+               u8 xts[16];
+       } pcc_param;
+       struct {
+               u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */
+               u8 init[16];
+       } xts_param;
+
+       ret = blkcipher_walk_virt(desc, walk);
+       keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64;
+       offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0;
+retry:
+       memset(&pcc_param, 0, sizeof(pcc_param));
+       memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak));
+       memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen);
+       cpacf_pcc(ctx->fc, pcc_param.key + offset);
+
+       memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen);
+       memcpy(xts_param.init, pcc_param.xts, 16);
+
+       while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+               /* only use complete blocks */
+               n = nbytes & ~(AES_BLOCK_SIZE - 1);
+               k = cpacf_km(ctx->fc | modifier, xts_param.key + offset,
+                            walk->dst.virt.addr, walk->src.virt.addr, n);
+               if (k)
+                       ret = blkcipher_walk_done(desc, walk, nbytes - k);
+               if (k < n) {
+                       if (__xts_paes_set_key(ctx) != 0)
+                               return blkcipher_walk_done(desc, walk, -EIO);
+                       goto retry;
+               }
+       }
+       return ret;
+}
+
+static int xts_paes_encrypt(struct blkcipher_desc *desc,
+                           struct scatterlist *dst, struct scatterlist *src,
+                           unsigned int nbytes)
+{
+       struct blkcipher_walk walk;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       return xts_paes_crypt(desc, 0, &walk);
+}
+
+static int xts_paes_decrypt(struct blkcipher_desc *desc,
+                           struct scatterlist *dst, struct scatterlist *src,
+                           unsigned int nbytes)
+{
+       struct blkcipher_walk walk;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       return xts_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg xts_paes_alg = {
+       .cra_name               =       "xts(paes)",
+       .cra_driver_name        =       "xts-paes-s390",
+       .cra_priority           =       400,    /* combo: aes + xts */
+       .cra_flags              =       CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          =       AES_BLOCK_SIZE,
+       .cra_ctxsize            =       sizeof(struct s390_pxts_ctx),
+       .cra_type               =       &crypto_blkcipher_type,
+       .cra_module             =       THIS_MODULE,
+       .cra_list               =       LIST_HEAD_INIT(xts_paes_alg.cra_list),
+       .cra_u                  =       {
+               .blkcipher = {
+                       .min_keysize            =       2 * SECKEYBLOBSIZE,
+                       .max_keysize            =       2 * SECKEYBLOBSIZE,
+                       .ivsize                 =       AES_BLOCK_SIZE,
+                       .setkey                 =       xts_paes_set_key,
+                       .encrypt                =       xts_paes_encrypt,
+                       .decrypt                =       xts_paes_decrypt,
+               }
+       }
+};
+
+static int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
+{
+       unsigned long fc;
+
+       if (__paes_convert_key(&ctx->sk, &ctx->pk))
+               return -EINVAL;
+
+       /* Pick the correct function code based on the protected key type */
+       fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 :
+               (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMCTR_PAES_192 :
+               (ctx->pk.type == PKEY_KEYTYPE_AES_256) ?
+               CPACF_KMCTR_PAES_256 : 0;
+
+       /* Check if the function code is available */
+       ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0;
+
+       return ctx->fc ? 0 : -EINVAL;
+}
+
+static int ctr_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                           unsigned int key_len)
+{
+       struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       memcpy(ctx->sk.seckey, in_key, key_len);
+       if (__ctr_paes_set_key(ctx)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes)
+{
+       unsigned int i, n;
+
+       /* only use complete blocks, max. PAGE_SIZE */
+       memcpy(ctrptr, iv, AES_BLOCK_SIZE);
+       n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1);
+       for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) {
+               memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE);
+               crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+               ctrptr += AES_BLOCK_SIZE;
+       }
+       return n;
+}
+
+static int ctr_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+                         struct blkcipher_walk *walk)
+{
+       struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       u8 buf[AES_BLOCK_SIZE], *ctrptr;
+       unsigned int nbytes, n, k;
+       int ret, locked;
+
+       locked = spin_trylock(&ctrblk_lock);
+
+       ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE);
+       while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+               n = AES_BLOCK_SIZE;
+               if (nbytes >= 2*AES_BLOCK_SIZE && locked)
+                       n = __ctrblk_init(ctrblk, walk->iv, nbytes);
+               ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv;
+               k = cpacf_kmctr(ctx->fc | modifier, ctx->pk.protkey,
+                               walk->dst.virt.addr, walk->src.virt.addr,
+                               n, ctrptr);
+               if (k) {
+                       if (ctrptr == ctrblk)
+                               memcpy(walk->iv, ctrptr + k - AES_BLOCK_SIZE,
+                                      AES_BLOCK_SIZE);
+                       crypto_inc(walk->iv, AES_BLOCK_SIZE);
+                       ret = blkcipher_walk_done(desc, walk, nbytes - n);
+               }
+               if (k < n) {
+                       if (__ctr_paes_set_key(ctx) != 0)
+                               return blkcipher_walk_done(desc, walk, -EIO);
+               }
+       }
+       if (locked)
+               spin_unlock(&ctrblk_lock);
+       /*
+        * final block may be < AES_BLOCK_SIZE, copy only nbytes
+        */
+       if (nbytes) {
+               while (1) {
+                       if (cpacf_kmctr(ctx->fc | modifier,
+                                       ctx->pk.protkey, buf,
+                                       walk->src.virt.addr, AES_BLOCK_SIZE,
+                                       walk->iv) == AES_BLOCK_SIZE)
+                               break;
+                       if (__ctr_paes_set_key(ctx) != 0)
+                               return blkcipher_walk_done(desc, walk, -EIO);
+               }
+               memcpy(walk->dst.virt.addr, buf, nbytes);
+               crypto_inc(walk->iv, AES_BLOCK_SIZE);
+               ret = blkcipher_walk_done(desc, walk, 0);
+       }
+
+       return ret;
+}
+
+static int ctr_paes_encrypt(struct blkcipher_desc *desc,
+                           struct scatterlist *dst, struct scatterlist *src,
+                           unsigned int nbytes)
+{
+       struct blkcipher_walk walk;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       return ctr_paes_crypt(desc, 0, &walk);
+}
+
+static int ctr_paes_decrypt(struct blkcipher_desc *desc,
+                           struct scatterlist *dst, struct scatterlist *src,
+                           unsigned int nbytes)
+{
+       struct blkcipher_walk walk;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       return ctr_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg ctr_paes_alg = {
+       .cra_name               =       "ctr(paes)",
+       .cra_driver_name        =       "ctr-paes-s390",
+       .cra_priority           =       400,    /* combo: aes + ctr */
+       .cra_flags              =       CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          =       1,
+       .cra_ctxsize            =       sizeof(struct s390_paes_ctx),
+       .cra_type               =       &crypto_blkcipher_type,
+       .cra_module             =       THIS_MODULE,
+       .cra_list               =       LIST_HEAD_INIT(ctr_paes_alg.cra_list),
+       .cra_u                  =       {
+               .blkcipher = {
+                       .min_keysize            =       SECKEYBLOBSIZE,
+                       .max_keysize            =       SECKEYBLOBSIZE,
+                       .ivsize                 =       AES_BLOCK_SIZE,
+                       .setkey                 =       ctr_paes_set_key,
+                       .encrypt                =       ctr_paes_encrypt,
+                       .decrypt                =       ctr_paes_decrypt,
+               }
+       }
+};
+
+static inline void __crypto_unregister_alg(struct crypto_alg *alg)
+{
+       if (!list_empty(&alg->cra_list))
+               crypto_unregister_alg(alg);
+}
+
+static void paes_s390_fini(void)
+{
+       if (ctrblk)
+               free_page((unsigned long) ctrblk);
+       __crypto_unregister_alg(&ctr_paes_alg);
+       __crypto_unregister_alg(&xts_paes_alg);
+       __crypto_unregister_alg(&cbc_paes_alg);
+       __crypto_unregister_alg(&ecb_paes_alg);
+}
+
+static int __init paes_s390_init(void)
+{
+       int ret;
+
+       /* Query available functions for KM, KMC and KMCTR */
+       cpacf_query(CPACF_KM, &km_functions);
+       cpacf_query(CPACF_KMC, &kmc_functions);
+       cpacf_query(CPACF_KMCTR, &kmctr_functions);
+
+       if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) ||
+           cpacf_test_func(&km_functions, CPACF_KM_PAES_192) ||
+           cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) {
+               ret = crypto_register_alg(&ecb_paes_alg);
+               if (ret)
+                       goto out_err;
+       }
+
+       if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) ||
+           cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) ||
+           cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) {
+               ret = crypto_register_alg(&cbc_paes_alg);
+               if (ret)
+                       goto out_err;
+       }
+
+       if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) ||
+           cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) {
+               ret = crypto_register_alg(&xts_paes_alg);
+               if (ret)
+                       goto out_err;
+       }
+
+       if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) ||
+           cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) ||
+           cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) {
+               ret = crypto_register_alg(&ctr_paes_alg);
+               if (ret)
+                       goto out_err;
+               ctrblk = (u8 *) __get_free_page(GFP_KERNEL);
+               if (!ctrblk) {
+                       ret = -ENOMEM;
+                       goto out_err;
+               }
+       }
+
+       return 0;
+out_err:
+       paes_s390_fini();
+       return ret;
+}
+
+module_init(paes_s390_init);
+module_exit(paes_s390_fini);
+
+MODULE_ALIAS_CRYPTO("paes");
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm with protected keys");
+MODULE_LICENSE("GPL");
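
For illustration, a minimal in-kernel consumer of the ciphers registered above, written against the v4.11-era synchronous blkcipher API. Only the algorithm name "cbc(paes)" and the SECKEYBLOBSIZE key length come from the driver; the key source, the buffers and the omitted error handling are assumptions of this sketch.

    /* Hedged sketch: encrypt one block in place with cbc(paes). */
    struct crypto_blkcipher *tfm;
    struct blkcipher_desc desc;
    struct scatterlist sg;
    u8 seckey[SECKEYBLOBSIZE];          /* secure key blob, assumed filled in */
    u8 buf[AES_BLOCK_SIZE] = { 0 };
    u8 iv[AES_BLOCK_SIZE] = { 0 };

    tfm = crypto_alloc_blkcipher("cbc(paes)", 0, 0);
    crypto_blkcipher_setkey(tfm, seckey, sizeof(seckey));
    crypto_blkcipher_set_iv(tfm, iv, sizeof(iv));
    desc.tfm = tfm;
    desc.flags = 0;
    sg_init_one(&sg, buf, sizeof(buf));
    crypto_blkcipher_encrypt(&desc, &sg, &sg, sizeof(buf));
    crypto_free_blkcipher(tfm);

The "if (k < n)" retry paths above are what make this transparent to callers: when the protected key is invalidated, for example after the guest is suspended or migrated, the CPACF instruction completes short and the driver re-derives the protected key from the secure key blob before continuing.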
index d00e368..68bfd09 100644 (file)
@@ -229,6 +229,7 @@ CONFIG_CRYPTO_USER_API_HASH=m
 CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
 CONFIG_CRYPTO_SHA1_S390=m
 CONFIG_CRYPTO_SHA256_S390=m
 CONFIG_CRYPTO_SHA512_S390=m
index 2c680db..e2dfbf2 100644 (file)
@@ -28,8 +28,9 @@
 #define CPACF_PPNO             0xb93c          /* MSA5 */
 
 /*
- * Decryption modifier bit
+ * En/decryption modifier bits
  */
+#define CPACF_ENCRYPT          0x00
 #define CPACF_DECRYPT          0x80
 
 /*
 #define CPACF_KM_AES_128       0x12
 #define CPACF_KM_AES_192       0x13
 #define CPACF_KM_AES_256       0x14
+#define CPACF_KM_PAES_128      0x1a
+#define CPACF_KM_PAES_192      0x1b
+#define CPACF_KM_PAES_256      0x1c
 #define CPACF_KM_XTS_128       0x32
 #define CPACF_KM_XTS_256       0x34
+#define CPACF_KM_PXTS_128      0x3a
+#define CPACF_KM_PXTS_256      0x3c
 
 /*
  * Function codes for the KMC (CIPHER MESSAGE WITH CHAINING)
@@ -56,6 +62,9 @@
 #define CPACF_KMC_AES_128      0x12
 #define CPACF_KMC_AES_192      0x13
 #define CPACF_KMC_AES_256      0x14
+#define CPACF_KMC_PAES_128     0x1a
+#define CPACF_KMC_PAES_192     0x1b
+#define CPACF_KMC_PAES_256     0x1c
 #define CPACF_KMC_PRNG         0x43
 
 /*
@@ -69,6 +78,9 @@
 #define CPACF_KMCTR_AES_128    0x12
 #define CPACF_KMCTR_AES_192    0x13
 #define CPACF_KMCTR_AES_256    0x14
+#define CPACF_KMCTR_PAES_128   0x1a
+#define CPACF_KMCTR_PAES_192   0x1b
+#define CPACF_KMCTR_PAES_256   0x1c
 
 /*
  * Function codes for the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST)
 #define CPACF_KMAC_TDEA_192    0x03
 
 /*
+ * Function codes for the PCKMO (PERFORM CRYPTOGRAPHIC KEY MANAGEMENT)
+ * instruction
+ */
+#define CPACF_PCKMO_QUERY              0x00
+#define CPACF_PCKMO_ENC_DES_KEY                0x01
+#define CPACF_PCKMO_ENC_TDES_128_KEY   0x02
+#define CPACF_PCKMO_ENC_TDES_192_KEY   0x03
+#define CPACF_PCKMO_ENC_AES_128_KEY    0x12
+#define CPACF_PCKMO_ENC_AES_192_KEY    0x13
+#define CPACF_PCKMO_ENC_AES_256_KEY    0x14
+
+/*
  * Function codes for the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION)
  * instruction
  */
@@ -397,4 +421,24 @@ static inline void cpacf_pcc(unsigned long func, void *param)
                : "cc", "memory");
 }
 
+/**
+ * cpacf_pckmo() - executes the PCKMO (PERFORM CRYPTOGRAPHIC KEY
+ *               MANAGEMENT) instruction
+ * @func: the function code passed to PCKMO; see CPACF_PCKMO_xxx defines
+ * @param: address of parameter block; see POP for details on each func
+ *
+ * Returns nothing; the result is written into the parameter block.
+ */
+static inline void cpacf_pckmo(long func, void *param)
+{
+       register unsigned long r0 asm("0") = (unsigned long) func;
+       register unsigned long r1 asm("1") = (unsigned long) param;
+
+       asm volatile(
+               "       .insn   rre,%[opc] << 16,0,0\n" /* PCKMO opcode */
+               :
+               : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCKMO)
+               : "cc", "memory");
+}
+
 #endif /* _ASM_S390_CPACF_H */
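
The new cpacf_pckmo() wrapper is what the pkey code uses to wrap a clear key into a protected key. A minimal sketch for AES-128, assuming the PoP parameter-block layout of clear key followed by room for the 32-byte wrapping-key verification pattern (the real code lives in the pkey driver, not shown in this excerpt):

    u8 param[64];

    memset(param, 0, sizeof(param));
    memcpy(param, clrkey, 16);          /* clrkey: assumed 16-byte AES key */
    cpacf_pckmo(CPACF_PCKMO_ENC_AES_128_KEY, param);
    /* param now holds the 48-byte protected key (wrapped key + pattern) */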
index 591e5a5..84c0f90 100644 (file)
  * 2005-Dec    Used as a template for s390 by Mike Grundy
  *             <grundym@us.ibm.com>
  */
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION 0x0002
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
@@ -37,7 +42,6 @@ struct pt_regs;
 struct kprobe;
 
 typedef u16 kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION 0x0002
 
 /* Maximum instruction size is 3 (16bit) halfwords: */
 #define MAX_INSN_SIZE          0x0003
@@ -91,4 +95,5 @@ int probe_is_insn_relative_long(u16 *insn);
 
 #define flush_insn_slot(p)     do { } while (0)
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ASM_S390_KPROBES_H */
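
The same reshuffle repeats for sh, sparc64, tile and x86 further down: BREAKPOINT_INSTRUCTION moves in front of the new CONFIG_KPROBES guard so that code outside the kprobes subsystem can still name the breakpoint opcode when kprobes is compiled out. A hypothetical consumer, only to show what the placement permits:

    #include <asm/kprobes.h>    /* BREAKPOINT_INSTRUCTION, guard or not */

    static bool insn_is_breakpoint(u16 insn)
    {
            return insn == BREAKPOINT_INSTRUCTION;
    }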
index 67f7a99..9b828c0 100644 (file)
@@ -63,7 +63,7 @@ static inline void set_user_asce(struct mm_struct *mm)
        S390_lowcore.user_asce = mm->context.asce;
        if (current->thread.mm_segment.ar4)
                __ctl_load(S390_lowcore.user_asce, 7, 7);
-       set_cpu_flag(CIF_ASCE);
+       set_cpu_flag(CIF_ASCE_PRIMARY);
 }
 
 static inline void clear_user_asce(void)
@@ -81,7 +81,7 @@ static inline void load_kernel_asce(void)
        __ctl_store(asce, 1, 1);
        if (asce != S390_lowcore.kernel_asce)
                __ctl_load(S390_lowcore.kernel_asce, 1, 1);
-       set_cpu_flag(CIF_ASCE);
+       set_cpu_flag(CIF_ASCE_PRIMARY);
 }
 
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
index 5251186..7ed1972 100644 (file)
@@ -640,12 +640,12 @@ static inline int pud_bad(pud_t pud)
 
 static inline int pmd_present(pmd_t pmd)
 {
-       return pmd_val(pmd) != _SEGMENT_ENTRY_INVALID;
+       return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY;
 }
 
 static inline int pmd_none(pmd_t pmd)
 {
-       return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
+       return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY;
 }
 
 static inline unsigned long pmd_pfn(pmd_t pmd)
@@ -803,7 +803,7 @@ static inline void pud_clear(pud_t *pud)
 
 static inline void pmd_clear(pmd_t *pmdp)
 {
-       pmd_val(*pmdp) = _SEGMENT_ENTRY_INVALID;
+       pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -1357,7 +1357,7 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long addr, pmd_t *pmdp)
 {
-       return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+       return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
@@ -1367,10 +1367,10 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
 {
        if (full) {
                pmd_t pmd = *pmdp;
-               *pmdp = __pmd(_SEGMENT_ENTRY_INVALID);
+               *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
                return pmd;
        }
-       return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+       return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
@@ -1384,7 +1384,7 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 static inline void pmdp_invalidate(struct vm_area_struct *vma,
                                   unsigned long addr, pmd_t *pmdp)
 {
-       pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+       pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
 }
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
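
For the _SEGMENT_ENTRY_INVALID to _SEGMENT_ENTRY_EMPTY conversions in this file and in mm/gmap.c and mm/hugetlbpage.c below: assuming the v4.11-era definitions, the two macros expand to the same value, so the change is purely semantic. INVALID names a single bit, EMPTY names the full pattern of an empty segment-table entry, and whole-entry comparisons and stores now use the latter. Roughly:

    #define _SEGMENT_ENTRY_INVALID  0x20    /* invalid segment table entry bit */
    #define _SEGMENT_ENTRY_EMPTY    (_SEGMENT_ENTRY_INVALID)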
diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h
new file mode 100644 (file)
index 0000000..b48aef4
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * Kernelspace interface to the pkey device driver
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author: Harald Freudenberger <freude@de.ibm.com>
+ *
+ */
+
+#ifndef _KAPI_PKEY_H
+#define _KAPI_PKEY_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+#include <uapi/asm/pkey.h>
+
+/*
+ * Generate (AES) random secure key.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param seckey pointer to buffer receiving the secure key
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_genseckey(__u16 cardnr, __u16 domain,
+                  __u32 keytype, struct pkey_seckey *seckey);
+
+/*
+ * Generate (AES) secure key with given key value.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param clrkey pointer to buffer with clear key data
+ * @param seckey pointer to buffer receiving the secure key
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_clr2seckey(__u16 cardnr, __u16 domain, __u32 keytype,
+                   const struct pkey_clrkey *clrkey,
+                   struct pkey_seckey *seckey);
+
+/*
+ * Derive (AES) protected key from the (AES) secure key blob.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param seckey pointer to buffer with the input secure key
+ * @param protkey pointer to buffer receiving the protected key and
+ *       additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_sec2protkey(__u16 cardnr, __u16 domain,
+                    const struct pkey_seckey *seckey,
+                    struct pkey_protkey *protkey);
+
+/*
+ * Derive (AES) protected key from a given clear key value.
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param clrkey pointer to buffer with clear key data
+ * @param protkey pointer to buffer receiving the protected key and
+ *       additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_clr2protkey(__u32 keytype,
+                    const struct pkey_clrkey *clrkey,
+                    struct pkey_protkey *protkey);
+
+/*
+ * Search for a matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ * @param seckey pointer to buffer with the input secure key
+ * @param cardnr pointer to cardnr, receives the card number on success
+ * @param domain pointer to domain, receives the domain number on success
+ * @param verify if set, always verify by fetching verification pattern
+ *       from card
+ * @return 0 on success, negative errno value on failure. If no card could be
+ *        found, -ENODEV is returned.
+ */
+int pkey_findcard(const struct pkey_seckey *seckey,
+                 __u16 *cardnr, __u16 *domain, int verify);
+
+/*
+ * Find card and transform secure key to protected key.
+ * @param seckey pointer to buffer with the input secure key
+ * @param protkey pointer to buffer receiving the protected key and
+ *       additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_skey2pkey(const struct pkey_seckey *seckey,
+                  struct pkey_protkey *protkey);
+
+#endif /* _KAPI_PKEY_H */
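
The paes module above consumes this interface through __paes_convert_key(), which is implemented via pkey_skey2pkey(). A minimal sketch of such a call; the surrounding buffer handling is assumed:

    struct pkey_seckey seckey;          /* 64-byte secure key blob, assumed given */
    struct pkey_protkey protkey;
    int rc;

    rc = pkey_skey2pkey(&seckey, &protkey);
    if (rc)
            return rc;
    /* protkey.protkey and protkey.len now feed cpacf_km() and friends */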
index dacba34..e498871 100644 (file)
 #include <linux/const.h>
 
 #define CIF_MCCK_PENDING       0       /* machine check handling is pending */
-#define CIF_ASCE               1       /* user asce needs fixup / uaccess */
-#define CIF_NOHZ_DELAY         2       /* delay HZ disable for a tick */
-#define CIF_FPU                        3       /* restore FPU registers */
-#define CIF_IGNORE_IRQ         4       /* ignore interrupt (for udelay) */
-#define CIF_ENABLED_WAIT       5       /* in enabled wait state */
+#define CIF_ASCE_PRIMARY       1       /* primary asce needs fixup / uaccess */
+#define CIF_ASCE_SECONDARY     2       /* secondary asce needs fixup / uaccess */
+#define CIF_NOHZ_DELAY         3       /* delay HZ disable for a tick */
+#define CIF_FPU                        4       /* restore FPU registers */
+#define CIF_IGNORE_IRQ         5       /* ignore interrupt (for udelay) */
+#define CIF_ENABLED_WAIT       6       /* in enabled wait state */
 
 #define _CIF_MCCK_PENDING      _BITUL(CIF_MCCK_PENDING)
-#define _CIF_ASCE              _BITUL(CIF_ASCE)
+#define _CIF_ASCE_PRIMARY      _BITUL(CIF_ASCE_PRIMARY)
+#define _CIF_ASCE_SECONDARY    _BITUL(CIF_ASCE_SECONDARY)
 #define _CIF_NOHZ_DELAY                _BITUL(CIF_NOHZ_DELAY)
 #define _CIF_FPU               _BITUL(CIF_FPU)
 #define _CIF_IGNORE_IRQ                _BITUL(CIF_IGNORE_IRQ)
@@ -89,7 +91,8 @@ extern void execve_tail(void);
  * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
  */
 
-#define TASK_SIZE_OF(tsk)      ((tsk)->mm->context.asce_limit)
+#define TASK_SIZE_OF(tsk)      ((tsk)->mm ? \
+                                (tsk)->mm->context.asce_limit : TASK_MAX_SIZE)
 #define TASK_UNMAPPED_BASE     (test_thread_flag(TIF_31BIT) ? \
                                        (1UL << 30) : (1UL << 41))
 #define TASK_SIZE              TASK_SIZE_OF(current)
@@ -200,10 +203,12 @@ struct stack_frame {
 struct task_struct;
 struct mm_struct;
 struct seq_file;
+struct pt_regs;
 
 typedef int (*dump_trace_func_t)(void *data, unsigned long address, int reliable);
 void dump_trace(dump_trace_func_t func, void *data,
                struct task_struct *task, unsigned long sp);
+void show_registers(struct pt_regs *regs);
 
 void show_cacheinfo(struct seq_file *m);
 
index b2988fc..136932f 100644 (file)
@@ -14,6 +14,7 @@
  */
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <asm/processor.h>
 #include <asm/ctl_reg.h>
 
 #define VERIFY_READ     0
 
 #define get_ds()        (KERNEL_DS)
 #define get_fs()        (current->thread.mm_segment)
-
-#define set_fs(x)                                                      \
-do {                                                                   \
-       unsigned long __pto;                                            \
-       current->thread.mm_segment = (x);                               \
-       __pto = current->thread.mm_segment.ar4 ?                        \
-               S390_lowcore.user_asce : S390_lowcore.kernel_asce;      \
-       __ctl_load(__pto, 7, 7);                                        \
-} while (0)
-
 #define segment_eq(a,b) ((a).ar4 == (b).ar4)
 
+static inline void set_fs(mm_segment_t fs)
+{
+       current->thread.mm_segment = fs;
+       if (segment_eq(fs, KERNEL_DS)) {
+               set_cpu_flag(CIF_ASCE_SECONDARY);
+               __ctl_load(S390_lowcore.kernel_asce, 7, 7);
+       } else {
+               clear_cpu_flag(CIF_ASCE_SECONDARY);
+               __ctl_load(S390_lowcore.user_asce, 7, 7);
+       }
+}
+
 static inline int __range_ok(unsigned long addr, unsigned long size)
 {
        return 1;
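
set_fs() keeps its usual calling convention; what is new is the CIF_ASCE_SECONDARY bookkeeping, which lets the system-call and interrupt exit paths detect a task that returns to user space while still in KERNEL_DS (see the entry.S and set_fs_fixup() changes below). The balanced caller pattern is unchanged, sketched here:

    mm_segment_t old_fs = get_fs();

    set_fs(KERNEL_DS);      /* cr7 now holds the kernel asce */
    /* ... kernel-internal accesses through the uaccess primitives ... */
    set_fs(old_fs);         /* cr7 back to the user asce */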
index bf736e7..6848ba5 100644 (file)
@@ -24,6 +24,7 @@ header-y += mman.h
 header-y += monwriter.h
 header-y += msgbuf.h
 header-y += param.h
+header-y += pkey.h
 header-y += poll.h
 header-y += posix_types.h
 header-y += ptrace.h
diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h
new file mode 100644 (file)
index 0000000..ed7f19c
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * Userspace interface to the pkey device driver
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author: Harald Freudenberger <freude@de.ibm.com>
+ *
+ */
+
+#ifndef _UAPI_PKEY_H
+#define _UAPI_PKEY_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/*
+ * Ioctl calls supported by the pkey device driver
+ */
+
+#define PKEY_IOCTL_MAGIC 'p'
+
+#define SECKEYBLOBSIZE 64     /* secure key blob size is always 64 bytes */
+#define MAXPROTKEYSIZE 64     /* a protected key blob may be up to 64 bytes */
+#define MAXCLRKEYSIZE  32     /* a clear key value may be up to 32 bytes */
+
+/* defines for the type field within the pkey_protkey struct */
+#define PKEY_KEYTYPE_AES_128  1
+#define PKEY_KEYTYPE_AES_192  2
+#define PKEY_KEYTYPE_AES_256  3
+
+/* Struct to hold a secure key blob */
+struct pkey_seckey {
+       __u8  seckey[SECKEYBLOBSIZE];             /* the secure key blob */
+};
+
+/* Struct to hold protected key and length info */
+struct pkey_protkey {
+       __u32 type;          /* key type, one of the PKEY_KEYTYPE values */
+       __u32 len;              /* bytes actually stored in protkey[]    */
+       __u8  protkey[MAXPROTKEYSIZE];         /* the protected key blob */
+};
+
+/* Struct to hold a clear key value */
+struct pkey_clrkey {
+       __u8  clrkey[MAXCLRKEYSIZE]; /* 16, 24, or 32 byte clear key value */
+};
+
+/*
+ * Generate secure key
+ */
+struct pkey_genseck {
+       __u16 cardnr;               /* in: card to use or FFFF for any   */
+       __u16 domain;               /* in: domain or FFFF for any        */
+       __u32 keytype;              /* in: key type to generate          */
+       struct pkey_seckey seckey;  /* out: the secure key blob          */
+};
+#define PKEY_GENSECK _IOWR(PKEY_IOCTL_MAGIC, 0x01, struct pkey_genseck)
+
+/*
+ * Construct secure key from clear key value
+ */
+struct pkey_clr2seck {
+       __u16 cardnr;               /* in: card to use or FFFF for any   */
+       __u16 domain;               /* in: domain or FFFF for any        */
+       __u32 keytype;              /* in: key type to generate          */
+       struct pkey_clrkey clrkey;  /* in: the clear key value           */
+       struct pkey_seckey seckey;  /* out: the secure key blob          */
+};
+#define PKEY_CLR2SECK _IOWR(PKEY_IOCTL_MAGIC, 0x02, struct pkey_clr2seck)
+
+/*
+ * Fabricate protected key from a secure key
+ */
+struct pkey_sec2protk {
+       __u16 cardnr;                /* in: card to use or FFFF for any   */
+       __u16 domain;                /* in: domain or FFFF for any        */
+       struct pkey_seckey seckey;   /* in: the secure key blob           */
+       struct pkey_protkey protkey; /* out: the protected key            */
+};
+#define PKEY_SEC2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x03, struct pkey_sec2protk)
+
+/*
+ * Fabricate protected key from a clear key value
+ */
+struct pkey_clr2protk {
+       __u32 keytype;               /* in: key type to generate          */
+       struct pkey_clrkey clrkey;   /* in: the clear key value           */
+       struct pkey_protkey protkey; /* out: the protected key            */
+};
+#define PKEY_CLR2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x04, struct pkey_clr2protk)
+
+/*
+ * Search for matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ */
+struct pkey_findcard {
+       struct pkey_seckey seckey;             /* in: the secure key blob */
+       __u16  cardnr;                         /* out: card number        */
+       __u16  domain;                         /* out: domain number      */
+};
+#define PKEY_FINDCARD _IOWR(PKEY_IOCTL_MAGIC, 0x05, struct pkey_findcard)
+
+/*
+ * Combined together: findcard + sec2prot
+ */
+struct pkey_skey2pkey {
+       struct pkey_seckey seckey;   /* in: the secure key blob           */
+       struct pkey_protkey protkey; /* out: the protected key            */
+};
+#define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey)
+
+#endif /* _UAPI_PKEY_H */
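
A hedged userspace sketch of the ioctl flow defined above: generate a random secure key, then derive a protected key from it. The device node name /dev/pkey is an assumption about how the pkey driver (not part of this excerpt) registers its device, and error handling is trimmed:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <asm/pkey.h>

    struct pkey_genseck gs;
    struct pkey_sec2protk sp;
    int fd = open("/dev/pkey", O_RDWR);     /* device name assumed */

    memset(&gs, 0, sizeof(gs));
    gs.cardnr = 0xFFFF;                     /* any card */
    gs.domain = 0xFFFF;                     /* any domain */
    gs.keytype = PKEY_KEYTYPE_AES_256;
    ioctl(fd, PKEY_GENSECK, &gs);           /* fills gs.seckey */

    memset(&sp, 0, sizeof(sp));
    sp.cardnr = 0xFFFF;
    sp.domain = 0xFFFF;
    sp.seckey = gs.seckey;                  /* struct copy of the blob */
    ioctl(fd, PKEY_SEC2PROTK, &sp);         /* fills sp.protkey */
    close(fd);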
index db469fa..dff2152 100644 (file)
@@ -50,7 +50,8 @@ _TIF_WORK     = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
                   _TIF_UPROBE)
 _TIF_TRACE     = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
                   _TIF_SYSCALL_TRACEPOINT)
-_CIF_WORK      = (_CIF_MCCK_PENDING | _CIF_ASCE | _CIF_FPU)
+_CIF_WORK      = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
+                  _CIF_ASCE_SECONDARY | _CIF_FPU)
 _PIF_WORK      = (_PIF_PER_TRAP)
 
 #define BASED(name) name-cleanup_critical(%r13)
@@ -339,8 +340,8 @@ ENTRY(system_call)
        jo      .Lsysc_notify_resume
        TSTMSK  __LC_CPU_FLAGS,_CIF_FPU
        jo      .Lsysc_vxrs
-       TSTMSK  __LC_CPU_FLAGS,_CIF_ASCE
-       jo      .Lsysc_uaccess
+       TSTMSK  __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
+       jnz     .Lsysc_asce
        j       .Lsysc_return           # beware of critical section cleanup
 
 #
@@ -358,12 +359,15 @@ ENTRY(system_call)
        jg      s390_handle_mcck        # TIF bit will be cleared by handler
 
 #
-# _CIF_ASCE is set, load user space asce
+# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce
 #
-.Lsysc_uaccess:
-       ni      __LC_CPU_FLAGS+7,255-_CIF_ASCE
+.Lsysc_asce:
+       ni      __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
        lctlg   %c1,%c1,__LC_USER_ASCE          # load primary asce
-       j       .Lsysc_return
+       TSTMSK  __LC_CPU_FLAGS,_CIF_ASCE_SECONDARY
+       jz      .Lsysc_return
+       larl    %r14,.Lsysc_return
+       jg      set_fs_fixup
 
 #
 # CIF_FPU is set, restore floating-point controls and floating-point registers.
@@ -661,8 +665,8 @@ ENTRY(io_int_handler)
        jo      .Lio_notify_resume
        TSTMSK  __LC_CPU_FLAGS,_CIF_FPU
        jo      .Lio_vxrs
-       TSTMSK  __LC_CPU_FLAGS,_CIF_ASCE
-       jo      .Lio_uaccess
+       TSTMSK  __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
+       jnz     .Lio_asce
        j       .Lio_return             # beware of critical section cleanup
 
 #
@@ -675,12 +679,15 @@ ENTRY(io_int_handler)
        j       .Lio_return
 
 #
-# _CIF_ASCE is set, load user space asce
+# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce
 #
-.Lio_uaccess:
-       ni      __LC_CPU_FLAGS+7,255-_CIF_ASCE
+.Lio_asce:
+       ni      __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
        lctlg   %c1,%c1,__LC_USER_ASCE          # load primary asce
-       j       .Lio_return
+       TSTMSK  __LC_CPU_FLAGS,_CIF_ASCE_SECONDARY
+       jz      .Lio_return
+       larl    %r14,.Lio_return
+       jg      set_fs_fixup
 
 #
 # CIF_FPU is set, restore floating-point controls and floating-point registers.
index e79f030..33f9018 100644 (file)
@@ -80,5 +80,6 @@ long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t);
 DECLARE_PER_CPU(u64, mt_cycles[8]);
 
 void verify_facilities(void);
+void set_fs_fixup(void);
 
 #endif /* _ENTRY_H */
index 56e14d0..80c093e 100644 (file)
@@ -116,6 +116,19 @@ static int notrace s390_validate_registers(union mci mci, int umode)
                        s390_handle_damage();
                kill_task = 1;
        }
+       /* Validate control registers */
+       if (!mci.cr) {
+               /*
+                * Control registers have unknown contents.
+                * Can't recover and therefore stopping machine.
+                */
+               s390_handle_damage();
+       } else {
+               asm volatile(
+                       "       lctlg   0,15,0(%0)\n"
+                       "       ptlb\n"
+                       : : "a" (&S390_lowcore.cregs_save_area) : "memory");
+       }
        if (!mci.fp) {
                /*
                 * Floating point registers can't be restored. If the
@@ -208,18 +221,6 @@ static int notrace s390_validate_registers(union mci mci, int umode)
                 */
                kill_task = 1;
        }
-       /* Validate control registers */
-       if (!mci.cr) {
-               /*
-                * Control registers have unknown contents.
-                * Can't recover and therefore stopping machine.
-                */
-               s390_handle_damage();
-       } else {
-               asm volatile(
-                       "       lctlg   0,15,0(%0)"
-                       : : "a" (&S390_lowcore.cregs_save_area) : "memory");
-       }
        /*
         * We don't even try to validate the TOD register, since we simply
         * can't write something sensible into that register.
index c5b86b4..5428166 100644 (file)
@@ -100,8 +100,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
        return 0;
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
-               unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
+                   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
        struct fake_frame
        {
@@ -156,7 +156,6 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
 
        /* Set a new TLS ?  */
        if (clone_flags & CLONE_SETTLS) {
-               unsigned long tls = frame->childregs.gprs[6];
                if (is_compat_task()) {
                        p->thread.acrs[0] = (unsigned int)tls;
                } else {
@@ -234,3 +233,16 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
        ret = PAGE_ALIGN(mm->brk + brk_rnd());
        return (ret > mm->brk) ? ret : mm->brk;
 }
+
+void set_fs_fixup(void)
+{
+       struct pt_regs *regs = current_pt_regs();
+       static bool warned;
+
+       set_fs(USER_DS);
+       if (warned)
+               return;
+       WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code);
+       show_registers(regs);
+       warned = true;
+}
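
set_fs_fixup() is the target of the larl/jg pairs added to entry.S above: %r14 is loaded with the resume label, so the jg acts as a call that returns to .Lsysc_return or .Lio_return once the address space is repaired. The bug pattern it catches and warns about, sketched (some_error is a stand-in):

    mm_segment_t old_fs = get_fs();

    set_fs(KERNEL_DS);
    if (some_error)
            return -EIO;    /* bug: KERNEL_DS leaks; the exit path runs
                             * set_fs_fixup() and warns once */
    set_fs(old_fs);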
index 21004aa..bc2b60d 100644 (file)
@@ -73,7 +73,7 @@ void cpu_init(void)
        get_cpu_id(id);
        if (machine_has_cpu_mhz)
                update_cpu_mhz(NULL);
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
        BUG_ON(current->mm);
        enter_lazy_tlb(&init_mm, current);
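
This and the later atomic_inc(&...mm_count) conversions across mips, sh, sparc, tile and x86 are mechanical substitutions; mmgrab() and mmget() are new helpers in <linux/sched.h> as of this cycle, and at this point they are just named wrappers:

    static inline void mmgrab(struct mm_struct *mm)
    {
            atomic_inc(&mm->mm_count);      /* pin the mm_struct itself */
    }

    static inline void mmget(struct mm_struct *mm)
    {
            atomic_inc(&mm->mm_users);      /* pin the address space too */
    }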
index b4a3e9e..31bd96e 100644 (file)
@@ -350,7 +350,7 @@ static void __add_vtimer(struct vtimer_list *timer, int periodic)
 }
 
 /*
- * add_virt_timer - add an oneshot virtual CPU timer
+ * add_virt_timer - add a oneshot virtual CPU timer
  */
 void add_virt_timer(struct vtimer_list *timer)
 {
index 59ac937..a07b1ec 100644 (file)
@@ -359,8 +359,8 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
        spin_lock(&gmap->guest_table_lock);
        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
        if (entry) {
-               flush = (*entry != _SEGMENT_ENTRY_INVALID);
-               *entry = _SEGMENT_ENTRY_INVALID;
+               flush = (*entry != _SEGMENT_ENTRY_EMPTY);
+               *entry = _SEGMENT_ENTRY_EMPTY;
        }
        spin_unlock(&gmap->guest_table_lock);
        return flush;
@@ -589,7 +589,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
                return rc;
        ptl = pmd_lock(mm, pmd);
        spin_lock(&gmap->guest_table_lock);
-       if (*table == _SEGMENT_ENTRY_INVALID) {
+       if (*table == _SEGMENT_ENTRY_EMPTY) {
                rc = radix_tree_insert(&gmap->host_to_guest,
                                       vmaddr >> PMD_SHIFT, table);
                if (!rc)
index a038162..9b4050c 100644 (file)
@@ -62,7 +62,7 @@ static inline unsigned long __pte_to_rste(pte_t pte)
                rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
                                     _SEGMENT_ENTRY_NOEXEC);
        } else
-               rste = _SEGMENT_ENTRY_INVALID;
+               rste = _SEGMENT_ENTRY_EMPTY;
        return rste;
 }
 
index db3e28c..926943a 100644 (file)
@@ -13,3 +13,4 @@ generic-y += trace_clock.h
 generic-y += xor.h
 generic-y += serial.h
 generic-y += word-at-a-time.h
+generic-y += kprobes.h
index 2b22bcf..569ac02 100644 (file)
@@ -336,7 +336,7 @@ void __init trap_init(void)
        set_except_vector(18, handle_dbe);
        flush_icache_range(DEBUG_VECTOR_BASE_ADDR, IRQ_VECTOR_BASE_ADDR);
 
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
        cpu_cache_init();
 }
index 134f398..f0986f9 100644 (file)
@@ -1,13 +1,16 @@
 #ifndef __ASM_SH_KPROBES_H
 #define __ASM_SH_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION 0xc33a
+
 #ifdef CONFIG_KPROBES
 
 #include <linux/types.h>
 #include <linux/ptrace.h>
 
 typedef insn_size_t kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION 0xc33a
 
 #define MAX_INSN_SIZE 16
 #define MAX_STACK_SIZE 64
index bc35911..04487e8 100644 (file)
@@ -99,7 +99,7 @@ static inline void handle_one_irq(unsigned int irq)
                        "mov    %0, r4          \n"
                        "mov    r15, r8         \n"
                        "jsr    @%1             \n"
-                       /* swith to the irq stack */
+                       /* switch to the irq stack */
                        " mov   %2, r15         \n"
                        /* restore the stack (ring zero) */
                        "mov    r8, r15         \n"
index 38e7860..edc4769 100644 (file)
@@ -178,8 +178,8 @@ asmlinkage void start_secondary(void)
        struct mm_struct *mm = &init_mm;
 
        enable_mmu();
-       atomic_inc(&mm->mm_count);
-       atomic_inc(&mm->mm_users);
+       mmgrab(mm);
+       mmget(mm);
        current->active_mm = mm;
 #ifdef CONFIG_MMU
        enter_lazy_tlb(mm, current);
index a145d79..49f8402 100644 (file)
@@ -1,13 +1,17 @@
 #ifndef _SPARC64_KPROBES_H
 #define _SPARC64_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION   0x91d02070 /* ta 0x70 */
+#define BREAKPOINT_INSTRUCTION_2 0x91d02071 /* ta 0x71 */
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/percpu.h>
 
 typedef u32 kprobe_opcode_t;
 
-#define BREAKPOINT_INSTRUCTION   0x91d02070 /* ta 0x70 */
-#define BREAKPOINT_INSTRUCTION_2 0x91d02071 /* ta 0x71 */
 #define MAX_INSN_SIZE 2
 
 #define kretprobe_blacklist_size 0
@@ -48,4 +52,6 @@ int kprobe_exceptions_notify(struct notifier_block *self,
 int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 asmlinkage void __kprobes kprobe_trap(unsigned long trap_level,
                                      struct pt_regs *regs);
+
+#endif /* CONFIG_KPROBES */
 #endif /* _SPARC64_KPROBES_H */
index 16f1037..475dd41 100644 (file)
@@ -9,7 +9,7 @@ extern struct thread_info *current_set[NR_CPUS];
  * Flush windows so that the VM switch which follows
  * would not pull the stack from under us.
  *
- * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work)
+ * SWITCH_ENTER and SWITCH_DO_LAZY_FPU do not work yet (e.g. SMP does not work)
  * XXX WTF is the above comment? Found in late teen 2.4.x.
  */
 #ifdef CONFIG_SMP
index 71e16f2..b99d337 100644 (file)
@@ -93,7 +93,7 @@ void leon_cpu_pre_online(void *arg)
                             : "memory" /* paranoid */);
 
        /* Attach to the address space of init_task. */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 
        while (!cpumask_test_cpu(cpuid, &smp_commenced_mask))
index 90a02cb..8e3e139 100644 (file)
@@ -122,7 +122,7 @@ void smp_callin(void)
        current_thread_info()->new_child = 0;
 
        /* Attach to the address space of init_task. */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 
        /* inform the notifiers about the new cpu */
index 9d98e50..7b55c50 100644 (file)
@@ -93,7 +93,7 @@ void sun4d_cpu_pre_online(void *arg)
        show_leds(cpuid);
 
        /* Attach to the address space of init_task. */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 
        local_ops->cache_all();
index 278c40a..633c4cf 100644 (file)
@@ -59,7 +59,7 @@ void sun4m_cpu_pre_online(void *arg)
                             : "memory" /* paranoid */);
 
        /* Attach to the address space of init_task. */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 
        while (!cpumask_test_cpu(cpuid, &smp_commenced_mask))
index 4f21df7..ecddac5 100644 (file)
@@ -448,7 +448,7 @@ void trap_init(void)
                thread_info_offsets_are_bolixed_pete();
 
        /* Attach to the address space of init_task. */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 
        /* NOTE: Other cpus have this done as they are started
index dfc97a4..e022d7b 100644 (file)
@@ -2837,6 +2837,6 @@ void __init trap_init(void)
        /* Attach to the address space of init_task.  On SMP we
         * do this in smp.c:smp_callin for other cpus.
         */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
 }
index c4ac58e..8f35eea 100644 (file)
@@ -30,7 +30,7 @@
 /* 001001011 - two 32-bit merges */
 #define FPMERGE_OPF    0x04b
 
-/* 000110001 - 8-by-16-bit partitoned product  */
+/* 000110001 - 8-by-16-bit partitioned product  */
 #define FMUL8x16_OPF   0x031
 
 /* 000110011 - 8-by-16-bit upper alpha partitioned product  */
index d8f9a83..4a8b1ca 100644 (file)
 #ifndef _ASM_TILE_KPROBES_H
 #define _ASM_TILE_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
+
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
-
 #include <arch/opcode.h>
 
 #define __ARCH_WANT_KPROBES_INSN_SLOT
@@ -76,4 +79,5 @@ void arch_remove_kprobe(struct kprobe *);
 extern int kprobe_exceptions_notify(struct notifier_block *self,
                             unsigned long val, void *data);
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ASM_TILE_KPROBES_H */
index 6c0abaa..53ce940 100644 (file)
@@ -160,7 +160,7 @@ static void start_secondary(void)
        __this_cpu_write(current_asid, min_asid);
 
        /* Set up this thread as another owner of the init_mm */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        current->active_mm = &init_mm;
        if (current->mm)
                BUG();
index 90c281c..e9d42aa 100644 (file)
@@ -25,3 +25,4 @@ generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index 5d51ade..84205fe 100644 (file)
@@ -63,3 +63,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index c4cba00..63c1d13 100644 (file)
@@ -74,14 +74,6 @@ config EFI_PGT_DUMP
          issues with the mapping of the EFI runtime regions into that
          table.
 
-config DEBUG_RODATA_TEST
-       bool "Testcase for the marking rodata read-only"
-       default y
-       ---help---
-         This option enables a testcase for the setting rodata read-only
-         as well as for the change_page_attr() infrastructure.
-         If in doubt, say "N"
-
 config DEBUG_WX
        bool "Warn on W+X mappings at boot"
        select X86_PTDUMP_CORE
index 872877d..e7e1942 100644 (file)
@@ -90,18 +90,8 @@ void clflush_cache_range(void *addr, unsigned int size);
 
 #define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
 
-extern const int rodata_test_data;
 extern int kernel_set_to_readonly;
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
 
-#ifdef CONFIG_DEBUG_RODATA_TEST
-int rodata_test(void);
-#else
-static inline int rodata_test(void)
-{
-       return 0;
-}
-#endif
-
 #endif /* _ASM_X86_CACHEFLUSH_H */
index eb5deb4..4926534 100644 (file)
@@ -15,7 +15,7 @@
  * FIXME: Accessing the desc_struct through its fields is more elegant,
  * and should be the one valid thing to do. However, a lot of open code
  * still touches the a and b accessors, and doing this allow us to do it
- * incrementally. We keep the signature as a struct, rather than an union,
+ * incrementally. We keep the signature as a struct, rather than a union,
  * so we can get rid of it transparently in the future -- glommer
  */
 /* 8 byte segment descriptor */
index d1d1e50..2005816 100644 (file)
  *
  * See arch/x86/kernel/kprobes.c for x86 kprobes history.
  */
+
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION 0xcc
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
@@ -32,7 +38,6 @@ struct pt_regs;
 struct kprobe;
 
 typedef u8 kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION 0xcc
 #define RELATIVEJUMP_OPCODE 0xe9
 #define RELATIVEJUMP_SIZE 5
 #define RELATIVECALL_OPCODE 0xe8
@@ -116,4 +121,6 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
                                    unsigned long val, void *data);
 extern int kprobe_int3_handler(struct pt_regs *regs);
 extern int kprobe_debug_handler(struct pt_regs *regs);
+
+#endif /* CONFIG_KPROBES */
 #endif /* _ASM_X86_KPROBES_H */
index 8f50fb3..72277b1 100644 (file)
@@ -121,7 +121,8 @@ static inline void native_pmd_clear(pmd_t *pmd)
        *(tmp + 1) = 0;
 }
 
-#ifndef CONFIG_SMP
+#if !defined(CONFIG_SMP) || (defined(CONFIG_HIGHMEM64G) && \
+               defined(CONFIG_PARAVIRT))
 static inline void native_pud_clear(pud_t *pudp)
 {
 }
index bdcdb3b..84c0059 100644 (file)
@@ -100,7 +100,6 @@ obj-$(CONFIG_HPET_TIMER)    += hpet.o
 obj-$(CONFIG_APB_TIMER)                += apb_timer.o
 
 obj-$(CONFIG_AMD_NB)           += amd_nb.o
-obj-$(CONFIG_DEBUG_RODATA_TEST)        += test_rodata.o
 obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)                += kvm.o kvmclock.o
index 8567c85..4261b32 100644 (file)
@@ -1865,14 +1865,14 @@ static void __smp_spurious_interrupt(u8 vector)
                "should never happen.\n", vector, smp_processor_id());
 }
 
-__visible void smp_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        __smp_spurious_interrupt(~regs->orig_ax);
        exiting_irq();
 }
 
-__visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
 {
        u8 vector = ~regs->orig_ax;
 
@@ -1923,14 +1923,14 @@ static void __smp_error_interrupt(struct pt_regs *regs)
 
 }
 
-__visible void smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        __smp_error_interrupt(regs);
        exiting_irq();
 }
 
-__visible void smp_trace_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        trace_error_apic_entry(ERROR_APIC_VECTOR);
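
The __irq_entry annotations added here and in the following x86 hunks place the handlers in the .irqentry.text section, which the function-graph tracer uses to recognize hard-interrupt entry points. Assuming the v4.11-era definition in <linux/interrupt.h>, roughly:

    #ifdef CONFIG_FUNCTION_GRAPH_TRACER
    # define __irq_entry  __attribute__((__section__(".irqentry.text")))
    #else
    # define __irq_entry
    #endif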
index 5d30c5e..f3557a1 100644 (file)
@@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg)
                __send_cleanup_vector(data);
 }
 
-asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
+asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
 {
        unsigned vector, me;
 
index f07005e..c64ca59 100644 (file)
@@ -1510,7 +1510,7 @@ void cpu_init(void)
        for (i = 0; i <= IO_BITMAP_LONGS; i++)
                t->io_bitmap[i] = ~0UL;
 
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        me->active_mm = &init_mm;
        BUG_ON(me->mm);
        enter_lazy_tlb(&init_mm, me);
@@ -1561,7 +1561,7 @@ void cpu_init(void)
        /*
         * Set up and load the per-CPU TSS and LDT
         */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        curr->active_mm = &init_mm;
        BUG_ON(curr->mm);
        enter_lazy_tlb(&init_mm, curr);
index 9e5427d..524cc57 100644 (file)
@@ -816,14 +816,14 @@ static inline void __smp_deferred_error_interrupt(void)
        deferred_error_int_vector();
 }
 
-asmlinkage __visible void smp_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
 {
        entering_irq();
        __smp_deferred_error_interrupt();
        exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
 {
        entering_irq();
        trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
index 85469f8..d7cc190 100644 (file)
@@ -396,14 +396,16 @@ static inline void __smp_thermal_interrupt(void)
        smp_thermal_vector();
 }
 
-asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_thermal_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        __smp_thermal_interrupt();
        exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_trace_thermal_interrupt(struct pt_regs *regs)
 {
        entering_irq();
        trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
index 9beb092..bb0e75e 100644 (file)
@@ -23,14 +23,14 @@ static inline void __smp_threshold_interrupt(void)
        mce_threshold_vector();
 }
 
-asmlinkage __visible void smp_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
 {
        entering_irq();
        __smp_threshold_interrupt();
        exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void)
 {
        entering_irq();
        trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
index 7c6e9ff..4d8183b 100644 (file)
@@ -264,7 +264,7 @@ void __smp_x86_platform_ipi(void)
                x86_platform_ipi_callback();
 }
 
-__visible void smp_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
 
@@ -315,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
 }
 #endif
 
-__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
 
index 3512ba6..2754878 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/hardirq.h>
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
 
 static inline void __smp_irq_work_interrupt(void)
 {
@@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void)
        irq_work_run();
 }
 
-__visible void smp_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        __smp_irq_work_interrupt();
        exiting_irq();
 }
 
-__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        trace_irq_work_entry(IRQ_WORK_VECTOR);
index 69780ed..4bf0c89 100644 (file)
@@ -575,7 +575,9 @@ static void __init reserve_crashkernel(void)
        /* 0 means: find the address automatically */
        if (crash_base <= 0) {
                /*
-                *  kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
+                * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
+                * as old kexec-tools loads bzImage below that, unless
+                * "crashkernel=size[KMG],high" is specified.
                 */
                crash_base = memblock_find_in_range(CRASH_ALIGN,
                                                    high ? CRASH_ADDR_HIGH_MAX
index 68f8cc2..d3c66a1 100644 (file)
@@ -259,7 +259,7 @@ static inline void __smp_reschedule_interrupt(void)
        scheduler_ipi();
 }
 
-__visible void smp_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
 {
        ack_APIC_irq();
        __smp_reschedule_interrupt();
@@ -268,7 +268,7 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs)
         */
 }
 
-__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
 {
        /*
         * Need to call irq_enter() before calling the trace point.
@@ -292,14 +292,15 @@ static inline void __smp_call_function_interrupt(void)
        inc_irq_stat(irq_call_count);
 }
 
-__visible void smp_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        __smp_call_function_interrupt();
        exiting_irq();
 }
 
-__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        trace_call_function_entry(CALL_FUNCTION_VECTOR);
@@ -314,14 +315,16 @@ static inline void __smp_call_function_single_interrupt(void)
        inc_irq_stat(irq_call_count);
 }
 
-__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_call_function_single_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        __smp_call_function_single_interrupt();
        exiting_irq();
 }
 
-__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
        ipi_entering_ack_irq();
        trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
deleted file mode 100644 (file)
index 222e84e..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * test_rodata.c: functional test for mark_rodata_ro function
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <asm/cacheflush.h>
-#include <asm/sections.h>
-#include <asm/asm.h>
-
-int rodata_test(void)
-{
-       unsigned long result;
-       unsigned long start, end;
-
-       /* test 1: read the value */
-       /* If this test fails, some previous testrun has clobbered the state */
-       if (!rodata_test_data) {
-               printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
-               return -ENODEV;
-       }
-
-       /* test 2: write to the variable; this should fault */
-       /*
-        * If this test fails, we managed to overwrite the data
-        *
-        * This is written in assembly to be able to catch the
-        * exception that is supposed to happen in the correct
-        * case
-        */
-
-       result = 1;
-       asm volatile(
-               "0:     mov %[zero],(%[rodata_test])\n"
-               "       mov %[zero], %[rslt]\n"
-               "1:\n"
-               ".section .fixup,\"ax\"\n"
-               "2:     jmp 1b\n"
-               ".previous\n"
-               _ASM_EXTABLE(0b,2b)
-               : [rslt] "=r" (result)
-               : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
-       );
-
-
-       if (!result) {
-               printk(KERN_ERR "rodata_test: test data was not read only\n");
-               return -ENODEV;
-       }
-
-       /* test 3: check the value hasn't changed */
-       /* If this test fails, we managed to overwrite the data */
-       if (!rodata_test_data) {
-               printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
-               return -ENODEV;
-       }
-       /* test 4: check if the rodata section is 4Kb aligned */
-       start = (unsigned long)__start_rodata;
-       end = (unsigned long)__end_rodata;
-       if (start & (PAGE_SIZE - 1)) {
-               printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
-               return -ENODEV;
-       }
-       if (end & (PAGE_SIZE - 1)) {
-               printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
-               return -ENODEV;
-       }
-
-       return 0;
-}
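
The arch-private test can go because v4.11 also gains a generic one (mm/rodata_test.c, under CONFIG_DEBUG_RODATA_TEST), which is where the rodata_test_data definitions and rodata_test() calls deleted in the two mm/init hunks below migrate. A rough sketch of the generic approach, not the exact replacement code: the write is attempted through probe_kernel_write(), which absorbs the expected fault instead of the hand-rolled fixup assembly above:

        static const int zero;

        /* probe_kernel_write() returns 0 on success, so success here
         * means rodata was writable, i.e. the test failed */
        if (!probe_kernel_write((void *)&rodata_test_data,
                                (void *)&zero, sizeof(zero)))
                pr_err("rodata_test: data section was writable\n");
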
index e79f15f..ad0118f 100644 (file)
@@ -346,6 +346,7 @@ SECTIONS
        /DISCARD/ : {
                *(.eh_frame)
                *(__func_stack_frame_non_standard)
+               *(__unreachable)
        }
 }
 
index 2fd7586..1cda352 100644 (file)
@@ -4102,7 +4102,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
                                 * as a SMAP violation if all of the following
                                 * conditions are ture:
                                 *   - X86_CR4_SMAP is set in CR4
-                                *   - An user page is accessed
+                                *   - A user page is accessed
                                 *   - Page fault in kernel mode
                                 *   - if CPL = 3 or X86_EFLAGS_AC is clear
                                 *
index 928d657..2b4b53e 100644 (file)
@@ -864,9 +864,6 @@ static noinline int do_test_wp_bit(void)
        return flag;
 }
 
-const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
-
 int kernel_set_to_readonly __read_mostly;
 
 void set_kernel_text_rw(void)
@@ -939,7 +936,6 @@ void mark_rodata_ro(void)
        set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
                size >> 10);
-       rodata_test();
 
 #ifdef CONFIG_CPA_DEBUG
        printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
index 97346f9..15173d3 100644 (file)
@@ -1000,9 +1000,6 @@ void __init mem_init(void)
        mem_init_print_info(NULL);
 }
 
-const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
-
 int kernel_set_to_readonly;
 
 void set_kernel_text_rw(void)
@@ -1071,8 +1068,6 @@ void mark_rodata_ro(void)
        all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
        set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
 
-       rodata_test();
-
 #ifdef CONFIG_CPA_DEBUG
        printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
        set_memory_rw(start, (end-start) >> PAGE_SHIFT);
index 9e9760b..f41408c 100644 (file)
@@ -31,3 +31,4 @@ generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
index fc4ad21..fcea720 100644 (file)
@@ -135,8 +135,8 @@ void secondary_start_kernel(void)
 
        /* All kernel threads share the same mm context. */
 
-       atomic_inc(&mm->mm_users);
-       atomic_inc(&mm->mm_count);
+       mmget(mm);
+       mmgrab(mm);
        current->active_mm = mm;
        cpumask_set_cpu(cpu, mm_cpumask(mm));
        enter_lazy_tlb(mm, current);
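
Here both counters are converted: mmget() pins the address space (mm_users) and mmgrab() pins the mm_struct (mm_count). The helpers are simple wrappers:

        static inline void mmget(struct mm_struct *mm)
        {
                atomic_inc(&mm->mm_users);      /* keep the mappings alive */
        }

        static inline void mmgrab(struct mm_struct *mm)
        {
                atomic_inc(&mm->mm_count);      /* keep the struct allocated */
        }
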
index 82fd0cc..8fab716 100644 (file)
@@ -185,7 +185,7 @@ static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
  * sq_to_td - return throtl_data the specified service queue belongs to
  * @sq: the throtl_service_queue of interest
  *
- * A service_queue can be embeded in either a throtl_grp or throtl_data.
+ * A service_queue can be embedded in either a throtl_grp or throtl_data.
  * Determine the associated throtl_data accordingly and return it.
  */
 static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
index a9a8b8e..74835db 100644 (file)
@@ -573,7 +573,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
        int ret;
        ssize_t bytes_read;
 
-       dprintk("%s: read %Zd bytes\n", bd->name, count);
+       dprintk("%s: read %zd bytes\n", bd->name, count);
 
        bsg_set_block(bd, file);
 
@@ -648,7 +648,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
        ssize_t bytes_written;
        int ret;
 
-       dprintk("%s: write %Zd bytes\n", bd->name, count);
+       dprintk("%s: write %zd bytes\n", bd->name, count);
 
        if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
                return -EINVAL;
@@ -667,7 +667,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
        if (!bytes_written || err_block_err(ret))
                bytes_written = ret;
 
-       dprintk("%s: returning %Zd\n", bd->name, bytes_written);
+       dprintk("%s: returning %zd\n", bd->name, bytes_written);
        return bytes_written;
 }
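
%Z was a Linux-only printk length modifier from before C99; gcc's format checking only knows the standard %z, so these sites are swept over to %zd/%zu (the atm driver hunks below are part of the same sweep). Standard usage, with illustrative variables:

        size_t  len = sizeof(struct bsg_command);
        ssize_t bytes_read = 0;

        printk(KERN_DEBUG "allocated %zu bytes\n", len);        /* size_t  */
        printk(KERN_DEBUG "read %zd bytes\n", bytes_read);      /* ssize_t */
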
 
index f77956c..747c2ba 100644 (file)
@@ -56,7 +56,7 @@ struct acpi_ipmi_device {
 struct ipmi_driver_data {
        struct list_head ipmi_devices;
        struct ipmi_smi_watcher bmc_events;
-       struct ipmi_user_hndl ipmi_hndlrs;
+       const struct ipmi_user_hndl ipmi_hndlrs;
        struct mutex ipmi_lock;
 
        /*
index 251f947..857dbc4 100644 (file)
@@ -242,7 +242,7 @@ acpi_status acpi_db_convert_to_package(char *string, union acpi_object *object)
  *
  * RETURN:      Status
  *
- * DESCRIPTION: Convert a typed and tokenized string to an union acpi_object. Typing:
+ * DESCRIPTION: Convert a typed and tokenized string to a union acpi_object. Typing:
  *              1) String objects were surrounded by quotes.
  *              2) Buffer objects were surrounded by parentheses.
  *              3) Package objects were surrounded by brackets "[]".
index 3dbbecf..9d14b50 100644 (file)
@@ -323,7 +323,7 @@ acpi_ns_check_reference(struct acpi_evaluate_info *info,
 
        /*
         * Check the reference object for the correct reference type (opcode).
-        * The only type of reference that can be converted to an union acpi_object is
+        * The only type of reference that can be converted to a union acpi_object is
         * a reference to a named object (reference class: NAME)
         */
        if (return_object->reference.class == ACPI_REFCLASS_NAME) {
index 8e365c0..c944ff5 100644 (file)
@@ -495,9 +495,9 @@ static void acpi_ns_resolve_references(struct acpi_evaluate_info *info)
        /*
         * Two types of references are supported - those created by Index and
         * ref_of operators. A name reference (AML_NAMEPATH_OP) can be converted
-        * to an union acpi_object, so it is not dereferenced here. A ddb_handle
+        * to a union acpi_object, so it is not dereferenced here. A ddb_handle
         * (AML_LOAD_OP) cannot be dereferenced, nor can it be converted to
-        * an union acpi_object.
+        * a union acpi_object.
         */
        switch (info->return_object->reference.class) {
        case ACPI_REFCLASS_INDEX:
index 8b11d6d..cd4c427 100644 (file)
@@ -406,7 +406,7 @@ static void acpi_dev_get_irqresource(struct resource *res, u32 gsi,
        }
 
        /*
-        * In IO-APIC mode, use overrided attribute. Two reasons:
+        * In IO-APIC mode, use overridden attribute. Two reasons:
         * 1. BIOS bug in DSDT
         * 2. BIOS uses IO-APIC mode Interrupt Source Override
         *
index 2b5d0fa..01c9466 100644 (file)
@@ -46,7 +46,7 @@ static bool qdf2400_erratum_44_present(struct acpi_table_header *h)
  * console is registered and if @earlycon is true, earlycon is set up.
  *
  * When CONFIG_ACPI_SPCR_TABLE is defined, this function should be called
- * from arch inintialization code as soon as the DT/ACPI decision is made.
+ * from arch initialization code as soon as the DT/ACPI decision is made.
  *
  */
 int __init parse_spcr(bool earlycon)
index 4e5bf36..ef68232 100644 (file)
@@ -2034,7 +2034,7 @@ static int speed_down_verdict_cb(struct ata_ering_entry *ent, void *void_arg)
  *     This is to expedite speed down decisions right after device is
  *     initially configured.
  *
- *     The followings are speed down rules.  #1 and #2 deal with
+ *     The following are speed down rules.  #1 and #2 deal with
  *     DUBIOUS errors.
  *
  *     1. If more than one DUBIOUS_ATA_BUS or DUBIOUS_TOUT_HSM errors
index f1a9198..4a61079 100644 (file)
@@ -2394,12 +2394,7 @@ static int __init amb_module_init (void)
 {
   PRINTD (DBG_FLOW|DBG_INIT, "init_module");
   
-  // sanity check - cast needed as printk does not support %Zu
-  if (sizeof(amb_mem) != 4*16 + 4*12) {
-    PRINTK (KERN_ERR, "Fix amb_mem (is %lu words).",
-           (unsigned long) sizeof(amb_mem));
-    return -ENOMEM;
-  }
+  BUILD_BUG_ON(sizeof(amb_mem) != 4*16 + 4*12);
   
   show_version();
   
index 623359e..b042ec4 100644 (file)
@@ -2326,11 +2326,7 @@ static int __init eni_init(void)
 {
        struct sk_buff *skb; /* dummy for sizeof */
 
-       if (sizeof(skb->cb) < sizeof(struct eni_skb_prv)) {
-               printk(KERN_ERR "eni_detect: skb->cb is too small (%Zd < %Zd)\n",
-                   sizeof(skb->cb),sizeof(struct eni_skb_prv));
-               return -EIO;
-       }
+       BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct eni_skb_prv));
        return pci_register_driver(&eni_driver);
 }
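
Both of these init-time "sanity checks" tested compile-time constants at runtime; BUILD_BUG_ON() turns a bad struct layout into a build failure instead of a module-load failure, and drops the printk that only existed because %Zu could not be trusted. Roughly how older kernels spell the macro (current ones route through compiletime_assert()):

        /* if cond is true the array size goes negative and the build breaks */
        #define BUILD_BUG_ON(cond)  ((void)sizeof(char[1 - 2 * !!(cond)]))

The horizon driver gets the identical conversion further down.
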
 
index 80c2ddc..22dcab9 100644 (file)
@@ -895,7 +895,7 @@ static int fs_open(struct atm_vcc *atm_vcc)
        /* XXX handle qos parameters (rate limiting) ? */
 
        vcc = kmalloc(sizeof(struct fs_vcc), GFP_KERNEL);
-       fs_dprintk (FS_DEBUG_ALLOC, "Alloc VCC: %p(%Zd)\n", vcc, sizeof(struct fs_vcc));
+       fs_dprintk (FS_DEBUG_ALLOC, "Alloc VCC: %p(%zd)\n", vcc, sizeof(struct fs_vcc));
        if (!vcc) {
                clear_bit(ATM_VF_ADDR, &atm_vcc->flags);
                return -ENOMEM;
@@ -946,7 +946,7 @@ static int fs_open(struct atm_vcc *atm_vcc)
 
        if (DO_DIRECTION (txtp)) {
                tc = kmalloc (sizeof (struct fs_transmit_config), GFP_KERNEL);
-               fs_dprintk (FS_DEBUG_ALLOC, "Alloc tc: %p(%Zd)\n",
+               fs_dprintk (FS_DEBUG_ALLOC, "Alloc tc: %p(%zd)\n",
                            tc, sizeof (struct fs_transmit_config));
                if (!tc) {
                        fs_dprintk (FS_DEBUG_OPEN, "fs: can't alloc transmit_config.\n");
@@ -1185,7 +1185,7 @@ static int fs_send (struct atm_vcc *atm_vcc, struct sk_buff *skb)
        vcc->last_skb = skb;
 
        td = kmalloc (sizeof (struct FS_BPENTRY), GFP_ATOMIC);
-       fs_dprintk (FS_DEBUG_ALLOC, "Alloc transd: %p(%Zd)\n", td, sizeof (struct FS_BPENTRY));
+       fs_dprintk (FS_DEBUG_ALLOC, "Alloc transd: %p(%zd)\n", td, sizeof (struct FS_BPENTRY));
        if (!td) {
                /* Oops out of mem */
                return -ENOMEM;
@@ -1492,7 +1492,7 @@ static void top_off_fp (struct fs_dev *dev, struct freepool *fp,
                fs_dprintk (FS_DEBUG_ALLOC, "Alloc rec-skb: %p(%d)\n", skb, fp->bufsize);
                if (!skb) break;
                ne = kmalloc (sizeof (struct FS_BPENTRY), gfp_flags);
-               fs_dprintk (FS_DEBUG_ALLOC, "Alloc rec-d: %p(%Zd)\n", ne, sizeof (struct FS_BPENTRY));
+               fs_dprintk (FS_DEBUG_ALLOC, "Alloc rec-d: %p(%zd)\n", ne, sizeof (struct FS_BPENTRY));
                if (!ne) {
                        fs_dprintk (FS_DEBUG_ALLOC, "Free rec-skb: %p\n", skb);
                        dev_kfree_skb_any (skb);
@@ -1803,7 +1803,7 @@ static int fs_init(struct fs_dev *dev)
        }
        dev->atm_vccs = kcalloc (dev->nchannels, sizeof (struct atm_vcc *),
                                 GFP_KERNEL);
-       fs_dprintk (FS_DEBUG_ALLOC, "Alloc atmvccs: %p(%Zd)\n",
+       fs_dprintk (FS_DEBUG_ALLOC, "Alloc atmvccs: %p(%zd)\n",
                    dev->atm_vccs, dev->nchannels * sizeof (struct atm_vcc *));
 
        if (!dev->atm_vccs) {
@@ -1911,7 +1911,7 @@ static int firestream_init_one(struct pci_dev *pci_dev,
                goto err_out;
 
        fs_dev = kzalloc (sizeof (struct fs_dev), GFP_KERNEL);
-       fs_dprintk (FS_DEBUG_ALLOC, "Alloc fs-dev: %p(%Zd)\n",
+       fs_dprintk (FS_DEBUG_ALLOC, "Alloc fs-dev: %p(%zd)\n",
                    fs_dev, sizeof (struct fs_dev));
        if (!fs_dev)
                goto err_out;
index 584aa88..2bf1ef1 100644 (file)
@@ -2884,12 +2884,7 @@ static struct pci_driver hrz_driver = {
 /********** module entry **********/
 
 static int __init hrz_module_init (void) {
-  // sanity check - cast is needed since printk does not support %Zu
-  if (sizeof(struct MEMMAP) != 128*1024/4) {
-    PRINTK (KERN_ERR, "Fix struct MEMMAP (is %lu fakewords).",
-           (unsigned long) sizeof(struct MEMMAP));
-    return -ENOMEM;
-  }
+  BUILD_BUG_ON(sizeof(struct MEMMAP) != 128*1024/4);
   
   show_version();
   
index 8640baf..a4fa6c8 100644 (file)
@@ -21,7 +21,7 @@
       supports a variety of varients of Interphase ATM PCI (i)Chip adapter 
       card family (See www.iphase.com/products/ClassSheet.cfm?ClassID=ATM) 
       in terms of PHY type, the size of control memory and the size of 
-      packet memory. The followings are the change log and history:
+      packet memory. The following are the change log and history:
      
           Bugfix the Mona's UBR driver.
           Modify the basic memory allocation and dma logic.
index 53ecac5..2beacf2 100644 (file)
@@ -21,7 +21,7 @@
       supports a variety of varients of Interphase ATM PCI (i)Chip adapter 
       card family (See www.iphase.com/products/ClassSheet.cfm?ClassID=ATM) 
       in terms of PHY type, the size of control memory and the size of 
-      packet memory. The followings are the change log and history:
+      packet memory. The following are the change log and history:
      
           Bugfix the Mona's UBR driver.
           Modify the basic memory allocation and dma logic.
index 445505d..1a9bc51 100644 (file)
@@ -1389,7 +1389,7 @@ static void vcc_rx_aal5(struct lanai_vcc *lvcc, int endptr)
        if (n < 0)
                n += lanai_buf_size(&lvcc->rx.buf);
        APRINTK(n >= 0 && n < lanai_buf_size(&lvcc->rx.buf) && !(n & 15),
-           "vcc_rx_aal5: n out of range (%d/%Zu)\n",
+           "vcc_rx_aal5: n out of range (%d/%zu)\n",
            n, lanai_buf_size(&lvcc->rx.buf));
        /* Recover the second-to-last word to get true pdu length */
        if ((x = &end[-2]) < lvcc->rx.buf.start)
@@ -1493,9 +1493,9 @@ static int lanai_get_sized_buffer(struct lanai_dev *lanai,
                return -ENOMEM;
        if (unlikely(lanai_buf_size(buf) < size))
                printk(KERN_WARNING DEV_LABEL "(itf %d): wanted %d bytes "
-                   "for %s buffer, got only %Zu\n", lanai->number, size,
+                   "for %s buffer, got only %zu\n", lanai->number, size,
                    name, lanai_buf_size(buf));
-       DPRINTK("Allocated %Zu byte %s buffer\n", lanai_buf_size(buf), name);
+       DPRINTK("Allocated %zu byte %s buffer\n", lanai_buf_size(buf), name);
        return 0;
 }
 
@@ -1586,7 +1586,7 @@ static int service_buffer_allocate(struct lanai_dev *lanai)
            lanai->pci);
        if (unlikely(lanai->service.start == NULL))
                return -ENOMEM;
-       DPRINTK("allocated service buffer at 0x%08lX, size %Zu(%d)\n",
+       DPRINTK("allocated service buffer at 0x%08lX, size %zu(%d)\n",
            (unsigned long) lanai->service.start,
            lanai_buf_size(&lanai->service),
            lanai_buf_size_cardorder(&lanai->service));
@@ -2467,8 +2467,8 @@ static int lanai_proc_read(struct atm_dev *atmdev, loff_t *pos, char *page)
                    (lanai->status & STATUS_LED) ? 1 : 0,
                    (lanai->status & STATUS_GPIN) ? 1 : 0);
        if (left-- == 0)
-               return sprintf(page, "global buffer sizes: service=%Zu, "
-                   "aal0_rx=%Zu\n", lanai_buf_size(&lanai->service),
+               return sprintf(page, "global buffer sizes: service=%zu, "
+                   "aal0_rx=%zu\n", lanai_buf_size(&lanai->service),
                    lanai->naal0 ? lanai_buf_size(&lanai->aal0buf) : 0);
        if (left-- == 0) {
                get_statistics(lanai);
@@ -2513,7 +2513,7 @@ static int lanai_proc_read(struct atm_dev *atmdev, loff_t *pos, char *page)
                left += sprintf(&page[left], ",\n          rx_AAL=%d",
                    lvcc->rx.atmvcc->qos.aal == ATM_AAL5 ? 5 : 0);
                if (lvcc->rx.atmvcc->qos.aal == ATM_AAL5)
-                       left += sprintf(&page[left], ", rx_buf_size=%Zu, "
+                       left += sprintf(&page[left], ", rx_buf_size=%zu, "
                            "rx_bad_len=%u,\n          rx_service_trash=%u, "
                            "rx_service_stream=%u, rx_bad_crc=%u",
                            lanai_buf_size(&lvcc->rx.buf),
@@ -2524,7 +2524,7 @@ static int lanai_proc_read(struct atm_dev *atmdev, loff_t *pos, char *page)
        }
        if (lvcc->tx.atmvcc != NULL)
                left += sprintf(&page[left], ",\n          tx_AAL=%d, "
-                   "tx_buf_size=%Zu, tx_qos=%cBR, tx_backlogged=%c",
+                   "tx_buf_size=%zu, tx_qos=%cBR, tx_backlogged=%c",
                    lvcc->tx.atmvcc->qos.aal == ATM_AAL5 ? 5 : 0,
                    lanai_buf_size(&lvcc->tx.buf),
                    lvcc->tx.atmvcc == lanai->cbrvcc ? 'C' : 'U',
index cb28579..d879f3b 100644 (file)
@@ -1980,13 +1980,12 @@ static void dequeue_rx(ns_dev * card, ns_rsqe * rsqe)
        card->lbfqc = ns_stat_lfbqc_get(stat);
 
        id = le32_to_cpu(rsqe->buffer_handle);
-       skb = idr_find(&card->idr, id);
+       skb = idr_remove(&card->idr, id);
        if (!skb) {
                RXPRINTK(KERN_ERR
-                        "nicstar%d: idr_find() failed!\n", card->index);
+                        "nicstar%d: skb not found!\n", card->index);
                return;
        }
-       idr_remove(&card->idr, id);
        dma_sync_single_for_cpu(&card->pcidev->dev,
                                NS_PRV_DMA(skb),
                                (NS_PRV_BUFTYPE(skb) == BUF_SM
index 615e5b5..1165098 100644 (file)
@@ -2915,11 +2915,9 @@ out_idr_remove_vol:
        idr_remove(&connection->peer_devices, vnr);
 out_idr_remove_from_resource:
        for_each_connection(connection, resource) {
-               peer_device = idr_find(&connection->peer_devices, vnr);
-               if (peer_device) {
-                       idr_remove(&connection->peer_devices, vnr);
+               peer_device = idr_remove(&connection->peer_devices, vnr);
+               if (peer_device)
                        kref_put(&connection->kref, drbd_destroy_connection);
-               }
        }
        for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
                list_del(&peer_device->peer_devices);
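
idr_remove() now returns the pointer it removed (part of this cycle's IDR rework), so the find-then-remove pairs in the nicstar and drbd hunks above collapse into a single tree walk:

        /* before: two lookups */
        ptr = idr_find(&idr, id);
        if (ptr)
                idr_remove(&idr, id);

        /* after: one lookup; NULL means the id was not present */
        ptr = idr_remove(&idr, id);
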
index 3043771..4b52a16 100644 (file)
@@ -186,7 +186,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
         *
         * TODO: the above condition may be loosed in the future, and
         * direct I/O may be switched runtime at that time because most
-        * of requests in sane appplications should be PAGE_SIZE algined
+        * of requests in sane applications should be PAGE_SIZE aligned
         */
        if (dio) {
                if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
index 362cecc..4d68077 100644 (file)
@@ -123,9 +123,11 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURE_LAYERING   (1<<0)
 #define RBD_FEATURE_STRIPINGV2 (1<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2)
+#define RBD_FEATURE_DATA_POOL (1<<7)
 #define RBD_FEATURES_ALL       (RBD_FEATURE_LAYERING |         \
                                 RBD_FEATURE_STRIPINGV2 |       \
-                                RBD_FEATURE_EXCLUSIVE_LOCK)
+                                RBD_FEATURE_EXCLUSIVE_LOCK |   \
+                                RBD_FEATURE_DATA_POOL)
 
 /* Features supported by this (client software) implementation. */
 
@@ -144,10 +146,9 @@ struct rbd_image_header {
        /* These six fields never change for a given rbd image */
        char *object_prefix;
        __u8 obj_order;
-       __u8 crypt_type;
-       __u8 comp_type;
        u64 stripe_unit;
        u64 stripe_count;
+       s64 data_pool_id;
        u64 features;           /* Might be changeable someday? */
 
        /* The remaining fields need to be updated occasionally */
@@ -230,7 +231,7 @@ enum obj_req_flags {
 };
 
 struct rbd_obj_request {
-       const char              *object_name;
+       u64                     object_no;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */
        unsigned long           flags;
@@ -438,7 +439,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock);
 
 static struct kmem_cache       *rbd_img_request_cache;
 static struct kmem_cache       *rbd_obj_request_cache;
-static struct kmem_cache       *rbd_segment_name_cache;
 
 static int rbd_major;
 static DEFINE_IDA(rbd_dev_id_ida);
@@ -973,6 +973,30 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 }
 
 /*
+ * returns the size of an object in the image
+ */
+static u32 rbd_obj_bytes(struct rbd_image_header *header)
+{
+       return 1U << header->obj_order;
+}
+
+static void rbd_init_layout(struct rbd_device *rbd_dev)
+{
+       if (rbd_dev->header.stripe_unit == 0 ||
+           rbd_dev->header.stripe_count == 0) {
+               rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
+               rbd_dev->header.stripe_count = 1;
+       }
+
+       rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
+       rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
+       rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
+       rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
+                         rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
+       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
+}
+
+/*
  * Fill an rbd image header with information from the given format 1
  * on-disk header.
  */
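
rbd_init_layout() centralizes the ceph_file_layout setup that __rbd_dev_create() used to open-code: object size is 1 << obj_order, striping defaults to one object per stripe when the image carries no striping metadata, and the data pool overrides the spec's pool only when actually set. A worked example, assuming the usual default obj_order of 22 (4 MiB objects):

        u64 img_offset = 10ULL << 20;                      /* 10 MiB into the image */
        u64 object_no  = img_offset >> 22;                 /* -> object 2 */
        u64 offset     = img_offset & ((1ULL << 22) - 1);  /* -> 2 MiB into it */

This is the same object_no/offset split rbd_img_request_fill() performs below.
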
@@ -992,15 +1016,11 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
        /* Allocate this now to avoid having to handle failure below */
 
        if (first_time) {
-               size_t len;
-
-               len = strnlen(ondisk->object_prefix,
-                               sizeof (ondisk->object_prefix));
-               object_prefix = kmalloc(len + 1, GFP_KERNEL);
+               object_prefix = kstrndup(ondisk->object_prefix,
+                                        sizeof(ondisk->object_prefix),
+                                        GFP_KERNEL);
                if (!object_prefix)
                        return -ENOMEM;
-               memcpy(object_prefix, ondisk->object_prefix, len);
-               object_prefix[len] = '\0';
        }
 
        /* Allocate the snapshot context and fill it in */
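
kstrndup() replaces the strnlen/kmalloc/memcpy/NUL dance in one call, so the bounds handling cannot drift out of sync with the copy. Its semantics, roughly (see mm/util.c):

        char *kstrndup(const char *s, size_t max, gfp_t gfp)
        {
                size_t len = strnlen(s, max);
                char *buf = kmalloc(len + 1, gfp);

                if (buf) {
                        memcpy(buf, s, len);
                        buf[len] = '\0';
                }
                return buf;
        }
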
@@ -1051,12 +1071,7 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
        if (first_time) {
                header->object_prefix = object_prefix;
                header->obj_order = ondisk->options.order;
-               header->crypt_type = ondisk->options.crypt_type;
-               header->comp_type = ondisk->options.comp_type;
-               /* The rest aren't used for format 1 images */
-               header->stripe_unit = 0;
-               header->stripe_count = 0;
-               header->features = 0;
+               rbd_init_layout(rbd_dev);
        } else {
                ceph_put_snap_context(header->snapc);
                kfree(header->snap_names);
@@ -1232,42 +1247,9 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
        rbd_dev->mapping.features = 0;
 }
 
-static void rbd_segment_name_free(const char *name)
-{
-       /* The explicit cast here is needed to drop the const qualifier */
-
-       kmem_cache_free(rbd_segment_name_cache, (void *)name);
-}
-
-static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
-{
-       char *name;
-       u64 segment;
-       int ret;
-       char *name_format;
-
-       name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
-       if (!name)
-               return NULL;
-       segment = offset >> rbd_dev->header.obj_order;
-       name_format = "%s.%012llx";
-       if (rbd_dev->image_format == 2)
-               name_format = "%s.%016llx";
-       ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
-                       rbd_dev->header.object_prefix, segment);
-       if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
-               pr_err("error formatting segment name for #%llu (%d)\n",
-                       segment, ret);
-               rbd_segment_name_free(name);
-               name = NULL;
-       }
-
-       return name;
-}
-
 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 {
-       u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
 
        return offset & (segment_size - 1);
 }
@@ -1275,7 +1257,7 @@ static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
                                u64 offset, u64 length)
 {
-       u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
 
        offset &= segment_size - 1;
 
@@ -1287,14 +1269,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev,
 }
 
 /*
- * returns the size of an object in the image
- */
-static u64 rbd_obj_bytes(struct rbd_image_header *header)
-{
-       return 1 << header->obj_order;
-}
-
-/*
  * bio helpers
  */
 
@@ -1623,7 +1597,9 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
 {
        struct ceph_osd_request *osd_req = obj_request->osd_req;
 
-       dout("%s %p osd_req %p\n", __func__, obj_request, osd_req);
+       dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
+            obj_request, obj_request->object_no, obj_request->offset,
+            obj_request->length, osd_req);
        if (obj_request_img_data_test(obj_request)) {
                WARN_ON(obj_request->callback != rbd_img_obj_callback);
                rbd_img_request_get(obj_request->img_request);
@@ -1631,44 +1607,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
        ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
 }
 
-static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
-{
-       dout("%s %p\n", __func__, obj_request);
-       ceph_osdc_cancel_request(obj_request->osd_req);
-}
-
-/*
- * Wait for an object request to complete.  If interrupted, cancel the
- * underlying osd request.
- *
- * @timeout: in jiffies, 0 means "wait forever"
- */
-static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
-                                 unsigned long timeout)
-{
-       long ret;
-
-       dout("%s %p\n", __func__, obj_request);
-       ret = wait_for_completion_interruptible_timeout(
-                                       &obj_request->completion,
-                                       ceph_timeout_jiffies(timeout));
-       if (ret <= 0) {
-               if (ret == 0)
-                       ret = -ETIMEDOUT;
-               rbd_obj_request_end(obj_request);
-       } else {
-               ret = 0;
-       }
-
-       dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
-       return ret;
-}
-
-static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
-{
-       return __rbd_obj_request_wait(obj_request, 0);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1955,8 +1893,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
                rbd_osd_call_callback(obj_request);
                break;
        default:
-               rbd_warn(NULL, "%s: unsupported op %hu",
-                       obj_request->object_name, (unsigned short) opcode);
+               rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
+                        obj_request->object_no, opcode);
                break;
        }
 
@@ -1980,6 +1918,40 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
        osd_req->r_data_offset = obj_request->offset;
 }
 
+static struct ceph_osd_request *
+__rbd_osd_req_create(struct rbd_device *rbd_dev,
+                    struct ceph_snap_context *snapc,
+                    int num_ops, unsigned int flags,
+                    struct rbd_obj_request *obj_request)
+{
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct ceph_osd_request *req;
+       const char *name_format = rbd_dev->image_format == 1 ?
+                                     RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
+
+       req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
+       if (!req)
+               return NULL;
+
+       req->r_flags = flags;
+       req->r_callback = rbd_osd_req_callback;
+       req->r_priv = obj_request;
+
+       req->r_base_oloc.pool = rbd_dev->layout.pool_id;
+       if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+                       rbd_dev->header.object_prefix, obj_request->object_no))
+               goto err_req;
+
+       if (ceph_osdc_alloc_messages(req, GFP_NOIO))
+               goto err_req;
+
+       return req;
+
+err_req:
+       ceph_osdc_put_request(req);
+       return NULL;
+}
+
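
With only object_no stored, the object name is rendered at request-build time by ceph_oid_aprintf() and the rbd_segment_name_cache slab disappears. The two format strings match what the removed rbd_segment_name() produced, so, taking a hypothetical format 2 image with object prefix "rbd_data.abc":

        /* RBD_V1_DATA_FORMAT  "%s.%012llx"   (format 1 images)
         * RBD_V2_DATA_FORMAT  "%s.%016llx"   (format 2 images) */

        object_no = 2;   /* -> OSD object "rbd_data.abc.0000000000000002" */
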
 /*
  * Create an osd request.  A read request has one osd op (read).
  * A write request has either one (watch) or two (hint+write) osd ops.
@@ -1993,8 +1965,6 @@ static struct ceph_osd_request *rbd_osd_req_create(
                                        struct rbd_obj_request *obj_request)
 {
        struct ceph_snap_context *snapc = NULL;
-       struct ceph_osd_client *osdc;
-       struct ceph_osd_request *osd_req;
 
        if (obj_request_img_data_test(obj_request) &&
                (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
@@ -2009,35 +1979,9 @@ static struct ceph_osd_request *rbd_osd_req_create(
 
        rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
 
-       /* Allocate and initialize the request, for the num_ops ops */
-
-       osdc = &rbd_dev->rbd_client->client->osdc;
-       osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
-                                         GFP_NOIO);
-       if (!osd_req)
-               goto fail;
-
-       if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
-               osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
-       else
-               osd_req->r_flags = CEPH_OSD_FLAG_READ;
-
-       osd_req->r_callback = rbd_osd_req_callback;
-       osd_req->r_priv = obj_request;
-
-       osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
-       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
-                            obj_request->object_name))
-               goto fail;
-
-       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
-               goto fail;
-
-       return osd_req;
-
-fail:
-       ceph_osdc_put_request(osd_req);
-       return NULL;
+       return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
+           (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
+           CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
 }
 
 /*
@@ -2050,10 +1994,6 @@ static struct ceph_osd_request *
 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 {
        struct rbd_img_request *img_request;
-       struct ceph_snap_context *snapc;
-       struct rbd_device *rbd_dev;
-       struct ceph_osd_client *osdc;
-       struct ceph_osd_request *osd_req;
        int num_osd_ops = 3;
 
        rbd_assert(obj_request_img_data_test(obj_request));
@@ -2065,77 +2005,34 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
        if (img_request_discard_test(img_request))
                num_osd_ops = 2;
 
-       /* Allocate and initialize the request, for all the ops */
-
-       snapc = img_request->snapc;
-       rbd_dev = img_request->rbd_dev;
-       osdc = &rbd_dev->rbd_client->client->osdc;
-       osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
-                                               false, GFP_NOIO);
-       if (!osd_req)
-               goto fail;
-
-       osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
-       osd_req->r_callback = rbd_osd_req_callback;
-       osd_req->r_priv = obj_request;
-
-       osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
-       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
-                            obj_request->object_name))
-               goto fail;
-
-       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
-               goto fail;
-
-       return osd_req;
-
-fail:
-       ceph_osdc_put_request(osd_req);
-       return NULL;
+       return __rbd_osd_req_create(img_request->rbd_dev,
+                                   img_request->snapc, num_osd_ops,
+                                   CEPH_OSD_FLAG_WRITE, obj_request);
 }
 
-
 static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
 {
        ceph_osdc_put_request(osd_req);
 }
 
-/* object_name is assumed to be a non-null pointer and NUL-terminated */
-
-static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
-                                               u64 offset, u64 length,
-                                               enum obj_request_type type)
+static struct rbd_obj_request *
+rbd_obj_request_create(enum obj_request_type type)
 {
        struct rbd_obj_request *obj_request;
-       size_t size;
-       char *name;
 
        rbd_assert(obj_request_type_valid(type));
 
-       size = strlen(object_name) + 1;
-       name = kmalloc(size, GFP_NOIO);
-       if (!name)
-               return NULL;
-
        obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
-       if (!obj_request) {
-               kfree(name);
+       if (!obj_request)
                return NULL;
-       }
 
-       obj_request->object_name = memcpy(name, object_name, size);
-       obj_request->offset = offset;
-       obj_request->length = length;
-       obj_request->flags = 0;
        obj_request->which = BAD_WHICH;
        obj_request->type = type;
        INIT_LIST_HEAD(&obj_request->links);
        init_completion(&obj_request->completion);
        kref_init(&obj_request->kref);
 
-       dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
-               offset, length, (int)type, obj_request);
-
+       dout("%s %p\n", __func__, obj_request);
        return obj_request;
 }
 
@@ -2170,8 +2067,6 @@ static void rbd_obj_request_destroy(struct kref *kref)
                break;
        }
 
-       kfree(obj_request->object_name);
-       obj_request->object_name = NULL;
        kmem_cache_free(rbd_obj_request_cache, obj_request);
 }
 
@@ -2546,22 +2441,18 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
 
        while (resid) {
                struct ceph_osd_request *osd_req;
-               const char *object_name;
-               u64 offset;
-               u64 length;
+               u64 object_no = img_offset >> rbd_dev->header.obj_order;
+               u64 offset = rbd_segment_offset(rbd_dev, img_offset);
+               u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
 
-               object_name = rbd_segment_name(rbd_dev, img_offset);
-               if (!object_name)
-                       goto out_unwind;
-               offset = rbd_segment_offset(rbd_dev, img_offset);
-               length = rbd_segment_length(rbd_dev, img_offset, resid);
-               obj_request = rbd_obj_request_create(object_name,
-                                               offset, length, type);
-               /* object request has its own copy of the object name */
-               rbd_segment_name_free(object_name);
+               obj_request = rbd_obj_request_create(type);
                if (!obj_request)
                        goto out_unwind;
 
+               obj_request->object_no = object_no;
+               obj_request->offset = offset;
+               obj_request->length = length;
+
                /*
                 * set obj_request->img_request before creating the
                 * osd_request so that it gets the right snapc
@@ -2771,7 +2662,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
         * child image to which the original request was to be sent.
         */
        img_offset = obj_request->img_offset - obj_request->offset;
-       length = (u64)1 << rbd_dev->header.obj_order;
+       length = rbd_obj_bytes(&rbd_dev->header);
 
        /*
         * There is no defined parent data beyond the parent
@@ -2900,11 +2791,12 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
        size_t size;
        int ret;
 
-       stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
-                                             OBJ_REQUEST_PAGES);
+       stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
        if (!stat_request)
                return -ENOMEM;
 
+       stat_request->object_no = obj_request->object_no;
+
        stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
                                                   stat_request);
        if (!stat_request->osd_req) {
@@ -3983,17 +3875,17 @@ out:
  * returned in the outbound buffer, or a negative error code.
  */
 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
-                            const char *object_name,
-                            const char *class_name,
+                            struct ceph_object_id *oid,
+                            struct ceph_object_locator *oloc,
                             const char *method_name,
                             const void *outbound,
                             size_t outbound_size,
                             void *inbound,
                             size_t inbound_size)
 {
-       struct rbd_obj_request *obj_request;
-       struct page **pages;
-       u32 page_count;
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct page *req_page = NULL;
+       struct page *reply_page;
        int ret;
 
        /*
@@ -4003,61 +3895,35 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
         * method.  Currently if this is present it will be a
         * snapshot id.
         */
-       page_count = (u32)calc_pages_for(0, inbound_size);
-       pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
-       if (IS_ERR(pages))
-               return PTR_ERR(pages);
-
-       ret = -ENOMEM;
-       obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
-                                                       OBJ_REQUEST_PAGES);
-       if (!obj_request)
-               goto out;
+       if (outbound) {
+               if (outbound_size > PAGE_SIZE)
+                       return -E2BIG;
 
-       obj_request->pages = pages;
-       obj_request->page_count = page_count;
-
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req)
-               goto out;
-
-       osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
-                                       class_name, method_name);
-       if (outbound_size) {
-               struct ceph_pagelist *pagelist;
-
-               pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
-               if (!pagelist)
-                       goto out;
+               req_page = alloc_page(GFP_KERNEL);
+               if (!req_page)
+                       return -ENOMEM;
 
-               ceph_pagelist_init(pagelist);
-               ceph_pagelist_append(pagelist, outbound, outbound_size);
-               osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
-                                               pagelist);
+               memcpy(page_address(req_page), outbound, outbound_size);
        }
-       osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
-                                       obj_request->pages, inbound_size,
-                                       0, false, false);
-
-       rbd_obj_request_submit(obj_request);
-       ret = rbd_obj_request_wait(obj_request);
-       if (ret)
-               goto out;
 
-       ret = obj_request->result;
-       if (ret < 0)
-               goto out;
+       reply_page = alloc_page(GFP_KERNEL);
+       if (!reply_page) {
+               if (req_page)
+                       __free_page(req_page);
+               return -ENOMEM;
+       }
 
-       rbd_assert(obj_request->xferred < (u64)INT_MAX);
-       ret = (int)obj_request->xferred;
-       ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
-out:
-       if (obj_request)
-               rbd_obj_request_put(obj_request);
-       else
-               ceph_release_page_vector(pages, page_count);
+       ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
+                            CEPH_OSD_FLAG_READ, req_page, outbound_size,
+                            reply_page, &inbound_size);
+       if (!ret) {
+               memcpy(inbound, page_address(reply_page), inbound_size);
+               ret = inbound_size;
+       }
 
+       if (req_page)
+               __free_page(req_page);
+       __free_page(reply_page);
        return ret;
 }
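
rbd_obj_method_sync() now addresses objects by oid/oloc and funnels through the generic ceph_osdc_call() with one request page and one reply page; that caps outbound payloads at PAGE_SIZE (hence the -E2BIG above) but eliminates the whole obj_request/page-vector machinery. The callers converted later in this patch all take this shape, sketched here with a bare __le64 reply buffer:

        __le64 snapid = cpu_to_le64(CEPH_NOSNAP);
        __le64 size;
        int ret;

        ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
                                  &rbd_dev->header_oloc, "get_size",
                                  &snapid, sizeof(snapid),
                                  &size, sizeof(size));
        if (ret < 0)
                return ret;       /* >= 0 is the number of inbound bytes */
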
 
@@ -4256,63 +4122,46 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
 }
 
 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
-                               const char *object_name,
-                               u64 offset, u64 length, void *buf)
+                            struct ceph_object_id *oid,
+                            struct ceph_object_locator *oloc,
+                            void *buf, int buf_len)
 
 {
-       struct rbd_obj_request *obj_request;
-       struct page **pages = NULL;
-       u32 page_count;
-       size_t size;
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct ceph_osd_request *req;
+       struct page **pages;
+       int num_pages = calc_pages_for(0, buf_len);
        int ret;
 
-       page_count = (u32) calc_pages_for(offset, length);
-       pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
-       if (IS_ERR(pages))
-               return PTR_ERR(pages);
-
-       ret = -ENOMEM;
-       obj_request = rbd_obj_request_create(object_name, offset, length,
-                                                       OBJ_REQUEST_PAGES);
-       if (!obj_request)
-               goto out;
-
-       obj_request->pages = pages;
-       obj_request->page_count = page_count;
-
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req)
-               goto out;
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+       if (!req)
+               return -ENOMEM;
 
-       osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
-                                       offset, length, 0, 0);
-       osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
-                                       obj_request->pages,
-                                       obj_request->length,
-                                       obj_request->offset & ~PAGE_MASK,
-                                       false, false);
+       ceph_oid_copy(&req->r_base_oid, oid);
+       ceph_oloc_copy(&req->r_base_oloc, oloc);
+       req->r_flags = CEPH_OSD_FLAG_READ;
 
-       rbd_obj_request_submit(obj_request);
-       ret = rbd_obj_request_wait(obj_request);
+       ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
        if (ret)
-               goto out;
+               goto out_req;
 
-       ret = obj_request->result;
-       if (ret < 0)
-               goto out;
+       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+       if (IS_ERR(pages)) {
+               ret = PTR_ERR(pages);
+               goto out_req;
+       }
 
-       rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
-       size = (size_t) obj_request->xferred;
-       ceph_copy_from_page_vector(pages, buf, 0, size);
-       rbd_assert(size <= (size_t)INT_MAX);
-       ret = (int)size;
-out:
-       if (obj_request)
-               rbd_obj_request_put(obj_request);
-       else
-               ceph_release_page_vector(pages, page_count);
+       osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
+       osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
+                                        true);
+
+       ceph_osdc_start_request(osdc, req, false);
+       ret = ceph_osdc_wait_request(osdc, req);
+       if (ret >= 0)
+               ceph_copy_from_page_vector(pages, buf, 0, ret);
 
+out_req:
+       ceph_osdc_put_request(req);
        return ret;
 }
 
@@ -4348,8 +4197,8 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
                if (!ondisk)
                        return -ENOMEM;
 
-               ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
-                                      0, size, ondisk);
+               ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
+                                       &rbd_dev->header_oloc, ondisk, size);
                if (ret < 0)
                        goto out;
                if ((size_t)ret < size) {
@@ -4781,7 +4630,7 @@ static const struct attribute_group *rbd_attr_groups[] = {
 
 static void rbd_dev_release(struct device *dev);
 
-static struct device_type rbd_device_type = {
+static const struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_dev_release,
@@ -4876,8 +4725,9 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
        INIT_LIST_HEAD(&rbd_dev->node);
        init_rwsem(&rbd_dev->header_rwsem);
 
+       rbd_dev->header.data_pool_id = CEPH_NOPOOL;
        ceph_oid_init(&rbd_dev->header_oid);
-       ceph_oloc_init(&rbd_dev->header_oloc);
+       rbd_dev->header_oloc.pool = spec->pool_id;
 
        mutex_init(&rbd_dev->watch_mutex);
        rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
@@ -4899,12 +4749,6 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
        rbd_dev->rbd_client = rbdc;
        rbd_dev->spec = spec;
 
-       rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
-       rbd_dev->layout.stripe_count = 1;
-       rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
-       rbd_dev->layout.pool_id = spec->pool_id;
-       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
-
        return rbd_dev;
 }
 
@@ -4970,10 +4814,10 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                __le64 size;
        } __attribute__ ((packed)) size_buf = { 0 };
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_size",
-                               &snapid, sizeof (snapid),
-                               &size_buf, sizeof (size_buf));
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_size",
+                                 &snapid, sizeof(snapid),
+                                 &size_buf, sizeof(size_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5010,9 +4854,9 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_object_prefix", NULL, 0,
-                               reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_object_prefix",
+                                 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
@@ -5045,10 +4889,10 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
        u64 unsup;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_features",
-                               &snapid, sizeof (snapid),
-                               &features_buf, sizeof (features_buf));
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_features",
+                                 &snapid, sizeof(snapid),
+                                 &features_buf, sizeof(features_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5107,10 +4951,9 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
        }
 
        snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_parent",
-                               &snapid, sizeof (snapid),
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_parent",
+                                 &snapid, sizeof(snapid), reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out_err;
@@ -5210,9 +5053,9 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
        u64 stripe_count;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_stripe_unit_count", NULL, 0,
-                               (char *)&striping_info_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                               &rbd_dev->header_oloc, "get_stripe_unit_count",
+                               NULL, 0, &striping_info_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5226,7 +5069,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
         * out, and only fail if the image has non-default values.
         */
        ret = -EINVAL;
-       obj_size = (u64)1 << rbd_dev->header.obj_order;
+       obj_size = rbd_obj_bytes(&rbd_dev->header);
        p = &striping_info_buf;
        stripe_unit = ceph_decode_64(&p);
        if (stripe_unit != obj_size) {
@@ -5247,8 +5090,27 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
        return 0;
 }
 
+static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
+{
+       __le64 data_pool_id;
+       int ret;
+
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_data_pool",
+                                 NULL, 0, &data_pool_id, sizeof(data_pool_id));
+       if (ret < 0)
+               return ret;
+       if (ret < sizeof(data_pool_id))
+               return -EBADMSG;
+
+       rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
+       WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
+       return 0;
+}
+
 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
 {
+       CEPH_DEFINE_OID_ONSTACK(oid);
        size_t image_id_size;
        char *image_id;
        void *p;
@@ -5276,10 +5138,10 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
        if (!reply_buf)
                goto out;
 
-       ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
-                               "rbd", "dir_get_name",
-                               image_id, image_id_size,
-                               reply_buf, size);
+       ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
+       ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+                                 "dir_get_name", image_id, image_id_size,
+                                 reply_buf, size);
        if (ret < 0)
                goto out;
        p = reply_buf;
@@ -5458,9 +5320,9 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_snapcontext", NULL, 0,
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_snapcontext",
+                                 NULL, 0, reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
@@ -5523,10 +5385,9 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                return ERR_PTR(-ENOMEM);
 
        snapid = cpu_to_le64(snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_snapshot_name",
-                               &snapid, sizeof (snapid),
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_snapshot_name",
+                                 &snapid, sizeof(snapid), reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0) {
                snap_name = ERR_PTR(ret);
@@ -5833,7 +5694,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 {
        int ret;
        size_t size;
-       char *object_name;
+       CEPH_DEFINE_OID_ONSTACK(oid);
        void *response;
        char *image_id;
 
@@ -5853,12 +5714,12 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
         * First, see if the format 2 image id file exists, and if
         * so, get the image's persistent id from it.
         */
-       size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
-       object_name = kmalloc(size, GFP_NOIO);
-       if (!object_name)
-               return -ENOMEM;
-       sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
-       dout("rbd id object name is %s\n", object_name);
+       ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
+                              rbd_dev->spec->image_name);
+       if (ret)
+               return ret;
+
+       dout("rbd id object name is %s\n", oid.name);
 
        /* Response will be an encoded string, which includes a length */
 
@@ -5871,9 +5732,9 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 
        /* If it doesn't exist we'll assume it's a format 1 image */
 
-       ret = rbd_obj_method_sync(rbd_dev, object_name,
-                               "rbd", "get_id", NULL, 0,
-                               response, RBD_IMAGE_ID_LEN_MAX);
+       ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+                                 "get_id", NULL, 0,
+                                 response, RBD_IMAGE_ID_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret == -ENOENT) {
                image_id = kstrdup("", GFP_KERNEL);
@@ -5896,8 +5757,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
        }
 out:
        kfree(response);
-       kfree(object_name);
-
+       ceph_oid_destroy(&oid);
        return ret;
 }
 
@@ -5944,14 +5804,20 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
                if (ret < 0)
                        goto out_err;
        }
-       /* No support for crypto and compression type format 2 images */
 
+       if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
+               ret = rbd_dev_v2_data_pool(rbd_dev);
+               if (ret)
+                       goto out_err;
+       }
+
+       rbd_init_layout(rbd_dev);
        return 0;
+
 out_err:
        rbd_dev->header.features = 0;
        kfree(rbd_dev->header.object_prefix);
        rbd_dev->header.object_prefix = NULL;
-
        return ret;
 }
 
@@ -6077,8 +5943,6 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
        /* Record the header object name for this rbd image. */
 
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
-
-       rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
        if (rbd_dev->image_format == 1)
                ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
                                       spec->image_name, RBD_SUFFIX);
@@ -6471,27 +6335,16 @@ static int rbd_slab_init(void)
        if (!rbd_obj_request_cache)
                goto out_err;
 
-       rbd_assert(!rbd_segment_name_cache);
-       rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
-                                       CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
-       if (rbd_segment_name_cache)
-               return 0;
-out_err:
-       kmem_cache_destroy(rbd_obj_request_cache);
-       rbd_obj_request_cache = NULL;
+       return 0;
 
+out_err:
        kmem_cache_destroy(rbd_img_request_cache);
        rbd_img_request_cache = NULL;
-
        return -ENOMEM;
 }
 
 static void rbd_slab_exit(void)
 {
-       rbd_assert(rbd_segment_name_cache);
-       kmem_cache_destroy(rbd_segment_name_cache);
-       rbd_segment_name_cache = NULL;
-
        rbd_assert(rbd_obj_request_cache);
        kmem_cache_destroy(rbd_obj_request_cache);
        rbd_obj_request_cache = NULL;
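
The rbd.c hunks above all share one shape: rbd_obj_method_sync() now addresses the target as a struct ceph_object_id plus a struct ceph_object_locator rather than a raw name string, the hardcoded "rbd" class argument is gone, and on success the helper returns the number of inbound bytes (hence the ret < sizeof() checks). A minimal sketch of a caller under those assumptions; example_get_id() is hypothetical:

    static int example_get_id(struct rbd_device *rbd_dev, char *buf, size_t len)
    {
            CEPH_DEFINE_OID_ONSTACK(oid);
            int ret;

            /* build "rbd_id.<image name>" without a manual kmalloc/sprintf */
            ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
                                   rbd_dev->spec->image_name);
            if (ret)
                    return ret;

            ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
                                      "get_id", NULL, 0, buf, len);
            ceph_oid_destroy(&oid);
            return ret;     /* >= 0: number of bytes placed in buf */
    }
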
index 94f367d..62ff50d 100644 (file)
@@ -25,8 +25,8 @@
  */
 
 #define RBD_HEADER_PREFIX      "rbd_header."
-#define RBD_DATA_PREFIX        "rbd_data."
 #define RBD_ID_PREFIX          "rbd_id."
+#define RBD_V2_DATA_FORMAT     "%s.%016llx"
 
 #define RBD_LOCK_NAME          "rbd_lock"
 #define RBD_LOCK_TAG           "internal"
@@ -42,13 +42,14 @@ enum rbd_notify_op {
 /*
  * For format version 1, rbd image 'foo' consists of objects
  *   foo.rbd           - image metadata
- *   rb.<idhi>.<idlo>.00000000
- *   rb.<idhi>.<idlo>.00000001
+ *   rb.<idhi>.<idlo>.<extra>.000000000000
+ *   rb.<idhi>.<idlo>.<extra>.000000000001
  *   ...               - data
  * There is no notion of a persistent image id in rbd format 1.
  */
 
 #define RBD_SUFFIX             ".rbd"
+#define RBD_V1_DATA_FORMAT     "%s.%012llx"
 
 #define RBD_DIRECTORY           "rbd_directory"
 #define RBD_INFO                "rbd_info"
@@ -57,9 +58,6 @@ enum rbd_notify_op {
 #define RBD_MIN_OBJ_ORDER       16
 #define RBD_MAX_OBJ_ORDER       30
 
-#define RBD_COMP_NONE          0
-#define RBD_CRYPT_NONE         0
-
 #define RBD_HEADER_TEXT                "<<< Rados Block Device Image >>>\n"
 #define RBD_HEADER_SIGNATURE   "RBD"
 #define RBD_HEADER_VERSION     "001.005"
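
The two new format strings replace the single fixed RBD_DATA_PREFIX: a data object name is now a per-image prefix plus a zero-padded object number, 16 hex digits for format 2 and 12 for format 1. A worked illustration with hypothetical prefixes:

    char name[64];

    /* format 2: prefix "rbd_data.101f74b0dc51", object number 5 */
    snprintf(name, sizeof(name), RBD_V2_DATA_FORMAT,
             "rbd_data.101f74b0dc51", 5ULL);
    /* name == "rbd_data.101f74b0dc51.0000000000000005" */

    /* format 1: prefix "rb.0.101f.74b0dc51", object number 5 */
    snprintf(name, sizeof(name), RBD_V1_DATA_FORMAT,
             "rb.0.101f.74b0dc51", 5ULL);
    /* name == "rb.0.101f.74b0dc51.000000000005" */
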
index 7f81665..90f3edf 100644 (file)
@@ -78,7 +78,8 @@ config IPMI_POWEROFF
 endif # IPMI_HANDLER
 
 config ASPEED_BT_IPMI_BMC
-       depends on ARCH_ASPEED
+       depends on ARCH_ASPEED || COMPILE_TEST
+       depends on REGMAP && REGMAP_MMIO && MFD_SYSCON
        tristate "BT IPMI bmc driver"
        help
          Provides a driver for the BT (Block Transfer) IPMI interface
index fc9e889..d6f5d9e 100644 (file)
 #include <linux/errno.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/mfd/syscon.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
+#include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/poll.h>
+#include <linux/regmap.h>
 #include <linux/sched.h>
 #include <linux/timer.h>
 
@@ -60,7 +63,8 @@
 struct bt_bmc {
        struct device           dev;
        struct miscdevice       miscdev;
-       void __iomem            *base;
+       struct regmap           *map;
+       int                     offset;
        int                     irq;
        wait_queue_head_t       queue;
        struct timer_list       poll_timer;
@@ -69,14 +73,29 @@ struct bt_bmc {
 
 static atomic_t open_count = ATOMIC_INIT(0);
 
+static const struct regmap_config bt_regmap_cfg = {
+       .reg_bits = 32,
+       .val_bits = 32,
+       .reg_stride = 4,
+};
+
 static u8 bt_inb(struct bt_bmc *bt_bmc, int reg)
 {
-       return ioread8(bt_bmc->base + reg);
+       uint32_t val = 0;
+       int rc;
+
+       rc = regmap_read(bt_bmc->map, bt_bmc->offset + reg, &val);
+       WARN(rc != 0, "regmap_read() failed: %d\n", rc);
+
+       return rc == 0 ? (u8) val : 0;
 }
 
 static void bt_outb(struct bt_bmc *bt_bmc, u8 data, int reg)
 {
-       iowrite8(data, bt_bmc->base + reg);
+       int rc;
+
+       rc = regmap_write(bt_bmc->map, bt_bmc->offset + reg, data);
+       WARN(rc != 0, "regmap_write() failed: %d\n", rc);
 }
 
 static void clr_rd_ptr(struct bt_bmc *bt_bmc)
@@ -367,14 +386,18 @@ static irqreturn_t bt_bmc_irq(int irq, void *arg)
 {
        struct bt_bmc *bt_bmc = arg;
        u32 reg;
+       int rc;
+
+       rc = regmap_read(bt_bmc->map, bt_bmc->offset + BT_CR2, &reg);
+       if (rc)
+               return IRQ_NONE;
 
-       reg = ioread32(bt_bmc->base + BT_CR2);
        reg &= BT_CR2_IRQ_H2B | BT_CR2_IRQ_HBUSY;
        if (!reg)
                return IRQ_NONE;
 
        /* ack pending IRQs */
-       iowrite32(reg, bt_bmc->base + BT_CR2);
+       regmap_write(bt_bmc->map, bt_bmc->offset + BT_CR2, reg);
 
        wake_up(&bt_bmc->queue);
        return IRQ_HANDLED;
@@ -384,7 +407,6 @@ static int bt_bmc_config_irq(struct bt_bmc *bt_bmc,
                             struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
-       u32 reg;
        int rc;
 
        bt_bmc->irq = platform_get_irq(pdev, 0);
@@ -405,18 +427,17 @@ static int bt_bmc_config_irq(struct bt_bmc *bt_bmc,
         * will be cleared (along with B2H) when we can write the next
         * message to the BT buffer
         */
-       reg = ioread32(bt_bmc->base + BT_CR1);
-       reg |= BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY;
-       iowrite32(reg, bt_bmc->base + BT_CR1);
+       rc = regmap_update_bits(bt_bmc->map, bt_bmc->offset + BT_CR1,
+                               (BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY),
+                               (BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY));
 
-       return 0;
+       return rc;
 }
 
 static int bt_bmc_probe(struct platform_device *pdev)
 {
        struct bt_bmc *bt_bmc;
        struct device *dev;
-       struct resource *res;
        int rc;
 
        if (!pdev || !pdev->dev.of_node)
@@ -431,10 +452,27 @@ static int bt_bmc_probe(struct platform_device *pdev)
 
        dev_set_drvdata(&pdev->dev, bt_bmc);
 
-       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       bt_bmc->base = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(bt_bmc->base))
-               return PTR_ERR(bt_bmc->base);
+       bt_bmc->map = syscon_node_to_regmap(pdev->dev.parent->of_node);
+       if (IS_ERR(bt_bmc->map)) {
+               struct resource *res;
+               void __iomem *base;
+
+               /*
+                * Assume it's not the MFD-based devicetree description, in
+                * which case generate a regmap ourselves
+                */
+               res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+               base = devm_ioremap_resource(&pdev->dev, res);
+               if (IS_ERR(base))
+                       return PTR_ERR(base);
+
+               bt_bmc->map = devm_regmap_init_mmio(dev, base, &bt_regmap_cfg);
+               bt_bmc->offset = 0;
+       } else {
+               rc = of_property_read_u32(dev->of_node, "reg", &bt_bmc->offset);
+               if (rc)
+                       return rc;
+       }
 
        mutex_init(&bt_bmc->mutex);
        init_waitqueue_head(&bt_bmc->queue);
@@ -461,12 +499,12 @@ static int bt_bmc_probe(struct platform_device *pdev)
                add_timer(&bt_bmc->poll_timer);
        }
 
-       iowrite32((BT_IO_BASE << BT_CR0_IO_BASE) |
-                 (BT_IRQ << BT_CR0_IRQ) |
-                 BT_CR0_EN_CLR_SLV_RDP |
-                 BT_CR0_EN_CLR_SLV_WRP |
-                 BT_CR0_ENABLE_IBT,
-                 bt_bmc->base + BT_CR0);
+       regmap_write(bt_bmc->map, bt_bmc->offset + BT_CR0,
+                    (BT_IO_BASE << BT_CR0_IO_BASE) |
+                    (BT_IRQ << BT_CR0_IRQ) |
+                    BT_CR0_EN_CLR_SLV_RDP |
+                    BT_CR0_EN_CLR_SLV_WRP |
+                    BT_CR0_ENABLE_IBT);
 
        clr_b_busy(bt_bmc);
 
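
The bt-bmc conversion routes all register access through a regmap so the same driver works on its own MMIO window or on one inside the parent syscon; bt_regmap_cfg (32-bit registers, stride 4) keeps the register offsets identical to the old ioread/iowrite addressing. Read-modify-write sequences collapse into regmap_update_bits(), sketched here with base/map/offset as in the hunks above:

    /* before: explicit read-modify-write on a private mapping */
    u32 reg = ioread32(base + BT_CR1);
    reg |= BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY;
    iowrite32(reg, base + BT_CR1);

    /* after: one locked update through the (possibly shared) regmap */
    regmap_update_bits(map, offset + BT_CR1,
                       BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY,
                       BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY);
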
index a21407d..f45119c 100644 (file)
@@ -108,7 +108,7 @@ static int ipmi_fasync(int fd, struct file *file, int on)
        return (result);
 }
 
-static struct ipmi_user_hndl ipmi_hndlrs =
+static const struct ipmi_user_hndl ipmi_hndlrs =
 {
        .ipmi_recv_hndl = file_receive_handler,
 };
index 92e53ac..9f69995 100644 (file)
@@ -102,7 +102,7 @@ struct ipmi_user {
        struct kref refcount;
 
        /* The upper layer that handles receive messages. */
-       struct ipmi_user_hndl *handler;
+       const struct ipmi_user_hndl *handler;
        void             *handler_data;
 
        /* The interface this user is bound to. */
@@ -919,7 +919,7 @@ static int intf_err_seq(ipmi_smi_t   intf,
 
 
 int ipmi_create_user(unsigned int          if_num,
-                    struct ipmi_user_hndl *handler,
+                    const struct ipmi_user_hndl *handler,
                     void                  *handler_data,
                     ipmi_user_t           *user)
 {
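
With ipmi_create_user() now taking a const handler pointer, callers can keep their dispatch tables in rodata, as the surrounding hunks do. A minimal sketch; my_recv_handler and my_user are hypothetical:

    static void my_recv_handler(struct ipmi_recv_msg *msg, void *user_msg_data)
    {
            /* consume the message, then release it */
            ipmi_free_recv_msg(msg);
    }

    static const struct ipmi_user_hndl my_hndlrs = {
            .ipmi_recv_hndl = my_recv_handler,
    };

    ipmi_user_t my_user;
    int rc = ipmi_create_user(0, &my_hndlrs, NULL, &my_user);
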
index 6e658aa..b338a4b 100644 (file)
@@ -196,7 +196,7 @@ static void ipmi_powernv_poll(void *send_info)
        ipmi_powernv_recv(smi);
 }
 
-static struct ipmi_smi_handlers ipmi_powernv_smi_handlers = {
+static const struct ipmi_smi_handlers ipmi_powernv_smi_handlers = {
        .owner                  = THIS_MODULE,
        .start_processing       = ipmi_powernv_start_processing,
        .sender                 = ipmi_powernv_send,
index 4035495..30b9e83 100644 (file)
@@ -985,7 +985,7 @@ static void ipmi_wdog_pretimeout_handler(void *handler_data)
        pretimeout_since_last_heartbeat = 1;
 }
 
-static struct ipmi_user_hndl ipmi_hndlrs = {
+static const struct ipmi_user_hndl ipmi_hndlrs = {
        .ipmi_recv_hndl           = ipmi_wdog_msg_handler,
        .ipmi_watchdog_pretimeout = ipmi_wdog_pretimeout_handler
 };
index e051fc8..cd53771 100644 (file)
@@ -655,7 +655,7 @@ static void terminate_monitor(struct cm4000_dev *dev)
  * monitor the card every 50msec. as a side-effect, retrieve the
  * atr once a card is inserted. another side-effect of retrieving the
  * atr is that the card will be powered on, so there is no need to
- * power on the card explictely from the application: the driver
+ * power on the card explicitly from the application: the driver
  * is already doing that for you.
  */
 
@@ -1037,7 +1037,7 @@ release_io:
        clear_bit(LOCK_IO, &dev->flags);
        wake_up_interruptible(&dev->ioq);
 
-       DEBUGP(2, dev, "<- cmm_read returns: rc = %Zi\n",
+       DEBUGP(2, dev, "<- cmm_read returns: rc = %zi\n",
               (rc < 0 ? rc : count));
        return rc < 0 ? rc : count;
 }
index d712325..d4dbd8d 100644 (file)
@@ -331,7 +331,7 @@ static ssize_t cm4040_write(struct file *filp, const char __user *buf,
        }
 
        if ((count < 5) || (count > READ_WRITE_BUFFER_SIZE)) {
-               DEBUGP(2, dev, "<- cm4040_write buffersize=%Zd < 5\n", count);
+               DEBUGP(2, dev, "<- cm4040_write buffersize=%zd < 5\n", count);
                return -EIO;
        }
 
index 4fa7fcd..f4f866e 100644 (file)
@@ -603,7 +603,7 @@ static void sonypi_type3_srs(void)
        u16 v16;
        u8  v8;
 
-       /* This model type uses the same initialiazation of
+       /* This model type uses the same initialization of
         * the embedded controller as the type2 models. */
        sonypi_type2_srs();
 
index 2cac445..0b49dbc 100644 (file)
@@ -62,19 +62,32 @@ config CRYPTO_DEV_GEODE
          will be called geode-aes.
 
 config ZCRYPT
-       tristate "Support for PCI-attached cryptographic adapters"
+       tristate "Support for s390 cryptographic adapters"
        depends on S390
        select HW_RANDOM
        help
-         Select this option if you want to use a PCI-attached cryptographic
-         adapter like:
-         + PCI Cryptographic Accelerator (PCICA)
-         + PCI Cryptographic Coprocessor (PCICC)
+         Select this option if you want to enable support for
+         s390 cryptographic adapters like:
          + PCI-X Cryptographic Coprocessor (PCIXCC)
-         + Crypto Express2 Coprocessor (CEX2C)
-         + Crypto Express2 Accelerator (CEX2A)
-         + Crypto Express3 Coprocessor (CEX3C)
-         + Crypto Express3 Accelerator (CEX3A)
+         + Crypto Express 2,3,4 or 5 Coprocessor (CEXxC)
+         + Crypto Express 2,3,4 or 5 Accelerator (CEXxA)
+         + Crypto Express 4 or 5 EP11 Coprocessor (CEXxP)
+
+config PKEY
+       tristate "Kernel API for protected key handling"
+       depends on S390
+       depends on ZCRYPT
+       help
+         With this option enabled the pkey kernel module provides an API
+         for creation and handling of protected keys. Other parts of the
+         kernel or userspace applications may use these functions.
+
+         Select this option if you want to enable the kernel and userspace
+         API for protected key handling.
+
+         Please note that creation of protected keys from secure keys
+         requires at least one CEX card in coprocessor mode to be
+         available at runtime.
 
 config CRYPTO_SHA1_S390
        tristate "SHA1 digest algorithm"
@@ -124,6 +137,7 @@ config CRYPTO_AES_S390
        depends on S390
        select CRYPTO_ALGAPI
        select CRYPTO_BLKCIPHER
+       select PKEY
        help
          This is the s390 hardware accelerated implementation of the
          AES cipher algorithms (FIPS-197).
index 579f826..fef39f9 100644 (file)
@@ -269,7 +269,7 @@ static int deinstantiate_rng(struct device *ctrldev, int state_handle_mask)
                /*
                 * If the corresponding bit is set, then it means the state
                 * handle was initialized by us, and thus it needs to be
-                * deintialized as well
+                * deinitialized as well
                 */
                if ((1 << sh_idx) & state_handle_mask) {
                        /*
index 551a271..dea0487 100644 (file)
@@ -1228,7 +1228,7 @@ static int __init devfreq_init(void)
 subsys_initcall(devfreq_init);
 
 /*
- * The followings are helper functions for devfreq user device drivers with
+ * The following are helper functions for devfreq user device drivers with
  * OPP framework.
  */
 
index 718f832..0007b79 100644 (file)
@@ -325,6 +325,9 @@ static const struct file_operations dma_buf_fops = {
        .llseek         = dma_buf_llseek,
        .poll           = dma_buf_poll,
        .unlocked_ioctl = dma_buf_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = dma_buf_ioctl,
+#endif
 };
 
 /*
index 3e882aa..eaa355e 100644 (file)
@@ -537,7 +537,7 @@ static void rt8973a_init_dev_type(struct rt8973a_muic_info *info)
                regmap_update_bits(info->regmap, reg, mask, val);
        }
 
-       /* Check whether RT8973A is auto swithcing mode or not */
+       /* Check whether RT8973A is auto switching mode or not */
        ret = regmap_read(info->regmap, RT8973A_REG_CONTROL1, &data);
        if (ret) {
                dev_err(info->dev,
index aee149b..a301fcf 100644 (file)
@@ -1307,8 +1307,7 @@ static void iso_resource_work(struct work_struct *work)
         */
        if (r->todo == ISO_RES_REALLOC && !success &&
            !client->in_shutdown &&
-           idr_find(&client->resource_idr, r->resource.handle)) {
-               idr_remove(&client->resource_idr, r->resource.handle);
+           idr_remove(&client->resource_idr, r->resource.handle)) {
                client_put(client);
                free = true;
        }
index f9e3aee..7c2eed7 100644 (file)
@@ -1068,7 +1068,7 @@ static void fw_device_init(struct work_struct *work)
 
        /*
         * Transition the device to running state.  If it got pulled
-        * out from under us while we did the intialization work, we
+        * out from under us while we did the initialization work, we
         * have to shut down the device again here.  Normally, though,
         * fw_node_event will be responsible for shutting it down when
         * necessary.  We have to use the atomic cmpxchg here to avoid
@@ -1231,7 +1231,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event)
                        break;
 
                /*
-                * Do minimal intialization of the device here, the
+                * Do minimal initialization of the device here, the
                 * rest will happen in fw_device_init().
                 *
                 * Attention:  A lot of things, even fw_device_get(),
index c02db01..0218cea 100644 (file)
@@ -70,10 +70,10 @@ static void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id)
        struct amdgpu_bo_list *list;
 
        mutex_lock(&fpriv->bo_list_lock);
-       list = idr_find(&fpriv->bo_list_handles, id);
+       list = idr_remove(&fpriv->bo_list_handles, id);
        if (list) {
+               /* Another user may have a reference to this list still */
                mutex_lock(&list->lock);
-               idr_remove(&fpriv->bo_list_handles, id);
                mutex_unlock(&list->lock);
                amdgpu_bo_list_free(list);
        }
index 400c66b..cf05006 100644 (file)
@@ -135,15 +135,11 @@ static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
        struct amdgpu_ctx *ctx;
 
        mutex_lock(&mgr->lock);
-       ctx = idr_find(&mgr->ctx_handles, id);
-       if (ctx) {
-               idr_remove(&mgr->ctx_handles, id);
+       ctx = idr_remove(&mgr->ctx_handles, id);
+       if (ctx)
                kref_put(&ctx->refcount, amdgpu_ctx_do_release);
-               mutex_unlock(&mgr->lock);
-               return 0;
-       }
        mutex_unlock(&mgr->lock);
-       return -EINVAL;
+       return ctx ? 0 : -EINVAL;
 }
 
 static int amdgpu_ctx_query(struct amdgpu_device *adev,
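
Both amdgpu hunks, like the firewire one above, rely on idr_remove() now returning the pointer it removed, so the racy find-then-remove pair under the lock becomes a single call. The resulting idiom, sketched (release() is hypothetical):

    mutex_lock(&lock);
    obj = idr_remove(&idr, id);     /* NULL if id was never allocated */
    mutex_unlock(&lock);
    if (obj)
            release(obj);
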
index ef7c8de..ca5f2aa 100644 (file)
@@ -262,7 +262,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
         * and because the mmu_notifier_unregister function also drop
         * mm_count we need to take an extra count here.
         */
-       atomic_inc(&p->mm->mm_count);
+       mmgrab(p->mm);
        mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm);
        mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
 }
index 4a4d379..181a2c3 100644 (file)
 #define HW_ASSISTED_I2C_STATUS_FAILURE     2
 #define HW_ASSISTED_I2C_STATUS_SUCCESS     1
 
-#pragma pack(1)                                       // BIOS data must use byte aligment
+#pragma pack(1)                                       // BIOS data must use byte alignment
 
 // Define offset to location of ROM header.
 #define OFFSET_TO_POINTER_TO_ATOM_ROM_HEADER         0x00000048L
@@ -4361,7 +4361,7 @@ typedef struct _ATOM_GPIO_PIN_ASSIGNMENT
 // GPIO use to control PCIE_VDDC in certain SLT board
 #define PCIE_VDDC_CONTROL_GPIO_PINID        56
 
-//from SMU7.x, if ucGPIO_ID=PP_AC_DC_SWITCH_GPIO_PINID in GPIO_LUTTable, AC/DC swithing feature is enable
+//from SMU7.x, if ucGPIO_ID=PP_AC_DC_SWITCH_GPIO_PINID in GPIO_LUTTable, AC/DC switching feature is enabled
 #define PP_AC_DC_SWITCH_GPIO_PINID          60
 //from SMU7.x, if ucGPIO_ID=VDDC_REGULATOR_VRHOT_GPIO_PINID in GPIO_LUTable, VRHot feature is enable
 #define VDDC_VRHOT_GPIO_PINID               61
@@ -9180,7 +9180,7 @@ typedef struct  _ATOM_POWERPLAY_INFO_V3
 
 /*********************************************************************************/
 
-#pragma pack() // BIOS data must use byte aligment
+#pragma pack() // BIOS data must use byte alignment
 
 #pragma pack(1)
 
@@ -9211,7 +9211,7 @@ typedef struct _ATOM_SERVICE_INFO
 
 
 
-#pragma pack() // BIOS data must use byte aligment
+#pragma pack() // BIOS data must use byte alignment
 
 //
 // AMD ACPI Table
index 2612997..80ed659 100644 (file)
@@ -89,7 +89,7 @@ enum phm_platform_caps {
        PHM_PlatformCaps_EnableSideportControl,                 /* indicates Sideport can be controlled */
        PHM_PlatformCaps_VideoPlaybackEEUNotification,          /* indicates EEU notification of video start/stop is required */
        PHM_PlatformCaps_TurnOffPll_ASPML1,                     /* PCIE Turn Off PLL in ASPM L1 */
-       PHM_PlatformCaps_EnableHTLinkControl,                   /* indicates HT Link can be controlled by ACPI or CLMC overrided/automated mode. */
+       PHM_PlatformCaps_EnableHTLinkControl,                   /* indicates HT Link can be controlled by ACPI or CLMC overridden/automated mode. */
        PHM_PlatformCaps_PerformanceStateOnly,                  /* indicates only performance power state to be used on current system. */
        PHM_PlatformCaps_ExclusiveModeAlwaysHigh,               /* In Exclusive (3D) mode always stay in High state. */
        PHM_PlatformCaps_DisableMGClockGating,                  /* to disable Medium Grain Clock Gating or not */
index 9338145..dc4419a 100644 (file)
@@ -220,8 +220,8 @@ drm_connector_detect(struct drm_connector *connector, bool force)
  *    - drm_mode_validate_basic() performs basic sanity checks
  *    - drm_mode_validate_size() filters out modes larger than @maxX and @maxY
  *      (if specified)
- *    - drm_mode_validate_flag() checks the modes againt basic connector
- *      capabilites (interlace_allowed,doublescan_allowed,stereo_allowed)
+ *    - drm_mode_validate_flag() checks the modes against basic connector
+ *      capabilities (interlace_allowed,doublescan_allowed,stereo_allowed)
  *    - the optional &drm_connector_helper_funcs.mode_valid helper can perform
  *      driver and/or hardware specific checks
  *
index b42c81b..7032c54 100644 (file)
@@ -60,7 +60,7 @@ render_state_get_rodata(const struct intel_engine_cs *engine)
  * this is sufficient as the null state generator makes the final batch
  * with two passes to build command and state separately. At this point
  * the size of both are known and it compacts them by relocating the state
- * right after the commands taking care of aligment so we should sufficient
+ * right after the commands taking care of alignment so we should have sufficient
  * space below them for adding new commands.
  */
 #define OUT_BATCH(batch, i, val)                               \
index 6a8fa08..0115989 100644 (file)
@@ -334,7 +334,7 @@ i915_gem_userptr_init__mm_struct(struct drm_i915_gem_object *obj)
                mm->i915 = to_i915(obj->base.dev);
 
                mm->mm = current->mm;
-               atomic_inc(&current->mm->mm_count);
+               mmgrab(current->mm);
 
                mm->mn = NULL;
 
@@ -507,7 +507,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
                        flags |= FOLL_WRITE;
 
                ret = -EFAULT;
-               if (atomic_inc_not_zero(&mm->mm_users)) {
+               if (mmget_not_zero(mm)) {
                        down_read(&mm->mmap_sem);
                        while (pinned < npages) {
                                ret = get_user_pages_remote
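
These hunks swap open-coded refcount bumps for the new mm helpers: mmgrab() pins the struct mm_struct itself (mm_count), while mmget_not_zero() takes an address-space reference (mm_users) only if the mm has not already exited. Paired with their release counterparts, a sketch:

    mmgrab(mm);                     /* keep mm_struct around */
    /* ... */
    mmdrop(mm);

    if (mmget_not_zero(mm)) {       /* address space still live? */
            down_read(&mm->mmap_sem);
            /* ... get_user_pages_remote() etc. ... */
            up_read(&mm->mmap_sem);
            mmput(mm);
    }
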
index d5ce829..45cf363 100644 (file)
@@ -266,7 +266,7 @@ do {                                                                        \
 do {                                                                   \
        if (MGA_VERBOSE) {                                              \
                DRM_INFO("BEGIN_DMA(%d)\n", (n));                       \
-               DRM_INFO("   space=0x%x req=0x%Zx\n",                   \
+               DRM_INFO("   space=0x%x req=0x%zx\n",                   \
                         dev_priv->prim.space, (n) * DMA_BLOCK_SIZE);   \
        }                                                               \
        prim = dev_priv->prim.start;                                    \
@@ -313,7 +313,7 @@ do {                                                                        \
 #define DMA_WRITE(offset, val)                                         \
 do {                                                                   \
        if (MGA_VERBOSE)                                                \
-               DRM_INFO("   DMA_WRITE( 0x%08x ) at 0x%04Zx\n",         \
+               DRM_INFO("   DMA_WRITE( 0x%08x ) at 0x%04zx\n",         \
                         (u32)(val), write + (offset) * sizeof(u32));   \
        *(volatile u32 *)(prim + write + (offset) * sizeof(u32)) = val; \
 } while (0)
index ab89eed..4b86e8b 100644 (file)
 #define HW_ASSISTED_I2C_STATUS_FAILURE          2
 #define HW_ASSISTED_I2C_STATUS_SUCCESS          1
 
-#pragma pack(1)                                       /* BIOS data must use byte aligment */
+#pragma pack(1)                                       /* BIOS data must use byte alignment */
 
 /*  Define offset to location of ROM header. */
 
@@ -3883,7 +3883,7 @@ typedef struct _ATOM_GPIO_PIN_ASSIGNMENT
 }ATOM_GPIO_PIN_ASSIGNMENT;
 
 //ucGPIO_ID pre-define id for multiple usage
-//from SMU7.x, if ucGPIO_ID=PP_AC_DC_SWITCH_GPIO_PINID in GPIO_LUTTable, AC/DC swithing feature is enable
+//from SMU7.x, if ucGPIO_ID=PP_AC_DC_SWITCH_GPIO_PINID in GPIO_LUTTable, AC/DC switching feature is enabled
 #define PP_AC_DC_SWITCH_GPIO_PINID          60
 //from SMU7.x, if ucGPIO_ID=VDDC_REGULATOR_VRHOT_GPIO_PINID in GPIO_LUTable, VRHot feature is enable
 #define VDDC_VRHOT_GPIO_PINID               61
@@ -7909,7 +7909,7 @@ typedef struct  _ATOM_POWERPLAY_INFO_V3
 
 /*********************************************************************************/
 
-#pragma pack() // BIOS data must use byte aligment
+#pragma pack() // BIOS data must use byte alignment
 
 //
 // AMD ACPI Table
index ad31b3e..0e4eb84 100644 (file)
@@ -24,6 +24,7 @@ config ROCKCHIP_ANALOGIX_DP
 config ROCKCHIP_CDN_DP
         tristate "Rockchip cdn DP"
         depends on DRM_ROCKCHIP
+       depends on EXTCON
        select SND_SOC_HDMI_CODEC if SND_SOC
         help
          This selects support for Rockchip SoC specific extensions
index 9ab67a6..fd79a70 100644 (file)
@@ -111,7 +111,7 @@ static int cdn_dp_clk_enable(struct cdn_dp_device *dp)
        ret = pm_runtime_get_sync(dp->dev);
        if (ret < 0) {
                DRM_DEV_ERROR(dp->dev, "cannot get pm runtime %d\n", ret);
-               goto err_pclk;
+               goto err_pm_runtime_get;
        }
 
        reset_control_assert(dp->core_rst);
@@ -133,6 +133,8 @@ static int cdn_dp_clk_enable(struct cdn_dp_device *dp)
        return 0;
 
 err_set_rate:
+       pm_runtime_put(dp->dev);
+err_pm_runtime_get:
        clk_disable_unprepare(dp->core_clk);
 err_core_clk:
        clk_disable_unprepare(dp->pclk);
index 541a588..d08f269 100644 (file)
@@ -199,9 +199,14 @@ static const struct drm_ioctl_desc vmw_ioctls[] = {
        VMW_IOCTL_DEF(VMW_PRESENT_READBACK,
                      vmw_present_readback_ioctl,
                      DRM_MASTER | DRM_AUTH),
+       /*
+        * The permissions of the below ioctl are overridden in
+        * vmw_generic_ioctl(). We require either
+        * DRM_MASTER or capable(CAP_SYS_ADMIN).
+        */
        VMW_IOCTL_DEF(VMW_UPDATE_LAYOUT,
                      vmw_kms_update_layout_ioctl,
-                     DRM_MASTER | DRM_CONTROL_ALLOW),
+                     DRM_RENDER_ALLOW),
        VMW_IOCTL_DEF(VMW_CREATE_SHADER,
                      vmw_shader_define_ioctl,
                      DRM_AUTH | DRM_RENDER_ALLOW),
@@ -1123,6 +1128,10 @@ static long vmw_generic_ioctl(struct file *filp, unsigned int cmd,
 
                        return (long) vmw_execbuf_ioctl(dev, arg, file_priv,
                                                        _IOC_SIZE(cmd));
+               } else if (nr == DRM_COMMAND_BASE + DRM_VMW_UPDATE_LAYOUT) {
+                       if (!drm_is_current_master(file_priv) &&
+                           !capable(CAP_SYS_ADMIN))
+                               return -EACCES;
                }
 
                if (unlikely(ioctl->cmd != cmd))
index 1e59a48..59ff419 100644 (file)
@@ -41,9 +41,9 @@
 #include <drm/ttm/ttm_module.h>
 #include "vmwgfx_fence.h"
 
-#define VMWGFX_DRIVER_DATE "20160210"
+#define VMWGFX_DRIVER_DATE "20170221"
 #define VMWGFX_DRIVER_MAJOR 2
-#define VMWGFX_DRIVER_MINOR 11
+#define VMWGFX_DRIVER_MINOR 12
 #define VMWGFX_DRIVER_PATCHLEVEL 0
 #define VMWGFX_FILE_PAGE_OFFSET 0x00100000
 #define VMWGFX_FIFO_STATIC_SIZE (1024*1024)
index 1d08ba3..d646ac9 100644 (file)
@@ -159,7 +159,7 @@ static void zx_vl_rsz_setup(struct zx_plane *zplane, uint32_t format,
        void __iomem *rsz = zplane->rsz;
        u32 src_chroma_w = src_w;
        u32 src_chroma_h = src_h;
-       u32 fmt;
+       int fmt;
 
        /* Set up source and destination resolution */
        zx_writel(rsz + RSZ_SRC_CFG, RSZ_VER(src_h - 1) | RSZ_HOR(src_w - 1));
@@ -203,7 +203,7 @@ static void zx_vl_plane_atomic_update(struct drm_plane *plane,
        u32 src_x, src_y, src_w, src_h;
        u32 dst_x, dst_y, dst_w, dst_h;
        uint32_t format;
-       u32 fmt;
+       int fmt;
        int num_planes;
        int i;
 
index 0dd1167..9c113f6 100644 (file)
@@ -487,7 +487,7 @@ static __u8 *kye_consumer_control_fixup(struct hid_device *hdev, __u8 *rdesc,
                unsigned int *rsize, int offset, const char *device_name) {
        /*
         * the fixup that need to be done:
-        *   - change Usage Maximum in the Comsumer Control
+        *   - change Usage Maximum in the Consumer Control
         *     (report ID 3) to a reasonable value
         */
        if (*rsize >= offset + 31 &&
index 6dca2fd..6d1208b 100644 (file)
@@ -861,7 +861,7 @@ static ssize_t fan1_pulses_store(struct device *dev,
  * (i.e. closed or open-loop).
  *
  * Following documentation about hwmon's sysfs interface, a pwm1_enable node
- * should accept followings:
+ * should accept the following:
  *
  *  0 : no fan speed control (i.e. fan at full speed)
  *  1 : manual fan speed control enabled (use pwm[1-*]) (open-loop)
index b694099..9680384 100644 (file)
@@ -447,7 +447,7 @@ void ide_acpi_get_timing(ide_hwif_t *hwif)
        memcpy(&hwif->acpidata->gtm, out_obj->buffer.pointer,
               sizeof(struct GTM_buffer));
 
-       DEBPRINT("_GTM info: ptr: 0x%p, len: 0x%x, exp.len: 0x%Zx\n",
+       DEBPRINT("_GTM info: ptr: 0x%p, len: 0x%x, exp.len: 0x%zx\n",
                 out_obj->buffer.pointer, out_obj->buffer.length,
                 sizeof(struct GTM_buffer));
 
index 3c1b797..d8a552b 100644 (file)
@@ -1136,7 +1136,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
        ssize_t ret = 0;
        int rc;
 
-       ide_debug_log(IDE_DBG_FUNC, "count %Zd", count);
+       ide_debug_log(IDE_DBG_FUNC, "count %zd", count);
 
        if (tape->chrdev_dir != IDETAPE_DIR_READ) {
                if (test_bit(ilog2(IDE_AFLAG_DETECT_BS), &drive->atapi_flags))
@@ -1195,7 +1195,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
        if (tape->write_prot)
                return -EACCES;
 
-       ide_debug_log(IDE_DBG_FUNC, "count %Zd", count);
+       ide_debug_log(IDE_DBG_FUNC, "count %zd", count);
 
        /* Initialize write operation */
        rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
index 46427ea..157f2d1 100644 (file)
@@ -300,7 +300,7 @@ static const struct ide_port_ops palm_bk3710_ports_ops = {
        .cable_detect           = palm_bk3710_cable_detect,
 };
 
-static struct ide_port_info palm_bk3710_port_info = {
+static struct ide_port_info palm_bk3710_port_info __initdata = {
        .init_dma               = palm_bk3710_init_dma,
        .port_ops               = &palm_bk3710_ports_ops,
        .dma_ops                = &sff_dma_ops,
index edaae9f..e426ac8 100644 (file)
@@ -13,6 +13,7 @@ ib_core-y :=                  packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
                                multicast.o mad.o smi.o agent.o mad_rmpp.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
 
 ib_cm-y :=                     cm.o
 
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644 (file)
index 0000000..126ac5f
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include "core_priv.h"
+
+/**
+ * ib_device_register_rdmacg - register with rdma cgroup.
+ * @device: device to register to participate in resource
+ *          accounting by rdma cgroup.
+ *
+ * Register with the rdma cgroup. Should be called before
+ * exposing the rdma device to user space applications to avoid
+ * a resource accounting leak.
+ * Returns 0 on success, otherwise a failure code.
+ */
+int ib_device_register_rdmacg(struct ib_device *device)
+{
+       device->cg_device.name = device->name;
+       return rdmacg_register_device(&device->cg_device);
+}
+
+/**
+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
+ * @device: device to unregister.
+ *
+ * Unregister with the rdma cgroup. Should be called after
+ * all resources are deallocated, at a stage when no further
+ * resource allocation by user applications can occur for this
+ * device, to avoid any leak in accounting.
+ */
+void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+       rdmacg_unregister_device(&device->cg_device);
+}
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                        struct ib_device *device,
+                        enum rdmacg_resource_type resource_index)
+{
+       return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
+                                resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_try_charge);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                       struct ib_device *device,
+                       enum rdmacg_resource_type resource_index)
+{
+       rdmacg_uncharge(cg_obj->cg, &device->cg_device,
+                       resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_uncharge);
index 912ab4c..cb7d372 100644 (file)
@@ -35,6 +35,7 @@
 
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/cgroup_rdma.h>
 
 #include <rdma/ib_verbs.h>
 
@@ -124,6 +125,35 @@ int ib_cache_setup_one(struct ib_device *device);
 void ib_cache_cleanup_one(struct ib_device *device);
 void ib_cache_release_one(struct ib_device *device);
 
+#ifdef CONFIG_CGROUP_RDMA
+int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_unregister_rdmacg(struct ib_device *device);
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                        struct ib_device *device,
+                        enum rdmacg_resource_type resource_index);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                       struct ib_device *device,
+                       enum rdmacg_resource_type resource_index);
+#else
+static inline int ib_device_register_rdmacg(struct ib_device *device)
+{ return 0; }
+
+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
+{ }
+
+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                                      struct ib_device *device,
+                                      enum rdmacg_resource_type resource_index)
+{ return 0; }
+
+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                                     struct ib_device *device,
+                                     enum rdmacg_resource_type resource_index)
+{ }
+#endif
+
 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
                                         struct net_device *upper)
 {
index a63e840..593d2ce 100644 (file)
@@ -369,10 +369,18 @@ int ib_register_device(struct ib_device *device,
                goto out;
        }
 
+       ret = ib_device_register_rdmacg(device);
+       if (ret) {
+               pr_warn("Couldn't register device with rdma cgroup\n");
+               ib_cache_cleanup_one(device);
+               goto out;
+       }
+
        memset(&device->attrs, 0, sizeof(device->attrs));
        ret = device->query_device(device, &device->attrs, &uhw);
        if (ret) {
                pr_warn("Couldn't query the device attributes\n");
+               ib_device_unregister_rdmacg(device);
                ib_cache_cleanup_one(device);
                goto out;
        }
@@ -381,6 +389,7 @@ int ib_register_device(struct ib_device *device,
        if (ret) {
                pr_warn("Couldn't register device %s with driver model\n",
                        device->name);
+               ib_device_unregister_rdmacg(device);
                ib_cache_cleanup_one(device);
                goto out;
        }
@@ -430,6 +439,7 @@ void ib_unregister_device(struct ib_device *device)
 
        mutex_unlock(&device_mutex);
 
+       ib_device_unregister_rdmacg(device);
        ib_device_unregister_sysfs(device);
        ib_cache_cleanup_one(device);
 
index b4b395a..7b7a76e 100644 (file)
@@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        struct ib_udata                   udata;
        struct ib_ucontext               *ucontext;
        struct file                      *filp;
+       struct ib_rdmacg_object          cg_obj;
        int ret;
 
        if (out_len < sizeof resp)
@@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
                   (unsigned long) cmd.response + sizeof resp,
                   in_len - sizeof cmd, out_len - sizeof resp);
 
+       ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+       if (ret)
+               goto err;
+
        ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
        if (IS_ERR(ucontext)) {
                ret = PTR_ERR(ucontext);
-               goto err;
+               goto err_alloc;
        }
 
        ucontext->device = ib_dev;
+       ucontext->cg_obj = cg_obj;
        INIT_LIST_HEAD(&ucontext->pd_list);
        INIT_LIST_HEAD(&ucontext->mr_list);
        INIT_LIST_HEAD(&ucontext->mw_list);
@@ -407,6 +413,9 @@ err_free:
        put_pid(ucontext->tgid);
        ib_dev->dealloc_ucontext(ucontext);
 
+err_alloc:
+       ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+
 err:
        mutex_unlock(&file->mutex);
        return ret;
@@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
                return -ENOMEM;
 
        init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+       ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret) {
+               kfree(uobj);
+               return ret;
+       }
+
        down_write(&uobj->mutex);
 
        pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
@@ -605,6 +621,7 @@ err_idr:
        ib_dealloc_pd(pd);
 
 err:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
        put_uobj_write(uobj);
        return ret;
 }
@@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
        if (ret)
                goto err_put;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        uobj->live = 0;
        put_uobj_write(uobj);
 
@@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
                        goto err_put;
                }
        }
+       ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_charge;
 
        mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
                                     cmd.access_flags, &udata);
@@ -1054,6 +1077,9 @@ err_unreg:
        ib_dereg_mr(mr);
 
 err_put:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
        put_pd_read(pd);
 
 err_free:
@@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
 
        mutex_lock(&file->mutex);
@@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
                   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
                   out_len - sizeof(resp));
 
+       ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_charge;
+
        mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
        if (IS_ERR(mw)) {
                ret = PTR_ERR(mw);
@@ -1271,6 +1304,9 @@ err_unalloc:
        uverbs_dealloc_mw(mw);
 
 err_put:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
        put_pd_read(pd);
 
 err_free:
@@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
 
        mutex_lock(&file->mutex);
@@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
        if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
                attr.flags = cmd->flags;
 
+       ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_charge;
+
        cq = ib_dev->create_cq(ib_dev, &attr,
                                             file->ucontext, uhw);
        if (IS_ERR(cq)) {
@@ -1452,6 +1495,10 @@ err_free:
        ib_destroy_cq(cq);
 
 err_file:
+       ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
+                          RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
        if (ev_file)
                ib_uverbs_release_ucq(file, ev_file, obj);
 
@@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
 
        mutex_lock(&file->mutex);
@@ -1905,6 +1954,11 @@ static int create_qp(struct ib_uverbs_file *file,
                        goto err_put;
                }
 
+       ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_put;
+
        if (cmd->qp_type == IB_QPT_XRC_TGT)
                qp = ib_create_qp(pd, &attr);
        else
@@ -1912,7 +1966,7 @@ static int create_qp(struct ib_uverbs_file *file,
 
        if (IS_ERR(qp)) {
                ret = PTR_ERR(qp);
-               goto err_put;
+               goto err_create;
        }
 
        if (cmd->qp_type != IB_QPT_XRC_TGT) {
@@ -1993,6 +2047,10 @@ err_cb:
 err_destroy:
        ib_destroy_qp(qp);
 
+err_create:
+       ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
+                          RDMACG_RESOURCE_HCA_OBJECT);
+
 err_put:
        if (xrcd)
                put_xrcd_read(xrcd_uobj);
@@ -2519,6 +2577,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        if (obj->uxrcd)
                atomic_dec(&obj->uxrcd->refcnt);
 
@@ -2970,11 +3030,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
        memset(&attr.dmac, 0, sizeof(attr.dmac));
        memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
 
+       ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_charge;
+
        ah = pd->device->create_ah(pd, &attr, &udata);
 
        if (IS_ERR(ah)) {
                ret = PTR_ERR(ah);
-               goto err_put;
+               goto err_create;
        }
 
        ah->device  = pd->device;
@@ -3013,7 +3078,10 @@ err_copy:
 err_destroy:
        ib_destroy_ah(ah);
 
-err_put:
+err_create:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
        put_pd_read(pd);
 
 err:
@@ -3047,6 +3115,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 
        mutex_lock(&file->mutex);
@@ -3861,10 +3931,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
                err = -EINVAL;
                goto err_free;
        }
+
+       err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (err)
+               goto err_free;
+
        flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
        if (IS_ERR(flow_id)) {
                err = PTR_ERR(flow_id);
-               goto err_free;
+               goto err_create;
        }
        flow_id->uobject = uobj;
        uobj->object = flow_id;
@@ -3897,6 +3973,8 @@ err_copy:
        idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 destroy_flow:
        ib_destroy_flow(flow_id);
+err_create:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
 err_free:
        kfree(flow_attr);
 err_put:
@@ -3936,8 +4014,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
        flow_id = uobj->object;
 
        ret = ib_destroy_flow(flow_id);
-       if (!ret)
+       if (!ret) {
+               ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                uobj->live = 0;
+       }
 
        put_uobj_write(uobj);
 
@@ -4005,6 +4086,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
        obj->uevent.events_reported = 0;
        INIT_LIST_HEAD(&obj->uevent.event_list);
 
+       ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_put_cq;
+
        srq = pd->device->create_srq(pd, &attr, udata);
        if (IS_ERR(srq)) {
                ret = PTR_ERR(srq);
@@ -4069,6 +4155,8 @@ err_destroy:
        ib_destroy_srq(srq);
 
 err_put:
+       ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
+                          RDMACG_RESOURCE_HCA_OBJECT);
        put_pd_read(pd);
 
 err_put_cq:
@@ -4255,6 +4343,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        if (srq_type == IB_SRQT_XRC) {
                us = container_of(obj, struct ib_usrq_object, uevent);
                atomic_dec(&us->uxrcd->refcnt);
index e3fb4b1..35c788a 100644 (file)
@@ -51,6 +51,7 @@
 #include <rdma/ib.h>
 
 #include "uverbs.h"
+#include "core_priv.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand userspace verbs access");
@@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
                ib_destroy_ah(ah);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
@@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
                uverbs_dealloc_mw(mw);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
@@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
                ib_destroy_flow(flow_id);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
@@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
                if (qp == qp->real_qp)
                        ib_uverbs_detach_umcast(qp, uqp);
                ib_destroy_qp(qp);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                ib_uverbs_release_uevent(file, &uqp->uevent);
                kfree(uqp);
        }
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
                ib_destroy_srq(srq);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                ib_uverbs_release_uevent(file, uevent);
                kfree(uevent);
        }
@@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
                ib_destroy_cq(cq);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                ib_uverbs_release_ucq(file, ev_file, ucq);
                kfree(ucq);
        }
@@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
                ib_dereg_mr(mr);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
@@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
                ib_dealloc_pd(pd);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
        put_pid(context->tgid);
 
+       ib_rdmacg_uncharge(&context->cg_obj, context->device,
+                          RDMACG_RESOURCE_HCA_HANDLE);
+
        return context->device->dealloc_ucontext(context);
 }
 
index f460339..3b19c16 100644 (file)
@@ -185,7 +185,7 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
        if (fd) {
                fd->rec_cpu_num = -1; /* no cpu affinity by default */
                fd->mm = current->mm;
-               atomic_inc(&fd->mm->mm_count);
+               mmgrab(fd->mm);
                fp->private_data = fd;
        } else {
                fp->private_data = NULL;
index 92399d3..06de1cb 100644 (file)
@@ -707,7 +707,7 @@ static void qib_6120_clear_freeze(struct qib_devdata *dd)
        /* disable error interrupts, to avoid confusion */
        qib_write_kreg(dd, kr_errmask, 0ULL);
 
-       /* also disable interrupts; errormask is sometimes overwriten */
+       /* also disable interrupts; errormask is sometimes overwritten */
        qib_6120_set_intr_state(dd, 0);
 
        qib_cancel_sends(dd->pport);
index e55e31a..55a1838 100644 (file)
@@ -1259,7 +1259,7 @@ static void qib_7220_clear_freeze(struct qib_devdata *dd)
        /* disable error interrupts, to avoid confusion */
        qib_write_kreg(dd, kr_errmask, 0ULL);
 
-       /* also disable interrupts; errormask is sometimes overwriten */
+       /* also disable interrupts; errormask is sometimes overwritten */
        qib_7220_set_intr_state(dd, 0);
 
        qib_cancel_sends(dd->pport);
index 9cc97bd..12c4208 100644 (file)
@@ -2053,7 +2053,7 @@ static void qib_7322_clear_freeze(struct qib_devdata *dd)
                        qib_write_kreg_port(dd->pport + pidx, krp_errmask,
                                            0ULL);
 
-       /* also disable interrupts; errormask is sometimes overwriten */
+       /* also disable interrupts; errormask is sometimes overwritten */
        qib_7322_set_intr_state(dd, 0);
 
        /* clear the freeze, and be sure chip saw it */
index f6e9977..bba241f 100644 (file)
@@ -74,9 +74,9 @@ int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
                    u16 *out_mad_pkey_index)
 {
        /*
-        * MAD processing is quite different between hfi1 and qib. Therfore this
-        * is expected to be provided by the driver. Other drivers in the future
-        * may chose to implement this but it should not be made into a
+        * MAD processing is quite different between hfi1 and qib. Therefore
+        * this is expected to be provided by the driver. Other drivers in the
+        * future may choose to implement this but it should not be made into a
         * requirement.
         */
        if (ibport_num_to_idx(ibdev, port_num) < 0)
index 44deca8..beaf61c 100644 (file)
@@ -202,7 +202,7 @@ static int cyttsp4_si_get_cydata(struct cyttsp4 *cd)
        int rc;
 
        si->si_ofs.cydata_size = si->si_ofs.test_ofs - si->si_ofs.cydata_ofs;
-       dev_dbg(cd->dev, "%s: cydata size: %Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: cydata size: %zd\n", __func__,
                        si->si_ofs.cydata_size);
 
        p = krealloc(si->si_ptrs.cydata, si->si_ofs.cydata_size, GFP_KERNEL);
@@ -430,13 +430,13 @@ static int cyttsp4_si_get_opcfg_data(struct cyttsp4 *cd)
        for (abs = 0; abs < CY_TCH_NUM_ABS; abs++) {
                dev_dbg(cd->dev, "%s: tch_rec_%s\n", __func__,
                        cyttsp4_tch_abs_string[abs]);
-               dev_dbg(cd->dev, "%s:     ofs =%2Zd\n", __func__,
+               dev_dbg(cd->dev, "%s:     ofs =%2zd\n", __func__,
                        si->si_ofs.tch_abs[abs].ofs);
-               dev_dbg(cd->dev, "%s:     siz =%2Zd\n", __func__,
+               dev_dbg(cd->dev, "%s:     siz =%2zd\n", __func__,
                        si->si_ofs.tch_abs[abs].size);
-               dev_dbg(cd->dev, "%s:     max =%2Zd\n", __func__,
+               dev_dbg(cd->dev, "%s:     max =%2zd\n", __func__,
                        si->si_ofs.tch_abs[abs].max);
-               dev_dbg(cd->dev, "%s:     bofs=%2Zd\n", __func__,
+               dev_dbg(cd->dev, "%s:     bofs=%2zd\n", __func__,
                        si->si_ofs.tch_abs[abs].bofs);
        }
 
@@ -586,62 +586,62 @@ static int cyttsp4_si_get_op_data_ptrs(struct cyttsp4 *cd)
 static void cyttsp4_si_put_log_data(struct cyttsp4 *cd)
 {
        struct cyttsp4_sysinfo *si = &cd->sysinfo;
-       dev_dbg(cd->dev, "%s: cydata_ofs =%4Zd siz=%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: cydata_ofs =%4zd siz=%4zd\n", __func__,
                si->si_ofs.cydata_ofs, si->si_ofs.cydata_size);
-       dev_dbg(cd->dev, "%s: test_ofs   =%4Zd siz=%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: test_ofs   =%4zd siz=%4zd\n", __func__,
                si->si_ofs.test_ofs, si->si_ofs.test_size);
-       dev_dbg(cd->dev, "%s: pcfg_ofs   =%4Zd siz=%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: pcfg_ofs   =%4zd siz=%4zd\n", __func__,
                si->si_ofs.pcfg_ofs, si->si_ofs.pcfg_size);
-       dev_dbg(cd->dev, "%s: opcfg_ofs  =%4Zd siz=%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: opcfg_ofs  =%4zd siz=%4zd\n", __func__,
                si->si_ofs.opcfg_ofs, si->si_ofs.opcfg_size);
-       dev_dbg(cd->dev, "%s: ddata_ofs  =%4Zd siz=%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: ddata_ofs  =%4zd siz=%4zd\n", __func__,
                si->si_ofs.ddata_ofs, si->si_ofs.ddata_size);
-       dev_dbg(cd->dev, "%s: mdata_ofs  =%4Zd siz=%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: mdata_ofs  =%4zd siz=%4zd\n", __func__,
                si->si_ofs.mdata_ofs, si->si_ofs.mdata_size);
 
-       dev_dbg(cd->dev, "%s: cmd_ofs       =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: cmd_ofs       =%4zd\n", __func__,
                si->si_ofs.cmd_ofs);
-       dev_dbg(cd->dev, "%s: rep_ofs       =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: rep_ofs       =%4zd\n", __func__,
                si->si_ofs.rep_ofs);
-       dev_dbg(cd->dev, "%s: rep_sz        =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: rep_sz        =%4zd\n", __func__,
                si->si_ofs.rep_sz);
-       dev_dbg(cd->dev, "%s: num_btns      =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: num_btns      =%4zd\n", __func__,
                si->si_ofs.num_btns);
-       dev_dbg(cd->dev, "%s: num_btn_regs  =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: num_btn_regs  =%4zd\n", __func__,
                si->si_ofs.num_btn_regs);
-       dev_dbg(cd->dev, "%s: tt_stat_ofs   =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: tt_stat_ofs   =%4zd\n", __func__,
                si->si_ofs.tt_stat_ofs);
-       dev_dbg(cd->dev, "%s: tch_rec_size  =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: tch_rec_size  =%4zd\n", __func__,
                si->si_ofs.tch_rec_size);
-       dev_dbg(cd->dev, "%s: max_tchs      =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: max_tchs      =%4zd\n", __func__,
                si->si_ofs.max_tchs);
-       dev_dbg(cd->dev, "%s: mode_size     =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: mode_size     =%4zd\n", __func__,
                si->si_ofs.mode_size);
-       dev_dbg(cd->dev, "%s: data_size     =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: data_size     =%4zd\n", __func__,
                si->si_ofs.data_size);
-       dev_dbg(cd->dev, "%s: map_sz        =%4Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: map_sz        =%4zd\n", __func__,
                si->si_ofs.map_sz);
 
-       dev_dbg(cd->dev, "%s: btn_rec_size   =%2Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: btn_rec_size   =%2zd\n", __func__,
                si->si_ofs.btn_rec_size);
-       dev_dbg(cd->dev, "%s: btn_diff_ofs   =%2Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: btn_diff_ofs   =%2zd\n", __func__,
                si->si_ofs.btn_diff_ofs);
-       dev_dbg(cd->dev, "%s: btn_diff_size  =%2Zd\n", __func__,
+       dev_dbg(cd->dev, "%s: btn_diff_size  =%2zd\n", __func__,
                si->si_ofs.btn_diff_size);
 
-       dev_dbg(cd->dev, "%s: max_x    = 0x%04ZX (%Zd)\n", __func__,
+       dev_dbg(cd->dev, "%s: max_x    = 0x%04zX (%zd)\n", __func__,
                si->si_ofs.max_x, si->si_ofs.max_x);
-       dev_dbg(cd->dev, "%s: x_origin = %Zd (%s)\n", __func__,
+       dev_dbg(cd->dev, "%s: x_origin = %zd (%s)\n", __func__,
                si->si_ofs.x_origin,
                si->si_ofs.x_origin == CY_NORMAL_ORIGIN ?
                "left corner" : "right corner");
-       dev_dbg(cd->dev, "%s: max_y    = 0x%04ZX (%Zd)\n", __func__,
+       dev_dbg(cd->dev, "%s: max_y    = 0x%04zX (%zd)\n", __func__,
                si->si_ofs.max_y, si->si_ofs.max_y);
-       dev_dbg(cd->dev, "%s: y_origin = %Zd (%s)\n", __func__,
+       dev_dbg(cd->dev, "%s: y_origin = %zd (%s)\n", __func__,
                si->si_ofs.y_origin,
                si->si_ofs.y_origin == CY_NORMAL_ORIGIN ?
                "upper corner" : "lower corner");
-       dev_dbg(cd->dev, "%s: max_p    = 0x%04ZX (%Zd)\n", __func__,
+       dev_dbg(cd->dev, "%s: max_p    = 0x%04zX (%zd)\n", __func__,
                si->si_ofs.max_p, si->si_ofs.max_p);
 
        dev_dbg(cd->dev, "%s: xy_mode=%p xy_data=%p\n", __func__,
@@ -1000,7 +1000,7 @@ static int cyttsp4_xy_worker(struct cyttsp4 *cd)
                dev_dbg(dev, "%s: Large area detected\n", __func__);
 
        if (num_cur_tch > si->si_ofs.max_tchs) {
-               dev_err(dev, "%s: too many tch; set to max tch (n=%d c=%Zd)\n",
+               dev_err(dev, "%s: too many tch; set to max tch (n=%d c=%zd)\n",
                                __func__, num_cur_tch, si->si_ofs.max_tchs);
                num_cur_tch = si->si_ofs.max_tchs;
        }
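
Editor's note: the %Zd/%Zu churn in this driver (and in several hunks below) is a pure format-string cleanup: the uppercase Z length modifier for size_t is a nonstandard extension, while C99 standardizes lowercase z. A trivial userspace check, with the old spelling left commented out since -Wformat warns on it:

#include <stdio.h>

int main(void)
{
        size_t n = sizeof(long);

        printf("size = %zu bytes\n", n);        /* standard C99 modifier */
        /* printf("size = %Zu bytes\n", n);        nonstandard: -Wformat warning */
        return 0;
}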
index 04cdac7..6130278 100644 (file)
@@ -1507,7 +1507,7 @@ static ssize_t amd_iommu_show_cap(struct device *dev,
                                  struct device_attribute *attr,
                                  char *buf)
 {
-       struct amd_iommu *iommu = dev_get_drvdata(dev);
+       struct amd_iommu *iommu = dev_to_amd_iommu(dev);
        return sprintf(buf, "%x\n", iommu->cap);
 }
 static DEVICE_ATTR(cap, S_IRUGO, amd_iommu_show_cap, NULL);
@@ -1516,7 +1516,7 @@ static ssize_t amd_iommu_show_features(struct device *dev,
                                       struct device_attribute *attr,
                                       char *buf)
 {
-       struct amd_iommu *iommu = dev_get_drvdata(dev);
+       struct amd_iommu *iommu = dev_to_amd_iommu(dev);
        return sprintf(buf, "%llx\n", iommu->features);
 }
 static DEVICE_ATTR(features, S_IRUGO, amd_iommu_show_features, NULL);
index af00f38..003f3ce 100644 (file)
@@ -569,6 +569,11 @@ struct amd_iommu {
        volatile u64 __aligned(8) cmd_sem;
 };
 
+static inline struct amd_iommu *dev_to_amd_iommu(struct device *dev)
+{
+       return container_of(dev, struct amd_iommu, iommu.dev);
+}
+
 #define ACPIHID_UID_LEN 256
 #define ACPIHID_HID_LEN 9
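
Editor's note: both this AMD hunk and the Intel one below replace dev_get_drvdata() with a container_of() helper that maps the sysfs struct device embedded in the iommu structure back to its container. A self-contained model of the pattern; the types here are stand-ins for the kernel's:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct device { const char *name; };
struct iommu_device { struct device dev; };

struct amd_iommu_model {
        unsigned long cap;
        struct iommu_device iommu;      /* embedded sysfs device */
};

static struct amd_iommu_model *dev_to_iommu(struct device *dev)
{
        return container_of(dev, struct amd_iommu_model, iommu.dev);
}

int main(void)
{
        struct amd_iommu_model m = { .cap = 0x1234, .iommu.dev.name = "iommu0" };
        struct device *dev = &m.iommu.dev;      /* what a sysfs show() receives */

        printf("cap=%lx via %s\n", dev_to_iommu(dev)->cap, dev->name);
        return 0;
}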
 
index a8f7ae0..238ad34 100644 (file)
@@ -4730,11 +4730,16 @@ static int intel_iommu_cpu_dead(unsigned int cpu)
        return 0;
 }
 
+static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
+{
+       return container_of(dev, struct intel_iommu, iommu.dev);
+}
+
 static ssize_t intel_iommu_show_version(struct device *dev,
                                        struct device_attribute *attr,
                                        char *buf)
 {
-       struct intel_iommu *iommu = dev_get_drvdata(dev);
+       struct intel_iommu *iommu = dev_to_intel_iommu(dev);
        u32 ver = readl(iommu->reg + DMAR_VER_REG);
        return sprintf(buf, "%d:%d\n",
                       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
@@ -4745,7 +4750,7 @@ static ssize_t intel_iommu_show_address(struct device *dev,
                                        struct device_attribute *attr,
                                        char *buf)
 {
-       struct intel_iommu *iommu = dev_get_drvdata(dev);
+       struct intel_iommu *iommu = dev_to_intel_iommu(dev);
        return sprintf(buf, "%llx\n", iommu->reg_phys);
 }
 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
@@ -4754,7 +4759,7 @@ static ssize_t intel_iommu_show_cap(struct device *dev,
                                    struct device_attribute *attr,
                                    char *buf)
 {
-       struct intel_iommu *iommu = dev_get_drvdata(dev);
+       struct intel_iommu *iommu = dev_to_intel_iommu(dev);
        return sprintf(buf, "%llx\n", iommu->cap);
 }
 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
@@ -4763,7 +4768,7 @@ static ssize_t intel_iommu_show_ecap(struct device *dev,
                                    struct device_attribute *attr,
                                    char *buf)
 {
-       struct intel_iommu *iommu = dev_get_drvdata(dev);
+       struct intel_iommu *iommu = dev_to_intel_iommu(dev);
        return sprintf(buf, "%llx\n", iommu->ecap);
 }
 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
@@ -4772,7 +4777,7 @@ static ssize_t intel_iommu_show_ndoms(struct device *dev,
                                      struct device_attribute *attr,
                                      char *buf)
 {
-       struct intel_iommu *iommu = dev_get_drvdata(dev);
+       struct intel_iommu *iommu = dev_to_intel_iommu(dev);
        return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
 }
 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
@@ -4781,7 +4786,7 @@ static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
                                           struct device_attribute *attr,
                                           char *buf)
 {
-       struct intel_iommu *iommu = dev_get_drvdata(dev);
+       struct intel_iommu *iommu = dev_to_intel_iommu(dev);
        return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
                                                  cap_ndoms(iommu->cap)));
 }
index cb72e00..51f2b22 100644 (file)
@@ -579,7 +579,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
                if (!svm->mm)
                        goto bad_req;
                /* If the mm is already defunct, don't handle faults. */
-               if (!atomic_inc_not_zero(&svm->mm->mm_users))
+               if (!mmget_not_zero(svm->mm))
                        goto bad_req;
                down_read(&svm->mm->mmap_sem);
                vma = find_extend_vma(svm->mm, address);
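
Editor's note: mmget_not_zero() is the counterpart helper to mmgrab() above; it takes an mm_users reference only if the address space is still alive, which is exactly the "mm already defunct" guard this fault path wants. A userspace sketch using C11 atomics in place of the kernel's:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int mm_users = 1;

static bool mmget_not_zero(void)
{
        int users = atomic_load(&mm_users);

        /* take a reference only while the count hasn't hit zero */
        while (users != 0)
                if (atomic_compare_exchange_weak(&mm_users, &users, users + 1))
                        return true;
        return false;   /* mm already defunct: don't handle the fault */
}

int main(void)
{
        printf("live: %d\n", mmget_not_zero()); /* 1: got a reference */
        atomic_store(&mm_users, 0);
        printf("dead: %d\n", mmget_not_zero()); /* 0: bail out */
        return 0;
}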
index 576b7b4..8bc2791 100644 (file)
@@ -2049,7 +2049,7 @@ static int diva_dbg_cmp_key(const char *ref, const char *key) {
 /*
   In case trace filter starts with "C" character then
   all following characters are interpreted as command.
-  Followings commands are available:
+  Following commands are available:
   - single, trace single call at time, independent from CPN/CiPN
 */
 static int diva_mnt_cmp_nmbr(const char *nmbr) {
index 8d338ba..77dec28 100644 (file)
@@ -1625,7 +1625,7 @@ mISDNipac_init(struct ipac_hw *ipac, void *hw)
                ipac->hscx[i].bch.hw = hw;
                ipac->hscx[i].ip = ipac;
                /* default values for IOM time slots
-                * can be overwriten by card */
+                * can be overwritten by card */
                ipac->hscx[i].slot = (i == 0) ? 0x2f : 0x03;
        }
 
index 0222b1a..9b85295 100644 (file)
  *
  * The CMX has special functions for conferences with one, two and more
  * members. It will allow different types of data flow. Receive and transmit
- * data to/form upper layer may be swithed on/off individually without losing
+ * data to/from upper layer may be switched on/off individually without losing
  * features of CMX, Tones and DTMF.
  *
  * Echo Cancellation: Sometimes we like to cancel echo from the interface.
index bbe9487..8ed6bcc 100644 (file)
@@ -136,7 +136,7 @@ extern void dvb_ringbuffer_flush_spinlock_wakeup(struct dvb_ringbuffer *rbuf);
 }
 
 /**
- * dvb_ringbuffer_read_user - Reads a buffer into an user pointer
+ * dvb_ringbuffer_read_user - Reads a buffer into a user pointer
  *
  * @rbuf: pointer to struct dvb_ringbuffer
  * @buf: pointer to the buffer where the data will be stored
@@ -193,7 +193,7 @@ extern ssize_t dvb_ringbuffer_write(struct dvb_ringbuffer *rbuf, const u8 *buf,
                                    size_t len);
 
 /**
- * dvb_ringbuffer_write_user - Writes a buffer received via an user pointer
+ * dvb_ringbuffer_write_user - Writes a buffer received via a user pointer
  *
  * @rbuf: pointer to struct dvb_ringbuffer
  * @buf: pointer to the buffer where the data will be read
index 9076bf2..7a681d8 100644 (file)
@@ -1317,9 +1317,9 @@ struct drx_version_list {
                DRX_MPEG_STR_WIDTH_8
        };
 
-/* CTRL CFG MPEG ouput */
+/* CTRL CFG MPEG output */
 /**
-* \struct struct drx_cfg_mpeg_output * \brief Configuartion parameters for MPEG output control.
+* \struct struct drx_cfg_mpeg_output * \brief Configuration parameters for MPEG output control.
 *
 * Used by DRX_CFG_MPEG_OUTPUT, in combination with DRX_CTRL_SET_CFG and
 * DRX_CTRL_GET_CFG.
index f1c3e3b..daeaf96 100644 (file)
@@ -601,7 +601,7 @@ static struct drxj_data drxj_data_g = {
        0,                      /* hi_cfg_wake_up_key    */
        0,                      /* hi_cfg_ctrl         */
        0,                      /* HICfgTimeout      */
-       /* UIO configuartion */
+       /* UIO configuration */
        DRX_UIO_MODE_DISABLE,   /* uio_sma_rx_mode      */
        DRX_UIO_MODE_DISABLE,   /* uio_sma_tx_mode      */
        DRX_UIO_MODE_DISABLE,   /* uioASELMode       */
@@ -619,7 +619,7 @@ static struct drxj_data drxj_data_g = {
 /*   false,                  * flagHDevSet       */
 /*   (u16) 0xFFF,          * rdsLastCount      */
 
-       /* ATV configuartion */
+       /* ATV configuration */
        0UL,                    /* flags cfg changes */
        /* shadow of ATV_TOP_EQU0__A */
        {-5,
@@ -3352,7 +3352,7 @@ rw_error:
 /*----------------------------------------------------------------------------*/
 
 /*----------------------------------------------------------------------------*/
-/* miscellaneous configuartions - begin                           */
+/* miscellaneous configurations - begin                           */
 /*----------------------------------------------------------------------------*/
 
 /**
@@ -3515,7 +3515,7 @@ rw_error:
 }
 
 /*----------------------------------------------------------------------------*/
-/* miscellaneous configuartions - end                             */
+/* miscellaneous configurations - end                             */
 /*----------------------------------------------------------------------------*/
 
 /*----------------------------------------------------------------------------*/
@@ -10952,7 +10952,7 @@ rw_error:
 
 static void drxj_reset_mode(struct drxj_data *ext_attr)
 {
-       /* Initialize default AFE configuartion for QAM */
+       /* Initialize default AFE configuration for QAM */
        if (ext_attr->has_lna) {
                /* IF AGC off, PGA active */
 #ifndef DRXJ_VSB_ONLY
@@ -10996,7 +10996,7 @@ static void drxj_reset_mode(struct drxj_data *ext_attr)
        ext_attr->qam_pre_saw_cfg.reference = 0x07;
        ext_attr->qam_pre_saw_cfg.use_pre_saw = true;
 #endif
-       /* Initialize default AFE configuartion for VSB */
+       /* Initialize default AFE configuration for VSB */
        ext_attr->vsb_rf_agc_cfg.standard = DRX_STANDARD_8VSB;
        ext_attr->vsb_rf_agc_cfg.ctrl_mode = DRX_AGC_CTRL_AUTO;
        ext_attr->vsb_rf_agc_cfg.min_output_level = 0;
@@ -11072,9 +11072,9 @@ ctrl_power_mode(struct drx_demod_instance *demod, enum drx_power_mode *mode)
        }
 
        if ((*mode == DRX_POWER_UP)) {
-               /* Restore analog & pin configuartion */
+               /* Restore analog & pin configuration */
 
-               /* Initialize default AFE configuartion for VSB */
+               /* Initialize default AFE configuration for VSB */
                drxj_reset_mode(ext_attr);
        } else {
                /* Power down to requested mode */
index 55ad535..6c5b8f7 100644 (file)
@@ -447,7 +447,7 @@ struct drxj_cfg_atv_output {
                u16 hi_cfg_ctrl;          /**< HI Configure() parameter 5                       */
                u16 hi_cfg_transmit;      /**< HI Configure() parameter 6                       */
 
-               /* UIO configuartion */
+               /* UIO configuration */
                enum drxuio_mode uio_sma_rx_mode;/**< current mode of SmaRx pin                        */
                enum drxuio_mode uio_sma_tx_mode;/**< current mode of SmaTx pin                        */
                enum drxuio_mode uio_gpio_mode; /**< current mode of ASEL pin                         */
@@ -459,7 +459,7 @@ struct drxj_cfg_atv_output {
                /* IQM RC frequecy shift */
                u32 iqm_rc_rate_ofs;       /**< frequency shifter setting after setchannel      */
 
-               /* ATV configuartion */
+               /* ATV configuration */
                u32 atv_cfg_changed_flags; /**< flag: flags cfg changes */
                s16 atv_top_equ0[DRXJ_COEF_IDX_MAX];         /**< shadow of ATV_TOP_EQU0__A */
                s16 atv_top_equ1[DRXJ_COEF_IDX_MAX];         /**< shadow of ATV_TOP_EQU1__A */
index 15d2cac..7e1bbba 100644 (file)
@@ -1626,7 +1626,7 @@ static int ctrl_power_mode(struct drxk_state *state, enum drx_power_mode *mode)
        }
 
        if (*mode == DRX_POWER_UP) {
-               /* Restore analog & pin configuartion */
+               /* Restore analog & pin configuration */
        } else {
                /* Power down to requested mode */
                /* Backup some register settings */
index ef35c2b..4bf5a55 100644 (file)
@@ -309,7 +309,7 @@ static int helene_write_regs(struct helene_priv *priv,
 
        if (len + 1 > sizeof(buf)) {
                dev_warn(&priv->i2c->dev,
-                               "wr reg=%04x: len=%d vs %Zu is too big!\n",
+                               "wr reg=%04x: len=%d vs %zu is too big!\n",
                                reg, len + 1, sizeof(buf));
                return -E2BIG;
        }
index 4b67d7e..62aa007 100644 (file)
@@ -133,7 +133,7 @@ static int or51132_load_firmware (struct dvb_frontend* fe, const struct firmware
        u32 firmwareAsize, firmwareBsize;
        int i,ret;
 
-       dprintk("Firmware is %Zd bytes\n",fw->size);
+       dprintk("Firmware is %zd bytes\n",fw->size);
 
        /* Get size of firmware A and B */
        firmwareAsize = le32_to_cpu(*((__le32*)fw->data));
index 92ab34c..143b39b 100644 (file)
@@ -499,7 +499,7 @@ static int tda10048_firmware_upload(struct dvb_frontend *fe)
                        __func__);
                return -EIO;
        } else {
-               printk(KERN_INFO "%s: firmware read %Zu bytes.\n",
+               printk(KERN_INFO "%s: firmware read %zu bytes.\n",
                        __func__,
                        fw->size);
                ret = 0;
index 843d499..4ade89d 100644 (file)
@@ -83,7 +83,7 @@
 #define ADV7183_LETTERBOX_3        0x9D /* Letterbox 3 */
 #define ADV7183_CRC_EN             0xB2 /* CRC enable */
 #define ADV7183_ADC_SWITCH_1       0xC3 /* ADC switch 1 */
-#define ADV7183_ADC_SWITCH_2       0xC4 /* ADC swithc 2 */
+#define ADV7183_ADC_SWITCH_2       0xC4 /* ADC switch 2 */
 #define ADV7183_LETTERBOX_CTRL_1   0xDC /* Letterbox control 1 */
 #define ADV7183_LETTERBOX_CTRL_2   0xDD /* Letterbox control 2 */
 #define ADV7183_SD_OFFSET_CB       0xE1 /* SD offset Cb */
index 4ba5ead..ef49064 100644 (file)
@@ -422,7 +422,7 @@ int saa7164_downloadfirmware(struct saa7164_dev *dev)
                        return -ENOMEM;
                }
 
-               printk(KERN_INFO "%s() firmware read %Zu bytes.\n",
+               printk(KERN_INFO "%s() firmware read %zu bytes.\n",
                        __func__, fw->size);
 
                if (fw->size != fwlength) {
index 5615fef..c0373ae 100644 (file)
@@ -358,7 +358,7 @@ struct fimc_pix_limit {
  * @pix_limit: pixel size constraints for the scaler
  * @min_inp_pixsize: minimum input pixel size
  * @min_out_pixsize: minimum output pixel size
- * @hor_offs_align: horizontal pixel offset aligment
+ * @hor_offs_align: horizontal pixel offset alignment
  * @min_vsize_align: minimum vertical pixel size alignment
  */
 struct fimc_variant {
index 0345b27..91947cf 100644 (file)
@@ -1144,7 +1144,7 @@ static int xc_load_fw_and_init_tuner(struct dvb_frontend *fe, int force)
                        pr_err("xc5000: Upload failed. rc %d\n", ret);
                        return ret;
                }
-               dprintk(1, "firmware read %Zu bytes.\n", fw->size);
+               dprintk(1, "firmware read %zu bytes.\n", fw->size);
 
                if (fw->size != desired_fw->size) {
                        pr_err("xc5000: Firmware file with incorrect size\n");
index 81d7fd4..85ab3fa 100644 (file)
@@ -2414,7 +2414,7 @@ static int stk9090m_frontend_attach(struct dvb_usb_adapter *adap)
                deb_info("%s: Upload failed. (file not found?)\n", __func__);
                return -ENODEV;
        } else {
-               deb_info("%s: firmware read %Zu bytes.\n", __func__, state->frontend_firmware->size);
+               deb_info("%s: firmware read %zu bytes.\n", __func__, state->frontend_firmware->size);
        }
        stk9090m_config.microcode_B_fe_size = state->frontend_firmware->size;
        stk9090m_config.microcode_B_fe_buffer = state->frontend_firmware->data;
@@ -2480,7 +2480,7 @@ static int nim9090md_frontend_attach(struct dvb_usb_adapter *adap)
                deb_info("%s: Upload failed. (file not found?)\n", __func__);
                return -EIO;
        } else {
-               deb_info("%s: firmware read %Zu bytes.\n", __func__, state->frontend_firmware->size);
+               deb_info("%s: firmware read %zu bytes.\n", __func__, state->frontend_firmware->size);
        }
        nim9090md_config[0].microcode_B_fe_size = state->frontend_firmware->size;
        nim9090md_config[0].microcode_B_fe_buffer = state->frontend_firmware->data;
index 4266771..46fb763 100644 (file)
@@ -570,9 +570,9 @@ static void setfreq(struct gspca_dev *gspca_dev, s32 val)
 /* this function is called at probe and resume time */
 static int sd_init(struct gspca_dev *gspca_dev)
 {
-       /* some of this registers are not really neded, because
-        * they are overriden by setbrigthness, setcontrast, etc,
-        * but wont hurt anyway, and can help someone with similar webcam
+       /* some of these registers are not really needed, because
+        * they are overridden by setbrightness, setcontrast, etc.,
+        * but won't hurt anyway, and can help someone with a similar webcam
+        * to see the initial parameters. */
        struct sd *sd = (struct sd *) gspca_dev;
        const struct additional_sensor_data *sensor;
index 4afd465..39c15bb 100644 (file)
@@ -438,7 +438,7 @@ int tm6000_ir_init(struct tm6000_core *dev)
 
        /* input setup */
        rc->allowed_protocols = RC_BIT_RC5 | RC_BIT_NEC;
-       /* Neded, in order to support NEC remotes with 24 or 32 bits */
+       /* Needed, in order to support NEC remotes with 24 or 32 bits */
        rc->scancode_mask = 0xffff;
        rc->priv = ir;
        rc->change_protocol = tm6000_ir_change_protocol;
index 05b5c66..e48b7c0 100644 (file)
@@ -245,7 +245,7 @@ static const struct analog_demod_ops tuner_analog_ops = {
  * @tuner_callback:    an optional function to be called when switching
  *                     to analog mode
  *
- * This function applys the tuner config to tuner specified
+ * This function applies the tuner config to the tuner specified
  * by tun_setup structure. It contains several per-tuner initialization "magic"
  */
 static void set_type(struct i2c_client *c, unsigned int type,
@@ -463,7 +463,7 @@ attach_failed:
  * @sd:                subdev descriptor
  * @tun_setup: type to be associated to a given tuner i2c address
  *
- * This function applys the tuner config to tuner specified
+ * This function applies the tuner config to the tuner specified
  * by tun_setup structure.
  * If tuner I2C address is UNSET, then it will only set the device
  * if the tuner supports the mode specified in the call.
index f866a4b..f35f0c8 100644 (file)
@@ -303,7 +303,7 @@ int vmci_ctx_enqueue_datagram(u32 cid, struct vmci_datagram *dg)
 
        vmci_dg_size = VMCI_DG_SIZE(dg);
        if (vmci_dg_size > VMCI_MAX_DG_SIZE) {
-               pr_devel("Datagram too large (bytes=%Zu)\n", vmci_dg_size);
+               pr_devel("Datagram too large (bytes=%zu)\n", vmci_dg_size);
                return VMCI_ERROR_INVALID_ARGS;
        }
 
index f84a427..498c085 100644 (file)
@@ -2928,7 +2928,7 @@ int vmci_qpair_get_produce_indexes(const struct vmci_qp *qpair,
 EXPORT_SYMBOL_GPL(vmci_qpair_get_produce_indexes);
 
 /*
- * vmci_qpair_get_consume_indexes() - Retrieves the indexes of the comsumer.
+ * vmci_qpair_get_consume_indexes() - Retrieves the indexes of the consumer.
  * @qpair:      Pointer to the queue pair struct.
  * @consumer_tail:      Reference used for storing consumer tail index.
  * @producer_head:      Reference used for storing the producer head index.
index 2b7fc37..00750c9 100644 (file)
@@ -170,7 +170,7 @@ int dml_hw_init(struct mmci_host *host, struct device_node *np)
        writel_relaxed(producer_id | (consumer_id << CONSUMER_PIPE_ID_SHFT),
                       base + DML_PIPE_ID);
 
-       /* Make sure dml intialization is finished */
+       /* Make sure dml initialization is finished */
        mb();
 
        return 0;
index 82bd00a..268aae4 100644 (file)
@@ -75,18 +75,18 @@ static char module_name[] = "lart";
 
 /* blob */
 #define NUM_BLOB_BLOCKS                FLASH_NUMBLOCKS_16m_PARAM
-#define BLOB_START                     0x00000000
-#define BLOB_LEN                       (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM)
+#define PART_BLOB_START                0x00000000
+#define PART_BLOB_LEN          (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM)
 
 /* kernel */
 #define NUM_KERNEL_BLOCKS      7
-#define KERNEL_START           (BLOB_START + BLOB_LEN)
-#define KERNEL_LEN                     (NUM_KERNEL_BLOCKS * FLASH_BLOCKSIZE_MAIN)
+#define PART_KERNEL_START      (PART_BLOB_START + PART_BLOB_LEN)
+#define PART_KERNEL_LEN                (NUM_KERNEL_BLOCKS * FLASH_BLOCKSIZE_MAIN)
 
 /* initial ramdisk */
 #define NUM_INITRD_BLOCKS      24
-#define INITRD_START           (KERNEL_START + KERNEL_LEN)
-#define INITRD_LEN                     (NUM_INITRD_BLOCKS * FLASH_BLOCKSIZE_MAIN)
+#define PART_INITRD_START      (PART_KERNEL_START + PART_KERNEL_LEN)
+#define PART_INITRD_LEN                (NUM_INITRD_BLOCKS * FLASH_BLOCKSIZE_MAIN)
 
 /*
  * See section 4.0 in "3 Volt Fast Boot Block Flash Memory" Intel Datasheet
@@ -587,20 +587,20 @@ static struct mtd_partition lart_partitions[] = {
        /* blob */
        {
                .name   = "blob",
-               .offset = BLOB_START,
-               .size   = BLOB_LEN,
+               .offset = PART_BLOB_START,
+               .size   = PART_BLOB_LEN,
        },
        /* kernel */
        {
                .name   = "kernel",
-               .offset = KERNEL_START,         /* MTDPART_OFS_APPEND */
-               .size   = KERNEL_LEN,
+               .offset = PART_KERNEL_START,    /* MTDPART_OFS_APPEND */
+               .size   = PART_KERNEL_LEN,
        },
        /* initial ramdisk / file system */
        {
                .name   = "file system",
-               .offset = INITRD_START,         /* MTDPART_OFS_APPEND */
-               .size   = INITRD_LEN,           /* MTDPART_SIZ_FULL */
+               .offset = PART_INITRD_START,    /* MTDPART_OFS_APPEND */
+               .size   = PART_INITRD_LEN,      /* MTDPART_SIZ_FULL */
        }
 };
 #define NUM_PARTITIONS ARRAY_SIZE(lart_partitions)
index 6ea963e..62ee439 100644 (file)
@@ -123,7 +123,7 @@ static int __init arcnet_init(void)
                arc_proto_map[count] = arc_proto_default;
 
        if (BUGLVL(D_DURING))
-               pr_info("struct sizes: %Zd %Zd %Zd %Zd %Zd\n",
+               pr_info("struct sizes: %zd %zd %zd %zd %zd\n",
                        sizeof(struct arc_hardware),
                        sizeof(struct arc_rfc1201),
                        sizeof(struct arc_rfc1051),
index a817313..a9ac58c 100644 (file)
@@ -1206,7 +1206,7 @@ static void bfin_mac_rx(struct bfin_mac_local *lp)
        /* reserve 2 bytes for RXDWA padding */
        skb_reserve(new_skb, NET_IP_ALIGN);
        /* Invalidate the data cache of skb->data range when it is write back
-        * cache. It will prevent overwritting the new data from DMA
+        * cache. It will prevent overwriting the new data from DMA
         */
        blackfin_dcache_invalidate_range((unsigned long)new_skb->head,
                                         (unsigned long)new_skb->end);
index d0d0d12..e536301 100644 (file)
@@ -293,36 +293,29 @@ static int xgene_enet_tx_completion(struct xgene_enet_desc_ring *cp_ring,
 static int xgene_enet_setup_mss(struct net_device *ndev, u32 mss)
 {
        struct xgene_enet_pdata *pdata = netdev_priv(ndev);
-       bool mss_index_found = false;
-       int mss_index;
+       int mss_index = -EBUSY;
        int i;
 
        spin_lock(&pdata->mss_lock);
 
        /* Reuse the slot if MSS matches */
-       for (i = 0; !mss_index_found && i < NUM_MSS_REG; i++) {
+       for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++) {
                if (pdata->mss[i] == mss) {
                        pdata->mss_refcnt[i]++;
                        mss_index = i;
-                       mss_index_found = true;
                }
        }
 
        /* Overwrite the slot with ref_count = 0 */
-       for (i = 0; !mss_index_found && i < NUM_MSS_REG; i++) {
+       for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++) {
                if (!pdata->mss_refcnt[i]) {
                        pdata->mss_refcnt[i]++;
                        pdata->mac_ops->set_mss(pdata, mss, i);
                        pdata->mss[i] = mss;
                        mss_index = i;
-                       mss_index_found = true;
                }
        }
 
-       /* No slots with ref_count = 0 available, return busy */
-       if (!mss_index_found)
-               mss_index = -EBUSY;
-
        spin_unlock(&pdata->mss_lock);
 
        return mss_index;
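
Editor's note: the xgene refactor above folds the bool/index pair into a single index preloaded with -EBUSY, so "no slot found yet" and the error return are one state. A compilable model of the resulting slot lookup; the constants are illustrative:

#include <stdio.h>

#define EBUSY           16      /* illustrative; normally from <errno.h> */
#define NUM_MSS_REG     4

static unsigned int mss_tbl[NUM_MSS_REG];
static unsigned int mss_refcnt[NUM_MSS_REG];

static int setup_mss(unsigned int mss)
{
        int mss_index = -EBUSY;
        int i;

        /* reuse a slot whose MSS already matches */
        for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++)
                if (mss_tbl[i] == mss) {
                        mss_refcnt[i]++;
                        mss_index = i;
                }

        /* otherwise claim a slot with ref_count = 0 */
        for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++)
                if (!mss_refcnt[i]) {
                        mss_refcnt[i]++;
                        mss_tbl[i] = mss;
                        mss_index = i;
                }

        return mss_index;       /* still -EBUSY if every slot is referenced */
}

int main(void)
{
        printf("%d ", setup_mss(1400));         /* claims slot 0 */
        printf("%d ", setup_mss(1400));         /* reuses slot 0 */
        printf("%d\n", setup_mss(9000));        /* claims slot 1 */
        return 0;
}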
index 0ee6e20..50d88d3 100644 (file)
@@ -817,7 +817,7 @@ static void bcm_enet_adjust_phy_link(struct net_device *dev)
                        rx_pause_en = 1;
                        tx_pause_en = 1;
                } else if (!priv->pause_auto) {
-                       /* pause setting overrided by user */
+                       /* pause setting overridden by user */
                        rx_pause_en = priv->pause_rx;
                        tx_pause_en = priv->pause_tx;
                } else {
index 05356ef..b209b7f 100644 (file)
@@ -6957,7 +6957,7 @@ int bnx2x_link_update(struct link_params *params, struct link_vars *vars)
                         * hence its link is expected to be down
                         * - SECOND_PHY means that first phy should not be able
                         * to link up by itself (using configuration)
-                        * - DEFAULT should be overriden during initialiazation
+                        * - DEFAULT should be overridden during initialization
                         */
                                DP(NETIF_MSG_LINK, "Invalid link indication"
                                           "mpc=0x%x. DISABLING LINK !!!\n",
index 016d481..30606b1 100644 (file)
@@ -1622,7 +1622,7 @@ static void macb_init_rx_buffer_size(struct macb *bp, size_t size)
                }
        }
 
-       netdev_dbg(bp->dev, "mtu [%u] rx_buffer_size [%Zu]\n",
+       netdev_dbg(bp->dev, "mtu [%u] rx_buffer_size [%zu]\n",
                   bp->dev->mtu, bp->rx_buffer_size);
 }
 
index acc2312..f6e739d 100644 (file)
@@ -1416,7 +1416,7 @@ static unsigned int xdigit2int(unsigned char c)
  * <pattern data>[/<pattern mask>][@<anchor>]
  *
  * Up to 2 filter patterns can be specified.  If 2 are supplied the first one
- * must be anchored at 0.  An omited mask is taken as a mask of 1s, an omitted
+ * must be anchored at 0.  An omitted mask is taken as a mask of 1s, an omitted
  * anchor is taken as 0.
  */
 static ssize_t mps_trc_write(struct file *file, const char __user *buf,
index cbbf864..78460c5 100644 (file)
@@ -847,9 +847,7 @@ static void i40e_free_vf_res(struct i40e_vf *vf)
                wr32(hw, reg_idx, reg);
                i40e_flush(hw);
        }
-       /* reset some of the state varibles keeping
-        * track of the resources
-        */
+       /* reset some of the state variables keeping track of the resources */
        vf->num_queue_pairs = 0;
        vf->vf_states = 0;
        clear_bit(I40E_VF_STAT_INIT, &vf->vf_states);
index 2788a54..68812d7 100644 (file)
@@ -294,7 +294,7 @@ s32 igb_write_phy_reg_i2c(struct e1000_hw *hw, u32 offset, u16 data)
        u32 i, i2ccmd = 0;
        u16 phy_data_swapped;
 
-       /* Prevent overwritting SFP I2C EEPROM which is at A0 address.*/
+       /* Prevent overwriting SFP I2C EEPROM which is at A0 address.*/
        if ((hw->phy.addr == 0) || (hw->phy.addr > 7)) {
                hw_dbg("PHY I2C Address %d is out of range.\n",
                          hw->phy.addr);
index 30535e6..c8ac460 100644 (file)
@@ -1449,7 +1449,7 @@ do { \
  *  @atr_input: input bitstream to compute the hash on
  *  @input_mask: mask for the input bitstream
  *
- *  This function serves two main purposes.  First it applys the input_mask
+ *  This function serves two main purposes.  First it applies the input_mask
  *  to the atr_input resulting in a cleaned up atr_input data stream.
  *  Secondly it computes the hash and stores it in the bkt_hash field at
  *  the end of the input byte stream.  This way it will be available for
index e7b81a3..0247885 100644 (file)
@@ -89,10 +89,17 @@ void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev)
        }
 }
 
+#define MLX4_EN_WRAP_AROUND_SEC        10UL
+/* By scheduling the overflow check every 5 seconds, we have a reasonably
+ * good chance we won't miss a wrap around.
+ * TODO: Use a timer instead of a work queue to increase the guarantee.
+ */
+#define MLX4_EN_OVERFLOW_PERIOD (MLX4_EN_WRAP_AROUND_SEC * HZ / 2)
+
 void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev)
 {
        bool timeout = time_is_before_jiffies(mdev->last_overflow_check +
-                                             mdev->overflow_period);
+                                             MLX4_EN_OVERFLOW_PERIOD);
        unsigned long flags;
 
        if (timeout) {
@@ -237,7 +244,6 @@ static const struct ptp_clock_info mlx4_en_ptp_clock_info = {
        .enable         = mlx4_en_phc_enable,
 };
 
-#define MLX4_EN_WRAP_AROUND_SEC        10ULL
 
 /* This function calculates the max shift that enables the user range
  * of MLX4_EN_WRAP_AROUND_SEC values in the cycles register.
@@ -258,7 +264,6 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev)
 {
        struct mlx4_dev *dev = mdev->dev;
        unsigned long flags;
-       u64 ns, zero = 0;
 
        /* mlx4_en_init_timestamp is called for each netdev.
         * mdev->ptp_clock is common for all ports, skip initialization if
@@ -282,13 +287,6 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev)
                         ktime_to_ns(ktime_get_real()));
        write_sequnlock_irqrestore(&mdev->clock_lock, flags);
 
-       /* Calculate period in seconds to call the overflow watchdog - to make
-        * sure counter is checked at least once every wrap around.
-        */
-       ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask, zero, &zero);
-       do_div(ns, NSEC_PER_SEC / 2 / HZ);
-       mdev->overflow_period = ns;
-
        /* Configure the PHC */
        mdev->ptp_clock_info = mlx4_en_ptp_clock_info;
        snprintf(mdev->ptp_clock_info.name, 16, "mlx4 ptp");
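
Editor's note: with the fixed define above, the runtime cyclecounter-derived overflow_period (removed below) becomes a compile-time constant, MLX4_EN_WRAP_AROUND_SEC * HZ / 2 jiffies, so the overflow worker runs twice per 10-second wraparound window. A quick check; HZ depends on CONFIG_HZ, so 250 below is just an example value:

#include <stdio.h>

#define HZ                      250     /* example CONFIG_HZ */
#define MLX4_EN_WRAP_AROUND_SEC 10UL
#define MLX4_EN_OVERFLOW_PERIOD (MLX4_EN_WRAP_AROUND_SEC * HZ / 2)

int main(void)
{
        /* 10 * 250 / 2 = 1250 jiffies = 5 seconds at HZ=250 */
        printf("overflow check every %lu jiffies (%lu s)\n",
               MLX4_EN_OVERFLOW_PERIOD, MLX4_EN_OVERFLOW_PERIOD / HZ);
        return 0;
}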
index 4941b69..3629ce1 100644 (file)
@@ -430,7 +430,6 @@ struct mlx4_en_dev {
        seqlock_t               clock_lock;
        struct timecounter      clock;
        unsigned long           last_overflow_check;
-       unsigned long           overflow_period;
        struct ptp_clock        *ptp_clock;
        struct ptp_clock_info   ptp_clock_info;
        struct notifier_block   nb;
index ee38c18..ee1c78a 100644 (file)
@@ -1251,10 +1251,10 @@ struct ksz_port_info {
  * @tx_size:           Transmit data size.  Used for TX optimization.
  *                     The maximum is defined by MAX_TX_HELD_SIZE.
  * @perm_addr:         Permanent MAC address.
- * @override_addr:     Overrided MAC address.
+ * @override_addr:     Overridden MAC address.
  * @address:           Additional MAC address entries.
  * @addr_list_size:    Additional MAC address list size.
- * @mac_override:      Indication of MAC address overrided.
+ * @mac_override:      Indication of MAC address overridden.
  * @promiscuous:       Counter to keep track of promiscuous mode set.
  * @all_multi:         Counter to keep track of all multicast mode set.
  * @multi_list:                Multicast address entries.
@@ -4042,7 +4042,7 @@ static int empty_addr(u8 *addr)
  * @hw:        The hardware instance.
  *
  * This routine programs the MAC address of the hardware when the address is
- * overrided.
+ * overridden.
  */
 static void hw_set_addr(struct ksz_hw *hw)
 {
@@ -7043,7 +7043,7 @@ static int pcidev_init(struct pci_dev *pdev, const struct pci_device_id *id)
        if (macaddr[0] != ':')
                get_mac_addr(hw_priv, macaddr, MAIN_PORT);
 
-       /* Read MAC address and initialize override address if not overrided. */
+       /* Read MAC address and initialize override address if not overridden. */
        hw_read_addr(hw);
 
        /* Multiple device interfaces mode requires a second MAC address. */
index c5c1d0e..118723e 100644 (file)
@@ -5397,7 +5397,7 @@ static void s2io_ethtool_gdrvinfo(struct net_device *dev,
  *  s2io_nic structure.
  *  @regs : pointer to the structure with parameters given by ethtool for
  *  dumping the registers.
- *  @reg_space: The input argumnet into which all the registers are dumped.
+ *  @reg_space: The input argument into which all the registers are dumped.
  *  Description:
  *  Dumps the entire register space of xFrame NIC into the user given
  *  buffer area.
index db55e6d..0452848 100644 (file)
@@ -119,7 +119,7 @@ static void vxge_ethtool_gdrvinfo(struct net_device *dev,
  * @dev: device pointer.
  * @regs: pointer to the structure with parameters given by ethtool for
  * dumping the registers.
- * @reg_space: The input argumnet into which all the registers are dumped.
+ * @reg_space: The input argument into which all the registers are dumped.
  *
  * Dumps the vpath register space of Titan NIC into the user given
  * buffer area.
index 61a9cd5..00c17fa 100644 (file)
@@ -688,7 +688,9 @@ static inline u8 qed_concrete_to_sw_fid(struct qed_dev *cdev,
 #define OOO_LB_TC 9
 
 int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate);
-void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate);
+void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev,
+                                        struct qed_ptt *p_ptt,
+                                        u32 min_pf_rate);
 
 void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 #define QED_LEADING_HWFN(dev)   (&dev->hwfns[0])
index d6c5a81..e2a081c 100644 (file)
@@ -3198,7 +3198,8 @@ int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate)
 }
 
 /* API to configure WFQ from mcp link change */
-void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate)
+void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev,
+                                        struct qed_ptt *p_ptt, u32 min_pf_rate)
 {
        int i;
 
@@ -3212,8 +3213,7 @@ void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate)
        for_each_hwfn(cdev, i) {
                struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
-               __qed_configure_vp_wfq_on_link_change(p_hwfn,
-                                                     p_hwfn->p_dpc_ptt,
+               __qed_configure_vp_wfq_on_link_change(p_hwfn, p_ptt,
                                                      min_pf_rate);
        }
 }
index 314022d..87fde20 100644 (file)
@@ -679,7 +679,8 @@ static void qed_mcp_handle_link_change(struct qed_hwfn *p_hwfn,
 
        /* Min bandwidth configuration */
        __qed_configure_pf_min_bandwidth(p_hwfn, p_ptt, p_link, min_bw);
-       qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_link->min_pf_rate);
+       qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_ptt,
+                                           p_link->min_pf_rate);
 
        p_link->an = !!(status & LINK_STATUS_AUTO_NEGOTIATE_ENABLED);
        p_link->an_complete = !!(status &
index 29ed785..253c2bb 100644 (file)
@@ -3014,8 +3014,7 @@ cleanup:
                ack_vfs[vfid / 32] |= BIT((vfid % 32));
                p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &=
                    ~(1ULL << (rel_vf_id % 64));
-               p_hwfn->pf_iov_info->pending_events[rel_vf_id / 64] &=
-                   ~(1ULL << (rel_vf_id % 64));
+               p_vf->vf_mbx.b_pending_msg = false;
        }
 
        return rc;
@@ -3128,11 +3127,20 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
        mbx = &p_vf->vf_mbx;
 
        /* qed_iov_process_mbx_request */
-       DP_VERBOSE(p_hwfn, QED_MSG_IOV,
-                  "VF[%02x]: Processing mailbox message\n", p_vf->abs_vf_id);
+       if (!mbx->b_pending_msg) {
+               DP_NOTICE(p_hwfn,
+                         "VF[%02x]: Trying to process mailbox message when none is pending\n",
+                         p_vf->abs_vf_id);
+               return;
+       }
+       mbx->b_pending_msg = false;
 
        mbx->first_tlv = mbx->req_virt->first_tlv;
 
+       DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+                  "VF[%02x]: Processing mailbox message [type %04x]\n",
+                  p_vf->abs_vf_id, mbx->first_tlv.tl.type);
+
        /* check if tlv type is known */
        if (qed_iov_tlv_supported(mbx->first_tlv.tl.type) &&
            !p_vf->b_malicious) {
@@ -3219,20 +3227,19 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
        }
 }
 
-static void qed_iov_pf_add_pending_events(struct qed_hwfn *p_hwfn, u8 vfid)
+void qed_iov_pf_get_pending_events(struct qed_hwfn *p_hwfn, u64 *events)
 {
-       u64 add_bit = 1ULL << (vfid % 64);
+       int i;
 
-       p_hwfn->pf_iov_info->pending_events[vfid / 64] |= add_bit;
-}
+       memset(events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH);
 
-static void qed_iov_pf_get_and_clear_pending_events(struct qed_hwfn *p_hwfn,
-                                                   u64 *events)
-{
-       u64 *p_pending_events = p_hwfn->pf_iov_info->pending_events;
+       qed_for_each_vf(p_hwfn, i) {
+               struct qed_vf_info *p_vf;
 
-       memcpy(events, p_pending_events, sizeof(u64) * QED_VF_ARRAY_LENGTH);
-       memset(p_pending_events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH);
+               p_vf = &p_hwfn->pf_iov_info->vfs_array[i];
+               if (p_vf->vf_mbx.b_pending_msg)
+                       events[i / 64] |= 1ULL << (i % 64);
+       }
 }
 
 static struct qed_vf_info *qed_sriov_get_vf_from_absid(struct qed_hwfn *p_hwfn,
@@ -3266,7 +3273,7 @@ static int qed_sriov_vfpf_msg(struct qed_hwfn *p_hwfn,
        p_vf->vf_mbx.pending_req = (((u64)vf_msg->hi) << 32) | vf_msg->lo;
 
        /* Mark the event and schedule the workqueue */
-       qed_iov_pf_add_pending_events(p_hwfn, p_vf->relative_vf_id);
+       p_vf->vf_mbx.b_pending_msg = true;
        qed_schedule_iov(p_hwfn, QED_IOV_WQ_MSG_FLAG);
 
        return 0;
@@ -4030,7 +4037,7 @@ static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
                return;
        }
 
-       qed_iov_pf_get_and_clear_pending_events(hwfn, events);
+       qed_iov_pf_get_pending_events(hwfn, events);
 
        DP_VERBOSE(hwfn, QED_MSG_IOV,
                   "Event mask of VF events: 0x%llx 0x%llx 0x%llx\n",
index fc08cc2..a896058 100644 (file)
@@ -140,6 +140,9 @@ struct qed_iov_vf_mbx {
        /* Address in VF where a pending message is located */
        dma_addr_t pending_req;
 
+       /* Message from VF awaits handling */
+       bool b_pending_msg;
+
        u8 *offset;
 
        /* saved VF request header */
@@ -232,7 +235,6 @@ struct qed_vf_info {
  */
 struct qed_pf_iov {
        struct qed_vf_info vfs_array[MAX_NUM_VFS];
-       u64 pending_events[QED_VF_ARRAY_LENGTH];
        u64 pending_flr[QED_VF_ARRAY_LENGTH];
 
        /* Allocate message address continuosuly and split to each VF */
index 99b187b..718bf58 100644 (file)
@@ -178,7 +178,7 @@ const u32 qlcnic_83xx_reg_tbl[] = {
        0x3540,         /* Device state, DRV_REG1 */
        0x3544,         /* Driver state, DRV_REG2 */
        0x3548,         /* Driver scratch, DRV_REG3 */
-       0x354C,         /* Device partiton info, DRV_REG4 */
+       0x354C,         /* Device partition info, DRV_REG4 */
        0x3524,         /* Driver IDC ver, DRV_REG5 */
        0x3550,         /* FW_VER_MAJOR */
        0x3554,         /* FW_VER_MINOR */
index 47ced8a..91fb54f 100644 (file)
 
 /***********************************/
 /* MC_CMD_GET_LICENSED_V3_FEATURE_STATES
- * Query the state of an one or more licensed features. (Note that the actual
+ * Query the state of one or more licensed features. (Note that the actual
  * state may be invalidated by the MC_CMD_LICENSING_V3 OP_UPDATE_LICENSE
  * operation or a reboot of the MC.) Used for V3 licensing (Medford)
  */
index 19a4587..1b6f617 100644 (file)
@@ -176,7 +176,7 @@ struct sis900_private {
 
        u32 msg_enable;
 
-       unsigned int cur_rx, dirty_rx; /* producer/comsumer pointers for Tx/Rx ring */
+       unsigned int cur_rx, dirty_rx; /* producer/consumer pointers for Tx/Rx ring */
        unsigned int cur_tx, dirty_tx;
 
        /* The saved address of a sent/receive-in-place packet buffer */
index 144fe84..04d9245 100644 (file)
@@ -416,7 +416,7 @@ struct stmmac_dma_ops {
        /* Configure the AXI Bus Mode Register */
        void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
        /* Dump DMA registers */
-       void (*dump_regs) (void __iomem *ioaddr);
+       void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
        /* Set tx/rx threshold in the csr6 register
         * An invalid value enables the store-and-forward mode */
        void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode,
@@ -456,7 +456,7 @@ struct stmmac_ops {
        /* Enable RX Queues */
        void (*rx_queue_enable)(struct mac_device_info *hw, u32 queue);
        /* Dump MAC registers */
-       void (*dump_regs)(struct mac_device_info *hw);
+       void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space);
        /* Handle extra events on specific interrupts hw dependent */
        int (*host_irq_status)(struct mac_device_info *hw,
                               struct stmmac_extra_stats *x);
index 91c8926..19b9b30 100644 (file)
@@ -92,17 +92,13 @@ static int dwmac1000_rx_ipc_enable(struct mac_device_info *hw)
        return !!(value & GMAC_CONTROL_IPC);
 }
 
-static void dwmac1000_dump_regs(struct mac_device_info *hw)
+static void dwmac1000_dump_regs(struct mac_device_info *hw, u32 *reg_space)
 {
        void __iomem *ioaddr = hw->pcsr;
        int i;
-       pr_info("\tDWMAC1000 regs (base addr = 0x%p)\n", ioaddr);
 
-       for (i = 0; i < 55; i++) {
-               int offset = i * 4;
-               pr_info("\tReg No. %d (offset 0x%x): 0x%08x\n", i,
-                       offset, readl(ioaddr + offset));
-       }
+       for (i = 0; i < 55; i++)
+               reg_space[i] = readl(ioaddr + i * 4);
 }
 
 static void dwmac1000_set_umac_addr(struct mac_device_info *hw,
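
Editor's note: the dump_regs conversions in this and the following stmmac hunks change the ops from printing each register to filling a caller-supplied u32 array indexed by byte offset / 4, which is the shape an ethtool register dump needs. A self-contained sketch, with a fake MMIO array standing in for ioaddr:

#include <stdint.h>
#include <stdio.h>

#define MAC_CONTROL     0x00
#define MAC_ADDR_HIGH   0x04
#define REG_SPACE_WORDS 16

static uint32_t fake_mmio[REG_SPACE_WORDS];     /* stand-in for ioaddr */

static uint32_t readl_model(unsigned int off)
{
        return fake_mmio[off / 4];
}

static void dump_mac_regs(uint32_t *reg_space)
{
        /* each register lands at its byte offset / 4, as in the hunks above */
        reg_space[MAC_CONTROL / 4] = readl_model(MAC_CONTROL);
        reg_space[MAC_ADDR_HIGH / 4] = readl_model(MAC_ADDR_HIGH);
}

int main(void)
{
        uint32_t reg_space[REG_SPACE_WORDS] = { 0 };

        fake_mmio[0] = 0xdeadbeef;
        dump_mac_regs(reg_space);
        printf("MAC_CONTROL = 0x%08x\n", (unsigned)reg_space[MAC_CONTROL / 4]);
        return 0;
}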
index fbaec0f..d3654a4 100644 (file)
@@ -201,18 +201,14 @@ static void dwmac1000_dma_operation_mode(void __iomem *ioaddr, int txmode,
        writel(csr6, ioaddr + DMA_CONTROL);
 }
 
-static void dwmac1000_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
 {
        int i;
-       pr_info(" DMA registers\n");
-       for (i = 0; i < 22; i++) {
-               if ((i < 9) || (i > 17)) {
-                       int offset = i * 4;
-                       pr_err("\t Reg No. %d (offset 0x%x): 0x%08x\n", i,
-                              (DMA_BUS_MODE + offset),
-                              readl(ioaddr + DMA_BUS_MODE + offset));
-               }
-       }
+
+       for (i = 0; i < 22; i++)
+               if ((i < 9) || (i > 17))
+                       reg_space[DMA_BUS_MODE / 4 + i] =
+                               readl(ioaddr + DMA_BUS_MODE + i * 4);
 }
 
 static void dwmac1000_get_hw_feature(void __iomem *ioaddr,
index 8ab5189..e370cce 100644 (file)
@@ -40,28 +40,18 @@ static void dwmac100_core_init(struct mac_device_info *hw, int mtu)
 #endif
 }
 
-static void dwmac100_dump_mac_regs(struct mac_device_info *hw)
+static void dwmac100_dump_mac_regs(struct mac_device_info *hw, u32 *reg_space)
 {
        void __iomem *ioaddr = hw->pcsr;
-       pr_info("\t----------------------------------------------\n"
-               "\t  DWMAC 100 CSR (base addr = 0x%p)\n"
-               "\t----------------------------------------------\n", ioaddr);
-       pr_info("\tcontrol reg (offset 0x%x): 0x%08x\n", MAC_CONTROL,
-               readl(ioaddr + MAC_CONTROL));
-       pr_info("\taddr HI (offset 0x%x): 0x%08x\n ", MAC_ADDR_HIGH,
-               readl(ioaddr + MAC_ADDR_HIGH));
-       pr_info("\taddr LO (offset 0x%x): 0x%08x\n", MAC_ADDR_LOW,
-               readl(ioaddr + MAC_ADDR_LOW));
-       pr_info("\tmulticast hash HI (offset 0x%x): 0x%08x\n",
-               MAC_HASH_HIGH, readl(ioaddr + MAC_HASH_HIGH));
-       pr_info("\tmulticast hash LO (offset 0x%x): 0x%08x\n",
-               MAC_HASH_LOW, readl(ioaddr + MAC_HASH_LOW));
-       pr_info("\tflow control (offset 0x%x): 0x%08x\n",
-               MAC_FLOW_CTRL, readl(ioaddr + MAC_FLOW_CTRL));
-       pr_info("\tVLAN1 tag (offset 0x%x): 0x%08x\n", MAC_VLAN1,
-               readl(ioaddr + MAC_VLAN1));
-       pr_info("\tVLAN2 tag (offset 0x%x): 0x%08x\n", MAC_VLAN2,
-               readl(ioaddr + MAC_VLAN2));
+
+       reg_space[MAC_CONTROL / 4] = readl(ioaddr + MAC_CONTROL);
+       reg_space[MAC_ADDR_HIGH / 4] = readl(ioaddr + MAC_ADDR_HIGH);
+       reg_space[MAC_ADDR_LOW / 4] = readl(ioaddr + MAC_ADDR_LOW);
+       reg_space[MAC_HASH_HIGH / 4] = readl(ioaddr + MAC_HASH_HIGH);
+       reg_space[MAC_HASH_LOW / 4] = readl(ioaddr + MAC_HASH_LOW);
+       reg_space[MAC_FLOW_CTRL / 4] = readl(ioaddr + MAC_FLOW_CTRL);
+       reg_space[MAC_VLAN1 / 4] = readl(ioaddr + MAC_VLAN1);
+       reg_space[MAC_VLAN2 / 4] = readl(ioaddr + MAC_VLAN2);
 }
 
 static int dwmac100_rx_ipc_enable(struct mac_device_info *hw)
index d40e91e..eef2f22 100644 (file)
@@ -66,19 +66,18 @@ static void dwmac100_dma_operation_mode(void __iomem *ioaddr, int txmode,
        writel(csr6, ioaddr + DMA_CONTROL);
 }
 
-static void dwmac100_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac100_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
 {
        int i;
 
-       pr_debug("DWMAC 100 DMA CSR\n");
        for (i = 0; i < 9; i++)
-               pr_debug("\t CSR%d (offset 0x%x): 0x%08x\n", i,
-                        (DMA_BUS_MODE + i * 4),
-                        readl(ioaddr + DMA_BUS_MODE + i * 4));
+               reg_space[DMA_BUS_MODE / 4 + i] =
+                       readl(ioaddr + DMA_BUS_MODE + i * 4);
 
-       pr_debug("\tCSR20 (0x%x): 0x%08x, CSR21 (0x%x): 0x%08x\n",
-                DMA_CUR_TX_BUF_ADDR, readl(ioaddr + DMA_CUR_TX_BUF_ADDR),
-                DMA_CUR_RX_BUF_ADDR, readl(ioaddr + DMA_CUR_RX_BUF_ADDR));
+       reg_space[DMA_CUR_TX_BUF_ADDR / 4] =
+               readl(ioaddr + DMA_CUR_TX_BUF_ADDR);
+       reg_space[DMA_CUR_RX_BUF_ADDR / 4] =
+               readl(ioaddr + DMA_CUR_RX_BUF_ADDR);
 }
 
 /* DMA controller has two counters to track the number of the missed frames. */
index 202216c..1e79e65 100644 (file)
@@ -70,19 +70,13 @@ static void dwmac4_rx_queue_enable(struct mac_device_info *hw, u32 queue)
        writel(value, ioaddr + GMAC_RXQ_CTRL0);
 }
 
-static void dwmac4_dump_regs(struct mac_device_info *hw)
+static void dwmac4_dump_regs(struct mac_device_info *hw, u32 *reg_space)
 {
        void __iomem *ioaddr = hw->pcsr;
        int i;
 
-       pr_debug("\tDWMAC4 regs (base addr = 0x%p)\n", ioaddr);
-
-       for (i = 0; i < GMAC_REG_NUM; i++) {
-               int offset = i * 4;
-
-               pr_debug("\tReg No. %d (offset 0x%x): 0x%08x\n", i,
-                        offset, readl(ioaddr + offset));
-       }
+       for (i = 0; i < GMAC_REG_NUM; i++)
+               reg_space[i] = readl(ioaddr + i * 4);
 }
 
 static int dwmac4_rx_ipc_enable(struct mac_device_info *hw)
index 377d1b4..f97b0d5 100644 (file)
@@ -127,53 +127,51 @@ static void dwmac4_dma_init(void __iomem *ioaddr,
                dwmac4_dma_init_channel(ioaddr, dma_cfg, dma_tx, dma_rx, i);
 }
 
-static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel)
+static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel,
+                                 u32 *reg_space)
 {
-       pr_debug(" Channel %d\n", channel);
-       pr_debug("\tDMA_CHAN_CONTROL, offset: 0x%x, val: 0x%x\n", 0,
-                readl(ioaddr + DMA_CHAN_CONTROL(channel)));
-       pr_debug("\tDMA_CHAN_TX_CONTROL, offset: 0x%x, val: 0x%x\n", 0x4,
-                readl(ioaddr + DMA_CHAN_TX_CONTROL(channel)));
-       pr_debug("\tDMA_CHAN_RX_CONTROL, offset: 0x%x, val: 0x%x\n", 0x8,
-                readl(ioaddr + DMA_CHAN_RX_CONTROL(channel)));
-       pr_debug("\tDMA_CHAN_TX_BASE_ADDR, offset: 0x%x, val: 0x%x\n", 0x14,
-                readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_RX_BASE_ADDR, offset: 0x%x, val: 0x%x\n", 0x1c,
-                readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_TX_END_ADDR, offset: 0x%x, val: 0x%x\n", 0x20,
-                readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_RX_END_ADDR, offset: 0x%x, val: 0x%x\n", 0x28,
-                readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_TX_RING_LEN, offset: 0x%x, val: 0x%x\n", 0x2c,
-                readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel)));
-       pr_debug("\tDMA_CHAN_RX_RING_LEN, offset: 0x%x, val: 0x%x\n", 0x30,
-                readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel)));
-       pr_debug("\tDMA_CHAN_INTR_ENA, offset: 0x%x, val: 0x%x\n", 0x34,
-                readl(ioaddr + DMA_CHAN_INTR_ENA(channel)));
-       pr_debug("\tDMA_CHAN_RX_WATCHDOG, offset: 0x%x, val: 0x%x\n", 0x38,
-                readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel)));
-       pr_debug("\tDMA_CHAN_SLOT_CTRL_STATUS, offset: 0x%x, val: 0x%x\n", 0x3c,
-                readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel)));
-       pr_debug("\tDMA_CHAN_CUR_TX_DESC, offset: 0x%x, val: 0x%x\n", 0x44,
-                readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel)));
-       pr_debug("\tDMA_CHAN_CUR_RX_DESC, offset: 0x%x, val: 0x%x\n", 0x4c,
-                readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel)));
-       pr_debug("\tDMA_CHAN_CUR_TX_BUF_ADDR, offset: 0x%x, val: 0x%x\n", 0x54,
-                readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_CUR_RX_BUF_ADDR, offset: 0x%x, val: 0x%x\n", 0x5c,
-                readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel)));
-       pr_debug("\tDMA_CHAN_STATUS, offset: 0x%x, val: 0x%x\n", 0x60,
-                readl(ioaddr + DMA_CHAN_STATUS(channel)));
+       reg_space[DMA_CHAN_CONTROL(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CONTROL(channel));
+       reg_space[DMA_CHAN_TX_CONTROL(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
+       reg_space[DMA_CHAN_RX_CONTROL(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_CONTROL(channel));
+       reg_space[DMA_CHAN_TX_BASE_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel));
+       reg_space[DMA_CHAN_RX_BASE_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel));
+       reg_space[DMA_CHAN_TX_END_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel));
+       reg_space[DMA_CHAN_RX_END_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel));
+       reg_space[DMA_CHAN_TX_RING_LEN(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel));
+       reg_space[DMA_CHAN_RX_RING_LEN(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel));
+       reg_space[DMA_CHAN_INTR_ENA(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_INTR_ENA(channel));
+       reg_space[DMA_CHAN_RX_WATCHDOG(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel));
+       reg_space[DMA_CHAN_SLOT_CTRL_STATUS(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel));
+       reg_space[DMA_CHAN_CUR_TX_DESC(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel));
+       reg_space[DMA_CHAN_CUR_RX_DESC(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel));
+       reg_space[DMA_CHAN_CUR_TX_BUF_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel));
+       reg_space[DMA_CHAN_CUR_RX_BUF_ADDR(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel));
+       reg_space[DMA_CHAN_STATUS(channel) / 4] =
+               readl(ioaddr + DMA_CHAN_STATUS(channel));
 }
 
-static void dwmac4_dump_dma_regs(void __iomem *ioaddr)
+static void dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space)
 {
        int i;
 
-       pr_debug(" GMAC4 DMA registers\n");
-
        for (i = 0; i < DMA_CHANNEL_NB_MAX; i++)
-               _dwmac4_dump_dma_regs(ioaddr, i);
+               _dwmac4_dump_dma_regs(ioaddr, i, reg_space);
 }
 
 static void dwmac4_rx_watchdog(void __iomem *ioaddr, u32 riwt)
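
Indexing reg_space by the per-channel macro's byte offset divided by four keeps
the channels disjoint in the dump, because the macros already encode the
channel stride. From memory, the dwmac4 headers (not shown in this hunk) define
them roughly as below; treat the values as illustrative:

    #define DMA_CHAN_BASE_ADDR      0x00001100
    #define DMA_CHAN_BASE_OFFSET    0x80
    #define DMA_CHANX_BASE_ADDR(x)  (DMA_CHAN_BASE_ADDR + \
                                     (x) * DMA_CHAN_BASE_OFFSET)
    #define DMA_CHAN_CONTROL(chan)  DMA_CHANX_BASE_ADDR(chan)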
index 5ff6bc4..85d6411 100644 (file)
@@ -435,32 +435,14 @@ static int stmmac_ethtool_get_regs_len(struct net_device *dev)
 static void stmmac_ethtool_gregs(struct net_device *dev,
                          struct ethtool_regs *regs, void *space)
 {
-       int i;
        u32 *reg_space = (u32 *) space;
 
        struct stmmac_priv *priv = netdev_priv(dev);
 
        memset(reg_space, 0x0, REG_SPACE_SIZE);
 
-       if (priv->plat->has_gmac || priv->plat->has_gmac4) {
-               /* MAC registers */
-               for (i = 0; i < 55; i++)
-                       reg_space[i] = readl(priv->ioaddr + (i * 4));
-               /* DMA registers */
-               for (i = 0; i < 22; i++)
-                       reg_space[i + 55] =
-                           readl(priv->ioaddr + (DMA_BUS_MODE + (i * 4)));
-       } else {
-               /* MAC registers */
-               for (i = 0; i < 12; i++)
-                       reg_space[i] = readl(priv->ioaddr + (i * 4));
-               /* DMA registers */
-               for (i = 0; i < 9; i++)
-                       reg_space[i + 12] =
-                           readl(priv->ioaddr + (DMA_BUS_MODE + (i * 4)));
-               reg_space[22] = readl(priv->ioaddr + DMA_CUR_TX_BUF_ADDR);
-               reg_space[23] = readl(priv->ioaddr + DMA_CUR_RX_BUF_ADDR);
-       }
+       priv->hw->mac->dump_regs(priv->hw, reg_space);
+       priv->hw->dma->dump_regs(priv->ioaddr, reg_space);
 }
 
 static void
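
With the reworked callbacks, `ethtool -d <iface>` lands in
stmmac_ethtool_gregs(), which only clears the blob and delegates; the per-core
register counts that used to be hard-coded here now live with each core's dump
implementation. The changed callback signatures, inferred from the call sites
above (the real declarations are in common.h, which this series touches but
which is not shown here):

    /* Sketch inferred from the implementations in this patch set. */
    struct stmmac_ops_sketch {
            void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space);
    };

    struct stmmac_dma_ops_sketch {
            void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
    };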
index 3cbe096..4498a38 100644 (file)
@@ -1729,11 +1729,6 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
        priv->hw->dma->start_tx(priv->ioaddr);
        priv->hw->dma->start_rx(priv->ioaddr);
 
-       /* Dump DMA/MAC registers */
-       if (netif_msg_hw(priv)) {
-               priv->hw->mac->dump_regs(priv->hw);
-               priv->hw->dma->dump_regs(priv->ioaddr);
-       }
        priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS;
 
        if ((priv->use_riwt) && (priv->hw->dma->rx_watchdog)) {
index bda0c64..8969874 100644 (file)
@@ -1330,7 +1330,7 @@ static int __init gtp_init(void)
        if (err < 0)
                goto unreg_genl_family;
 
-       pr_info("GTP module loaded (pdp ctx size %Zd bytes)\n",
+       pr_info("GTP module loaded (pdp ctx size %zd bytes)\n",
                sizeof(struct pdp_ctx));
        return 0;
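
This hunk and several below swap the nonstandard %Zd/%Zu length modifier for
C99's %zd/%zu; %Z was a glibc extension, and this cycle's cleanup (to the best
of my recollection) drops kernel vsprintf support for it entirely. A standalone
userspace illustration of the portable spelling:

    #include <stdio.h>
    #include <sys/types.h>

    int main(void)
    {
            size_t n = sizeof(long);

            printf("sizeof(long) = %zu\n", n);     /* %zu for size_t  */
            printf("delta = %zd\n", (ssize_t)-1);  /* %zd for ssize_t */
            return 0;
    }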
 
index d6f7838..1be69d8 100644 (file)
@@ -146,7 +146,7 @@ static int phy_config_interrupt(struct phy_device *phydev, u32 interrupts)
  */
 int phy_aneg_done(struct phy_device *phydev)
 {
-       if (phydev->drv->aneg_done)
+       if (phydev->drv && phydev->drv->aneg_done)
                return phydev->drv->aneg_done(phydev);
 
        return genphy_aneg_done(phydev);
index 3e37724..8aefb28 100644 (file)
@@ -343,7 +343,7 @@ static const struct driver_info kalmia_info = {
 static const struct usb_device_id products[] = {
        /* The unswitched USB ID, to get the module auto loaded: */
        { USB_DEVICE(0x04e8, 0x689a) },
-       /* The stick swithed into modem (by e.g. usb_modeswitch): */
+       /* The stick switched into modem (by e.g. usb_modeswitch): */
        { USB_DEVICE(0x04e8, 0x6889),
                .driver_info = (unsigned long) &kalmia_info, },
        { /* EMPTY == end of list */} };
index 4f4f71b..c5b2113 100644 (file)
@@ -383,7 +383,7 @@ generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags)
 
        /* REVISIT:  peripheral "alignment" request is ignored ... */
        dev_dbg(&intf->dev,
-               "hard mtu %u (%u from dev), rx buflen %Zu, align %d\n",
+               "hard mtu %u (%u from dev), rx buflen %zu, align %d\n",
                dev->hard_mtu, tmp, dev->rx_urb_size,
                1 << le32_to_cpu(u.init_c->packet_alignment));
 
index d9440bc..ac69f28 100644 (file)
@@ -379,7 +379,7 @@ static int sierra_net_parse_lsi(struct usbnet *dev, char *data, int datalen)
        u32 expected_length;
 
        if (datalen < sizeof(struct lsi_umts_single)) {
-               netdev_err(dev->net, "%s: Data length %d, exp >= %Zu\n",
+               netdev_err(dev->net, "%s: Data length %d, exp >= %zu\n",
                           __func__, datalen, sizeof(struct lsi_umts_single));
                return -1;
        }
index 556953f..b791199 100644 (file)
@@ -2035,7 +2035,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
        const struct iphdr *old_iph = ip_hdr(skb);
        union vxlan_addr *dst;
        union vxlan_addr remote_ip, local_ip;
-       union vxlan_addr *src;
        struct vxlan_metadata _md;
        struct vxlan_metadata *md = &_md;
        __be16 src_port = 0, dst_port;
@@ -2062,7 +2061,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
                dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
                vni = (rdst->remote_vni) ? : default_vni;
-               src = &vxlan->cfg.saddr;
+               local_ip = vxlan->cfg.saddr;
                dst_cache = &rdst->dst_cache;
                md->gbp = skb->mark;
                ttl = vxlan->cfg.ttl;
@@ -2095,7 +2094,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                dst = &remote_ip;
                dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
                vni = tunnel_id_to_key32(info->key.tun_id);
-               src = &local_ip;
                dst_cache = &info->dst_cache;
                if (info->options_len)
                        md = ip_tunnel_info_opts(info);
@@ -2115,7 +2113,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                rt = vxlan_get_route(vxlan, dev, sock4, skb,
                                     rdst ? rdst->remote_ifindex : 0, tos,
                                     dst->sin.sin_addr.s_addr,
-                                    &src->sin.sin_addr.s_addr,
+                                    &local_ip.sin.sin_addr.s_addr,
                                     dst_port, src_port,
                                     dst_cache, info);
                if (IS_ERR(rt)) {
@@ -2142,7 +2140,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                if (err < 0)
                        goto tx_error;
 
-               udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, src->sin.sin_addr.s_addr,
+               udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
                                    dst->sin.sin_addr.s_addr, tos, ttl, df,
                                    src_port, dst_port, xnet, !udp_sum);
 #if IS_ENABLED(CONFIG_IPV6)
@@ -2152,7 +2150,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                ndst = vxlan6_get_route(vxlan, dev, sock6, skb,
                                        rdst ? rdst->remote_ifindex : 0, tos,
                                        label, &dst->sin6.sin6_addr,
-                                       &src->sin6.sin6_addr,
+                                       &local_ip.sin6.sin6_addr,
                                        dst_port, src_port,
                                        dst_cache, info);
                if (IS_ERR(ndst)) {
@@ -2180,7 +2178,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                        goto tx_error;
 
                udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
-                                    &src->sin6.sin6_addr,
+                                    &local_ip.sin6.sin6_addr,
                                     &dst->sin6.sin6_addr, tos, ttl,
                                     label, src_port, dst_port, !udp_sum);
 #endif
@@ -2675,7 +2673,7 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
 
        if (data[IFLA_VXLAN_ID]) {
                __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
-               if (id >= VXLAN_VID_MASK)
+               if (id >= VXLAN_N_VID)
                        return -ERANGE;
        }
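
Two distinct fixes in the vxlan hunks above: the transmit path now copies the
configured source address into the on-stack local_ip instead of pointing src at
vxlan->cfg.saddr (the route lookup writes the chosen source back through that
pointer, so the old code could clobber the device's configuration), and VNI
validation compares against the number of valid VNIs rather than the mask. The
constants, as defined in include/net/vxlan.h at the time (quoted from memory,
treat as a sketch):

    #define VXLAN_N_VID     (1u << 24)          /* VNIs are 24 bits wide */
    #define VXLAN_VID_MASK  (VXLAN_N_VID - 1)   /* 0x00ffffff            */

    /* "id >= VXLAN_VID_MASK" wrongly rejected the maximum VNI 0xffffff;
     * "id >= VXLAN_N_VID" accepts the full 0..16777215 range. */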
 
index e74664b..502c346 100644 (file)
@@ -237,7 +237,7 @@ void __i2400mu_bm_notif_cb(struct urb *urb)
  *
  * @i2400m: device descriptor
  * @urb: urb to use
- * @completion: completion varible to complete when done
+ * @completion: completion variable to complete when done
  *
  * Data is always read to i2400m->bm_ack_buf
  */
index 815efe9..5214dd7 100644 (file)
@@ -59,13 +59,13 @@ static const struct ani_ofdm_level_entry ofdm_level_table[] = {
 /*
  * MRC (Maximal Ratio Combining) has always been used with multi-antenna ofdm.
  * With OFDM for single stream you just add up all antenna inputs, you're
- * only interested in what you get after FFT. Signal aligment is also not
+ * only interested in what you get after FFT. Signal alignment is also not
  * required for OFDM because any phase difference adds up in the frequency
  * domain.
  *
  * MRC requires extra work for use with CCK. You need to align the antenna
  * signals from the different antenna before you can add the signals together.
- * You need aligment of signals as CCK is in time domain, so addition can cancel
+ * You need alignment of signals as CCK is in time domain, so addition can cancel
  * your signal completely if phase is 180 degrees (think of adding sine waves).
  * You also need to remove noise before the addition and this is where ANI
  * MRC CCK comes into play. One of the antenna inputs may be stronger but
index 10098b7..944b83c 100644 (file)
@@ -4874,7 +4874,7 @@ brcmf_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
                kfree(af_params);
        } else {
                brcmf_dbg(TRACE, "Unhandled, fc=%04x!!\n", mgmt->frame_control);
-               brcmf_dbg_hex_dump(true, buf, len, "payload, len=%Zu\n", len);
+               brcmf_dbg_hex_dump(true, buf, len, "payload, len=%zu\n", len);
        }
 
 exit:
index 356aba9..f922859 100644 (file)
@@ -1238,7 +1238,7 @@ static int ipw2100_get_hw_features(struct ipw2100_priv *priv)
 }
 
 /*
- * Start firmware execution after power on and intialization
+ * Start firmware execution after power on and initialization
  * The sequence is:
  *  1. Release ARC
  *  2. Wait for f/w initialization completes;
@@ -1277,7 +1277,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv)
        /* Release ARC - clear reset bit */
        write_register(priv->net_dev, IPW_REG_RESET_REG, 0);
 
-       /* wait for f/w intialization complete */
+       /* wait for f/w initialization complete */
        IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n");
        i = 5000;
        do {
@@ -5652,7 +5652,7 @@ static void shim__set_security(struct net_device *dev,
 
 /* As a temporary work around to enable WPA until we figure out why
  * wpa_supplicant toggles the security capability of the driver, which
- * forces a disassocation with force_update...
+ * forces a disassociation with force_update...
  *
  *     if (force_update || !(priv->status & STATUS_ASSOCIATED))*/
        if (!(priv->status & (STATUS_ASSOCIATED | STATUS_ASSOCIATING)))
index ef9af8a..5ef3c5c 100644 (file)
@@ -3974,7 +3974,7 @@ static void ipw_send_disassociate(struct ipw_priv *priv, int quiet)
                return;
        }
 
-       IPW_DEBUG_ASSOC("Disassocation attempt from %pM "
+       IPW_DEBUG_ASSOC("Disassociation attempt from %pM "
                        "on channel %d.\n",
                        priv->assoc_request.bssid,
                        priv->assoc_request.channel);
@@ -5196,7 +5196,7 @@ static void ipw_rx_queue_restock(struct ipw_priv *priv)
  * Move all used packet from rx_used to rx_free, allocating a new SKB for each.
  * Also restock the Rx queue via ipw_rx_queue_restock.
  *
- * This is called as a scheduled work item (except for during intialization)
+ * This is called as a scheduled work item (except for during initialization)
  */
 static void ipw_rx_queue_replenish(void *data)
 {
index a91d170..2781f57 100644 (file)
@@ -4855,39 +4855,39 @@ il4965_ucode_callback(const struct firmware *ucode_raw, void *context)
         */
 
        D_INFO("f/w package hdr ucode version raw = 0x%x\n", il->ucode_ver);
-       D_INFO("f/w package hdr runtime inst size = %Zd\n", pieces.inst_size);
-       D_INFO("f/w package hdr runtime data size = %Zd\n", pieces.data_size);
-       D_INFO("f/w package hdr init inst size = %Zd\n", pieces.init_size);
-       D_INFO("f/w package hdr init data size = %Zd\n", pieces.init_data_size);
-       D_INFO("f/w package hdr boot inst size = %Zd\n", pieces.boot_size);
+       D_INFO("f/w package hdr runtime inst size = %zd\n", pieces.inst_size);
+       D_INFO("f/w package hdr runtime data size = %zd\n", pieces.data_size);
+       D_INFO("f/w package hdr init inst size = %zd\n", pieces.init_size);
+       D_INFO("f/w package hdr init data size = %zd\n", pieces.init_data_size);
+       D_INFO("f/w package hdr boot inst size = %zd\n", pieces.boot_size);
 
        /* Verify that uCode images will fit in card's SRAM */
        if (pieces.inst_size > il->hw_params.max_inst_size) {
-               IL_ERR("uCode instr len %Zd too large to fit in\n",
+               IL_ERR("uCode instr len %zd too large to fit in\n",
                       pieces.inst_size);
                goto try_again;
        }
 
        if (pieces.data_size > il->hw_params.max_data_size) {
-               IL_ERR("uCode data len %Zd too large to fit in\n",
+               IL_ERR("uCode data len %zd too large to fit in\n",
                       pieces.data_size);
                goto try_again;
        }
 
        if (pieces.init_size > il->hw_params.max_inst_size) {
-               IL_ERR("uCode init instr len %Zd too large to fit in\n",
+               IL_ERR("uCode init instr len %zd too large to fit in\n",
                       pieces.init_size);
                goto try_again;
        }
 
        if (pieces.init_data_size > il->hw_params.max_data_size) {
-               IL_ERR("uCode init data len %Zd too large to fit in\n",
+               IL_ERR("uCode init data len %zd too large to fit in\n",
                       pieces.init_data_size);
                goto try_again;
        }
 
        if (pieces.boot_size > il->hw_params.max_bsm_size) {
-               IL_ERR("uCode boot instr len %Zd too large to fit in\n",
+               IL_ERR("uCode boot instr len %zd too large to fit in\n",
                       pieces.boot_size);
                goto try_again;
        }
@@ -4938,7 +4938,7 @@ il4965_ucode_callback(const struct firmware *ucode_raw, void *context)
        /* Copy images into buffers for card's bus-master reads ... */
 
        /* Runtime instructions (first block of data in file) */
-       D_INFO("Copying (but not loading) uCode instr len %Zd\n",
+       D_INFO("Copying (but not loading) uCode instr len %zd\n",
               pieces.inst_size);
        memcpy(il->ucode_code.v_addr, pieces.inst, pieces.inst_size);
 
@@ -4949,28 +4949,28 @@ il4965_ucode_callback(const struct firmware *ucode_raw, void *context)
         * Runtime data
         * NOTE:  Copy into backup buffer will be done in il_up()
         */
-       D_INFO("Copying (but not loading) uCode data len %Zd\n",
+       D_INFO("Copying (but not loading) uCode data len %zd\n",
               pieces.data_size);
        memcpy(il->ucode_data.v_addr, pieces.data, pieces.data_size);
        memcpy(il->ucode_data_backup.v_addr, pieces.data, pieces.data_size);
 
        /* Initialization instructions */
        if (pieces.init_size) {
-               D_INFO("Copying (but not loading) init instr len %Zd\n",
+               D_INFO("Copying (but not loading) init instr len %zd\n",
                       pieces.init_size);
                memcpy(il->ucode_init.v_addr, pieces.init, pieces.init_size);
        }
 
        /* Initialization data */
        if (pieces.init_data_size) {
-               D_INFO("Copying (but not loading) init data len %Zd\n",
+               D_INFO("Copying (but not loading) init data len %zd\n",
                       pieces.init_data_size);
                memcpy(il->ucode_init_data.v_addr, pieces.init_data,
                       pieces.init_data_size);
        }
 
        /* Bootstrap instructions */
-       D_INFO("Copying (but not loading) boot instr len %Zd\n",
+       D_INFO("Copying (but not loading) boot instr len %zd\n",
               pieces.boot_size);
        memcpy(il->ucode_boot.v_addr, pieces.boot, pieces.boot_size);
 
index 0e0293d..be466a0 100644 (file)
@@ -1141,21 +1141,21 @@ static int validate_sec_sizes(struct iwl_drv *drv,
                              struct iwl_firmware_pieces *pieces,
                              const struct iwl_cfg *cfg)
 {
-       IWL_DEBUG_INFO(drv, "f/w package hdr runtime inst size = %Zd\n",
+       IWL_DEBUG_INFO(drv, "f/w package hdr runtime inst size = %zd\n",
                get_sec_size(pieces, IWL_UCODE_REGULAR,
                             IWL_UCODE_SECTION_INST));
-       IWL_DEBUG_INFO(drv, "f/w package hdr runtime data size = %Zd\n",
+       IWL_DEBUG_INFO(drv, "f/w package hdr runtime data size = %zd\n",
                get_sec_size(pieces, IWL_UCODE_REGULAR,
                             IWL_UCODE_SECTION_DATA));
-       IWL_DEBUG_INFO(drv, "f/w package hdr init inst size = %Zd\n",
+       IWL_DEBUG_INFO(drv, "f/w package hdr init inst size = %zd\n",
                get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_INST));
-       IWL_DEBUG_INFO(drv, "f/w package hdr init data size = %Zd\n",
+       IWL_DEBUG_INFO(drv, "f/w package hdr init data size = %zd\n",
                get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_DATA));
 
        /* Verify that uCode images will fit in card's SRAM. */
        if (get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_INST) >
            cfg->max_inst_size) {
-               IWL_ERR(drv, "uCode instr len %Zd too large to fit in\n",
+               IWL_ERR(drv, "uCode instr len %zd too large to fit in\n",
                        get_sec_size(pieces, IWL_UCODE_REGULAR,
                                     IWL_UCODE_SECTION_INST));
                return -1;
@@ -1163,7 +1163,7 @@ static int validate_sec_sizes(struct iwl_drv *drv,
 
        if (get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_DATA) >
            cfg->max_data_size) {
-               IWL_ERR(drv, "uCode data len %Zd too large to fit in\n",
+               IWL_ERR(drv, "uCode data len %zd too large to fit in\n",
                        get_sec_size(pieces, IWL_UCODE_REGULAR,
                                     IWL_UCODE_SECTION_DATA));
                return -1;
@@ -1171,7 +1171,7 @@ static int validate_sec_sizes(struct iwl_drv *drv,
 
        if (get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_INST) >
             cfg->max_inst_size) {
-               IWL_ERR(drv, "uCode init instr len %Zd too large to fit in\n",
+               IWL_ERR(drv, "uCode init instr len %zd too large to fit in\n",
                        get_sec_size(pieces, IWL_UCODE_INIT,
                                     IWL_UCODE_SECTION_INST));
                return -1;
@@ -1179,7 +1179,7 @@ static int validate_sec_sizes(struct iwl_drv *drv,
 
        if (get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_DATA) >
            cfg->max_data_size) {
-               IWL_ERR(drv, "uCode init data len %Zd too large to fit in\n",
+               IWL_ERR(drv, "uCode init data len %zd too large to fit in\n",
                        get_sec_size(pieces, IWL_UCODE_REGULAR,
                                     IWL_UCODE_SECTION_DATA));
                return -1;
index abdd0cf..fac28bd 100644 (file)
@@ -346,9 +346,7 @@ void mwifiex_parse_tx_status_event(struct mwifiex_private *priv,
                return;
 
        spin_lock_irqsave(&priv->ack_status_lock, flags);
-       ack_skb = idr_find(&priv->ack_status_frames, tx_status->tx_token_id);
-       if (ack_skb)
-               idr_remove(&priv->ack_status_frames, tx_status->tx_token_id);
+       ack_skb = idr_remove(&priv->ack_status_frames, tx_status->tx_token_id);
        spin_unlock_irqrestore(&priv->ack_status_lock, flags);
 
        if (ack_skb) {
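
This hunk relies on idr_remove() returning the removed entry, an IDR API change
from the same development cycle; find-then-remove becomes a single tree walk,
which both shortens the critical section and removes the window between lookup
and removal. The before/after shape, schematically:

    /* before: two walks under the lock */
    ack_skb = idr_find(&priv->ack_status_frames, id);
    if (ack_skb)
            idr_remove(&priv->ack_status_frames, id);

    /* after: one walk, same result */
    ack_skb = idr_remove(&priv->ack_status_frames, id);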
index 28c2f6f..e4ff3b9 100644 (file)
@@ -673,8 +673,8 @@ void mwifiex_update_ralist_tx_pause(struct mwifiex_private *priv, u8 *mac,
        spin_unlock_irqrestore(&priv->wmm.ra_list_spinlock, flags);
 }
 
-/* This function update non-tdls peer ralist tx_pause while
- * tdls channel swithing
+/* This function updates non-tdls peer ralist tx_pause while
+ * tdls channel switching
  */
 void mwifiex_update_ralist_tx_pause_in_tdls_cs(struct mwifiex_private *priv,
                                               u8 *mac, u8 tx_pause)
index 1922e78..89a0a28 100644 (file)
@@ -455,7 +455,7 @@ static u32 _rtl92s_fill_h2c_cmd(struct sk_buff *skb, u32 h2cbufferlen,
        u8 i = 0;
 
        do {
-               /* 8 - Byte aligment */
+               /* 8 - Byte alignment */
                len = H2C_TX_CMD_HDR_LEN + N_BYTE_ALIGMENT(pcmd_len[i], 8);
 
                /* Buffer length is not enough */
@@ -504,7 +504,7 @@ static u32 _rtl92s_get_h2c_cmdlen(u32 h2cbufferlen, u32 cmd_num, u32 *pcmd_len)
        u8 i = 0;
 
        do {
-               /* 8 - Byte aligment */
+               /* 8 - Byte alignment */
                len = H2C_TX_CMD_HDR_LEN + N_BYTE_ALIGMENT(pcmd_len[i], 8);
 
                /* Buffer length is not enough */
index ef5d394..cc8deec 100644 (file)
@@ -516,7 +516,7 @@ err:
 
 /**
  * rsi_disconnect() - This function performs the reverse of the probe function,
- *                   it deintialize the driver structure.
+ *                   it deinitializes the driver structure.
  * @pfunction: Pointer to the USB interface structure.
  *
  * Return: None.
index 5bdf7a0..d1aa3ee 100644 (file)
@@ -178,7 +178,7 @@ static struct wlcore_conf wl18xx_conf = {
        .sg = {
                .params = {
                        [WL18XX_CONF_SG_PARAM_0] = 0,
-                       /* Configuartion Parameters */
+                       /* Configuration Parameters */
                        [WL18XX_CONF_SG_ANTENNA_CONFIGURATION] = 0,
                        [WL18XX_CONF_SG_ZIGBEE_COEX] = 0,
                        [WL18XX_CONF_SG_TIME_SYNC] = 0,
index d0b7734..58898b9 100644 (file)
@@ -544,7 +544,7 @@ static int wl12xx_init_sta_role(struct wl1271 *wl, struct wl12xx_vif *wlvif)
        return 0;
 }
 
-/* vif-specific intialization */
+/* vif-specific initialization */
 static int wl12xx_init_ap_role(struct wl1271 *wl, struct wl12xx_vif *wlvif)
 {
        int ret;
index d9c5583..a966c6a 100644 (file)
@@ -487,7 +487,7 @@ static int pn533_send_cmd_async(struct pn533 *dev, u8 cmd_code,
 /*
  * pn533_send_cmd_direct_async
  *
- * The function sends a piority cmd directly to the chip omiting the cmd
+ * The function sends a priority cmd directly to the chip omitting the cmd
  * queue. It's intended to be used by chaining mechanism of received responses
  * where the host has to request every single chunk of data before scheduling
  * next cmd from the queue.
index bc20a24..779f516 100644 (file)
@@ -1051,7 +1051,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
         * sequencer is not allocated in our driver's tagset and it's
         * triggered to be freed by blk_cleanup_queue(). So we need to
         * always mark it as signaled to ensure that the "wr_cqe", which is
-        * embeded in request's payload, is not freed when __ib_process_cq()
+        * embedded in request's payload, is not freed when __ib_process_cq()
         * calls wr_cqe->done().
         */
        if ((++queue->sig_count % 32) == 0 || flush)
index c0e7d21..7507160 100644 (file)
@@ -307,7 +307,7 @@ size_t parport_ieee1284_read_byte (struct parport *port,
                if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
                end_of_data:
                        DPRINTK (KERN_DEBUG
-                                "%s: No more byte data (%Zd bytes)\n",
+                                "%s: No more byte data (%zd bytes)\n",
                                 port->name, count);
 
                        /* Go to reverse idle phase. */
index 78530d1..3e56e7d 100644 (file)
@@ -902,7 +902,7 @@ static size_t parport_pc_ecp_write_block_pio(struct parport *port,
  *     ******************************************
  */
 
-/* GCC is not inlining extern inline function later overwriten to non-inline,
+/* GCC is not inlining extern inline function later overwritten to non-inline,
    so we use outlined_ variants here.  */
 static const struct parport_operations parport_pc_ops = {
        .write_data     = parport_pc_write_data,
index ca77d23..f754453 100644 (file)
@@ -3630,7 +3630,7 @@ static int __init pci_apply_final_quirks(void)
 fs_initcall_sync(pci_apply_final_quirks);
 
 /*
- * Followings are device-specific reset methods which can be used to
+ * The following are device-specific reset methods which can be used to
  * reset a single function if other methods (e.g. FLR, PM D0->D3) are
  * not available.
  */
index 8968dd7..e8c4e4f 100644 (file)
@@ -70,7 +70,7 @@ config PINCTRL_CYGNUS_MUX
 
          The Broadcom Cygnus IOMUX driver supports group based IOMUX
          configuration, with the exception that certain individual pins
-         can be overrided to GPIO function
+         can be overridden to GPIO function
 
 config PINCTRL_NSP_GPIO
        bool "Broadcom NSP GPIO (with PINCONF) driver"
index fa0f19b..974fd68 100644 (file)
@@ -195,7 +195,7 @@ static void sr_stop_vddautocomp(struct omap_sr *sr)
 }
 
 /*
- * This function handles the intializations which have to be done
+ * This function handles the initializations which have to be done
  * only when both sr device and class driver register has
  * completed. This will be attempted to be called from both sr class
  * driver register and sr device initialization API's. Only one call
@@ -671,7 +671,7 @@ int sr_register_class(struct omap_sr_class_data *class_data)
        sr_class = class_data;
 
        /*
-        * Call into late init to do intializations that require
+        * Call into late init to do initializations that require
         * both sr driver and sr class driver to be initialized.
         */
        list_for_each_entry(sr_info, &sr_list, node)
@@ -899,7 +899,7 @@ static int __init omap_sr_probe(struct platform_device *pdev)
        list_add(&sr_info->node, &sr_list);
 
        /*
-        * Call into late init to do intializations that require
+        * Call into late init to do initializations that require
         * both sr driver and sr class driver to be initialized.
         */
        if (sr_class) {
index 9013a58..50b617a 100644 (file)
@@ -889,17 +889,16 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
                        goto err_req;
                }
 
-               down_read(&current->mm->mmap_sem);
-               pinned = get_user_pages(
+               pinned = get_user_pages_unlocked(
                                (unsigned long)xfer->loc_addr & PAGE_MASK,
                                nr_pages,
-                               dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
-                               page_list, NULL);
-               up_read(&current->mm->mmap_sem);
+                               page_list,
+                               dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0);
 
                if (pinned != nr_pages) {
                        if (pinned < 0) {
-                               rmcd_error("get_user_pages err=%ld", pinned);
+                               rmcd_error("get_user_pages_unlocked err=%ld",
+                                          pinned);
                                nr_pages = 0;
                        } else
                                rmcd_error("pinned %ld out of %ld pages",
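
get_user_pages_unlocked() takes and releases mmap_sem internally, so the driver
drops its open-coded down_read()/up_read() pair; note that the argument order
also differs from get_user_pages(), with the pages array before the gup flags.
Its signature at this point in time, from memory (treat as a reference sketch):

    long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                                 struct page **pages, unsigned int gup_flags);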
index 5dc673d..ee1b0e9 100644 (file)
@@ -1434,9 +1434,10 @@ config RTC_DRV_SUN4V
          based RTC on SUN4V systems.
 
 config RTC_DRV_SUN6I
-       tristate "Allwinner A31 RTC"
-       default MACH_SUN6I || MACH_SUN8I || COMPILE_TEST
-       depends on ARCH_SUNXI
+       bool "Allwinner A31 RTC"
+       default MACH_SUN6I || MACH_SUN8I
+       depends on COMMON_CLK
+       depends on ARCH_SUNXI || COMPILE_TEST
        help
          If you say Y here you will get support for the RTC found in
          some Allwinner SoCs like the A31 or the A64.
@@ -1719,6 +1720,17 @@ config RTC_DRV_R7301
           This driver can also be built as a module. If so, the module
           will be called rtc-r7301.
 
+config RTC_DRV_STM32
+       tristate "STM32 RTC"
+       select REGMAP_MMIO
+       depends on ARCH_STM32 || COMPILE_TEST
+       help
+          If you say yes here you get support for the STM32 On-Chip
+          Real Time Clock.
+
+          This driver can also be built as a module. If so, the module
+          will be called "rtc-stm32".
+
 comment "HID Sensor RTC drivers"
 
 config RTC_DRV_HID_SENSOR_TIME
index f13ab1c..f07297b 100644 (file)
@@ -145,6 +145,7 @@ obj-$(CONFIG_RTC_DRV_SNVS)  += rtc-snvs.o
 obj-$(CONFIG_RTC_DRV_SPEAR)    += rtc-spear.o
 obj-$(CONFIG_RTC_DRV_STARFIRE) += rtc-starfire.o
 obj-$(CONFIG_RTC_DRV_STK17TA8) += rtc-stk17ta8.o
+obj-$(CONFIG_RTC_DRV_STM32)    += rtc-stm32.o
 obj-$(CONFIG_RTC_DRV_STMP)     += rtc-stmp3xxx.o
 obj-$(CONFIG_RTC_DRV_ST_LPC)   += rtc-st-lpc.o
 obj-$(CONFIG_RTC_DRV_SUN4V)    += rtc-sun4v.o
index 9a3f2a6..21f355c 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/rtc.h>
 
 #define RTC_STATUS_ALARM1          BIT(0)
 #define RTC_STATUS_ALARM2          BIT(1)
 #define RTC_IRQ1_CONF      0x4
-#define RTC_IRQ1_AL_EN             BIT(0)
-#define RTC_IRQ1_FREQ_EN           BIT(1)
-#define RTC_IRQ1_FREQ_1HZ          BIT(2)
+#define RTC_IRQ2_CONF      0x8
+#define RTC_IRQ_AL_EN              BIT(0)
+#define RTC_IRQ_FREQ_EN                    BIT(1)
+#define RTC_IRQ_FREQ_1HZ           BIT(2)
+
 #define RTC_TIME           0xC
 #define RTC_ALARM1         0x10
-
-#define SOC_RTC_INTERRUPT   0x8
-#define SOC_RTC_ALARM1         BIT(0)
-#define SOC_RTC_ALARM2         BIT(1)
-#define SOC_RTC_ALARM1_MASK    BIT(2)
-#define SOC_RTC_ALARM2_MASK    BIT(3)
+#define RTC_ALARM2         0x14
+
+/* Armada38x SoC registers  */
+#define RTC_38X_BRIDGE_TIMING_CTL   0x0
+#define RTC_38X_PERIOD_OFFS            0
+#define RTC_38X_PERIOD_MASK            (0x3FF << RTC_38X_PERIOD_OFFS)
+#define RTC_38X_READ_DELAY_OFFS                26
+#define RTC_38X_READ_DELAY_MASK                (0x1F << RTC_38X_READ_DELAY_OFFS)
+
+/* Armada 7K/8K registers  */
+#define RTC_8K_BRIDGE_TIMING_CTL0    0x0
+#define RTC_8K_WRCLK_PERIOD_OFFS       0
+#define RTC_8K_WRCLK_PERIOD_MASK       (0xFFFF << RTC_8K_WRCLK_PERIOD_OFFS)
+#define RTC_8K_WRCLK_SETUP_OFFS                16
+#define RTC_8K_WRCLK_SETUP_MASK                (0xFFFF << RTC_8K_WRCLK_SETUP_OFFS)
+#define RTC_8K_BRIDGE_TIMING_CTL1   0x4
+#define RTC_8K_READ_DELAY_OFFS         0
+#define RTC_8K_READ_DELAY_MASK         (0xFFFF << RTC_8K_READ_DELAY_OFFS)
+
+#define RTC_8K_ISR                 0x10
+#define RTC_8K_IMR                 0x14
+#define RTC_8K_ALARM2                  BIT(0)
+
+#define SOC_RTC_INTERRUPT          0x8
+#define SOC_RTC_ALARM1                 BIT(0)
+#define SOC_RTC_ALARM2                 BIT(1)
+#define SOC_RTC_ALARM1_MASK            BIT(2)
+#define SOC_RTC_ALARM2_MASK            BIT(3)
+
+#define SAMPLE_NR 100
+
+struct value_to_freq {
+       u32 value;
+       u8 freq;
+};
 
 struct armada38x_rtc {
        struct rtc_device   *rtc_dev;
@@ -41,38 +73,153 @@ struct armada38x_rtc {
        void __iomem        *regs_soc;
        spinlock_t          lock;
        int                 irq;
+       struct value_to_freq *val_to_freq;
+       struct armada38x_rtc_data *data;
+};
+
+#define ALARM1 0
+#define ALARM2 1
+
+#define ALARM_REG(base, alarm)  ((base) + (alarm) * sizeof(u32))
+
+struct armada38x_rtc_data {
+       /* Initialize the RTC-MBUS bridge timing */
+       void (*update_mbus_timing)(struct armada38x_rtc *rtc);
+       u32 (*read_rtc_reg)(struct armada38x_rtc *rtc, u8 rtc_reg);
+       void (*clear_isr)(struct armada38x_rtc *rtc);
+       void (*unmask_interrupt)(struct armada38x_rtc *rtc);
+       u32 alarm;
 };
 
 /*
  * According to the datasheet, the OS should wait 5us after every
  * register write to the RTC hard macro so that the required update
  * can occur without holding off the system bus.
+ * According to errata RES-3124064, a write to any RTC register
+ * may fail. As a workaround, before writing to an RTC
+ * register, issue a dummy write of 0x0 twice to the RTC Status
+ * register.
  */
+
 static void rtc_delayed_write(u32 val, struct armada38x_rtc *rtc, int offset)
 {
+       writel(0, rtc->regs + RTC_STATUS);
+       writel(0, rtc->regs + RTC_STATUS);
        writel(val, rtc->regs + offset);
        udelay(5);
 }
 
+/* Update RTC-MBUS bridge timing parameters */
+static void rtc_update_38x_mbus_timing_params(struct armada38x_rtc *rtc)
+{
+       u32 reg;
+
+       reg = readl(rtc->regs_soc + RTC_38X_BRIDGE_TIMING_CTL);
+       reg &= ~RTC_38X_PERIOD_MASK;
+       reg |= 0x3FF << RTC_38X_PERIOD_OFFS; /* Maximum value */
+       reg &= ~RTC_38X_READ_DELAY_MASK;
+       reg |= 0x1F << RTC_38X_READ_DELAY_OFFS; /* Maximum value */
+       writel(reg, rtc->regs_soc + RTC_38X_BRIDGE_TIMING_CTL);
+}
+
+static void rtc_update_8k_mbus_timing_params(struct armada38x_rtc *rtc)
+{
+       u32 reg;
+
+       reg = readl(rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL0);
+       reg &= ~RTC_8K_WRCLK_PERIOD_MASK;
+       reg |= 0x3FF << RTC_8K_WRCLK_PERIOD_OFFS;
+       reg &= ~RTC_8K_WRCLK_SETUP_MASK;
+       reg |= 0x29 << RTC_8K_WRCLK_SETUP_OFFS;
+       writel(reg, rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL0);
+
+       reg = readl(rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL1);
+       reg &= ~RTC_8K_READ_DELAY_MASK;
+       reg |= 0x3F << RTC_8K_READ_DELAY_OFFS;
+       writel(reg, rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL1);
+}
+
+static u32 read_rtc_register(struct armada38x_rtc *rtc, u8 rtc_reg)
+{
+       return readl(rtc->regs + rtc_reg);
+}
+
+static u32 read_rtc_register_38x_wa(struct armada38x_rtc *rtc, u8 rtc_reg)
+{
+       int i, index_max = 0, max = 0;
+
+       for (i = 0; i < SAMPLE_NR; i++) {
+               rtc->val_to_freq[i].value = readl(rtc->regs + rtc_reg);
+               rtc->val_to_freq[i].freq = 0;
+       }
+
+       for (i = 0; i < SAMPLE_NR; i++) {
+               int j = 0;
+               u32 value = rtc->val_to_freq[i].value;
+
+               while (rtc->val_to_freq[j].freq) {
+                       if (rtc->val_to_freq[j].value == value) {
+                               rtc->val_to_freq[j].freq++;
+                               break;
+                       }
+                       j++;
+               }
+
+               if (!rtc->val_to_freq[j].freq) {
+                       rtc->val_to_freq[j].value = value;
+                       rtc->val_to_freq[j].freq = 1;
+               }
+
+               if (rtc->val_to_freq[j].freq > max) {
+                       index_max = j;
+                       max = rtc->val_to_freq[j].freq;
+               }
+
+               /*
+                * If a value already accounts for more than half of the
+                * samples it is the most frequent one and we can stop now
+                */
+               if (max > SAMPLE_NR / 2)
+                       break;
+       }
+
+       return rtc->val_to_freq[index_max].value;
+}
+
+static void armada38x_clear_isr(struct armada38x_rtc *rtc)
+{
+       u32 val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
+
+       writel(val & ~SOC_RTC_ALARM1, rtc->regs_soc + SOC_RTC_INTERRUPT);
+}
+
+static void armada38x_unmask_interrupt(struct armada38x_rtc *rtc)
+{
+       u32 val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
+
+       writel(val | SOC_RTC_ALARM1_MASK, rtc->regs_soc + SOC_RTC_INTERRUPT);
+}
+
+static void armada8k_clear_isr(struct armada38x_rtc *rtc)
+{
+       writel(RTC_8K_ALARM2, rtc->regs_soc + RTC_8K_ISR);
+}
+
+static void armada8k_unmask_interrupt(struct armada38x_rtc *rtc)
+{
+       writel(RTC_8K_ALARM2, rtc->regs_soc + RTC_8K_IMR);
+}
+
 static int armada38x_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
        struct armada38x_rtc *rtc = dev_get_drvdata(dev);
-       unsigned long time, time_check, flags;
+       unsigned long time, flags;
 
        spin_lock_irqsave(&rtc->lock, flags);
-       time = readl(rtc->regs + RTC_TIME);
-       /*
-        * WA for failing time set attempts. As stated in HW ERRATA if
-        * more than one second between two time reads is detected
-        * then read once again.
-        */
-       time_check = readl(rtc->regs + RTC_TIME);
-       if ((time_check - time) > 1)
-               time_check = readl(rtc->regs + RTC_TIME);
-
+       time = rtc->data->read_rtc_reg(rtc, RTC_TIME);
        spin_unlock_irqrestore(&rtc->lock, flags);
 
-       rtc_time_to_tm(time_check, tm);
+       rtc_time_to_tm(time, tm);
 
        return 0;
 }
@@ -87,16 +234,9 @@ static int armada38x_rtc_set_time(struct device *dev, struct rtc_time *tm)
 
        if (ret)
                goto out;
-       /*
-        * According to errata FE-3124064, Write to RTC TIME register
-        * may fail. As a workaround, after writing to RTC TIME
-        * register, issue a dummy write of 0x0 twice to RTC Status
-        * register.
-        */
+
        spin_lock_irqsave(&rtc->lock, flags);
        rtc_delayed_write(time, rtc, RTC_TIME);
-       rtc_delayed_write(0, rtc, RTC_STATUS);
-       rtc_delayed_write(0, rtc, RTC_STATUS);
        spin_unlock_irqrestore(&rtc->lock, flags);
 
 out:
@@ -107,12 +247,14 @@ static int armada38x_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
        struct armada38x_rtc *rtc = dev_get_drvdata(dev);
        unsigned long time, flags;
+       u32 reg = ALARM_REG(RTC_ALARM1, rtc->data->alarm);
+       u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
        u32 val;
 
        spin_lock_irqsave(&rtc->lock, flags);
 
-       time = readl(rtc->regs + RTC_ALARM1);
-       val = readl(rtc->regs + RTC_IRQ1_CONF) & RTC_IRQ1_AL_EN;
+       time = rtc->data->read_rtc_reg(rtc, reg);
+       val = rtc->data->read_rtc_reg(rtc, reg_irq) & RTC_IRQ_AL_EN;
 
        spin_unlock_irqrestore(&rtc->lock, flags);
 
@@ -125,9 +267,10 @@ static int armada38x_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 static int armada38x_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
        struct armada38x_rtc *rtc = dev_get_drvdata(dev);
+       u32 reg = ALARM_REG(RTC_ALARM1, rtc->data->alarm);
+       u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
        unsigned long time, flags;
        int ret = 0;
-       u32 val;
 
        ret = rtc_tm_to_time(&alrm->time, &time);
 
@@ -136,13 +279,11 @@ static int armada38x_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 
        spin_lock_irqsave(&rtc->lock, flags);
 
-       rtc_delayed_write(time, rtc, RTC_ALARM1);
+       rtc_delayed_write(time, rtc, reg);
 
        if (alrm->enabled) {
-                       rtc_delayed_write(RTC_IRQ1_AL_EN, rtc, RTC_IRQ1_CONF);
-                       val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
-                       writel(val | SOC_RTC_ALARM1_MASK,
-                              rtc->regs_soc + SOC_RTC_INTERRUPT);
+               rtc_delayed_write(RTC_IRQ_AL_EN, rtc, reg_irq);
+               rtc->data->unmask_interrupt(rtc);
        }
 
        spin_unlock_irqrestore(&rtc->lock, flags);
@@ -155,14 +296,15 @@ static int armada38x_rtc_alarm_irq_enable(struct device *dev,
                                         unsigned int enabled)
 {
        struct armada38x_rtc *rtc = dev_get_drvdata(dev);
+       u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
        unsigned long flags;
 
        spin_lock_irqsave(&rtc->lock, flags);
 
        if (enabled)
-               rtc_delayed_write(RTC_IRQ1_AL_EN, rtc, RTC_IRQ1_CONF);
+               rtc_delayed_write(RTC_IRQ_AL_EN, rtc, reg_irq);
        else
-               rtc_delayed_write(0, rtc, RTC_IRQ1_CONF);
+               rtc_delayed_write(0, rtc, reg_irq);
 
        spin_unlock_irqrestore(&rtc->lock, flags);
 
@@ -174,24 +316,23 @@ static irqreturn_t armada38x_rtc_alarm_irq(int irq, void *data)
        struct armada38x_rtc *rtc = data;
        u32 val;
        int event = RTC_IRQF | RTC_AF;
+       u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
 
        dev_dbg(&rtc->rtc_dev->dev, "%s:irq(%d)\n", __func__, irq);
 
        spin_lock(&rtc->lock);
 
-       val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
-
-       writel(val & ~SOC_RTC_ALARM1, rtc->regs_soc + SOC_RTC_INTERRUPT);
-       val = readl(rtc->regs + RTC_IRQ1_CONF);
-       /* disable all the interrupts for alarm 1 */
-       rtc_delayed_write(0, rtc, RTC_IRQ1_CONF);
+       rtc->data->clear_isr(rtc);
+       val = rtc->data->read_rtc_reg(rtc, reg_irq);
+       /* disable all the interrupts for the alarm */
+       rtc_delayed_write(0, rtc, reg_irq);
        /* Ack the event */
-       rtc_delayed_write(RTC_STATUS_ALARM1, rtc, RTC_STATUS);
+       rtc_delayed_write(1 << rtc->data->alarm, rtc, RTC_STATUS);
 
        spin_unlock(&rtc->lock);
 
-       if (val & RTC_IRQ1_FREQ_EN) {
-               if (val & RTC_IRQ1_FREQ_1HZ)
+       if (val & RTC_IRQ_FREQ_EN) {
+               if (val & RTC_IRQ_FREQ_1HZ)
                        event |= RTC_UF;
                else
                        event |= RTC_PF;
@@ -202,7 +343,7 @@ static irqreturn_t armada38x_rtc_alarm_irq(int irq, void *data)
        return IRQ_HANDLED;
 }
 
-static struct rtc_class_ops armada38x_rtc_ops = {
+static const struct rtc_class_ops armada38x_rtc_ops = {
        .read_time = armada38x_rtc_read_time,
        .set_time = armada38x_rtc_set_time,
        .read_alarm = armada38x_rtc_read_alarm,
@@ -210,17 +351,65 @@ static struct rtc_class_ops armada38x_rtc_ops = {
        .alarm_irq_enable = armada38x_rtc_alarm_irq_enable,
 };
 
+static const struct rtc_class_ops armada38x_rtc_ops_noirq = {
+       .read_time = armada38x_rtc_read_time,
+       .set_time = armada38x_rtc_set_time,
+       .read_alarm = armada38x_rtc_read_alarm,
+};
+
+static const struct armada38x_rtc_data armada38x_data = {
+       .update_mbus_timing = rtc_update_38x_mbus_timing_params,
+       .read_rtc_reg = read_rtc_register_38x_wa,
+       .clear_isr = armada38x_clear_isr,
+       .unmask_interrupt = armada38x_unmask_interrupt,
+       .alarm = ALARM1,
+};
+
+static const struct armada38x_rtc_data armada8k_data = {
+       .update_mbus_timing = rtc_update_8k_mbus_timing_params,
+       .read_rtc_reg = read_rtc_register,
+       .clear_isr = armada8k_clear_isr,
+       .unmask_interrupt = armada8k_unmask_interrupt,
+       .alarm = ALARM2,
+};
+
+#ifdef CONFIG_OF
+static const struct of_device_id armada38x_rtc_of_match_table[] = {
+       {
+               .compatible = "marvell,armada-380-rtc",
+               .data = &armada38x_data,
+       },
+       {
+               .compatible = "marvell,armada-8k-rtc",
+               .data = &armada8k_data,
+       },
+       {}
+};
+MODULE_DEVICE_TABLE(of, armada38x_rtc_of_match_table);
+#endif
+
 static __init int armada38x_rtc_probe(struct platform_device *pdev)
 {
+       const struct rtc_class_ops *ops;
        struct resource *res;
        struct armada38x_rtc *rtc;
+       const struct of_device_id *match;
        int ret;
 
+       match = of_match_device(armada38x_rtc_of_match_table, &pdev->dev);
+       if (!match)
+               return -ENODEV;
+
        rtc = devm_kzalloc(&pdev->dev, sizeof(struct armada38x_rtc),
                            GFP_KERNEL);
        if (!rtc)
                return -ENOMEM;
 
+       rtc->val_to_freq = devm_kcalloc(&pdev->dev, SAMPLE_NR,
+                               sizeof(struct value_to_freq), GFP_KERNEL);
+       if (!rtc->val_to_freq)
+               return -ENOMEM;
+
        spin_lock_init(&rtc->lock);
 
        res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rtc");
@@ -242,19 +431,27 @@ static __init int armada38x_rtc_probe(struct platform_device *pdev)
                                0, pdev->name, rtc) < 0) {
                dev_warn(&pdev->dev, "Interrupt not available.\n");
                rtc->irq = -1;
+       }
+       platform_set_drvdata(pdev, rtc);
+
+       if (rtc->irq != -1) {
+               device_init_wakeup(&pdev->dev, 1);
+               ops = &armada38x_rtc_ops;
+       } else {
                /*
                 * If there is no interrupt available then we can't
                 * use the alarm
                 */
-               armada38x_rtc_ops.set_alarm = NULL;
-               armada38x_rtc_ops.alarm_irq_enable = NULL;
+               ops = &armada38x_rtc_ops_noirq;
        }
-       platform_set_drvdata(pdev, rtc);
-       if (rtc->irq != -1)
-               device_init_wakeup(&pdev->dev, 1);
+       rtc->data = (struct armada38x_rtc_data *)match->data;
+
+
+       /* Update RTC-MBUS bridge timing parameters */
+       rtc->data->update_mbus_timing(rtc);
 
        rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, pdev->name,
-                                       &armada38x_rtc_ops, THIS_MODULE);
+                                               ops, THIS_MODULE);
        if (IS_ERR(rtc->rtc_dev)) {
                ret = PTR_ERR(rtc->rtc_dev);
                dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret);
@@ -280,6 +477,9 @@ static int armada38x_rtc_resume(struct device *dev)
        if (device_may_wakeup(dev)) {
                struct armada38x_rtc *rtc = dev_get_drvdata(dev);
 
+               /* Update RTC-MBUS bridge timing parameters */
+               rtc->data->update_mbus_timing(rtc);
+
                return disable_irq_wake(rtc->irq);
        }
 
@@ -290,14 +490,6 @@ static int armada38x_rtc_resume(struct device *dev)
 static SIMPLE_DEV_PM_OPS(armada38x_rtc_pm_ops,
                         armada38x_rtc_suspend, armada38x_rtc_resume);
 
-#ifdef CONFIG_OF
-static const struct of_device_id armada38x_rtc_of_match_table[] = {
-       { .compatible = "marvell,armada-380-rtc", },
-       {}
-};
-MODULE_DEVICE_TABLE(of, armada38x_rtc_of_match_table);
-#endif
-
 static struct platform_driver armada38x_rtc_driver = {
        .driver         = {
                .name   = "armada38x-rtc",
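
The heart of the Armada 38x support above is read_rtc_register_38x_wa(): per
the referenced erratum, reads can glitch, so the driver samples the register
SAMPLE_NR times and returns the most frequent value, bailing out early once one
value holds a strict majority. The same idea as a self-contained sketch
(illustrative, not the driver's code):

    static u32 majority_read(u32 (*sample)(void *ctx), void *ctx)
    {
            u32 vals[100];
            int freq[100];
            int i, j, nr = 0, best = 0;

            for (i = 0; i < 100; i++) {
                    u32 v = sample(ctx);

                    /* find v among the values seen so far */
                    for (j = 0; j < nr && vals[j] != v; j++)
                            ;
                    if (j == nr) {          /* first occurrence of v */
                            vals[nr] = v;
                            freq[nr] = 0;
                            nr++;
                    }
                    if (++freq[j] > freq[best])
                            best = j;
                    if (freq[best] > 100 / 2)  /* strict majority: done */
                            break;
            }
            return vals[best];
    }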
index 84d6e02..2ba44cc 100644 (file)
@@ -56,7 +56,7 @@ static int au1xtoy_rtc_set_time(struct device *dev, struct rtc_time *tm)
        return 0;
 }
 
-static struct rtc_class_ops au1xtoy_rtc_ops = {
+static const struct rtc_class_ops au1xtoy_rtc_ops = {
        .read_time      = au1xtoy_rtc_read_time,
        .set_time       = au1xtoy_rtc_set_time,
 };
index 535a5f9..15344b7 100644 (file)
@@ -333,7 +333,7 @@ static int bfin_rtc_proc(struct device *dev, struct seq_file *seq)
 #undef yesno
 }
 
-static struct rtc_class_ops bfin_rtc_ops = {
+static const struct rtc_class_ops bfin_rtc_ops = {
        .read_time     = bfin_rtc_read_time,
        .set_time      = bfin_rtc_set_time,
        .read_alarm    = bfin_rtc_read_alarm,
index 3977424..2b22393 100644 (file)
@@ -34,6 +34,7 @@
 #define BQ32K_CALIBRATION      0x07    /* CAL_CFG1, calibration and control */
 #define BQ32K_TCH2             0x08    /* Trickle charge enable */
 #define BQ32K_CFG2             0x09    /* Trickle charger control */
+#define BQ32K_TCFE             BIT(6)  /* Trickle charge FET bypass */
 
 struct bq32k_regs {
        uint8_t         seconds;
@@ -188,6 +189,65 @@ static int trickle_charger_of_init(struct device *dev, struct device_node *node)
        return 0;
 }
 
+static ssize_t bq32k_sysfs_show_tricklecharge_bypass(struct device *dev,
+                                              struct device_attribute *attr,
+                                              char *buf)
+{
+       int reg, error;
+
+       error = bq32k_read(dev, &reg, BQ32K_CFG2, 1);
+       if (error)
+               return error;
+
+       return sprintf(buf, "%d\n", (reg & BQ32K_TCFE) ? 1 : 0);
+}
+
+static ssize_t bq32k_sysfs_store_tricklecharge_bypass(struct device *dev,
+                                               struct device_attribute *attr,
+                                               const char *buf, size_t count)
+{
+       int reg, enable, error;
+
+       if (kstrtoint(buf, 0, &enable))
+               return -EINVAL;
+
+       error = bq32k_read(dev, &reg, BQ32K_CFG2, 1);
+       if (error)
+               return error;
+
+       if (enable) {
+               reg |= BQ32K_TCFE;
+               error = bq32k_write(dev, &reg, BQ32K_CFG2, 1);
+               if (error)
+                       return error;
+
+               dev_info(dev, "Enabled trickle charge FET bypass.\n");
+       } else {
+               reg &= ~BQ32K_TCFE;
+               error = bq32k_write(dev, &reg, BQ32K_CFG2, 1);
+               if (error)
+                       return error;
+
+               dev_info(dev, "Disabled trickle charge FET bypass.\n");
+       }
+
+       return count;
+}
+
+static DEVICE_ATTR(trickle_charge_bypass, 0644,
+                  bq32k_sysfs_show_tricklecharge_bypass,
+                  bq32k_sysfs_store_tricklecharge_bypass);
+
+static int bq32k_sysfs_register(struct device *dev)
+{
+       return device_create_file(dev, &dev_attr_trickle_charge_bypass);
+}
+
+static void bq32k_sysfs_unregister(struct device *dev)
+{
+       device_remove_file(dev, &dev_attr_trickle_charge_bypass);
+}
+
 static int bq32k_probe(struct i2c_client *client,
                                const struct i2c_device_id *id)
 {
@@ -224,11 +284,26 @@ static int bq32k_probe(struct i2c_client *client,
        if (IS_ERR(rtc))
                return PTR_ERR(rtc);
 
+       error = bq32k_sysfs_register(&client->dev);
+       if (error) {
+               dev_err(&client->dev,
+                       "Unable to create sysfs entries for rtc bq32000\n");
+               return error;
+       }
+
+
        i2c_set_clientdata(client, rtc);
 
        return 0;
 }
 
+static int bq32k_remove(struct i2c_client *client)
+{
+       bq32k_sysfs_unregister(&client->dev);
+
+       return 0;
+}
+
 static const struct i2c_device_id bq32k_id[] = {
        { "bq32000", 0 },
        { }
@@ -240,6 +315,7 @@ static struct i2c_driver bq32k_driver = {
                .name   = "bq32k",
        },
        .probe          = bq32k_probe,
+       .remove         = bq32k_remove,
        .id_table       = bq32k_id,
 };
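
A side note on the store handler added above: kstrtoint() accepts any integer
and the code treats non-zero as "enable". A common alternative sketch uses
kstrtobool(), which has been available since around v4.6 and also accepts
y/n/on/off spellings (hypothetical variant, not what the patch does):

    bool enable;

    if (kstrtobool(buf, &enable))
            return -EINVAL;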
 
index 94067f8..f225cd8 100644 (file)
@@ -116,7 +116,7 @@ static int dm355evm_rtc_set_time(struct device *dev, struct rtc_time *tm)
        return 0;
 }
 
-static struct rtc_class_ops dm355evm_rtc_ops = {
+static const struct rtc_class_ops dm355evm_rtc_ops = {
        .read_time      = dm355evm_rtc_read_time,
        .set_time       = dm355evm_rtc_set_time,
 };
index b1f20d8..9bb39a0 100644 (file)
 #include <linux/slab.h>
 #include <linux/regmap.h>
 
-#define DS3232_REG_SECONDS     0x00
-#define DS3232_REG_MINUTES     0x01
-#define DS3232_REG_HOURS       0x02
-#define DS3232_REG_AMPM                0x02
-#define DS3232_REG_DAY         0x03
-#define DS3232_REG_DATE                0x04
-#define DS3232_REG_MONTH       0x05
-#define DS3232_REG_CENTURY     0x05
-#define DS3232_REG_YEAR                0x06
-#define DS3232_REG_ALARM1         0x07 /* Alarm 1 BASE */
-#define DS3232_REG_ALARM2         0x0B /* Alarm 2 BASE */
-#define DS3232_REG_CR          0x0E    /* Control register */
-#      define DS3232_REG_CR_nEOSC        0x80
-#       define DS3232_REG_CR_INTCN        0x04
-#       define DS3232_REG_CR_A2IE        0x02
-#       define DS3232_REG_CR_A1IE        0x01
-
-#define DS3232_REG_SR  0x0F    /* control/status register */
-#      define DS3232_REG_SR_OSF   0x80
-#       define DS3232_REG_SR_BSY   0x04
-#       define DS3232_REG_SR_A2F   0x02
-#       define DS3232_REG_SR_A1F   0x01
+#define DS3232_REG_SECONDS      0x00
+#define DS3232_REG_MINUTES      0x01
+#define DS3232_REG_HOURS        0x02
+#define DS3232_REG_AMPM         0x02
+#define DS3232_REG_DAY          0x03
+#define DS3232_REG_DATE         0x04
+#define DS3232_REG_MONTH        0x05
+#define DS3232_REG_CENTURY      0x05
+#define DS3232_REG_YEAR         0x06
+#define DS3232_REG_ALARM1       0x07       /* Alarm 1 BASE */
+#define DS3232_REG_ALARM2       0x0B       /* Alarm 2 BASE */
+#define DS3232_REG_CR           0x0E       /* Control register */
+#       define DS3232_REG_CR_nEOSC   0x80
+#       define DS3232_REG_CR_INTCN   0x04
+#       define DS3232_REG_CR_A2IE    0x02
+#       define DS3232_REG_CR_A1IE    0x01
+
+#define DS3232_REG_SR           0x0F       /* control/status register */
+#       define DS3232_REG_SR_OSF     0x80
+#       define DS3232_REG_SR_BSY     0x04
+#       define DS3232_REG_SR_A2F     0x02
+#       define DS3232_REG_SR_A1F     0x01
 
 struct ds3232 {
        struct device *dev;
@@ -363,6 +363,9 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq,
        if (ret)
                return ret;
 
+       if (ds3232->irq > 0)
+               device_init_wakeup(dev, 1);
+
        ds3232->rtc = devm_rtc_device_register(dev, name, &ds3232_rtc_ops,
                                                THIS_MODULE);
        if (IS_ERR(ds3232->rtc))
@@ -374,10 +377,10 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq,
                                                IRQF_SHARED | IRQF_ONESHOT,
                                                name, dev);
                if (ret) {
+                       device_set_wakeup_capable(dev, 0);
                        ds3232->irq = 0;
                        dev_err(dev, "unable to request IRQ\n");
-               } else
-                       device_init_wakeup(dev, 1);
+               }
        }
 
        return 0;
@@ -420,6 +423,7 @@ static int ds3232_i2c_probe(struct i2c_client *client,
        static const struct regmap_config config = {
                .reg_bits = 8,
                .val_bits = 8,
+               .max_register = 0x13,
        };
 
        regmap = devm_regmap_init_i2c(client, &config);
@@ -479,6 +483,7 @@ static int ds3234_probe(struct spi_device *spi)
        static const struct regmap_config config = {
                .reg_bits = 8,
                .val_bits = 8,
+               .max_register = 0x13,
                .write_flag_mask = 0x80,
        };
        struct regmap *regmap;
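
Setting .max_register gives regmap a bound on the valid address range, so stray reads or writes past the last documented register (presumably 0x13 on these chips) are refused rather than reaching the bus. A minimal sketch of the same pattern, with illustrative names:

#include <linux/err.h>
#include <linux/i2c.h>
#include <linux/regmap.h>

/* Illustrative config: accesses above max_register are rejected by regmap. */
static const struct regmap_config example_regmap_config = {
	.reg_bits	= 8,
	.val_bits	= 8,
	.max_register	= 0x13,
};

static int example_probe(struct i2c_client *client,
			 const struct i2c_device_id *id)
{
	struct regmap *map = devm_regmap_init_i2c(client,
						  &example_regmap_config);

	if (IS_ERR(map))
		return PTR_ERR(map);

	/* e.g. regmap_read(map, 0x14, &val) now fails without bus traffic */
	return 0;
}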
index 688debc..ccf0dba 100644 (file)
@@ -159,9 +159,16 @@ static int gemini_rtc_remove(struct platform_device *pdev)
        return 0;
 }
 
+static const struct of_device_id gemini_rtc_dt_match[] = {
+       { .compatible = "cortina,gemini-rtc" },
+       { }
+};
+MODULE_DEVICE_TABLE(of, gemini_rtc_dt_match);
+
 static struct platform_driver gemini_rtc_driver = {
        .driver         = {
                .name   = DRV_NAME,
+               .of_match_table = gemini_rtc_dt_match,
        },
        .probe          = gemini_rtc_probe,
        .remove         = gemini_rtc_remove,
index 67b56b8..6b54f6c 100644 (file)
  * @pdev: pointer to platform device
  * @rtc: pointer to rtc struct
  * @ioaddr: IO registers pointer
- * @irq: dryice normal interrupt
  * @clk: input reference clock
  * @dsr: copy of the DSR register
  * @irq_lock: interrupt enable register (DIER) lock
@@ -120,7 +119,6 @@ struct imxdi_dev {
        struct platform_device *pdev;
        struct rtc_device *rtc;
        void __iomem *ioaddr;
-       int irq;
        struct clk *clk;
        u32 dsr;
        spinlock_t irq_lock;
@@ -668,7 +666,7 @@ static int dryice_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
        return 0;
 }
 
-static struct rtc_class_ops dryice_rtc_ops = {
+static const struct rtc_class_ops dryice_rtc_ops = {
        .read_time              = dryice_rtc_read_time,
        .set_mmss               = dryice_rtc_set_mmss,
        .alarm_irq_enable       = dryice_rtc_alarm_irq_enable,
@@ -677,9 +675,9 @@ static struct rtc_class_ops dryice_rtc_ops = {
 };
 
 /*
- * dryice "normal" interrupt handler
+ * interrupt handler for the dryice "normal" and security violation interrupts
  */
-static irqreturn_t dryice_norm_irq(int irq, void *dev_id)
+static irqreturn_t dryice_irq(int irq, void *dev_id)
 {
        struct imxdi_dev *imxdi = dev_id;
        u32 dsr, dier;
@@ -765,6 +763,7 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
 {
        struct resource *res;
        struct imxdi_dev *imxdi;
+       int norm_irq, sec_irq;
        int rc;
 
        imxdi = devm_kzalloc(&pdev->dev, sizeof(*imxdi), GFP_KERNEL);
@@ -780,9 +779,16 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
 
        spin_lock_init(&imxdi->irq_lock);
 
-       imxdi->irq = platform_get_irq(pdev, 0);
-       if (imxdi->irq < 0)
-               return imxdi->irq;
+       norm_irq = platform_get_irq(pdev, 0);
+       if (norm_irq < 0)
+               return norm_irq;
+
+       /* the second IRQ is the security violation IRQ; keep it optional
+        * so that existing device trees without it keep working
+        */
+       sec_irq = platform_get_irq(pdev, 1);
+       if (sec_irq <= 0)
+               sec_irq = IRQ_NOTCONNECTED;
 
        init_waitqueue_head(&imxdi->write_wait);
 
@@ -808,13 +814,20 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
        if (rc != 0)
                goto err;
 
-       rc = devm_request_irq(&pdev->dev, imxdi->irq, dryice_norm_irq,
-                       IRQF_SHARED, pdev->name, imxdi);
+       rc = devm_request_irq(&pdev->dev, norm_irq, dryice_irq,
+                             IRQF_SHARED, pdev->name, imxdi);
        if (rc) {
                dev_warn(&pdev->dev, "interrupt not available.\n");
                goto err;
        }
 
+       rc = devm_request_irq(&pdev->dev, sec_irq, dryice_irq,
+                             IRQF_SHARED, pdev->name, imxdi);
+       if (rc) {
+               dev_warn(&pdev->dev, "security violation interrupt not available.\n");
+               /* this is not an error, see above */
+       }
+
        platform_set_drvdata(pdev, imxdi);
        imxdi->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
                                  &dryice_rtc_ops, THIS_MODULE);
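
platform_get_irq() returns a negative errno when an interrupt is absent; substituting IRQ_NOTCONNECTED makes the later request fail cleanly (-ENOTCONN) rather than tripping over IRQ 0, which is what keeps the second interrupt optional here. A condensed sketch of the pattern, with an illustrative helper name:

#include <linux/interrupt.h>
#include <linux/platform_device.h>

/* Illustrative helper: request a platform IRQ that may legitimately be
 * absent from the device tree. */
static int request_optional_irq(struct platform_device *pdev, int index,
				irq_handler_t handler, void *data)
{
	int irq = platform_get_irq(pdev, index);

	if (irq <= 0)
		irq = IRQ_NOTCONNECTED;	/* request fails cleanly, never IRQ 0 */

	return devm_request_irq(&pdev->dev, irq, handler, IRQF_SHARED,
				pdev->name, data);
}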
index 22a9ec4..e04ca54 100644 (file)
@@ -138,7 +138,7 @@ err:
        return ret;
 }
 
-static struct rtc_class_ops  ls1x_rtc_ops = {
+static const struct rtc_class_ops  ls1x_rtc_ops = {
        .read_time      = ls1x_rtc_read_time,
        .set_time       = ls1x_rtc_set_time,
 };
index 0eeb571..02af045 100644 (file)
 #include <linux/module.h>
 #include <linux/rtc.h>
 #include <linux/platform_device.h>
-#include <linux/platform_data/rtc-m48t86.h>
 #include <linux/bcd.h>
+#include <linux/io.h>
 
-#define M48T86_REG_SEC         0x00
-#define M48T86_REG_SECALRM     0x01
-#define M48T86_REG_MIN         0x02
-#define M48T86_REG_MINALRM     0x03
-#define M48T86_REG_HOUR                0x04
-#define M48T86_REG_HOURALRM    0x05
-#define M48T86_REG_DOW         0x06 /* 1 = sunday */
-#define M48T86_REG_DOM         0x07
-#define M48T86_REG_MONTH       0x08 /* 1 - 12 */
-#define M48T86_REG_YEAR                0x09 /* 0 - 99 */
-#define M48T86_REG_A           0x0A
-#define M48T86_REG_B           0x0B
-#define M48T86_REG_C           0x0C
-#define M48T86_REG_D           0x0D
-
-#define M48T86_REG_B_H24       (1 << 1)
-#define M48T86_REG_B_DM                (1 << 2)
-#define M48T86_REG_B_SET       (1 << 7)
-#define M48T86_REG_D_VRT       (1 << 7)
+#define M48T86_SEC             0x00
+#define M48T86_SECALRM         0x01
+#define M48T86_MIN             0x02
+#define M48T86_MINALRM         0x03
+#define M48T86_HOUR            0x04
+#define M48T86_HOURALRM                0x05
+#define M48T86_DOW             0x06 /* 1 = sunday */
+#define M48T86_DOM             0x07
+#define M48T86_MONTH           0x08 /* 1 - 12 */
+#define M48T86_YEAR            0x09 /* 0 - 99 */
+#define M48T86_A               0x0a
+#define M48T86_B               0x0b
+#define M48T86_B_SET           BIT(7)
+#define M48T86_B_DM            BIT(2)
+#define M48T86_B_H24           BIT(1)
+#define M48T86_C               0x0c
+#define M48T86_D               0x0d
+#define M48T86_D_VRT           BIT(7)
+#define M48T86_NVRAM(x)                (0x0e + (x))
+#define M48T86_NVRAM_LEN       114
+
+struct m48t86_rtc_info {
+       void __iomem *index_reg;
+       void __iomem *data_reg;
+       struct rtc_device *rtc;
+};
+
+static unsigned char m48t86_readb(struct device *dev, unsigned long addr)
+{
+       struct m48t86_rtc_info *info = dev_get_drvdata(dev);
+       unsigned char value;
+
+       writeb(addr, info->index_reg);
+       value = readb(info->data_reg);
+
+       return value;
+}
+
+static void m48t86_writeb(struct device *dev,
+                         unsigned char value, unsigned long addr)
+{
+       struct m48t86_rtc_info *info = dev_get_drvdata(dev);
+
+       writeb(addr, info->index_reg);
+       writeb(value, info->data_reg);
+}
 
 static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
        unsigned char reg;
-       struct platform_device *pdev = to_platform_device(dev);
-       struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
 
-       reg = ops->readbyte(M48T86_REG_B);
+       reg = m48t86_readb(dev, M48T86_B);
 
-       if (reg & M48T86_REG_B_DM) {
+       if (reg & M48T86_B_DM) {
                /* data (binary) mode */
-               tm->tm_sec      = ops->readbyte(M48T86_REG_SEC);
-               tm->tm_min      = ops->readbyte(M48T86_REG_MIN);
-               tm->tm_hour     = ops->readbyte(M48T86_REG_HOUR) & 0x3F;
-               tm->tm_mday     = ops->readbyte(M48T86_REG_DOM);
+               tm->tm_sec      = m48t86_readb(dev, M48T86_SEC);
+               tm->tm_min      = m48t86_readb(dev, M48T86_MIN);
+               tm->tm_hour     = m48t86_readb(dev, M48T86_HOUR) & 0x3f;
+               tm->tm_mday     = m48t86_readb(dev, M48T86_DOM);
                /* tm_mon is 0-11 */
-               tm->tm_mon      = ops->readbyte(M48T86_REG_MONTH) - 1;
-               tm->tm_year     = ops->readbyte(M48T86_REG_YEAR) + 100;
-               tm->tm_wday     = ops->readbyte(M48T86_REG_DOW);
+               tm->tm_mon      = m48t86_readb(dev, M48T86_MONTH) - 1;
+               tm->tm_year     = m48t86_readb(dev, M48T86_YEAR) + 100;
+               tm->tm_wday     = m48t86_readb(dev, M48T86_DOW);
        } else {
                /* bcd mode */
-               tm->tm_sec      = bcd2bin(ops->readbyte(M48T86_REG_SEC));
-               tm->tm_min      = bcd2bin(ops->readbyte(M48T86_REG_MIN));
-               tm->tm_hour     = bcd2bin(ops->readbyte(M48T86_REG_HOUR) & 0x3F);
-               tm->tm_mday     = bcd2bin(ops->readbyte(M48T86_REG_DOM));
+               tm->tm_sec      = bcd2bin(m48t86_readb(dev, M48T86_SEC));
+               tm->tm_min      = bcd2bin(m48t86_readb(dev, M48T86_MIN));
+               tm->tm_hour     = bcd2bin(m48t86_readb(dev, M48T86_HOUR) &
+                                         0x3f);
+               tm->tm_mday     = bcd2bin(m48t86_readb(dev, M48T86_DOM));
                /* tm_mon is 0-11 */
-               tm->tm_mon      = bcd2bin(ops->readbyte(M48T86_REG_MONTH)) - 1;
-               tm->tm_year     = bcd2bin(ops->readbyte(M48T86_REG_YEAR)) + 100;
-               tm->tm_wday     = bcd2bin(ops->readbyte(M48T86_REG_DOW));
+               tm->tm_mon      = bcd2bin(m48t86_readb(dev, M48T86_MONTH)) - 1;
+               tm->tm_year     = bcd2bin(m48t86_readb(dev, M48T86_YEAR)) + 100;
+               tm->tm_wday     = bcd2bin(m48t86_readb(dev, M48T86_DOW));
        }
 
        /* correct the hour if the clock is in 12h mode */
-       if (!(reg & M48T86_REG_B_H24))
-               if (ops->readbyte(M48T86_REG_HOUR) & 0x80)
+       if (!(reg & M48T86_B_H24))
+               if (m48t86_readb(dev, M48T86_HOUR) & 0x80)
                        tm->tm_hour += 12;
 
        return rtc_valid_tm(tm);
@@ -80,38 +106,36 @@ static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
 static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm)
 {
        unsigned char reg;
-       struct platform_device *pdev = to_platform_device(dev);
-       struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
 
-       reg = ops->readbyte(M48T86_REG_B);
+       reg = m48t86_readb(dev, M48T86_B);
 
        /* update flag and 24h mode */
-       reg |= M48T86_REG_B_SET | M48T86_REG_B_H24;
-       ops->writebyte(reg, M48T86_REG_B);
+       reg |= M48T86_B_SET | M48T86_B_H24;
+       m48t86_writeb(dev, reg, M48T86_B);
 
-       if (reg & M48T86_REG_B_DM) {
+       if (reg & M48T86_B_DM) {
                /* data (binary) mode */
-               ops->writebyte(tm->tm_sec, M48T86_REG_SEC);
-               ops->writebyte(tm->tm_min, M48T86_REG_MIN);
-               ops->writebyte(tm->tm_hour, M48T86_REG_HOUR);
-               ops->writebyte(tm->tm_mday, M48T86_REG_DOM);
-               ops->writebyte(tm->tm_mon + 1, M48T86_REG_MONTH);
-               ops->writebyte(tm->tm_year % 100, M48T86_REG_YEAR);
-               ops->writebyte(tm->tm_wday, M48T86_REG_DOW);
+               m48t86_writeb(dev, tm->tm_sec, M48T86_SEC);
+               m48t86_writeb(dev, tm->tm_min, M48T86_MIN);
+               m48t86_writeb(dev, tm->tm_hour, M48T86_HOUR);
+               m48t86_writeb(dev, tm->tm_mday, M48T86_DOM);
+               m48t86_writeb(dev, tm->tm_mon + 1, M48T86_MONTH);
+               m48t86_writeb(dev, tm->tm_year % 100, M48T86_YEAR);
+               m48t86_writeb(dev, tm->tm_wday, M48T86_DOW);
        } else {
                /* bcd mode */
-               ops->writebyte(bin2bcd(tm->tm_sec), M48T86_REG_SEC);
-               ops->writebyte(bin2bcd(tm->tm_min), M48T86_REG_MIN);
-               ops->writebyte(bin2bcd(tm->tm_hour), M48T86_REG_HOUR);
-               ops->writebyte(bin2bcd(tm->tm_mday), M48T86_REG_DOM);
-               ops->writebyte(bin2bcd(tm->tm_mon + 1), M48T86_REG_MONTH);
-               ops->writebyte(bin2bcd(tm->tm_year % 100), M48T86_REG_YEAR);
-               ops->writebyte(bin2bcd(tm->tm_wday), M48T86_REG_DOW);
+               m48t86_writeb(dev, bin2bcd(tm->tm_sec), M48T86_SEC);
+               m48t86_writeb(dev, bin2bcd(tm->tm_min), M48T86_MIN);
+               m48t86_writeb(dev, bin2bcd(tm->tm_hour), M48T86_HOUR);
+               m48t86_writeb(dev, bin2bcd(tm->tm_mday), M48T86_DOM);
+               m48t86_writeb(dev, bin2bcd(tm->tm_mon + 1), M48T86_MONTH);
+               m48t86_writeb(dev, bin2bcd(tm->tm_year % 100), M48T86_YEAR);
+               m48t86_writeb(dev, bin2bcd(tm->tm_wday), M48T86_DOW);
        }
 
        /* update ended */
-       reg &= ~M48T86_REG_B_SET;
-       ops->writebyte(reg, M48T86_REG_B);
+       reg &= ~M48T86_B_SET;
+       m48t86_writeb(dev, reg, M48T86_B);
 
        return 0;
 }
@@ -119,18 +143,16 @@ static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm)
 static int m48t86_rtc_proc(struct device *dev, struct seq_file *seq)
 {
        unsigned char reg;
-       struct platform_device *pdev = to_platform_device(dev);
-       struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
 
-       reg = ops->readbyte(M48T86_REG_B);
+       reg = m48t86_readb(dev, M48T86_B);
 
        seq_printf(seq, "mode\t\t: %s\n",
-                (reg & M48T86_REG_B_DM) ? "binary" : "bcd");
+                  (reg & M48T86_B_DM) ? "binary" : "bcd");
 
-       reg = ops->readbyte(M48T86_REG_D);
+       reg = m48t86_readb(dev, M48T86_D);
 
        seq_printf(seq, "battery\t\t: %s\n",
-                (reg & M48T86_REG_D_VRT) ? "ok" : "exhausted");
+                  (reg & M48T86_D_VRT) ? "ok" : "exhausted");
 
        return 0;
 }
@@ -141,25 +163,116 @@ static const struct rtc_class_ops m48t86_rtc_ops = {
        .proc           = m48t86_rtc_proc,
 };
 
-static int m48t86_rtc_probe(struct platform_device *dev)
+static ssize_t m48t86_nvram_read(struct file *filp, struct kobject *kobj,
+                                struct bin_attribute *attr,
+                                char *buf, loff_t off, size_t count)
+{
+       struct device *dev = kobj_to_dev(kobj);
+       unsigned int i;
+
+       for (i = 0; i < count; i++)
+               buf[i] = m48t86_readb(dev, M48T86_NVRAM(off + i));
+
+       return count;
+}
+
+static ssize_t m48t86_nvram_write(struct file *filp, struct kobject *kobj,
+                                 struct bin_attribute *attr,
+                                 char *buf, loff_t off, size_t count)
 {
+       struct device *dev = kobj_to_dev(kobj);
+       unsigned int i;
+
+       for (i = 0; i < count; i++)
+               m48t86_writeb(dev, buf[i], M48T86_NVRAM(off + i));
+
+       return count;
+}
+
+static BIN_ATTR(nvram, 0644, m48t86_nvram_read, m48t86_nvram_write,
+               M48T86_NVRAM_LEN);
+
+/*
+ * The RTC is an optional feature at purchase time on some Technologic Systems
+ * boards. Verify that it actually exists by checking if the last two bytes
+ * of the NVRAM can be changed.
+ *
+ * This is based on the method used in their rtc7800.c example.
+ */
+static bool m48t86_verify_chip(struct platform_device *pdev)
+{
+       unsigned int offset0 = M48T86_NVRAM(M48T86_NVRAM_LEN - 2);
+       unsigned int offset1 = M48T86_NVRAM(M48T86_NVRAM_LEN - 1);
+       unsigned char tmp0, tmp1;
+
+       tmp0 = m48t86_readb(&pdev->dev, offset0);
+       tmp1 = m48t86_readb(&pdev->dev, offset1);
+
+       m48t86_writeb(&pdev->dev, 0x00, offset0);
+       m48t86_writeb(&pdev->dev, 0x55, offset1);
+       if (m48t86_readb(&pdev->dev, offset1) == 0x55) {
+               m48t86_writeb(&pdev->dev, 0xaa, offset1);
+               if (m48t86_readb(&pdev->dev, offset1) == 0xaa &&
+                   m48t86_readb(&pdev->dev, offset0) == 0x00) {
+                       m48t86_writeb(&pdev->dev, tmp0, offset0);
+                       m48t86_writeb(&pdev->dev, tmp1, offset1);
+
+                       return true;
+               }
+       }
+       return false;
+}
+
+static int m48t86_rtc_probe(struct platform_device *pdev)
+{
+       struct m48t86_rtc_info *info;
+       struct resource *res;
        unsigned char reg;
-       struct m48t86_ops *ops = dev_get_platdata(&dev->dev);
-       struct rtc_device *rtc;
 
-       rtc = devm_rtc_device_register(&dev->dev, "m48t86",
-                               &m48t86_rtc_ops, THIS_MODULE);
+       info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!res)
+               return -ENODEV;
+       info->index_reg = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(info->index_reg))
+               return PTR_ERR(info->index_reg);
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+       if (!res)
+               return -ENODEV;
+       info->data_reg = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(info->data_reg))
+               return PTR_ERR(info->data_reg);
 
-       if (IS_ERR(rtc))
-               return PTR_ERR(rtc);
+       dev_set_drvdata(&pdev->dev, info);
+
+       if (!m48t86_verify_chip(pdev)) {
+               dev_info(&pdev->dev, "RTC not present\n");
+               return -ENODEV;
+       }
 
-       platform_set_drvdata(dev, rtc);
+       info->rtc = devm_rtc_device_register(&pdev->dev, "m48t86",
+                                            &m48t86_rtc_ops, THIS_MODULE);
+       if (IS_ERR(info->rtc))
+               return PTR_ERR(info->rtc);
 
        /* read battery status */
-       reg = ops->readbyte(M48T86_REG_D);
-       dev_info(&dev->dev, "battery %s\n",
-               (reg & M48T86_REG_D_VRT) ? "ok" : "exhausted");
+       reg = m48t86_readb(&pdev->dev, M48T86_D);
+       dev_info(&pdev->dev, "battery %s\n",
+                (reg & M48T86_D_VRT) ? "ok" : "exhausted");
 
+       if (device_create_bin_file(&pdev->dev, &bin_attr_nvram))
+               dev_err(&pdev->dev, "failed to create nvram sysfs entry\n");
+
+       return 0;
+}
+
+static int m48t86_rtc_remove(struct platform_device *pdev)
+{
+       device_remove_bin_file(&pdev->dev, &bin_attr_nvram);
        return 0;
 }
 
@@ -168,6 +281,7 @@ static struct platform_driver m48t86_rtc_platform_driver = {
                .name   = "rtc-m48t86",
        },
        .probe          = m48t86_rtc_probe,
+       .remove         = m48t86_rtc_remove,
 };
 
 module_platform_driver(m48t86_rtc_platform_driver);
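
The NVRAM is now exported as a 114-byte binary sysfs attribute, so user space can dump or patch it with plain file I/O. A minimal read sketch; the sysfs path is hypothetical and depends on how the platform device is named:

#include <stdio.h>

/* Hypothetical path for the platform device's nvram attribute. */
#define NVRAM_ATTR "/sys/devices/platform/rtc-m48t86/nvram"
#define NVRAM_LEN 114

int main(void)
{
	unsigned char buf[NVRAM_LEN];
	FILE *f = fopen(NVRAM_ATTR, "rb");
	size_t n;

	if (!f) {
		perror("fopen");
		return 1;
	}
	n = fread(buf, 1, sizeof(buf), f);
	printf("read %zu bytes of NVRAM\n", n);
	fclose(f);
	return 0;
}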
index ce75e42..77f2133 100644 (file)
 #define MCP795_REG_DAY         0x04
 #define MCP795_REG_MONTH       0x06
 #define MCP795_REG_CONTROL     0x08
+#define MCP795_REG_ALM0_SECONDS        0x0C
+#define MCP795_REG_ALM0_DAY    0x0F
 
 #define MCP795_ST_BIT          BIT(7)
 #define MCP795_24_BIT          BIT(6)
 #define MCP795_LP_BIT          BIT(5)
 #define MCP795_EXTOSC_BIT      BIT(3)
 #define MCP795_OSCON_BIT       BIT(5)
+#define MCP795_ALM0_BIT                BIT(4)
+#define MCP795_ALM1_BIT                BIT(5)
+#define MCP795_ALM0IF_BIT      BIT(3)
+#define MCP795_ALM0C0_BIT      BIT(4)
+#define MCP795_ALM0C1_BIT      BIT(5)
+#define MCP795_ALM0C2_BIT      BIT(6)
+
+#define SEC_PER_DAY            (24 * 60 * 60)
 
 static int mcp795_rtcc_read(struct device *dev, u8 addr, u8 *buf, u8 count)
 {
@@ -150,6 +160,30 @@ static int mcp795_start_oscillator(struct device *dev, bool *extosc)
                        dev, MCP795_REG_SECONDS, MCP795_ST_BIT, MCP795_ST_BIT);
 }
 
+/* Enable or disable Alarm 0 in RTC */
+static int mcp795_update_alarm(struct device *dev, bool enable)
+{
+       int ret;
+
+       dev_dbg(dev, "%s alarm\n", enable ? "Enable" : "Disable");
+
+       if (enable) {
+               /* clear ALM0IF (Alarm 0 Interrupt Flag) bit */
+               ret = mcp795_rtcc_set_bits(dev, MCP795_REG_ALM0_DAY,
+                                       MCP795_ALM0IF_BIT, 0);
+               if (ret)
+                       return ret;
+               /* enable alarm 0 */
+               ret = mcp795_rtcc_set_bits(dev, MCP795_REG_CONTROL,
+                                       MCP795_ALM0_BIT, MCP795_ALM0_BIT);
+       } else {
+               /* disable alarm 0 and alarm 1 */
+               ret = mcp795_rtcc_set_bits(dev, MCP795_REG_CONTROL,
+                                       MCP795_ALM0_BIT | MCP795_ALM1_BIT, 0);
+       }
+       return ret;
+}
+
 static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
 {
        int ret;
@@ -170,6 +204,7 @@ static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
        data[0] = (data[0] & 0x80) | bin2bcd(tim->tm_sec);
        data[1] = (data[1] & 0x80) | bin2bcd(tim->tm_min);
        data[2] = bin2bcd(tim->tm_hour);
+       data[3] = (data[3] & 0xF8) | bin2bcd(tim->tm_wday + 1);
        data[4] = bin2bcd(tim->tm_mday);
        data[5] = (data[5] & MCP795_LP_BIT) | bin2bcd(tim->tm_mon + 1);
 
@@ -198,9 +233,9 @@ static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
        if (ret)
                return ret;
 
-       dev_dbg(dev, "Set mcp795: %04d-%02d-%02d %02d:%02d:%02d\n",
+       dev_dbg(dev, "Set mcp795: %04d-%02d-%02d(%d) %02d:%02d:%02d\n",
                        tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
-                       tim->tm_hour, tim->tm_min, tim->tm_sec);
+                       tim->tm_wday, tim->tm_hour, tim->tm_min, tim->tm_sec);
 
        return 0;
 }
@@ -218,20 +253,139 @@ static int mcp795_read_time(struct device *dev, struct rtc_time *tim)
        tim->tm_sec     = bcd2bin(data[0] & 0x7F);
        tim->tm_min     = bcd2bin(data[1] & 0x7F);
        tim->tm_hour    = bcd2bin(data[2] & 0x3F);
+       tim->tm_wday    = bcd2bin(data[3] & 0x07) - 1;
        tim->tm_mday    = bcd2bin(data[4] & 0x3F);
        tim->tm_mon     = bcd2bin(data[5] & 0x1F) - 1;
        tim->tm_year    = bcd2bin(data[6]) + 100; /* Assume we are in 20xx */
 
-       dev_dbg(dev, "Read from mcp795: %04d-%02d-%02d %02d:%02d:%02d\n",
-                               tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
-                               tim->tm_hour, tim->tm_min, tim->tm_sec);
+       dev_dbg(dev, "Read from mcp795: %04d-%02d-%02d(%d) %02d:%02d:%02d\n",
+                       tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
+                       tim->tm_wday, tim->tm_hour, tim->tm_min, tim->tm_sec);
 
        return rtc_valid_tm(tim);
 }
 
+static int mcp795_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+       struct rtc_time now_tm;
+       time64_t now;
+       time64_t later;
+       u8 tmp[6];
+       int ret;
+
+       /* Read current time from RTC hardware */
+       ret = mcp795_read_time(dev, &now_tm);
+       if (ret)
+               return ret;
+       /* Get the number of seconds since 1970 */
+       now = rtc_tm_to_time64(&now_tm);
+       later = rtc_tm_to_time64(&alm->time);
+       if (later <= now)
+               return -EINVAL;
+       /* make sure alarm fires within the next one year */
+       if ((later - now) >=
+               (SEC_PER_DAY * (365 + is_leap_year(alm->time.tm_year))))
+               return -EDOM;
+       /* disable alarm */
+       ret = mcp795_update_alarm(dev, false);
+       if (ret)
+               return ret;
+       /* Read registers, so we can leave configuration bits untouched */
+       ret = mcp795_rtcc_read(dev, MCP795_REG_ALM0_SECONDS, tmp, sizeof(tmp));
+       if (ret)
+               return ret;
+
+       alm->time.tm_year       = -1;
+       alm->time.tm_isdst      = -1;
+       alm->time.tm_yday       = -1;
+
+       tmp[0] = (tmp[0] & 0x80) | bin2bcd(alm->time.tm_sec);
+       tmp[1] = (tmp[1] & 0x80) | bin2bcd(alm->time.tm_min);
+       tmp[2] = (tmp[2] & 0xE0) | bin2bcd(alm->time.tm_hour);
+       tmp[3] = (tmp[3] & 0x80) | bin2bcd(alm->time.tm_wday + 1);
+       /* set alarm match: seconds, minutes, hour, day, date and month */
+       tmp[3] |= (MCP795_ALM0C2_BIT | MCP795_ALM0C1_BIT | MCP795_ALM0C0_BIT);
+       tmp[4] = (tmp[4] & 0xC0) | bin2bcd(alm->time.tm_mday);
+       tmp[5] = (tmp[5] & 0xE0) | bin2bcd(alm->time.tm_mon + 1);
+
+       ret = mcp795_rtcc_write(dev, MCP795_REG_ALM0_SECONDS, tmp, sizeof(tmp));
+       if (ret)
+               return ret;
+
+       /* enable alarm if requested */
+       if (alm->enabled) {
+               ret = mcp795_update_alarm(dev, true);
+               if (ret)
+                       return ret;
+               dev_dbg(dev, "Alarm IRQ armed\n");
+       }
+       dev_dbg(dev, "Set alarm: %02d-%02d(%d) %02d:%02d:%02d\n",
+                       alm->time.tm_mon, alm->time.tm_mday, alm->time.tm_wday,
+                       alm->time.tm_hour, alm->time.tm_min, alm->time.tm_sec);
+       return 0;
+}
+
+static int mcp795_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+       u8 data[6];
+       int ret;
+
+       ret = mcp795_rtcc_read(
+                       dev, MCP795_REG_ALM0_SECONDS, data, sizeof(data));
+       if (ret)
+               return ret;
+
+       alm->time.tm_sec        = bcd2bin(data[0] & 0x7F);
+       alm->time.tm_min        = bcd2bin(data[1] & 0x7F);
+       alm->time.tm_hour       = bcd2bin(data[2] & 0x1F);
+       alm->time.tm_wday       = bcd2bin(data[3] & 0x07) - 1;
+       alm->time.tm_mday       = bcd2bin(data[4] & 0x3F);
+       alm->time.tm_mon        = bcd2bin(data[5] & 0x1F) - 1;
+       alm->time.tm_year       = -1;
+       alm->time.tm_isdst      = -1;
+       alm->time.tm_yday       = -1;
+
+       dev_dbg(dev, "Read alarm: %02d-%02d(%d) %02d:%02d:%02d\n",
+                       alm->time.tm_mon, alm->time.tm_mday, alm->time.tm_wday,
+                       alm->time.tm_hour, alm->time.tm_min, alm->time.tm_sec);
+       return 0;
+}
+
+static int mcp795_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+       return mcp795_update_alarm(dev, !!enabled);
+}
+
+static irqreturn_t mcp795_irq(int irq, void *data)
+{
+       struct spi_device *spi = data;
+       struct rtc_device *rtc = spi_get_drvdata(spi);
+       struct mutex *lock = &rtc->ops_lock;
+       int ret;
+
+       mutex_lock(lock);
+
+       /* Disable alarm.
+        * There is no need to clear ALM0IF (Alarm 0 Interrupt Flag) bit,
+        * because it is cleared every time the alarm is enabled.
+        */
+       ret = mcp795_update_alarm(&spi->dev, false);
+       if (ret)
+               dev_err(&spi->dev,
+                       "Failed to disable alarm in IRQ (ret=%d)\n", ret);
+       rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF);
+
+       mutex_unlock(lock);
+
+       return IRQ_HANDLED;
+}
+
 static const struct rtc_class_ops mcp795_rtc_ops = {
                .read_time = mcp795_read_time,
-               .set_time = mcp795_set_time
+               .set_time = mcp795_set_time,
+               .read_alarm = mcp795_read_alarm,
+               .set_alarm = mcp795_set_alarm,
+               .alarm_irq_enable = mcp795_alarm_irq_enable
 };
 
 static int mcp795_probe(struct spi_device *spi)
@@ -259,6 +413,23 @@ static int mcp795_probe(struct spi_device *spi)
 
        spi_set_drvdata(spi, rtc);
 
+       if (spi->irq > 0) {
+               dev_dbg(&spi->dev, "Alarm support enabled\n");
+
+               /* Clear any pending alarm (ALM0IF bit) before requesting
+                * the interrupt.
+                */
+               mcp795_rtcc_set_bits(&spi->dev, MCP795_REG_ALM0_DAY,
+                                       MCP795_ALM0IF_BIT, 0);
+               ret = devm_request_threaded_irq(&spi->dev, spi->irq, NULL,
+                               mcp795_irq, IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+                               dev_name(&rtc->dev), spi);
+               if (ret)
+                       dev_err(&spi->dev, "Failed to request IRQ: %d: %d\n",
+                                               spi->irq, ret);
+               else
+                       device_init_wakeup(&spi->dev, true);
+       }
        return 0;
 }
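
With read_alarm, set_alarm and alarm_irq_enable wired up, the alarm can be exercised through the standard RTC character device. A minimal sketch arming an alarm one minute ahead, assuming the chip shows up as /dev/rtc0; note the driver rejects alarms in the past or more than a year out:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/rtc.h>

int main(void)
{
	struct rtc_time now;
	struct rtc_wkalrm alrm = { .enabled = 1 };
	int fd = open("/dev/rtc0", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, RTC_RD_TIME, &now)) {
		perror("RTC_RD_TIME");
		close(fd);
		return 1;
	}
	alrm.time = now;
	/* naive +1 minute; a real caller would also normalize the carry */
	alrm.time.tm_min = (now.tm_min + 1) % 60;
	if (ioctl(fd, RTC_WKALM_SET, &alrm))
		perror("RTC_WKALM_SET");
	close(fd);
	return 0;
}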
 
index 359876a..7731912 100644 (file)
@@ -353,7 +353,7 @@ static int mxc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 }
 
 /* RTC layer */
-static struct rtc_class_ops mxc_rtc_ops = {
+static const struct rtc_class_ops mxc_rtc_ops = {
        .release                = mxc_rtc_release,
        .read_time              = mxc_rtc_read_time,
        .set_mmss64             = mxc_rtc_set_mmss,
index 2bfdf63..f33447c 100644 (file)
@@ -52,9 +52,20 @@ static int pcf2127_rtc_read_time(struct device *dev, struct rtc_time *tm)
        struct pcf2127 *pcf2127 = dev_get_drvdata(dev);
        unsigned char buf[10];
        int ret;
+       int i;
 
-       ret = regmap_bulk_read(pcf2127->regmap, PCF2127_REG_CTRL1, buf,
-                               sizeof(buf));
+       for (i = 0; i <= PCF2127_REG_CTRL3; i++) {
+               ret = regmap_read(pcf2127->regmap, PCF2127_REG_CTRL1 + i,
+                                 (unsigned int *)(buf + i));
+               if (ret) {
+                       dev_err(dev, "%s: read error\n", __func__);
+                       return ret;
+               }
+       }
+
+       ret = regmap_bulk_read(pcf2127->regmap, PCF2127_REG_SC,
+                              (buf + PCF2127_REG_SC),
+                              ARRAY_SIZE(buf) - PCF2127_REG_SC);
        if (ret) {
                dev_err(dev, "%s: read error\n", __func__);
                return ret;
index 7163b91..d08da37 100644 (file)
@@ -63,7 +63,6 @@ struct rx8010_data {
        struct i2c_client *client;
        struct rtc_device *rtc;
        u8 ctrlreg;
-       spinlock_t flags_lock;
 };
 
 static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
@@ -72,12 +71,12 @@ static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
        struct rx8010_data *rx8010 = i2c_get_clientdata(client);
        int flagreg;
 
-       spin_lock(&rx8010->flags_lock);
+       mutex_lock(&rx8010->rtc->ops_lock);
 
        flagreg = i2c_smbus_read_byte_data(client, RX8010_FLAG);
 
        if (flagreg <= 0) {
-               spin_unlock(&rx8010->flags_lock);
+               mutex_unlock(&rx8010->rtc->ops_lock);
                return IRQ_NONE;
        }
 
@@ -101,7 +100,7 @@ static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
 
        i2c_smbus_write_byte_data(client, RX8010_FLAG, flagreg);
 
-       spin_unlock(&rx8010->flags_lock);
+       mutex_unlock(&rx8010->rtc->ops_lock);
        return IRQ_HANDLED;
 }
 
@@ -143,7 +142,6 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
        u8 date[7];
        int ctrl, flagreg;
        int ret;
-       unsigned long irqflags;
 
        if ((dt->tm_year < 100) || (dt->tm_year > 199))
                return -EINVAL;
@@ -181,11 +179,8 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
        if (ret < 0)
                return ret;
 
-       spin_lock_irqsave(&rx8010->flags_lock, irqflags);
-
        flagreg = i2c_smbus_read_byte_data(rx8010->client, RX8010_FLAG);
        if (flagreg < 0) {
-               spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
                return flagreg;
        }
 
@@ -193,8 +188,6 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
                ret = i2c_smbus_write_byte_data(rx8010->client, RX8010_FLAG,
                                                flagreg & ~RX8010_FLAG_VLF);
 
-       spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
-
        return 0;
 }
 
@@ -288,12 +281,9 @@ static int rx8010_set_alarm(struct device *dev, struct rtc_wkalrm *t)
        u8 alarmvals[3];
        int extreg, flagreg;
        int err;
-       unsigned long irqflags;
 
-       spin_lock_irqsave(&rx8010->flags_lock, irqflags);
        flagreg = i2c_smbus_read_byte_data(client, RX8010_FLAG);
        if (flagreg < 0) {
-               spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
                return flagreg;
        }
 
@@ -302,14 +292,12 @@ static int rx8010_set_alarm(struct device *dev, struct rtc_wkalrm *t)
                err = i2c_smbus_write_byte_data(rx8010->client, RX8010_CTRL,
                                                rx8010->ctrlreg);
                if (err < 0) {
-                       spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
                        return err;
                }
        }
 
        flagreg &= ~RX8010_FLAG_AF;
        err = i2c_smbus_write_byte_data(rx8010->client, RX8010_FLAG, flagreg);
-       spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
        if (err < 0)
                return err;
 
@@ -404,7 +392,6 @@ static int rx8010_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
        struct rx8010_data *rx8010 = dev_get_drvdata(dev);
        int ret, tmp;
        int flagreg;
-       unsigned long irqflags;
 
        switch (cmd) {
        case RTC_VL_READ:
@@ -419,16 +406,13 @@ static int rx8010_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
                return 0;
 
        case RTC_VL_CLR:
-               spin_lock_irqsave(&rx8010->flags_lock, irqflags);
                flagreg = i2c_smbus_read_byte_data(rx8010->client, RX8010_FLAG);
                if (flagreg < 0) {
-                       spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
                        return flagreg;
                }
 
                flagreg &= ~RX8010_FLAG_VLF;
                ret = i2c_smbus_write_byte_data(client, RX8010_FLAG, flagreg);
-               spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
                if (ret < 0)
                        return ret;
 
@@ -466,8 +450,6 @@ static int rx8010_probe(struct i2c_client *client,
        rx8010->client = client;
        i2c_set_clientdata(client, rx8010);
 
-       spin_lock_init(&rx8010->flags_lock);
-
        err = rx8010_init_client(client);
        if (err)
                return err;
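
Replacing the private spinlock with rtc->ops_lock means the interrupt handler now takes a mutex, which is only legal in thread context (e.g. a handler registered via devm_request_threaded_irq). A condensed sketch of the resulting locking pattern, with illustrative names:

#include <linux/interrupt.h>
#include <linux/mutex.h>
#include <linux/rtc.h>

/* Illustrative handler: must run in thread context because ops_lock is a
 * mutex and may sleep. */
static irqreturn_t example_rtc_irq(int irq, void *dev_id)
{
	struct rtc_device *rtc = dev_id;

	mutex_lock(&rtc->ops_lock);
	/* ... read and clear the chip's alarm/flag registers here ... */
	rtc_update_irq(rtc, 1, RTC_IRQF | RTC_AF);
	mutex_unlock(&rtc->ops_lock);

	return IRQ_HANDLED;
}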
index 17b6235..c626e43 100644 (file)
@@ -535,7 +535,7 @@ static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
        return 0;
 }
 
-static struct rtc_class_ops sh_rtc_ops = {
+static const struct rtc_class_ops sh_rtc_ops = {
        .read_time      = sh_rtc_read_time,
        .set_time       = sh_rtc_set_time,
        .read_alarm     = sh_rtc_read_alarm,
index 0f11c2a..d51b07d 100644 (file)
@@ -184,6 +184,7 @@ static int snvs_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
        rtc_tm_to_time(alrm_tm, &time);
 
        regmap_update_bits(data->regmap, data->offset + SNVS_LPCR, SNVS_LPCR_LPTA_EN, 0);
+       rtc_write_sync_lp(data);
        regmap_write(data->regmap, data->offset + SNVS_LPTAR, time);
 
        /* Clear alarm interrupt status bit */
diff --git a/drivers/rtc/rtc-stm32.c b/drivers/rtc/rtc-stm32.c
new file mode 100644 (file)
index 0000000..bd57eb1
--- /dev/null
@@ -0,0 +1,725 @@
+/*
+ * Copyright (C) Amelie Delaunay 2016
+ * Author:  Amelie Delaunay <amelie.delaunay@st.com>
+ * License terms:  GNU General Public License (GPL), version 2
+ */
+
+#include <linux/bcd.h>
+#include <linux/clk.h>
+#include <linux/iopoll.h>
+#include <linux/ioport.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/rtc.h>
+
+#define DRIVER_NAME "stm32_rtc"
+
+/* STM32 RTC registers */
+#define STM32_RTC_TR           0x00
+#define STM32_RTC_DR           0x04
+#define STM32_RTC_CR           0x08
+#define STM32_RTC_ISR          0x0C
+#define STM32_RTC_PRER         0x10
+#define STM32_RTC_ALRMAR       0x1C
+#define STM32_RTC_WPR          0x24
+
+/* STM32_RTC_TR bit fields  */
+#define STM32_RTC_TR_SEC_SHIFT         0
+#define STM32_RTC_TR_SEC               GENMASK(6, 0)
+#define STM32_RTC_TR_MIN_SHIFT         8
+#define STM32_RTC_TR_MIN               GENMASK(14, 8)
+#define STM32_RTC_TR_HOUR_SHIFT                16
+#define STM32_RTC_TR_HOUR              GENMASK(21, 16)
+
+/* STM32_RTC_DR bit fields */
+#define STM32_RTC_DR_DATE_SHIFT                0
+#define STM32_RTC_DR_DATE              GENMASK(5, 0)
+#define STM32_RTC_DR_MONTH_SHIFT       8
+#define STM32_RTC_DR_MONTH             GENMASK(12, 8)
+#define STM32_RTC_DR_WDAY_SHIFT                13
+#define STM32_RTC_DR_WDAY              GENMASK(15, 13)
+#define STM32_RTC_DR_YEAR_SHIFT                16
+#define STM32_RTC_DR_YEAR              GENMASK(23, 16)
+
+/* STM32_RTC_CR bit fields */
+#define STM32_RTC_CR_FMT               BIT(6)
+#define STM32_RTC_CR_ALRAE             BIT(8)
+#define STM32_RTC_CR_ALRAIE            BIT(12)
+
+/* STM32_RTC_ISR bit fields */
+#define STM32_RTC_ISR_ALRAWF           BIT(0)
+#define STM32_RTC_ISR_INITS            BIT(4)
+#define STM32_RTC_ISR_RSF              BIT(5)
+#define STM32_RTC_ISR_INITF            BIT(6)
+#define STM32_RTC_ISR_INIT             BIT(7)
+#define STM32_RTC_ISR_ALRAF            BIT(8)
+
+/* STM32_RTC_PRER bit fields */
+#define STM32_RTC_PRER_PRED_S_SHIFT    0
+#define STM32_RTC_PRER_PRED_S          GENMASK(14, 0)
+#define STM32_RTC_PRER_PRED_A_SHIFT    16
+#define STM32_RTC_PRER_PRED_A          GENMASK(22, 16)
+
+/* STM32_RTC_ALRMAR and STM32_RTC_ALRMBR bit fields */
+#define STM32_RTC_ALRMXR_SEC_SHIFT     0
+#define STM32_RTC_ALRMXR_SEC           GENMASK(6, 0)
+#define STM32_RTC_ALRMXR_SEC_MASK      BIT(7)
+#define STM32_RTC_ALRMXR_MIN_SHIFT     8
+#define STM32_RTC_ALRMXR_MIN           GENMASK(14, 8)
+#define STM32_RTC_ALRMXR_MIN_MASK      BIT(15)
+#define STM32_RTC_ALRMXR_HOUR_SHIFT    16
+#define STM32_RTC_ALRMXR_HOUR          GENMASK(21, 16)
+#define STM32_RTC_ALRMXR_PM            BIT(22)
+#define STM32_RTC_ALRMXR_HOUR_MASK     BIT(23)
+#define STM32_RTC_ALRMXR_DATE_SHIFT    24
+#define STM32_RTC_ALRMXR_DATE          GENMASK(29, 24)
+#define STM32_RTC_ALRMXR_WDSEL         BIT(30)
+#define STM32_RTC_ALRMXR_WDAY_SHIFT    24
+#define STM32_RTC_ALRMXR_WDAY          GENMASK(27, 24)
+#define STM32_RTC_ALRMXR_DATE_MASK     BIT(31)
+
+/* STM32_RTC_WPR key constants */
+#define RTC_WPR_1ST_KEY                        0xCA
+#define RTC_WPR_2ND_KEY                        0x53
+#define RTC_WPR_WRONG_KEY              0xFF
+
+/*
+ * RTC registers are protected against parasitic write access.
+ * PWR_CR_DBP bit must be set to enable write access to RTC registers.
+ */
+/* STM32_PWR_CR */
+#define PWR_CR                         0x00
+/* STM32_PWR_CR bit field */
+#define PWR_CR_DBP                     BIT(8)
+
+struct stm32_rtc {
+       struct rtc_device *rtc_dev;
+       void __iomem *base;
+       struct regmap *dbp;
+       struct clk *ck_rtc;
+       int irq_alarm;
+};
+
+static void stm32_rtc_wpr_unlock(struct stm32_rtc *rtc)
+{
+       writel_relaxed(RTC_WPR_1ST_KEY, rtc->base + STM32_RTC_WPR);
+       writel_relaxed(RTC_WPR_2ND_KEY, rtc->base + STM32_RTC_WPR);
+}
+
+static void stm32_rtc_wpr_lock(struct stm32_rtc *rtc)
+{
+       writel_relaxed(RTC_WPR_WRONG_KEY, rtc->base + STM32_RTC_WPR);
+}
+
+static int stm32_rtc_enter_init_mode(struct stm32_rtc *rtc)
+{
+       unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+       if (!(isr & STM32_RTC_ISR_INITF)) {
+               isr |= STM32_RTC_ISR_INIT;
+               writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+               /*
+                * It takes around 2 ck_rtc clock cycles to enter
+                * initialization mode (and have the INITF flag set). As the
+                * slowest ck_rtc frequency may be 32 kHz and the highest
+                * should be 1 MHz, we poll every 10 us with a timeout of
+                * 100 ms.
+                */
+               return readl_relaxed_poll_timeout_atomic(
+                                       rtc->base + STM32_RTC_ISR,
+                                       isr, (isr & STM32_RTC_ISR_INITF),
+                                       10, 100000);
+       }
+
+       return 0;
+}
+
+static void stm32_rtc_exit_init_mode(struct stm32_rtc *rtc)
+{
+       unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+       isr &= ~STM32_RTC_ISR_INIT;
+       writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+}
+
+static int stm32_rtc_wait_sync(struct stm32_rtc *rtc)
+{
+       unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+       isr &= ~STM32_RTC_ISR_RSF;
+       writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+       /*
+        * Wait for RSF to be set to ensure the calendar registers are
+        * synchronised, it takes around 2 ck_rtc clock cycles
+        */
+       return readl_relaxed_poll_timeout_atomic(rtc->base + STM32_RTC_ISR,
+                                                isr,
+                                                (isr & STM32_RTC_ISR_RSF),
+                                                10, 100000);
+}
+
+static irqreturn_t stm32_rtc_alarm_irq(int irq, void *dev_id)
+{
+       struct stm32_rtc *rtc = (struct stm32_rtc *)dev_id;
+       unsigned int isr, cr;
+
+       mutex_lock(&rtc->rtc_dev->ops_lock);
+
+       isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+       cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+
+       if ((isr & STM32_RTC_ISR_ALRAF) &&
+           (cr & STM32_RTC_CR_ALRAIE)) {
+               /* Alarm A flag - Alarm interrupt */
+               dev_dbg(&rtc->rtc_dev->dev, "Alarm occurred\n");
+
+               /* Pass event to the kernel */
+               rtc_update_irq(rtc->rtc_dev, 1, RTC_IRQF | RTC_AF);
+
+               /* Clear event flag, otherwise new events won't be received */
+               writel_relaxed(isr & ~STM32_RTC_ISR_ALRAF,
+                              rtc->base + STM32_RTC_ISR);
+       }
+
+       mutex_unlock(&rtc->rtc_dev->ops_lock);
+
+       return IRQ_HANDLED;
+}
+
+/* Convert rtc_time structure from bin to bcd format */
+static void tm2bcd(struct rtc_time *tm)
+{
+       tm->tm_sec = bin2bcd(tm->tm_sec);
+       tm->tm_min = bin2bcd(tm->tm_min);
+       tm->tm_hour = bin2bcd(tm->tm_hour);
+
+       tm->tm_mday = bin2bcd(tm->tm_mday);
+       tm->tm_mon = bin2bcd(tm->tm_mon + 1);
+       tm->tm_year = bin2bcd(tm->tm_year - 100);
+       /*
+        * Number of days since Sunday
+        * - on kernel side, 0=Sunday...6=Saturday
+        * - on rtc side, 0=invalid,1=Monday...7=Sunday
+        */
+       tm->tm_wday = (!tm->tm_wday) ? 7 : tm->tm_wday;
+}
+
+/* Convert rtc_time structure from bcd to bin format */
+static void bcd2tm(struct rtc_time *tm)
+{
+       tm->tm_sec = bcd2bin(tm->tm_sec);
+       tm->tm_min = bcd2bin(tm->tm_min);
+       tm->tm_hour = bcd2bin(tm->tm_hour);
+
+       tm->tm_mday = bcd2bin(tm->tm_mday);
+       tm->tm_mon = bcd2bin(tm->tm_mon) - 1;
+       tm->tm_year = bcd2bin(tm->tm_year) + 100;
+       /*
+        * Number of days since Sunday
+        * - on kernel side, 0=Sunday...6=Saturday
+        * - on rtc side, 0=invalid,1=Monday...7=Sunday
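+        * e.g. rtc 7 maps back to kernel 0 (Sunday), 1..6 pass unchanged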
+        */
+       tm->tm_wday %= 7;
+}
+
+static int stm32_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       struct stm32_rtc *rtc = dev_get_drvdata(dev);
+       unsigned int tr, dr;
+
+       /* Time and Date in BCD format */
+       tr = readl_relaxed(rtc->base + STM32_RTC_TR);
+       dr = readl_relaxed(rtc->base + STM32_RTC_DR);
+
+       tm->tm_sec = (tr & STM32_RTC_TR_SEC) >> STM32_RTC_TR_SEC_SHIFT;
+       tm->tm_min = (tr & STM32_RTC_TR_MIN) >> STM32_RTC_TR_MIN_SHIFT;
+       tm->tm_hour = (tr & STM32_RTC_TR_HOUR) >> STM32_RTC_TR_HOUR_SHIFT;
+
+       tm->tm_mday = (dr & STM32_RTC_DR_DATE) >> STM32_RTC_DR_DATE_SHIFT;
+       tm->tm_mon = (dr & STM32_RTC_DR_MONTH) >> STM32_RTC_DR_MONTH_SHIFT;
+       tm->tm_year = (dr & STM32_RTC_DR_YEAR) >> STM32_RTC_DR_YEAR_SHIFT;
+       tm->tm_wday = (dr & STM32_RTC_DR_WDAY) >> STM32_RTC_DR_WDAY_SHIFT;
+
+       /* We don't report tm_yday and tm_isdst */
+
+       bcd2tm(tm);
+
+       return 0;
+}
+
+static int stm32_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct stm32_rtc *rtc = dev_get_drvdata(dev);
+       unsigned int tr, dr;
+       int ret = 0;
+
+       tm2bcd(tm);
+
+       /* Time in BCD format */
+       tr = ((tm->tm_sec << STM32_RTC_TR_SEC_SHIFT) & STM32_RTC_TR_SEC) |
+            ((tm->tm_min << STM32_RTC_TR_MIN_SHIFT) & STM32_RTC_TR_MIN) |
+            ((tm->tm_hour << STM32_RTC_TR_HOUR_SHIFT) & STM32_RTC_TR_HOUR);
+
+       /* Date in BCD format */
+       dr = ((tm->tm_mday << STM32_RTC_DR_DATE_SHIFT) & STM32_RTC_DR_DATE) |
+            ((tm->tm_mon << STM32_RTC_DR_MONTH_SHIFT) & STM32_RTC_DR_MONTH) |
+            ((tm->tm_year << STM32_RTC_DR_YEAR_SHIFT) & STM32_RTC_DR_YEAR) |
+            ((tm->tm_wday << STM32_RTC_DR_WDAY_SHIFT) & STM32_RTC_DR_WDAY);
+
+       stm32_rtc_wpr_unlock(rtc);
+
+       ret = stm32_rtc_enter_init_mode(rtc);
+       if (ret) {
+               dev_err(dev, "Can't enter in init mode. Set time aborted.\n");
+               goto end;
+       }
+
+       writel_relaxed(tr, rtc->base + STM32_RTC_TR);
+       writel_relaxed(dr, rtc->base + STM32_RTC_DR);
+
+       stm32_rtc_exit_init_mode(rtc);
+
+       ret = stm32_rtc_wait_sync(rtc);
+end:
+       stm32_rtc_wpr_lock(rtc);
+
+       return ret;
+}
+
+static int stm32_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct stm32_rtc *rtc = dev_get_drvdata(dev);
+       struct rtc_time *tm = &alrm->time;
+       unsigned int alrmar, cr, isr;
+
+       alrmar = readl_relaxed(rtc->base + STM32_RTC_ALRMAR);
+       cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+       isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+       if (alrmar & STM32_RTC_ALRMXR_DATE_MASK) {
+               /*
+                * Date/day doesn't matter in Alarm comparison so alarm
+                * triggers every day
+                */
+               tm->tm_mday = -1;
+               tm->tm_wday = -1;
+       } else {
+               if (alrmar & STM32_RTC_ALRMXR_WDSEL) {
+                       /* Alarm is set to a day of week */
+                       tm->tm_mday = -1;
+                       tm->tm_wday = (alrmar & STM32_RTC_ALRMXR_WDAY) >>
+                                     STM32_RTC_ALRMXR_WDAY_SHIFT;
+                       tm->tm_wday %= 7;
+               } else {
+                       /* Alarm is set to a day of month */
+                       tm->tm_wday = -1;
+                       tm->tm_mday = (alrmar & STM32_RTC_ALRMXR_DATE) >>
+                                      STM32_RTC_ALRMXR_DATE_SHIFT;
+               }
+       }
+
+       if (alrmar & STM32_RTC_ALRMXR_HOUR_MASK) {
+               /* Hours don't matter in Alarm comparison */
+               tm->tm_hour = -1;
+       } else {
+               tm->tm_hour = (alrmar & STM32_RTC_ALRMXR_HOUR) >>
+                              STM32_RTC_ALRMXR_HOUR_SHIFT;
+               if (alrmar & STM32_RTC_ALRMXR_PM)
+                       tm->tm_hour += 12;
+       }
+
+       if (alrmar & STM32_RTC_ALRMXR_MIN_MASK) {
+               /* Minutes don't matter in Alarm comparison */
+               tm->tm_min = -1;
+       } else {
+               tm->tm_min = (alrmar & STM32_RTC_ALRMXR_MIN) >>
+                             STM32_RTC_ALRMXR_MIN_SHIFT;
+       }
+
+       if (alrmar & STM32_RTC_ALRMXR_SEC_MASK) {
+               /* Seconds don't matter in Alarm comparison */
+               tm->tm_sec = -1;
+       } else {
+               tm->tm_sec = (alrmar & STM32_RTC_ALRMXR_SEC) >>
+                             STM32_RTC_ALRMXR_SEC_SHIFT;
+       }
+
+       bcd2tm(tm);
+
+       alrm->enabled = (cr & STM32_RTC_CR_ALRAE) ? 1 : 0;
+       alrm->pending = (isr & STM32_RTC_ISR_ALRAF) ? 1 : 0;
+
+       return 0;
+}
+
+static int stm32_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+       struct stm32_rtc *rtc = dev_get_drvdata(dev);
+       unsigned int isr, cr;
+
+       cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+
+       stm32_rtc_wpr_unlock(rtc);
+
+       /* We expose Alarm A to the kernel */
+       if (enabled)
+               cr |= (STM32_RTC_CR_ALRAIE | STM32_RTC_CR_ALRAE);
+       else
+               cr &= ~(STM32_RTC_CR_ALRAIE | STM32_RTC_CR_ALRAE);
+       writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+       /* Clear event flag, otherwise new events won't be received */
+       isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+       isr &= ~STM32_RTC_ISR_ALRAF;
+       writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+       stm32_rtc_wpr_lock(rtc);
+
+       return 0;
+}
+
+static int stm32_rtc_valid_alrm(struct stm32_rtc *rtc, struct rtc_time *tm)
+{
+       int cur_day, cur_mon, cur_year, cur_hour, cur_min, cur_sec;
+       unsigned int dr = readl_relaxed(rtc->base + STM32_RTC_DR);
+       unsigned int tr = readl_relaxed(rtc->base + STM32_RTC_TR);
+
+       cur_day = (dr & STM32_RTC_DR_DATE) >> STM32_RTC_DR_DATE_SHIFT;
+       cur_mon = (dr & STM32_RTC_DR_MONTH) >> STM32_RTC_DR_MONTH_SHIFT;
+       cur_year = (dr & STM32_RTC_DR_YEAR) >> STM32_RTC_DR_YEAR_SHIFT;
+       cur_sec = (tr & STM32_RTC_TR_SEC) >> STM32_RTC_TR_SEC_SHIFT;
+       cur_min = (tr & STM32_RTC_TR_MIN) >> STM32_RTC_TR_MIN_SHIFT;
+       cur_hour = (tr & STM32_RTC_TR_HOUR) >> STM32_RTC_TR_HOUR_SHIFT;
+
+       /*
+        * Assuming current date is M-D-Y H:M:S.
+        * RTC alarm can't be set on a specific month and year.
+        * So the valid alarm range is:
+        *      M-D-Y H:M:S < alarm <= (M+1)-D-Y H:M:S
+        * with a special case for December, which wraps around to January
+        * of the next year.
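+        * e.g. from 03-15 10:00:00, an alarm at 03-20 00:00:00 is accepted,
+        * while one at 05-01 00:00:00 (beyond the next month) is rejected.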
+        */
+       if ((((tm->tm_year > cur_year) &&
+             (tm->tm_mon == 0x1) && (cur_mon == 0x12)) ||
+            ((tm->tm_year == cur_year) &&
+             (tm->tm_mon <= cur_mon + 1))) &&
+           ((tm->tm_mday > cur_day) ||
+            ((tm->tm_mday == cur_day) &&
+            ((tm->tm_hour > cur_hour) ||
+             ((tm->tm_hour == cur_hour) && (tm->tm_min > cur_min)) ||
+             ((tm->tm_hour == cur_hour) && (tm->tm_min == cur_min) &&
+              (tm->tm_sec >= cur_sec))))))
+               return 0;
+
+       return -EINVAL;
+}
+
+static int stm32_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct stm32_rtc *rtc = dev_get_drvdata(dev);
+       struct rtc_time *tm = &alrm->time;
+       unsigned int cr, isr, alrmar;
+       int ret = 0;
+
+       tm2bcd(tm);
+
+       /*
+        * The RTC alarm can't be set on an arbitrary date: it must fall no
+        * later than the same day of the next month.
+        */
+       if (stm32_rtc_valid_alrm(rtc, tm) < 0) {
+               dev_err(dev, "Alarm can be set only on upcoming month.\n");
+               return -EINVAL;
+       }
+
+       alrmar = 0;
+       /* tm_year and tm_mon are not used because not supported by RTC */
+       alrmar |= (tm->tm_mday << STM32_RTC_ALRMXR_DATE_SHIFT) &
+                 STM32_RTC_ALRMXR_DATE;
+       /* 24-hour format */
+       alrmar &= ~STM32_RTC_ALRMXR_PM;
+       alrmar |= (tm->tm_hour << STM32_RTC_ALRMXR_HOUR_SHIFT) &
+                 STM32_RTC_ALRMXR_HOUR;
+       alrmar |= (tm->tm_min << STM32_RTC_ALRMXR_MIN_SHIFT) &
+                 STM32_RTC_ALRMXR_MIN;
+       alrmar |= (tm->tm_sec << STM32_RTC_ALRMXR_SEC_SHIFT) &
+                 STM32_RTC_ALRMXR_SEC;
+
+       stm32_rtc_wpr_unlock(rtc);
+
+       /* Disable Alarm */
+       cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+       cr &= ~STM32_RTC_CR_ALRAE;
+       writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+       /*
+        * Poll Alarm write flag to be sure that Alarm update is allowed: it
+        * takes around 2 ck_rtc clock cycles
+        */
+       ret = readl_relaxed_poll_timeout_atomic(rtc->base + STM32_RTC_ISR,
+                                               isr,
+                                               (isr & STM32_RTC_ISR_ALRAWF),
+                                               10, 100000);
+
+       if (ret) {
+               dev_err(dev, "Alarm update not allowed\n");
+               goto end;
+       }
+
+       /* Write to Alarm register */
+       writel_relaxed(alrmar, rtc->base + STM32_RTC_ALRMAR);
+
+       if (alrm->enabled)
+               stm32_rtc_alarm_irq_enable(dev, 1);
+       else
+               stm32_rtc_alarm_irq_enable(dev, 0);
+
+end:
+       stm32_rtc_wpr_lock(rtc);
+
+       return ret;
+}
+
+static const struct rtc_class_ops stm32_rtc_ops = {
+       .read_time      = stm32_rtc_read_time,
+       .set_time       = stm32_rtc_set_time,
+       .read_alarm     = stm32_rtc_read_alarm,
+       .set_alarm      = stm32_rtc_set_alarm,
+       .alarm_irq_enable = stm32_rtc_alarm_irq_enable,
+};
+
+static const struct of_device_id stm32_rtc_of_match[] = {
+       { .compatible = "st,stm32-rtc" },
+       {}
+};
+MODULE_DEVICE_TABLE(of, stm32_rtc_of_match);
+
+static int stm32_rtc_init(struct platform_device *pdev,
+                         struct stm32_rtc *rtc)
+{
+       unsigned int prer, pred_a, pred_s, pred_a_max, pred_s_max, cr;
+       unsigned int rate;
+       int ret = 0;
+
+       rate = clk_get_rate(rtc->ck_rtc);
+
+       /* Find prediv_a and prediv_s to obtain the 1Hz calendar clock */
+       pred_a_max = STM32_RTC_PRER_PRED_A >> STM32_RTC_PRER_PRED_A_SHIFT;
+       pred_s_max = STM32_RTC_PRER_PRED_S >> STM32_RTC_PRER_PRED_S_SHIFT;
+
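+       /*
+        * e.g. with a 32768 Hz ck_rtc: pred_a = 127 and pred_s = 255, since
+        * (127 + 1) * (255 + 1) = 32768, giving the 1 Hz calendar clock.
+        */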
+       for (pred_a = pred_a_max; pred_a + 1 > 0; pred_a--) {
+               pred_s = (rate / (pred_a + 1)) - 1;
+
+               if (((pred_s + 1) * (pred_a + 1)) == rate)
+                       break;
+       }
+
+       /*
+        * Can't find an exact 1 Hz divider, so give priority to RTC power
+        * consumption by choosing the highest possible value for prediv_a.
+        */
+       if ((pred_s > pred_s_max) || (pred_a > pred_a_max)) {
+               pred_a = pred_a_max;
+               pred_s = (rate / (pred_a + 1)) - 1;
+
+               dev_warn(&pdev->dev, "ck_rtc is %s\n",
+                        (rate < ((pred_a + 1) * (pred_s + 1))) ?
+                        "fast" : "slow");
+       }
+
+       stm32_rtc_wpr_unlock(rtc);
+
+       ret = stm32_rtc_enter_init_mode(rtc);
+       if (ret) {
+               dev_err(&pdev->dev,
+                       "Can't enter in init mode. Prescaler config failed.\n");
+               goto end;
+       }
+
+       prer = (pred_s << STM32_RTC_PRER_PRED_S_SHIFT) & STM32_RTC_PRER_PRED_S;
+       writel_relaxed(prer, rtc->base + STM32_RTC_PRER);
+       prer |= (pred_a << STM32_RTC_PRER_PRED_A_SHIFT) & STM32_RTC_PRER_PRED_A;
+       writel_relaxed(prer, rtc->base + STM32_RTC_PRER);
+
+       /* Force 24h time format */
+       cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+       cr &= ~STM32_RTC_CR_FMT;
+       writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+       stm32_rtc_exit_init_mode(rtc);
+
+       ret = stm32_rtc_wait_sync(rtc);
+end:
+       stm32_rtc_wpr_lock(rtc);
+
+       return ret;
+}
+
+static int stm32_rtc_probe(struct platform_device *pdev)
+{
+       struct stm32_rtc *rtc;
+       struct resource *res;
+       int ret;
+
+       rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
+       if (!rtc)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       rtc->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(rtc->base))
+               return PTR_ERR(rtc->base);
+
+       rtc->dbp = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+                                                  "st,syscfg");
+       if (IS_ERR(rtc->dbp)) {
+               dev_err(&pdev->dev, "no st,syscfg\n");
+               return PTR_ERR(rtc->dbp);
+       }
+
+       rtc->ck_rtc = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(rtc->ck_rtc)) {
+               dev_err(&pdev->dev, "no ck_rtc clock");
+               return PTR_ERR(rtc->ck_rtc);
+       }
+
+       ret = clk_prepare_enable(rtc->ck_rtc);
+       if (ret)
+               return ret;
+
+       regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, PWR_CR_DBP);
+
+       /*
+        * After a system reset, RTC_ISR.INITS flag can be read to check if
+        * the calendar has been initialized or not. INITS flag is reset by a
+        * power-on reset (no vbat, no power-supply). It is not reset if
+        * ck_rtc parent clock has changed (so RTC prescalers need to be
+        * changed). That's why we cannot rely on this flag to know if RTC
+        * init has to be done.
+        */
+       ret = stm32_rtc_init(pdev, rtc);
+       if (ret)
+               goto err;
+
+       rtc->irq_alarm = platform_get_irq(pdev, 0);
+       if (rtc->irq_alarm <= 0) {
+               dev_err(&pdev->dev, "no alarm irq\n");
+               ret = rtc->irq_alarm ? : -ENODEV;
+               goto err;
+       }
+
+       platform_set_drvdata(pdev, rtc);
+
+       ret = device_init_wakeup(&pdev->dev, true);
+       if (ret)
+               dev_warn(&pdev->dev,
+                        "alarm won't be able to wake up the system\n");
+
+       rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, pdev->name,
+                       &stm32_rtc_ops, THIS_MODULE);
+       if (IS_ERR(rtc->rtc_dev)) {
+               ret = PTR_ERR(rtc->rtc_dev);
+               dev_err(&pdev->dev, "rtc device registration failed, err=%d\n",
+                       ret);
+               goto err;
+       }
+
+       /* Handle RTC alarm interrupts */
+       ret = devm_request_threaded_irq(&pdev->dev, rtc->irq_alarm, NULL,
+                                       stm32_rtc_alarm_irq,
+                                       IRQF_TRIGGER_RISING | IRQF_ONESHOT,
+                                       pdev->name, rtc);
+       if (ret) {
+               dev_err(&pdev->dev, "IRQ%d (alarm interrupt) already claimed\n",
+                       rtc->irq_alarm);
+               goto err;
+       }
+
+       /*
+        * If INITS flag is reset (calendar year field set to 0x00), calendar
+        * must be initialized
+        */
+       if (!(readl_relaxed(rtc->base + STM32_RTC_ISR) & STM32_RTC_ISR_INITS))
+               dev_warn(&pdev->dev, "Date/Time must be initialized\n");
+
+       return 0;
+err:
+       clk_disable_unprepare(rtc->ck_rtc);
+
+       regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, 0);
+
+       device_init_wakeup(&pdev->dev, false);
+
+       return ret;
+}
+
+static int stm32_rtc_remove(struct platform_device *pdev)
+{
+       struct stm32_rtc *rtc = platform_get_drvdata(pdev);
+       unsigned int cr;
+
+       /* Disable interrupts */
+       stm32_rtc_wpr_unlock(rtc);
+       cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+       cr &= ~STM32_RTC_CR_ALRAIE;
+       writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+       stm32_rtc_wpr_lock(rtc);
+
+       clk_disable_unprepare(rtc->ck_rtc);
+
+       /* Enable backup domain write protection */
+       regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, 0);
+
+       device_init_wakeup(&pdev->dev, false);
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int stm32_rtc_suspend(struct device *dev)
+{
+       struct stm32_rtc *rtc = dev_get_drvdata(dev);
+
+       if (device_may_wakeup(dev))
+               return enable_irq_wake(rtc->irq_alarm);
+
+       return 0;
+}
+
+static int stm32_rtc_resume(struct device *dev)
+{
+       struct stm32_rtc *rtc = dev_get_drvdata(dev);
+       int ret = 0;
+
+       ret = stm32_rtc_wait_sync(rtc);
+       if (ret < 0)
+               return ret;
+
+       if (device_may_wakeup(dev))
+               return disable_irq_wake(rtc->irq_alarm);
+
+       return ret;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(stm32_rtc_pm_ops,
+                        stm32_rtc_suspend, stm32_rtc_resume);
+
+static struct platform_driver stm32_rtc_driver = {
+       .probe          = stm32_rtc_probe,
+       .remove         = stm32_rtc_remove,
+       .driver         = {
+               .name   = DRIVER_NAME,
+               .pm     = &stm32_rtc_pm_ops,
+               .of_match_table = stm32_rtc_of_match,
+       },
+};
+
+module_platform_driver(stm32_rtc_driver);
+
+MODULE_ALIAS("platform:" DRIVER_NAME);
+MODULE_AUTHOR("Amelie Delaunay <amelie.delaunay@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STM32 Real Time Clock driver");
+MODULE_LICENSE("GPL v2");
index c169a2c..39cbc12 100644 (file)
@@ -20,6 +20,8 @@
  * more details.
  */
 
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/fs.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/rtc.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
 /* Control register */
 #define SUN6I_LOSC_CTRL                                0x0000
+#define SUN6I_LOSC_CTRL_KEY                    (0x16aa << 16)
 #define SUN6I_LOSC_CTRL_ALM_DHMS_ACC           BIT(9)
 #define SUN6I_LOSC_CTRL_RTC_HMS_ACC            BIT(8)
 #define SUN6I_LOSC_CTRL_RTC_YMD_ACC            BIT(7)
+#define SUN6I_LOSC_CTRL_EXT_OSC                        BIT(0)
 #define SUN6I_LOSC_CTRL_ACC_MASK               GENMASK(9, 7)
 
+#define SUN6I_LOSC_CLK_PRESCAL                 0x0008
+
 /* RTC */
 #define SUN6I_RTC_YMD                          0x0010
 #define SUN6I_RTC_HMS                          0x0014
@@ -114,13 +121,142 @@ struct sun6i_rtc_dev {
        void __iomem *base;
        int irq;
        unsigned long alarm;
+
+       struct clk_hw hw;
+       struct clk_hw *int_osc;
+       struct clk *losc;
+
+       spinlock_t lock;
+};
+
+static struct sun6i_rtc_dev *sun6i_rtc;
+
+static unsigned long sun6i_rtc_osc_recalc_rate(struct clk_hw *hw,
+                                              unsigned long parent_rate)
+{
+       struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+       u32 val;
+
+       val = readl(rtc->base + SUN6I_LOSC_CTRL);
+       if (val & SUN6I_LOSC_CTRL_EXT_OSC)
+               return parent_rate;
+
+       val = readl(rtc->base + SUN6I_LOSC_CLK_PRESCAL);
+       val &= GENMASK(4, 0);
+
+       return parent_rate / (val + 1);
+}
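+
+/*
+ * Example: with the mux on the ~667 kHz internal RC oscillator and a
+ * prescaler register value of 19, the LOSC ticks at 667000 / (19 + 1)
+ * = 33350 Hz, i.e. roughly the 32768 Hz an external crystal delivers.
+ */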
+
+static u8 sun6i_rtc_osc_get_parent(struct clk_hw *hw)
+{
+       struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+
+       return readl(rtc->base + SUN6I_LOSC_CTRL) & SUN6I_LOSC_CTRL_EXT_OSC;
+}
+
+static int sun6i_rtc_osc_set_parent(struct clk_hw *hw, u8 index)
+{
+       struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+       unsigned long flags;
+       u32 val;
+
+       if (index > 1)
+               return -EINVAL;
+
+       spin_lock_irqsave(&rtc->lock, flags);
+       val = readl(rtc->base + SUN6I_LOSC_CTRL);
+       val &= ~SUN6I_LOSC_CTRL_EXT_OSC;
+       val |= SUN6I_LOSC_CTRL_KEY;
+       val |= index ? SUN6I_LOSC_CTRL_EXT_OSC : 0;
+       writel(val, rtc->base + SUN6I_LOSC_CTRL);
+       spin_unlock_irqrestore(&rtc->lock, flags);
+
+       return 0;
+}
+
+static const struct clk_ops sun6i_rtc_osc_ops = {
+       .recalc_rate    = sun6i_rtc_osc_recalc_rate,
+
+       .get_parent     = sun6i_rtc_osc_get_parent,
+       .set_parent     = sun6i_rtc_osc_set_parent,
 };
 
+static void __init sun6i_rtc_clk_init(struct device_node *node)
+{
+       struct clk_hw_onecell_data *clk_data;
+       struct sun6i_rtc_dev *rtc;
+       struct clk_init_data init = {
+               .ops            = &sun6i_rtc_osc_ops,
+       };
+       const char *parents[2];
+
+       rtc = kzalloc(sizeof(*rtc), GFP_KERNEL);
+       if (!rtc)
+               return;
+       spin_lock_init(&rtc->lock);
+
+       clk_data = kzalloc(sizeof(*clk_data) + sizeof(*clk_data->hws),
+                          GFP_KERNEL);
+       if (!clk_data) {
+               kfree(rtc);
+               return;
+       }
+
+       rtc->base = of_io_request_and_map(node, 0, of_node_full_name(node));
+       if (IS_ERR(rtc->base)) {
+               pr_crit("Can't map RTC registers");
+               return;
+       }
+
+       /* Switch to the external, more precise, oscillator */
+       writel(SUN6I_LOSC_CTRL_KEY | SUN6I_LOSC_CTRL_EXT_OSC,
+              rtc->base + SUN6I_LOSC_CTRL);
+
+       /* Yes, I know, this is ugly. */
+       sun6i_rtc = rtc;
+
+       /* Deal with old DTs */
+       if (!of_get_property(node, "clocks", NULL))
+               return;
+
+       rtc->int_osc = clk_hw_register_fixed_rate_with_accuracy(NULL,
+                                                               "rtc-int-osc",
+                                                               NULL, 0,
+                                                               667000,
+                                                               300000000);
+       if (IS_ERR(rtc->int_osc)) {
+               pr_crit("Couldn't register the internal oscillator\n");
+               return;
+       }
+
+       parents[0] = clk_hw_get_name(rtc->int_osc);
+       parents[1] = of_clk_get_parent_name(node, 0);
+
+       rtc->hw.init = &init;
+
+       init.parent_names = parents;
+       init.num_parents = of_clk_get_parent_count(node) + 1;
+       of_property_read_string(node, "clock-output-names", &init.name);
+
+       rtc->losc = clk_register(NULL, &rtc->hw);
+       if (IS_ERR(rtc->losc)) {
+               pr_crit("Couldn't register the LOSC clock\n");
+               return;
+       }
+
+       clk_data->num = 1;
+       clk_data->hws[0] = &rtc->hw;
+       of_clk_add_hw_provider(node, of_clk_hw_onecell_get, clk_data);
+}
+CLK_OF_DECLARE_DRIVER(sun6i_rtc_clk, "allwinner,sun6i-a31-rtc",
+                     sun6i_rtc_clk_init);
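+
+/*
+ * The _DRIVER variant of CLK_OF_DECLARE is used on purpose: it registers
+ * the early clock init without marking the node OF_POPULATED, so the same
+ * "allwinner,sun6i-a31-rtc" node is still probed as a platform device and
+ * sun6i_rtc_probe() can pick up the sun6i_rtc pointer stashed above.
+ */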
+
 static irqreturn_t sun6i_rtc_alarmirq(int irq, void *id)
 {
        struct sun6i_rtc_dev *chip = (struct sun6i_rtc_dev *) id;
+       irqreturn_t ret = IRQ_NONE;
        u32 val;
 
+       spin_lock(&chip->lock);
        val = readl(chip->base + SUN6I_ALRM_IRQ_STA);
 
        if (val & SUN6I_ALRM_IRQ_STA_CNT_IRQ_PEND) {
@@ -129,10 +265,11 @@ static irqreturn_t sun6i_rtc_alarmirq(int irq, void *id)
 
                rtc_update_irq(chip->rtc, 1, RTC_AF | RTC_IRQF);
 
-               return IRQ_HANDLED;
+               ret = IRQ_HANDLED;
        }
+       spin_unlock(&chip->lock);
 
-       return IRQ_NONE;
+       return ret;
 }
 
 static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
@@ -140,6 +277,7 @@ static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
        u32 alrm_val = 0;
        u32 alrm_irq_val = 0;
        u32 alrm_wake_val = 0;
+       unsigned long flags;
 
        if (to) {
                alrm_val = SUN6I_ALRM_EN_CNT_EN;
@@ -150,9 +288,11 @@ static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
                       chip->base + SUN6I_ALRM_IRQ_STA);
        }
 
+       spin_lock_irqsave(&chip->lock, flags);
        writel(alrm_val, chip->base + SUN6I_ALRM_EN);
        writel(alrm_irq_val, chip->base + SUN6I_ALRM_IRQ_EN);
        writel(alrm_wake_val, chip->base + SUN6I_ALARM_CONFIG);
+       spin_unlock_irqrestore(&chip->lock, flags);
 }
 
 static int sun6i_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
@@ -191,11 +331,15 @@ static int sun6i_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
 static int sun6i_rtc_getalarm(struct device *dev, struct rtc_wkalrm *wkalrm)
 {
        struct sun6i_rtc_dev *chip = dev_get_drvdata(dev);
+       unsigned long flags;
        u32 alrm_st;
        u32 alrm_en;
 
+       spin_lock_irqsave(&chip->lock, flags);
        alrm_en = readl(chip->base + SUN6I_ALRM_IRQ_EN);
        alrm_st = readl(chip->base + SUN6I_ALRM_IRQ_STA);
+       spin_unlock_irqrestore(&chip->lock, flags);
+
        wkalrm->enabled = !!(alrm_en & SUN6I_ALRM_EN_CNT_EN);
        wkalrm->pending = !!(alrm_st & SUN6I_ALRM_EN_CNT_EN);
        rtc_time_to_tm(chip->alarm, &wkalrm->time);
@@ -349,22 +493,15 @@ static const struct rtc_class_ops sun6i_rtc_ops = {
 
 static int sun6i_rtc_probe(struct platform_device *pdev)
 {
-       struct sun6i_rtc_dev *chip;
-       struct resource *res;
+       struct sun6i_rtc_dev *chip = sun6i_rtc;
        int ret;
 
-       chip = devm_kzalloc(&pdev->dev, sizeof(*chip), GFP_KERNEL);
        if (!chip)
-               return -ENOMEM;
+               return -ENODEV;
 
        platform_set_drvdata(pdev, chip);
        chip->dev = &pdev->dev;
 
-       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       chip->base = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(chip->base))
-               return PTR_ERR(chip->base);
-
        chip->irq = platform_get_irq(pdev, 0);
        if (chip->irq < 0) {
                dev_err(&pdev->dev, "No IRQ resource\n");
@@ -404,8 +541,10 @@ static int sun6i_rtc_probe(struct platform_device *pdev)
        /* disable alarm wakeup */
        writel(0, chip->base + SUN6I_ALARM_CONFIG);
 
-       chip->rtc = rtc_device_register("rtc-sun6i", &pdev->dev,
-                                       &sun6i_rtc_ops, THIS_MODULE);
+       clk_prepare_enable(chip->losc);
+
+       chip->rtc = devm_rtc_device_register(&pdev->dev, "rtc-sun6i",
+                                            &sun6i_rtc_ops, THIS_MODULE);
        if (IS_ERR(chip->rtc)) {
                dev_err(&pdev->dev, "unable to register device\n");
                return PTR_ERR(chip->rtc);
@@ -416,15 +555,6 @@ static int sun6i_rtc_probe(struct platform_device *pdev)
        return 0;
 }
 
-static int sun6i_rtc_remove(struct platform_device *pdev)
-{
-       struct sun6i_rtc_dev *chip = platform_get_drvdata(pdev);
-
-       rtc_device_unregister(chip->rtc);
-
-       return 0;
-}
-
 static const struct of_device_id sun6i_rtc_dt_ids[] = {
        { .compatible = "allwinner,sun6i-a31-rtc" },
        { /* sentinel */ },
@@ -433,15 +563,9 @@ MODULE_DEVICE_TABLE(of, sun6i_rtc_dt_ids);
 
 static struct platform_driver sun6i_rtc_driver = {
        .probe          = sun6i_rtc_probe,
-       .remove         = sun6i_rtc_remove,
        .driver         = {
                .name           = "sun6i-rtc",
                .of_match_table = sun6i_rtc_dt_ids,
        },
 };
-
-module_platform_driver(sun6i_rtc_driver);
-
-MODULE_DESCRIPTION("sun6i RTC driver");
-MODULE_AUTHOR("Chen-Yu Tsai <wens@csie.org>");
-MODULE_LICENSE("GPL");
+builtin_platform_driver(sun6i_rtc_driver);
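
builtin_platform_driver() fits here because the early clock init above cannot
be undone, so the driver can no longer be built or unloaded as a module.
Roughly, the macro expands to a sketch like the following (via the generic
builtin_driver() helper):

	static int __init sun6i_rtc_driver_init(void)
	{
		return platform_driver_register(&sun6i_rtc_driver);
	}
	device_initcall(sun6i_rtc_driver_init);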
index 3853ba9..d30d57b 100644 (file)
  * with this program; if not, write to the Free Software Foundation, Inc.,
  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
-#include <linux/kernel.h>
+
+#include <linux/clk.h>
+#include <linux/delay.h>
 #include <linux/init.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
 #include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/rtc.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
+#include <linux/rtc.h>
+#include <linux/slab.h>
 
 /* set to 1 = busy every eight 32kHz clocks during copy of sec+msec to AHB */
 #define TEGRA_RTC_REG_BUSY                     0x004
@@ -59,6 +61,7 @@ struct tegra_rtc_info {
        struct platform_device  *pdev;
        struct rtc_device       *rtc_dev;
        void __iomem            *rtc_base; /* NULL if not initialized. */
+       struct clk              *clk;
        int                     tegra_rtc_irq; /* alarm and periodic irq */
        spinlock_t              tegra_rtc_lock;
 };
@@ -326,6 +329,14 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
        if (info->tegra_rtc_irq <= 0)
                return -EBUSY;
 
+       info->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(info->clk))
+               return PTR_ERR(info->clk);
+
+       ret = clk_prepare_enable(info->clk);
+       if (ret < 0)
+               return ret;
+
        /* set context info. */
        info->pdev = pdev;
        spin_lock_init(&info->tegra_rtc_lock);
@@ -346,7 +357,7 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
                ret = PTR_ERR(info->rtc_dev);
                dev_err(&pdev->dev, "Unable to register device (err=%d).\n",
                        ret);
-               return ret;
+               goto disable_clk;
        }
 
        ret = devm_request_irq(&pdev->dev, info->tegra_rtc_irq,
@@ -356,12 +367,25 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
                dev_err(&pdev->dev,
                        "Unable to request interrupt for device (err=%d).\n",
                        ret);
-               return ret;
+               goto disable_clk;
        }
 
        dev_notice(&pdev->dev, "Tegra internal Real Time Clock\n");
 
        return 0;
+
+disable_clk:
+       clk_disable_unprepare(info->clk);
+       return ret;
+}
+
+static int tegra_rtc_remove(struct platform_device *pdev)
+{
+       struct tegra_rtc_info *info = platform_get_drvdata(pdev);
+
+       clk_disable_unprepare(info->clk);
+
+       return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -413,6 +437,7 @@ static void tegra_rtc_shutdown(struct platform_device *pdev)
 
 MODULE_ALIAS("platform:tegra_rtc");
 static struct platform_driver tegra_rtc_driver = {
+       .remove         = tegra_rtc_remove,
        .shutdown       = tegra_rtc_shutdown,
        .driver         = {
                .name   = "tegra_rtc",
index 5a3d53c..d0244d7 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/types.h>
 #include <linux/rtc.h>
 #include <linux/bcd.h>
+#include <linux/math64.h>
 #include <linux/platform_device.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/tps65910.h>
@@ -33,7 +34,21 @@ struct tps65910_rtc {
 /* Total number of RTC registers needed to set time*/
 #define NUM_TIME_REGS  (TPS65910_YEARS - TPS65910_SECONDS + 1)
 
-static int tps65910_rtc_alarm_irq_enable(struct device *dev, unsigned enabled)
+/* Total number of RTC registers needed to set compensation registers */
+#define NUM_COMP_REGS  (TPS65910_RTC_COMP_MSB - TPS65910_RTC_COMP_LSB + 1)
+
+/* Min and max values supported with 'offset' interface (swapped sign) */
+#define MIN_OFFSET     (-277761)
+#define MAX_OFFSET     (277778)
+
+/* Number of ticks per hour */
+#define TICKS_PER_HOUR (32768 * 3600)
+
+/* Multiplier for ppb conversions */
+#define PPB_MULT       (1000000000LL)
+
+static int tps65910_rtc_alarm_irq_enable(struct device *dev,
+                                        unsigned int enabled)
 {
        struct tps65910 *tps = dev_get_drvdata(dev->parent);
        u8 val = 0;
@@ -187,6 +202,133 @@ static int tps65910_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
        return ret;
 }
 
+static int tps65910_rtc_set_calibration(struct device *dev, int calibration)
+{
+       unsigned char comp_data[NUM_COMP_REGS];
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       s16 value;
+       int ret;
+
+       /*
+        * TPS65910 uses a two's complement 16 bit value to compensate for
+        * RTC crystal inaccuracies. Once every hour, when the seconds
+        * counter increments from 0 to 1, the compensation value is added
+        * to the internal RTC counter value.
+        *
+        * Compensation value 0x7FFF is a prohibited value.
+        *
+        * Valid range for compensation value: [-32768 .. 32766]
+        */
+       if ((calibration < -32768) || (calibration > 32766)) {
+               dev_err(dev, "RTC calibration value out of range: %d\n",
+                       calibration);
+               return -EINVAL;
+       }
+
+       value = (s16)calibration;
+
+       comp_data[0] = (u16)value & 0xFF;
+       comp_data[1] = ((u16)value >> 8) & 0xFF;
+
+       /* Update all the compensation registers in one shot */
+       ret = regmap_bulk_write(tps->regmap, TPS65910_RTC_COMP_LSB,
+               comp_data, NUM_COMP_REGS);
+       if (ret < 0) {
+               dev_err(dev, "rtc_set_calibration error: %d\n", ret);
+               return ret;
+       }
+
+       /* Enable automatic compensation */
+       ret = regmap_update_bits(tps->regmap, TPS65910_RTC_CTRL,
+               TPS65910_RTC_CTRL_AUTO_COMP, TPS65910_RTC_CTRL_AUTO_COMP);
+       if (ret < 0)
+               dev_err(dev, "auto_comp enable failed with error: %d\n", ret);
+
+       return ret;
+}
+
+static int tps65910_rtc_get_calibration(struct device *dev, int *calibration)
+{
+       unsigned char comp_data[NUM_COMP_REGS];
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       unsigned int ctrl;
+       u16 value;
+       int ret;
+
+       ret = regmap_read(tps->regmap, TPS65910_RTC_CTRL, &ctrl);
+       if (ret < 0)
+               return ret;
+
+       /* If automatic compensation is not enabled report back zero */
+       if (!(ctrl & TPS65910_RTC_CTRL_AUTO_COMP)) {
+               *calibration = 0;
+               return 0;
+       }
+
+       ret = regmap_bulk_read(tps->regmap, TPS65910_RTC_COMP_LSB, comp_data,
+               NUM_COMP_REGS);
+       if (ret < 0) {
+               dev_err(dev, "rtc_get_calibration error: %d\n", ret);
+               return ret;
+       }
+
+       value = (u16)comp_data[0] | ((u16)comp_data[1] << 8);
+
+       *calibration = (s16)value;
+
+       return 0;
+}
+
+static int tps65910_read_offset(struct device *dev, long *offset)
+{
+       int calibration;
+       s64 tmp;
+       int ret;
+
+       ret = tps65910_rtc_get_calibration(dev, &calibration);
+       if (ret < 0)
+               return ret;
+
+       /* Convert from RTC calibration register format to ppb format */
+       tmp = calibration * (s64)PPB_MULT;
+       if (tmp < 0)
+               tmp -= TICKS_PER_HOUR / 2LL;
+       else
+               tmp += TICKS_PER_HOUR / 2LL;
+       tmp = div_s64(tmp, TICKS_PER_HOUR);
+
+       /* Offset works in the opposite direction, so swap the sign */
+       *offset = (long)-tmp;
+
+       return 0;
+}
+
+static int tps65910_set_offset(struct device *dev, long offset)
+{
+       int calibration;
+       s64 tmp;
+       int ret;
+
+       /* Make sure offset value is within supported range */
+       if (offset < MIN_OFFSET || offset > MAX_OFFSET)
+               return -ERANGE;
+
+       /* Convert from ppb format to RTC calibration register format */
+       tmp = offset * (s64)TICKS_PER_HOUR;
+       if (tmp < 0)
+               tmp -= PPB_MULT / 2LL;
+       else
+               tmp += PPB_MULT / 2LL;
+       tmp = div_s64(tmp, PPB_MULT);
+
+       /* Offset works in the opposite direction, so swap the sign */
+       calibration = (int)-tmp;
+
+       ret = tps65910_rtc_set_calibration(dev, calibration);
+
+       return ret;
+}
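+
+/*
+ * Worked example for the conversion above: one calibration step is one
+ * 32768 Hz tick per hour, i.e. PPB_MULT / TICKS_PER_HOUR =
+ * 1000000000 / 117964800 ~= 8.477 ppb. An offset of +8477 ppb therefore
+ * rounds to calibration = -1000, and MIN_OFFSET/MAX_OFFSET are simply
+ * the calibration limits 32766 and -32768 scaled by the same factor.
+ */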
+
 static irqreturn_t tps65910_rtc_interrupt(int irq, void *rtc)
 {
        struct device *dev = rtc;
@@ -219,6 +361,8 @@ static const struct rtc_class_ops tps65910_rtc_ops = {
        .read_alarm     = tps65910_rtc_read_alarm,
        .set_alarm      = tps65910_rtc_set_alarm,
        .alarm_irq_enable = tps65910_rtc_alarm_irq_enable,
+       .read_offset    = tps65910_read_offset,
+       .set_offset     = tps65910_set_offset,
 };
 
 static int tps65910_rtc_probe(struct platform_device *pdev)
index 0f17137..0b38217 100644 (file)
@@ -4864,7 +4864,7 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
                        break;
                case 3: /* tsa_intrg */
                        len += sprintf(page + len, PRINTK_HEADER
-                                     " tsb->tsa.intrg.: not supportet yet\n");
+                                     " tsb->tsa.intrg.: not supported yet\n");
                        break;
                }
 
index 8225da6..4182f60 100644 (file)
@@ -165,13 +165,15 @@ int tpi(struct tpi_info *addr)
 int chsc(void *chsc_area)
 {
        typedef struct { char _[4096]; } addr_type;
-       int cc;
+       int cc = -EIO;
 
        asm volatile(
                "       .insn   rre,0xb25f0000,%2,0\n"
-               "       ipm     %0\n"
+               "0:     ipm     %0\n"
                "       srl     %0,28\n"
-               : "=d" (cc), "=m" (*(addr_type *) chsc_area)
+               "1:\n"
+               EX_TABLE(0b, 1b)
+               : "+d" (cc), "=m" (*(addr_type *) chsc_area)
                : "d" (chsc_area), "m" (*(addr_type *) chsc_area)
                : "cc");
        trace_s390_cio_chsc(chsc_area, cc);
index 0a7fb83..be36f10 100644 (file)
@@ -10,3 +10,7 @@ zcrypt-objs += zcrypt_msgtype6.o zcrypt_msgtype50.o
 obj-$(CONFIG_ZCRYPT) += zcrypt.o
 # adapter drivers depend on ap.o and zcrypt.o
 obj-$(CONFIG_ZCRYPT) += zcrypt_pcixcc.o zcrypt_cex2a.o zcrypt_cex4.o
+
+# pkey kernel module
+pkey-objs := pkey_api.o
+obj-$(CONFIG_PKEY) += pkey.o
index 56db76c..9be4596 100644 (file)
@@ -1107,16 +1107,6 @@ static void ap_config_timeout(unsigned long ptr)
        queue_work(system_long_wq, &ap_scan_work);
 }
 
-static void ap_reset_domain(void)
-{
-       int i;
-
-       if (ap_domain_index == -1 || !ap_test_config_domain(ap_domain_index))
-               return;
-       for (i = 0; i < AP_DEVICES; i++)
-               ap_rapq(AP_MKQID(i, ap_domain_index));
-}
-
 static void ap_reset_all(void)
 {
        int i, j;
index 1cd9128..cfa161c 100644 (file)
@@ -58,9 +58,9 @@ static ssize_t ap_functions_show(struct device *dev,
 
 static DEVICE_ATTR(ap_functions, 0444, ap_functions_show, NULL);
 
-static ssize_t ap_request_count_show(struct device *dev,
-                                    struct device_attribute *attr,
-                                    char *buf)
+static ssize_t ap_req_count_show(struct device *dev,
+                                struct device_attribute *attr,
+                                char *buf)
 {
        struct ap_card *ac = to_ap_card(dev);
        unsigned int req_cnt;
@@ -72,7 +72,23 @@ static ssize_t ap_request_count_show(struct device *dev,
        return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt);
 }
 
-static DEVICE_ATTR(request_count, 0444, ap_request_count_show, NULL);
+static ssize_t ap_req_count_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t count)
+{
+       struct ap_card *ac = to_ap_card(dev);
+       struct ap_queue *aq;
+
+       spin_lock_bh(&ap_list_lock);
+       for_each_ap_queue(aq, ac)
+               aq->total_request_count = 0;
+       spin_unlock_bh(&ap_list_lock);
+       atomic_set(&ac->total_request_count, 0);
+
+       return count;
+}
+
+static DEVICE_ATTR(request_count, 0644, ap_req_count_show, ap_req_count_store);
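
With request_count now writable, the accumulated per-card counters can be
reset from userspace; the store method ignores the written value, so e.g.
"echo 0 > /sys/bus/ap/devices/cardxx/request_count" (sysfs path assumed)
suffices. The analogous change for per-queue counters follows below.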
 
 static ssize_t ap_requestq_count_show(struct device *dev,
                                      struct device_attribute *attr, char *buf)
index 7be67fa..480c58a 100644 (file)
@@ -459,9 +459,9 @@ EXPORT_SYMBOL(ap_queue_resume);
 /*
  * AP queue related attributes.
  */
-static ssize_t ap_request_count_show(struct device *dev,
-                                    struct device_attribute *attr,
-                                    char *buf)
+static ssize_t ap_req_count_show(struct device *dev,
+                                struct device_attribute *attr,
+                                char *buf)
 {
        struct ap_queue *aq = to_ap_queue(dev);
        unsigned int req_cnt;
@@ -472,7 +472,20 @@ static ssize_t ap_request_count_show(struct device *dev,
        return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt);
 }
 
-static DEVICE_ATTR(request_count, 0444, ap_request_count_show, NULL);
+static ssize_t ap_req_count_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t count)
+{
+       struct ap_queue *aq = to_ap_queue(dev);
+
+       spin_lock_bh(&aq->lock);
+       aq->total_request_count = 0;
+       spin_unlock_bh(&aq->lock);
+
+       return count;
+}
+
+static DEVICE_ATTR(request_count, 0644, ap_req_count_show, ap_req_count_store);
 
 static ssize_t ap_requestq_count_show(struct device *dev,
                                      struct device_attribute *attr, char *buf)
diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c
new file mode 100644 (file)
index 0000000..40f1136
--- /dev/null
@@ -0,0 +1,1148 @@
+/*
+ *  pkey device driver
+ *
+ *  Copyright IBM Corp. 2017
+ *  Author(s): Harald Freudenberger
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ */
+
+#define KMSG_COMPONENT "pkey"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <asm/zcrypt.h>
+#include <asm/cpacf.h>
+#include <asm/pkey.h>
+
+#include "zcrypt_api.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("s390 protected key interface");
+
+/* Size of parameter block used for all cca requests/replies */
+#define PARMBSIZE 512
+
+/* Size of vardata block used for some of the cca requests/replies */
+#define VARDATASIZE 4096
+
+/*
+ * debug feature data and functions
+ */
+
+static debug_info_t *debug_info;
+
+#define DEBUG_DBG(...) debug_sprintf_event(debug_info, 6, ##__VA_ARGS__)
+#define DEBUG_INFO(...) debug_sprintf_event(debug_info, 5, ##__VA_ARGS__)
+#define DEBUG_WARN(...) debug_sprintf_event(debug_info, 4, ##__VA_ARGS__)
+#define DEBUG_ERR(...) debug_sprintf_event(debug_info, 3, ##__VA_ARGS__)
+
+static void __init pkey_debug_init(void)
+{
+       debug_info = debug_register("pkey", 1, 1, 4 * sizeof(long));
+       debug_register_view(debug_info, &debug_sprintf_view);
+       debug_set_level(debug_info, 3);
+}
+
+static void __exit pkey_debug_exit(void)
+{
+       debug_unregister(debug_info);
+}
+
+/* inside view of a secure key token (only type 0x01 version 0x04) */
+struct secaeskeytoken {
+       u8  type;     /* 0x01 for internal key token */
+       u8  res0[3];
+       u8  version;  /* should be 0x04 */
+       u8  res1[1];
+       u8  flag;     /* key flags */
+       u8  res2[1];
+       u64 mkvp;     /* master key verification pattern */
+       u8  key[32];  /* key value (encrypted) */
+       u8  cv[8];    /* control vector */
+       u16 bitsize;  /* key bit size */
+       u16 keysize;  /* key byte size */
+       u8  tvv[4];   /* token validation value */
+} __packed;
+
+/*
+ * Simple check if the token is a valid CCA secure AES key
+ * token. If keybitsize is given, the bitsize of the key is
+ * also checked. Returns 0 on success or errno value on failure.
+ */
+static int check_secaeskeytoken(u8 *token, int keybitsize)
+{
+       struct secaeskeytoken *t = (struct secaeskeytoken *) token;
+
+       if (t->type != 0x01) {
+               DEBUG_ERR(
+                       "check_secaeskeytoken secure token check failed, type mismatch 0x%02x != 0x01\n",
+                       (int) t->type);
+               return -EINVAL;
+       }
+       if (t->version != 0x04) {
+               DEBUG_ERR(
+                       "check_secaeskeytoken secure token check failed, version mismatch 0x%02x != 0x04\n",
+                       (int) t->version);
+               return -EINVAL;
+       }
+       if (keybitsize > 0 && t->bitsize != keybitsize) {
+               DEBUG_ERR(
+                       "check_secaeskeytoken secure token check failed, bitsize mismatch %d != %d\n",
+                       (int) t->bitsize, keybitsize);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Allocate consecutive memory for request CPRB, request param
+ * block, reply CPRB and reply param block and fill in values
+ * for the common fields. Returns 0 on success or errno value
+ * on failure.
+ */
+static int alloc_and_prep_cprbmem(size_t paramblen,
+                                 u8 **pcprbmem,
+                                 struct CPRBX **preqCPRB,
+                                 struct CPRBX **prepCPRB)
+{
+       u8 *cprbmem;
+       size_t cprbplusparamblen = sizeof(struct CPRBX) + paramblen;
+       struct CPRBX *preqcblk, *prepcblk;
+
+       /*
+        * allocate consecutive memory for request CPRB, request param
+        * block, reply CPRB and reply param block
+        */
+       cprbmem = kzalloc(2 * cprbplusparamblen, GFP_KERNEL);
+       if (!cprbmem)
+               return -ENOMEM;
+
+       preqcblk = (struct CPRBX *) cprbmem;
+       prepcblk = (struct CPRBX *) (cprbmem + cprbplusparamblen);
+
+       /* fill request cprb struct */
+       preqcblk->cprb_len = sizeof(struct CPRBX);
+       preqcblk->cprb_ver_id = 0x02;
+       memcpy(preqcblk->func_id, "T2", 2);
+       preqcblk->rpl_msgbl = cprbplusparamblen;
+       if (paramblen) {
+               preqcblk->req_parmb =
+                       ((u8 *) preqcblk) + sizeof(struct CPRBX);
+               preqcblk->rpl_parmb =
+                       ((u8 *) prepcblk) + sizeof(struct CPRBX);
+       }
+
+       *pcprbmem = cprbmem;
+       *preqCPRB = preqcblk;
+       *prepCPRB = prepcblk;
+
+       return 0;
+}
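+
+/*
+ * Resulting layout of the single allocation:
+ *
+ *   cprbmem: [ request CPRBX | request parm block (paramblen bytes) ]
+ *            [ reply   CPRBX | reply   parm block (paramblen bytes) ]
+ *
+ * with req_parmb and rpl_parmb in the request CPRB pointing at the
+ * two parameter blocks.
+ */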
+
+/*
+ * Free the cprb memory allocated with the function above.
+ * If the scrub value is not zero, the memory is filled
+ * with zeros before freeing (useful if there was some
+ * clear key material in there).
+ */
+static void free_cprbmem(void *mem, size_t paramblen, int scrub)
+{
+       if (scrub)
+               memzero_explicit(mem, 2 * (sizeof(struct CPRBX) + paramblen));
+       kfree(mem);
+}
+
+/*
+ * Helper function to prepare the xcrb struct
+ */
+static inline void prep_xcrb(struct ica_xcRB *pxcrb,
+                            u16 cardnr,
+                            struct CPRBX *preqcblk,
+                            struct CPRBX *prepcblk)
+{
+       memset(pxcrb, 0, sizeof(*pxcrb));
+       pxcrb->agent_ID = 0x4341; /* 'CA' */
+       pxcrb->user_defined = (cardnr == 0xFFFF ? AUTOSELECT : cardnr);
+       pxcrb->request_control_blk_length =
+               preqcblk->cprb_len + preqcblk->req_parml;
+       pxcrb->request_control_blk_addr = (void *) preqcblk;
+       pxcrb->reply_control_blk_length = preqcblk->rpl_msgbl;
+       pxcrb->reply_control_blk_addr = (void *) prepcblk;
+}
+
+/*
+ * Helper function which calls zcrypt_send_cprb with the
+ * memory management segment adjusted to kernel space
+ * so that the copy_from_user calls within that function
+ * in fact copy from kernel space.
+ */
+static inline int _zcrypt_send_cprb(struct ica_xcRB *xcrb)
+{
+       int rc;
+       mm_segment_t old_fs = get_fs();
+
+       set_fs(KERNEL_DS);
+       rc = zcrypt_send_cprb(xcrb);
+       set_fs(old_fs);
+
+       return rc;
+}
+
+/*
+ * Generate (random) AES secure key.
+ */
+int pkey_genseckey(u16 cardnr, u16 domain,
+                  u32 keytype, struct pkey_seckey *seckey)
+{
+       int i, rc, keysize;
+       int seckeysize;
+       u8 *mem;
+       struct CPRBX *preqcblk, *prepcblk;
+       struct ica_xcRB xcrb;
+       struct kgreqparm {
+               u8  subfunc_code[2];
+               u16 rule_array_len;
+               struct lv1 {
+                       u16 len;
+                       char  key_form[8];
+                       char  key_length[8];
+                       char  key_type1[8];
+                       char  key_type2[8];
+               } lv1;
+               struct lv2 {
+                       u16 len;
+                       struct keyid {
+                               u16 len;
+                               u16 attr;
+                               u8  data[SECKEYBLOBSIZE];
+                       } keyid[6];
+               } lv2;
+       } *preqparm;
+       struct kgrepparm {
+               u8  subfunc_code[2];
+               u16 rule_array_len;
+               struct lv3 {
+                       u16 len;
+                       u16 keyblocklen;
+                       struct {
+                               u16 toklen;
+                               u16 tokattr;
+                               u8  tok[0];
+                               /* ... some more data ... */
+                       } keyblock;
+               } lv3;
+       } *prepparm;
+
+       /* get already prepared memory for 2 cprbs with param block each */
+       rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+       if (rc)
+               return rc;
+
+       /* fill request cprb struct */
+       preqcblk->domain = domain;
+
+       /* fill request cprb param block with KG request */
+       preqparm = (struct kgreqparm *) preqcblk->req_parmb;
+       memcpy(preqparm->subfunc_code, "KG", 2);
+       preqparm->rule_array_len = sizeof(preqparm->rule_array_len);
+       preqparm->lv1.len = sizeof(struct lv1);
+       memcpy(preqparm->lv1.key_form,   "OP      ", 8);
+       switch (keytype) {
+       case PKEY_KEYTYPE_AES_128:
+               keysize = 16;
+               memcpy(preqparm->lv1.key_length, "KEYLN16 ", 8);
+               break;
+       case PKEY_KEYTYPE_AES_192:
+               keysize = 24;
+               memcpy(preqparm->lv1.key_length, "KEYLN24 ", 8);
+               break;
+       case PKEY_KEYTYPE_AES_256:
+               keysize = 32;
+               memcpy(preqparm->lv1.key_length, "KEYLN32 ", 8);
+               break;
+       default:
+               DEBUG_ERR(
+                       "pkey_genseckey unknown/unsupported keytype %d\n",
+                       keytype);
+               rc = -EINVAL;
+               goto out;
+       }
+       memcpy(preqparm->lv1.key_type1,  "AESDATA ", 8);
+       preqparm->lv2.len = sizeof(struct lv2);
+       for (i = 0; i < 6; i++) {
+               preqparm->lv2.keyid[i].len = sizeof(struct keyid);
+               preqparm->lv2.keyid[i].attr = (i == 2 ? 0x30 : 0x10);
+       }
+       preqcblk->req_parml = sizeof(struct kgreqparm);
+
+       /* fill xcrb struct */
+       prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+       /* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+       rc = _zcrypt_send_cprb(&xcrb);
+       if (rc) {
+               DEBUG_ERR(
+                       "pkey_genseckey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+                       (int) cardnr, (int) domain, rc);
+               goto out;
+       }
+
+       /* check response returncode and reasoncode */
+       if (prepcblk->ccp_rtcode != 0) {
+               DEBUG_ERR(
+                       "pkey_genseckey secure key generate failure, card response %d/%d\n",
+                       (int) prepcblk->ccp_rtcode,
+                       (int) prepcblk->ccp_rscode);
+               rc = -EIO;
+               goto out;
+       }
+
+       /* process response cprb param block */
+       prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+       prepparm = (struct kgrepparm *) prepcblk->rpl_parmb;
+
+       /* check length of the returned secure key token */
+       seckeysize = prepparm->lv3.keyblock.toklen
+               - sizeof(prepparm->lv3.keyblock.toklen)
+               - sizeof(prepparm->lv3.keyblock.tokattr);
+       if (seckeysize != SECKEYBLOBSIZE) {
+               DEBUG_ERR(
+                       "pkey_genseckey secure token size mismatch %d != %d bytes\n",
+                       seckeysize, SECKEYBLOBSIZE);
+               rc = -EIO;
+               goto out;
+       }
+
+       /* check secure key token */
+       rc = check_secaeskeytoken(prepparm->lv3.keyblock.tok, 8*keysize);
+       if (rc) {
+               rc = -EIO;
+               goto out;
+       }
+
+       /* copy the generated secure key token */
+       memcpy(seckey->seckey, prepparm->lv3.keyblock.tok, SECKEYBLOBSIZE);
+
+out:
+       free_cprbmem(mem, PARMBSIZE, 0);
+       return rc;
+}
+EXPORT_SYMBOL(pkey_genseckey);
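
An illustrative in-kernel caller (a sketch, not part of the patch; a valid
cardnr/domain pair is assumed, e.g. located beforehand with pkey_findcard()
further down):

	static int example_gen_seckey(u16 cardnr, u16 domain,
				      struct pkey_seckey *sk)
	{
		/* Ask the CCA coprocessor for a random AES-256 secure key */
		return pkey_genseckey(cardnr, domain, PKEY_KEYTYPE_AES_256, sk);
	}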
+
+/*
+ * Generate an AES secure key with given key value.
+ */
+int pkey_clr2seckey(u16 cardnr, u16 domain, u32 keytype,
+                   const struct pkey_clrkey *clrkey,
+                   struct pkey_seckey *seckey)
+{
+       int rc, keysize, seckeysize;
+       u8 *mem;
+       struct CPRBX *preqcblk, *prepcblk;
+       struct ica_xcRB xcrb;
+       struct cmreqparm {
+               u8  subfunc_code[2];
+               u16 rule_array_len;
+               char  rule_array[8];
+               struct lv1 {
+                       u16 len;
+                       u8  clrkey[0];
+               } lv1;
+               struct lv2 {
+                       u16 len;
+                       struct keyid {
+                               u16 len;
+                               u16 attr;
+                               u8  data[SECKEYBLOBSIZE];
+                       } keyid;
+               } lv2;
+       } *preqparm;
+       struct lv2 *plv2;
+       struct cmrepparm {
+               u8  subfunc_code[2];
+               u16 rule_array_len;
+               struct lv3 {
+                       u16 len;
+                       u16 keyblocklen;
+                       struct {
+                               u16 toklen;
+                               u16 tokattr;
+                               u8  tok[0];
+                               /* ... some more data ... */
+                       } keyblock;
+               } lv3;
+       } *prepparm;
+
+       /* get already prepared memory for 2 cprbs with param block each */
+       rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+       if (rc)
+               return rc;
+
+       /* fill request cprb struct */
+       preqcblk->domain = domain;
+
+       /* fill request cprb param block with CM request */
+       preqparm = (struct cmreqparm *) preqcblk->req_parmb;
+       memcpy(preqparm->subfunc_code, "CM", 2);
+       memcpy(preqparm->rule_array, "AES     ", 8);
+       preqparm->rule_array_len =
+               sizeof(preqparm->rule_array_len) + sizeof(preqparm->rule_array);
+       switch (keytype) {
+       case PKEY_KEYTYPE_AES_128:
+               keysize = 16;
+               break;
+       case PKEY_KEYTYPE_AES_192:
+               keysize = 24;
+               break;
+       case PKEY_KEYTYPE_AES_256:
+               keysize = 32;
+               break;
+       default:
+               DEBUG_ERR(
+                       "pkey_clr2seckey unknown/unsupported keytype %d\n",
+                       keytype);
+               rc = -EINVAL;
+               goto out;
+       }
+       preqparm->lv1.len = sizeof(struct lv1) + keysize;
+       memcpy(preqparm->lv1.clrkey, clrkey->clrkey, keysize);
+       plv2 = (struct lv2 *) (((u8 *) &preqparm->lv2) + keysize);
+       plv2->len = sizeof(struct lv2);
+       plv2->keyid.len = sizeof(struct keyid);
+       plv2->keyid.attr = 0x30;
+       preqcblk->req_parml = sizeof(struct cmreqparm) + keysize;
+
+       /* fill xcrb struct */
+       prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+       /* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+       rc = _zcrypt_send_cprb(&xcrb);
+       if (rc) {
+               DEBUG_ERR(
+                       "pkey_clr2seckey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+                       (int) cardnr, (int) domain, rc);
+               goto out;
+       }
+
+       /* check response returncode and reasoncode */
+       if (prepcblk->ccp_rtcode != 0) {
+               DEBUG_ERR(
+                       "pkey_clr2seckey clear key import failure, card response %d/%d\n",
+                       (int) prepcblk->ccp_rtcode,
+                       (int) prepcblk->ccp_rscode);
+               rc = -EIO;
+               goto out;
+       }
+
+       /* process response cprb param block */
+       prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+       prepparm = (struct cmrepparm *) prepcblk->rpl_parmb;
+
+       /* check length of the returned secure key token */
+       seckeysize = prepparm->lv3.keyblock.toklen
+               - sizeof(prepparm->lv3.keyblock.toklen)
+               - sizeof(prepparm->lv3.keyblock.tokattr);
+       if (seckeysize != SECKEYBLOBSIZE) {
+               DEBUG_ERR(
+                       "pkey_clr2seckey secure token size mismatch %d != %d bytes\n",
+                       seckeysize, SECKEYBLOBSIZE);
+               rc = -EIO;
+               goto out;
+       }
+
+       /* check secure key token */
+       rc = check_secaeskeytoken(prepparm->lv3.keyblock.tok, 8*keysize);
+       if (rc) {
+               rc = -EIO;
+               goto out;
+       }
+
+       /* copy the generated secure key token */
+       memcpy(seckey->seckey, prepparm->lv3.keyblock.tok, SECKEYBLOBSIZE);
+
+out:
+       free_cprbmem(mem, PARMBSIZE, 1);
+       return rc;
+}
+EXPORT_SYMBOL(pkey_clr2seckey);
+
+/*
+ * Derive a protected key from the secure key blob.
+ */
+int pkey_sec2protkey(u16 cardnr, u16 domain,
+                    const struct pkey_seckey *seckey,
+                    struct pkey_protkey *protkey)
+{
+       int rc;
+       u8 *mem;
+       struct CPRBX *preqcblk, *prepcblk;
+       struct ica_xcRB xcrb;
+       struct uskreqparm {
+               u8  subfunc_code[2];
+               u16 rule_array_len;
+               struct lv1 {
+                       u16 len;
+                       u16 attr_len;
+                       u16 attr_flags;
+               } lv1;
+               struct lv2 {
+                       u16 len;
+                       u16 attr_len;
+                       u16 attr_flags;
+                       u8  token[0];         /* cca secure key token */
+               } lv2 __packed;
+       } *preqparm;
+       struct uskrepparm {
+               u8  subfunc_code[2];
+               u16 rule_array_len;
+               struct lv3 {
+                       u16 len;
+                       u16 attr_len;
+                       u16 attr_flags;
+                       struct cpacfkeyblock {
+                               u8  version;  /* version of this struct */
+                               u8  flags[2];
+                               u8  algo;
+                               u8  form;
+                               u8  pad1[3];
+                               u16 keylen;
+                               u8  key[64];  /* the key (keylen bytes) */
+                               u16 keyattrlen;
+                               u8  keyattr[32];
+                               u8  pad2[1];
+                               u8  vptype;
+                               u8  vp[32];  /* verification pattern */
+                       } keyblock;
+               } lv3 __packed;
+       } *prepparm;
+
+       /* get already prepared memory for 2 cprbs with param block each */
+       rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+       if (rc)
+               return rc;
+
+       /* fill request cprb struct */
+       preqcblk->domain = domain;
+
+       /* fill request cprb param block with USK request */
+       preqparm = (struct uskreqparm *) preqcblk->req_parmb;
+       memcpy(preqparm->subfunc_code, "US", 2);
+       preqparm->rule_array_len = sizeof(preqparm->rule_array_len);
+       preqparm->lv1.len = sizeof(struct lv1);
+       preqparm->lv1.attr_len = sizeof(struct lv1) - sizeof(preqparm->lv1.len);
+       preqparm->lv1.attr_flags = 0x0001;
+       preqparm->lv2.len = sizeof(struct lv2) + SECKEYBLOBSIZE;
+       preqparm->lv2.attr_len = sizeof(struct lv2)
+               - sizeof(preqparm->lv2.len) + SECKEYBLOBSIZE;
+       preqparm->lv2.attr_flags = 0x0000;
+       memcpy(preqparm->lv2.token, seckey->seckey, SECKEYBLOBSIZE);
+       preqcblk->req_parml = sizeof(struct uskreqparm) + SECKEYBLOBSIZE;
+
+       /* fill xcrb struct */
+       prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+       /* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+       rc = _zcrypt_send_cprb(&xcrb);
+       if (rc) {
+               DEBUG_ERR(
+                       "pkey_sec2protkey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+                       (int) cardnr, (int) domain, rc);
+               goto out;
+       }
+
+       /* check response returncode and reasoncode */
+       if (prepcblk->ccp_rtcode != 0) {
+               DEBUG_ERR(
+                       "pkey_sec2protkey unwrap secure key failure, card response %d/%d\n",
+                       (int) prepcblk->ccp_rtcode,
+                       (int) prepcblk->ccp_rscode);
+               rc = -EIO;
+               goto out;
+       }
+
+       /* process response cprb param block */
+       prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+       prepparm = (struct uskrepparm *) prepcblk->rpl_parmb;
+
+       /* check the returned keyblock */
+       if (prepparm->lv3.keyblock.version != 0x01) {
+               DEBUG_ERR(
+                       "pkey_sec2protkey reply param keyblock version mismatch 0x%02x != 0x01\n",
+                       (int) prepparm->lv3.keyblock.version);
+               rc = -EIO;
+               goto out;
+       }
+
+       /* copy the translated protected key */
+       switch (prepparm->lv3.keyblock.keylen) {
+       case 16+32:
+               protkey->type = PKEY_KEYTYPE_AES_128;
+               break;
+       case 24+32:
+               protkey->type = PKEY_KEYTYPE_AES_192;
+               break;
+       case 32+32:
+               protkey->type = PKEY_KEYTYPE_AES_256;
+               break;
+       default:
+               DEBUG_ERR("pkey_sec2protkey unknown/unsupported keytype %d\n",
+                         prepparm->lv3.keyblock.keylen);
+               rc = -EIO;
+               goto out;
+       }
+       protkey->len = prepparm->lv3.keyblock.keylen;
+       memcpy(protkey->protkey, prepparm->lv3.keyblock.key, protkey->len);
+
+out:
+       free_cprbmem(mem, PARMBSIZE, 0);
+       return rc;
+}
+EXPORT_SYMBOL(pkey_sec2protkey);
+
+/*
+ * Create a protected key from a clear key value.
+ */
+int pkey_clr2protkey(u32 keytype,
+                    const struct pkey_clrkey *clrkey,
+                    struct pkey_protkey *protkey)
+{
+       long fc;
+       int keysize;
+       u8 paramblock[64];
+
+       switch (keytype) {
+       case PKEY_KEYTYPE_AES_128:
+               keysize = 16;
+               fc = CPACF_PCKMO_ENC_AES_128_KEY;
+               break;
+       case PKEY_KEYTYPE_AES_192:
+               keysize = 24;
+               fc = CPACF_PCKMO_ENC_AES_192_KEY;
+               break;
+       case PKEY_KEYTYPE_AES_256:
+               keysize = 32;
+               fc = CPACF_PCKMO_ENC_AES_256_KEY;
+               break;
+       default:
+               DEBUG_ERR("pkey_clr2protkey unknown/unsupported keytype %d\n",
+                         keytype);
+               return -EINVAL;
+       }
+
+       /* prepare param block */
+       memset(paramblock, 0, sizeof(paramblock));
+       memcpy(paramblock, clrkey->clrkey, keysize);
+
+       /* call the pckmo instruction */
+       cpacf_pckmo(fc, paramblock);
+
+       /* copy created protected key */
+       protkey->type = keytype;
+       protkey->len = keysize + 32;
+       memcpy(protkey->protkey, paramblock, keysize + 32);
+
+       return 0;
+}
+EXPORT_SYMBOL(pkey_clr2protkey);
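
Illustrative caller, again a sketch rather than part of the patch: turning a
clear AES-256 key into a CPACF protected key. The resulting blob is the
wrapped key value followed by a 32 byte verification pattern, which is why
len ends up as keysize + 32.

	static int example_clr2prot(const struct pkey_clrkey *ck,
				    struct pkey_protkey *pk)
	{
		return pkey_clr2protkey(PKEY_KEYTYPE_AES_256, ck, pk);
	}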
+
+/*
+ * query cryptographic facility from adapter
+ */
+static int query_crypto_facility(u16 cardnr, u16 domain,
+                                const char *keyword,
+                                u8 *rarray, size_t *rarraylen,
+                                u8 *varray, size_t *varraylen)
+{
+       int rc;
+       u16 len;
+       u8 *mem, *ptr;
+       struct CPRBX *preqcblk, *prepcblk;
+       struct ica_xcRB xcrb;
+       struct fqreqparm {
+               u8  subfunc_code[2];
+               u16 rule_array_len;
+               char  rule_array[8];
+               struct lv1 {
+                       u16 len;
+                       u8  data[VARDATASIZE];
+               } lv1;
+               u16 dummylen;
+       } *preqparm;
+       size_t parmbsize = sizeof(struct fqreqparm);
+       struct fqrepparm {
+               u8  subfunc_code[2];
+               u8  lvdata[0];
+       } *prepparm;
+
+       /* get already prepared memory for 2 cprbs with param block each */
+       rc = alloc_and_prep_cprbmem(parmbsize, &mem, &preqcblk, &prepcblk);
+       if (rc)
+               return rc;
+
+       /* fill request cprb struct */
+       preqcblk->domain = domain;
+
+       /* fill request cprb param block with FQ request */
+       preqparm = (struct fqreqparm *) preqcblk->req_parmb;
+       memcpy(preqparm->subfunc_code, "FQ", 2);
+       strncpy(preqparm->rule_array, keyword, sizeof(preqparm->rule_array));
+       preqparm->rule_array_len =
+               sizeof(preqparm->rule_array_len) + sizeof(preqparm->rule_array);
+       preqparm->lv1.len = sizeof(preqparm->lv1);
+       preqparm->dummylen = sizeof(preqparm->dummylen);
+       preqcblk->req_parml = parmbsize;
+
+       /* fill xcrb struct */
+       prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+       /* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+       rc = _zcrypt_send_cprb(&xcrb);
+       if (rc) {
+               DEBUG_ERR(
+                       "query_crypto_facility zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+                       (int) cardnr, (int) domain, rc);
+               goto out;
+       }
+
+       /* check response returncode and reasoncode */
+       if (prepcblk->ccp_rtcode != 0) {
+               DEBUG_ERR(
+                       "query_crypto_facility unwrap secure key failure, card response %d/%d\n",
+                       (int) prepcblk->ccp_rtcode,
+                       (int) prepcblk->ccp_rscode);
+               rc = -EIO;
+               goto out;
+       }
+
+       /* process response cprb param block */
+       prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+       prepparm = (struct fqrepparm *) prepcblk->rpl_parmb;
+       ptr = prepparm->lvdata;
+
+       /* check and possibly copy reply rule array */
+       len = *((u16 *) ptr);
+       if (len > sizeof(u16)) {
+               ptr += sizeof(u16);
+               len -= sizeof(u16);
+               if (rarray && rarraylen && *rarraylen > 0) {
+                       *rarraylen = (len > *rarraylen ? *rarraylen : len);
+                       memcpy(rarray, ptr, *rarraylen);
+               }
+               ptr += len;
+       }
+       /* check and possibly copy reply var array */
+       len = *((u16 *) ptr);
+       if (len > sizeof(u16)) {
+               ptr += sizeof(u16);
+               len -= sizeof(u16);
+               if (varray && varraylen && *varraylen > 0) {
+                       *varraylen = (len > *varraylen ? *varraylen : len);
+                       memcpy(varray, ptr, *varraylen);
+               }
+               ptr += len;
+       }
+
+out:
+       free_cprbmem(mem, parmbsize, 0);
+       return rc;
+}
+
+/*
+ * Fetch just the mkvp value via query_crypto_facility from adapter.
+ */
+static int fetch_mkvp(u16 cardnr, u16 domain, u64 *mkvp)
+{
+       int rc, found = 0;
+       size_t rlen, vlen;
+       u8 *rarray, *varray, *pg;
+
+       pg = (u8 *) __get_free_page(GFP_KERNEL);
+       if (!pg)
+               return -ENOMEM;
+       rarray = pg;
+       varray = pg + PAGE_SIZE/2;
+       rlen = vlen = PAGE_SIZE/2;
+
+       rc = query_crypto_facility(cardnr, domain, "STATICSA",
+                                  rarray, &rlen, varray, &vlen);
+       if (rc == 0 && rlen > 8*8 && vlen > 184+8) {
+               if (rarray[64] == '2') {
+                       /* current master key state is valid */
+                       *mkvp = *((u64 *)(varray + 184));
+                       found = 1;
+               }
+       }
+
+       free_page((unsigned long) pg);
+
+       return found ? 0 : -ENOENT;
+}
+
+/* struct to hold cached mkvp info for each card/domain */
+struct mkvp_info {
+       struct list_head list;
+       u16 cardnr;
+       u16 domain;
+       u64 mkvp;
+};
+
+/* a list with mkvp_info entries */
+static LIST_HEAD(mkvp_list);
+static DEFINE_SPINLOCK(mkvp_list_lock);
+
+static int mkvp_cache_fetch(u16 cardnr, u16 domain, u64 *mkvp)
+{
+       int rc = -ENOENT;
+       struct mkvp_info *ptr;
+
+       spin_lock_bh(&mkvp_list_lock);
+       list_for_each_entry(ptr, &mkvp_list, list) {
+               if (ptr->cardnr == cardnr &&
+                   ptr->domain == domain) {
+                       *mkvp = ptr->mkvp;
+                       rc = 0;
+                       break;
+               }
+       }
+       spin_unlock_bh(&mkvp_list_lock);
+
+       return rc;
+}
+
+static void mkvp_cache_update(u16 cardnr, u16 domain, u64 mkvp)
+{
+       int found = 0;
+       struct mkvp_info *ptr;
+
+       spin_lock_bh(&mkvp_list_lock);
+       list_for_each_entry(ptr, &mkvp_list, list) {
+               if (ptr->cardnr == cardnr &&
+                   ptr->domain == domain) {
+                       ptr->mkvp = mkvp;
+                       found = 1;
+                       break;
+               }
+       }
+       if (!found) {
+               ptr = kmalloc(sizeof(*ptr), GFP_ATOMIC);
+               if (!ptr) {
+                       spin_unlock_bh(&mkvp_list_lock);
+                       return;
+               }
+               ptr->cardnr = cardnr;
+               ptr->domain = domain;
+               ptr->mkvp = mkvp;
+               list_add(&ptr->list, &mkvp_list);
+       }
+       spin_unlock_bh(&mkvp_list_lock);
+}
+
+static void mkvp_cache_scrub(u16 cardnr, u16 domain)
+{
+       struct mkvp_info *ptr;
+
+       spin_lock_bh(&mkvp_list_lock);
+       list_for_each_entry(ptr, &mkvp_list, list) {
+               if (ptr->cardnr == cardnr &&
+                   ptr->domain == domain) {
+                       list_del(&ptr->list);
+                       kfree(ptr);
+                       break;
+               }
+       }
+       spin_unlock_bh(&mkvp_list_lock);
+}
+
+static void __exit mkvp_cache_free(void)
+{
+       struct mkvp_info *ptr, *pnext;
+
+       spin_lock_bh(&mkvp_list_lock);
+       list_for_each_entry_safe(ptr, pnext, &mkvp_list, list) {
+               list_del(&ptr->list);
+               kfree(ptr);
+       }
+       spin_unlock_bh(&mkvp_list_lock);
+}
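
One detail worth noting in these cache helpers: mkvp_cache_update() allocates with GFP_ATOMIC because the allocation is made while mkvp_list_lock is held with bottom halves disabled, where sleeping is forbidden. The constraint, in outline (kernel context assumed):

        spin_lock_bh(&mkvp_list_lock);
        /* ... no sleeping from here on ... */
        ptr = kmalloc(sizeof(*ptr), GFP_ATOMIC);  /* non-sleeping allocation */
        /* a GFP_KERNEL allocation here could schedule while atomic */
        spin_unlock_bh(&mkvp_list_lock);
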
+
+/*
+ * Search for a matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ */
+int pkey_findcard(const struct pkey_seckey *seckey,
+                 u16 *pcardnr, u16 *pdomain, int verify)
+{
+       struct secaeskeytoken *t = (struct secaeskeytoken *) seckey;
+       struct zcrypt_device_matrix *device_matrix;
+       u16 card, dom;
+       u64 mkvp;
+       int i, rc;
+
+       /* mkvp must not be zero */
+       if (t->mkvp == 0)
+               return -EINVAL;
+
+       /* fetch status of all crypto cards */
+       device_matrix = kmalloc(sizeof(struct zcrypt_device_matrix),
+                               GFP_KERNEL);
+       if (!device_matrix)
+               return -ENOMEM;
+       zcrypt_device_status_mask(device_matrix);
+
+       /* walk through all crypto cards */
+       for (i = 0; i < MAX_ZDEV_ENTRIES; i++) {
+               card = AP_QID_CARD(device_matrix->device[i].qid);
+               dom = AP_QID_QUEUE(device_matrix->device[i].qid);
+               if (device_matrix->device[i].online &&
+                   device_matrix->device[i].functions & 0x04) {
+                       /* an enabled CCA Coprocessor card */
+                       /* try cached mkvp */
+                       if (mkvp_cache_fetch(card, dom, &mkvp) == 0 &&
+                           t->mkvp == mkvp) {
+                               if (!verify)
+                                       break;
+                               /* verify: fetch mkvp from adapter */
+                               if (fetch_mkvp(card, dom, &mkvp) == 0) {
+                                       mkvp_cache_update(card, dom, mkvp);
+                                       if (t->mkvp == mkvp)
+                                               break;
+                               }
+                       }
+               } else {
+                       /* Card is offline and/or not a CCA card. */
+                       /* del mkvp entry from cache if it exists */
+                       mkvp_cache_scrub(card, dom);
+               }
+       }
+       if (i >= MAX_ZDEV_ENTRIES) {
+               /* nothing found, so this time without cache */
+               for (i = 0; i < MAX_ZDEV_ENTRIES; i++) {
+                       if (!(device_matrix->device[i].online &&
+                             device_matrix->device[i].functions & 0x04))
+                               continue;
+                       card = AP_QID_CARD(device_matrix->device[i].qid);
+                       dom = AP_QID_QUEUE(device_matrix->device[i].qid);
+                       /* fresh fetch mkvp from adapter */
+                       if (fetch_mkvp(card, dom, &mkvp) == 0) {
+                               mkvp_cache_update(card, dom, mkvp);
+                               if (t->mkvp == mkvp)
+                                       break;
+                       }
+               }
+       }
+       if (i < MAX_ZDEV_ENTRIES) {
+               if (pcardnr)
+                       *pcardnr = card;
+               if (pdomain)
+                       *pdomain = dom;
+               rc = 0;
+       } else
+               rc = -ENODEV;
+
+       kfree(device_matrix);
+       return rc;
+}
+EXPORT_SYMBOL(pkey_findcard);
+
+/*
+ * Find card and transform secure key into protected key.
+ */
+int pkey_skey2pkey(const struct pkey_seckey *seckey,
+                  struct pkey_protkey *protkey)
+{
+       u16 cardnr, domain;
+       int rc, verify;
+
+       /*
+        * The pkey_sec2protkey call may fail when a card has been
+        * addressed whose master key was changed after the last fetch
+        * of the mkvp into the cache. So first try without verify, then
+        * with verify enabled (thus refreshing the mkvp for each card).
+        */
+       for (verify = 0; verify < 2; verify++) {
+               rc = pkey_findcard(seckey, &cardnr, &domain, verify);
+               if (rc)
+                       continue;
+               rc = pkey_sec2protkey(cardnr, domain, seckey, protkey);
+               if (rc == 0)
+                       break;
+       }
+
+       if (rc)
+               DEBUG_DBG("pkey_skey2pkey failed rc=%d\n", rc);
+
+       return rc;
+}
+EXPORT_SYMBOL(pkey_skey2pkey);
+
+/*
+ * File io functions
+ */
+
+static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
+                               unsigned long arg)
+{
+       int rc;
+
+       switch (cmd) {
+       case PKEY_GENSECK: {
+               struct pkey_genseck __user *ugs = (void __user *) arg;
+               struct pkey_genseck kgs;
+
+               if (copy_from_user(&kgs, ugs, sizeof(kgs)))
+                       return -EFAULT;
+               rc = pkey_genseckey(kgs.cardnr, kgs.domain,
+                                   kgs.keytype, &kgs.seckey);
+               DEBUG_DBG("pkey_ioctl pkey_genseckey()=%d\n", rc);
+               if (rc)
+                       break;
+               if (copy_to_user(ugs, &kgs, sizeof(kgs)))
+                       return -EFAULT;
+               break;
+       }
+       case PKEY_CLR2SECK: {
+               struct pkey_clr2seck __user *ucs = (void __user *) arg;
+               struct pkey_clr2seck kcs;
+
+               if (copy_from_user(&kcs, ucs, sizeof(kcs)))
+                       return -EFAULT;
+               rc = pkey_clr2seckey(kcs.cardnr, kcs.domain, kcs.keytype,
+                                    &kcs.clrkey, &kcs.seckey);
+               DEBUG_DBG("pkey_ioctl pkey_clr2seckey()=%d\n", rc);
+               if (rc)
+                       break;
+               if (copy_to_user(ucs, &kcs, sizeof(kcs)))
+                       return -EFAULT;
+               memzero_explicit(&kcs, sizeof(kcs));
+               break;
+       }
+       case PKEY_SEC2PROTK: {
+               struct pkey_sec2protk __user *usp = (void __user *) arg;
+               struct pkey_sec2protk ksp;
+
+               if (copy_from_user(&ksp, usp, sizeof(ksp)))
+                       return -EFAULT;
+               rc = pkey_sec2protkey(ksp.cardnr, ksp.domain,
+                                     &ksp.seckey, &ksp.protkey);
+               DEBUG_DBG("pkey_ioctl pkey_sec2protkey()=%d\n", rc);
+               if (rc)
+                       break;
+               if (copy_to_user(usp, &ksp, sizeof(ksp)))
+                       return -EFAULT;
+               break;
+       }
+       case PKEY_CLR2PROTK: {
+               struct pkey_clr2protk __user *ucp = (void __user *) arg;
+               struct pkey_clr2protk kcp;
+
+               if (copy_from_user(&kcp, ucp, sizeof(kcp)))
+                       return -EFAULT;
+               rc = pkey_clr2protkey(kcp.keytype,
+                                     &kcp.clrkey, &kcp.protkey);
+               DEBUG_DBG("pkey_ioctl pkey_clr2protkey()=%d\n", rc);
+               if (rc)
+                       break;
+               if (copy_to_user(ucp, &kcp, sizeof(kcp)))
+                       return -EFAULT;
+               memzero_explicit(&kcp, sizeof(kcp));
+               break;
+       }
+       case PKEY_FINDCARD: {
+               struct pkey_findcard __user *ufc = (void __user *) arg;
+               struct pkey_findcard kfc;
+
+               if (copy_from_user(&kfc, ufc, sizeof(kfc)))
+                       return -EFAULT;
+               rc = pkey_findcard(&kfc.seckey,
+                                  &kfc.cardnr, &kfc.domain, 1);
+               DEBUG_DBG("pkey_ioctl pkey_findcard()=%d\n", rc);
+               if (rc)
+                       break;
+               if (copy_to_user(ufc, &kfc, sizeof(kfc)))
+                       return -EFAULT;
+               break;
+       }
+       case PKEY_SKEY2PKEY: {
+               struct pkey_skey2pkey __user *usp = (void __user *) arg;
+               struct pkey_skey2pkey ksp;
+
+               if (copy_from_user(&ksp, usp, sizeof(ksp)))
+                       return -EFAULT;
+               rc = pkey_skey2pkey(&ksp.seckey, &ksp.protkey);
+               DEBUG_DBG("pkey_ioctl pkey_skey2pkey()=%d\n", rc);
+               if (rc)
+                       break;
+               if (copy_to_user(usp, &ksp, sizeof(ksp)))
+                       return -EFAULT;
+               break;
+       }
+       default:
+               /* unknown/unsupported ioctl cmd */
+               return -ENOTTY;
+       }
+
+       return rc;
+}
+
+/*
+ * Sysfs and file io operations
+ */
+static const struct file_operations pkey_fops = {
+       .owner          = THIS_MODULE,
+       .open           = nonseekable_open,
+       .llseek         = no_llseek,
+       .unlocked_ioctl = pkey_unlocked_ioctl,
+};
+
+static struct miscdevice pkey_dev = {
+       .name   = "pkey",
+       .minor  = MISC_DYNAMIC_MINOR,
+       .mode   = 0666,
+       .fops   = &pkey_fops,
+};
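
For context, a userspace consumer would drive the new /dev/pkey node roughly as below. This is a sketch, not part of the patch: it assumes the new uapi pkey header is installed as <asm/pkey.h> and that card 0/domain 0 is an online CCA coprocessor with a valid master key:

        #include <stdio.h>
        #include <string.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <asm/pkey.h>   /* assumed install path of the uapi header */

        int main(void)
        {
                struct pkey_genseck gs;
                struct pkey_skey2pkey sp;
                int fd = open("/dev/pkey", O_RDWR);

                if (fd < 0) {
                        perror("open /dev/pkey");
                        return 1;
                }

                /* generate a secure AES-128 key on hypothetical card 0/domain 0 */
                memset(&gs, 0, sizeof(gs));
                gs.cardnr = 0;
                gs.domain = 0;
                gs.keytype = PKEY_KEYTYPE_AES_128;
                if (ioctl(fd, PKEY_GENSECK, &gs) < 0) {
                        perror("PKEY_GENSECK");
                        close(fd);
                        return 1;
                }

                /* turn the secure key into a CPACF protected key */
                memset(&sp, 0, sizeof(sp));
                sp.seckey = gs.seckey;
                if (ioctl(fd, PKEY_SKEY2PKEY, &sp) < 0) {
                        perror("PKEY_SKEY2PKEY");
                        close(fd);
                        return 1;
                }
                printf("protected key: type %u, %u bytes\n",
                       sp.protkey.type, sp.protkey.len);
                close(fd);
                return 0;
        }
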
+
+/*
+ * Module init
+ */
+int __init pkey_init(void)
+{
+       cpacf_mask_t pckmo_functions;
+
+       /* check for pckmo instructions available */
+       if (!cpacf_query(CPACF_PCKMO, &pckmo_functions))
+               return -EOPNOTSUPP;
+       if (!cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_128_KEY) ||
+           !cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_192_KEY) ||
+           !cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_256_KEY))
+               return -EOPNOTSUPP;
+
+       pkey_debug_init();
+
+       return misc_register(&pkey_dev);
+}
+
+/*
+ * Module exit
+ */
+static void __exit pkey_exit(void)
+{
+       misc_deregister(&pkey_dev);
+       mkvp_cache_free();
+       pkey_debug_exit();
+}
+
+module_init(pkey_init);
+module_exit(pkey_exit);
index 144a179..93015f8 100644 (file)
@@ -374,7 +374,7 @@ out:
        return rc;
 }
 
-static long zcrypt_send_cprb(struct ica_xcRB *xcRB)
+long zcrypt_send_cprb(struct ica_xcRB *xcRB)
 {
        struct zcrypt_card *zc, *pref_zc;
        struct zcrypt_queue *zq, *pref_zq;
@@ -444,6 +444,7 @@ out:
                              AP_QID_CARD(qid), AP_QID_QUEUE(qid));
        return rc;
 }
+EXPORT_SYMBOL(zcrypt_send_cprb);
 
 static bool is_desired_ep11_card(unsigned int dev_id,
                                 unsigned short target_num,
@@ -619,7 +620,7 @@ out:
        return rc;
 }
 
-static void zcrypt_device_status_mask(struct zcrypt_device_matrix *matrix)
+void zcrypt_device_status_mask(struct zcrypt_device_matrix *matrix)
 {
        struct zcrypt_card *zc;
        struct zcrypt_queue *zq;
index 274a590..6c94efd 100644 (file)
@@ -190,5 +190,7 @@ void zcrypt_msgtype_unregister(struct zcrypt_ops *);
 struct zcrypt_ops *zcrypt_msgtype(unsigned char *, int);
 int zcrypt_api_init(void);
 void zcrypt_api_exit(void);
+long zcrypt_send_cprb(struct ica_xcRB *xcRB);
+void zcrypt_device_status_mask(struct zcrypt_device_matrix *devstatus);
 
 #endif /* _ZCRYPT_API_H_ */
index 137d22d..838347c 100644 (file)
@@ -1630,7 +1630,7 @@ static int aac_acquire_resources(struct aac_dev *dev)
 
        if (!dev->sync_mode) {
                /* After EEH recovery or suspend resume, max_msix count
-                * may change, therfore updating in init as well.
+                * may change, therefore updating in init as well.
                 */
                dev->init->r7.no_of_msix_vectors = cpu_to_le32(dev->max_msix);
                aac_adapter_start(dev);
index ae5bfe0..ccbd9e3 100644 (file)
@@ -680,7 +680,7 @@ struct bfi_ioim_req_s {
 
        /*
         * SG elements array within the IO request must be double word
-        * aligned. This aligment is required to optimize SGM setup for the IO.
+        * aligned. This alignment is required to optimize SGM setup for the IO.
         */
        struct bfi_sge_s        sges[BFI_SGE_INLINE_MAX];
        u8      io_timeout;
index cea57e2..656463f 100644 (file)
@@ -1387,7 +1387,7 @@ static void fcoe_ctlr_recv_clr_vlink(struct fcoe_ctlr *fip,
        /*
         * Actually need to subtract 'sizeof(*mp) - sizeof(*wp)' from 'rlen'
         * before determining max Vx_Port descriptor but a buggy FCF could have
-        * omited either or both MAC Address and Name Identifier descriptors
+        * omitted either or both MAC Address and Name Identifier descriptors
         */
        num_vlink_desc = rlen / sizeof(*vp);
        if (num_vlink_desc)
index 835c59c..b29afaf 100644 (file)
@@ -9330,7 +9330,7 @@ static pci_ers_result_t ipr_pci_error_detected(struct pci_dev *pdev,
  * ipr_probe_ioa_part2 - Initializes IOAs found in ipr_probe_ioa(..)
  * @ioa_cfg:   ioa cfg struct
  *
- * Description: This is the second phase of adapter intialization
+ * Description: This is the second phase of adapter initialization
  * This function takes care of initializing the adapter to the point
  * where it can accept new commands.
 
index 50cf402..03cb05a 100644 (file)
@@ -3329,7 +3329,7 @@ static DEVICE_ATTR(lpfc_static_vport, S_IRUGO,
  * @buf: Data buffer.
  * @count: Size of the data buffer.
  *
- * This function get called when an user write to the lpfc_stat_data_ctrl
+ * This function gets called when a user writes to the lpfc_stat_data_ctrl
  * sysfs file. This function parses the command written to the sysfs file
  * and takes appropriate action. These commands are used for controlling
  * driver statistical data collection.
index d977a47..8e886ca 100644 (file)
@@ -4510,7 +4510,7 @@ lpfc_sli4_rb_setup(struct lpfc_hba *phba)
  * @phba: Pointer to HBA context object.
  * @sli_mode: sli mode - 2/3
  *
- * This function is called by the sli intialization code path
+ * This function is called by the sli initialization code path
  * to issue config_port mailbox command. This function restarts the
  * HBA firmware and issues a config_port mailbox command to configure
  * the SLI interface in the sli mode specified by sli_mode
@@ -4650,11 +4650,11 @@ do_prep_failed:
 
 
 /**
- * lpfc_sli_hba_setup - SLI intialization function
+ * lpfc_sli_hba_setup - SLI initialization function
  * @phba: Pointer to HBA context object.
  *
- * This function is the main SLI intialization function. This function
- * is called by the HBA intialization code, HBA reset code and HBA
+ * This function is the main SLI initialization function. This function
+ * is called by the HBA initialization code, HBA reset code and HBA
  * error attention handler code. Caller is not required to hold any
  * locks. This function issues config_port mailbox command to configure
  * the SLI, setup iocb rings and HBQ rings. In the end the function
@@ -6324,11 +6324,11 @@ lpfc_set_host_data(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox)
 }
 
 /**
- * lpfc_sli4_hba_setup - SLI4 device intialization PCI function
+ * lpfc_sli4_hba_setup - SLI4 device initialization PCI function
  * @phba: Pointer to HBA context object.
  *
- * This function is the main SLI4 device intialization PCI function. This
- * function is called by the HBA intialization code, HBA reset code and
+ * This function is the main SLI4 device initialization PCI function. This
+ * function is called by the HBA initialization code, HBA reset code and
  * HBA error attention handler code. Caller is not required to hold any
  * locks.
  **/
@@ -12079,7 +12079,7 @@ lpfc_sli4_sp_handle_els_wcqe(struct lpfc_hba *phba, struct lpfc_queue *cq,
  * @phba: Pointer to HBA context object.
  * @wcqe: Pointer to work-queue completion queue entry.
  *
- * This routine handles slow-path WQ entry comsumed event by invoking the
+ * This routine handles slow-path WQ entry consumed event by invoking the
  * proper WQ release routine to the slow-path WQ.
  **/
 static void
@@ -12451,7 +12451,7 @@ lpfc_sli4_fp_handle_fcp_wcqe(struct lpfc_hba *phba, struct lpfc_queue *cq,
  * @cq: Pointer to completion queue.
  * @wcqe: Pointer to work-queue completion queue entry.
  *
- * This routine handles an fast-path WQ entry comsumed event by invoking the
+ * This routine handles a fast-path WQ entry consumed event by invoking the
  * proper WQ release routine to the fast-path WQ.
  **/
 static void
index 02fe1c4..bdffb69 100644 (file)
@@ -1925,7 +1925,7 @@ mpt3sas_send_diag_release(struct MPT3SAS_ADAPTER *ioc, u8 buffer_type,
  *
  * This allows ownership of the specified buffer to returned to the driver,
  * allowing an application to read the buffer without fear that firmware is
- * overwritting information in the buffer.
+ * overwriting information in the buffer.
  */
 static long
 _ctl_diag_release(struct MPT3SAS_ADAPTER *ioc, void __user *arg)
index f3e17a8..a44046c 100644 (file)
@@ -390,7 +390,7 @@ struct mpt3_diag_query {
  *
  * This allows ownership of the specified buffer to be returned to the driver,
  * allowing an application to read the buffer without fear that firmware is
- * overwritting information in the buffer.
+ * overwriting information in the buffer.
  */
 struct mpt3_diag_release {
        struct mpt3_ioctl_header hdr;
index 30b9050..6903f03 100644 (file)
@@ -1290,7 +1290,7 @@ int osd_req_add_get_attr_list(struct osd_request *or,
        or->enc_get_attr.total_bytes = total_bytes;
 
        OSD_DEBUG(
-              "get_attr.total_bytes=%u(%u) enc_get_attr.total_bytes=%u(%Zu)\n",
+              "get_attr.total_bytes=%u(%u) enc_get_attr.total_bytes=%u(%zu)\n",
               or->get_attr.total_bytes,
               or->get_attr.total_bytes - _osd_req_sizeof_alist_header(or),
               or->enc_get_attr.total_bytes,
@@ -1677,7 +1677,7 @@ int osd_finalize_request(struct osd_request *or,
                }
        } else {
                /* TODO: I think that for the GET_ATTR command these 2 should
-                * be reversed to keep them in execution order (for embeded
+                * be reversed to keep them in execution order (for embedded
                 * targets with low memory footprint)
                 */
                ret = _osd_req_finalize_set_attr_list(or);
index 451de6c..75ac662 100644 (file)
@@ -3435,7 +3435,7 @@ static ssize_t osst_write(struct file * filp, const char __user * buf, size_t co
 
        /* Write must be integral number of blocks */
        if (STp->block_size != 0 && (count % STp->block_size) != 0) {
-               printk(KERN_ERR "%s:E: Write (%Zd bytes) not multiple of tape block size (%d%c).\n",
+               printk(KERN_ERR "%s:E: Write (%zd bytes) not multiple of tape block size (%d%c).\n",
                                       name, count, STp->block_size<1024?
                                       STp->block_size:STp->block_size/1024, STp->block_size<1024?'b':'k');
                retval = (-EINVAL);
@@ -3756,7 +3756,7 @@ static ssize_t osst_read(struct file * filp, char __user * buf, size_t count, lo
 
        if ((count % STp->block_size) != 0) {
                printk(KERN_WARNING
-                   "%s:W: Read (%Zd bytes) not multiple of tape block size (%d%c).\n", name, count,
+                   "%s:W: Read (%zd bytes) not multiple of tape block size (%d%c).\n", name, count,
                    STp->block_size<1024?STp->block_size:STp->block_size/1024, STp->block_size<1024?'b':'k');
        }
 
@@ -3815,7 +3815,7 @@ static ssize_t osst_read(struct file * filp, char __user * buf, size_t count, lo
 
                        if (transfer == 0) {
                                printk(KERN_WARNING
-                                 "%s:W: Nothing can be transferred, requested %Zd, tape block size (%d%c).\n",
+                                 "%s:W: Nothing can be transferred, requested %zd, tape block size (%d%c).\n",
                                        name, count, STp->block_size < 1024?
                                        STp->block_size:STp->block_size/1024,
                                        STp->block_size<1024?'b':'k');
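
The %Zd/%Zu to %zd/%zu conversions in these SCSI hunks (and in the AFS, TTY, USB and video hunks further down) replace the old GNU-libc-only 'Z' printf length modifier with the C99 'z' modifier for size_t/ssize_t values, which newer compilers' printf-format checking warns about. The pattern, in minimal form:

        size_t len = PAGE_SIZE;
        printk(KERN_DEBUG "len=%zu\n", len);    /* C99 'z'; "%Zu" was a GNUism */
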
index 7b6317c..265e139 100644 (file)
@@ -5669,7 +5669,7 @@ qla2x00_load_risc(scsi_qla_host_t *vha, uint32_t *srisc_addr)
        /* Validate firmware image by checking version. */
        if (blob->fw->size < 8 * sizeof(uint16_t)) {
                ql_log(ql_log_fatal, vha, 0x0085,
-                   "Unable to verify integrity of firmware image (%Zd).\n",
+                   "Unable to verify integrity of firmware image (%zd).\n",
                    blob->fw->size);
                goto fail_fw_integrity;
        }
@@ -5697,7 +5697,7 @@ qla2x00_load_risc(scsi_qla_host_t *vha, uint32_t *srisc_addr)
                if (blob->fw->size < fwclen) {
                        ql_log(ql_log_fatal, vha, 0x0088,
                            "Unable to verify integrity of firmware image "
-                           "(%Zd).\n", blob->fw->size);
+                           "(%zd).\n", blob->fw->size);
                        goto fail_fw_integrity;
                }
 
@@ -5778,7 +5778,7 @@ qla24xx_load_risc_blob(scsi_qla_host_t *vha, uint32_t *srisc_addr)
        /* Validate firmware image by checking version. */
        if (blob->fw->size < 8 * sizeof(uint32_t)) {
                ql_log(ql_log_fatal, vha, 0x0093,
-                   "Unable to verify integrity of firmware image (%Zd).\n",
+                   "Unable to verify integrity of firmware image (%zd).\n",
                    blob->fw->size);
                return QLA_FUNCTION_FAILED;
        }
@@ -5789,7 +5789,7 @@ qla24xx_load_risc_blob(scsi_qla_host_t *vha, uint32_t *srisc_addr)
            (dcode[0] == 0 && dcode[1] == 0 && dcode[2] == 0 &&
                dcode[3] == 0)) {
                ql_log(ql_log_fatal, vha, 0x0094,
-                   "Unable to verify integrity of firmware image (%Zd).\n",
+                   "Unable to verify integrity of firmware image (%zd).\n",
                    blob->fw->size);
                ql_log(ql_log_fatal, vha, 0x0095,
                    "Firmware data: %08x %08x %08x %08x.\n",
@@ -5807,7 +5807,7 @@ qla24xx_load_risc_blob(scsi_qla_host_t *vha, uint32_t *srisc_addr)
                if (blob->fw->size < fwclen) {
                        ql_log(ql_log_fatal, vha, 0x0096,
                            "Unable to verify integrity of firmware image "
-                           "(%Zd).\n", blob->fw->size);
+                           "(%zd).\n", blob->fw->size);
                        return QLA_FUNCTION_FAILED;
                }
 
index f945351..cdbb293 100644 (file)
@@ -1475,7 +1475,7 @@ static void sas_end_device_release(struct device *dev)
 }
 
 /**
- * sas_rphy_initialize - common rphy intialization
+ * sas_rphy_initialize - common rphy initialization
  * @rphy:      rphy to initialise
  *
  * Used by both sas_end_device_alloc() and sas_expander_alloc() to
index 585e54f..638e5f4 100644 (file)
@@ -280,7 +280,7 @@ static const struct vmstor_protocol vmstor_protocols[] = {
 
 
 /*
- * This structure is sent during the intialization phase to get the different
+ * This structure is sent during the initialization phase to get the different
  * properties of the channel.
  */
 
index 7b8cc3a..cd1eb2c 100644 (file)
@@ -39,7 +39,7 @@ struct fpgaimage {
        const struct    firmware        *fw_entry;
 
        /*
-        * the followings can be read from bitstream,
+        * the following can be read from bitstream,
         * but other image format should have as well
         */
        char    filename[MAX_STR];
index b0eb80d..60b827e 100644 (file)
@@ -1704,7 +1704,7 @@ struct ost_lvb {
  *   lquota data structures
  */
 
-/* The lquota_id structure is an union of all the possible identifier types that
+/* The lquota_id structure is a union of all the possible identifier types that
  * can be used with quota, this includes:
  * - 64-bit user ID
  * - 64-bit group ID
index 0d24705..0971470 100644 (file)
@@ -1953,7 +1953,7 @@ struct ieee80211_device {
 
        /* ask the driver to retune the radio.
         * This function can sleep. The driver should ensure
-        * the radio has been swithced before return.
+        * the radio has been switched before return.
         */
        void (*set_chan)(struct net_device *dev, short ch);
 
@@ -1964,7 +1964,7 @@ struct ieee80211_device {
         * The syncro version is similar to the start_scan but
         * does not return until all channels have been scanned.
         * This is called in user context and should sleep;
-        * it is called in a work_queue when swithcing to ad-hoc mode
+        * it is called in a work_queue when switching to ad-hoc mode
         * or on behalf of an iwlist scan when the card is associated
         * and the root user asks for a scan.
         * the function stop_scan should stop both the syncro and
index 1bff0e9..0ea90aa 100644 (file)
@@ -2364,7 +2364,7 @@ static void ieee80211_start_ibss_wq(struct work_struct *work)
 //     if((IS_DOT11D_ENABLE(ieee)) && (ieee->state == IEEE80211_NOLINK))
        if (ieee->state == IEEE80211_NOLINK)
                ieee->current_network.channel = 6;
-       /* if not then the state is not linked. Maybe the user swithced to
+       /* if not then the state is not linked. Maybe the user switched to
         * ad-hoc mode just after being in monitor mode, or just after
         * being in managed mode only briefly (so the card has had no
         * time to scan all the chans..) or we have just run up the iface
index 1dc8627..cb0b7ca 100644 (file)
@@ -1875,8 +1875,8 @@ vchiq_arm_init_state(VCHIQ_STATE_T *state, VCHIQ_ARM_STATE_T *arm_state)
 **
 ** VC_RESUME_IDLE - Initialise the resume completion at the same time.  The
 **                     resume completion is in its 'done' state whenever
-**                     videcore is running.  Therfore, the VC_RESUME_IDLE state
-**                     implies that videocore is suspended.
+**                     videocore is running.  Therefore, the VC_RESUME_IDLE
+**                     state implies that videocore is suspended.
 **                     Hence, any thread which needs to wait until videocore is
 **                     running can wait on this completion - it will only block
 **                     if videocore is suspended.
index 9ab4393..2eebc62 100644 (file)
@@ -213,7 +213,7 @@ static void deinit_irq(struct net_device *dev)
        vif = netdev_priv(dev);
        wilc = vif->wilc;
 
-       /* Deintialize IRQ */
+       /* Deinitialize IRQ */
        if (wilc->dev_irq_num) {
                free_irq(wilc->dev_irq_num, wilc);
                gpio_free(wilc->gpio);
index f7ce47c..7961d1c 100644 (file)
@@ -2357,7 +2357,7 @@ int wilc_deinit_host_int(struct net_device *net)
                del_timer_sync(&wilc_during_ip_timer);
 
        if (s32Error)
-               netdev_err(net, "Error while deintializing host interface\n");
+               netdev_err(net, "Error while deinitializing host interface\n");
 
        return s32Error;
 }
index 5c1cb2d..c3adefe 100644 (file)
@@ -642,9 +642,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
                WARN_ON(tcmu_hdr_get_op(entry->hdr.len_op) != TCMU_OP_CMD);
 
                spin_lock(&udev->commands_lock);
-               cmd = idr_find(&udev->commands, entry->hdr.cmd_id);
-               if (cmd)
-                       idr_remove(&udev->commands, cmd->cmd_id);
+               cmd = idr_remove(&udev->commands, entry->hdr.cmd_id);
                spin_unlock(&udev->commands_lock);
 
                if (!cmd) {
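
The tcmu hunk above relies on idr_remove() returning the pointer that was stored at the given id (or NULL if the id was vacant), a then-recent IDR API change, so the old idr_find() plus idr_remove() pair collapses into a single call under the lock:

        spin_lock(&udev->commands_lock);
        cmd = idr_remove(&udev->commands, entry->hdr.cmd_id);   /* NULL if absent */
        spin_unlock(&udev->commands_lock);
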
index eb27883..1bacbc3 100644 (file)
@@ -667,7 +667,7 @@ static ssize_t n_hdlc_tty_write(struct tty_struct *tty, struct file *file,
        struct n_hdlc_buf *tbuf;
 
        if (debuglevel >= DEBUG_LEVEL_INFO)     
-               printk("%s(%d)n_hdlc_tty_write() called count=%Zd\n",
+               printk("%s(%d)n_hdlc_tty_write() called count=%zd\n",
                        __FILE__,__LINE__,count);
                
        /* Verify pointers */
index 6ad26f8..f96bcf9 100644 (file)
 #define IOC4_SSCR_PAUSE_STATE   0x40000000  /* Sets when PAUSE takes effect */
 #define IOC4_SSCR_RESET                0x80000000  /* Reset DMA channels */
 
-/* All producer/comsumer pointers are the same bitfield */
+/* All producer/consumer pointers are the same bitfield */
 #define IOC4_PROD_CONS_PTR_4K   0x00000ff8     /* For 4K buffers */
 #define IOC4_PROD_CONS_PTR_1K   0x000003f8     /* For 1K buffers */
 #define IOC4_PROD_CONS_PTR_OFF           3
index 52747b6..ca425e8 100644 (file)
@@ -2335,7 +2335,7 @@ static int proc_drop_privileges(struct usb_dev_state *ps, void __user *arg)
        if (copy_from_user(&data, arg, sizeof(data)))
                return -EFAULT;
 
-       /* This is an one way operation. Once privileges are
+       /* This is a one way operation. Once privileges are
         * dropped, you cannot regain them. You may however reissue
         * this ioctl to shrink the allowed interfaces mask.
         */
index 6bde439..a2615d6 100644 (file)
@@ -1848,7 +1848,7 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr)
 
 fail:
        spin_unlock_irq (&dev->lock);
-       pr_debug ("%s: %s fail %Zd, %p\n", shortname, __func__, value, dev);
+       pr_debug ("%s: %s fail %zd, %p\n", shortname, __func__, value, dev);
        kfree (dev->buf);
        dev->buf = NULL;
        return value;
index 2e41ef3..b76fcdb 100644 (file)
@@ -520,7 +520,7 @@ static void struct_ep_qh_setup(struct fsl_udc *udc, unsigned char ep_num,
 /* Setup qh structure and ep register for ep0. */
 static void ep0_setup(struct fsl_udc *udc)
 {
-       /* the intialization of an ep includes: fields in QH, Regs,
+       /* the initialization of an ep includes: fields in QH, Regs,
         * fsl_ep struct */
        struct_ep_qh_setup(udc, 0, USB_RECV, USB_ENDPOINT_XFER_CONTROL,
                        USB_MAX_CTRL_PAYLOAD, 0, 0);
@@ -2349,7 +2349,7 @@ static int struct_ep_setup(struct fsl_udc *udc, unsigned char index,
 }
 
 /* Driver probe function
- * all intialization operations implemented here except enabling usb_intr reg
+ * all initialization operations implemented here except enabling usb_intr reg
  * board setup should have been done in the platform code
  */
 static int fsl_udc_probe(struct platform_device *pdev)
index fb8fc34..2218f91 100644 (file)
@@ -1791,7 +1791,7 @@ static int renesas_usb3_init_ep(struct renesas_usb3 *usb3, struct device *dev,
 
        dev_dbg(dev, "%s: num_usb3_eps = %d\n", __func__, usb3->num_usb3_eps);
        /*
-        * This driver prepares pipes as the followings:
+        * This driver prepares pipes as follows:
         *  - odd pipes = IN pipe
         *  - even pipes = OUT pipe (except pipe 0)
         */
@@ -1841,7 +1841,7 @@ static void renesas_usb3_init_ram(struct renesas_usb3 *usb3, struct device *dev,
        memset(basead, 0, sizeof(basead));
 
        /*
-        * This driver prepares pipes as the followings:
+        * This driver prepares pipes as follows:
         *  - all pipes = the same size as "ramsize_per_pipe"
         * Please refer to the "Method of Specifying RAM Mapping"
         */
index 0630648..ac2c4ea 100644 (file)
@@ -1322,7 +1322,7 @@ static int __init ehci_hcd_init(void)
                printk(KERN_WARNING "Warning! ehci_hcd should always be loaded"
                                " before uhci_hcd and ohci_hcd, not after\n");
 
-       pr_debug("%s: block sizes: qh %Zd qtd %Zd itd %Zd sitd %Zd\n",
+       pr_debug("%s: block sizes: qh %zd qtd %zd itd %zd sitd %zd\n",
                 hcd_name,
                 sizeof(struct ehci_qh), sizeof(struct ehci_qtd),
                 sizeof(struct ehci_itd), sizeof(struct ehci_sitd));
index 9d0b051..1c5b34b 100644 (file)
@@ -5697,7 +5697,7 @@ static int __init fotg210_hcd_init(void)
                        test_bit(USB_OHCI_LOADED, &usb_hcds_loaded))
                pr_warn("Warning! fotg210_hcd should always be loaded before uhci_hcd and ohci_hcd, not after\n");
 
-       pr_debug("%s: block sizes: qh %Zd qtd %Zd itd %Zd\n",
+       pr_debug("%s: block sizes: qh %zd qtd %zd itd %zd\n",
                        hcd_name, sizeof(struct fotg210_qh),
                        sizeof(struct fotg210_qtd),
                        sizeof(struct fotg210_itd));
index 8685cf3..b6daf2e 100644 (file)
@@ -1252,7 +1252,7 @@ static int __init ohci_hcd_mod_init(void)
                return -ENODEV;
 
        printk(KERN_INFO "%s: " DRIVER_DESC "\n", hcd_name);
-       pr_debug ("%s: block sizes: ed %Zd td %Zd\n", hcd_name,
+       pr_debug ("%s: block sizes: ed %zd td %zd\n", hcd_name,
                sizeof (struct ed), sizeof (struct td));
        set_bit(USB_OHCI_LOADED, &usb_hcds_loaded);
 
index a540e4f..c5fa584 100644 (file)
@@ -563,20 +563,20 @@ static ssize_t adu_write(struct file *file, const __user char *buffer,
                        }
 
                        dev_dbg(&dev->udev->dev,
-                               "%s : in progress, count = %Zd\n",
+                               "%s : in progress, count = %zd\n",
                                __func__, count);
                } else {
                        spin_unlock_irqrestore(&dev->buflock, flags);
                        set_current_state(TASK_RUNNING);
                        remove_wait_queue(&dev->write_wait, &waita);
-                       dev_dbg(&dev->udev->dev, "%s : sending, count = %Zd\n",
+                       dev_dbg(&dev->udev->dev, "%s : sending, count = %zd\n",
                                __func__, count);
 
                        /* write the data into interrupt_out_buffer from userspace */
                        buffer_size = usb_endpoint_maxp(dev->interrupt_out_endpoint);
                        bytes_to_write = count > buffer_size ? buffer_size : count;
                        dev_dbg(&dev->udev->dev,
-                               "%s : buffer_size = %Zd, count = %Zd, bytes_to_write = %Zd\n",
+                               "%s : buffer_size = %zd, count = %zd, bytes_to_write = %zd\n",
                                __func__, buffer_size, count, bytes_to_write);
 
                        if (copy_from_user(dev->interrupt_out_buffer, buffer, bytes_to_write) != 0) {
index b10e26c..322a042 100644 (file)
@@ -673,7 +673,7 @@ static ssize_t tower_write (struct file *file, const char __user *buffer, size_t
 
        /* write the data into interrupt_out_buffer from userspace */
        bytes_to_write = min_t(int, count, write_buffer_size);
-       dev_dbg(&dev->udev->dev, "%s: count = %Zd, bytes_to_write = %Zd\n",
+       dev_dbg(&dev->udev->dev, "%s: count = %zd, bytes_to_write = %zd\n",
                __func__, count, bytes_to_write);
 
        if (copy_from_user (dev->interrupt_out_buffer, buffer, bytes_to_write)) {
index 356d312..0a643fa 100644 (file)
@@ -526,7 +526,7 @@ static size_t parport_uss720_epp_write_data(struct parport *pp, const void *buf,
                return 0;
        i = usb_bulk_msg(usbdev, usb_sndbulkpipe(usbdev, 1), (void *)buf, length, &rlen, 20000);
        if (i)
-               printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %Zu rlen %u\n", buf, length, rlen);
+               printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %zu rlen %u\n", buf, length, rlen);
        change_mode(pp, ECR_PS2);
        return rlen;
 #endif
@@ -587,7 +587,7 @@ static size_t parport_uss720_ecp_write_data(struct parport *pp, const void *buff
                return 0;
        i = usb_bulk_msg(usbdev, usb_sndbulkpipe(usbdev, 1), (void *)buffer, len, &rlen, 20000);
        if (i)
-               printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %Zu rlen %u\n", buffer, len, rlen);
+               printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %zu rlen %u\n", buffer, len, rlen);
        change_mode(pp, ECR_PS2);
        return rlen;
 }
@@ -605,7 +605,7 @@ static size_t parport_uss720_ecp_read_data(struct parport *pp, void *buffer, siz
                return 0;
        i = usb_bulk_msg(usbdev, usb_rcvbulkpipe(usbdev, 2), buffer, len, &rlen, 20000);
        if (i)
-               printk(KERN_ERR "uss720: recvbulk ep 2 buf %p len %Zu rlen %u\n", buffer, len, rlen);
+               printk(KERN_ERR "uss720: recvbulk ep 2 buf %p len %zu rlen %u\n", buffer, len, rlen);
        change_mode(pp, ECR_PS2);
        return rlen;
 }
@@ -638,7 +638,7 @@ static size_t parport_uss720_write_compat(struct parport *pp, const void *buffer
                return 0;
        i = usb_bulk_msg(usbdev, usb_sndbulkpipe(usbdev, 1), (void *)buffer, len, &rlen, 20000);
        if (i)
-               printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %Zu rlen %u\n", buffer, len, rlen);
+               printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %zu rlen %u\n", buffer, len, rlen);
        change_mode(pp, ECR_PS2);
        return rlen;
 }
index 8b23229..1a6f78d 100644 (file)
@@ -707,7 +707,7 @@ void usbip_pad_iso(struct usbip_device *ud, struct urb *urb)
                return;
 
        /*
-        * loop over all packets from last to first (to prevent overwritting
+        * loop over all packets from last to first (to prevent overwriting
         * memory when padding) and move them into the proper place
         */
        for (i = np-1; i > 0; i--) {
index 278b421..dd823f5 100644 (file)
@@ -646,7 +646,7 @@ void radeon_probe_screens(struct radeonfb_info *rinfo,
 
 
 /*
- * This functions applyes any arch/model/machine specific fixups
+ * This function applies any arch/model/machine specific fixups
  * to the panel info. It may eventually alter EDID block as
  * well or whatever is specific to a given model and not probed
  * properly by the default code
index abb6bbf..9085e95 100644 (file)
@@ -187,7 +187,7 @@ static int load_waveform(u8 *mem, size_t size, int m, int t,
                epd_frame_table[par->dt].wfm_size = user_wfm_size;
 
        if (size != epd_frame_table[par->dt].wfm_size) {
-               dev_err(dev, "Error: unexpected size %Zd != %d\n", size,
+               dev_err(dev, "Error: unexpected size %zd != %d\n", size,
                                        epd_frame_table[par->dt].wfm_size);
                return -EINVAL;
        }
index 496f6c1..b339e0e 100644 (file)
@@ -36,9 +36,9 @@
 #define PM_RSTC_RESET                  0x00000102
 
 /*
- * The Raspberry Pi firmware uses the RSTS register to know which partiton
- * to boot from. The partiton value is spread into bits 0, 2, 4, 6, 8, 10.
- * Partiton 63 is a special partition used by the firmware to indicate halt.
+ * The Raspberry Pi firmware uses the RSTS register to know which partition
+ * to boot from. The partition value is spread into bits 0, 2, 4, 6, 8, 10.
+ * Partition 63 is a special partition used by the firmware to indicate halt.
  */
 #define PM_RSTS_RASPBERRYPI_HALT       0x555
 
index 2f08877..2f8bab3 100644 (file)
@@ -138,9 +138,9 @@ extern int  affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh);
 extern int     affs_remove_header(struct dentry *dentry);
 extern u32     affs_checksum_block(struct super_block *sb, struct buffer_head *bh);
 extern void    affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
-extern void    secs_to_datestamp(time64_t secs, struct affs_date *ds);
-extern umode_t prot_to_mode(u32 prot);
-extern void    mode_to_prot(struct inode *inode);
+extern void    affs_secs_to_datestamp(time64_t secs, struct affs_date *ds);
+extern umode_t affs_prot_to_mode(u32 prot);
+extern void    affs_mode_to_prot(struct inode *inode);
 __printf(3, 4)
 extern void    affs_error(struct super_block *sb, const char *function,
                           const char *fmt, ...);
@@ -162,6 +162,7 @@ extern void affs_free_bitmap(struct super_block *sb);
 
 /* namei.c */
 
+extern const struct export_operations affs_export_ops;
 extern int     affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len);
 extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int);
 extern int     affs_unlink(struct inode *dir, struct dentry *dentry);
@@ -178,7 +179,6 @@ extern int  affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 /* inode.c */
 
-extern unsigned long            affs_parent_ino(struct inode *dir);
 extern struct inode            *affs_new_inode(struct inode *dir);
 extern int                      affs_notify_change(struct dentry *dentry, struct iattr *attr);
 extern void                     affs_evict_inode(struct inode *inode);
@@ -213,6 +213,12 @@ extern const struct address_space_operations        affs_aops_ofs;
 extern const struct dentry_operations   affs_dentry_operations;
 extern const struct dentry_operations   affs_intl_dentry_operations;
 
+static inline bool affs_validblock(struct super_block *sb, int block)
+{
+       return (block >= AFFS_SB(sb)->s_reserved &&
+               block < AFFS_SB(sb)->s_partition_size);
+}
+
 static inline void
 affs_set_blocksize(struct super_block *sb, int size)
 {
@@ -222,7 +228,7 @@ static inline struct buffer_head *
 affs_bread(struct super_block *sb, int block)
 {
        pr_debug("%s: %d\n", __func__, block);
-       if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
+       if (affs_validblock(sb, block))
                return sb_bread(sb, block);
        return NULL;
 }
@@ -230,7 +236,7 @@ static inline struct buffer_head *
 affs_getblk(struct super_block *sb, int block)
 {
        pr_debug("%s: %d\n", __func__, block);
-       if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
+       if (affs_validblock(sb, block))
                return sb_getblk(sb, block);
        return NULL;
 }
@@ -239,7 +245,7 @@ affs_getzeroblk(struct super_block *sb, int block)
 {
        struct buffer_head *bh;
        pr_debug("%s: %d\n", __func__, block);
-       if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
+       if (affs_validblock(sb, block)) {
                bh = sb_getblk(sb, block);
                lock_buffer(bh);
                memset(bh->b_data, 0 , sb->s_blocksize);
@@ -254,7 +260,7 @@ affs_getemptyblk(struct super_block *sb, int block)
 {
        struct buffer_head *bh;
        pr_debug("%s: %d\n", __func__, block);
-       if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
+       if (affs_validblock(sb, block)) {
                bh = sb_getblk(sb, block);
                wait_on_buffer(bh);
                set_buffer_uptodate(bh);
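
A quick worked example of the new bounds helper, with hypothetical superblock values:

        /* with AFFS_SB(sb)->s_reserved == 2 and s_partition_size == 1000,
         * affs_validblock(sb, b) is true exactly for b in [2, 999] */
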
index 0ec65c1..b573c3b 100644 (file)
@@ -367,7 +367,7 @@ affs_fix_checksum(struct super_block *sb, struct buffer_head *bh)
 }
 
 void
-secs_to_datestamp(time64_t secs, struct affs_date *ds)
+affs_secs_to_datestamp(time64_t secs, struct affs_date *ds)
 {
        u32      days;
        u32      minute;
@@ -386,55 +386,55 @@ secs_to_datestamp(time64_t secs, struct affs_date *ds)
 }
 
 umode_t
-prot_to_mode(u32 prot)
+affs_prot_to_mode(u32 prot)
 {
        umode_t mode = 0;
 
        if (!(prot & FIBF_NOWRITE))
-               mode |= S_IWUSR;
+               mode |= 0200;
        if (!(prot & FIBF_NOREAD))
-               mode |= S_IRUSR;
+               mode |= 0400;
        if (!(prot & FIBF_NOEXECUTE))
-               mode |= S_IXUSR;
+               mode |= 0100;
        if (prot & FIBF_GRP_WRITE)
-               mode |= S_IWGRP;
+               mode |= 0020;
        if (prot & FIBF_GRP_READ)
-               mode |= S_IRGRP;
+               mode |= 0040;
        if (prot & FIBF_GRP_EXECUTE)
-               mode |= S_IXGRP;
+               mode |= 0010;
        if (prot & FIBF_OTR_WRITE)
-               mode |= S_IWOTH;
+               mode |= 0002;
        if (prot & FIBF_OTR_READ)
-               mode |= S_IROTH;
+               mode |= 0004;
        if (prot & FIBF_OTR_EXECUTE)
-               mode |= S_IXOTH;
+               mode |= 0001;
 
        return mode;
 }
 
 void
-mode_to_prot(struct inode *inode)
+affs_mode_to_prot(struct inode *inode)
 {
        u32 prot = AFFS_I(inode)->i_protect;
        umode_t mode = inode->i_mode;
 
-       if (!(mode & S_IXUSR))
+       if (!(mode & 0100))
                prot |= FIBF_NOEXECUTE;
-       if (!(mode & S_IRUSR))
+       if (!(mode & 0400))
                prot |= FIBF_NOREAD;
-       if (!(mode & S_IWUSR))
+       if (!(mode & 0200))
                prot |= FIBF_NOWRITE;
-       if (mode & S_IXGRP)
+       if (mode & 0010)
                prot |= FIBF_GRP_EXECUTE;
-       if (mode & S_IRGRP)
+       if (mode & 0040)
                prot |= FIBF_GRP_READ;
-       if (mode & S_IWGRP)
+       if (mode & 0020)
                prot |= FIBF_GRP_WRITE;
-       if (mode & S_IXOTH)
+       if (mode & 0001)
                prot |= FIBF_OTR_EXECUTE;
-       if (mode & S_IROTH)
+       if (mode & 0004)
                prot |= FIBF_OTR_READ;
-       if (mode & S_IWOTH)
+       if (mode & 0002)
                prot |= FIBF_OTR_WRITE;
 
        AFFS_I(inode)->i_protect = prot;
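
The mapping is easiest to sanity-check with a worked example: the owner triplet is stored inverted (a set FIBF_NO* flag removes a permission) while the group and other flags are positive. For a hypothetical prot = FIBF_NOWRITE | FIBF_GRP_READ:

        /* owner: NOWRITE set -> drop 0200; NOREAD/NOEXECUTE clear -> 0400|0100
         * group: GRP_READ set -> 0040; other: nothing set -> 0
         * so affs_prot_to_mode(prot) == 0540 (r-xr-----), and
         * affs_mode_to_prot() on an 0540 inode yields the same two flags */
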
index fe4e129..a5e6097 100644 (file)
@@ -69,7 +69,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
        if (affs_test_opt(sbi->s_flags, SF_SETMODE))
                inode->i_mode = sbi->s_mode;
        else
-               inode->i_mode = prot_to_mode(prot);
+               inode->i_mode = affs_prot_to_mode(prot);
 
        id = be16_to_cpu(tail->uid);
        if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETUID))
@@ -184,11 +184,12 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
        }
        tail = AFFS_TAIL(sb, bh);
        if (tail->stype == cpu_to_be32(ST_ROOT)) {
-               secs_to_datestamp(inode->i_mtime.tv_sec,&AFFS_ROOT_TAIL(sb, bh)->root_change);
+               affs_secs_to_datestamp(inode->i_mtime.tv_sec,
+                                      &AFFS_ROOT_TAIL(sb, bh)->root_change);
        } else {
                tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect);
                tail->size = cpu_to_be32(inode->i_size);
-               secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change);
+               affs_secs_to_datestamp(inode->i_mtime.tv_sec, &tail->change);
                if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
                        uid = i_uid_read(inode);
                        gid = i_gid_read(inode);
@@ -249,7 +250,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
        mark_inode_dirty(inode);
 
        if (attr->ia_valid & ATTR_MODE)
-               mode_to_prot(inode);
+               affs_mode_to_prot(inode);
 out:
        return error;
 }
index 29186d2..96dd1d0 100644 (file)
@@ -9,29 +9,10 @@
  */
 
 #include "affs.h"
+#include <linux/exportfs.h>
 
 typedef int (*toupper_t)(int);
 
-static int      affs_toupper(int ch);
-static int      affs_hash_dentry(const struct dentry *, struct qstr *);
-static int       affs_compare_dentry(const struct dentry *dentry,
-               unsigned int len, const char *str, const struct qstr *name);
-static int      affs_intl_toupper(int ch);
-static int      affs_intl_hash_dentry(const struct dentry *, struct qstr *);
-static int       affs_intl_compare_dentry(const struct dentry *dentry,
-               unsigned int len, const char *str, const struct qstr *name);
-
-const struct dentry_operations affs_dentry_operations = {
-       .d_hash         = affs_hash_dentry,
-       .d_compare      = affs_compare_dentry,
-};
-
-const struct dentry_operations affs_intl_dentry_operations = {
-       .d_hash         = affs_intl_hash_dentry,
-       .d_compare      = affs_intl_compare_dentry,
-};
-
-
 /* Simple toupper() for DOS\1 */
 
 static int
@@ -271,7 +252,7 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
                return -ENOSPC;
 
        inode->i_mode = mode;
-       mode_to_prot(inode);
+       affs_mode_to_prot(inode);
        mark_inode_dirty(inode);
 
        inode->i_op = &affs_file_inode_operations;
@@ -301,7 +282,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
                return -ENOSPC;
 
        inode->i_mode = S_IFDIR | mode;
-       mode_to_prot(inode);
+       affs_mode_to_prot(inode);
 
        inode->i_op = &affs_dir_inode_operations;
        inode->i_fop = &affs_dir_operations;
@@ -347,7 +328,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
        inode_nohighmem(inode);
        inode->i_data.a_ops = &affs_symlink_aops;
        inode->i_mode = S_IFLNK | 0777;
-       mode_to_prot(inode);
+       affs_mode_to_prot(inode);
 
        error = -EIO;
        bh = affs_bread(sb, inode->i_ino);
@@ -465,3 +446,71 @@ done:
        affs_brelse(bh);
        return retval;
 }
+
+static struct dentry *affs_get_parent(struct dentry *child)
+{
+       struct inode *parent;
+       struct buffer_head *bh;
+
+       bh = affs_bread(child->d_sb, d_inode(child)->i_ino);
+       if (!bh)
+               return ERR_PTR(-EIO);
+
+       parent = affs_iget(child->d_sb,
+                          be32_to_cpu(AFFS_TAIL(child->d_sb, bh)->parent));
+       brelse(bh);
+       if (IS_ERR(parent))
+               return ERR_CAST(parent);
+
+       return d_obtain_alias(parent);
+}
+
+static struct inode *affs_nfs_get_inode(struct super_block *sb, u64 ino,
+                                       u32 generation)
+{
+       struct inode *inode;
+
+       if (!affs_validblock(sb, ino))
+               return ERR_PTR(-ESTALE);
+
+       inode = affs_iget(sb, ino);
+       if (IS_ERR(inode))
+               return ERR_CAST(inode);
+
+       if (generation && inode->i_generation != generation) {
+               iput(inode);
+               return ERR_PTR(-ESTALE);
+       }
+
+       return inode;
+}
+
+static struct dentry *affs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                                       int fh_len, int fh_type)
+{
+       return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+                                   affs_nfs_get_inode);
+}
+
+static struct dentry *affs_fh_to_parent(struct super_block *sb, struct fid *fid,
+                                       int fh_len, int fh_type)
+{
+       return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+                                   affs_nfs_get_inode);
+}
+
+const struct export_operations affs_export_ops = {
+       .fh_to_dentry = affs_fh_to_dentry,
+       .fh_to_parent = affs_fh_to_parent,
+       .get_parent = affs_get_parent,
+};
+
+const struct dentry_operations affs_dentry_operations = {
+       .d_hash         = affs_hash_dentry,
+       .d_compare      = affs_compare_dentry,
+};
+
+const struct dentry_operations affs_intl_dentry_operations = {
+       .d_hash         = affs_intl_hash_dentry,
+       .d_compare      = affs_intl_compare_dentry,
+};
index d638486..3753253 100644 (file)
@@ -32,7 +32,7 @@ affs_commit_super(struct super_block *sb, int wait)
        struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
 
        lock_buffer(bh);
-       secs_to_datestamp(ktime_get_real_seconds(), &tail->disk_change);
+       affs_secs_to_datestamp(ktime_get_real_seconds(), &tail->disk_change);
        affs_fix_checksum(sb, bh);
        unlock_buffer(bh);
 
@@ -507,6 +507,7 @@ got_root:
                return -ENOMEM;
        }
 
+       sb->s_export_op = &affs_export_ops;
        pr_debug("s_flags=%lX\n", sb->s_flags);
        return 0;
 }
index 51a241e..949f960 100644 (file)
@@ -252,7 +252,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
                /* skip entries marked unused in the bitmap */
                if (!(block->pagehdr.bitmap[offset / 8] &
                      (1 << (offset % 8)))) {
-                       _debug("ENT[%Zu.%u]: unused",
+                       _debug("ENT[%zu.%u]: unused",
                               blkoff / sizeof(union afs_dir_block), offset);
                        if (offset >= curr)
                                ctx->pos = blkoff +
@@ -266,7 +266,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
                               sizeof(*block) -
                               offset * sizeof(union afs_dirent));
 
-               _debug("ENT[%Zu.%u]: %s %Zu \"%s\"",
+               _debug("ENT[%zu.%u]: %s %zu \"%s\"",
                       blkoff / sizeof(union afs_dir_block), offset,
                       (offset < curr ? "skip" : "fill"),
                       nlen, dire->u.name);
@@ -274,23 +274,23 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
                /* work out where the next possible entry is */
                for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) {
                        if (next >= AFS_DIRENT_PER_BLOCK) {
-                               _debug("ENT[%Zu.%u]:"
+                               _debug("ENT[%zu.%u]:"
                                       " %u travelled beyond end dir block"
-                                      " (len %u/%Zu)",
+                                      " (len %u/%zu)",
                                       blkoff / sizeof(union afs_dir_block),
                                       offset, next, tmp, nlen);
                                return -EIO;
                        }
                        if (!(block->pagehdr.bitmap[next / 8] &
                              (1 << (next % 8)))) {
-                               _debug("ENT[%Zu.%u]:"
-                                      " %u unmarked extension (len %u/%Zu)",
+                               _debug("ENT[%zu.%u]:"
+                                      " %u unmarked extension (len %u/%zu)",
                                       blkoff / sizeof(union afs_dir_block),
                                       offset, next, tmp, nlen);
                                return -EIO;
                        }
 
-                       _debug("ENT[%Zu.%u]: ext %u/%Zu",
+                       _debug("ENT[%zu.%u]: ext %u/%zu",
                               blkoff / sizeof(union afs_dir_block),
                               next, tmp, nlen);
                        next++;
index 6f48d67..806df74 100644 (file)
@@ -38,8 +38,6 @@
  * which have been left busy at at service shutdown.
  */
 
-#define AUTOFS_DEV_IOCTL_SIZE  sizeof(struct autofs_dev_ioctl)
-
 typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *,
                        struct autofs_dev_ioctl *);
 
index 82e8f6e..d79ced9 100644 (file)
@@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk)
                pr_debug("waiting for mount name=%pd\n", path->dentry);
                status = autofs4_wait(sbi, path, NFY_MOUNT);
                pr_debug("mount wait done status=%d\n", status);
+               ino->last_used = jiffies;
        }
-       ino->last_used = jiffies;
        return status;
 }
 
@@ -321,16 +321,21 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
         */
        if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
                struct dentry *parent = dentry->d_parent;
-               struct autofs_info *ino;
                struct dentry *new;
 
                new = d_lookup(parent, &dentry->d_name);
                if (!new)
                        return NULL;
-               ino = autofs4_dentry_ino(new);
-               ino->last_used = jiffies;
-               dput(path->dentry);
-               path->dentry = new;
+               if (new == dentry)
+                       dput(new);
+               else {
+                       struct autofs_info *ino;
+
+                       ino = autofs4_dentry_ino(new);
+                       ino->last_used = jiffies;
+                       dput(path->dentry);
+                       path->dentry = new;
+               }
        }
        return path->dentry;
 }
index 1c62845..77c30f1 100644 (file)
@@ -989,7 +989,7 @@ struct block_device *bdget(dev_t dev)
                bdev->bd_super = NULL;
                bdev->bd_inode = inode;
                bdev->bd_bdi = &noop_backing_dev_info;
-               bdev->bd_block_size = (1 << inode->i_blkbits);
+               bdev->bd_block_size = i_blocksize(inode);
                bdev->bd_part_count = 0;
                bdev->bd_invalidated = 0;
                inode->i_mode = S_IFBLK;
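
This hunk, like the btrfs, buffer.c and ceph hunks that follow, replaces the open-coded `1 << inode->i_blkbits` with the i_blocksize() helper. The helper itself is introduced elsewhere in this series; presumably it is the trivial wrapper in include/linux/fs.h:

static inline unsigned int i_blocksize(const struct inode *inode)
{
	return 1 << inode->i_blkbits;
}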
index 18e5146..c1d2a07 100644 (file)
@@ -2875,7 +2875,7 @@ static long btrfs_fallocate(struct file *file, int mode,
                if (!ret)
                        ret = btrfs_prealloc_file_range(inode, mode,
                                        range->start,
-                                       range->len, 1 << inode->i_blkbits,
+                                       range->len, i_blocksize(inode),
                                        offset + len, &alloc_hint);
                else
                        btrfs_free_reserved_data_space(inode, range->start,
index 0e87401..28484b3 100644 (file)
@@ -2395,7 +2395,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
                            loff_t pos, loff_t *bytes)
 {
        struct inode *inode = mapping->host;
-       unsigned blocksize = 1 << inode->i_blkbits;
+       unsigned int blocksize = i_blocksize(inode);
        struct page *page;
        void *fsdata;
        pgoff_t index, curidx;
@@ -2475,8 +2475,8 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
                        get_block_t *get_block, loff_t *bytes)
 {
        struct inode *inode = mapping->host;
-       unsigned blocksize = 1 << inode->i_blkbits;
-       unsigned zerofrom;
+       unsigned int blocksize = i_blocksize(inode);
+       unsigned int zerofrom;
        int err;
 
        err = cont_expand_zero(file, mapping, pos, bytes);
@@ -2838,7 +2838,7 @@ int nobh_truncate_page(struct address_space *mapping,
        struct buffer_head map_bh;
        int err;
 
-       blocksize = 1 << inode->i_blkbits;
+       blocksize = i_blocksize(inode);
        length = offset & (blocksize - 1);
 
        /* Block boundary? Nothing to do */
@@ -2916,7 +2916,7 @@ int block_truncate_page(struct address_space *mapping,
        struct buffer_head *bh;
        int err;
 
-       blocksize = 1 << inode->i_blkbits;
+       blocksize = i_blocksize(inode);
        length = offset & (blocksize - 1);
 
        /* Block boundary? Nothing to do */
@@ -3028,7 +3028,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
        struct inode *inode = mapping->host;
        tmp.b_state = 0;
        tmp.b_blocknr = 0;
-       tmp.b_size = 1 << inode->i_blkbits;
+       tmp.b_size = i_blocksize(inode);
        get_block(inode, block, &tmp, 0);
        return tmp.b_blocknr;
 }
index 09860c0..f297a9e 100644 (file)
@@ -391,6 +391,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
                        nr_pages = i;
                        if (nr_pages > 0) {
                                len = nr_pages << PAGE_SHIFT;
+                               osd_req_op_extent_update(req, 0, len);
                                break;
                        }
                        goto out_pages;
@@ -751,7 +752,7 @@ static int ceph_writepages_start(struct address_space *mapping,
        struct pagevec pvec;
        int done = 0;
        int rc = 0;
-       unsigned wsize = 1 << inode->i_blkbits;
+       unsigned int wsize = i_blocksize(inode);
        struct ceph_osd_request *req = NULL;
        int do_sync = 0;
        loff_t snap_size, i_size;
@@ -771,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping,
             wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                if (ci->i_wrbuffer_ref > 0) {
                        pr_warn_ratelimited(
                                "writepage_start %p %lld forced umount\n",
@@ -1017,8 +1018,7 @@ new_request:
                                        &ci->i_layout, vino,
                                        offset, &len, 0, num_ops,
                                        CEPH_OSD_OP_WRITE,
-                                       CEPH_OSD_FLAG_WRITE |
-                                       CEPH_OSD_FLAG_ONDISK,
+                                       CEPH_OSD_FLAG_WRITE,
                                        snapc, truncate_seq,
                                        truncate_size, false);
                if (IS_ERR(req)) {
@@ -1028,8 +1028,7 @@ new_request:
                                                min(num_ops,
                                                    CEPH_OSD_SLAB_OPS),
                                                CEPH_OSD_OP_WRITE,
-                                               CEPH_OSD_FLAG_WRITE |
-                                               CEPH_OSD_FLAG_ONDISK,
+                                               CEPH_OSD_FLAG_WRITE,
                                                snapc, truncate_seq,
                                                truncate_size, true);
                        BUG_ON(IS_ERR(req));
@@ -1194,7 +1193,7 @@ static int ceph_update_writeable_page(struct file *file,
        int r;
        struct ceph_snap_context *snapc, *oldest;
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                dout(" page %p forced umount\n", page);
                unlock_page(page);
                return -EIO;
@@ -1681,8 +1680,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), 0, &len, 0, 1,
-                                   CEPH_OSD_OP_CREATE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
                                    NULL, 0, 0, false);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -1699,8 +1697,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), 0, &len, 1, 3,
-                                   CEPH_OSD_OP_WRITE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
                                    NULL, ci->i_truncate_seq,
                                    ci->i_truncate_size, false);
        if (IS_ERR(req)) {
@@ -1873,7 +1870,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
                goto out_unlock;
        }
 
-       wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
+       wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
        osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
        ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
        ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
index 5bc5d37..4e7421c 100644 (file)
@@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
                fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
                                inode);
                if (fscache_cookie_enabled(ci->fscache)) {
-                       dout("fscache_file_set_cookie %p %p enabing cache\n",
+                       dout("fscache_file_set_cookie %p %p enabling cache\n",
                             inode, filp);
                }
        }
index 94fd76d..cd966f2 100644 (file)
@@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 /*
  * Return caps we have registered with the MDS(s) as 'wanted'.
  */
-int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
 {
        struct ceph_cap *cap;
        struct rb_node *p;
@@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
 
        for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
                cap = rb_entry(p, struct ceph_cap, ci_node);
-               if (!__cap_is_valid(cap))
+               if (check && !__cap_is_valid(cap))
                        continue;
                if (cap == ci->i_auth_cap)
                        mds_wanted |= cap->mds_wanted;
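
__ceph_caps_mds_wanted() now takes a flag saying whether caps from invalid sessions should be skipped. Both flavours appear in this series; an illustrative pairing (values taken from the call sites in this diff):

	/* ceph_open(): count only caps backed by a valid session */
	mds_wanted = __ceph_caps_mds_wanted(ci, true);

	/* CEPH_I_CAP_DROPPED recovery in get_cap_refs: include caps
	 * from killed sessions so the -ESTALE case is still detected */
	mds_wanted = __ceph_caps_mds_wanted(ci, false);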
@@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                delayed = 1;
        }
        ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+       if (want & ~cap->mds_wanted) {
+               /* user space may open/close a single file frequently.
+                * This avoids dropping mds_wanted immediately after
+                * requesting new mds_wanted.
+                */
+               __cap_set_timeouts(mdsc, ci);
+       }
 
        cap->issued &= retain;  /* drop bits we don't want */
        if (cap->implemented & ~cap->issued) {
@@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
        dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
 
-       ceph_sync_write_wait(inode);
-
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret < 0)
                goto out;
@@ -2477,23 +2482,22 @@ again:
 
                if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
                        int mds_wanted;
-                       if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+                       if (READ_ONCE(mdsc->fsc->mount_state) ==
                            CEPH_MOUNT_SHUTDOWN) {
                                dout("get_cap_refs %p forced umount\n", inode);
                                *err = -EIO;
                                ret = 1;
                                goto out_unlock;
                        }
-                       mds_wanted = __ceph_caps_mds_wanted(ci);
-                       if ((mds_wanted & need) != need) {
+                       mds_wanted = __ceph_caps_mds_wanted(ci, false);
+                       if (need & ~(mds_wanted & need)) {
                                dout("get_cap_refs %p caps were dropped"
                                     " (session killed?)\n", inode);
                                *err = -ESTALE;
                                ret = 1;
                                goto out_unlock;
                        }
-                       if ((mds_wanted & file_wanted) ==
-                           (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+                       if (!(file_wanted & ~mds_wanted))
                                ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
                }
 
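
The two rewritten tests above are bitmask simplifications: the old "(mds_wanted & need) != need" and the new "need & ~(mds_wanted & need)" both ask whether any bit of need is missing from mds_wanted. A self-contained check of the equivalence (plain C, exhaustive over small masks):

#include <assert.h>

int main(void)
{
	unsigned int need, wanted;

	for (need = 0; need < 16; need++)
		for (wanted = 0; wanted < 16; wanted++)
			/* old spelling and new spelling always agree */
			assert(((wanted & need) != need) ==
			       ((need & ~(wanted & need)) != 0));
	return 0;
}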
@@ -3404,6 +3408,7 @@ retry:
                        tcap->implemented |= issued;
                        if (cap == ci->i_auth_cap)
                                ci->i_auth_cap = tcap;
+
                        if (!list_empty(&ci->i_cap_flush_list) &&
                            ci->i_auth_cap == tcap) {
                                spin_lock(&mdsc->cap_dirty_lock);
@@ -3417,9 +3422,18 @@ retry:
        } else if (tsession) {
                /* add placeholder for the export target */
                int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+               tcap = new_cap;
                ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
                             t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
 
+               if (!list_empty(&ci->i_cap_flush_list) &&
+                   ci->i_auth_cap == tcap) {
+                       spin_lock(&mdsc->cap_dirty_lock);
+                       list_move_tail(&ci->i_flushing_item,
+                                      &tcap->session->s_cap_flushing);
+                       spin_unlock(&mdsc->cap_dirty_lock);
+               }
+
                __ceph_remove_cap(cap, false);
                goto out_unlock;
        }
@@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 }
 
 int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+                              struct inode *dir,
                               int mds, int drop, int unless)
 {
-       struct inode *dir = d_inode(dentry->d_parent);
+       struct dentry *parent = NULL;
        struct ceph_mds_request_release *rel = *p;
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        int force = 0;
@@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
        spin_lock(&dentry->d_lock);
        if (di->lease_session && di->lease_session->s_mds == mds)
                force = 1;
+       if (!dir) {
+               parent = dget(dentry->d_parent);
+               dir = d_inode(parent);
+       }
        spin_unlock(&dentry->d_lock);
 
        ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+       dput(parent);
 
        spin_lock(&dentry->d_lock);
        if (ret && di->lease_session && di->lease_session->s_mds == mds) {
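
ceph_encode_dentry_release() can now be passed the parent directory explicitly; when the caller passes NULL, the hunk pins d_parent itself. A sketch of the pinning idiom it relies on (the comments are assumptions about intent):

	spin_lock(&dentry->d_lock);
	if (!dir) {
		/* d_lock keeps a concurrent rename from switching
		 * d_parent before we take our reference */
		parent = dget(dentry->d_parent);
		dir = d_inode(parent);
	}
	spin_unlock(&dentry->d_lock);

	/* ... encode the release against dir ... */

	dput(parent);	/* dput(NULL) is a no-op, so the dir!=NULL case is free */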
index 39ff678..f2ae393 100644 (file)
@@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 
                seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
 
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        seq_puts(s, "\t(unsafe)");
                else
                        seq_puts(s, "\t");
index 8ab1fdf..3e9ad50 100644 (file)
@@ -371,7 +371,7 @@ more:
                /* hints to request -> mds selection code */
                req->r_direct_mode = USE_AUTH_MDS;
                req->r_direct_hash = ceph_frag_value(frag);
-               req->r_direct_is_hash = true;
+               __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
                if (fi->last_name) {
                        req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
                        if (!req->r_path2) {
@@ -417,7 +417,7 @@ more:
                fi->frag = frag;
                fi->last_readdir = req;
 
-               if (req->r_did_prepopulate) {
+               if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
                        fi->readdir_cache_idx = req->r_readdir_cache_idx;
                        if (fi->readdir_cache_idx < 0) {
                                /* preclude from marking dir ordered */
@@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                mask |= CEPH_CAP_XATTR_SHARED;
        req->r_args.getattr.mask = cpu_to_le32(mask);
 
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc, NULL, req);
        err = ceph_handle_snapdir(req, dentry, err);
        dentry = ceph_finish_lookup(req, dentry, err);
@@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_args.mknod.mode = cpu_to_le32(mode);
        req->r_args.mknod.rdev = cpu_to_le32(rdev);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                ceph_mdsc_put_request(req);
                goto out;
        }
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -913,7 +916,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_args.mkdir.mode = cpu_to_le32(mode);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_old_dentry = dget(old_dentry);
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        /* release LINK_SHARED on source inode (mds will lock it) */
@@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_inode_drop = drop_caps_for_unlink(inode);
@@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
        req->r_num_caps = 2;
        req->r_old_dentry = dget(old_dentry);
        req->r_old_dentry_dir = old_dir;
-       req->r_locked_dir = new_dir;
+       req->r_parent = new_dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
        struct inode *dir;
 
        if (flags & LOOKUP_RCU) {
-               parent = ACCESS_ONCE(dentry->d_parent);
+               parent = READ_ONCE(dentry->d_parent);
                dir = d_inode_rcu(parent);
                if (!dir)
                        return -ECHILD;
@@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
                        return -ECHILD;
 
                op = ceph_snap(dir) == CEPH_SNAPDIR ?
-                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR;
+                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
                req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
                if (!IS_ERR(req)) {
                        req->r_dentry = dget(dentry);
-                       req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2;
+                       req->r_num_caps = 2;
+                       req->r_parent = dir;
 
                        mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
                        if (ceph_security_xattr_wanted(dir))
index 180bbef..e8f11fa 100644 (file)
@@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name,
        req->r_inode = d_inode(child);
        ihold(d_inode(child));
        req->r_ino2 = ceph_vino(d_inode(parent));
-       req->r_locked_dir = d_inode(parent);
+       req->r_parent = d_inode(parent);
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_num_caps = 2;
        err = ceph_mdsc_do_request(mdsc, NULL, req);
 
index 045d30d..26cc954 100644 (file)
@@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file)
        spin_lock(&ci->i_ceph_lock);
        if (__ceph_is_any_real_caps(ci) &&
            (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
-               int mds_wanted = __ceph_caps_mds_wanted(ci);
+               int mds_wanted = __ceph_caps_mds_wanted(ci, true);
                int issued = __ceph_caps_issued(ci, NULL);
 
                dout("open %p fmode %d want %s issued %s using existing\n",
@@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
                mask |= CEPH_CAP_XATTR_SHARED;
        req->r_args.open.mask = cpu_to_le32(mask);
 
-       req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
                                   req);
@@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
                goto out;
        }
 
-       req->r_flags =  CEPH_OSD_FLAG_ORDERSNAP |
-                       CEPH_OSD_FLAG_ONDISK |
-                       CEPH_OSD_FLAG_WRITE;
+       req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
        ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
        ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
 
@@ -794,89 +793,6 @@ out:
        kfree(aio_work);
 }
 
-/*
- * Write commit request unsafe callback, called to tell us when a
- * request is unsafe (that is, in flight--has been handed to the
- * messenger to send to its target osd).  It is called again when
- * we've received a response message indicating the request is
- * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
- * is completed early (and unsuccessfully) due to a timeout or
- * interrupt.
- *
- * This is used if we requested both an ACK and ONDISK commit reply
- * from the OSD.
- */
-static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
-{
-       struct ceph_inode_info *ci = ceph_inode(req->r_inode);
-
-       dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
-               unsafe ? "un" : "");
-       if (unsafe) {
-               ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
-               spin_lock(&ci->i_unsafe_lock);
-               list_add_tail(&req->r_unsafe_item,
-                             &ci->i_unsafe_writes);
-               spin_unlock(&ci->i_unsafe_lock);
-
-               complete_all(&req->r_completion);
-       } else {
-               spin_lock(&ci->i_unsafe_lock);
-               list_del_init(&req->r_unsafe_item);
-               spin_unlock(&ci->i_unsafe_lock);
-               ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
-       }
-}
-
-/*
- * Wait on any unsafe replies for the given inode.  First wait on the
- * newest request, and make that the upper bound.  Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-void ceph_sync_write_wait(struct inode *inode)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct list_head *head = &ci->i_unsafe_writes;
-       struct ceph_osd_request *req;
-       u64 last_tid;
-
-       if (!S_ISREG(inode->i_mode))
-               return;
-
-       spin_lock(&ci->i_unsafe_lock);
-       if (list_empty(head))
-               goto out;
-
-       /* set upper bound as _last_ entry in chain */
-
-       req = list_last_entry(head, struct ceph_osd_request,
-                             r_unsafe_item);
-       last_tid = req->r_tid;
-
-       do {
-               ceph_osdc_get_request(req);
-               spin_unlock(&ci->i_unsafe_lock);
-
-               dout("sync_write_wait on tid %llu (until %llu)\n",
-                    req->r_tid, last_tid);
-               wait_for_completion(&req->r_done_completion);
-               ceph_osdc_put_request(req);
-
-               spin_lock(&ci->i_unsafe_lock);
-               /*
-                * from here on look at first entry in chain, since we
-                * only want to wait for anything older than last_tid
-                */
-               if (list_empty(head))
-                       break;
-               req = list_first_entry(head, struct ceph_osd_request,
-                                      r_unsafe_item);
-       } while (req->r_tid < last_tid);
-out:
-       spin_unlock(&ci->i_unsafe_lock);
-}
-
 static ssize_t
 ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                       struct ceph_snap_context *snapc,
@@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                if (ret2 < 0)
                        dout("invalidate_inode_pages2_range returned %d\n", ret2);
 
-               flags = CEPH_OSD_FLAG_ORDERSNAP |
-                       CEPH_OSD_FLAG_ONDISK |
-                       CEPH_OSD_FLAG_WRITE;
+               flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
        } else {
                flags = CEPH_OSD_FLAG_READ;
        }
@@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
        if (ret < 0)
                dout("invalidate_inode_pages2_range returned %d\n", ret);
 
-       flags = CEPH_OSD_FLAG_ORDERSNAP |
-               CEPH_OSD_FLAG_ONDISK |
-               CEPH_OSD_FLAG_WRITE |
-               CEPH_OSD_FLAG_ACK;
+       flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
 
        while ((len = iov_iter_count(from)) > 0) {
                size_t left;
@@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                        goto out;
                }
 
-               /* get a second commit callback */
-               req->r_unsafe_callback = ceph_sync_write_unsafe;
                req->r_inode = inode;
 
                osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
@@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode,
                                        ceph_vino(inode),
                                        offset, length,
                                        0, 1, op,
-                                       CEPH_OSD_FLAG_WRITE |
-                                       CEPH_OSD_FLAG_ONDISK,
+                                       CEPH_OSD_FLAG_WRITE,
                                        NULL, 0, 0, false);
        if (IS_ERR(req)) {
                ret = PTR_ERR(req);
index 5e659d0..fd8f771 100644 (file)
@@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_rdcache_gen = 0;
        ci->i_rdcache_revoking = 0;
 
-       INIT_LIST_HEAD(&ci->i_unsafe_writes);
        INIT_LIST_HEAD(&ci->i_unsafe_dirops);
        INIT_LIST_HEAD(&ci->i_unsafe_iops);
        spin_lock_init(&ci->i_unsafe_lock);
@@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode)
        return 1;
 }
 
-void ceph_evict_inode(struct inode *inode)
-{
-       /* wait unsafe sync writes */
-       ceph_sync_write_wait(inode);
-       truncate_inode_pages_final(&inode->i_data);
-       clear_inode(inode);
-}
-
 static inline blkcnt_t calc_inode_blocks(u64 size)
 {
        return (size + (1<<9) - 1) >> 9;
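
calc_inode_blocks(), shown as context above, is the usual round-up to 512-byte units: blocks = (size + 511) >> 9, i.e. ceil(size / 512). Worked values:

	size = 0    -> (0   + 511) >> 9 = 0
	size = 1    -> (1   + 511) >> 9 = 1
	size = 512  -> (512 + 511) >> 9 = 1
	size = 513  -> (513 + 511) >> 9 = 2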
@@ -1016,7 +1007,9 @@ out:
 static void update_dentry_lease(struct dentry *dentry,
                                struct ceph_mds_reply_lease *lease,
                                struct ceph_mds_session *session,
-                               unsigned long from_time)
+                               unsigned long from_time,
+                               struct ceph_vino *tgt_vino,
+                               struct ceph_vino *dir_vino)
 {
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        long unsigned duration = le32_to_cpu(lease->duration_ms);
@@ -1024,13 +1017,27 @@ static void update_dentry_lease(struct dentry *dentry,
        long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
        struct inode *dir;
 
+       /*
+        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
+        * we expect a negative dentry.
+        */
+       if (!tgt_vino && d_really_is_positive(dentry))
+               return;
+
+       if (tgt_vino && (d_really_is_negative(dentry) ||
+                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
+               return;
+
        spin_lock(&dentry->d_lock);
        dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
             dentry, duration, ttl);
 
-       /* make lease_rdcache_gen match directory */
        dir = d_inode(dentry->d_parent);
 
+       /* make sure parent matches dir_vino */
+       if (!ceph_ino_compare(dir, dir_vino))
+               goto out_unlock;
+
        /* only track leases on regular dentries */
        if (ceph_snap(dir) != CEPH_NOSNAP)
                goto out_unlock;
@@ -1108,61 +1115,27 @@ out:
  *
  * Called with snap_rwsem (read).
  */
-int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
-                   struct ceph_mds_session *session)
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
 {
+       struct ceph_mds_session *session = req->r_session;
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct inode *in = NULL;
-       struct ceph_vino vino;
+       struct ceph_vino tvino, dvino;
        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
        int err = 0;
 
        dout("fill_trace %p is_dentry %d is_target %d\n", req,
             rinfo->head->is_dentry, rinfo->head->is_target);
 
-#if 0
-       /*
-        * Debugging hook:
-        *
-        * If we resend completed ops to a recovering mds, we get no
-        * trace.  Since that is very rare, pretend this is the case
-        * to ensure the 'no trace' handlers in the callers behave.
-        *
-        * Fill in inodes unconditionally to avoid breaking cap
-        * invariants.
-        */
-       if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
-               pr_info("fill_trace faking empty trace on %lld %s\n",
-                       req->r_tid, ceph_mds_op_name(rinfo->head->op));
-               if (rinfo->head->is_dentry) {
-                       rinfo->head->is_dentry = 0;
-                       err = fill_inode(req->r_locked_dir,
-                                        &rinfo->diri, rinfo->dirfrag,
-                                        session, req->r_request_started, -1);
-               }
-               if (rinfo->head->is_target) {
-                       rinfo->head->is_target = 0;
-                       ininfo = rinfo->targeti.in;
-                       vino.ino = le64_to_cpu(ininfo->ino);
-                       vino.snap = le64_to_cpu(ininfo->snapid);
-                       in = ceph_get_inode(sb, vino);
-                       err = fill_inode(in, &rinfo->targeti, NULL,
-                                        session, req->r_request_started,
-                                        req->r_fmode);
-                       iput(in);
-               }
-       }
-#endif
-
        if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
                dout("fill_trace reply is empty!\n");
-               if (rinfo->head->result == 0 && req->r_locked_dir)
+               if (rinfo->head->result == 0 && req->r_parent)
                        ceph_invalidate_dir_request(req);
                return 0;
        }
 
        if (rinfo->head->is_dentry) {
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
 
                if (dir) {
                        err = fill_inode(dir, NULL,
@@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                        dname.name = rinfo->dname;
                        dname.len = rinfo->dname_len;
                        dname.hash = full_name_hash(parent, dname.name, dname.len);
-                       vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-                       vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+                       tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                       tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
 retry_lookup:
                        dn = d_lookup(parent, &dname);
                        dout("d_lookup on parent=%p name=%.*s got %p\n",
@@ -1206,8 +1179,8 @@ retry_lookup:
                                }
                                err = 0;
                        } else if (d_really_is_positive(dn) &&
-                                  (ceph_ino(d_inode(dn)) != vino.ino ||
-                                   ceph_snap(d_inode(dn)) != vino.snap)) {
+                                  (ceph_ino(d_inode(dn)) != tvino.ino ||
+                                   ceph_snap(d_inode(dn)) != tvino.snap)) {
                                dout(" dn %p points to wrong inode %p\n",
                                     dn, d_inode(dn));
                                d_delete(dn);
@@ -1221,10 +1194,10 @@ retry_lookup:
        }
 
        if (rinfo->head->is_target) {
-               vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-               vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+               tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+               tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
 
-               in = ceph_get_inode(sb, vino);
+               in = ceph_get_inode(sb, tvino);
                if (IS_ERR(in)) {
                        err = PTR_ERR(in);
                        goto done;
@@ -1233,8 +1206,8 @@ retry_lookup:
 
                err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
                                session, req->r_request_started,
-                               (!req->r_aborted && rinfo->head->result == 0) ?
-                               req->r_fmode : -1,
+                               (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+                               rinfo->head->result == 0) ?  req->r_fmode : -1,
                                &req->r_caps_reservation);
                if (err < 0) {
                        pr_err("fill_inode badness %p %llx.%llx\n",
@@ -1247,8 +1220,9 @@ retry_lookup:
         * ignore null lease/binding on snapdir ENOENT, or else we
         * will have trouble splicing in the virtual snapdir later
         */
-       if (rinfo->head->is_dentry && !req->r_aborted &&
-           req->r_locked_dir &&
+       if (rinfo->head->is_dentry &&
+            !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+           test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
            (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
                                               fsc->mount_options->snapdir_name,
                                               req->r_dentry->d_name.len))) {
@@ -1257,17 +1231,19 @@ retry_lookup:
                 * mknod symlink mkdir  : null -> new inode
                 * unlink               : linked -> null
                 */
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
                struct dentry *dn = req->r_dentry;
                bool have_dir_cap, have_lease;
 
                BUG_ON(!dn);
                BUG_ON(!dir);
                BUG_ON(d_inode(dn->d_parent) != dir);
-               BUG_ON(ceph_ino(dir) !=
-                      le64_to_cpu(rinfo->diri.in->ino));
-               BUG_ON(ceph_snap(dir) !=
-                      le64_to_cpu(rinfo->diri.in->snapid));
+
+               dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+               dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+               BUG_ON(ceph_ino(dir) != dvino.ino);
+               BUG_ON(ceph_snap(dir) != dvino.snap);
 
                /* do we have a lease on the whole dir? */
                have_dir_cap =
@@ -1319,12 +1295,13 @@ retry_lookup:
                                ceph_dir_clear_ordered(dir);
                                dout("d_delete %p\n", dn);
                                d_delete(dn);
-                       } else {
-                               if (have_lease && d_unhashed(dn))
+                       } else if (have_lease) {
+                               if (d_unhashed(dn))
                                        d_add(dn, NULL);
                                update_dentry_lease(dn, rinfo->dlease,
                                                    session,
-                                                   req->r_request_started);
+                                                   req->r_request_started,
+                                                   NULL, &dvino);
                        }
                        goto done;
                }
@@ -1347,15 +1324,19 @@ retry_lookup:
                        have_lease = false;
                }
 
-               if (have_lease)
+               if (have_lease) {
+                       tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                       tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
                        update_dentry_lease(dn, rinfo->dlease, session,
-                                           req->r_request_started);
+                                           req->r_request_started,
+                                           &tvino, &dvino);
+               }
                dout(" final dn %p\n", dn);
-       } else if (!req->r_aborted &&
-                  (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
-                   req->r_op == CEPH_MDS_OP_MKSNAP)) {
+       } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+                   req->r_op == CEPH_MDS_OP_MKSNAP) &&
+                  !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                struct dentry *dn = req->r_dentry;
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
 
                /* fill out a snapdir LOOKUPSNAP dentry */
                BUG_ON(!dn);
@@ -1370,6 +1351,26 @@ retry_lookup:
                        goto done;
                }
                req->r_dentry = dn;  /* may have spliced */
+       } else if (rinfo->head->is_dentry) {
+               struct ceph_vino *ptvino = NULL;
+
+               if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
+                   le32_to_cpu(rinfo->dlease->duration_ms)) {
+                       dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+                       dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+                       if (rinfo->head->is_target) {
+                               tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                               tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+                               ptvino = &tvino;
+                       }
+
+                       update_dentry_lease(req->r_dentry, rinfo->dlease,
+                               session, req->r_request_started, ptvino,
+                               &dvino);
+               } else {
+                       dout("%s: no dentry lease or dir cap\n", __func__);
+               }
        }
 done:
        dout("fill_trace done err=%d\n", err);
@@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        u32 fpos_offset;
        struct ceph_readdir_cache_control cache_ctl = {};
 
-       if (req->r_aborted)
+       if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
                return readdir_prepopulate_inodes_only(req, session);
 
        if (rinfo->hash_order && req->r_path2) {
@@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        /* FIXME: release caps/leases if error occurs */
        for (i = 0; i < rinfo->dir_nr; i++) {
                struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
-               struct ceph_vino vino;
+               struct ceph_vino tvino, dvino;
 
                dname.name = rde->name;
                dname.len = rde->name_len;
                dname.hash = full_name_hash(parent, dname.name, dname.len);
 
-               vino.ino = le64_to_cpu(rde->inode.in->ino);
-               vino.snap = le64_to_cpu(rde->inode.in->snapid);
+               tvino.ino = le64_to_cpu(rde->inode.in->ino);
+               tvino.snap = le64_to_cpu(rde->inode.in->snapid);
 
                if (rinfo->hash_order) {
                        u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
@@ -1559,8 +1560,8 @@ retry_lookup:
                                goto out;
                        }
                } else if (d_really_is_positive(dn) &&
-                          (ceph_ino(d_inode(dn)) != vino.ino ||
-                           ceph_snap(d_inode(dn)) != vino.snap)) {
+                          (ceph_ino(d_inode(dn)) != tvino.ino ||
+                           ceph_snap(d_inode(dn)) != tvino.snap)) {
                        dout(" dn %p points to wrong inode %p\n",
                             dn, d_inode(dn));
                        d_delete(dn);
@@ -1572,7 +1573,7 @@ retry_lookup:
                if (d_really_is_positive(dn)) {
                        in = d_inode(dn);
                } else {
-                       in = ceph_get_inode(parent->d_sb, vino);
+                       in = ceph_get_inode(parent->d_sb, tvino);
                        if (IS_ERR(in)) {
                                dout("new_inode badness\n");
                                d_drop(dn);
@@ -1617,8 +1618,9 @@ retry_lookup:
 
                ceph_dentry(dn)->offset = rde->offset;
 
+               dvino = ceph_vino(d_inode(parent));
                update_dentry_lease(dn, rde->lease, req->r_session,
-                                   req->r_request_started);
+                                   req->r_request_started, &tvino, &dvino);
 
                if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
                        ret = fill_readdir_cache(d_inode(parent), dn,
@@ -1632,7 +1634,7 @@ next_item:
        }
 out:
        if (err == 0 && skipped == 0) {
-               req->r_did_prepopulate = true;
+               set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
                req->r_readdir_cache_idx = cache_ctl.index;
        }
        ceph_readdir_cache_release(&cache_ctl);
@@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work)
 
        mutex_lock(&ci->i_truncate_mutex);
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
                                    inode, ceph_ino(inode));
                mapping_set_error(inode->i_mapping, -EIO);
index 7d752d5..4c9c72f 100644 (file)
@@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
                l.stripe_count = ci->i_layout.stripe_count;
                l.object_size = ci->i_layout.object_size;
                l.data_pool = ci->i_layout.pool_id;
-               l.preferred_osd = (s32)-1;
+               l.preferred_osd = -1;
                if (copy_to_user(arg, &l, sizeof(l)))
                        return -EFAULT;
        }
@@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
                nl.data_pool = ci->i_layout.pool_id;
 
        /* this is obsolete, and always -1 */
-       nl.preferred_osd = le64_to_cpu(-1);
+       nl.preferred_osd = -1;
 
        err = __validate_layout(mdsc, &nl);
        if (err)
index c9d2e55..c681762 100644 (file)
@@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref)
                ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
                iput(req->r_inode);
        }
-       if (req->r_locked_dir)
-               ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+       if (req->r_parent)
+               ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
        iput(req->r_target_inode);
        if (req->r_dentry)
                dput(req->r_dentry);
@@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 {
        dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 
+       /* Never leave an unregistered request on an unsafe list! */
+       list_del_init(&req->r_unsafe_item);
+
        if (req->r_tid == mdsc->oldest_tid) {
                struct rb_node *p = rb_next(&req->r_node);
                mdsc->oldest_tid = 0;
@@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 
        erase_request(&mdsc->request_tree, req);
 
-       if (req->r_unsafe_dir && req->r_got_unsafe) {
+       if (req->r_unsafe_dir &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_dir_item);
                spin_unlock(&ci->i_unsafe_lock);
        }
-       if (req->r_target_inode && req->r_got_unsafe) {
+       if (req->r_target_inode &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_target_item);
@@ -668,6 +673,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 }
 
 /*
+ * Walk back up the dentry tree until we hit a dentry representing a
+ * non-snapshot inode. We do this using the rcu_read_lock (which must be held
+ * when calling this) to ensure that the objects won't disappear while we're
+ * working with them. Once we hit a candidate dentry, we attempt to take a
+ * reference to it, and return that as the result.
+ */
+static struct inode *get_nonsnap_parent(struct dentry *dentry)
+{
+       struct inode *inode = NULL;
+
+       while (dentry && !IS_ROOT(dentry)) {
+               inode = d_inode_rcu(dentry);
+               if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
+                       break;
+               dentry = dentry->d_parent;
+       }
+       if (inode)
+               inode = igrab(inode);
+       return inode;
+}
+
+/*
  * Choose mds to send request to next.  If there is a hint set in the
  * request (e.g., due to a prior forward hint from the mds), use that.
  * Otherwise, consult frag tree and/or caps to identify the
@@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
  *
  * Called under mdsc->mutex.
  */
-static struct dentry *get_nonsnap_parent(struct dentry *dentry)
-{
-       /*
-        * we don't need to worry about protecting the d_parent access
-        * here because we never renaming inside the snapped namespace
-        * except to resplice to another snapdir, and either the old or new
-        * result is a valid result.
-        */
-       while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
-               dentry = dentry->d_parent;
-       return dentry;
-}
-
 static int __choose_mds(struct ceph_mds_client *mdsc,
                        struct ceph_mds_request *req)
 {
@@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        int mode = req->r_direct_mode;
        int mds = -1;
        u32 hash = req->r_direct_hash;
-       bool is_hash = req->r_direct_is_hash;
+       bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
 
        /*
         * is there a specific mds we should try?  ignore hint if we have
@@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        inode = NULL;
        if (req->r_inode) {
                inode = req->r_inode;
+               ihold(inode);
        } else if (req->r_dentry) {
                /* ignore race with rename; old or new d_parent is okay */
-               struct dentry *parent = req->r_dentry->d_parent;
-               struct inode *dir = d_inode(parent);
+               struct dentry *parent;
+               struct inode *dir;
+
+               rcu_read_lock();
+               parent = req->r_dentry->d_parent;
+               dir = req->r_parent ? : d_inode_rcu(parent);
 
-               if (dir->i_sb != mdsc->fsc->sb) {
-                       /* not this fs! */
+               if (!dir || dir->i_sb != mdsc->fsc->sb) {
+                       /* not this fs or parent went negative */
                        inode = d_inode(req->r_dentry);
+                       if (inode)
+                               ihold(inode);
                } else if (ceph_snap(dir) != CEPH_NOSNAP) {
                        /* direct snapped/virtual snapdir requests
                         * based on parent dir inode */
-                       struct dentry *dn = get_nonsnap_parent(parent);
-                       inode = d_inode(dn);
+                       inode = get_nonsnap_parent(parent);
                        dout("__choose_mds using nonsnap parent %p\n", inode);
                } else {
                        /* dentry target */
                        inode = d_inode(req->r_dentry);
                        if (!inode || mode == USE_AUTH_MDS) {
                                /* dir + name */
-                               inode = dir;
+                               inode = igrab(dir);
                                hash = ceph_dentry_hash(dir, req->r_dentry);
                                is_hash = true;
+                       } else {
+                               ihold(inode);
                        }
                }
+               rcu_read_unlock();
        }
 
        dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
@@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                     (int)r, frag.ndist);
                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
                                    CEPH_MDS_STATE_ACTIVE)
-                                       return mds;
+                                       goto out;
                        }
 
                        /* since this file/dir wasn't known to be
@@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                     inode, ceph_vinop(inode), frag.frag, mds);
                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
                                    CEPH_MDS_STATE_ACTIVE)
-                                       return mds;
+                                       goto out;
                        }
                }
        }
@@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
        if (!cap) {
                spin_unlock(&ci->i_ceph_lock);
+               iput(inode);
                goto random;
        }
        mds = cap->session->s_mds;
@@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
             inode, ceph_vinop(inode), mds,
             cap == ci->i_auth_cap ? "auth " : "", cap);
        spin_unlock(&ci->i_ceph_lock);
+out:
+       iput(inode);
        return mds;
 
 random:
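
With the hunks above, every path out of __choose_mds() balances the reference taken on entry (ihold(), igrab(dir), or igrab() inside get_nonsnap_parent()); the new out: label funnels the successful cases into a single iput(). The caller-visible contract of the new helper, sketched (comments note the assumptions):

	rcu_read_lock();		/* keeps the dentry chain from vanishing */
	inode = get_nonsnap_parent(parent); /* referenced inode, or NULL --
					     * igrab() can fail on a dying inode */
	rcu_read_unlock();
	...
	iput(inode);			/* iput(NULL) is a no-op */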
@@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
        while (!list_empty(&session->s_unsafe)) {
                req = list_first_entry(&session->s_unsafe,
                                       struct ceph_mds_request, r_unsafe_item);
-               list_del_init(&req->r_unsafe_item);
                pr_warn_ratelimited(" dropping unsafe request %llu\n",
                                    req->r_tid);
                __unregister_request(mdsc, req);
@@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 
                if (ci->i_wrbuffer_ref > 0 &&
-                   ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+                   READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                        invalidate = true;
 
                while (!list_empty(&ci->i_cap_flush_list)) {
@@ -1775,18 +1800,23 @@ retry:
        return path;
 }
 
-static int build_dentry_path(struct dentry *dentry,
+static int build_dentry_path(struct dentry *dentry, struct inode *dir,
                             const char **ppath, int *ppathlen, u64 *pino,
                             int *pfreepath)
 {
        char *path;
 
-       if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) {
-               *pino = ceph_ino(d_inode(dentry->d_parent));
+       rcu_read_lock();
+       if (!dir)
+               dir = d_inode_rcu(dentry->d_parent);
+       if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+               *pino = ceph_ino(dir);
+               rcu_read_unlock();
                *ppath = dentry->d_name.name;
                *ppathlen = dentry->d_name.len;
                return 0;
        }
+       rcu_read_unlock();
        path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
        if (IS_ERR(path))
                return PTR_ERR(path);
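
build_dentry_path() now reads the parent under rcu_read_lock() rather than dereferencing d_parent bare. A sketch of why the read-side section is sufficient here (an assumption about intent, consistent with the hunk):

	rcu_read_lock();
	dir = d_inode_rcu(dentry->d_parent);	/* may be NULL mid-rename */
	if (dir && ceph_snap(dir) == CEPH_NOSNAP)
		*pino = ceph_ino(dir);	/* copy the value out while RCU
					 * guarantees the inode isn't freed */
	rcu_read_unlock();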
@@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode,
  * an explicit ino+path.
  */
 static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
-                                 const char *rpath, u64 rino,
-                                 const char **ppath, int *pathlen,
+                                 struct inode *rdiri, const char *rpath,
+                                 u64 rino, const char **ppath, int *pathlen,
                                  u64 *ino, int *freepath)
 {
        int r = 0;
@@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
                dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
                     ceph_snap(rinode));
        } else if (rdentry) {
-               r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+                                       freepath);
                dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
                     *ppath);
        } else if (rpath || rino) {
@@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        int ret;
 
        ret = set_request_path_attr(req->r_inode, req->r_dentry,
-                             req->r_path1, req->r_ino1.ino,
+                             req->r_parent, req->r_path1, req->r_ino1.ino,
                              &path1, &pathlen1, &ino1, &freepath1);
        if (ret < 0) {
                msg = ERR_PTR(ret);
@@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        }
 
        ret = set_request_path_attr(NULL, req->r_old_dentry,
+                             req->r_old_dentry_dir,
                              req->r_path2, req->r_ino2.ino,
                              &path2, &pathlen2, &ino2, &freepath2);
        if (ret < 0) {
@@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
                      mds, req->r_inode_drop, req->r_inode_unless, 0);
        if (req->r_dentry_drop)
                releases += ceph_encode_dentry_release(&p, req->r_dentry,
-                      mds, req->r_dentry_drop, req->r_dentry_unless);
+                               req->r_parent, mds, req->r_dentry_drop,
+                               req->r_dentry_unless);
        if (req->r_old_dentry_drop)
                releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
-                      mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+                               req->r_old_dentry_dir, mds,
+                               req->r_old_dentry_drop,
+                               req->r_old_dentry_unless);
        if (req->r_old_inode_drop)
                releases += ceph_encode_inode_release(&p,
                      d_inode(req->r_old_dentry),
@@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
             req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
-       if (req->r_got_unsafe) {
+       if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                void *p;
                /*
                 * Replay.  Do not regenerate message (and rebuild
@@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 
        rhead = msg->front.iov_base;
        rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
-       if (req->r_got_unsafe)
+       if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                flags |= CEPH_MDS_FLAG_REPLAY;
-       if (req->r_locked_dir)
+       if (req->r_parent)
                flags |= CEPH_MDS_FLAG_WANT_DENTRY;
        rhead->flags = cpu_to_le32(flags);
        rhead->num_fwd = req->r_num_fwd;
        rhead->num_retry = req->r_attempts - 1;
        rhead->ino = 0;
 
-       dout(" r_locked_dir = %p\n", req->r_locked_dir);
+       dout(" r_parent = %p\n", req->r_parent);
        return 0;
 }
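
The conversion above is the pattern for the rest of this series: several independent bool fields on struct ceph_mds_request are folded into atomic bit operations on a single r_req_flags word. A minimal sketch of the idiom, using the flag names this series introduces (the helpers below are illustrative, not kernel code):

    /* set_bit()/test_bit() take a bit number and are atomic, so two
     * CPUs updating different flags cannot lose each other's update
     * the way a non-atomic read-modify-write of adjacent bools could. */
    static void mark_got_unsafe(struct ceph_mds_request *req)
    {
            set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
    }

    static bool got_unsafe(const struct ceph_mds_request *req)
    {
            return test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
    }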
 
@@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
        int mds = -1;
        int err = 0;
 
-       if (req->r_err || req->r_got_result) {
-               if (req->r_aborted)
+       if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+               if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
                        __unregister_request(mdsc, req);
                goto out;
        }
@@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc,
                err = -EIO;
                goto finish;
        }
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                dout("do_request forced umount\n");
                err = -EIO;
                goto finish;
        }
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
                if (mdsc->mdsmap_err) {
                        err = mdsc->mdsmap_err;
                        dout("do_request mdsmap err %d\n", err);
@@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
        while (p) {
                req = rb_entry(p, struct ceph_mds_request, r_node);
                p = rb_next(p);
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        continue;
                if (req->r_attempts > 0)
                        continue; /* only new requests */
@@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 
        dout("do_request on %p\n", req);
 
-       /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+       /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
        if (req->r_inode)
                ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
-       if (req->r_locked_dir)
-               ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+       if (req->r_parent)
+               ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
        if (req->r_old_dentry_dir)
                ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
                                  CEPH_CAP_PIN);
@@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
        mutex_lock(&mdsc->mutex);
 
        /* only abort if we didn't race with a real reply */
-       if (req->r_got_result) {
+       if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
                err = le32_to_cpu(req->r_reply_info.head->result);
        } else if (err < 0) {
                dout("aborted request %lld with %d\n", req->r_tid, err);
@@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                 */
                mutex_lock(&req->r_fill_mutex);
                req->r_err = err;
-               req->r_aborted = true;
+               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
                mutex_unlock(&req->r_fill_mutex);
 
-               if (req->r_locked_dir &&
+               if (req->r_parent &&
                    (req->r_op & CEPH_MDS_OP_WRITE))
                        ceph_invalidate_dir_request(req);
        } else {
@@ -2323,7 +2358,7 @@ out:
  */
 void ceph_invalidate_dir_request(struct ceph_mds_request *req)
 {
-       struct inode *inode = req->r_locked_dir;
+       struct inode *inode = req->r_parent;
 
        dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
 
@@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
 
        /* dup? */
-       if ((req->r_got_unsafe && !head->safe) ||
-           (req->r_got_safe && head->safe)) {
+       if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
+           (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
                pr_warn("got a dup %s reply on %llu from mds%d\n",
                           head->safe ? "safe" : "unsafe", tid, mds);
                mutex_unlock(&mdsc->mutex);
                goto out;
        }
-       if (req->r_got_safe) {
+       if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
                pr_warn("got unsafe after safe on %llu from mds%d\n",
                           tid, mds);
                mutex_unlock(&mdsc->mutex);
@@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
 
        if (head->safe) {
-               req->r_got_safe = true;
+               set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
                __unregister_request(mdsc, req);
 
-               if (req->r_got_unsafe) {
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                        /*
                         * We already handled the unsafe response, now do the
                         * cleanup.  No need to examine the response; the MDS
@@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                         * useful we could do with a revised return value.
                         */
                        dout("got safe reply %llu, mds%d\n", tid, mds);
-                       list_del_init(&req->r_unsafe_item);
 
                        /* last unsafe request during umount? */
                        if (mdsc->stopping && !__get_oldest_req(mdsc))
@@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        goto out;
                }
        } else {
-               req->r_got_unsafe = true;
+               set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
                list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
                if (req->r_unsafe_dir) {
                        struct ceph_inode_info *ci =
@@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        /* insert trace into our cache */
        mutex_lock(&req->r_fill_mutex);
        current->journal_info = req;
-       err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
+       err = ceph_fill_trace(mdsc->fsc->sb, req);
        if (err == 0) {
                if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
                                    req->r_op == CEPH_MDS_OP_LSSNAP))
@@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        if (realm)
                ceph_put_snap_realm(mdsc, realm);
 
-       if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+       if (err == 0 && req->r_target_inode &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
                spin_lock(&ci->i_unsafe_lock);
                list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
@@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
 out_err:
        mutex_lock(&mdsc->mutex);
-       if (!req->r_aborted) {
+       if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                if (err) {
                        req->r_err = err;
                } else {
                        req->r_reply =  ceph_msg_get(msg);
-                       req->r_got_result = true;
+                       set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
                }
        } else {
                dout("reply arrived after request %lld was aborted\n", tid);
@@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
                goto out;  /* dup reply? */
        }
 
-       if (req->r_aborted) {
+       if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                dout("forward tid %llu aborted, unregistering\n", tid);
                __unregister_request(mdsc, req);
        } else if (fwd_seq <= req->r_num_fwd) {
@@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
                /* resend. forward race not possible; mds would drop */
                dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
                BUG_ON(req->r_err);
-               BUG_ON(req->r_got_result);
+               BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
                req->r_attempts = 0;
                req->r_num_fwd = fwd_seq;
                req->r_resend_mds = next_mds;
@@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
        while (p) {
                req = rb_entry(p, struct ceph_mds_request, r_node);
                p = rb_next(p);
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        continue;
                if (req->r_attempts == 0)
                        continue; /* only old requests */
@@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
        u64 want_tid, want_flush;
 
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return;
 
        dout("sync\n");
@@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  */
 static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
 {
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return true;
        return atomic_read(&mdsc->num_sessions) <= skipped;
 }
index 3c6f77b..ac0475a 100644 (file)
@@ -202,9 +202,18 @@ struct ceph_mds_request {
        char *r_path1, *r_path2;
        struct ceph_vino r_ino1, r_ino2;
 
-       struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+       struct inode *r_parent;             /* parent dir inode */
        struct inode *r_target_inode;       /* resulting inode */
 
+#define CEPH_MDS_R_DIRECT_IS_HASH      (1) /* r_direct_hash is valid */
+#define CEPH_MDS_R_ABORTED             (2) /* call was aborted */
+#define CEPH_MDS_R_GOT_UNSAFE          (3) /* got an unsafe reply */
+#define CEPH_MDS_R_GOT_SAFE            (4) /* got a safe reply */
+#define CEPH_MDS_R_GOT_RESULT          (5) /* got a result */
+#define CEPH_MDS_R_DID_PREPOPULATE     (6) /* prepopulated readdir */
+#define CEPH_MDS_R_PARENT_LOCKED       (7) /* is r_parent->i_rwsem wlocked? */
+       unsigned long   r_req_flags;
+
        struct mutex r_fill_mutex;
 
        union ceph_mds_request_args r_args;
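
Note that the CEPH_MDS_R_* values are bit numbers, not masks: set_bit(2, &flags) sets 1UL << 2. A compilable userspace illustration of the distinction (mirroring the definitions above):

    #include <stdio.h>

    #define CEPH_MDS_R_ABORTED 2            /* a bit number, as above */

    int main(void)
    {
            unsigned long flags = 0;

            flags |= 1UL << CEPH_MDS_R_ABORTED;     /* what set_bit() does */
            printf("flags = %#lx, aborted = %d\n", flags,
                   !!(flags & (1UL << CEPH_MDS_R_ABORTED)));
            return 0;       /* prints: flags = 0x4, aborted = 1 */
    }
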
@@ -216,7 +225,6 @@ struct ceph_mds_request {
        /* for choosing which mds to send this request to */
        int r_direct_mode;
        u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
-       bool r_direct_is_hash;  /* true if r_direct_hash is valid */
 
        /* data payload is used for xattr ops */
        struct ceph_pagelist *r_pagelist;
@@ -234,7 +242,6 @@ struct ceph_mds_request {
        struct ceph_mds_reply_info_parsed r_reply_info;
        struct page *r_locked_page;
        int r_err;
-       bool r_aborted;
 
        unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
        unsigned long r_started;  /* start time to measure timeout against */
@@ -262,9 +269,7 @@ struct ceph_mds_request {
        ceph_mds_request_callback_t r_callback;
        ceph_mds_request_wait_callback_t r_wait_for_completion;
        struct list_head  r_unsafe_item;  /* per-session unsafe list item */
-       bool              r_got_unsafe, r_got_safe, r_got_result;
 
-       bool              r_did_prepopulate;
        long long         r_dir_release_cnt;
        long long         r_dir_ordered_cnt;
        int               r_readdir_cache_idx;
index 6bd20d7..0ec8d01 100644 (file)
@@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = {
        .destroy_inode  = ceph_destroy_inode,
        .write_inode    = ceph_write_inode,
        .drop_inode     = ceph_drop_inode,
-       .evict_inode    = ceph_evict_inode,
        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
        .show_options   = ceph_show_options,
@@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb,
                fsc->backing_dev_info.ra_pages =
                        VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
 
+       if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
+           fsc->mount_options->rsize >= PAGE_SIZE)
+               fsc->backing_dev_info.io_pages =
+                       (fsc->mount_options->rsize + PAGE_SIZE - 1)
+                       >> PAGE_SHIFT;
+       else if (fsc->mount_options->rsize == 0)
+               fsc->backing_dev_info.io_pages = ULONG_MAX;
+
        err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
                           atomic_long_inc_return(&bdi_seq));
        if (!err)
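
The io_pages assignment above is a round-up division of rsize to whole pages; with the 64MB CEPH_RSIZE_DEFAULT set later in this series, that is 16384 pages on a 4K-page machine. A standalone check of the arithmetic (PAGE_SHIFT = 12 assumed):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    int main(void)
    {
            unsigned long rsize = 64UL * 1024 * 1024;
            unsigned long io_pages = (rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;

            printf("io_pages = %lu\n", io_pages);   /* 16384 */
            return 0;
    }
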
index 3373b61..e9410bc 100644 (file)
@@ -45,8 +45,8 @@
 #define ceph_test_mount_opt(fsc, opt) \
        (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
-#define CEPH_RSIZE_DEFAULT             0           /* max read size */
-#define CEPH_RASIZE_DEFAULT            (8192*1024) /* readahead */
+#define CEPH_RSIZE_DEFAULT              (64*1024*1024) /* max read size */
+#define CEPH_RASIZE_DEFAULT             (8192*1024)    /* max readahead */
 #define CEPH_MAX_READDIR_DEFAULT        1024
 #define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
 #define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
@@ -343,7 +343,6 @@ struct ceph_inode_info {
        u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. */
        u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
 
-       struct list_head i_unsafe_writes; /* uncommitted sync writes */
        struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
        struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
        spinlock_t i_unsafe_lock;
@@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
 }
 
 /* what the mds thinks we want */
-extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
 
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
@@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops;
 extern struct inode *ceph_alloc_inode(struct super_block *sb);
 extern void ceph_destroy_inode(struct inode *inode);
 extern int ceph_drop_inode(struct inode *inode);
-extern void ceph_evict_inode(struct inode *inode);
 
 extern struct inode *ceph_get_inode(struct super_block *sb,
                                    struct ceph_vino vino);
@@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
                                u64 time_warp_seq, struct timespec *ctime,
                                struct timespec *mtime, struct timespec *atime);
 extern int ceph_fill_trace(struct super_block *sb,
-                          struct ceph_mds_request *req,
-                          struct ceph_mds_session *session);
+                          struct ceph_mds_request *req);
 extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                                    struct ceph_mds_session *session);
 
@@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
 extern int ceph_encode_inode_release(void **p, struct inode *inode,
                                     int mds, int drop, int unless, int force);
 extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+                                     struct inode *dir,
                                      int mds, int drop, int unless);
 
 extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
@@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 extern int ceph_release(struct inode *inode, struct file *filp);
 extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
                                  char *data, size_t len);
-extern void ceph_sync_write_wait(struct inode *inode);
+
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
 extern const struct file_operations ceph_snapdir_fops;
index 5ae8b71..7436c98 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1436,7 +1436,8 @@ out:
        return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
+static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+                              const struct iomap_ops *ops)
 {
        return VM_FAULT_FALLBACK;
 }
index c87bae4..a04ebea 100644 (file)
@@ -587,7 +587,7 @@ static int dio_set_defer_completion(struct dio *dio)
 /*
  * Call into the fs to map some more disk blocks.  We record the current number
  * of available blocks at sdio->blocks_available.  These are in units of the
- * fs blocksize, (1 << inode->i_blkbits).
+ * fs blocksize, i_blocksize(inode).
  *
  * The fs is allowed to map lots of blocks at once.  If it wants to do that,
  * it uses the passed inode-relative block number as the file offset, as usual.
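
The i_blocksize() helper that this comment (and the many conversions below) refer to reduces, to the best of my knowledge, to the open-coded shift it replaces; the win is one definition and a consistent unsigned int width at every call site:

    /* sketch of the helper in include/linux/fs.h */
    static inline unsigned int i_blocksize(const struct inode *inode)
    {
            return 1 << inode->i_blkbits;
    }
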
index 866bb18..e00d45a 100644 (file)
@@ -123,7 +123,7 @@ void ecryptfs_destroy_kthread(void)
  * @lower_dentry: Lower dentry for file to open
  * @lower_mnt: Lower vfsmount for file to open
  *
- * This function gets a r/w file opened againt the lower dentry.
+ * This function gets a r/w file opened against the lower dentry.
  *
  * Returns zero on success; non-zero otherwise
  */
index bcb68fc..5ec1631 100644 (file)
@@ -1895,7 +1895,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
         * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
         * Also, we do not currently support nested exclusive wakeups.
         */
-       if (epds.events & EPOLLEXCLUSIVE) {
+       if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
                if (op == EPOLL_CTL_MOD)
                        goto error_tgt_fput;
                if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
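
The added ep_op_has_event(op) test matters because EPOLL_CTL_DEL carries no event payload: for it, epds is never copied in from userspace, so reading epds.events was a use of uninitialized stack. The guard relies on a helper along these lines (a sketch of fs/eventpoll.c):

    /* EPOLL_CTL_DEL is the one op without a struct epoll_event */
    static inline int ep_op_has_event(int op)
    {
            return op != EPOLL_CTL_DEL;
    }
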
index 37e0592..e7f12a2 100644 (file)
@@ -84,7 +84,7 @@
  *   --        writeout
  *     Writeout looks up whole page cache to see if a buffer is
  *     mapped. If there are not very many delayed buffers, then it is
- *     time comsuming.
+ *     time consuming.
  *
  * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
  * bigalloc and writeout can figure out if a block or a range of
index 41d8e53..971f663 100644 (file)
@@ -2221,7 +2221,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 {
        struct inode *inode = mpd->inode;
        int err;
-       ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+       ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
                                                        >> inode->i_blkbits;
 
        do {
@@ -3577,7 +3577,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
        if (overwrite)
                get_block_func = ext4_dio_get_block_overwrite;
        else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-                  round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+                  round_down(offset, i_blocksize(inode)) >= inode->i_size) {
                get_block_func = ext4_dio_get_block;
                dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
        } else if (is_sync_kiocb(iocb)) {
@@ -5179,7 +5179,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
         * do. We do the check mainly to optimize the common PAGE_SIZE ==
         * blocksize case
         */
-       if (offset > PAGE_SIZE - (1 << inode->i_blkbits))
+       if (offset > PAGE_SIZE - i_blocksize(inode))
                return;
        while (1) {
                page = find_lock_page(inode->i_mapping,
index 10c62de..354dc1a 100644 (file)
@@ -838,7 +838,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
        inode = page->mapping->host;
        sb = inode->i_sb;
        ngroups = ext4_get_groups_count(sb);
-       blocksize = 1 << inode->i_blkbits;
+       blocksize = i_blocksize(inode);
        blocks_per_page = PAGE_SIZE / blocksize;
 
        groups_per_page = blocks_per_page >> 1;
index 6fc14de..578f8c3 100644 (file)
@@ -187,7 +187,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
        if (PageUptodate(page))
                return 0;
 
-       blocksize = 1 << inode->i_blkbits;
+       blocksize = i_blocksize(inode);
        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
 
index a3ec3ae..482081b 100644 (file)
@@ -38,7 +38,7 @@ static int hfs_get_last_session(struct super_block *sb,
 
        /* default values */
        *start = 0;
-       *size = sb->s_bdev->bd_inode->i_size >> 9;
+       *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
 
        if (HFS_SB(sb)->session >= 0) {
                te.cdte_track = HFS_SB(sb)->session;
index ebb85e5..e254fa0 100644 (file)
@@ -132,7 +132,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 
        /* default values */
        *start = 0;
-       *size = sb->s_bdev->bd_inode->i_size >> 9;
+       *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
 
        if (HFSPLUS_SB(sb)->session >= 0) {
                te.cdte_track = HFSPLUS_SB(sb)->session;
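
Both HFS hunks swap a raw i_size dereference for i_size_read(). On 32-bit SMP kernels loff_t is 64 bits wide and cannot be loaded atomically, so i_size_read() brackets the access with a seqcount retry loop to avoid torn reads. The call-site pattern, as a sketch (the helper name is hypothetical):

    /* device size in 512-byte sectors, safe against concurrent
     * i_size updates on 32-bit SMP */
    static loff_t bdev_sectors(struct block_device *bdev)
    {
            return i_size_read(bdev->bd_inode) >> 9;
    }
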
index d209f42..0f85f24 100644 (file)
@@ -420,8 +420,8 @@ int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
                const struct iomap_ops *ops)
 {
-       unsigned blocksize = (1 << inode->i_blkbits);
-       unsigned off = pos & (blocksize - 1);
+       unsigned int blocksize = i_blocksize(inode);
+       unsigned int off = pos & (blocksize - 1);
 
        /* Block boundary? Nothing to do */
        if (!off)
@@ -735,9 +735,9 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
                void *data, struct iomap *iomap)
 {
        struct iomap_dio *dio = data;
-       unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
-       unsigned fs_block_size = (1 << inode->i_blkbits), pad;
-       unsigned align = iov_iter_alignment(dio->submit.iter);
+       unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+       unsigned int fs_block_size = i_blocksize(inode), pad;
+       unsigned int align = iov_iter_alignment(dio->submit.iter);
        struct iov_iter iter;
        struct bio *bio;
        bool need_zeroout = false;
index 2be7c9c..c64c257 100644 (file)
@@ -758,7 +758,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
                                sb->s_blocksize - offset : toread;
 
                tmp_bh.b_state = 0;
-               tmp_bh.b_size = 1 << inode->i_blkbits;
+               tmp_bh.b_size = i_blocksize(inode);
                err = jfs_get_block(inode, blk, &tmp_bh, 0);
                if (err)
                        return err;
@@ -798,7 +798,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
                                sb->s_blocksize - offset : towrite;
 
                tmp_bh.b_state = 0;
-               tmp_bh.b_size = 1 << inode->i_blkbits;
+               tmp_bh.b_size = i_blocksize(inode);
                err = jfs_get_block(inode, blk, &tmp_bh, 1);
                if (err)
                        goto out;
index 439b946..db5900a 100644 (file)
@@ -478,7 +478,7 @@ static void kernfs_drain(struct kernfs_node *kn)
                rwsem_release(&kn->dep_map, 1, _RET_IP_);
        }
 
-       kernfs_unmap_bin_file(kn);
+       kernfs_drain_open_files(kn);
 
        mutex_lock(&kernfs_mutex);
 }
index 4f05358..35043a8 100644 (file)
@@ -515,7 +515,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
                goto out_put;
 
        rc = 0;
-       of->mmapped = 1;
+       of->mmapped = true;
        of->vm_ops = vma->vm_ops;
        vma->vm_ops = &kernfs_vm_ops;
 out_put:
@@ -707,7 +707,8 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
        if (error)
                goto err_free;
 
-       ((struct seq_file *)file->private_data)->private = of;
+       of->seq_file = file->private_data;
+       of->seq_file->private = of;
 
        /* seq_file clears PWRITE unconditionally, restore it if WRITE */
        if (file->f_mode & FMODE_WRITE)
@@ -716,13 +717,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
        /* make sure we have open node struct */
        error = kernfs_get_open_node(kn, of);
        if (error)
-               goto err_close;
+               goto err_seq_release;
+
+       if (ops->open) {
+               /* nobody has access to @of yet, skip @of->mutex */
+               error = ops->open(of);
+               if (error)
+                       goto err_put_node;
+       }
 
        /* open succeeded, put active references */
        kernfs_put_active(kn);
        return 0;
 
-err_close:
+err_put_node:
+       kernfs_put_open_node(kn, of);
+err_seq_release:
        seq_release(inode, file);
 err_free:
        kfree(of->prealloc_buf);
@@ -732,11 +742,41 @@ err_out:
        return error;
 }
 
+/* used from release/drain to ensure that ->release() is called exactly once */
+static void kernfs_release_file(struct kernfs_node *kn,
+                               struct kernfs_open_file *of)
+{
+       /*
+        * @of is guaranteed to have no other file operations in flight and
+        * we just want to synchronize release and drain paths.
+        * @kernfs_open_file_mutex is enough.  @of->mutex can't be used
+        * here because drain path may be called from places which can
+        * cause circular dependency.
+        */
+       lockdep_assert_held(&kernfs_open_file_mutex);
+
+       if (!of->released) {
+               /*
+                * A file is never detached without being released and we
+                * need to be able to release files which are deactivated
+                * and being drained.  Don't use kernfs_ops().
+                */
+               kn->attr.ops->release(of);
+               of->released = true;
+       }
+}
+
 static int kernfs_fop_release(struct inode *inode, struct file *filp)
 {
        struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
        struct kernfs_open_file *of = kernfs_of(filp);
 
+       if (kn->flags & KERNFS_HAS_RELEASE) {
+               mutex_lock(&kernfs_open_file_mutex);
+               kernfs_release_file(kn, of);
+               mutex_unlock(&kernfs_open_file_mutex);
+       }
+
        kernfs_put_open_node(kn, of);
        seq_release(inode, filp);
        kfree(of->prealloc_buf);
@@ -745,12 +785,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
        return 0;
 }
 
-void kernfs_unmap_bin_file(struct kernfs_node *kn)
+void kernfs_drain_open_files(struct kernfs_node *kn)
 {
        struct kernfs_open_node *on;
        struct kernfs_open_file *of;
 
-       if (!(kn->flags & KERNFS_HAS_MMAP))
+       if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
                return;
 
        spin_lock_irq(&kernfs_open_node_lock);
@@ -762,10 +802,16 @@ void kernfs_unmap_bin_file(struct kernfs_node *kn)
                return;
 
        mutex_lock(&kernfs_open_file_mutex);
+
        list_for_each_entry(of, &on->files, list) {
                struct inode *inode = file_inode(of->file);
-               unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+               if (kn->flags & KERNFS_HAS_MMAP)
+                       unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+               kernfs_release_file(kn, of);
        }
+
        mutex_unlock(&kernfs_open_file_mutex);
 
        kernfs_put_open_node(kn, NULL);
@@ -964,6 +1010,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                kn->flags |= KERNFS_HAS_SEQ_SHOW;
        if (ops->mmap)
                kn->flags |= KERNFS_HAS_MMAP;
+       if (ops->release)
+               kn->flags |= KERNFS_HAS_RELEASE;
 
        rc = kernfs_add_one(kn);
        if (rc) {
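
Taken together, the kernfs changes above give attribute implementations a paired ->open()/->release(), with ->release() guaranteed to run exactly once, at either close or drain time. A hypothetical consumer (my_state and my_seq_show are illustrative only):

    struct my_state { int generation; };

    static int my_open(struct kernfs_open_file *of)
    {
            of->priv = kzalloc(sizeof(struct my_state), GFP_KERNEL);
            return of->priv ? 0 : -ENOMEM;
    }

    static void my_release(struct kernfs_open_file *of)
    {
            kfree(of->priv);    /* runs once even if the node is drained */
    }

    static const struct kernfs_ops my_ops = {
            .open     = my_open,
            .release  = my_release,
            .seq_show = my_seq_show,
    };
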
index bfd551b..3100987 100644 (file)
@@ -104,7 +104,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
  */
 extern const struct file_operations kernfs_file_fops;
 
-void kernfs_unmap_bin_file(struct kernfs_node *kn);
+void kernfs_drain_open_files(struct kernfs_node *kn);
 
 /*
  * symlink.c
index 1c13dd8..7e4ea3b 100644 (file)
@@ -322,6 +322,8 @@ static int lockd_inet6addr_event(struct notifier_block *this,
                dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
                sin6.sin6_family = AF_INET6;
                sin6.sin6_addr = ifa->addr;
+               if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+                       sin6.sin6_scope_id = ifa->idev->dev->ifindex;
                svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
                        (struct sockaddr *)&sin6);
        }
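
A link-local IPv6 address (fe80::/10) is only meaningful relative to an interface, so without sin6_scope_id the kernel cannot tell which link the lockd transport belongs to. The same requirement exists in userspace (interface name "eth0" below is an assumption):

    #include <arpa/inet.h>
    #include <net/if.h>
    #include <stdio.h>

    int main(void)
    {
            struct sockaddr_in6 dst = { .sin6_family = AF_INET6 };

            inet_pton(AF_INET6, "fe80::1", &dst.sin6_addr);
            /* without this, connecting to a link-local peer is ambiguous */
            dst.sin6_scope_id = if_nametoindex("eth0");
            printf("scope id %u\n", dst.sin6_scope_id);
            return 0;
    }
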
index 28af984..baff8f8 100644 (file)
@@ -115,7 +115,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
                        SetPageUptodate(page);    
                        return;
                }
-               create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+               create_empty_buffers(page, i_blocksize(inode), 0);
        }
        head = page_buffers(page);
        page_bh = head;
index f32f272..97b111d 100644 (file)
@@ -525,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len)
                return result;
        }
        if (result > len) {
-               pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len);
+               pr_err("tcp: bug in recvmsg (%u > %zu)\n", result, len);
                return -EIO;                    
        }
        return result;
@@ -619,7 +619,7 @@ skipdata:;
                                        goto skipdata2;
                                }
                                if (datalen > req->datalen + 8) {
-                                       pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
+                                       pr_err("tcp: Unexpected reply len %d (expected at most %zd)\n", datalen, req->datalen + 8);
                                        server->rcv.state = 3;
                                        goto skipdata;
                                }
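
This and the NFS hunks that follow are all the same cleanup: %Z was a glibc-only length modifier predating C99, while %z is the standard one for size_t/ssize_t. For example:

    #include <stdio.h>

    int main(void)
    {
            size_t len = 42;

            printf("len = %zu\n", len);     /* C99; "%Zu" was a GNU-ism */
            return 0;
    }
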
index 2905479..0ca370d 100644 (file)
@@ -381,7 +381,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
        struct blk_plug plug;
        int i;
 
-       dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
+       dprintk("%s enter, %zu@%lld\n", __func__, count, offset);
 
        /* At this point, header->page_array is a (sequential) list of nfs_pages.
         * We want to write each, and if there is an error set pnfs_error
index eb094c6..fd0284c 100644 (file)
@@ -1083,7 +1083,8 @@ struct svc_version nfs4_callback_version1 = {
        .vs_proc = nfs4_callback_procedures1,
        .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
        .vs_dispatch = NULL,
-       .vs_hidden = 1,
+       .vs_hidden = true,
+       .vs_need_cong_ctrl = true,
 };
 
 struct svc_version nfs4_callback_version4 = {
@@ -1092,5 +1093,6 @@ struct svc_version nfs4_callback_version4 = {
        .vs_proc = nfs4_callback_procedures1,
        .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
        .vs_dispatch = NULL,
-       .vs_hidden = 1,
+       .vs_hidden = true,
+       .vs_need_cong_ctrl = true,
 };
index a3fc48b..18f98e0 100644 (file)
@@ -482,7 +482,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
        u32 j, idx;
        struct nfs_fh *fh;
 
-       dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+       dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
                __func__, hdr->inode->i_ino,
                hdr->args.pgbase, (size_t)hdr->args.count, offset);
 
@@ -540,7 +540,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
        if (IS_ERR(ds_clnt))
                return PNFS_NOT_ATTEMPTED;
 
-       dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
+       dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n",
                __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
                offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
 
index 0ca4af8..d6acc68 100644 (file)
@@ -1751,7 +1751,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        int vers;
        struct nfs_fh *fh;
 
-       dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+       dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
                __func__, hdr->inode->i_ino,
                hdr->args.pgbase, (size_t)hdr->args.count, offset);
 
@@ -1828,7 +1828,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 
        vers = nfs4_ff_layout_ds_version(lseg, idx);
 
-       dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
+       dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
                __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
                offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
                vers);
index 2a4cdce..8f3d2ac 100644 (file)
@@ -291,7 +291,7 @@ objlayout_read_pagelist(struct nfs_pgio_header *hdr)
                              &hdr->args.pgbase,
                              hdr->args.offset, hdr->args.count);
 
-       dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
+       dprintk("%s: inode(%lx) offset 0x%llx count 0x%zx eof=%d\n",
                __func__, inode->i_ino, offset, count, hdr->res.eof);
 
        err = objio_read_pagelist(hdr);
index a06115e..92b4b41 100644 (file)
@@ -24,7 +24,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
 {
        struct nfsd4_layout_seg *seg = &args->lg_seg;
        struct super_block *sb = inode->i_sb;
-       u32 block_size = (1 << inode->i_blkbits);
+       u32 block_size = i_blocksize(inode);
        struct pnfs_block_extent *bex;
        struct iomap iomap;
        u32 device_generation = 0;
@@ -181,7 +181,7 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
        int nr_iomaps;
 
        nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
-                       lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+                       lcp->lc_up_len, &iomaps, i_blocksize(inode));
        if (nr_iomaps < 0)
                return nfserrno(nr_iomaps);
 
@@ -375,7 +375,7 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
        int nr_iomaps;
 
        nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
-                       lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+                       lcp->lc_up_len, &iomaps, i_blocksize(inode));
        if (nr_iomaps < 0)
                return nfserrno(nr_iomaps);
 
index 43e109c..e71f11b 100644 (file)
@@ -1102,6 +1102,7 @@ static struct flags {
        { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
        { NFSEXP_V4ROOT, {"v4root", ""}},
        { NFSEXP_PNFS, {"pnfs", ""}},
+       { NFSEXP_SECURITY_LABEL, {"security_label", ""}},
        { 0, {"", ""}}
 };
 
index d08cd88..838f90f 100644 (file)
@@ -376,5 +376,4 @@ struct svc_version  nfsd_acl_version2 = {
                .vs_proc        = nfsd_acl_procedures2,
                .vs_dispatch    = nfsd_dispatch,
                .vs_xdrsize     = NFS3_SVC_XDRSIZE,
-               .vs_hidden      = 0,
 };
index 0c89034..dcb5f79 100644 (file)
@@ -266,6 +266,5 @@ struct svc_version  nfsd_acl_version3 = {
                .vs_proc        = nfsd_acl_procedures3,
                .vs_dispatch    = nfsd_dispatch,
                .vs_xdrsize     = NFS3_SVC_XDRSIZE,
-               .vs_hidden      = 0,
 };
 
index d818e4f..045c908 100644 (file)
@@ -193,11 +193,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 
        fh_copy(&resp->fh, &argp->fh);
        resp->committed = argp->stable;
-       nfserr = nfsd_write(rqstp, &resp->fh, NULL,
-                                  argp->offset,
-                                  rqstp->rq_vec, argp->vlen,
-                                  &cnt,
-                                  &resp->committed);
+       nfserr = nfsd_write(rqstp, &resp->fh, argp->offset,
+                               rqstp->rq_vec, argp->vlen,
+                               &cnt, resp->committed);
        resp->count = cnt;
        RETURN_STATUS(nfserr);
 }
index eb78109..0274db6 100644 (file)
@@ -303,6 +303,7 @@ static int decode_cb_compound4res(struct xdr_stream *xdr,
        p = xdr_inline_decode(xdr, length + 4);
        if (unlikely(p == NULL))
                goto out_overflow;
+       p += XDR_QUADLEN(length);
        hdr->nops = be32_to_cpup(p);
        return 0;
 out_overflow:
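
The one-line addition above fixes a real decode bug: the COMPOUND reply header is status, then a variable-length tag, then nops, and the old code read nops from the first word of the tag whenever the tag was non-empty. Annotated (comments are mine):

    p = xdr_inline_decode(xdr, length + 4); /* tag bytes + 4-byte nops */
    if (unlikely(p == NULL))
            goto out_overflow;
    p += XDR_QUADLEN(length);       /* step over the tag, in 4-byte words */
    hdr->nops = be32_to_cpup(p);    /* p now really points at nops */
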
@@ -396,13 +397,10 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
                                    struct nfsd4_callback *cb)
 {
        struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
-       struct nfs4_sessionid id;
-       int status;
+       int status = -ESERVERFAULT;
        __be32 *p;
        u32 dummy;
 
-       status = -ESERVERFAULT;
-
        /*
         * If the server returns different values for sessionID, slotID or
         * sequence number, the server is looney tunes.
@@ -410,9 +408,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
        p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
        if (unlikely(p == NULL))
                goto out_overflow;
-       memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
-       if (memcmp(id.data, session->se_sessionid.data,
-                                       NFS4_MAX_SESSIONID_LEN) != 0) {
+
+       if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
                dprintk("NFS: %s Invalid session id\n", __func__);
                goto out;
        }
@@ -753,6 +750,14 @@ int set_callback_cred(void)
        return 0;
 }
 
+void cleanup_callback_cred(void)
+{
+       if (callback_cred) {
+               put_rpccred(callback_cred);
+               callback_cred = NULL;
+       }
+}
+
 static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
 {
        if (clp->cl_minorversion == 0) {
index 5b20577..6b9b6cc 100644 (file)
@@ -628,6 +628,10 @@ nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 {
        __be32 status;
        u32 id = -1;
+
+       if (name == NULL || namelen == 0)
+               return nfserr_inval;
+
        status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
        *uid = make_kuid(&init_user_ns, id);
        if (!uid_valid(*uid))
@@ -641,6 +645,10 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 {
        __be32 status;
        u32 id = -1;
+
+       if (name == NULL || namelen == 0)
+               return nfserr_inval;
+
        status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
        *gid = make_kgid(&init_user_ns, id);
        if (!gid_valid(*gid))
index 74a6e57..cbeeda1 100644 (file)
@@ -95,11 +95,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                   u32 *bmval, u32 *writable)
 {
        struct dentry *dentry = cstate->current_fh.fh_dentry;
+       struct svc_export *exp = cstate->current_fh.fh_export;
 
        if (!nfsd_attrs_supported(cstate->minorversion, bmval))
                return nfserr_attrnotsupp;
        if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry)))
                return nfserr_attrnotsupp;
+       if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) &&
+                       !(exp->ex_flags & NFSEXP_SECURITY_LABEL))
+               return nfserr_attrnotsupp;
        if (writable && !bmval_is_subset(bmval, writable))
                return nfserr_inval;
        if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) &&
@@ -983,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
        status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
                                write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
-                               &write->wr_how_written);
+                               write->wr_how_written);
        fput(filp);
 
        write->wr_bytes_written = cnt;
@@ -1838,6 +1842,12 @@ static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd
        return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
 }
 
+static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       /* ac_supported, ac_resp_access */
+       return (op_encode_hdr_size + 2)* sizeof(__be32);
+}
+
 static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
@@ -1892,6 +1902,11 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
        return ret;
 }
 
+static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
+}
+
 static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + op_encode_change_info_maxsz)
@@ -1933,6 +1948,11 @@ static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o
                XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE;
+}
+
 static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + op_encode_change_info_maxsz)
@@ -1952,11 +1972,23 @@ static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
                + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids)
+               * sizeof(__be32);
+}
+
 static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR *
+               (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32);
+}
+
 static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
        return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
@@ -2011,6 +2043,19 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 }
 
 #ifdef CONFIG_NFSD_PNFS
+static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       u32 maxcount = 0, rlen = 0;
+
+       maxcount = svc_max_payload(rqstp);
+       rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
+
+       return (op_encode_hdr_size +
+               1 /* gd_layout_type*/ +
+               XDR_QUADLEN(rlen) +
+               2 /* gd_notify_types */) * sizeof(__be32);
+}
+
 /*
  * At this stage we don't really know what layout driver will handle the request,
  * so we need to define an arbitrary upper bound here.
@@ -2040,10 +2085,17 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_
 }
 #endif /* CONFIG_NFSD_PNFS */
 
+
+static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+       return (op_encode_hdr_size + 3) * sizeof(__be32);
+}
+
 static struct nfsd4_operation nfsd4_ops[] = {
        [OP_ACCESS] = {
                .op_func = (nfsd4op_func)nfsd4_access,
                .op_name = "OP_ACCESS",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_access_rsize,
        },
        [OP_CLOSE] = {
                .op_func = (nfsd4op_func)nfsd4_close,
@@ -2081,6 +2133,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_GETFH] = {
                .op_func = (nfsd4op_func)nfsd4_getfh,
                .op_name = "OP_GETFH",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_getfh_rsize,
        },
        [OP_LINK] = {
                .op_func = (nfsd4op_func)nfsd4_link,
@@ -2099,6 +2152,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_LOCKT] = {
                .op_func = (nfsd4op_func)nfsd4_lockt,
                .op_name = "OP_LOCKT",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
        },
        [OP_LOCKU] = {
                .op_func = (nfsd4op_func)nfsd4_locku,
@@ -2111,15 +2165,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_lookup,
                .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
                .op_name = "OP_LOOKUP",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
        [OP_LOOKUPP] = {
                .op_func = (nfsd4op_func)nfsd4_lookupp,
                .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
                .op_name = "OP_LOOKUPP",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
        [OP_NVERIFY] = {
                .op_func = (nfsd4op_func)nfsd4_nverify,
                .op_name = "OP_NVERIFY",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
        [OP_OPEN] = {
                .op_func = (nfsd4op_func)nfsd4_open,
@@ -2177,6 +2234,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_READLINK] = {
                .op_func = (nfsd4op_func)nfsd4_readlink,
                .op_name = "OP_READLINK",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_readlink_rsize,
        },
        [OP_REMOVE] = {
                .op_func = (nfsd4op_func)nfsd4_remove,
@@ -2215,6 +2273,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_secinfo,
                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_SECINFO",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize,
        },
        [OP_SETATTR] = {
                .op_func = (nfsd4op_func)nfsd4_setattr,
@@ -2240,6 +2299,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_VERIFY] = {
                .op_func = (nfsd4op_func)nfsd4_verify,
                .op_name = "OP_VERIFY",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
        [OP_WRITE] = {
                .op_func = (nfsd4op_func)nfsd4_write,
@@ -2314,11 +2374,13 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_SECINFO_NO_NAME",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize,
        },
        [OP_TEST_STATEID] = {
                .op_func = (nfsd4op_func)nfsd4_test_stateid,
                .op_flags = ALLOWED_WITHOUT_FH,
                .op_name = "OP_TEST_STATEID",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_test_stateid_rsize,
        },
        [OP_FREE_STATEID] = {
                .op_func = (nfsd4op_func)nfsd4_free_stateid,
@@ -2332,6 +2394,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
                .op_flags = ALLOWED_WITHOUT_FH,
                .op_name = "OP_GETDEVICEINFO",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_getdeviceinfo_rsize,
        },
        [OP_LAYOUTGET] = {
                .op_func = (nfsd4op_func)nfsd4_layoutget,
@@ -2381,6 +2444,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        [OP_SEEK] = {
                .op_func = (nfsd4op_func)nfsd4_seek,
                .op_name = "OP_SEEK",
+               .op_rsize_bop = (nfsd4op_rsize)nfsd4_seek_rsize,
        },
 };
 
@@ -2425,14 +2489,11 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
 
 int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
-       struct nfsd4_operation *opdesc;
-       nfsd4op_rsize estimator;
-
        if (op->opnum == OP_ILLEGAL)
                return op_encode_hdr_size * sizeof(__be32);
-       opdesc = OPDESC(op);
-       estimator = opdesc->op_rsize_bop;
-       return estimator ? estimator(rqstp, op) : PAGE_SIZE;
+
+       BUG_ON(OPDESC(op)->op_rsize_bop == NULL);
+       return OPDESC(op)->op_rsize_bop(rqstp, op);
 }
 
 void warn_on_nonidempotent_op(struct nfsd4_op *op)
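
With every operation now carrying an op_rsize_bop estimator, nfsd4_max_reply() can BUG on a missing one instead of silently falling back to PAGE_SIZE. The estimators are plain word counts; for instance nfsd4_access_rsize above works out as below (op_encode_hdr_size = 2, opcode plus status, is my assumption):

    #include <stdio.h>

    #define op_encode_hdr_size 2U   /* assumed: opcode + status words */

    int main(void)
    {
            /* ACCESS reply body: ac_supported + ac_resp_access */
            unsigned int words = op_encode_hdr_size + 2;

            printf("max ACCESS reply: %u bytes\n", words * 4); /* __be32 */
            return 0;       /* prints 16 bytes */
    }
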
@@ -2476,12 +2537,13 @@ static struct svc_procedure             nfsd_procedures4[2] = {
 };
 
 struct svc_version     nfsd_version4 = {
-               .vs_vers        = 4,
-               .vs_nproc       = 2,
-               .vs_proc        = nfsd_procedures4,
-               .vs_dispatch    = nfsd_dispatch,
-               .vs_xdrsize     = NFS4_SVC_XDRSIZE,
-               .vs_rpcb_optnl  = 1,
+       .vs_vers                = 4,
+       .vs_nproc               = 2,
+       .vs_proc                = nfsd_procedures4,
+       .vs_dispatch            = nfsd_dispatch,
+       .vs_xdrsize             = NFS4_SVC_XDRSIZE,
+       .vs_rpcb_optnl          = true,
+       .vs_need_cong_ctrl      = true,
 };
 
 /*
index a0dee8a..e9ef50a 100644 (file)
@@ -2281,7 +2281,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
 out_err:
        conn->cb_addr.ss_family = AF_UNSPEC;
        conn->cb_addrlen = 0;
-       dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
+       dprintk("NFSD: this client (clientid %08x/%08x) "
                "will not receive delegations\n",
                clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
 
@@ -7012,23 +7012,24 @@ nfs4_state_start(void)
 
        ret = set_callback_cred();
        if (ret)
-               return -ENOMEM;
+               return ret;
+
        laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
        if (laundry_wq == NULL) {
                ret = -ENOMEM;
-               goto out_recovery;
+               goto out_cleanup_cred;
        }
        ret = nfsd4_create_callback_queue();
        if (ret)
                goto out_free_laundry;
 
        set_max_delegations();
-
        return 0;
 
 out_free_laundry:
        destroy_workqueue(laundry_wq);
-out_recovery:
+out_cleanup_cred:
+       cleanup_callback_cred();
        return ret;
 }
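
The reworked error path restores the usual unwind idiom: undo in reverse order of acquisition, and only what was actually acquired (the old code leaked the callback cred on every failure past set_callback_cred()). A compilable miniature of the same shape:

    #include <stdlib.h>

    static void *cred, *wq;         /* stand-ins for the two resources */

    static int state_start(void)
    {
            cred = malloc(1);
            if (!cred)
                    return -1;
            wq = malloc(1);
            if (!wq)
                    goto out_cleanup_cred;
            return 0;               /* success: both stay live */

    out_cleanup_cred:
            free(cred);
            cred = NULL;
            return -1;
    }

    int main(void) { return state_start() ? 1 : 0; }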
 
@@ -7086,6 +7087,7 @@ nfs4_state_shutdown(void)
 {
        destroy_workqueue(laundry_wq);
        nfsd4_destroy_callback_queue();
+       cleanup_callback_cred();
 }
 
 static void
index 8fae53c..382c1fd 100644 (file)
@@ -58,7 +58,7 @@
 
 #define NFSDDBG_FACILITY               NFSDDBG_XDR
 
-u32 nfsd_suppattrs[3][3] = {
+const u32 nfsd_suppattrs[3][3] = {
        {NFSD4_SUPPORTED_ATTRS_WORD0,
         NFSD4_SUPPORTED_ATTRS_WORD1,
         NFSD4_SUPPORTED_ATTRS_WORD2},
@@ -1250,7 +1250,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
        READ_BUF(16);
        p = xdr_decode_hyper(p, &write->wr_offset);
        write->wr_stable_how = be32_to_cpup(p++);
-       if (write->wr_stable_how > 2)
+       if (write->wr_stable_how > NFS_FILE_SYNC)
                goto xdr_error;
        write->wr_buflen = be32_to_cpup(p++);
 
@@ -1941,12 +1941,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
                } else
                        max_reply += nfsd4_max_reply(argp->rqstp, op);
                /*
-                * OP_LOCK may return a conflicting lock.  (Special case
-                * because it will just skip encoding this if it runs
-                * out of xdr buffer space, and it is the only operation
-                * that behaves this way.)
+                * OP_LOCK and OP_LOCKT may return a conflicting lock.
+                * (Special case because it will just skip encoding this
+                * if it runs out of xdr buffer space, and it is the only
+                * operation that behaves this way.)
                 */
-               if (op->opnum == OP_LOCK)
+               if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT)
                        max_reply += NFS4_OPAQUE_LIMIT;
 
                if (op->status) {
@@ -1966,9 +1966,13 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
        DECODE_TAIL;
 }
 
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode)
+static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
+                            struct svc_export *exp)
 {
-       if (IS_I_VERSION(inode)) {
+       if (exp->ex_flags & NFSEXP_V4ROOT) {
+               *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
+               *p++ = 0;
+       } else if (IS_I_VERSION(inode)) {
                p = xdr_encode_hyper(p, inode->i_version);
        } else {
                *p++ = cpu_to_be32(stat->ctime.tv_sec);
@@ -2417,8 +2421,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
        if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
             bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
-               err = security_inode_getsecctx(d_inode(dentry),
+               if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
+                       err = security_inode_getsecctx(d_inode(dentry),
                                                &context, &contextlen);
+               else
+                       err = -EOPNOTSUPP;
                contextsupport = (err == 0);
                if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
                        if (err == -EOPNOTSUPP)
@@ -2490,7 +2497,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
                p = xdr_reserve_space(xdr, 8);
                if (!p)
                        goto out_resource;
-               p = encode_change(p, &stat, d_inode(dentry));
+               p = encode_change(p, &stat, d_inode(dentry), exp);
        }
        if (bmval0 & FATTR4_WORD0_SIZE) {
                p = xdr_reserve_space(xdr, 8);
index d6b97b4..96fd159 100644 (file)
@@ -578,7 +578,7 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
        struct kvec     *vec = &rqstp->rq_res.head[0];
 
        if (vec->iov_len + data->iov_len > PAGE_SIZE) {
-               printk(KERN_WARNING "nfsd: cached reply too large (%Zd).\n",
+               printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n",
                                data->iov_len);
                return 0;
        }
index f3b2f34..73e75ac 100644 (file)
@@ -536,6 +536,19 @@ out_free:
        return rv;
 }
 
+static ssize_t
+nfsd_print_version_support(char *buf, int remaining, const char *sep,
+               unsigned vers, unsigned minor)
+{
+       const char *format = (minor == 0) ? "%s%c%u" : "%s%c%u.%u";
+       bool supported = !!nfsd_vers(vers, NFSD_TEST);
+
+       if (vers == 4 && !nfsd_minorversion(minor, NFSD_TEST))
+               supported = false;
+       return snprintf(buf, remaining, format, sep,
+                       supported ? '+' : '-', vers, minor);
+}
+
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
        char *mesg = buf;
@@ -561,6 +574,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
                len = qword_get(&mesg, vers, size);
                if (len <= 0) return -EINVAL;
                do {
+                       enum vers_op cmd;
                        sign = *vers;
                        if (sign == '+' || sign == '-')
                                num = simple_strtol((vers+1), &minorp, 0);
@@ -569,24 +583,22 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
                        if (*minorp == '.') {
                                if (num != 4)
                                        return -EINVAL;
-                               minor = simple_strtoul(minorp+1, NULL, 0);
-                               if (minor == 0)
-                                       return -EINVAL;
-                               if (nfsd_minorversion(minor, sign == '-' ?
-                                                    NFSD_CLEAR : NFSD_SET) < 0)
+                               if (kstrtouint(minorp+1, 0, &minor) < 0)
                                        return -EINVAL;
-                               goto next;
-                       }
+                       } else
+                               minor = 0;
+                       cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET;
                        switch(num) {
                        case 2:
                        case 3:
-                       case 4:
-                               nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET);
+                               nfsd_vers(num, cmd);
                                break;
+                       case 4:
+                               if (nfsd_minorversion(minor, cmd) >= 0)
+                                       break;
                        default:
                                return -EINVAL;
                        }
-               next:
                        vers += len + 1;
                } while ((len = qword_get(&mesg, vers, size)) > 0);
                /* If all get turned off, turn them back on, as
@@ -599,35 +611,23 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
        len = 0;
        sep = "";
        remaining = SIMPLE_TRANSACTION_LIMIT;
-       for (num=2 ; num <= 4 ; num++)
-               if (nfsd_vers(num, NFSD_AVAIL)) {
-                       len = snprintf(buf, remaining, "%s%c%d", sep,
-                                      nfsd_vers(num, NFSD_TEST)?'+':'-',
-                                      num);
-                       sep = " ";
-
-                       if (len >= remaining)
-                               break;
-                       remaining -= len;
-                       buf += len;
-                       tlen += len;
-               }
-       if (nfsd_vers(4, NFSD_AVAIL))
-               for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION;
-                    minor++) {
-                       len = snprintf(buf, remaining, " %c4.%u",
-                                       (nfsd_vers(4, NFSD_TEST) &&
-                                        nfsd_minorversion(minor, NFSD_TEST)) ?
-                                               '+' : '-',
-                                       minor);
-
+       for (num=2 ; num <= 4 ; num++) {
+               if (!nfsd_vers(num, NFSD_AVAIL))
+                       continue;
+               minor = 0;
+               do {
+                       len = nfsd_print_version_support(buf, remaining,
+                                       sep, num, minor);
                        if (len >= remaining)
-                               break;
+                               goto out;
                        remaining -= len;
                        buf += len;
                        tlen += len;
-               }
-
+                       minor++;
+                       sep = " ";
+               } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION);
+       }
+out:
        len = snprintf(buf, remaining, "\n");
        if (len >= remaining)
                return -EINVAL;
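
For reference, a worked example (not from the patch) of what the rewritten read side emits, assuming NFSD_SUPPORTED_MINOR_VERSION == 2 with v2/v3/v4/v4.1 enabled and v4.2 disabled:

        +2 +3 +4 +4.1 -4.2

nfsd_print_version_support() uses "%s%c%u" for minor 0 (hence a plain "+4") and "%s%c%u.%u" for the v4 minors, with a single-space separator after the first entry.
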
index d74c8c4..d966068 100644 (file)
@@ -362,16 +362,16 @@ void              nfsd_lockd_shutdown(void);
        FATTR4_WORD2_MODE_UMASK | \
        NFSD4_2_SECURITY_ATTRS)
 
-extern u32 nfsd_suppattrs[3][3];
+extern const u32 nfsd_suppattrs[3][3];
 
-static inline bool bmval_is_subset(u32 *bm1, u32 *bm2)
+static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2)
 {
        return !((bm1[0] & ~bm2[0]) ||
                 (bm1[1] & ~bm2[1]) ||
                 (bm1[2] & ~bm2[2]));
 }
 
-static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval)
+static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
 {
        return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]);
 }
index 010aff5..fa82b77 100644 (file)
@@ -204,18 +204,14 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
                                        struct nfsd_attrstat  *resp)
 {
        __be32  nfserr;
-       int     stable = 1;
        unsigned long cnt = argp->len;
 
        dprintk("nfsd: WRITE    %s %d bytes at %d\n",
                SVCFH_fmt(&argp->fh),
                argp->len, argp->offset);
 
-       nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
-                                  argp->offset,
-                                  rqstp->rq_vec, argp->vlen,
-                                  &cnt,
-                                  &stable);
+       nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset,
+                               rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC);
        return nfsd_return_attrs(nfserr, resp);
 }
 
index e6bfd96..efd66da 100644 (file)
@@ -153,6 +153,18 @@ int nfsd_vers(int vers, enum vers_op change)
        return 0;
 }
 
+static void
+nfsd_adjust_nfsd_versions4(void)
+{
+       unsigned i;
+
+       for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) {
+               if (nfsd_supported_minorversions[i])
+                       return;
+       }
+       nfsd_vers(4, NFSD_CLEAR);
+}
+
 int nfsd_minorversion(u32 minorversion, enum vers_op change)
 {
        if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
@@ -160,9 +172,11 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
        switch(change) {
        case NFSD_SET:
                nfsd_supported_minorversions[minorversion] = true;
+               nfsd_vers(4, NFSD_SET);
                break;
        case NFSD_CLEAR:
                nfsd_supported_minorversions[minorversion] = false;
+               nfsd_adjust_nfsd_versions4();
                break;
        case NFSD_TEST:
                return nfsd_supported_minorversions[minorversion];
@@ -354,6 +368,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
                dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
                sin6.sin6_family = AF_INET6;
                sin6.sin6_addr = ifa->addr;
+               if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+                       sin6.sin6_scope_id = ifa->idev->dev->ifindex;
                svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
        }
 
index 4516e8b..005c911 100644 (file)
@@ -615,6 +615,7 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
                struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
 extern int set_callback_cred(void);
+extern void cleanup_callback_cred(void);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
index 26c6fdb..19d50f6 100644 (file)
@@ -377,7 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
        __be32          err;
        int             host_err;
        bool            get_write_count;
-       int             size_change = 0;
+       bool            size_change = (iap->ia_valid & ATTR_SIZE);
 
        if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
                accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
@@ -390,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
        /* Get inode */
        err = fh_verify(rqstp, fhp, ftype, accmode);
        if (err)
-               goto out;
+               return err;
        if (get_write_count) {
                host_err = fh_want_write(fhp);
                if (host_err)
-                       return nfserrno(host_err);
+                       goto out;
        }
 
        dentry = fhp->fh_dentry;
@@ -405,20 +405,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                iap->ia_valid &= ~ATTR_MODE;
 
        if (!iap->ia_valid)
-               goto out;
+               return 0;
 
        nfsd_sanitize_attrs(inode, iap);
 
+       if (check_guard && guardtime != inode->i_ctime.tv_sec)
+               return nfserr_notsync;
+
        /*
         * The size case is special, it changes the file in addition to the
-        * attributes.
+        * attributes, and file systems don't expect it to be mixed with
+        * "random" attribute changes.  We thus split out the size change
+        * into a separate call to ->setattr, and handle the remaining
+        * attributes in a second setattr call.
         */
-       if (iap->ia_valid & ATTR_SIZE) {
+       if (size_change) {
                err = nfsd_get_write_access(rqstp, fhp, iap);
                if (err)
-                       goto out;
-               size_change = 1;
+                       return err;
+       }
 
+       fh_lock(fhp);
+       if (size_change) {
                /*
                 * RFC5661, Section 18.30.4:
                 *   Changing the size of a file with SETATTR indirectly
@@ -426,29 +434,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                 *
                 * (and similar for the older RFCs)
                 */
-               if (iap->ia_size != i_size_read(inode))
-                       iap->ia_valid |= ATTR_MTIME;
-       }
+               struct iattr size_attr = {
+                       .ia_valid       = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+                       .ia_size        = iap->ia_size,
+               };
 
-       iap->ia_valid |= ATTR_CTIME;
+               host_err = notify_change(dentry, &size_attr, NULL);
+               if (host_err)
+                       goto out_unlock;
+               iap->ia_valid &= ~ATTR_SIZE;
 
-       if (check_guard && guardtime != inode->i_ctime.tv_sec) {
-               err = nfserr_notsync;
-               goto out_put_write_access;
+               /*
+                * Avoid the additional setattr call below if the only other
+                * attribute that the client sends is the mtime, as we update
+                * it as part of the size change above.
+                */
+               if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+                       goto out_unlock;
        }
 
-       fh_lock(fhp);
+       iap->ia_valid |= ATTR_CTIME;
        host_err = notify_change(dentry, iap, NULL);
-       fh_unlock(fhp);
-       err = nfserrno(host_err);
 
-out_put_write_access:
+out_unlock:
+       fh_unlock(fhp);
        if (size_change)
                put_write_access(inode);
-       if (!err)
-               err = nfserrno(commit_metadata(fhp));
 out:
-       return err;
+       if (!host_err)
+               host_err = commit_metadata(fhp);
+       return nfserrno(host_err);
 }
 
 #if defined(CONFIG_NFSD_V4)
@@ -940,14 +955,12 @@ static int wait_for_concurrent_writes(struct file *file)
 __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                                loff_t offset, struct kvec *vec, int vlen,
-                               unsigned long *cnt, int *stablep)
+                               unsigned long *cnt, int stable)
 {
        struct svc_export       *exp;
-       struct inode            *inode;
        mm_segment_t            oldfs;
        __be32                  err = 0;
        int                     host_err;
-       int                     stable = *stablep;
        int                     use_wgather;
        loff_t                  pos = offset;
        unsigned int            pflags = current->flags;
@@ -962,13 +975,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                 */
                current->flags |= PF_LESS_THROTTLE;
 
-       inode = file_inode(file);
-       exp   = fhp->fh_export;
-
+       exp = fhp->fh_export;
        use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
 
        if (!EX_ISSYNC(exp))
-               stable = 0;
+               stable = NFS_UNSTABLE;
 
        if (stable && !use_wgather)
                flags |= RWF_SYNC;
@@ -1035,35 +1046,22 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * N.B. After this call fhp needs an fh_put
  */
 __be32
-nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
-               loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
-               int *stablep)
+nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
+          struct kvec *vec, int vlen, unsigned long *cnt, int stable)
 {
-       __be32                  err = 0;
+       struct file *file = NULL;
+       __be32 err = 0;
 
        trace_write_start(rqstp, fhp, offset, vlen);
 
-       if (file) {
-               err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
-                               NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
-               if (err)
-                       goto out;
-               trace_write_opened(rqstp, fhp, offset, vlen);
-               err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
-                               stablep);
-               trace_write_io_done(rqstp, fhp, offset, vlen);
-       } else {
-               err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
-               if (err)
-                       goto out;
+       err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+       if (err)
+               goto out;
 
-               trace_write_opened(rqstp, fhp, offset, vlen);
-               if (cnt)
-                       err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
-                                            cnt, stablep);
-               trace_write_io_done(rqstp, fhp, offset, vlen);
-               fput(file);
-       }
+       trace_write_opened(rqstp, fhp, offset, vlen);
+       err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
+       trace_write_io_done(rqstp, fhp, offset, vlen);
+       fput(file);
 out:
        trace_write_done(rqstp, fhp, offset, vlen);
        return err;
index 0bf9e7b..db98c48 100644 (file)
@@ -83,12 +83,12 @@ __be32              nfsd_readv(struct file *, loff_t, struct kvec *, int,
                                unsigned long *);
 __be32                 nfsd_read(struct svc_rqst *, struct svc_fh *,
                                loff_t, struct kvec *, int, unsigned long *);
-__be32                 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
-                               loff_t, struct kvec *,int, unsigned long *, int *);
+__be32                 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
+                               struct kvec *, int, unsigned long *, int);
 __be32         nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
                                struct file *file, loff_t offset,
                                struct kvec *vec, int vlen, unsigned long *cnt,
-                               int *stablep);
+                               int stable);
 __be32         nfsd_readlink(struct svc_rqst *, struct svc_fh *,
                                char *, int *);
 __be32         nfsd_symlink(struct svc_rqst *, struct svc_fh *,
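
Illustrative only: a caller sketch of the new nfsd_write()/nfsd_vfs_write() signature, which takes an NFSv3 stable_how value (NFS_UNSTABLE, NFS_DATA_SYNC or NFS_FILE_SYNC) directly rather than an in/out pointer; payload_len is a made-up name:

        unsigned long cnt = payload_len;
        __be32 err;

        err = nfsd_write(rqstp, fhp, offset, rqstp->rq_vec, vlen, &cnt,
                         NFS_UNSTABLE); /* the v2 WRITE path above passes NFS_DATA_SYNC */
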
index 2c90e28..03b8ba9 100644 (file)
@@ -34,7 +34,7 @@
 static inline unsigned long
 nilfs_palloc_groups_per_desc_block(const struct inode *inode)
 {
-       return (1UL << inode->i_blkbits) /
+       return i_blocksize(inode) /
                sizeof(struct nilfs_palloc_group_desc);
 }
 
index d5c23da..c21e0b4 100644 (file)
@@ -50,7 +50,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
                brelse(bh);
                BUG();
        }
-       memset(bh->b_data, 0, 1 << inode->i_blkbits);
+       memset(bh->b_data, 0, i_blocksize(inode));
        bh->b_bdev = inode->i_sb->s_bdev;
        bh->b_blocknr = blocknr;
        set_buffer_mapped(bh);
index 2e315f9..06ffa13 100644 (file)
@@ -119,7 +119,7 @@ nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
 
 static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
 {
-       return 1 << btree->b_inode->i_blkbits;
+       return i_blocksize(btree->b_inode);
 }
 
 static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
@@ -1870,7 +1870,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
                di = &dreq;
                ni = NULL;
        } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
-                          1 << btree->b_inode->i_blkbits)) {
+                          nilfs_btree_node_size(btree))) {
                di = &dreq;
                ni = &nreq;
        } else {
index c7f4fef..7ffe71a 100644 (file)
@@ -51,7 +51,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
 {
        struct nilfs_root *root = NILFS_I(inode)->i_root;
 
-       inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
+       inode_add_bytes(inode, i_blocksize(inode) * n);
        if (root)
                atomic64_add(n, &root->blocks_count);
 }
@@ -60,7 +60,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
 {
        struct nilfs_root *root = NILFS_I(inode)->i_root;
 
-       inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
+       inode_sub_bytes(inode, i_blocksize(inode) * n);
        if (root)
                atomic64_sub(n, &root->blocks_count);
 }
index d56d3a5..98835ed 100644 (file)
@@ -57,7 +57,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
        set_buffer_mapped(bh);
 
        kaddr = kmap_atomic(bh->b_page);
-       memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
+       memset(kaddr + bh_offset(bh), 0, i_blocksize(inode));
        if (init_block)
                init_block(inode, bh, kaddr);
        flush_dcache_page(bh->b_page);
@@ -501,7 +501,7 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size,
        struct nilfs_mdt_info *mi = NILFS_MDT(inode);
 
        mi->mi_entry_size = entry_size;
-       mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
+       mi->mi_entries_per_block = i_blocksize(inode) / entry_size;
        mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
 }
 
index bedcae2..7d18d62 100644 (file)
@@ -723,7 +723,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
 
                lock_page(page);
                if (!page_has_buffers(page))
-                       create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+                       create_empty_buffers(page, i_blocksize(inode), 0);
                unlock_page(page);
 
                bh = head = page_buffers(page);
index 11556b7..88a31e9 100644 (file)
@@ -608,7 +608,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
        int ret = 0;
        struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
        unsigned int block_end, block_start;
-       unsigned int bsize = 1 << inode->i_blkbits;
+       unsigned int bsize = i_blocksize(inode);
 
        if (!page_has_buffers(page))
                create_empty_buffers(page, bsize, 0);
index 7025d8c..3e04279 100644 (file)
@@ -2924,7 +2924,7 @@ again:
        /*
         * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
         * another try; otherwise, we are sure the MIGRATING state is there,
-        * drop the unneded state which blocked threads trying to DIRTY
+        * drop the unneeded state which blocked threads trying to DIRTY
         */
        spin_lock(&res->spinlock);
        BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
index 7b6a146..8836305 100644 (file)
@@ -808,7 +808,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
        /* We know that zero_from is block aligned */
        for (block_start = zero_from; block_start < zero_to;
             block_start = block_end) {
-               block_end = block_start + (1 << inode->i_blkbits);
+               block_end = block_start + i_blocksize(inode);
 
                /*
                 * block_start is block-aligned.  Bump it by one to force
index 06af81f..9b96b99 100644 (file)
@@ -306,7 +306,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
                break;
        case S_IFDIR:
                inode->i_size = PAGE_SIZE;
-               orangefs_inode->blksize = (1 << inode->i_blkbits);
+               orangefs_inode->blksize = i_blocksize(inode);
                spin_lock(&inode->i_lock);
                inode_set_bytes(inode, inode->i_size);
                spin_unlock(&inode->i_lock);
@@ -316,7 +316,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
                if (new) {
                        inode->i_size = (loff_t)strlen(new_op->
                            downcall.resp.getattr.link_target);
-                       orangefs_inode->blksize = (1 << inode->i_blkbits);
+                       orangefs_inode->blksize = i_blocksize(inode);
                        ret = strscpy(orangefs_inode->link_target,
                            new_op->downcall.resp.getattr.link_target,
                            ORANGEFS_NAME_MAX);
index b8f0627..1e1e182 100644 (file)
@@ -766,7 +766,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
 
                if (!IS_ERR_OR_NULL(mm)) {
                        /* ensure this mm_struct can't be freed */
-                       atomic_inc(&mm->mm_count);
+                       mmgrab(mm);
                        /* but do not pin its memory */
                        mmput(mm);
                }
@@ -813,7 +813,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
                return -ENOMEM;
 
        copied = 0;
-       if (!atomic_inc_not_zero(&mm->mm_users))
+       if (!mmget_not_zero(mm))
                goto free;
 
        /* Maybe we should limit FOLL_FORCE to actual ptrace users? */
@@ -921,7 +921,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
                return -ENOMEM;
 
        ret = 0;
-       if (!atomic_inc_not_zero(&mm->mm_users))
+       if (!mmget_not_zero(mm))
                goto free;
 
        down_read(&mm->mmap_sem);
@@ -1064,7 +1064,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
                if (p) {
                        if (atomic_read(&p->mm->mm_users) > 1) {
                                mm = p->mm;
-                               atomic_inc(&mm->mm_count);
+                               mmgrab(mm);
                        }
                        task_unlock(p);
                }
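
The conversions in this file (and the following /proc hunks) replace open-coded refcount operations with named helpers. A hedged sketch of the distinction, with the surrounding code purely illustrative:

        /*
         * mmgrab()/mmdrop() pin the mm_struct allocation (mm_count);
         * the struct stays valid, but the address space itself may
         * already be torn down.
         */
        mmgrab(mm);
        /* ... mm_struct pointer remains valid ... */
        mmdrop(mm);

        /*
         * mmget_not_zero()/mmput() take a usable reference (mm_users),
         * succeeding only while the address space is still live.
         */
        if (mmget_not_zero(mm)) {
                /* safe to walk page tables here */
                mmput(mm);
        }
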
index 0b80ad8..ea9f3d1 100644 (file)
@@ -373,7 +373,10 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
                phdr->p_flags   = PF_R|PF_W|PF_X;
                phdr->p_offset  = kc_vaddr_to_offset(m->addr) + dataoff;
                phdr->p_vaddr   = (size_t)m->addr;
-               phdr->p_paddr   = 0;
+               if (m->type == KCORE_RAM || m->type == KCORE_TEXT)
+                       phdr->p_paddr   = __pa(m->addr);
+               else
+                       phdr->p_paddr   = (elf_addr_t)-1;
                phdr->p_filesz  = phdr->p_memsz = m->size;
                phdr->p_align   = PAGE_SIZE;
        }
index 8f96a49..ee3efb2 100644 (file)
@@ -167,7 +167,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
                return ERR_PTR(-ESRCH);
 
        mm = priv->mm;
-       if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+       if (!mm || !mmget_not_zero(mm))
                return NULL;
 
        down_read(&mm->mmap_sem);
@@ -1352,7 +1352,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        unsigned long end_vaddr;
        int ret = 0, copied = 0;
 
-       if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+       if (!mm || !mmget_not_zero(mm))
                goto out;
 
        ret = -EINVAL;
index 3717562..1ef97cf 100644 (file)
@@ -219,7 +219,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
                return ERR_PTR(-ESRCH);
 
        mm = priv->mm;
-       if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+       if (!mm || !mmget_not_zero(mm))
                return NULL;
 
        down_read(&mm->mmap_sem);
index 2f8c5c9..b396eb0 100644 (file)
@@ -189,7 +189,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
        int ret = 0;
 
        th.t_trans_id = 0;
-       blocksize = 1 << inode->i_blkbits;
+       blocksize = i_blocksize(inode);
 
        if (logit) {
                reiserfs_write_lock(s);
index cfeae9b..a6ab9d6 100644 (file)
@@ -525,7 +525,7 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
         * referenced in convert_tail_for_hole() that may be called from
         * reiserfs_get_block()
         */
-       bh_result->b_size = (1 << inode->i_blkbits);
+       bh_result->b_size = i_blocksize(inode);
 
        ret = reiserfs_get_block(inode, iblock, bh_result,
                                 create | GET_BLOCK_NO_DANGLE);
index e314cb3..feabcde 100644 (file)
@@ -1166,7 +1166,7 @@ static int reiserfs_parse_options(struct super_block *s,
                        if (!strcmp(arg, "auto")) {
                                /* From JFS code, to auto-get the size. */
                                *blocks =
-                                   s->s_bdev->bd_inode->i_size >> s->
+                                   i_size_read(s->s_bdev->bd_inode) >> s->
                                    s_blocksize_bits;
                        } else {
                                *blocks = simple_strtoul(arg, &p, 0);
index a268b7f..3f14d1e 100644 (file)
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -31,7 +31,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
        stat->atime = inode->i_atime;
        stat->mtime = inode->i_mtime;
        stat->ctime = inode->i_ctime;
-       stat->blksize = (1 << inode->i_blkbits);
+       stat->blksize = i_blocksize(inode);
        stat->blocks = inode->i_blocks;
 }
 
index 8ec6b3d..a8d8f71 100644 (file)
@@ -1193,7 +1193,7 @@ int udf_setsize(struct inode *inode, loff_t newsize)
 {
        int err;
        struct udf_inode_info *iinfo;
-       int bsize = 1 << inode->i_blkbits;
+       int bsize = i_blocksize(inode);
 
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
              S_ISLNK(inode->i_mode)))
index 625b728..3c421d0 100644 (file)
@@ -1807,17 +1807,17 @@ static void init_once_userfaultfd_ctx(void *mem)
 }
 
 /**
- * userfaultfd_file_create - Creates an userfaultfd file pointer.
+ * userfaultfd_file_create - Creates a userfaultfd file pointer.
  * @flags: Flags for the userfaultfd file.
  *
- * This function creates an userfaultfd file pointer, w/out installing
+ * This function creates a userfaultfd file pointer, w/out installing
  * it into the fd table. This is useful when the userfaultfd file is
  * used during the initialization of data structures that require
  * extra setup after the userfaultfd creation. So the userfaultfd
  * creation is split into the file pointer creation phase, and the
  * file descriptor installation phase.  In this way races with
  * userspace closing the newly installed file descriptor can be
- * avoided.  Returns an userfaultfd file pointer, or a proper error
+ * avoided.  Returns a userfaultfd file pointer, or a proper error
  * pointer.
  */
 static struct file *userfaultfd_file_create(int flags)
@@ -1847,7 +1847,7 @@ static struct file *userfaultfd_file_create(int flags)
        ctx->released = false;
        ctx->mm = current->mm;
        /* prevent the mm struct from being freed */
-       atomic_inc(&ctx->mm->mm_count);
+       mmgrab(ctx->mm);
 
        file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
                                  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
index 1ff9df7..bf65a9e 100644 (file)
@@ -103,9 +103,9 @@ xfs_finish_page_writeback(
        unsigned int            bsize;
 
        ASSERT(bvec->bv_offset < PAGE_SIZE);
-       ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
+       ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
        ASSERT(end < PAGE_SIZE);
-       ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
+       ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
 
        bh = head = page_buffers(bvec->bv_page);
 
@@ -349,7 +349,7 @@ xfs_map_blocks(
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-       ssize_t                 count = 1 << inode->i_blkbits;
+       ssize_t                 count = i_blocksize(inode);
        xfs_fileoff_t           offset_fsb, end_fsb;
        int                     error = 0;
        int                     bmapi_flags = XFS_BMAPI_ENTIRE;
@@ -758,7 +758,7 @@ xfs_aops_discard_page(
                        break;
                }
 next_buffer:
-               offset += 1 << inode->i_blkbits;
+               offset += i_blocksize(inode);
 
        } while ((bh = bh->b_this_page) != head);
 
@@ -846,7 +846,7 @@ xfs_writepage_map(
        LIST_HEAD(submit_list);
        struct xfs_ioend        *ioend, *next;
        struct buffer_head      *bh, *head;
-       ssize_t                 len = 1 << inode->i_blkbits;
+       ssize_t                 len = i_blocksize(inode);
        int                     error = 0;
        int                     count = 0;
        int                     uptodate = 1;
@@ -1210,7 +1210,7 @@ xfs_map_trim_size(
            offset + mapping_size >= i_size_read(inode)) {
                /* limit mapping to block that spans EOF */
                mapping_size = roundup_64(i_size_read(inode) - offset,
-                                         1 << inode->i_blkbits);
+                                         i_blocksize(inode));
        }
        if (mapping_size > LONG_MAX)
                mapping_size = LONG_MAX;
@@ -1241,7 +1241,7 @@ xfs_get_blocks(
                return -EIO;
 
        offset = (xfs_off_t)iblock << inode->i_blkbits;
-       ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
+       ASSERT(bh_result->b_size >= i_blocksize(inode));
        size = bh_result->b_size;
 
        if (offset >= i_size_read(inode))
@@ -1389,7 +1389,7 @@ xfs_vm_set_page_dirty(
                        if (offset < end_offset)
                                set_buffer_dirty(bh);
                        bh = bh->b_this_page;
-                       offset += 1 << inode->i_blkbits;
+                       offset += i_blocksize(inode);
                } while (bh != head);
        }
        /*
index a50eca6..35703a8 100644 (file)
@@ -754,7 +754,7 @@ xfs_file_fallocate(
                if (error)
                        goto out_unlock;
        } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
-               unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+               unsigned int blksize_mask = i_blocksize(inode) - 1;
 
                if (offset & blksize_mask || len & blksize_mask) {
                        error = -EINVAL;
@@ -776,7 +776,7 @@ xfs_file_fallocate(
                if (error)
                        goto out_unlock;
        } else if (mode & FALLOC_FL_INSERT_RANGE) {
-               unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+               unsigned int blksize_mask = i_blocksize(inode) - 1;
 
                new_size = i_size_read(inode) + len;
                if (offset & blksize_mask || len & blksize_mask) {
diff --git a/include/asm-generic/kprobes.h b/include/asm-generic/kprobes.h
new file mode 100644 (file)
index 0000000..57af9f2
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef _ASM_GENERIC_KPROBES_H
+#define _ASM_GENERIC_KPROBES_H
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#ifdef CONFIG_KPROBES
+/*
+ * Blacklist generating macro: use it to specify functions that
+ * must not be probed.
+ */
+# define __NOKPROBE_SYMBOL(fname)                              \
+static unsigned long __used                                    \
+       __attribute__((__section__("_kprobe_blacklist")))       \
+       _kbl_addr_##fname = (unsigned long)fname;
+# define NOKPROBE_SYMBOL(fname)        __NOKPROBE_SYMBOL(fname)
+/* Use this to forbid kprobes from attaching to very low level functions */
+# define __kprobes     __attribute__((__section__(".kprobes.text")))
+# define nokprobe_inline       __always_inline
+#else
+# define NOKPROBE_SYMBOL(fname)
+# define __kprobes
+# define nokprobe_inline       inline
+#endif
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+
+#endif /* _ASM_GENERIC_KPROBES_H */
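
A small usage sketch of the macros this new header centralizes; the function names are made up:

        #include <linux/kprobes.h>

        static nokprobe_inline void low_level_helper(void)
        {
                /* always inlined, so it cannot be probed on its own */
        }

        void my_exception_path(void)
        {
                low_level_helper();
        }
        NOKPROBE_SYMBOL(my_exception_path);     /* adds the symbol to the blacklist */
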
index 03a6653..2ea0c28 100644 (file)
@@ -22,7 +22,6 @@ struct ceph_osd_client;
  * completion callback for async writepages
  */
 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
-typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
 #define CEPH_HOMELESS_OSD      -1
 
@@ -170,15 +169,12 @@ struct ceph_osd_request {
        unsigned int            r_num_ops;
 
        int               r_result;
-       bool              r_got_reply;
 
        struct ceph_osd_client *r_osdc;
        struct kref       r_kref;
        bool              r_mempool;
-       struct completion r_completion;
-       struct completion r_done_completion;  /* fsync waiter */
+       struct completion r_completion;       /* private to osd_client.c */
        ceph_osdc_callback_t r_callback;
-       ceph_osdc_unsafe_callback_t r_unsafe_callback;
        struct list_head  r_unsafe_item;
 
        struct inode *r_inode;                /* for use by callbacks */
index 9a90417..938656f 100644 (file)
@@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
        case CEPH_POOL_TYPE_EC:
                return false;
        default:
-               BUG_ON(1);
+               BUG();
        }
 }
 
@@ -82,13 +82,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest,
 void ceph_oloc_destroy(struct ceph_object_locator *oloc);
 
 /*
- * Maximum supported by kernel client object name length
- *
- * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
- */
-#define CEPH_MAX_OID_NAME_LEN 100
-
-/*
  * 51-char inline_name is long enough for all cephfs and all but one
  * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
  * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
@@ -173,8 +166,8 @@ struct ceph_osdmap {
         * the list of osds that store+replicate them. */
        struct crush_map *crush;
 
-       struct mutex crush_scratch_mutex;
-       int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
+       struct mutex crush_workspace_mutex;
+       void *crush_workspace;
 };
 
 static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
index 5c0da61..5d00187 100644 (file)
@@ -50,7 +50,7 @@ struct ceph_timespec {
 #define CEPH_PG_LAYOUT_LINEAR 2
 #define CEPH_PG_LAYOUT_HYBRID 3
 
-#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
+#define CEPH_PG_MAX_SIZE      32  /* max # osds in a single pg */
 
 /*
  * placement group.
index 861b467..3c02404 100644 (file)
@@ -148,14 +148,18 @@ struct cgroup_subsys_state {
  * set for a task.
  */
 struct css_set {
-       /* Reference count */
-       atomic_t refcount;
-
        /*
-        * List running through all cgroup groups in the same hash
-        * slot. Protected by css_set_lock
+        * Set of subsystem states, one for each subsystem. This array is
+        * immutable after creation apart from the init_css_set during
+        * subsystem registration (at boot time).
         */
-       struct hlist_node hlist;
+       struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+       /* reference count */
+       atomic_t refcount;
+
+       /* the default cgroup associated with this css_set */
+       struct cgroup *dfl_cgrp;
 
        /*
         * Lists running through all tasks using this cgroup group.
@@ -167,21 +171,29 @@ struct css_set {
        struct list_head tasks;
        struct list_head mg_tasks;
 
+       /* all css_task_iters currently walking this cset */
+       struct list_head task_iters;
+
        /*
-        * List of cgrp_cset_links pointing at cgroups referenced from this
-        * css_set.  Protected by css_set_lock.
+        * On the default hierarchy, ->subsys[ssid] may point to a css
+        * attached to an ancestor instead of the cgroup this css_set is
+        * associated with.  The following node is anchored at
+        * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+        * iterate through all css's attached to a given cgroup.
         */
-       struct list_head cgrp_links;
+       struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
 
-       /* the default cgroup associated with this css_set */
-       struct cgroup *dfl_cgrp;
+       /*
+        * List running through all cgroup groups in the same hash
+        * slot. Protected by css_set_lock
+        */
+       struct hlist_node hlist;
 
        /*
-        * Set of subsystem states, one for each subsystem. This array is
-        * immutable after creation apart from the init_css_set during
-        * subsystem registration (at boot time).
+        * List of cgrp_cset_links pointing at cgroups referenced from this
+        * css_set.  Protected by css_set_lock.
         */
-       struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+       struct list_head cgrp_links;
 
        /*
         * List of csets participating in the on-going migration either as
@@ -201,18 +213,6 @@ struct css_set {
        struct cgroup *mg_dst_cgrp;
        struct css_set *mg_dst_cset;
 
-       /*
-        * On the default hierarhcy, ->subsys[ssid] may point to a css
-        * attached to an ancestor instead of the cgroup this css_set is
-        * associated with.  The following node is anchored at
-        * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-        * iterate through all css's attached to a given cgroup.
-        */
-       struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-       /* all css_task_iters currently walking this cset */
-       struct list_head task_iters;
-
        /* dead and being drained, ignore for migration */
        bool dead;
 
@@ -388,6 +388,9 @@ struct cftype {
        struct list_head node;          /* anchored at ss->cfts */
        struct kernfs_ops *kf_ops;
 
+       int (*open)(struct kernfs_open_file *of);
+       void (*release)(struct kernfs_open_file *of);
+
        /*
         * read_u64() is a shortcut for the common case of returning a
         * single integer. Use it in place of read()
index c83c23f..f6b43fb 100644 (file)
@@ -266,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it);
  * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
  * @leader: the loop cursor
  * @dst_css: the destination css
- * @tset: takset to iterate
+ * @tset: taskset to iterate
  *
  * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
  * may not contain any.
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644 (file)
index 0000000..e94290b
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include <linux/cgroup.h>
+
+enum rdmacg_resource_type {
+       RDMACG_RESOURCE_HCA_HANDLE,
+       RDMACG_RESOURCE_HCA_OBJECT,
+       RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+       struct cgroup_subsys_state      css;
+
+       /*
+        * head to keep track of all resource pools
+        * that belong to this cgroup.
+        */
+       struct list_head                rpools;
+};
+
+struct rdmacg_device {
+       struct list_head        dev_node;
+       struct list_head        rpools;
+       char                    *name;
+};
+
+/*
+ * APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+                     struct rdmacg_device *device,
+                     enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+                    struct rdmacg_device *device,
+                    enum rdmacg_resource_type index);
+#endif /* CONFIG_CGROUP_RDMA */
+#endif /* _CGROUP_RDMA_H */
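
A hedged sketch of how a driver might consume this API; the device name and functions are illustrative and error handling is elided:

        static struct rdmacg_device my_cg_dev = { .name = "hypothetical_hca0" };

        static int my_hca_probe(void)
        {
                return rdmacg_register_device(&my_cg_dev);
        }

        static int my_alloc_hca_object(struct rdma_cgroup **cg)
        {
                /* charged against the caller's rdma cgroup limits */
                return rdmacg_try_charge(cg, &my_cg_dev,
                                         RDMACG_RESOURCE_HCA_OBJECT);
        }

        static void my_free_hca_object(struct rdma_cgroup *cg)
        {
                rdmacg_uncharge(cg, &my_cg_dev, RDMACG_RESOURCE_HCA_OBJECT);
        }
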
index 0df0336..d0e597c 100644 (file)
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
index 9e40be5..aef47be 100644 (file)
@@ -711,8 +711,10 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long);
        compat_stack_t __user *__uss = uss; \
        struct task_struct *t = current; \
        put_user_ex(ptr_to_compat((void __user *)t->sas_ss_sp), &__uss->ss_sp); \
-       put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \
+       put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \
        put_user_ex(t->sas_ss_size, &__uss->ss_size); \
+       if (t->sas_ss_flags & SS_AUTODISARM) \
+               sas_ss_reset(t); \
 } while (0);
 
 asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
index 811f7a9..76e28c2 100644 (file)
 #endif
 #endif
 
+#ifdef CONFIG_STACK_VALIDATION
+#define annotate_unreachable() ({                                      \
+       asm("%c0:\t\n"                                                  \
+           ".pushsection __unreachable, \"a\"\t\n"                     \
+           ".long %c0b\t\n"                                            \
+           ".popsection\t\n" : : "i" (__LINE__));                      \
+})
+#else
+#define annotate_unreachable()
+#endif
+
 /*
  * Mark a position in code as unreachable.  This can be used to
  * suppress control flow warnings after asm blocks that transfer
  * this in the preprocessor, but we can live with this because they're
  * unreleased.  Really, we need to have autoconf for the kernel.
  */
-#define unreachable() __builtin_unreachable()
+#define unreachable() \
+       do { annotate_unreachable(); __builtin_unreachable(); } while (0)
 
 /* Mark a function definition as prohibited from being cloned. */
 #define __noclone      __attribute__((__noclone__, __optimize__("no-tracer")))
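
Illustrative only: a minimal x86-flavoured use of the annotated unreachable(). The annotation keeps objtool's control-flow data accurate under CONFIG_STACK_VALIDATION while still emitting __builtin_unreachable() for the compiler:

        static void __noreturn halt_forever(void)
        {
                asm volatile("1: hlt; jmp 1b");
                unreachable();  /* compiler cannot see that the asm never returns */
        }
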
index 627e697..f811005 100644 (file)
@@ -577,12 +577,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
        (_________p1); \
 })
 
-/* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
-#ifdef CONFIG_KPROBES
-# define __kprobes     __attribute__((__section__(".kprobes.text")))
-# define nokprobe_inline       __always_inline
-#else
-# define __kprobes
-# define nokprobe_inline       inline
-#endif
 #endif /* __LINUX_COMPILER_H */
index be8f12b..fbecbd0 100644 (file)
@@ -135,13 +135,6 @@ struct crush_bucket {
        __u32 size;      /* num items */
        __s32 *items;
 
-       /*
-        * cached random permutation: used for uniform bucket and for
-        * the linear search fallback for the other bucket types.
-        */
-       __u32 perm_x;  /* @x for which *perm is defined */
-       __u32 perm_n;  /* num elements of *perm that are permuted/defined */
-       __u32 *perm;
 };
 
 struct crush_bucket_uniform {
@@ -211,6 +204,21 @@ struct crush_map {
         * device fails. */
        __u8 chooseleaf_stable;
 
+       /*
+        * This value is calculated after decode or construction by
+        * the builder. It is exposed here (rather than having a
+        * 'build CRUSH working space' function) so that callers can
+        * reserve a static buffer, allocate space on the stack, or
+        * otherwise avoid calling into the heap allocator if they
+        * want to. The size of the working space depends on the map,
+        * while the size of the scratch vector passed to the mapper
+        * depends on the size of the desired result set.
+        *
+        * Nothing stops the caller from allocating both in one fell
+        * swoop and passing in two pointers, though.
+        */
+       size_t working_size;
+
 #ifndef __KERNEL__
        /*
         * version 0 (original) of straw_calc has various flaws.  version 1
@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i)
        return ((i+1) << 1)-1;
 }
 
+/*
+ * These data structures are private to the CRUSH implementation. They
+ * are exposed in this header file because the builder needs their
+ * definitions to calculate the total working size.
+ *
+ * Moving this out of the crush map allows us to treat the CRUSH map as
+ * immutable within the mapper and removes the requirement for a CRUSH
+ * map lock.
+ */
+struct crush_work_bucket {
+       __u32 perm_x; /* @x for which *perm is defined */
+       __u32 perm_n; /* num elements of *perm that are permuted/defined */
+       __u32 *perm;  /* Permutation of the bucket's items */
+};
+
+struct crush_work {
+       struct crush_work_bucket **work; /* Per-bucket working store */
+};
+
 #endif
index 5dfd5b1..c95e19e 100644 (file)
@@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map,
                         int ruleno,
                         int x, int *result, int result_max,
                         const __u32 *weights, int weight_max,
-                        int *scratch);
+                        void *cwin);
+
+/*
+ * Returns the exact amount of workspace that will need to be used
+ * for a given combination of crush_map and result_max. The caller can
+ * then allocate this much on its own, either on the stack, in a
+ * per-thread long-lived buffer, or however it likes.
+ */
+static inline size_t crush_work_size(const struct crush_map *map,
+                                    int result_max)
+{
+       return map->working_size + result_max * 3 * sizeof(__u32);
+}
+
+void crush_init_workspace(const struct crush_map *map, void *v);
 
 #endif
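
A hedged caller sketch of the new workspace flow, assuming map, ruleno, x, result[], weights, weight_max and result_max are already in scope; the allocation strategy is the caller's choice, per the comment above:

        size_t wsize = crush_work_size(map, result_max);
        void *work = kmalloc(wsize, GFP_NOIO); /* or a stack/static buffer */
        int n;

        crush_init_workspace(map, work);
        n = crush_do_rule(map, ruleno, x, result, result_max,
                          weights, weight_max, work);
        kfree(work);
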
index c965e44..591b6c1 100644 (file)
@@ -562,7 +562,7 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper)
  * @inode: inode to select the dentry from multiple layers (can be NULL)
  * @flags: open flags to control copy-up behavior
  *
- * If dentry is on an union/overlay, then return the underlying, real dentry.
+ * If dentry is on a union/overlay, then return the underlying, real dentry.
  * Otherwise return the dentry itself.
  *
  * See also: Documentation/filesystems/vfs.txt
@@ -581,7 +581,7 @@ static inline struct dentry *d_real(struct dentry *dentry,
  * d_real_inode - Return the real inode
  * @dentry: The dentry to query
  *
- * If dentry is on an union/overlay, then return the underlying, real inode.
+ * If dentry is on a union/overlay, then return the underlying, real inode.
  * Otherwise return d_inode().
  */
 static inline struct inode *d_real_inode(const struct dentry *dentry)
index c930cbc..c64f2cb 100644 (file)
@@ -655,6 +655,11 @@ struct inode {
        void                    *i_private; /* fs or device private pointer */
 };
 
+static inline unsigned int i_blocksize(const struct inode *node)
+{
+       return (1 << node->i_blkbits);
+}
+
 static inline int inode_unhashed(struct inode *inode)
 {
        return hlist_unhashed(&inode->i_hash);
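
This helper is what the filesystem hunks throughout this series convert to; the two forms below are equivalent:

        unsigned int bsize;

        bsize = 1 << inode->i_blkbits;  /* open-coded form being removed */
        bsize = i_blocksize(inode);     /* new helper */
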
index 3c01b89..bf70b3e 100644 (file)
 #ifndef __IDR_H__
 #define __IDR_H__
 
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/init.h>
-#include <linux/rcupdate.h>
+#include <linux/radix-tree.h>
+#include <linux/gfp.h>
+#include <linux/percpu.h>
+
+struct idr {
+       struct radix_tree_root  idr_rt;
+       unsigned int            idr_next;
+};
 
 /*
- * Using 6 bits at each layer allows us to allocate 7 layers out of each page.
- * 8 bits only gave us 3 layers out of every pair of pages, which is less
- * efficient except for trees with a largest element between 192-255 inclusive.
+ * The IDR API does not expose the tagging functionality of the radix tree
+ * to users.  Use tag 0 to track whether a node has free space below it.
  */
-#define IDR_BITS 6
-#define IDR_SIZE (1 << IDR_BITS)
-#define IDR_MASK ((1 << IDR_BITS)-1)
-
-struct idr_layer {
-       int                     prefix; /* the ID prefix of this idr_layer */
-       int                     layer;  /* distance from leaf */
-       struct idr_layer __rcu  *ary[1<<IDR_BITS];
-       int                     count;  /* When zero, we can release it */
-       union {
-               /* A zero bit means "space here" */
-               DECLARE_BITMAP(bitmap, IDR_SIZE);
-               struct rcu_head         rcu_head;
-       };
-};
+#define IDR_FREE       0
 
-struct idr {
-       struct idr_layer __rcu  *hint;  /* the last layer allocated from */
-       struct idr_layer __rcu  *top;
-       int                     layers; /* only valid w/o concurrent changes */
-       int                     cur;    /* current pos for cyclic allocation */
-       spinlock_t              lock;
-       int                     id_free_cnt;
-       struct idr_layer        *id_free;
-};
+/* Set the IDR flag and the IDR_FREE tag */
+#define IDR_RT_MARKER          ((__force gfp_t)(3 << __GFP_BITS_SHIFT))
 
-#define IDR_INIT(name)                                                 \
+#define IDR_INIT                                                       \
 {                                                                      \
-       .lock                   = __SPIN_LOCK_UNLOCKED(name.lock),      \
+       .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER)                        \
 }
-#define DEFINE_IDR(name)       struct idr name = IDR_INIT(name)
+#define DEFINE_IDR(name)       struct idr name = IDR_INIT
 
 /**
  * idr_get_cursor - Return the current position of the cyclic allocator
@@ -62,9 +44,9 @@ struct idr {
  * idr_alloc_cyclic() if it is free (otherwise the search will start from
  * this position).
  */
-static inline unsigned int idr_get_cursor(struct idr *idr)
+static inline unsigned int idr_get_cursor(const struct idr *idr)
 {
-       return READ_ONCE(idr->cur);
+       return READ_ONCE(idr->idr_next);
 }
 
 /**
@@ -77,7 +59,7 @@ static inline unsigned int idr_get_cursor(struct idr *idr)
  */
 static inline void idr_set_cursor(struct idr *idr, unsigned int val)
 {
-       WRITE_ONCE(idr->cur, val);
+       WRITE_ONCE(idr->idr_next, val);
 }
 
 /**
@@ -97,22 +79,31 @@ static inline void idr_set_cursor(struct idr *idr, unsigned int val)
  * period).
  */
 
-/*
- * This is what we export.
- */
-
-void *idr_find_slowpath(struct idr *idp, int id);
 void idr_preload(gfp_t gfp_mask);
-int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask);
-int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask);
-int idr_for_each(struct idr *idp,
+int idr_alloc(struct idr *, void *entry, int start, int end, gfp_t);
+int idr_alloc_cyclic(struct idr *, void *entry, int start, int end, gfp_t);
+int idr_for_each(const struct idr *,
                 int (*fn)(int id, void *p, void *data), void *data);
-void *idr_get_next(struct idr *idp, int *nextid);
-void *idr_replace(struct idr *idp, void *ptr, int id);
-void idr_remove(struct idr *idp, int id);
-void idr_destroy(struct idr *idp);
-void idr_init(struct idr *idp);
-bool idr_is_empty(struct idr *idp);
+void *idr_get_next(struct idr *, int *nextid);
+void *idr_replace(struct idr *, void *, int id);
+void idr_destroy(struct idr *);
+
+static inline void *idr_remove(struct idr *idr, int id)
+{
+       return radix_tree_delete_item(&idr->idr_rt, id, NULL);
+}
+
+static inline void idr_init(struct idr *idr)
+{
+       INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);
+       idr->idr_next = 0;
+}
+
+static inline bool idr_is_empty(const struct idr *idr)
+{
+       return radix_tree_empty(&idr->idr_rt) &&
+               radix_tree_tagged(&idr->idr_rt, IDR_FREE);
+}
 
 /**
  * idr_preload_end - end preload section started with idr_preload()
@@ -137,19 +128,14 @@ static inline void idr_preload_end(void)
  * This function can be called under rcu_read_lock(), given that the leaf
  * pointers lifetimes are correctly managed.
  */
-static inline void *idr_find(struct idr *idr, int id)
+static inline void *idr_find(const struct idr *idr, int id)
 {
-       struct idr_layer *hint = rcu_dereference_raw(idr->hint);
-
-       if (hint && (id & ~IDR_MASK) == hint->prefix)
-               return rcu_dereference_raw(hint->ary[id & IDR_MASK]);
-
-       return idr_find_slowpath(idr, id);
+       return radix_tree_lookup(&idr->idr_rt, id);
 }
 
 /**
  * idr_for_each_entry - iterate over an idr's elements of a given type
- * @idp:     idr handle
+ * @idr:     idr handle
  * @entry:   the type * to use as cursor
  * @id:      id entry's key
  *
@@ -157,57 +143,60 @@ static inline void *idr_find(struct idr *idr, int id)
  * after normal termination @entry is left with the value NULL.  This
  * is convenient for a "not found" value.
  */
-#define idr_for_each_entry(idp, entry, id)                     \
-       for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
+#define idr_for_each_entry(idr, entry, id)                     \
+       for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id)
 
 /**
- * idr_for_each_entry - continue iteration over an idr's elements of a given type
- * @idp:     idr handle
+ * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type
+ * @idr:     idr handle
  * @entry:   the type * to use as cursor
  * @id:      id entry's key
  *
  * Continue to iterate over list of given type, continuing after
  * the current position.
  */
-#define idr_for_each_entry_continue(idp, entry, id)                    \
-       for ((entry) = idr_get_next((idp), &(id));                      \
+#define idr_for_each_entry_continue(idr, entry, id)                    \
+       for ((entry) = idr_get_next((idr), &(id));                      \
             entry;                                                     \
-            ++id, (entry) = idr_get_next((idp), &(id)))
+            ++id, (entry) = idr_get_next((idr), &(id)))
 
 /*
  * IDA - IDR based id allocator, use when translation from id to
  * pointer isn't necessary.
- *
- * IDA_BITMAP_LONGS is calculated to be one less to accommodate
- * ida_bitmap->nr_busy so that the whole struct fits in 128 bytes.
  */
 #define IDA_CHUNK_SIZE         128     /* 128 bytes per chunk */
-#define IDA_BITMAP_LONGS       (IDA_CHUNK_SIZE / sizeof(long) - 1)
+#define IDA_BITMAP_LONGS       (IDA_CHUNK_SIZE / sizeof(long))
 #define IDA_BITMAP_BITS        (IDA_BITMAP_LONGS * sizeof(long) * 8)
 
 struct ida_bitmap {
-       long                    nr_busy;
        unsigned long           bitmap[IDA_BITMAP_LONGS];
 };
 
+DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap);
+
 struct ida {
-       struct idr              idr;
-       struct ida_bitmap       *free_bitmap;
+       struct radix_tree_root  ida_rt;
 };
 
-#define IDA_INIT(name)         { .idr = IDR_INIT((name).idr), .free_bitmap = NULL, }
-#define DEFINE_IDA(name)       struct ida name = IDA_INIT(name)
+#define IDA_INIT       {                                               \
+       .ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT),          \
+}
+#define DEFINE_IDA(name)       struct ida name = IDA_INIT
 
 int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
 int ida_get_new_above(struct ida *ida, int starting_id, int *p_id);
 void ida_remove(struct ida *ida, int id);
 void ida_destroy(struct ida *ida);
-void ida_init(struct ida *ida);
 
 int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
                   gfp_t gfp_mask);
 void ida_simple_remove(struct ida *ida, unsigned int id);
 
+static inline void ida_init(struct ida *ida)
+{
+       INIT_RADIX_TREE(&ida->ida_rt, IDR_RT_MARKER | GFP_NOWAIT);
+}
+
 /**
  * ida_get_new - allocate new ID
  * @ida:       idr handle
@@ -220,11 +209,8 @@ static inline int ida_get_new(struct ida *ida, int *p_id)
        return ida_get_new_above(ida, 0, p_id);
 }
 
-static inline bool ida_is_empty(struct ida *ida)
+static inline bool ida_is_empty(const struct ida *ida)
 {
-       return idr_is_empty(&ida->idr);
+       return radix_tree_empty(&ida->ida_rt);
 }
-
-void __init idr_init_cache(void);
-
 #endif /* __IDR_H__ */
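
The external API is unchanged by the radix-tree rewrite; a minimal usage sketch with illustrative names:

        static DEFINE_IDR(my_idr);

        int track_object(struct my_obj *obj)
        {
                int id;

                idr_preload(GFP_KERNEL);
                id = idr_alloc(&my_idr, obj, 1, 0, GFP_NOWAIT); /* any id >= 1 */
                idr_preload_end();
                return id;      /* allocated id, or -ENOMEM/-ENOSPC */
        }

        struct my_obj *lookup_object(int id)
        {
                return idr_find(&my_idr, id);   /* now a radix_tree_lookup() */
        }
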
index 78c5d5a..f1045b2 100644 (file)
@@ -100,7 +100,7 @@ struct ipmi_user_hndl {
 
 /* Create a new user of the IPMI layer on the given interface number. */
 int ipmi_create_user(unsigned int          if_num,
-                    struct ipmi_user_hndl *handler,
+                    const struct ipmi_user_hndl *handler,
                     void                  *handler_data,
                     ipmi_user_t           *user);
 
index 8f2e059..4d74860 100644 (file)
@@ -8,7 +8,7 @@
 
 /*
  * The use of "&&" / "||" is limited in certain expressions.
- * The followings enable to calculate "and" / "or" with macro expansion only.
+ * The following enable calculating "and" / "or" with macro expansion only.
  */
 #define __and(x, y)                    ___and(x, y)
 #define ___and(x, y)                   ____and(__ARG_PLACEHOLDER_##x, y)
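A hedged sketch of how these expand: __and(x, y) evaluates to 1 only when
both arguments expand to 1, so (with placeholder config symbols):

        /* 1 iff both options are enabled, computed by the preprocessor */
        #define FOO_AND_BAR_ENABLED \
                __and(IS_ENABLED(CONFIG_FOO), IS_ENABLED(CONFIG_BAR))
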
index 7056238..a9b11b8 100644 (file)
@@ -46,6 +46,7 @@ enum kernfs_node_flag {
        KERNFS_SUICIDAL         = 0x0400,
        KERNFS_SUICIDED         = 0x0800,
        KERNFS_EMPTY_DIR        = 0x1000,
+       KERNFS_HAS_RELEASE      = 0x2000,
 };
 
 /* @flags for kernfs_create_root() */
@@ -175,6 +176,7 @@ struct kernfs_open_file {
        /* published fields */
        struct kernfs_node      *kn;
        struct file             *file;
+       struct seq_file         *seq_file;
        void                    *priv;
 
        /* private fields, do not use outside kernfs proper */
@@ -185,12 +187,20 @@ struct kernfs_open_file {
        char                    *prealloc_buf;
 
        size_t                  atomic_write_len;
-       bool                    mmapped;
+       bool                    mmapped:1;
+       bool                    released:1;
        const struct vm_operations_struct *vm_ops;
 };
 
 struct kernfs_ops {
        /*
+        * Optional open/release methods.  Both are called with
+        * @of->seq_file populated.
+        */
+       int (*open)(struct kernfs_open_file *of);
+       void (*release)(struct kernfs_open_file *of);
+
+       /*
         * Read is handled by either seq_file or raw_read().
         *
         * If seq_show() is present, seq_file path is active.  Other seq
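A hedged sketch of a kernfs user pairing the new optional callbacks
("get_my_object"/"put_my_object" are hypothetical helpers):

        static int my_kf_open(struct kernfs_open_file *of)
        {
                of->priv = get_my_object(of->kn);
                return of->priv ? 0 : -ENODEV;
        }

        static void my_kf_release(struct kernfs_open_file *of)
        {
                put_my_object(of->priv);
        }

        static struct kernfs_ops my_kf_ops = {
                .open           = my_kf_open,
                .release        = my_kf_release,
        };
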
index 16ddfb8..c328e4f 100644 (file)
@@ -29,7 +29,7 @@
  *             <jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
  *             <prasanna@in.ibm.com> added function-return probes.
  */
-#include <linux/compiler.h>    /* for __kprobes */
+#include <linux/compiler.h>
 #include <linux/linkage.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
@@ -40,9 +40,9 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
+#include <asm/kprobes.h>
 
 #ifdef CONFIG_KPROBES
-#include <asm/kprobes.h>
 
 /* kprobe_status settings */
 #define KPROBE_HIT_ACTIVE      0x00000001
@@ -51,6 +51,7 @@
 #define KPROBE_HIT_SSDONE      0x00000008
 
 #else /* CONFIG_KPROBES */
+#include <asm-generic/kprobes.h>
 typedef int kprobe_opcode_t;
 struct arch_specific_insn {
        int dummy;
@@ -509,18 +510,4 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
 }
 #endif
 
-#ifdef CONFIG_KPROBES
-/*
- * Blacklist ganerating macro. Specify functions which is not probed
- * by using this macro.
- */
-#define __NOKPROBE_SYMBOL(fname)                       \
-static unsigned long __used                            \
-       __attribute__((section("_kprobe_blacklist")))   \
-       _kbl_addr_##fname = (unsigned long)fname;
-#define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname)
-#else
-#define NOKPROBE_SYMBOL(fname)
-#endif
-
 #endif /* _LINUX_KPROBES_H */
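With the blacklist macro moved to asm-generic/kprobes.h (pulled in via
asm/kprobes.h), NOKPROBE_SYMBOL() stays available whether or not
CONFIG_KPROBES is set; usage is unchanged (illustrative function):

        static int my_sensitive_helper(void *arg)
        {
                return 0;       /* must never be probed */
        }
        NOKPROBE_SYMBOL(my_sensitive_helper);
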
index 6483a6f..ffb21e7 100644 (file)
 
 /* RTC_CTRL_REG bitfields */
 #define TPS65910_RTC_CTRL_STOP_RTC                     0x01 /*0=stop, 1=run */
+#define TPS65910_RTC_CTRL_AUTO_COMP                    0x04
 #define TPS65910_RTC_CTRL_GET_TIME                     0x40
 
 /* RTC_STATUS_REG bitfields */
index 808751d..4f6d440 100644 (file)
@@ -407,8 +407,27 @@ struct mm_struct {
        unsigned long task_size;                /* size of task vm space */
        unsigned long highest_vm_end;           /* highest vma end address */
        pgd_t * pgd;
-       atomic_t mm_users;                      /* How many users with user space? */
-       atomic_t mm_count;                      /* How many references to "struct mm_struct" (users count as 1) */
+
+       /**
+        * @mm_users: The number of users including userspace.
+        *
+        * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
+        * to 0 (i.e. when the task exits and there are no other temporary
+        * reference holders), we also release a reference on @mm_count
+        * (which may then free the &struct mm_struct if @mm_count also
+        * drops to 0).
+        */
+       atomic_t mm_users;
+
+       /**
+        * @mm_count: The number of references to &struct mm_struct
+        * (@mm_users count as 1).
+        *
+        * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
+        * &struct mm_struct is freed.
+        */
+       atomic_t mm_count;
+
        atomic_long_t nr_ptes;                  /* PTE page table pages */
 #if CONFIG_PGTABLE_LEVELS > 2
        atomic_long_t nr_pmds;                  /* PMD page table pages */
index 7b3d487..b532ce5 100644 (file)
@@ -14,7 +14,7 @@
  * @DevId - Chip Device ID
  * @qinfo - pointer to qinfo records describing the chip
  * @numchips - number of chips including virtual RWW partitions
- * @chipshift - Chip/partiton size 2^chipshift
+ * @chipshift - Chip/partition size 2^chipshift
  * @chips - per-chip data structure
  */
 struct lpddr_private {
index 23705a5..298ead5 100644 (file)
@@ -191,10 +191,10 @@ pid_t pid_vnr(struct pid *pid);
 #define do_each_pid_thread(pid, type, task)                            \
        do_each_pid_task(pid, type, task) {                             \
                struct task_struct *tg___ = task;                       \
-               do {
+               for_each_thread(tg___, task) {
 
 #define while_each_pid_thread(pid, type, task)                         \
-               } while_each_thread(tg___, task);                       \
+               }                                                       \
                task = tg___;                                           \
        } while_each_pid_task(pid, type, task)
 #endif /* _LINUX_PID_H */
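The caller-side pattern is unchanged by the for_each_thread() rewrite; a
minimal sketch, assuming "pid" is a valid struct pid * and the walk runs
under RCU:

        struct task_struct *task;

        rcu_read_lock();
        do_each_pid_thread(pid, PIDTYPE_PID, task) {
                /* inspect one thread of the process */
        } while_each_pid_thread(pid, PIDTYPE_PID, task);
        rcu_read_unlock();
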
diff --git a/include/linux/platform_data/rtc-m48t86.h b/include/linux/platform_data/rtc-m48t86.h
deleted file mode 100644 (file)
index 915d6b4..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * ST M48T86 / Dallas DS12887 RTC driver
- * Copyright (c) 2006 Tower Technologies
- *
- * Author: Alessandro Zummo <a.zummo@towertech.it>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-struct m48t86_ops
-{
-       void (*writebyte)(unsigned char value, unsigned long addr);
-       unsigned char (*readbyte)(unsigned long addr);
-};
index 52bda85..3e57350 100644 (file)
 #define _LINUX_RADIX_TREE_H
 
 #include <linux/bitops.h>
-#include <linux/preempt.h>
-#include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/preempt.h>
 #include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
 
 /*
  * The bottom two bits of the slot determine how the remaining bits in the
@@ -94,7 +96,7 @@ struct radix_tree_node {
        unsigned char   count;          /* Total entry count */
        unsigned char   exceptional;    /* Exceptional entry count */
        struct radix_tree_node *parent;         /* Used when ascending tree */
-       void *private_data;                     /* For tree user */
+       struct radix_tree_root *root;           /* The tree we belong to */
        union {
                struct list_head private_list;  /* For tree user */
                struct rcu_head rcu_head;       /* Used when freeing node */
@@ -103,7 +105,10 @@ struct radix_tree_node {
        unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
-/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
+/* The top bits of gfp_mask are used to store the root tags and the IDR flag */
+#define ROOT_IS_IDR    ((__force gfp_t)(1 << __GFP_BITS_SHIFT))
+#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT + 1)
+
 struct radix_tree_root {
        gfp_t                   gfp_mask;
        struct radix_tree_node  __rcu *rnode;
@@ -123,7 +128,7 @@ do {                                                                        \
        (root)->rnode = NULL;                                           \
 } while (0)
 
-static inline bool radix_tree_empty(struct radix_tree_root *root)
+static inline bool radix_tree_empty(const struct radix_tree_root *root)
 {
        return root->rnode == NULL;
 }
@@ -216,10 +221,8 @@ static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
  */
 
 /**
- * radix_tree_deref_slot       - dereference a slot
- * @pslot:     pointer to slot, returned by radix_tree_lookup_slot
- * Returns:    item that was stored in that slot with any direct pointer flag
- *             removed.
+ * radix_tree_deref_slot - dereference a slot
+ * @slot: slot pointer, returned by radix_tree_lookup_slot
  *
  * For use with radix_tree_lookup_slot().  Caller must hold tree at least read
  * locked across slot lookup and dereference. Not required if write lock is
@@ -227,26 +230,27 @@ static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
  *
  * radix_tree_deref_retry must be used to confirm validity of the pointer if
  * only the read lock is held.
+ *
+ * Return: entry stored in that slot.
  */
-static inline void *radix_tree_deref_slot(void **pslot)
+static inline void *radix_tree_deref_slot(void __rcu **slot)
 {
-       return rcu_dereference(*pslot);
+       return rcu_dereference(*slot);
 }
 
 /**
- * radix_tree_deref_slot_protected     - dereference a slot without RCU lock but with tree lock held
- * @pslot:     pointer to slot, returned by radix_tree_lookup_slot
- * Returns:    item that was stored in that slot with any direct pointer flag
- *             removed.
- *
- * Similar to radix_tree_deref_slot but only used during migration when a pages
- * mapping is being moved. The caller does not hold the RCU read lock but it
- * must hold the tree lock to prevent parallel updates.
+ * radix_tree_deref_slot_protected - dereference a slot with tree lock held
+ * @slot: slot pointer, returned by radix_tree_lookup_slot
+ *
+ * Similar to radix_tree_deref_slot.  The caller does not hold the RCU read
+ * lock but it must hold the tree lock to prevent parallel updates.
+ *
+ * Return: entry stored in that slot.
  */
-static inline void *radix_tree_deref_slot_protected(void **pslot,
+static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
                                                        spinlock_t *treelock)
 {
-       return rcu_dereference_protected(*pslot, lockdep_is_held(treelock));
+       return rcu_dereference_protected(*slot, lockdep_is_held(treelock));
 }
 
 /**
@@ -282,9 +286,9 @@ static inline int radix_tree_exception(void *arg)
        return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
 }
 
-int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
+int __radix_tree_create(struct radix_tree_root *, unsigned long index,
                        unsigned order, struct radix_tree_node **nodep,
-                       void ***slotp);
+                       void __rcu ***slotp);
 int __radix_tree_insert(struct radix_tree_root *, unsigned long index,
                        unsigned order, void *);
 static inline int radix_tree_insert(struct radix_tree_root *root,
@@ -292,55 +296,56 @@ static inline int radix_tree_insert(struct radix_tree_root *root,
 {
        return __radix_tree_insert(root, index, 0, entry);
 }
-void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
-                         struct radix_tree_node **nodep, void ***slotp);
-void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
-void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
+                         struct radix_tree_node **nodep, void __rcu ***slotp);
+void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
+void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
+                                       unsigned long index);
 typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
-void __radix_tree_replace(struct radix_tree_root *root,
-                         struct radix_tree_node *node,
-                         void **slot, void *item,
+void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
+                         void __rcu **slot, void *entry,
                          radix_tree_update_node_t update_node, void *private);
 void radix_tree_iter_replace(struct radix_tree_root *,
-               const struct radix_tree_iter *, void **slot, void *item);
-void radix_tree_replace_slot(struct radix_tree_root *root,
-                            void **slot, void *item);
-void __radix_tree_delete_node(struct radix_tree_root *root,
-                             struct radix_tree_node *node,
+               const struct radix_tree_iter *, void __rcu **slot, void *entry);
+void radix_tree_replace_slot(struct radix_tree_root *,
+                            void __rcu **slot, void *entry);
+void __radix_tree_delete_node(struct radix_tree_root *,
+                             struct radix_tree_node *,
                              radix_tree_update_node_t update_node,
                              void *private);
+void radix_tree_iter_delete(struct radix_tree_root *,
+                       struct radix_tree_iter *iter, void __rcu **slot);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-void radix_tree_clear_tags(struct radix_tree_root *root,
-                          struct radix_tree_node *node,
-                          void **slot);
-unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
+void radix_tree_clear_tags(struct radix_tree_root *, struct radix_tree_node *,
+                          void __rcu **slot);
+unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
                        void **results, unsigned long first_index,
                        unsigned int max_items);
-unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
-                       void ***results, unsigned long *indices,
+unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
+                       void __rcu ***results, unsigned long *indices,
                        unsigned long first_index, unsigned int max_items);
 int radix_tree_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
 void radix_tree_init(void);
-void *radix_tree_tag_set(struct radix_tree_root *root,
+void *radix_tree_tag_set(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
-void *radix_tree_tag_clear(struct radix_tree_root *root,
+void *radix_tree_tag_clear(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
-int radix_tree_tag_get(struct radix_tree_root *root,
+int radix_tree_tag_get(const struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
-void radix_tree_iter_tag_set(struct radix_tree_root *root,
+void radix_tree_iter_tag_set(struct radix_tree_root *,
+               const struct radix_tree_iter *iter, unsigned int tag);
+void radix_tree_iter_tag_clear(struct radix_tree_root *,
                const struct radix_tree_iter *iter, unsigned int tag);
-unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-               unsigned long first_index, unsigned int max_items,
-               unsigned int tag);
-unsigned int
-radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
-               unsigned long first_index, unsigned int max_items,
-               unsigned int tag);
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
+               void **results, unsigned long first_index,
+               unsigned int max_items, unsigned int tag);
+unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
+               void __rcu ***results, unsigned long first_index,
+               unsigned int max_items, unsigned int tag);
+int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
 {
@@ -352,10 +357,14 @@ int radix_tree_split(struct radix_tree_root *, unsigned long index,
                        unsigned new_order);
 int radix_tree_join(struct radix_tree_root *, unsigned long index,
                        unsigned new_order, void *);
+void __rcu **idr_get_free(struct radix_tree_root *, struct radix_tree_iter *,
+                       gfp_t, int end);
 
-#define RADIX_TREE_ITER_TAG_MASK       0x00FF  /* tag index in lower byte */
-#define RADIX_TREE_ITER_TAGGED         0x0100  /* lookup tagged slots */
-#define RADIX_TREE_ITER_CONTIG         0x0200  /* stop at first hole */
+enum {
+       RADIX_TREE_ITER_TAG_MASK = 0x0f,        /* tag index in lower nybble */
+       RADIX_TREE_ITER_TAGGED   = 0x10,        /* lookup tagged slots */
+       RADIX_TREE_ITER_CONTIG   = 0x20,        /* stop at first hole */
+};
 
 /**
  * radix_tree_iter_init - initialize radix tree iterator
@@ -364,7 +373,7 @@ int radix_tree_join(struct radix_tree_root *, unsigned long index,
  * @start:     iteration starting index
  * Returns:    NULL
  */
-static __always_inline void **
+static __always_inline void __rcu **
 radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
 {
        /*
@@ -393,10 +402,46 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
  * Also it fills @iter with data about chunk: position in the tree (index),
  * its end (next_index), and constructs a bit mask for tagged iterating (tags).
  */
-void **radix_tree_next_chunk(struct radix_tree_root *root,
+void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
                             struct radix_tree_iter *iter, unsigned flags);
 
 /**
+ * radix_tree_iter_lookup - look up an index in the radix tree
+ * @root: radix tree root
+ * @iter: iterator state
+ * @index: key to look up
+ *
+ * If @index is present in the radix tree, this function returns the slot
+ * containing it and updates @iter to describe the entry.  If @index is not
+ * present, it returns NULL.
+ */
+static inline void __rcu **
+radix_tree_iter_lookup(const struct radix_tree_root *root,
+                       struct radix_tree_iter *iter, unsigned long index)
+{
+       radix_tree_iter_init(iter, index);
+       return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);
+}
+
+/**
+ * radix_tree_iter_find - find a present entry
+ * @root: radix tree root
+ * @iter: iterator state
+ * @index: start location
+ *
+ * This function returns the slot containing the entry with the lowest index
+ * which is at least @index.  If @index is larger than any present entry, this
+ * function returns NULL.  The @iter is updated to describe the entry found.
+ */
+static inline void __rcu **
+radix_tree_iter_find(const struct radix_tree_root *root,
+                       struct radix_tree_iter *iter, unsigned long index)
+{
+       radix_tree_iter_init(iter, index);
+       return radix_tree_next_chunk(root, iter, 0);
+}
+
+/**
  * radix_tree_iter_retry - retry this chunk of the iteration
  * @iter:      iterator state
  *
@@ -406,7 +451,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
  * and continue the iteration.
  */
 static inline __must_check
-void **radix_tree_iter_retry(struct radix_tree_iter *iter)
+void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
 {
        iter->next_index = iter->index;
        iter->tags = 0;
@@ -429,7 +474,7 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
  * have been invalidated by an insertion or deletion.  Call this function
  * before releasing the lock to continue the iteration from the next index.
  */
-void **__must_check radix_tree_iter_resume(void **slot,
+void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter);
 
 /**
@@ -445,11 +490,11 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
 }
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
-void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
-                               unsigned flags);
+void __rcu **__radix_tree_next_slot(void __rcu **slot,
+                               struct radix_tree_iter *iter, unsigned flags);
 #else
 /* Can't happen without sibling entries, but the compiler can't tell that */
-static inline void ** __radix_tree_next_slot(void **slot,
+static inline void __rcu **__radix_tree_next_slot(void __rcu **slot,
                                struct radix_tree_iter *iter, unsigned flags)
 {
        return slot;
@@ -475,8 +520,8 @@ static inline void ** __radix_tree_next_slot(void **slot,
  * b) we are doing non-tagged iteration, and iter->index and iter->next_index
  *    have been set up so that radix_tree_chunk_size() returns 1 or 0.
  */
-static __always_inline void **
-radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
+static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
+                               struct radix_tree_iter *iter, unsigned flags)
 {
        if (flags & RADIX_TREE_ITER_TAGGED) {
                iter->tags >>= 1;
@@ -514,7 +559,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
        return NULL;
 
  found:
-       if (unlikely(radix_tree_is_internal_node(*slot)))
+       if (unlikely(radix_tree_is_internal_node(rcu_dereference_raw(*slot))))
                return __radix_tree_next_slot(slot, iter, flags);
        return slot;
 }
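A hedged sketch of the two new iterator helpers documented above ("my_tree"
and "index" are placeholders):

        struct radix_tree_iter iter;
        void __rcu **slot;

        /* exact lookup: NULL unless "index" itself is present */
        slot = radix_tree_iter_lookup(&my_tree, &iter, index);

        /* first present entry at or above "index" */
        slot = radix_tree_iter_find(&my_tree, &iter, index);
        if (slot)
                pr_debug("entry found at index %lu\n", iter.index);
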
index 600aadf..0023fee 100644 (file)
@@ -1,54 +1,10 @@
 #ifndef _LINUX_REFCOUNT_H
 #define _LINUX_REFCOUNT_H
 
-/*
- * Variant of atomic_t specialized for reference counts.
- *
- * The interface matches the atomic_t interface (to aid in porting) but only
- * provides the few functions one should use for reference counting.
- *
- * It differs in that the counter saturates at UINT_MAX and will not move once
- * there. This avoids wrapping the counter and causing 'spurious'
- * use-after-free issues.
- *
- * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
- * and provide only what is strictly required for refcounts.
- *
- * The increments are fully relaxed; these will not provide ordering. The
- * rationale is that whatever is used to obtain the object we're increasing the
- * reference count on will provide the ordering. For locked data structures,
- * its the lock acquire, for RCU/lockless data structures its the dependent
- * load.
- *
- * Do note that inc_not_zero() provides a control dependency which will order
- * future stores against the inc, this ensures we'll never modify the object
- * if we did not in fact acquire a reference.
- *
- * The decrements will provide release order, such that all the prior loads and
- * stores will be issued before, it also provides a control dependency, which
- * will order us against the subsequent free().
- *
- * The control dependency is against the load of the cmpxchg (ll/sc) that
- * succeeded. This means the stores aren't fully ordered, but this is fine
- * because the 1->0 transition indicates no concurrency.
- *
- * Note that the allocator is responsible for ordering things between free()
- * and alloc().
- *
- */
-
 #include <linux/atomic.h>
-#include <linux/bug.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-
-#ifdef CONFIG_DEBUG_REFCOUNT
-#define REFCOUNT_WARN(cond, str) WARN_ON(cond)
-#define __refcount_check       __must_check
-#else
-#define REFCOUNT_WARN(cond, str) (void)(cond)
-#define __refcount_check
-#endif
+#include <linux/kernel.h>
 
 typedef struct refcount_struct {
        atomic_t refs;
@@ -66,229 +22,21 @@ static inline unsigned int refcount_read(const refcount_t *r)
        return atomic_read(&r->refs);
 }
 
-static inline __refcount_check
-bool refcount_add_not_zero(unsigned int i, refcount_t *r)
-{
-       unsigned int old, new, val = atomic_read(&r->refs);
-
-       for (;;) {
-               if (!val)
-                       return false;
-
-               if (unlikely(val == UINT_MAX))
-                       return true;
-
-               new = val + i;
-               if (new < val)
-                       new = UINT_MAX;
-               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
-
-       return true;
-}
-
-static inline void refcount_add(unsigned int i, refcount_t *r)
-{
-       REFCOUNT_WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
-}
-
-/*
- * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
- *
- * Provides no memory ordering, it is assumed the caller has guaranteed the
- * object memory to be stable (RCU, etc.). It does provide a control dependency
- * and thereby orders future stores. See the comment on top.
- */
-static inline __refcount_check
-bool refcount_inc_not_zero(refcount_t *r)
-{
-       unsigned int old, new, val = atomic_read(&r->refs);
-
-       for (;;) {
-               new = val + 1;
-
-               if (!val)
-                       return false;
-
-               if (unlikely(!new))
-                       return true;
-
-               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
-
-       return true;
-}
-
-/*
- * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
- *
- * Provides no memory ordering, it is assumed the caller already has a
- * reference on the object, will WARN when this is not so.
- */
-static inline void refcount_inc(refcount_t *r)
-{
-       REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
-}
-
-/*
- * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
- * decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_sub_and_test(unsigned int i, refcount_t *r)
-{
-       unsigned int old, new, val = atomic_read(&r->refs);
-
-       for (;;) {
-               if (unlikely(val == UINT_MAX))
-                       return false;
-
-               new = val - i;
-               if (new > val) {
-                       REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
-                       return false;
-               }
-
-               old = atomic_cmpxchg_release(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       return !new;
-}
-
-static inline __refcount_check
-bool refcount_dec_and_test(refcount_t *r)
-{
-       return refcount_sub_and_test(1, r);
-}
+extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r);
+extern void refcount_add(unsigned int i, refcount_t *r);
 
-/*
- * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
- * when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before.
- */
-static inline
-void refcount_dec(refcount_t *r)
-{
-       REFCOUNT_WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
-}
-
-/*
- * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
- * success thereof.
- *
- * Like all decrement operations, it provides release memory order and provides
- * a control dependency.
- *
- * It can be used like a try-delete operator; this explicit case is provided
- * and not cmpxchg in generic, because that would allow implementing unsafe
- * operations.
- */
-static inline __refcount_check
-bool refcount_dec_if_one(refcount_t *r)
-{
-       return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
-}
-
-/*
- * No atomic_t counterpart, it decrements unless the value is 1, in which case
- * it will return false.
- *
- * Was often done like: atomic_add_unless(&var, -1, 1)
- */
-static inline __refcount_check
-bool refcount_dec_not_one(refcount_t *r)
-{
-       unsigned int old, new, val = atomic_read(&r->refs);
+extern __must_check bool refcount_inc_not_zero(refcount_t *r);
+extern void refcount_inc(refcount_t *r);
 
-       for (;;) {
-               if (unlikely(val == UINT_MAX))
-                       return true;
+extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
+extern void refcount_sub(unsigned int i, refcount_t *r);
 
-               if (val == 1)
-                       return false;
+extern __must_check bool refcount_dec_and_test(refcount_t *r);
+extern void refcount_dec(refcount_t *r);
 
-               new = val - 1;
-               if (new > val) {
-                       REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
-                       return true;
-               }
-
-               old = atomic_cmpxchg_release(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
-
-       return true;
-}
-
-/*
- * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
- * to decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
-{
-       if (refcount_dec_not_one(r))
-               return false;
-
-       mutex_lock(lock);
-       if (!refcount_dec_and_test(r)) {
-               mutex_unlock(lock);
-               return false;
-       }
-
-       return true;
-}
-
-/*
- * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
- * decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
-{
-       if (refcount_dec_not_one(r))
-               return false;
-
-       spin_lock(lock);
-       if (!refcount_dec_and_test(r)) {
-               spin_unlock(lock);
-               return false;
-       }
-
-       return true;
-}
+extern __must_check bool refcount_dec_if_one(refcount_t *r);
+extern __must_check bool refcount_dec_not_one(refcount_t *r);
+extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
+extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
 
 #endif /* _LINUX_REFCOUNT_H */
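The out-of-line declarations keep the usual refcount_t life cycle; a minimal
sketch (hypothetical "my_obj", initialized elsewhere with
refcount_set(&obj->ref, 1)):

        struct my_obj {
                refcount_t ref;
                /* ... payload ... */
        };

        static void my_obj_get(struct my_obj *obj)
        {
                refcount_inc(&obj->ref);        /* WARNs on increment from 0 */
        }

        static void my_obj_put(struct my_obj *obj)
        {
                if (refcount_dec_and_test(&obj->ref))
                        kfree(obj);
        }
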
diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h
new file mode 100644 (file)
index 0000000..ea05f6c
--- /dev/null
@@ -0,0 +1,23 @@
+/*
+ * rodata_test.h: functional test for mark_rodata_ro function
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifndef _RODATA_TEST_H
+#define _RODATA_TEST_H
+
+#ifdef CONFIG_DEBUG_RODATA_TEST
+extern const int rodata_test_data;
+void rodata_test(void);
+#else
+static inline void rodata_test(void) {}
+#endif
+
+#endif /* _RODATA_TEST_H */
index 451e241..4a28deb 100644 (file)
@@ -2904,6 +2904,28 @@ static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
  */
 extern struct mm_struct * mm_alloc(void);
 
+/**
+ * mmgrab() - Pin a &struct mm_struct.
+ * @mm: The &struct mm_struct to pin.
+ *
+ * Make sure that @mm will not get freed even after the owning task
+ * exits. This doesn't guarantee that the associated address space
+ * will still exist later on and mmget_not_zero() has to be used before
+ * accessing it.
+ *
+ * This is the preferred way to pin @mm for a longer/unbounded amount
+ * of time.
+ *
+ * Use mmdrop() to release the reference acquired by mmgrab().
+ *
+ * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * of &mm_struct.mm_count vs &mm_struct.mm_users.
+ */
+static inline void mmgrab(struct mm_struct *mm)
+{
+       atomic_inc(&mm->mm_count);
+}
+
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
 static inline void mmdrop(struct mm_struct *mm)
@@ -2926,6 +2948,27 @@ static inline void mmdrop_async(struct mm_struct *mm)
        }
 }
 
+/**
+ * mmget() - Pin the address space associated with a &struct mm_struct.
+ * @mm: The address space to pin.
+ *
+ * Make sure that the address space of the given &struct mm_struct doesn't
+ * go away. This does not protect against parts of the address space being
+ * modified or freed, however.
+ *
+ * Never use this function to pin this address space for an
+ * unbounded/indefinite amount of time.
+ *
+ * Use mmput() to release the reference acquired by mmget().
+ *
+ * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * of &mm_struct.mm_count vs &mm_struct.mm_users.
+ */
+static inline void mmget(struct mm_struct *mm)
+{
+       atomic_inc(&mm->mm_users);
+}
+
 static inline bool mmget_not_zero(struct mm_struct *mm)
 {
        return atomic_inc_not_zero(&mm->mm_users);
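Together with mmgrab()/mmdrop() above, the documented two-counter discipline
reads, as a hedged sketch (assumes "mm" was obtained from a task):

        mmgrab(mm);                     /* mm_count: struct stays allocated */
        if (mmget_not_zero(mm)) {       /* mm_users: address space is live */
                /* ... access the address space ... */
                mmput(mm);
        }
        mmdrop(mm);
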
index d0efd6e..4fc222f 100644 (file)
@@ -21,7 +21,7 @@ struct sem_array {
        struct list_head        list_id;        /* undo requests on this array */
        int                     sem_nsems;      /* no. of semaphores in array */
        int                     complex_count;  /* pending complex operations */
-       bool                    complex_mode;   /* no parallel simple ops */
+       unsigned int            use_global_lock;/* >0: global lock required */
 };
 
 #ifdef CONFIG_SYSVIPC
index 3f22932..f4199e7 100644 (file)
@@ -7,7 +7,7 @@ struct mtd_partition;
  * struct flash_platform_data: board-specific flash data
  * @name: optional flash device name (eg, as used with mtdparts=)
  * @parts: optional array of mtd_partitions for static partitioning
- * @nr_parts: number of mtd_partitions for static partitoning
+ * @nr_parts: number of mtd_partitions for static partitioning
  * @type: optional flash device type (e.g. m25p80 vs m25p64), for use
  *     with chips that can't be queried for JEDEC or other IDs
  *
index 8a511c0..20d157a 100644 (file)
@@ -204,8 +204,11 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
        kref_put(&h->ref, cd->cache_put);
 }
 
-static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
+static inline bool cache_is_expired(struct cache_detail *detail, struct cache_head *h)
 {
+       if (!test_bit(CACHE_VALID, &h->flags))
+               return false;
+
        return  (h->expiry_time < seconds_since_boot()) ||
                (detail->flush_time >= h->last_refresh);
 }
@@ -227,6 +230,7 @@ extern void sunrpc_destroy_cache_detail(struct cache_detail *cd);
 extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *,
                                        umode_t, struct cache_detail *);
 extern void sunrpc_cache_unregister_pipefs(struct cache_detail *);
+extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);
 
 /* Must store cache_detail in seq_file->private if using next three functions */
 extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
index cfda6ad..245fc59 100644 (file)
@@ -110,6 +110,15 @@ struct rpcrdma_msg {
 };
 
 /*
+ * XDR sizes, in quads
+ */
+enum {
+       rpcrdma_fixed_maxsz     = 4,
+       rpcrdma_segment_maxsz   = 4,
+       rpcrdma_readchunk_maxsz = 2 + rpcrdma_segment_maxsz,
+};
+
+/*
  * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks
  */
 #define RPCRDMA_HDRLEN_MIN     (sizeof(__be32) * 7)
index 7321ae9..e770abe 100644 (file)
@@ -400,10 +400,14 @@ struct svc_version {
        struct svc_procedure *  vs_proc;        /* per-procedure info */
        u32                     vs_xdrsize;     /* xdrsize needed for this version */
 
-       unsigned int            vs_hidden : 1,  /* Don't register with portmapper.
-                                                * Only used for nfsacl so far. */
-                               vs_rpcb_optnl:1;/* Don't care the result of register.
-                                                * Only used for nfsv4. */
+       /* Don't register with rpcbind */
+       bool                    vs_hidden;
+
+       /* Don't care if the rpcbind registration fails */
+       bool                    vs_rpcb_optnl;
+
+       /* Need xprt with congestion control */
+       bool                    vs_need_cong_ctrl;
 
        /* Override dispatch function (e.g. when caching replies).
         * A return value of 0 means drop the request. 
index 757fb96..b105f73 100644 (file)
@@ -70,7 +70,7 @@ extern atomic_t rdma_stat_sq_prod;
  * completes.
  */
 struct svc_rdma_op_ctxt {
-       struct list_head free;
+       struct list_head list;
        struct svc_rdma_op_ctxt *read_hdr;
        struct svc_rdma_fastreg_mr *frmr;
        int hdr_count;
@@ -78,7 +78,6 @@ struct svc_rdma_op_ctxt {
        struct ib_cqe cqe;
        struct ib_cqe reg_cqe;
        struct ib_cqe inv_cqe;
-       struct list_head dto_q;
        u32 byte_len;
        u32 position;
        struct svcxprt_rdma *xprt;
@@ -141,7 +140,8 @@ struct svcxprt_rdma {
        atomic_t             sc_sq_avail;       /* SQEs ready to be consumed */
        unsigned int         sc_sq_depth;       /* Depth of SQ */
        unsigned int         sc_rq_depth;       /* Depth of RQ */
-       u32                  sc_max_requests;   /* Forward credits */
+       __be32               sc_fc_credits;     /* Forward credits */
+       u32                  sc_max_requests;   /* Max requests */
        u32                  sc_max_bc_requests;/* Backward credits */
        int                  sc_max_req_size;   /* Size of each RQ WR buf */
 
@@ -171,7 +171,6 @@ struct svcxprt_rdma {
 
        wait_queue_head_t    sc_send_wait;      /* SQ exhaustion waitlist */
        unsigned long        sc_flags;
-       struct list_head     sc_dto_q;          /* DTO tasklet I/O pending Q */
        struct list_head     sc_read_complete_q;
        struct work_struct   sc_work;
 };
@@ -214,11 +213,7 @@ extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
 extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
 extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
                                            __be32, __be64, u32);
-extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
-                                            struct rpcrdma_msg *,
-                                            struct rpcrdma_msg *,
-                                            enum rpcrdma_proc);
-extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
+extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
 
 /* svc_rdma_recvfrom.c */
 extern int svc_rdma_recvfrom(struct svc_rqst *);
index 7440290..ddb7f94 100644 (file)
@@ -67,6 +67,7 @@ struct svc_xprt {
 #define XPT_CACHE_AUTH 11              /* cache auth info */
 #define XPT_LOCAL      12              /* connection from loopback interface */
 #define XPT_KILL_TEMP   13             /* call xpo_kill_temp_xprt before closing */
+#define XPT_CONG_CTRL  14              /* has congestion control */
 
        struct svc_serv         *xpt_server;    /* service for transport */
        atomic_t                xpt_reserved;   /* space on outq that is rsvd */
index a26cc43..bde063c 100644 (file)
@@ -106,9 +106,9 @@ struct work_struct {
 #endif
 };
 
-#define WORK_DATA_INIT()       ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL)
+#define WORK_DATA_INIT()       ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
 #define WORK_DATA_STATIC_INIT()        \
-       ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)
+       ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
 
 struct delayed_work {
        struct work_struct work;
index e1006b3..bee1404 100644 (file)
@@ -174,10 +174,10 @@ typedef void (*v4l2_ctrl_notify_fnc)(struct v4l2_ctrl *ctrl, void *priv);
  *             not freed when the control is deleted. Should this be needed
  *             then a new internal bitfield can be added to tell the framework
  *             to free this pointer.
- * @p_cur:     The control's current value represented via an union with
+ * @p_cur:     The control's current value represented via a union which
  *             provides a standard way of accessing control types
  *             through a pointer.
- * @p_new:     The control's new value represented via an union with provides
+ * @p_new:     The control's new value represented via a union which provides
  *             a standard way of accessing control types
  *             through a pointer.
  */
index c92dc03..ead1aa6 100644 (file)
@@ -1948,7 +1948,7 @@ struct cfg80211_deauth_request {
  * struct cfg80211_disassoc_request - Disassociation request data
  *
  * This structure provides information needed to complete IEEE 802.11
- * disassocation.
+ * disassociation.
  *
  * @bss: the BSS to disassociate from
  * @ie: Extra IEs to add to Disassociation frame or %NULL
index b9a08cd..a3bab3c 100644 (file)
@@ -3392,7 +3392,7 @@ enum ieee80211_reconfig_type {
  *     since there won't be any time to beacon before the switch anyway.
  * @pre_channel_switch: This is an optional callback that is called
  *     before a channel switch procedure is started (ie. when a STA
- *     gets a CSA or an userspace initiated channel-switch), allowing
+ *     gets a CSA or a userspace initiated channel-switch), allowing
  *     the driver to prepare for the channel switch.
  * @post_channel_switch: This is an optional callback that is called
  *     after a channel switch procedure is completed, allowing the
index d84849c..0f1813c 100644 (file)
@@ -60,6 +60,7 @@
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup_rdma.h>
 
 extern struct workqueue_struct *ib_wq;
 extern struct workqueue_struct *ib_comp_wq;
@@ -1356,6 +1357,12 @@ struct ib_fmr_attr {
 
 struct ib_umem;
 
+struct ib_rdmacg_object {
+#ifdef CONFIG_CGROUP_RDMA
+       struct rdma_cgroup      *cg;            /* owner rdma cgroup */
+#endif
+};
+
 struct ib_ucontext {
        struct ib_device       *device;
        struct list_head        pd_list;
@@ -1388,6 +1395,8 @@ struct ib_ucontext {
        struct list_head        no_private_counters;
        int                     odp_mrs_count;
 #endif
+
+       struct ib_rdmacg_object cg_obj;
 };
 
 struct ib_uobject {
@@ -1395,6 +1404,7 @@ struct ib_uobject {
        struct ib_ucontext     *context;        /* associated user context */
        void                   *object;         /* containing object */
        struct list_head        list;           /* link to context's list */
+       struct ib_rdmacg_object cg_obj;         /* rdmacg object */
        int                     id;             /* index into kernel idr */
        struct kref             ref;
        struct rw_semaphore     mutex;          /* protects .live */
@@ -2128,6 +2138,10 @@ struct ib_device {
        struct attribute_group       *hw_stats_ag;
        struct rdma_hw_stats         *hw_stats;
 
+#ifdef CONFIG_CGROUP_RDMA
+       struct rdmacg_device         cg_device;
+#endif
+
        /**
         * The following mandatory functions are used only at device
         * registration.  Keep functions such as these at the end of this
index 021ed33..744b3d0 100644 (file)
@@ -113,17 +113,13 @@ struct autofs_dev_ioctl {
 
 static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
 {
-       memset(in, 0, sizeof(struct autofs_dev_ioctl));
+       memset(in, 0, AUTOFS_DEV_IOCTL_SIZE);
        in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
-       in->size = sizeof(struct autofs_dev_ioctl);
+       in->size = AUTOFS_DEV_IOCTL_SIZE;
        in->ioctlfd = -1;
 }
 
-/*
- * If you change this make sure you make the corresponding change
- * to autofs-dev-ioctl.c:lookup_ioctl()
- */
 enum {
        /* Get various version info */
        AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71,
@@ -160,8 +156,6 @@ enum {
        AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD,
 };
 
-#define AUTOFS_IOCTL 0x93
-
 #define AUTOFS_DEV_IOCTL_VERSION \
        _IOWR(AUTOFS_IOCTL, \
              AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl)
index 1bfc3ed..aa63451 100644 (file)
@@ -61,12 +61,23 @@ struct autofs_packet_expire {
        char name[NAME_MAX+1];
 };
 
-#define AUTOFS_IOC_READY      _IO(0x93, 0x60)
-#define AUTOFS_IOC_FAIL       _IO(0x93, 0x61)
-#define AUTOFS_IOC_CATATONIC  _IO(0x93, 0x62)
-#define AUTOFS_IOC_PROTOVER   _IOR(0x93, 0x63, int)
-#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93, 0x64, compat_ulong_t)
-#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93, 0x64, unsigned long)
-#define AUTOFS_IOC_EXPIRE     _IOR(0x93, 0x65, struct autofs_packet_expire)
+#define AUTOFS_IOCTL 0x93
+
+enum {
+       AUTOFS_IOC_READY_CMD = 0x60,
+       AUTOFS_IOC_FAIL_CMD,
+       AUTOFS_IOC_CATATONIC_CMD,
+       AUTOFS_IOC_PROTOVER_CMD,
+       AUTOFS_IOC_SETTIMEOUT_CMD,
+       AUTOFS_IOC_EXPIRE_CMD,
+};
+
+#define AUTOFS_IOC_READY        _IO(AUTOFS_IOCTL, AUTOFS_IOC_READY_CMD)
+#define AUTOFS_IOC_FAIL         _IO(AUTOFS_IOCTL, AUTOFS_IOC_FAIL_CMD)
+#define AUTOFS_IOC_CATATONIC    _IO(AUTOFS_IOCTL, AUTOFS_IOC_CATATONIC_CMD)
+#define AUTOFS_IOC_PROTOVER     _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOVER_CMD, int)
+#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, compat_ulong_t)
+#define AUTOFS_IOC_SETTIMEOUT   _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, unsigned long)
+#define AUTOFS_IOC_EXPIRE       _IOR(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_CMD, struct autofs_packet_expire)
 
 #endif /* _UAPI_LINUX_AUTO_FS_H */
index 8f8f1bd..7c6da42 100644 (file)
@@ -148,10 +148,16 @@ union autofs_v5_packet_union {
        autofs_packet_expire_direct_t expire_direct;
 };
 
-#define AUTOFS_IOC_EXPIRE_MULTI                _IOW(0x93, 0x66, int)
-#define AUTOFS_IOC_EXPIRE_INDIRECT     AUTOFS_IOC_EXPIRE_MULTI
-#define AUTOFS_IOC_EXPIRE_DIRECT       AUTOFS_IOC_EXPIRE_MULTI
-#define AUTOFS_IOC_PROTOSUBVER         _IOR(0x93, 0x67, int)
-#define AUTOFS_IOC_ASKUMOUNT           _IOR(0x93, 0x70, int)
+enum {
+       AUTOFS_IOC_EXPIRE_MULTI_CMD = 0x66, /* AUTOFS_IOC_EXPIRE_CMD + 1 */
+       AUTOFS_IOC_PROTOSUBVER_CMD,
+       AUTOFS_IOC_ASKUMOUNT_CMD = 0x70, /* AUTOFS_DEV_IOCTL_VERSION_CMD - 1 */
+};
+
+#define AUTOFS_IOC_EXPIRE_MULTI    _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int)
+#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI
+#define AUTOFS_IOC_EXPIRE_DIRECT   AUTOFS_IOC_EXPIRE_MULTI
+#define AUTOFS_IOC_PROTOSUBVER     _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int)
+#define AUTOFS_IOC_ASKUMOUNT       _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int)
 
 #endif /* _LINUX_AUTO_FS4_H */
index 7550e91..c111a91 100644 (file)
@@ -3,7 +3,6 @@
 
 #include <linux/types.h>
 #include <linux/compiler.h>
-#include <linux/sysctl.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 
index 3efc0ca..79da349 100644 (file)
@@ -2,6 +2,7 @@
 #define _UAPI_XT_HASHLIMIT_H
 
 #include <linux/types.h>
+#include <linux/limits.h>
 #include <linux/if.h>
 
 /* timings are in milliseconds. */
index 0df7bd5..c3be256 100644 (file)
@@ -32,7 +32,8 @@
 #define NFSEXP_ASYNC           0x0010
 #define NFSEXP_GATHERED_WRITES 0x0020
 #define NFSEXP_NOREADDIRPLUS    0x0040
-/* 80 100 currently unused */
+#define NFSEXP_SECURITY_LABEL  0x0080
+/* 0x100 currently unused */
 #define NFSEXP_NOHIDE          0x0200
 #define NFSEXP_NOSUBTREECHECK  0x0400
 #define        NFSEXP_NOAUTHNLM        0x0800          /* Don't authenticate NLM requests - just trust */
@@ -53,7 +54,7 @@
 #define NFSEXP_PNFS            0x20000
 
 /* All flags that we claim to support.  (Note we don't support NOACL.) */
-#define NFSEXP_ALLFLAGS                0x3FE7F
+#define NFSEXP_ALLFLAGS                0x3FEFF
 
 /* The flags that may vary depending on security flavor: */
 #define NFSEXP_SECINFO_FLAGS   (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
index 56806bc..7fb7112 100644 (file)
@@ -181,7 +181,7 @@ struct grant_entry_header {
 };
 
 /*
- * Version 2 of the grant entry structure, here is an union because three
+ * Version 2 of the grant entry structure, here is a union because three
  * different types are supported: full_page, sub_page and transitive.
  */
 union grant_entry_v2 {
index 8c39615..a92f27d 100644 (file)
@@ -1078,6 +1078,16 @@ config CGROUP_PIDS
          since the PIDs limit only affects a process's ability to fork, not to
          attach to a cgroup.
 
+config CGROUP_RDMA
+       bool "RDMA controller"
+       help
+         Provides enforcement of RDMA resources defined by the IB stack.
+         It is fairly easy for consumers to exhaust RDMA resources, which
+         can result in resource unavailability for other consumers.
+         The RDMA controller is designed to stop this from happening.
+         Attaching processes with active RDMA resources to the cgroup
+         hierarchy is allowed even if doing so crosses the hierarchy's limit.
+
 config CGROUP_FREEZER
        bool "Freezer controller"
        help
index b32ad7d..981f286 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/dirent.h>
 #include <linux/syscalls.h>
 #include <linux/utime.h>
+#include <linux/file.h>
 
 static ssize_t __init xwrite(int fd, const char *p, size_t count)
 {
@@ -647,6 +648,7 @@ static int __init populate_rootfs(void)
                        printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
                free_initrd();
 #endif
+               flush_delayed_fput();
                /*
                 * Try loading default modules from initramfs.  This gives
                 * us a chance to load before device_initcalls.
index 24ea487..ae9f200 100644 (file)
@@ -71,7 +71,6 @@
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/perf_event.h>
-#include <linux/file.h>
 #include <linux/ptrace.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
@@ -83,6 +82,7 @@
 #include <linux/proc_ns.h>
 #include <linux/io.h>
 #include <linux/cache.h>
+#include <linux/rodata_test.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -554,7 +554,7 @@ asmlinkage __visible void __init start_kernel(void)
        if (WARN(!irqs_disabled(),
                 "Interrupts were enabled *very* early, fixing it\n"))
                local_irq_disable();
-       idr_init_cache();
+       radix_tree_init();
 
        /*
         * Allow workqueue creation and work item queueing/cancelling
@@ -569,7 +569,6 @@ asmlinkage __visible void __init start_kernel(void)
        trace_init();
 
        context_tracking_init();
-       radix_tree_init();
        /* init some links before init_ISA_irqs() */
        early_irq_init();
        init_IRQ();
@@ -936,9 +935,10 @@ __setup("rodata=", set_debug_rodata);
 #ifdef CONFIG_STRICT_KERNEL_RWX
 static void mark_readonly(void)
 {
-       if (rodata_enabled)
+       if (rodata_enabled) {
                mark_rodata_ro();
-       else
+               rodata_test();
+       } else
                pr_info("Kernel memory protection disabled.\n");
 }
 #else
@@ -960,8 +960,6 @@ static int __ref kernel_init(void *unused)
        system_state = SYSTEM_RUNNING;
        numa_default_policy();
 
-       flush_delayed_fput();
-
        rcu_end_inkernel_boot();
 
        if (ramdisk_execute_command) {
index 7a2d8f0..4fdd970 100644 (file)
@@ -558,6 +558,7 @@ static void wq_add(struct mqueue_inode_info *info, int sr,
  */
 static int wq_sleep(struct mqueue_inode_info *info, int sr,
                    ktime_t *timeout, struct ext_wait_queue *ewp)
+       __releases(&info->lock)
 {
        int retval;
        signed long time;
index 3ec5742..e468cd1 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -159,22 +159,42 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #define SEMOPM_FAST    64  /* ~ 372 bytes on stack */
 
 /*
+ * Switching from the mode suitable for simple ops
+ * to the mode for complex ops is costly. Therefore:
+ * use some hysteresis
+ */
+#define USE_GLOBAL_LOCK_HYSTERESIS     10
+
+/*
  * Locking:
  * a) global sem_lock() for read/write
  *     sem_undo.id_next,
  *     sem_array.complex_count,
- *     sem_array.complex_mode
  *     sem_array.pending{_alter,_const},
  *     sem_array.sem_undo
  *
  * b) global or semaphore sem_lock() for read/write:
  *     sem_array.sem_base[i].pending_{const,alter}:
- *     sem_array.complex_mode (for read)
  *
  * c) special:
  *     sem_undo_list.list_proc:
  *     * undo_list->lock for write
  *     * rcu for read
+ *     use_global_lock:
+ *     * global sem_lock() for write
+ *     * either local or global sem_lock() for read.
+ *
+ * Memory ordering:
+ * Most ordering is enforced by using spin_lock() and spin_unlock().
+ * The special case is use_global_lock:
+ * Setting it from non-zero to 0 is a RELEASE, this is ensured by
+ * using smp_store_release().
+ * Testing if it is non-zero is an ACQUIRE, this is ensured by using
+ * smp_load_acquire().
+ * Setting it from 0 to non-zero must be ordered with regard to
+ * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
+ * is inside a spin_lock() and after a write from 0 to non-zero a
+ * spin_lock()+spin_unlock() is done.
  */
 
 #define sc_semmsl      sem_ctls[0]
@@ -273,29 +293,22 @@ static void complexmode_enter(struct sem_array *sma)
        int i;
        struct sem *sem;
 
-       if (sma->complex_mode)  {
-               /* We are already in complex_mode. Nothing to do */
+       if (sma->use_global_lock > 0)  {
+               /*
+                * We are already in global lock mode.
+                * Nothing to do, just reset the
+                * counter until we return to simple mode.
+                */
+               sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
                return;
        }
-
-       /* We need a full barrier after seting complex_mode:
-        * The write to complex_mode must be visible
-        * before we read the first sem->lock spinlock state.
-        */
-       smp_store_mb(sma->complex_mode, true);
+       sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 
        for (i = 0; i < sma->sem_nsems; i++) {
                sem = sma->sem_base + i;
-               spin_unlock_wait(&sem->lock);
+               spin_lock(&sem->lock);
+               spin_unlock(&sem->lock);
        }
-       /*
-        * spin_unlock_wait() is not a memory barriers, it is only a
-        * control barrier. The code must pair with spin_unlock(&sem->lock),
-        * thus just the control barrier is insufficient.
-        *
-        * smp_rmb() is sufficient, as writes cannot pass the control barrier.
-        */
-       smp_rmb();
 }
 
 /*
@@ -310,13 +323,17 @@ static void complexmode_tryleave(struct sem_array *sma)
                 */
                return;
        }
-       /*
-        * Immediately after setting complex_mode to false,
-        * a simple op can start. Thus: all memory writes
-        * performed by the current operation must be visible
-        * before we set complex_mode to false.
-        */
-       smp_store_release(&sma->complex_mode, false);
+       if (sma->use_global_lock == 1) {
+               /*
+                * Immediately after setting use_global_lock to 0,
+                * a simple op can start. Thus: all memory writes
+                * performed by the current operation must be visible
+                * before we set use_global_lock to 0.
+                */
+               smp_store_release(&sma->use_global_lock, 0);
+       } else {
+               sma->use_global_lock--;
+       }
 }
 
 #define SEM_GLOBAL_LOCK        (-1)
@@ -346,30 +363,23 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
         * Optimized locking is possible if no complex operation
         * is either enqueued or processed right now.
         *
-        * Both facts are tracked by complex_mode.
+        * Both facts are tracked by use_global_lock.
         */
        sem = sma->sem_base + sops->sem_num;
 
        /*
-        * Initial check for complex_mode. Just an optimization,
+        * Initial check for use_global_lock. Just an optimization,
         * no locking, no memory barrier.
         */
-       if (!sma->complex_mode) {
+       if (!sma->use_global_lock) {
                /*
                 * It appears that no complex operation is around.
                 * Acquire the per-semaphore lock.
                 */
                spin_lock(&sem->lock);
 
-               /*
-                * See 51d7d5205d33
-                * ("powerpc: Add smp_mb() to arch_spin_is_locked()"):
-                * A full barrier is required: the write of sem->lock
-                * must be visible before the read is executed
-                */
-               smp_mb();
-
-               if (!smp_load_acquire(&sma->complex_mode)) {
+               /* pairs with smp_store_release() */
+               if (!smp_load_acquire(&sma->use_global_lock)) {
                        /* fast path successful! */
                        return sops->sem_num;
                }
@@ -379,19 +389,26 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
        /* slow path: acquire the full lock */
        ipc_lock_object(&sma->sem_perm);
 
-       if (sma->complex_count == 0) {
-               /* False alarm:
-                * There is no complex operation, thus we can switch
-                * back to the fast path.
+       if (sma->use_global_lock == 0) {
+               /*
+                * The use_global_lock mode ended while we waited for
+                * sma->sem_perm.lock. Thus we must switch to locking
+                * with sem->lock.
+                * Unlike in the fast path, there is no need to recheck
+                * sma->use_global_lock after we have acquired sem->lock:
+                * We own sma->sem_perm.lock, thus use_global_lock cannot
+                * change.
                 */
                spin_lock(&sem->lock);
+
                ipc_unlock_object(&sma->sem_perm);
                return sops->sem_num;
        } else {
-               /* Not a false alarm, thus complete the sequence for a
-                * full lock.
+               /*
+                * Not a false alarm, thus continue to use the global lock
+                * mode. No need for complexmode_enter(), this was done by
+                * the caller that has set use_global_lock to non-zero.
                 */
-               complexmode_enter(sma);
                return SEM_GLOBAL_LOCK;
        }
 }
@@ -495,7 +512,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
        }
 
        sma->complex_count = 0;
-       sma->complex_mode = true; /* dropped by sem_unlock below */
+       sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
        INIT_LIST_HEAD(&sma->pending_alter);
        INIT_LIST_HEAD(&sma->pending_const);
        INIT_LIST_HEAD(&sma->list_id);
index d7805ac..06ea9ef 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1091,8 +1091,8 @@ out_unlock1:
  * "raddr" thing points to kernel space, and there has to be a wrapper around
  * this.
  */
-long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
-             unsigned long shmlba)
+long do_shmat(int shmid, char __user *shmaddr, int shmflg,
+             ulong *raddr, unsigned long shmlba)
 {
        struct shmid_kernel *shp;
        unsigned long addr;
@@ -1113,8 +1113,13 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
                goto out;
        else if ((addr = (ulong)shmaddr)) {
                if (addr & (shmlba - 1)) {
-                       if (shmflg & SHM_RND)
-                               addr &= ~(shmlba - 1);     /* round down */
+                       /*
+                        * Round down to the nearest multiple of shmlba.
+                        * For sane do_mmap_pgoff() parameters, avoid
+                        * rounding down to the nil page with MAP_FIXED.
+                        */
+                       if ((shmflg & SHM_RND) && addr >= shmlba)
+                               addr &= ~(shmlba - 1);
                        else
 #ifndef __ARCH_FORCE_SHMLBA
                                if (addr & ~PAGE_MASK)
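The rounding itself is plain power-of-two masking; a small userspace illustration (the SHMLBA value is assumed here, it is architecture-defined):

#include <stdio.h>

#define SHMLBA 0x4000UL	/* assumed for illustration */

int main(void)
{
	unsigned long hi = 0x12345, lo = 0x345;

	/* 0x12345 rounds down to 0x10000. */
	printf("%#lx -> %#lx\n", hi, hi & ~(SHMLBA - 1));
	/* 0x345 < SHMLBA rounds down to 0, i.e. the nil page, which
	 * do_mmap_pgoff() would then map because of MAP_FIXED; the new
	 * "addr >= shmlba" check refuses exactly this case. */
	printf("%#lx -> %#lx\n", lo, lo & ~(SHMLBA - 1));
	return 0;
}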
index 12c679f..b302b47 100644 (file)
@@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
-obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUPS) += cgroup/
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
new file mode 100644 (file)
index 0000000..387348a
--- /dev/null
@@ -0,0 +1,6 @@
+obj-y := cgroup.o namespace.o cgroup-v1.o
+
+obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644 (file)
index 0000000..9203bfb
--- /dev/null
@@ -0,0 +1,214 @@
+#ifndef __CGROUP_INTERNAL_H
+#define __CGROUP_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/kernfs.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+       /* the cgroup and css_set this link associates */
+       struct cgroup           *cgrp;
+       struct css_set          *cset;
+
+       /* list of cgrp_cset_links anchored at cgrp->cset_links */
+       struct list_head        cset_link;
+
+       /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+       struct list_head        cgrp_link;
+};
+
+/* used to track tasks and csets during migration */
+struct cgroup_taskset {
+       /* the src and dst cset list running through cset->mg_node */
+       struct list_head        src_csets;
+       struct list_head        dst_csets;
+
+       /* the subsys currently being processed */
+       int                     ssid;
+
+       /*
+        * Fields for cgroup_taskset_*() iteration.
+        *
+        * Before migration is committed, the target migration tasks are on
+        * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
+        * the csets on ->dst_csets.  ->csets points to either ->src_csets
+        * or ->dst_csets depending on whether migration is committed.
+        *
+        * ->cur_cset and ->cur_task point to the current task position
+        * during iteration.
+        */
+       struct list_head        *csets;
+       struct css_set          *cur_cset;
+       struct task_struct      *cur_task;
+};
+
+/* migration context also tracks preloading */
+struct cgroup_mgctx {
+       /*
+        * Preloaded source and destination csets.  Used to guarantee
+        * atomic success or failure on actual migration.
+        */
+       struct list_head        preloaded_src_csets;
+       struct list_head        preloaded_dst_csets;
+
+       /* tasks and csets to migrate */
+       struct cgroup_taskset   tset;
+
+       /* subsystems affected by migration */
+       u16                     ss_mask;
+};
+
+#define CGROUP_TASKSET_INIT(tset)                                              \
+{                                                                              \
+       .src_csets              = LIST_HEAD_INIT(tset.src_csets),               \
+       .dst_csets              = LIST_HEAD_INIT(tset.dst_csets),               \
+       .csets                  = &tset.src_csets,                              \
+}
+
+#define CGROUP_MGCTX_INIT(name)                                                        \
+{                                                                              \
+       LIST_HEAD_INIT(name.preloaded_src_csets),                               \
+       LIST_HEAD_INIT(name.preloaded_dst_csets),                               \
+       CGROUP_TASKSET_INIT(name.tset),                                         \
+}
+
+#define DEFINE_CGROUP_MGCTX(name)                                              \
+       struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
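DEFINE_CGROUP_MGCTX() yields an on-stack, fully initialised migration context; cgroup_transfer_tasks() in cgroup-v1.c below is a complete in-tree example of the add_src/prepare_dst/migrate/finish sequence built on top of it.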
+
+struct cgroup_sb_opts {
+       u16 subsys_mask;
+       unsigned int flags;
+       char *release_agent;
+       bool cpuset_clone_children;
+       char *name;
+       /* User explicitly requested empty subsystem */
+       bool none;
+};
+
+extern struct mutex cgroup_mutex;
+extern spinlock_t css_set_lock;
+extern struct cgroup_subsys *cgroup_subsys[];
+extern struct list_head cgroup_roots;
+extern struct file_system_type cgroup_fs_type;
+
+/* iterate across the hierarchies */
+#define for_each_root(root)                                            \
+       list_for_each_entry((root), &cgroup_roots, root_list)
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid)                                      \
+       for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&                \
+            (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
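proc_cgroupstats_show() in cgroup-v1.c below is a straightforward user of this iterator, walking every enabled subsystem under cgroup_mutex.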
+
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+       return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline bool notify_on_release(const struct cgroup *cgrp)
+{
+       return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+void put_css_set_locked(struct css_set *cset);
+
+static inline void put_css_set(struct css_set *cset)
+{
+       unsigned long flags;
+
+       /*
+        * Ensure that the refcount doesn't hit zero while any readers
+        * can see it. Similar to atomic_dec_and_lock(), but for a
+        * spinlock.
+        */
+       if (atomic_add_unless(&cset->refcount, -1, 1))
+               return;
+
+       spin_lock_irqsave(&css_set_lock, flags);
+       put_css_set_locked(cset);
+       spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+       atomic_inc(&cset->refcount);
+}
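The put side is the classic dec-and-lock pattern: drop references lock-free while more than one remains, and take the lock only for the final put. A hedged userspace sketch of the same pattern, with C11 atomics and a pthread mutex standing in for the kernel primitives:

#include <pthread.h>
#include <stdatomic.h>

struct obj { _Atomic int refcount; };

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;

/* Drop a reference; take the lock only for the final put. */
static void obj_put(struct obj *o)
{
	int old = atomic_load_explicit(&o->refcount, memory_order_relaxed);

	/* Fast path: decrement lock-free while we are not the last user
	 * (cf. atomic_add_unless(&cset->refcount, -1, 1) above). */
	while (old > 1)
		if (atomic_compare_exchange_weak(&o->refcount, &old, old - 1))
			return;

	/* Slow path: last reference.  Drop it under the lock so no
	 * reader can observe the object while it is being torn down. */
	pthread_mutex_lock(&obj_lock);
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		;	/* actual teardown would go here */
	pthread_mutex_unlock(&obj_lock);
}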
+
+bool cgroup_ssid_enabled(int ssid);
+bool cgroup_on_dfl(const struct cgroup *cgrp);
+
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+                                    struct cgroup_root *root);
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
+void cgroup_kn_unlock(struct kernfs_node *kn);
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+                         struct cgroup_namespace *ns);
+
+void cgroup_free_root(struct cgroup_root *root);
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+                              struct cgroup_root *root, unsigned long magic,
+                              struct cgroup_namespace *ns);
+
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
+void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
+                           struct cgroup_mgctx *mgctx);
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+                  struct cgroup_mgctx *mgctx);
+
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+                      bool threadgroup);
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+                            size_t nbytes, loff_t off, bool threadgroup);
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+                          loff_t off);
+
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
+int cgroup_rmdir(struct kernfs_node *kn);
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+                    struct kernfs_root *kf_root);
+
+/*
+ * namespace.c
+ */
+extern const struct proc_ns_operations cgroupns_operations;
+
+/*
+ * cgroup-v1.c
+ */
+extern struct cftype cgroup1_base_files[];
+extern const struct file_operations proc_cgroupstats_operations;
+extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+
+bool cgroup1_ssid_disabled(int ssid);
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
+void cgroup1_release_agent(struct work_struct *work);
+void cgroup1_check_for_release(struct cgroup *cgrp);
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+                            void *data, unsigned long magic,
+                            struct cgroup_namespace *ns);
+
+#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644 (file)
index 0000000..fc34bcf
--- /dev/null
@@ -0,0 +1,1395 @@
+#include "cgroup-internal.h"
+
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/sort.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/delayacct.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroupstats.h>
+
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed.  The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls.
+ * Expiring in the middle is a performance problem, not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY   HZ
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
+ * Protects cgroup_root->release_agent_path.  Modifying it also requires
+ * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+bool cgroup1_ssid_disabled(int ssid)
+{
+       return cgroup_no_v1_mask & (1 << ssid);
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+       struct cgroup_root *root;
+       int retval = 0;
+
+       mutex_lock(&cgroup_mutex);
+       percpu_down_write(&cgroup_threadgroup_rwsem);
+       for_each_root(root) {
+               struct cgroup *from_cgrp;
+
+               if (root == &cgrp_dfl_root)
+                       continue;
+
+               spin_lock_irq(&css_set_lock);
+               from_cgrp = task_cgroup_from_root(from, root);
+               spin_unlock_irq(&css_set_lock);
+
+               retval = cgroup_attach_task(from_cgrp, tsk, false);
+               if (retval)
+                       break;
+       }
+       percpu_up_write(&cgroup_threadgroup_rwsem);
+       mutex_unlock(&cgroup_mutex);
+
+       return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/**
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup.  No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+       DEFINE_CGROUP_MGCTX(mgctx);
+       struct cgrp_cset_link *link;
+       struct css_task_iter it;
+       struct task_struct *task;
+       int ret;
+
+       if (cgroup_on_dfl(to))
+               return -EINVAL;
+
+       if (!cgroup_may_migrate_to(to))
+               return -EBUSY;
+
+       mutex_lock(&cgroup_mutex);
+
+       percpu_down_write(&cgroup_threadgroup_rwsem);
+
+       /* all tasks in @from are being moved, all csets are source */
+       spin_lock_irq(&css_set_lock);
+       list_for_each_entry(link, &from->cset_links, cset_link)
+               cgroup_migrate_add_src(link->cset, to, &mgctx);
+       spin_unlock_irq(&css_set_lock);
+
+       ret = cgroup_migrate_prepare_dst(&mgctx);
+       if (ret)
+               goto out_err;
+
+       /*
+        * Migrate tasks one-by-one until @from is empty.  This fails iff
+        * ->can_attach() fails.
+        */
+       do {
+               css_task_iter_start(&from->self, &it);
+               task = css_task_iter_next(&it);
+               if (task)
+                       get_task_struct(task);
+               css_task_iter_end(&it);
+
+               if (task) {
+                       ret = cgroup_migrate(task, false, &mgctx);
+                       if (!ret)
+                               trace_cgroup_transfer_tasks(to, task, false);
+                       put_task_struct(task);
+               }
+       } while (task && !ret);
+out_err:
+       cgroup_migrate_finish(&mgctx);
+       percpu_up_write(&cgroup_threadgroup_rwsem);
+       mutex_unlock(&cgroup_mutex);
+       return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+       CGROUP_FILE_PROCS,
+       CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+       /*
+        * used to find which pidlist is wanted. doesn't change as long as
+        * this particular list stays in the list.
+        */
+       struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+       /* array of xids */
+       pid_t *list;
+       /* how many elements the above list has */
+       int length;
+       /* each of these stored in a list by its cgroup */
+       struct list_head links;
+       /* pointer to the cgroup we belong to, for list removal purposes */
+       struct cgroup *owner;
+       /* for delayed destruction */
+       struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+       if (PIDLIST_TOO_LARGE(count))
+               return vmalloc(count * sizeof(pid_t));
+       else
+               return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+
+static void pidlist_free(void *p)
+{
+       kvfree(p);
+}
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
+ * should be left afterwards.
+ */
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
+{
+       struct cgroup_pidlist *l, *tmp_l;
+
+       mutex_lock(&cgrp->pidlist_mutex);
+       list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+               mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+       mutex_unlock(&cgrp->pidlist_mutex);
+
+       flush_workqueue(cgroup_pidlist_destroy_wq);
+       BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+                                               destroy_dwork);
+       struct cgroup_pidlist *tofree = NULL;
+
+       mutex_lock(&l->owner->pidlist_mutex);
+
+       /*
+        * Destroy iff we didn't get queued again.  The state won't change
+        * as destroy_dwork can only be queued while locked.
+        */
+       if (!delayed_work_pending(dwork)) {
+               list_del(&l->links);
+               pidlist_free(l->list);
+               put_pid_ns(l->key.ns);
+               tofree = l;
+       }
+
+       mutex_unlock(&l->owner->pidlist_mutex);
+       kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+       int src, dest = 1;
+
+       /*
+        * we presume the 0th element is unique, so src starts at 1. trivial
+        * edge cases first; no work needs to be done for either
+        */
+       if (length == 0 || length == 1)
+               return length;
+       /* src and dest walk down the list; dest counts unique elements */
+       for (src = 1; src < length; src++) {
+               /* find next unique element */
+               while (list[src] == list[src-1]) {
+                       src++;
+                       if (src == length)
+                               goto after;
+               }
+               /* dest always points to where the next unique element goes */
+               list[dest] = list[src];
+               dest++;
+       }
+after:
+       return dest;
+}
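For example, given the sorted input {1, 1, 2, 4, 4, 4, 7}, pidlist_uniq() returns 4 and leaves {1, 2, 4, 7} in the first four slots; the tail of the array is simply ignored.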
+
+/*
+ * The two pid files - tasks and cgroup.procs - guarantee that the result
+ * is sorted, which forced this whole pidlist fiasco.  As pid order is
+ * different per namespace, each namespace needs a differently sorted list,
+ * making it impossible to use, for example, a single rbtree of member tasks
+ * sorted by task pointer.  As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement a shared pool of
+ * pidlists keyed by cgroup and namespace.
+ */
+static int cmppid(const void *a, const void *b)
+{
+       return *(pid_t *)a - *(pid_t *)b;
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+                                                 enum cgroup_filetype type)
+{
+       struct cgroup_pidlist *l;
+       /* don't need task_nsproxy() if we're looking at ourself */
+       struct pid_namespace *ns = task_active_pid_ns(current);
+
+       lockdep_assert_held(&cgrp->pidlist_mutex);
+
+       list_for_each_entry(l, &cgrp->pidlists, links)
+               if (l->key.type == type && l->key.ns == ns)
+                       return l;
+       return NULL;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+                                               enum cgroup_filetype type)
+{
+       struct cgroup_pidlist *l;
+
+       lockdep_assert_held(&cgrp->pidlist_mutex);
+
+       l = cgroup_pidlist_find(cgrp, type);
+       if (l)
+               return l;
+
+       /* entry not found; create a new one */
+       l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+       if (!l)
+               return l;
+
+       INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+       l->key.type = type;
+       /* don't need task_nsproxy() if we're looking at ourself */
+       l->key.ns = get_pid_ns(task_active_pid_ns(current));
+       l->owner = cgrp;
+       list_add(&l->links, &cgrp->pidlists);
+       return l;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.  The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+       int count = 0;
+       struct cgrp_cset_link *link;
+
+       spin_lock_irq(&css_set_lock);
+       list_for_each_entry(link, &cgrp->cset_links, cset_link)
+               count += atomic_read(&link->cset->refcount);
+       spin_unlock_irq(&css_set_lock);
+       return count;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+                             struct cgroup_pidlist **lp)
+{
+       pid_t *array;
+       int length;
+       int pid, n = 0; /* used for populating the array */
+       struct css_task_iter it;
+       struct task_struct *tsk;
+       struct cgroup_pidlist *l;
+
+       lockdep_assert_held(&cgrp->pidlist_mutex);
+
+       /*
+        * If cgroup gets more users after we read count, we won't have
+        * enough space - tough.  This race is indistinguishable to the
+        * caller from the case that the additional cgroup users didn't
+        * show up until sometime later on.
+        */
+       length = cgroup_task_count(cgrp);
+       array = pidlist_allocate(length);
+       if (!array)
+               return -ENOMEM;
+       /* now, populate the array */
+       css_task_iter_start(&cgrp->self, &it);
+       while ((tsk = css_task_iter_next(&it))) {
+               if (unlikely(n == length))
+                       break;
+               /* get tgid or pid for procs or tasks file respectively */
+               if (type == CGROUP_FILE_PROCS)
+                       pid = task_tgid_vnr(tsk);
+               else
+                       pid = task_pid_vnr(tsk);
+               if (pid > 0) /* make sure to only use valid results */
+                       array[n++] = pid;
+       }
+       css_task_iter_end(&it);
+       length = n;
+       /* now sort & (if procs) strip out duplicates */
+       sort(array, length, sizeof(pid_t), cmppid, NULL);
+       if (type == CGROUP_FILE_PROCS)
+               length = pidlist_uniq(array, length);
+
+       l = cgroup_pidlist_find_create(cgrp, type);
+       if (!l) {
+               pidlist_free(array);
+               return -ENOMEM;
+       }
+
+       /* store array, freeing old if necessary */
+       pidlist_free(l->list);
+       l->list = array;
+       l->length = length;
+       *lp = l;
+       return 0;
+}
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+       /*
+        * Initially we receive a position value that corresponds to
+        * one more than the last pid shown (or 0 on the first call or
+        * after a seek to the start). Use a binary-search to find the
+        * next pid to display, if any
+        */
+       struct kernfs_open_file *of = s->private;
+       struct cgroup *cgrp = seq_css(s)->cgroup;
+       struct cgroup_pidlist *l;
+       enum cgroup_filetype type = seq_cft(s)->private;
+       int index = 0, pid = *pos;
+       int *iter, ret;
+
+       mutex_lock(&cgrp->pidlist_mutex);
+
+       /*
+        * !NULL @of->priv indicates that this isn't the first start()
+        * after open.  If the matching pidlist is around, we can use that.
+        * Look for it.  Note that @of->priv can't be used directly.  It
+        * could already have been destroyed.
+        */
+       if (of->priv)
+               of->priv = cgroup_pidlist_find(cgrp, type);
+
+       /*
+        * Either this is the first start() after open or the matching
+        * pidlist has been destroyed in between.  Create a new one.
+        */
+       if (!of->priv) {
+               ret = pidlist_array_load(cgrp, type,
+                                        (struct cgroup_pidlist **)&of->priv);
+               if (ret)
+                       return ERR_PTR(ret);
+       }
+       l = of->priv;
+
+       if (pid) {
+               int end = l->length;
+
+               while (index < end) {
+                       int mid = (index + end) / 2;
+                       if (l->list[mid] == pid) {
+                               index = mid;
+                               break;
+                       } else if (l->list[mid] <= pid)
+                               index = mid + 1;
+                       else
+                               end = mid;
+               }
+       }
+       /* If we're off the end of the array, we're done */
+       if (index >= l->length)
+               return NULL;
+       /* Update the abstract position to be the actual pid that we found */
+       iter = l->list + index;
+       *pos = *iter;
+       return iter;
+}
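Concretely: with l->list = {3, 5, 9} and *pos = 6 (one past the last pid shown, 5), the binary search narrows index to 2, iteration resumes at pid 9, and *pos is rewritten to 9.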
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+       struct kernfs_open_file *of = s->private;
+       struct cgroup_pidlist *l = of->priv;
+
+       if (l)
+               mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+                                CGROUP_PIDLIST_DESTROY_DELAY);
+       mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct kernfs_open_file *of = s->private;
+       struct cgroup_pidlist *l = of->priv;
+       pid_t *p = v;
+       pid_t *end = l->list + l->length;
+       /*
+        * Advance to the next pid in the array. If this goes off the
+        * end, we're done
+        */
+       p++;
+       if (p >= end) {
+               return NULL;
+       } else {
+               *pos = *p;
+               return p;
+       }
+}
+
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+       seq_printf(s, "%d\n", *(int *)v);
+
+       return 0;
+}
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+                                 char *buf, size_t nbytes, loff_t off)
+{
+       return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+                                         char *buf, size_t nbytes, loff_t off)
+{
+       struct cgroup *cgrp;
+
+       BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENODEV;
+       spin_lock(&release_agent_path_lock);
+       strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+               sizeof(cgrp->root->release_agent_path));
+       spin_unlock(&release_agent_path_lock);
+       cgroup_kn_unlock(of->kn);
+       return nbytes;
+}
+
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+       spin_lock(&release_agent_path_lock);
+       seq_puts(seq, cgrp->root->release_agent_path);
+       spin_unlock(&release_agent_path_lock);
+       seq_putc(seq, '\n');
+       return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+       seq_puts(seq, "0\n");
+       return 0;
+}
+
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+                                        struct cftype *cft)
+{
+       return notify_on_release(css->cgroup);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+                                         struct cftype *cft, u64 val)
+{
+       if (val)
+               set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+       else
+               clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+       return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+                                     struct cftype *cft)
+{
+       return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+                                      struct cftype *cft, u64 val)
+{
+       if (val)
+               set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+       else
+               clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+       return 0;
+}
+
+/* cgroup core interface files for the legacy hierarchies */
+struct cftype cgroup1_base_files[] = {
+       {
+               .name = "cgroup.procs",
+               .seq_start = cgroup_pidlist_start,
+               .seq_next = cgroup_pidlist_next,
+               .seq_stop = cgroup_pidlist_stop,
+               .seq_show = cgroup_pidlist_show,
+               .private = CGROUP_FILE_PROCS,
+               .write = cgroup_procs_write,
+       },
+       {
+               .name = "cgroup.clone_children",
+               .read_u64 = cgroup_clone_children_read,
+               .write_u64 = cgroup_clone_children_write,
+       },
+       {
+               .name = "cgroup.sane_behavior",
+               .flags = CFTYPE_ONLY_ON_ROOT,
+               .seq_show = cgroup_sane_behavior_show,
+       },
+       {
+               .name = "tasks",
+               .seq_start = cgroup_pidlist_start,
+               .seq_next = cgroup_pidlist_next,
+               .seq_stop = cgroup_pidlist_stop,
+               .seq_show = cgroup_pidlist_show,
+               .private = CGROUP_FILE_TASKS,
+               .write = cgroup_tasks_write,
+       },
+       {
+               .name = "notify_on_release",
+               .read_u64 = cgroup_read_notify_on_release,
+               .write_u64 = cgroup_write_notify_on_release,
+       },
+       {
+               .name = "release_agent",
+               .flags = CFTYPE_ONLY_ON_ROOT,
+               .seq_show = cgroup_release_agent_show,
+               .write = cgroup_release_agent_write,
+               .max_write_len = PATH_MAX - 1,
+       },
+       { }     /* terminate */
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+       struct cgroup_subsys *ss;
+       int i;
+
+       seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+       /*
+        * ideally we don't want subsystems moving around while we do this.
+        * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+        * subsys/hierarchy state.
+        */
+       mutex_lock(&cgroup_mutex);
+
+       for_each_subsys(ss, i)
+               seq_printf(m, "%s\t%d\t%d\t%d\n",
+                          ss->legacy_name, ss->root->hierarchy_id,
+                          atomic_read(&ss->root->nr_cgrps),
+                          cgroup_ssid_enabled(i));
+
+       mutex_unlock(&cgroup_mutex);
+       return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+const struct file_operations proc_cgroupstats_operations = {
+       .open = cgroupstats_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+       struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+       struct cgroup *cgrp;
+       struct css_task_iter it;
+       struct task_struct *tsk;
+
+       /* the kernfs_node should belong to cgroupfs and be a directory */
+       if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+           kernfs_type(kn) != KERNFS_DIR)
+               return -EINVAL;
+
+       mutex_lock(&cgroup_mutex);
+
+       /*
+        * We aren't being called from kernfs and there's no guarantee on
+        * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
+        * @kn->priv is RCU safe.  Let's do the RCU dancing.
+        */
+       rcu_read_lock();
+       cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+       if (!cgrp || cgroup_is_dead(cgrp)) {
+               rcu_read_unlock();
+               mutex_unlock(&cgroup_mutex);
+               return -ENOENT;
+       }
+       rcu_read_unlock();
+
+       css_task_iter_start(&cgrp->self, &it);
+       while ((tsk = css_task_iter_next(&it))) {
+               switch (tsk->state) {
+               case TASK_RUNNING:
+                       stats->nr_running++;
+                       break;
+               case TASK_INTERRUPTIBLE:
+                       stats->nr_sleeping++;
+                       break;
+               case TASK_UNINTERRUPTIBLE:
+                       stats->nr_uninterruptible++;
+                       break;
+               case TASK_STOPPED:
+                       stats->nr_stopped++;
+                       break;
+               default:
+                       if (delayacct_is_task_waiting_on_io(tsk))
+                               stats->nr_io_wait++;
+                       break;
+               }
+       }
+       css_task_iter_end(&it);
+
+       mutex_unlock(&cgroup_mutex);
+       return 0;
+}
+
+void cgroup1_check_for_release(struct cgroup *cgrp)
+{
+       if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
+           !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+               schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence.  Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d.  The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task.  We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+void cgroup1_release_agent(struct work_struct *work)
+{
+       struct cgroup *cgrp =
+               container_of(work, struct cgroup, release_agent_work);
+       char *pathbuf = NULL, *agentbuf = NULL;
+       char *argv[3], *envp[3];
+       int ret;
+
+       mutex_lock(&cgroup_mutex);
+
+       pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+       agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+       if (!pathbuf || !agentbuf)
+               goto out;
+
+       spin_lock_irq(&css_set_lock);
+       ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+       spin_unlock_irq(&css_set_lock);
+       if (ret < 0 || ret >= PATH_MAX)
+               goto out;
+
+       argv[0] = agentbuf;
+       argv[1] = pathbuf;
+       argv[2] = NULL;
+
+       /* minimal command environment */
+       envp[0] = "HOME=/";
+       envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+       envp[2] = NULL;
+
+       mutex_unlock(&cgroup_mutex);
+       call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+       goto out_free;
+out:
+       mutex_unlock(&cgroup_mutex);
+out_free:
+       kfree(agentbuf);
+       kfree(pathbuf);
+}
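The agent is therefore invoked as "$release_agent /path/relative/to/cgroup/root" with only HOME and PATH in its environment; as the comment above notes, a typical agent will simply rmdir the reported path.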
+
+/*
+ * cgroup1_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+                         const char *new_name_str)
+{
+       struct cgroup *cgrp = kn->priv;
+       int ret;
+
+       if (kernfs_type(kn) != KERNFS_DIR)
+               return -ENOTDIR;
+       if (kn->parent != new_parent)
+               return -EIO;
+
+       /*
+        * We're gonna grab cgroup_mutex which nests outside kernfs
+        * active_ref.  kernfs_rename() doesn't require active_ref
+        * protection.  Break them before grabbing cgroup_mutex.
+        */
+       kernfs_break_active_protection(new_parent);
+       kernfs_break_active_protection(kn);
+
+       mutex_lock(&cgroup_mutex);
+
+       ret = kernfs_rename(kn, new_parent, new_name_str);
+       if (!ret)
+               trace_cgroup_rename(cgrp);
+
+       mutex_unlock(&cgroup_mutex);
+
+       kernfs_unbreak_active_protection(kn);
+       kernfs_unbreak_active_protection(new_parent);
+       return ret;
+}
+
+static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+       struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+       struct cgroup_subsys *ss;
+       int ssid;
+
+       for_each_subsys(ss, ssid)
+               if (root->subsys_mask & (1 << ssid))
+                       seq_show_option(seq, ss->legacy_name, NULL);
+       if (root->flags & CGRP_ROOT_NOPREFIX)
+               seq_puts(seq, ",noprefix");
+       if (root->flags & CGRP_ROOT_XATTR)
+               seq_puts(seq, ",xattr");
+
+       spin_lock(&release_agent_path_lock);
+       if (strlen(root->release_agent_path))
+               seq_show_option(seq, "release_agent",
+                               root->release_agent_path);
+       spin_unlock(&release_agent_path_lock);
+
+       if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+               seq_puts(seq, ",clone_children");
+       if (strlen(root->name))
+               seq_show_option(seq, "name", root->name);
+       return 0;
+}
+
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
+{
+       char *token, *o = data;
+       bool all_ss = false, one_ss = false;
+       u16 mask = U16_MAX;
+       struct cgroup_subsys *ss;
+       int nr_opts = 0;
+       int i;
+
+#ifdef CONFIG_CPUSETS
+       mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+
+       memset(opts, 0, sizeof(*opts));
+
+       while ((token = strsep(&o, ",")) != NULL) {
+               nr_opts++;
+
+               if (!*token)
+                       return -EINVAL;
+               if (!strcmp(token, "none")) {
+                       /* Explicitly have no subsystems */
+                       opts->none = true;
+                       continue;
+               }
+               if (!strcmp(token, "all")) {
+                       /* Mutually exclusive option 'all' + subsystem name */
+                       if (one_ss)
+                               return -EINVAL;
+                       all_ss = true;
+                       continue;
+               }
+               if (!strcmp(token, "noprefix")) {
+                       opts->flags |= CGRP_ROOT_NOPREFIX;
+                       continue;
+               }
+               if (!strcmp(token, "clone_children")) {
+                       opts->cpuset_clone_children = true;
+                       continue;
+               }
+               if (!strcmp(token, "xattr")) {
+                       opts->flags |= CGRP_ROOT_XATTR;
+                       continue;
+               }
+               if (!strncmp(token, "release_agent=", 14)) {
+                       /* Specifying two release agents is forbidden */
+                       if (opts->release_agent)
+                               return -EINVAL;
+                       opts->release_agent =
+                               kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
+                       if (!opts->release_agent)
+                               return -ENOMEM;
+                       continue;
+               }
+               if (!strncmp(token, "name=", 5)) {
+                       const char *name = token + 5;
+                       /* Can't specify an empty name */
+                       if (!strlen(name))
+                               return -EINVAL;
+                       /* Must match [\w.-]+ */
+                       for (i = 0; i < strlen(name); i++) {
+                               char c = name[i];
+                               if (isalnum(c))
+                                       continue;
+                               if ((c == '.') || (c == '-') || (c == '_'))
+                                       continue;
+                               return -EINVAL;
+                       }
+                       /* Specifying two names is forbidden */
+                       if (opts->name)
+                               return -EINVAL;
+                       opts->name = kstrndup(name,
+                                             MAX_CGROUP_ROOT_NAMELEN - 1,
+                                             GFP_KERNEL);
+                       if (!opts->name)
+                               return -ENOMEM;
+
+                       continue;
+               }
+
+               for_each_subsys(ss, i) {
+                       if (strcmp(token, ss->legacy_name))
+                               continue;
+                       if (!cgroup_ssid_enabled(i))
+                               continue;
+                       if (cgroup1_ssid_disabled(i))
+                               continue;
+
+                       /* Mutually exclusive option 'all' + subsystem name */
+                       if (all_ss)
+                               return -EINVAL;
+                       opts->subsys_mask |= (1 << i);
+                       one_ss = true;
+
+                       break;
+               }
+               if (i == CGROUP_SUBSYS_COUNT)
+                       return -ENOENT;
+       }
+
+       /*
+        * If the 'all' option was specified, select all the subsystems;
+        * otherwise, if none of 'none', 'name=' or a subsystem name was
+        * specified, default to 'all'.
+        */
+       if (all_ss || (!one_ss && !opts->none && !opts->name))
+               for_each_subsys(ss, i)
+                       if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+                               opts->subsys_mask |= (1 << i);
+
+       /*
+        * We either have to specify by name or by subsystems. (So all
+        * empty hierarchies must have a name).
+        */
+       if (!opts->subsys_mask && !opts->name)
+               return -EINVAL;
+
+       /*
+        * Option noprefix was introduced just for backward compatibility
+        * with the old cpuset, so we allow noprefix only if mounting just
+        * the cpuset subsystem.
+        */
+       if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+               return -EINVAL;
+
+       /* Can't specify "none" and some subsystems */
+       if (opts->subsys_mask && opts->none)
+               return -EINVAL;
+
+       return 0;
+}
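From userspace these options arrive as the data string of mount(2). A hedged illustration (the mount point is an example path; "noprefix" passes the mask check above only because cpuset is the sole subsystem requested):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Parsed by the function above into opts.subsys_mask with only
	 * the cpuset bit set, plus opts.flags |= CGRP_ROOT_NOPREFIX. */
	if (mount("cgroup", "/sys/fs/cgroup/cpuset", "cgroup",
		  0, "cpuset,noprefix") != 0)
		perror("mount");
	return 0;
}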
+
+static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+{
+       int ret = 0;
+       struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+       struct cgroup_sb_opts opts;
+       u16 added_mask, removed_mask;
+
+       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+       /* See what subsystems are wanted */
+       ret = parse_cgroupfs_options(data, &opts);
+       if (ret)
+               goto out_unlock;
+
+       if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+               pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+                       task_tgid_nr(current), current->comm);
+
+       added_mask = opts.subsys_mask & ~root->subsys_mask;
+       removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+       /* Don't allow flags or name to change at remount */
+       if ((opts.flags ^ root->flags) ||
+           (opts.name && strcmp(opts.name, root->name))) {
+               pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+                      opts.flags, opts.name ?: "", root->flags, root->name);
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
+       /* remounting is not allowed for populated hierarchies */
+       if (!list_empty(&root->cgrp.self.children)) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+
+       ret = rebind_subsystems(root, added_mask);
+       if (ret)
+               goto out_unlock;
+
+       WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
+
+       if (opts.release_agent) {
+               spin_lock(&release_agent_path_lock);
+               strcpy(root->release_agent_path, opts.release_agent);
+               spin_unlock(&release_agent_path_lock);
+       }
+
+       trace_cgroup_remount(root);
+
+ out_unlock:
+       kfree(opts.release_agent);
+       kfree(opts.name);
+       mutex_unlock(&cgroup_mutex);
+       return ret;
+}
+
+struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
+       .rename                 = cgroup1_rename,
+       .show_options           = cgroup1_show_options,
+       .remount_fs             = cgroup1_remount,
+       .mkdir                  = cgroup_mkdir,
+       .rmdir                  = cgroup_rmdir,
+       .show_path              = cgroup_show_path,
+};
+
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+                            void *data, unsigned long magic,
+                            struct cgroup_namespace *ns)
+{
+       struct super_block *pinned_sb = NULL;
+       struct cgroup_sb_opts opts;
+       struct cgroup_root *root;
+       struct cgroup_subsys *ss;
+       struct dentry *dentry;
+       int i, ret;
+
+       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+       /* First find the desired set of subsystems */
+       ret = parse_cgroupfs_options(data, &opts);
+       if (ret)
+               goto out_unlock;
+
+       /*
+        * Destruction of cgroup root is asynchronous, so subsystems may
+        * still be dying after the previous unmount.  Let's drain the
+        * dying subsystems.  We just need to ensure that the ones
+        * unmounted previously finish dying and don't care about new ones
+        * starting.  Testing ref liveness is good enough.
+        */
+       for_each_subsys(ss, i) {
+               if (!(opts.subsys_mask & (1 << i)) ||
+                   ss->root == &cgrp_dfl_root)
+                       continue;
+
+               if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+                       mutex_unlock(&cgroup_mutex);
+                       msleep(10);
+                       ret = restart_syscall();
+                       goto out_free;
+               }
+               cgroup_put(&ss->root->cgrp);
+       }
+
+       for_each_root(root) {
+               bool name_match = false;
+
+               if (root == &cgrp_dfl_root)
+                       continue;
+
+               /*
+                * If we asked for a name then it must match.  Also, if
+                * name matches but subsys_mask doesn't, we should fail.
+                * Remember whether name matched.
+                */
+               if (opts.name) {
+                       if (strcmp(opts.name, root->name))
+                               continue;
+                       name_match = true;
+               }
+
+               /*
+                * If we asked for subsystems (or explicitly for no
+                * subsystems) then they must match.
+                */
+               if ((opts.subsys_mask || opts.none) &&
+                   (opts.subsys_mask != root->subsys_mask)) {
+                       if (!name_match)
+                               continue;
+                       ret = -EBUSY;
+                       goto out_unlock;
+               }
+
+               if (root->flags ^ opts.flags)
+                       pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+               /*
+                * We want to reuse @root whose lifetime is governed by its
+                * ->cgrp.  Let's check whether @root is alive and keep it
+                * that way.  As cgroup_kill_sb() can happen anytime, we
+                * want to block it by pinning the sb so that @root doesn't
+                * get killed before mount is complete.
+                *
+                * With the sb pinned, tryget_live can reliably indicate
+                * whether @root can be reused.  If it's being killed,
+                * drain it.  We can use wait_queue for the wait but this
+                * path is super cold.  Let's just sleep a bit and retry.
+                */
+               pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+               if (IS_ERR(pinned_sb) ||
+                   !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+                       mutex_unlock(&cgroup_mutex);
+                       if (!IS_ERR_OR_NULL(pinned_sb))
+                               deactivate_super(pinned_sb);
+                       msleep(10);
+                       ret = restart_syscall();
+                       goto out_free;
+               }
+
+               ret = 0;
+               goto out_unlock;
+       }
+
+       /*
+        * No such thing, create a new one.  name= matching without subsys
+        * specification is allowed for already existing hierarchies but we
+        * can't create new one without subsys specification.
+        */
+       if (!opts.subsys_mask && !opts.none) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
+       /* Hierarchies may only be created in the initial cgroup namespace. */
+       if (ns != &init_cgroup_ns) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
+       root = kzalloc(sizeof(*root), GFP_KERNEL);
+       if (!root) {
+               ret = -ENOMEM;
+               goto out_unlock;
+       }
+
+       init_cgroup_root(root, &opts);
+
+       ret = cgroup_setup_root(root, opts.subsys_mask);
+       if (ret)
+               cgroup_free_root(root);
+
+out_unlock:
+       mutex_unlock(&cgroup_mutex);
+out_free:
+       kfree(opts.release_agent);
+       kfree(opts.name);
+
+       if (ret)
+               return ERR_PTR(ret);
+
+       dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
+                                CGROUP_SUPER_MAGIC, ns);
+
+       /*
+        * If @pinned_sb, we're reusing an existing root and holding an
+        * extra ref on its sb.  Mount is complete.  Put the extra ref.
+        */
+       if (pinned_sb)
+               deactivate_super(pinned_sb);
+
+       return dentry;
+}
+
+static int __init cgroup1_wq_init(void)
+{
+       /*
+        * Used to destroy pidlists; kept separate to serve as the flush
+        * domain.  Cap @max_active at 1 too.
+        */
+       cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+                                                   0, 1);
+       BUG_ON(!cgroup_pidlist_destroy_wq);
+       return 0;
+}
+core_initcall(cgroup1_wq_init);
+
+static int __init cgroup_no_v1(char *str)
+{
+       struct cgroup_subsys *ss;
+       char *token;
+       int i;
+
+       while ((token = strsep(&str, ",")) != NULL) {
+               if (!*token)
+                       continue;
+
+               if (!strcmp(token, "all")) {
+                       cgroup_no_v1_mask = U16_MAX;
+                       break;
+               }
+
+               for_each_subsys(ss, i) {
+                       if (strcmp(token, ss->name) &&
+                           strcmp(token, ss->legacy_name))
+                               continue;
+
+                       cgroup_no_v1_mask |= 1 << i;
+               }
+       }
+       return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
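In practice, booting with cgroup_no_v1=memory,cpu masks those controllers out of v1 mounts, and cgroup_no_v1=all masks every controller. A small userspace rendering of the strsep() parsing above; the four-entry subsystem table is an illustrative subset, not the kernel's real list, and unlike the kernel it does not also match legacy names:

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

static const char *const names[] = { "cpu", "cpuacct", "memory", "pids" };
#define NSUBSYS (sizeof(names) / sizeof(names[0]))

static unsigned int parse_no_v1(char *str)
{
	unsigned int mask = 0;
	char *token;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;
		if (!strcmp(token, "all"))
			return (1u << NSUBSYS) - 1;
		for (unsigned int i = 0; i < NSUBSYS; i++)
			if (!strcmp(token, names[i]))
				mask |= 1u << i;
	}
	return mask;
}

int main(void)
{
	char arg[] = "memory,cpu";

	printf("mask=%#x\n", parse_no_v1(arg));	/* mask=0x5 (cpu | memory) */
	return 0;
}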
+
+
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+       struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+       if (!css)
+               return ERR_PTR(-ENOMEM);
+
+       return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+       kfree(css);
+}
+
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+                               struct cftype *cft)
+{
+       return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+                               struct cftype *cft)
+{
+       return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+                                        struct cftype *cft)
+{
+       u64 count;
+
+       rcu_read_lock();
+       count = atomic_read(&task_css_set(current)->refcount);
+       rcu_read_unlock();
+       return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+       struct cgrp_cset_link *link;
+       struct css_set *cset;
+       char *name_buf;
+
+       name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+       if (!name_buf)
+               return -ENOMEM;
+
+       spin_lock_irq(&css_set_lock);
+       rcu_read_lock();
+       cset = rcu_dereference(current->cgroups);
+       list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+               struct cgroup *c = link->cgrp;
+
+               cgroup_name(c, name_buf, NAME_MAX + 1);
+               seq_printf(seq, "Root %d group %s\n",
+                          c->root->hierarchy_id, name_buf);
+       }
+       rcu_read_unlock();
+       spin_unlock_irq(&css_set_lock);
+       kfree(name_buf);
+       return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+       struct cgroup_subsys_state *css = seq_css(seq);
+       struct cgrp_cset_link *link;
+
+       spin_lock_irq(&css_set_lock);
+       list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+               struct css_set *cset = link->cset;
+               struct task_struct *task;
+               int count = 0;
+
+               seq_printf(seq, "css_set %p\n", cset);
+
+               list_for_each_entry(task, &cset->tasks, cg_list) {
+                       if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+                               goto overflow;
+                       seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+               }
+
+               list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+                       if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+                               goto overflow;
+                       seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+               }
+               continue;
+       overflow:
+               seq_puts(seq, "  ...\n");
+       }
+       spin_unlock_irq(&css_set_lock);
+       return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+       return (!cgroup_is_populated(css->cgroup) &&
+               !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] =  {
+       {
+               .name = "taskcount",
+               .read_u64 = debug_taskcount_read,
+       },
+
+       {
+               .name = "current_css_set",
+               .read_u64 = current_css_set_read,
+       },
+
+       {
+               .name = "current_css_set_refcount",
+               .read_u64 = current_css_set_refcount_read,
+       },
+
+       {
+               .name = "current_css_set_cg_links",
+               .seq_show = current_css_set_cg_links_read,
+       },
+
+       {
+               .name = "cgroup_css_links",
+               .seq_show = cgroup_css_links_read,
+       },
+
+       {
+               .name = "releasable",
+               .read_u64 = releasable_read,
+       },
+
+       { }     /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+       .css_alloc = debug_css_alloc,
+       .css_free = debug_css_free,
+       .legacy_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
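With CONFIG_CGROUP_DEBUG enabled and the controller mounted, e.g. via mount -t cgroup -o debug none /sys/fs/cgroup/debug (the mount point is an assumption), the files declared above read like any other cgroupfs file. A minimal reader; the path and the "debug." prefix are assumptions about a typical v1 mount:

#include <stdio.h>

int main(void)
{
	/* Path assumes the debug controller is mounted at
	 * /sys/fs/cgroup/debug; v1 prefixes file names with the
	 * controller name, hence "debug.taskcount". */
	FILE *f = fopen("/sys/fs/cgroup/debug/debug.taskcount", "r");
	char line[64];

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("tasks in root cgroup: %s", line);
	fclose(f);
	return 0;
}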
similarity index 72%
rename from kernel/cgroup.c
rename to kernel/cgroup/cgroup.c
index 53bbca7..e8f87bf 100644 (file)
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/cgroup.h>
+#include "cgroup-internal.h"
+
 #include <linux/cred.h>
-#include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/init_task.h>
 #include <linux/kernel.h>
-#include <linux/list.h>
 #include <linux/magic.h>
-#include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/string.h>
-#include <linux/sort.h>
-#include <linux/kmod.h>
-#include <linux/delayacct.h>
-#include <linux/cgroupstats.h>
 #include <linux/hashtable.h>
-#include <linux/pid_namespace.h>
 #include <linux/idr.h>
-#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/kthread.h>
-#include <linux/delay.h>
 #include <linux/atomic.h>
 #include <linux/cpuset.h>
 #include <linux/proc_ns.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/cgroup.h>
 
-/*
- * pidlists linger the following amount before being destroyed.  The goal
- * is avoiding frequent destruction in the middle of consecutive read calls
- * Expiring in the middle is a performance problem not a correctness one.
- * 1 sec should be enough.
- */
-#define CGROUP_PIDLIST_DESTROY_DELAY   HZ
-
 #define CGROUP_FILE_NAME_MAX           (MAX_CGROUP_TYPE_NAMELEN +      \
                                         MAX_CFTYPE_NAME + 2)
 
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
-#ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
 DEFINE_SPINLOCK(css_set_lock);
+
+#ifdef CONFIG_PROVE_RCU
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
-#else
-static DEFINE_MUTEX(cgroup_mutex);
-static DEFINE_SPINLOCK(css_set_lock);
 #endif
 
 /*
@@ -110,12 +91,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 
-/*
- * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
- * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
- */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 
 #define cgroup_assert_mutex_or_rcu_locked()                            \
@@ -131,15 +106,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
  */
 static struct workqueue_struct *cgroup_destroy_wq;
 
-/*
- * pidlist destructions need to be flushed on cgroup destruction.  Use a
- * separate workqueue as flush domain.
- */
-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-
 /* generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
-static struct cgroup_subsys *cgroup_subsys[] = {
+struct cgroup_subsys *cgroup_subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
 #undef SUBSYS
@@ -186,18 +155,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  */
 static bool cgrp_dfl_visible;
 
-/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
-
 /* some controllers are not supported in the default hierarchy */
 static u16 cgrp_dfl_inhibit_ss_mask;
 
 /* some controllers are implicitly enabled on the default hierarchy */
-static unsigned long cgrp_dfl_implicit_ss_mask;
+static u16 cgrp_dfl_implicit_ss_mask;
 
 /* The list of hierarchy roots */
-
-static LIST_HEAD(cgroup_roots);
+LIST_HEAD(cgroup_roots);
 static int cgroup_root_count;
 
 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
@@ -213,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
 static u64 css_serial_nr_next = 1;
 
 /*
- * These bitmask flags indicate whether tasks in the fork and exit paths have
- * fork/exit handlers to call. This avoids us having to do extra work in the
- * fork/exit path to check which subsystems have fork/exit callbacks.
+ * These bitmasks identify subsystems with specific features to avoid
+ * having to do iterative checks repeatedly.
  */
 static u16 have_fork_callback __read_mostly;
 static u16 have_exit_callback __read_mostly;
 static u16 have_free_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
@@ -230,15 +195,9 @@ struct cgroup_namespace init_cgroup_ns = {
        .root_cset      = &init_css_set,
 };
 
-/* Ditto for the can_fork callback. */
-static u16 have_canfork_callback __read_mostly;
-
 static struct file_system_type cgroup2_fs_type;
-static struct cftype cgroup_dfl_base_files[];
-static struct cftype cgroup_legacy_base_files[];
+static struct cftype cgroup_base_files[];
 
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_advance(struct css_task_iter *it);
@@ -259,7 +218,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  * is fine for individual subsystems but unsuitable for cgroup core.  This
  * is a slower static_key_enabled() based test indexed by @ssid.
  */
-static bool cgroup_ssid_enabled(int ssid)
+bool cgroup_ssid_enabled(int ssid)
 {
        if (CGROUP_SUBSYS_COUNT == 0)
                return false;
@@ -267,11 +226,6 @@ static bool cgroup_ssid_enabled(int ssid)
        return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
-static bool cgroup_ssid_no_v1(int ssid)
-{
-       return cgroup_no_v1_mask & (1 << ssid);
-}
-
 /**
  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
  * @cgrp: the cgroup of interest
@@ -325,7 +279,7 @@ static bool cgroup_ssid_no_v1(int ssid)
  *
  * - debug: disallowed on the default hierarchy.
  */
-static bool cgroup_on_dfl(const struct cgroup *cgrp)
+bool cgroup_on_dfl(const struct cgroup *cgrp)
 {
        return cgrp->root == &cgrp_dfl_root;
 }
@@ -481,12 +435,6 @@ out_unlock:
        return css;
 }
 
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
-       return !(cgrp->self.flags & CSS_ONLINE);
-}
-
 static void cgroup_get(struct cgroup *cgrp)
 {
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -518,11 +466,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 }
 EXPORT_SYMBOL_GPL(of_css);
 
-static int notify_on_release(const struct cgroup *cgrp)
-{
-       return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-}
-
 /**
  * for_each_css - iterate all css's of a cgroup
  * @css: the iteration cursor
@@ -553,15 +496,6 @@ static int notify_on_release(const struct cgroup *cgrp)
                else
 
 /**
- * for_each_subsys - iterate all enabled cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- */
-#define for_each_subsys(ss, ssid)                                      \
-       for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&                \
-            (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-
-/**
  * do_each_subsys_mask - filter for_each_subsys with a bitmask
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -585,10 +519,6 @@ static int notify_on_release(const struct cgroup *cgrp)
        }                                                               \
 } while (false)
 
-/* iterate across the hierarchies */
-#define for_each_root(root)                                            \
-       list_for_each_entry((root), &cgroup_roots, root_list)
-
 /* iterate over child cgrps, lock should be held throughout iteration */
 #define cgroup_for_each_live_child(child, cgrp)                                \
        list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
@@ -615,29 +545,6 @@ static int notify_on_release(const struct cgroup *cgrp)
                        ;                                               \
                else
 
-static void cgroup_release_agent(struct work_struct *work);
-static void check_for_release(struct cgroup *cgrp);
-
-/*
- * A cgroup can be associated with multiple css_sets as different tasks may
- * belong to different cgroups on different hierarchies.  In the other
- * direction, a css_set is naturally associated with multiple cgroups.
- * This M:N relationship is represented by the following link structure
- * which exists for each association and allows traversing the associations
- * from both sides.
- */
-struct cgrp_cset_link {
-       /* the cgroup and css_set this link associates */
-       struct cgroup           *cgrp;
-       struct css_set          *cset;
-
-       /* list of cgrp_cset_links anchored at cgrp->cset_links */
-       struct list_head        cset_link;
-
-       /* list of cgrp_cset_links anchored at css_set->cgrp_links */
-       struct list_head        cgrp_link;
-};
-
 /*
  * The default css_set - used by init and its children prior to any
  * hierarchies being mounted. It contains a pointer to the root state
@@ -647,12 +554,12 @@ struct cgrp_cset_link {
  */
 struct css_set init_css_set = {
        .refcount               = ATOMIC_INIT(1),
-       .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
+       .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
+       .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
-       .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count       = 1;    /* 1 for init_css_set */
@@ -699,7 +606,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
                if (!trigger)
                        break;
 
-               check_for_release(cgrp);
+               cgroup1_check_for_release(cgrp);
                cgroup_file_notify(&cgrp->events_file);
 
                cgrp = cgroup_parent(cgrp);
@@ -808,7 +715,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
        return key;
 }
 
-static void put_css_set_locked(struct css_set *cset)
+void put_css_set_locked(struct css_set *cset)
 {
        struct cgrp_cset_link *link, *tmp_link;
        struct cgroup_subsys *ss;
@@ -838,31 +745,6 @@ static void put_css_set_locked(struct css_set *cset)
        kfree_rcu(cset, rcu_head);
 }
 
-static void put_css_set(struct css_set *cset)
-{
-       unsigned long flags;
-
-       /*
-        * Ensure that the refcount doesn't hit zero while any readers
-        * can see it. Similar to atomic_dec_and_lock(), but for an
-        * rwlock
-        */
-       if (atomic_add_unless(&cset->refcount, -1, 1))
-               return;
-
-       spin_lock_irqsave(&css_set_lock, flags);
-       put_css_set_locked(cset);
-       spin_unlock_irqrestore(&css_set_lock, flags);
-}
-
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cset)
-{
-       atomic_inc(&cset->refcount);
-}
-
 /**
  * compare_css_sets - helper function for find_existing_css_set().
  * @cset: candidate css_set being tested
@@ -1095,13 +977,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
        }
 
        atomic_set(&cset->refcount, 1);
-       INIT_LIST_HEAD(&cset->cgrp_links);
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
-       INIT_LIST_HEAD(&cset->mg_preload_node);
-       INIT_LIST_HEAD(&cset->mg_node);
        INIT_LIST_HEAD(&cset->task_iters);
        INIT_HLIST_NODE(&cset->hlist);
+       INIT_LIST_HEAD(&cset->cgrp_links);
+       INIT_LIST_HEAD(&cset->mg_preload_node);
+       INIT_LIST_HEAD(&cset->mg_node);
 
        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
@@ -1138,7 +1020,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
        return cset;
 }
 
-static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
        struct cgroup *root_cgrp = kf_root->kn->priv;
 
@@ -1166,7 +1048,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
        idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 }
 
-static void cgroup_free_root(struct cgroup_root *root)
+void cgroup_free_root(struct cgroup_root *root)
 {
        if (root) {
                idr_destroy(&root->cgroup_idr);
@@ -1283,8 +1165,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex and css_set_lock held.
  */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
-                                           struct cgroup_root *root)
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+                                    struct cgroup_root *root)
 {
        /*
         * No need to lock the task - since we hold cgroup_mutex the
@@ -1321,7 +1203,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  */
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
-static const struct file_operations proc_cgroupstats_operations;
 
 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
                              char *buf)
@@ -1415,7 +1296,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
  * inaccessible any time.  If the caller intends to continue to access the
  * cgroup, it should pin it before invoking this function.
  */
-static void cgroup_kn_unlock(struct kernfs_node *kn)
+void cgroup_kn_unlock(struct kernfs_node *kn)
 {
        struct cgroup *cgrp;
 
@@ -1447,8 +1328,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
  * locking under kernfs active protection and allows all kernfs operations
  * including self-removal.
  */
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
-                                         bool drain_offline)
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
 {
        struct cgroup *cgrp;
 
@@ -1532,9 +1412,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 
        if (!css->ss) {
                if (cgroup_on_dfl(cgrp))
-                       cfts = cgroup_dfl_base_files;
+                       cfts = cgroup_base_files;
                else
-                       cfts = cgroup_legacy_base_files;
+                       cfts = cgroup1_base_files;
 
                return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
        }
@@ -1559,7 +1439,7 @@ err:
        return ret;
 }
 
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 {
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
@@ -1629,8 +1509,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
        return 0;
 }
 
-static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
-                           struct kernfs_root *kf_root)
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+                    struct kernfs_root *kf_root)
 {
        int len = 0;
        char *buf = NULL;
@@ -1656,237 +1536,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
        return len;
 }
 
-static int cgroup_show_options(struct seq_file *seq,
-                              struct kernfs_root *kf_root)
-{
-       struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-       struct cgroup_subsys *ss;
-       int ssid;
-
-       if (root != &cgrp_dfl_root)
-               for_each_subsys(ss, ssid)
-                       if (root->subsys_mask & (1 << ssid))
-                               seq_show_option(seq, ss->legacy_name, NULL);
-       if (root->flags & CGRP_ROOT_NOPREFIX)
-               seq_puts(seq, ",noprefix");
-       if (root->flags & CGRP_ROOT_XATTR)
-               seq_puts(seq, ",xattr");
-
-       spin_lock(&release_agent_path_lock);
-       if (strlen(root->release_agent_path))
-               seq_show_option(seq, "release_agent",
-                               root->release_agent_path);
-       spin_unlock(&release_agent_path_lock);
-
-       if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
-               seq_puts(seq, ",clone_children");
-       if (strlen(root->name))
-               seq_show_option(seq, "name", root->name);
-       return 0;
-}
-
-struct cgroup_sb_opts {
-       u16 subsys_mask;
-       unsigned int flags;
-       char *release_agent;
-       bool cpuset_clone_children;
-       char *name;
-       /* User explicitly requested empty subsystem */
-       bool none;
-};
-
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
-       char *token, *o = data;
-       bool all_ss = false, one_ss = false;
-       u16 mask = U16_MAX;
-       struct cgroup_subsys *ss;
-       int nr_opts = 0;
-       int i;
-
-#ifdef CONFIG_CPUSETS
-       mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
-       memset(opts, 0, sizeof(*opts));
-
-       while ((token = strsep(&o, ",")) != NULL) {
-               nr_opts++;
-
-               if (!*token)
-                       return -EINVAL;
-               if (!strcmp(token, "none")) {
-                       /* Explicitly have no subsystems */
-                       opts->none = true;
-                       continue;
-               }
-               if (!strcmp(token, "all")) {
-                       /* Mutually exclusive option 'all' + subsystem name */
-                       if (one_ss)
-                               return -EINVAL;
-                       all_ss = true;
-                       continue;
-               }
-               if (!strcmp(token, "noprefix")) {
-                       opts->flags |= CGRP_ROOT_NOPREFIX;
-                       continue;
-               }
-               if (!strcmp(token, "clone_children")) {
-                       opts->cpuset_clone_children = true;
-                       continue;
-               }
-               if (!strcmp(token, "xattr")) {
-                       opts->flags |= CGRP_ROOT_XATTR;
-                       continue;
-               }
-               if (!strncmp(token, "release_agent=", 14)) {
-                       /* Specifying two release agents is forbidden */
-                       if (opts->release_agent)
-                               return -EINVAL;
-                       opts->release_agent =
-                               kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
-                       if (!opts->release_agent)
-                               return -ENOMEM;
-                       continue;
-               }
-               if (!strncmp(token, "name=", 5)) {
-                       const char *name = token + 5;
-                       /* Can't specify an empty name */
-                       if (!strlen(name))
-                               return -EINVAL;
-                       /* Must match [\w.-]+ */
-                       for (i = 0; i < strlen(name); i++) {
-                               char c = name[i];
-                               if (isalnum(c))
-                                       continue;
-                               if ((c == '.') || (c == '-') || (c == '_'))
-                                       continue;
-                               return -EINVAL;
-                       }
-                       /* Specifying two names is forbidden */
-                       if (opts->name)
-                               return -EINVAL;
-                       opts->name = kstrndup(name,
-                                             MAX_CGROUP_ROOT_NAMELEN - 1,
-                                             GFP_KERNEL);
-                       if (!opts->name)
-                               return -ENOMEM;
-
-                       continue;
-               }
-
-               for_each_subsys(ss, i) {
-                       if (strcmp(token, ss->legacy_name))
-                               continue;
-                       if (!cgroup_ssid_enabled(i))
-                               continue;
-                       if (cgroup_ssid_no_v1(i))
-                               continue;
-
-                       /* Mutually exclusive option 'all' + subsystem name */
-                       if (all_ss)
-                               return -EINVAL;
-                       opts->subsys_mask |= (1 << i);
-                       one_ss = true;
-
-                       break;
-               }
-               if (i == CGROUP_SUBSYS_COUNT)
-                       return -ENOENT;
-       }
-
-       /*
-        * If the 'all' option was specified select all the subsystems,
-        * otherwise if 'none', 'name=' and a subsystem name options were
-        * not specified, let's default to 'all'
-        */
-       if (all_ss || (!one_ss && !opts->none && !opts->name))
-               for_each_subsys(ss, i)
-                       if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
-                               opts->subsys_mask |= (1 << i);
-
-       /*
-        * We either have to specify by name or by subsystems. (So all
-        * empty hierarchies must have a name).
-        */
-       if (!opts->subsys_mask && !opts->name)
-               return -EINVAL;
-
-       /*
-        * Option noprefix was introduced just for backward compatibility
-        * with the old cpuset, so we allow noprefix only if mounting just
-        * the cpuset subsystem.
-        */
-       if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
-               return -EINVAL;
-
-       /* Can't specify "none" and some subsystems */
-       if (opts->subsys_mask && opts->none)
-               return -EINVAL;
-
-       return 0;
-}
-
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
-       int ret = 0;
-       struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-       struct cgroup_sb_opts opts;
-       u16 added_mask, removed_mask;
-
-       if (root == &cgrp_dfl_root) {
-               pr_err("remount is not allowed\n");
-               return -EINVAL;
-       }
-
-       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
-       /* See what subsystems are wanted */
-       ret = parse_cgroupfs_options(data, &opts);
-       if (ret)
-               goto out_unlock;
-
-       if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
-               pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
-                       task_tgid_nr(current), current->comm);
-
-       added_mask = opts.subsys_mask & ~root->subsys_mask;
-       removed_mask = root->subsys_mask & ~opts.subsys_mask;
-
-       /* Don't allow flags or name to change at remount */
-       if ((opts.flags ^ root->flags) ||
-           (opts.name && strcmp(opts.name, root->name))) {
-               pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
-                      opts.flags, opts.name ?: "", root->flags, root->name);
-               ret = -EINVAL;
-               goto out_unlock;
-       }
-
-       /* remounting is not allowed for populated hierarchies */
-       if (!list_empty(&root->cgrp.self.children)) {
-               ret = -EBUSY;
-               goto out_unlock;
-       }
-
-       ret = rebind_subsystems(root, added_mask);
-       if (ret)
-               goto out_unlock;
-
-       WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
-
-       if (opts.release_agent) {
-               spin_lock(&release_agent_path_lock);
-               strcpy(root->release_agent_path, opts.release_agent);
-               spin_unlock(&release_agent_path_lock);
-       }
-
-       trace_cgroup_remount(root);
-
- out_unlock:
-       kfree(opts.release_agent);
-       kfree(opts.name);
-       mutex_unlock(&cgroup_mutex);
-       return ret;
+       pr_err("remount is not allowed\n");
+       return -EINVAL;
 }
 
 /*
@@ -1964,11 +1617,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
        init_waitqueue_head(&cgrp->offline_waitq);
-       INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
+       INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
 
-static void init_cgroup_root(struct cgroup_root *root,
-                            struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 {
        struct cgroup *cgrp = &root->cgrp;
 
@@ -1987,10 +1639,11 @@ static void init_cgroup_root(struct cgroup_root *root,
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 {
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
+       struct kernfs_syscall_ops *kf_sops;
        struct css_set *cset;
        int i, ret;
 
@@ -2022,7 +1675,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
        if (ret)
                goto cancel_ref;
 
-       root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+       kf_sops = root == &cgrp_dfl_root ?
+               &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
+
+       root->kf_root = kernfs_create_root(kf_sops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
@@ -2080,20 +1736,48 @@ out:
        return ret;
 }
 
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+                              struct cgroup_root *root, unsigned long magic,
+                              struct cgroup_namespace *ns)
+{
+       struct dentry *dentry;
+       bool new_sb;
+
+       dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
+
+       /*
+        * In non-init cgroup namespace, instead of root cgroup's dentry,
+        * we return the dentry corresponding to the cgroupns->root_cgrp.
+        */
+       if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+               struct dentry *nsdentry;
+               struct cgroup *cgrp;
+
+               mutex_lock(&cgroup_mutex);
+               spin_lock_irq(&css_set_lock);
+
+               cgrp = cset_cgroup_from_root(ns->root_cset, root);
+
+               spin_unlock_irq(&css_set_lock);
+               mutex_unlock(&cgroup_mutex);
+
+               nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+               dput(dentry);
+               dentry = nsdentry;
+       }
+
+       if (IS_ERR(dentry) || !new_sb)
+               cgroup_put(&root->cgrp);
+
+       return dentry;
+}
+
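The kernfs_node_dentry() substitution above is what makes a cgroup mount performed inside a cgroup namespace appear rooted at the namespace's own cgroup rather than the global hierarchy root. A hedged userspace demonstration; it needs CAP_SYS_ADMIN, a cgroup-namespace-capable kernel, and assumes /mnt is a usable mount point:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	char line[64];
	FILE *f;

	if (unshare(CLONE_NEWCGROUP)) {	/* needs CAP_SYS_ADMIN */
		perror("unshare");
		return 1;
	}
	/* The new mount is rooted at this process's cgroup, courtesy
	 * of the nsdentry substitution in cgroup_do_mount() above. */
	if (mount("none", "/mnt", "cgroup2", 0, NULL)) {
		perror("mount");
		return 1;
	}
	f = fopen("/mnt/cgroup.procs", "r");
	if (f && fgets(line, sizeof(line), f))
		printf("first pid in ns-root cgroup: %s", line);
	if (f)
		fclose(f);
	return 0;
}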
 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data)
 {
-       bool is_v2 = fs_type == &cgroup2_fs_type;
-       struct super_block *pinned_sb = NULL;
        struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
-       struct cgroup_subsys *ss;
-       struct cgroup_root *root;
-       struct cgroup_sb_opts opts;
        struct dentry *dentry;
-       int ret;
-       int i;
-       bool new_sb;
 
        get_cgroup_ns(ns);
 
@@ -2110,225 +1794,65 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
        if (!use_task_css_set_links)
                cgroup_enable_task_cg_lists();
 
-       if (is_v2) {
+       if (fs_type == &cgroup2_fs_type) {
                if (data) {
                        pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
                        put_cgroup_ns(ns);
                        return ERR_PTR(-EINVAL);
                }
                cgrp_dfl_visible = true;
-               root = &cgrp_dfl_root;
-               cgroup_get(&root->cgrp);
-               goto out_mount;
+               cgroup_get(&cgrp_dfl_root.cgrp);
+
+               dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
+                                        CGROUP2_SUPER_MAGIC, ns);
+       } else {
+               dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
+                                      CGROUP_SUPER_MAGIC, ns);
        }
 
-       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+       put_cgroup_ns(ns);
+       return dentry;
+}
 
-       /* First find the desired set of subsystems */
-       ret = parse_cgroupfs_options(data, &opts);
-       if (ret)
-               goto out_unlock;
+static void cgroup_kill_sb(struct super_block *sb)
+{
+       struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+       struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 
        /*
-        * Destruction of cgroup root is asynchronous, so subsystems may
-        * still be dying after the previous unmount.  Let's drain the
-        * dying subsystems.  We just need to ensure that the ones
-        * unmounted previously finish dying and don't care about new ones
-        * starting.  Testing ref liveliness is good enough.
+        * If @root doesn't have any mounts or children, start killing it.
+        * This prevents new mounts by disabling percpu_ref_tryget_live().
+        * cgroup_mount() may wait for @root's release.
+        *
+        * And don't kill the default root.
         */
-       for_each_subsys(ss, i) {
-               if (!(opts.subsys_mask & (1 << i)) ||
-                   ss->root == &cgrp_dfl_root)
-                       continue;
+       if (!list_empty(&root->cgrp.self.children) ||
+           root == &cgrp_dfl_root)
+               cgroup_put(&root->cgrp);
+       else
+               percpu_ref_kill(&root->cgrp.self.refcnt);
 
-               if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
-                       mutex_unlock(&cgroup_mutex);
-                       msleep(10);
-                       ret = restart_syscall();
-                       goto out_free;
-               }
-               cgroup_put(&ss->root->cgrp);
-       }
+       kernfs_kill_sb(sb);
+}
 
-       for_each_root(root) {
-               bool name_match = false;
+struct file_system_type cgroup_fs_type = {
+       .name = "cgroup",
+       .mount = cgroup_mount,
+       .kill_sb = cgroup_kill_sb,
+       .fs_flags = FS_USERNS_MOUNT,
+};
 
-               if (root == &cgrp_dfl_root)
-                       continue;
+static struct file_system_type cgroup2_fs_type = {
+       .name = "cgroup2",
+       .mount = cgroup_mount,
+       .kill_sb = cgroup_kill_sb,
+       .fs_flags = FS_USERNS_MOUNT,
+};
 
-               /*
-                * If we asked for a name then it must match.  Also, if
-                * name matches but sybsys_mask doesn't, we should fail.
-                * Remember whether name matched.
-                */
-               if (opts.name) {
-                       if (strcmp(opts.name, root->name))
-                               continue;
-                       name_match = true;
-               }
-
-               /*
-                * If we asked for subsystems (or explicitly for no
-                * subsystems) then they must match.
-                */
-               if ((opts.subsys_mask || opts.none) &&
-                   (opts.subsys_mask != root->subsys_mask)) {
-                       if (!name_match)
-                               continue;
-                       ret = -EBUSY;
-                       goto out_unlock;
-               }
-
-               if (root->flags ^ opts.flags)
-                       pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-
-               /*
-                * We want to reuse @root whose lifetime is governed by its
-                * ->cgrp.  Let's check whether @root is alive and keep it
-                * that way.  As cgroup_kill_sb() can happen anytime, we
-                * want to block it by pinning the sb so that @root doesn't
-                * get killed before mount is complete.
-                *
-                * With the sb pinned, tryget_live can reliably indicate
-                * whether @root can be reused.  If it's being killed,
-                * drain it.  We can use wait_queue for the wait but this
-                * path is super cold.  Let's just sleep a bit and retry.
-                */
-               pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
-               if (IS_ERR(pinned_sb) ||
-                   !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
-                       mutex_unlock(&cgroup_mutex);
-                       if (!IS_ERR_OR_NULL(pinned_sb))
-                               deactivate_super(pinned_sb);
-                       msleep(10);
-                       ret = restart_syscall();
-                       goto out_free;
-               }
-
-               ret = 0;
-               goto out_unlock;
-       }
-
-       /*
-        * No such thing, create a new one.  name= matching without subsys
-        * specification is allowed for already existing hierarchies but we
-        * can't create new one without subsys specification.
-        */
-       if (!opts.subsys_mask && !opts.none) {
-               ret = -EINVAL;
-               goto out_unlock;
-       }
-
-       /* Hierarchies may only be created in the initial cgroup namespace. */
-       if (ns != &init_cgroup_ns) {
-               ret = -EPERM;
-               goto out_unlock;
-       }
-
-       root = kzalloc(sizeof(*root), GFP_KERNEL);
-       if (!root) {
-               ret = -ENOMEM;
-               goto out_unlock;
-       }
-
-       init_cgroup_root(root, &opts);
-
-       ret = cgroup_setup_root(root, opts.subsys_mask);
-       if (ret)
-               cgroup_free_root(root);
-
-out_unlock:
-       mutex_unlock(&cgroup_mutex);
-out_free:
-       kfree(opts.release_agent);
-       kfree(opts.name);
-
-       if (ret) {
-               put_cgroup_ns(ns);
-               return ERR_PTR(ret);
-       }
-out_mount:
-       dentry = kernfs_mount(fs_type, flags, root->kf_root,
-                             is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
-                             &new_sb);
-
-       /*
-        * In non-init cgroup namespace, instead of root cgroup's
-        * dentry, we return the dentry corresponding to the
-        * cgroupns->root_cgrp.
-        */
-       if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
-               struct dentry *nsdentry;
-               struct cgroup *cgrp;
-
-               mutex_lock(&cgroup_mutex);
-               spin_lock_irq(&css_set_lock);
-
-               cgrp = cset_cgroup_from_root(ns->root_cset, root);
-
-               spin_unlock_irq(&css_set_lock);
-               mutex_unlock(&cgroup_mutex);
-
-               nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
-               dput(dentry);
-               dentry = nsdentry;
-       }
-
-       if (IS_ERR(dentry) || !new_sb)
-               cgroup_put(&root->cgrp);
-
-       /*
-        * If @pinned_sb, we're reusing an existing root and holding an
-        * extra ref on its sb.  Mount is complete.  Put the extra ref.
-        */
-       if (pinned_sb) {
-               WARN_ON(new_sb);
-               deactivate_super(pinned_sb);
-       }
-
-       put_cgroup_ns(ns);
-       return dentry;
-}
-
-static void cgroup_kill_sb(struct super_block *sb)
-{
-       struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
-       struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-
-       /*
-        * If @root doesn't have any mounts or children, start killing it.
-        * This prevents new mounts by disabling percpu_ref_tryget_live().
-        * cgroup_mount() may wait for @root's release.
-        *
-        * And don't kill the default root.
-        */
-       if (!list_empty(&root->cgrp.self.children) ||
-           root == &cgrp_dfl_root)
-               cgroup_put(&root->cgrp);
-       else
-               percpu_ref_kill(&root->cgrp.self.refcnt);
-
-       kernfs_kill_sb(sb);
-}
-
-static struct file_system_type cgroup_fs_type = {
-       .name = "cgroup",
-       .mount = cgroup_mount,
-       .kill_sb = cgroup_kill_sb,
-       .fs_flags = FS_USERNS_MOUNT,
-};
-
-static struct file_system_type cgroup2_fs_type = {
-       .name = "cgroup2",
-       .mount = cgroup_mount,
-       .kill_sb = cgroup_kill_sb,
-       .fs_flags = FS_USERNS_MOUNT,
-};
-
-static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
-                                struct cgroup_namespace *ns)
-{
-       struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+                         struct cgroup_namespace *ns)
+{
+       struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
 
        return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
 }
@@ -2389,49 +1913,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
-/* used to track tasks and other necessary states during migration */
-struct cgroup_taskset {
-       /* the src and dst cset list running through cset->mg_node */
-       struct list_head        src_csets;
-       struct list_head        dst_csets;
-
-       /* the subsys currently being processed */
-       int                     ssid;
-
-       /*
-        * Fields for cgroup_taskset_*() iteration.
-        *
-        * Before migration is committed, the target migration tasks are on
-        * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
-        * the csets on ->dst_csets.  ->csets point to either ->src_csets
-        * or ->dst_csets depending on whether migration is committed.
-        *
-        * ->cur_csets and ->cur_task point to the current task position
-        * during iteration.
-        */
-       struct list_head        *csets;
-       struct css_set          *cur_cset;
-       struct task_struct      *cur_task;
-};
-
-#define CGROUP_TASKSET_INIT(tset)      (struct cgroup_taskset){        \
-       .src_csets              = LIST_HEAD_INIT(tset.src_csets),       \
-       .dst_csets              = LIST_HEAD_INIT(tset.dst_csets),       \
-       .csets                  = &tset.src_csets,                      \
-}
-
 /**
- * cgroup_taskset_add - try to add a migration target task to a taskset
+ * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
- * @tset: target taskset
+ * @mgctx: target migration context
  *
- * Add @task, which is a migration target, to @tset.  This function becomes
- * noop if @task doesn't need to be migrated.  @task's css_set should have
- * been added as a migration source and @task->cg_list will be moved from
- * the css_set's tasks list to mg_tasks one.
+ * Add @task, which is a migration target, to @mgctx->tset.  This function
+ * becomes a noop if @task doesn't need to be migrated.  @task's css_set
+ * should have been added as a migration source and @task->cg_list will be
+ * moved from the css_set's tasks list to its mg_tasks list.
  */
-static void cgroup_taskset_add(struct task_struct *task,
-                              struct cgroup_taskset *tset)
+static void cgroup_migrate_add_task(struct task_struct *task,
+                                   struct cgroup_mgctx *mgctx)
 {
        struct css_set *cset;
 
@@ -2451,10 +1944,11 @@ static void cgroup_taskset_add(struct task_struct *task,
 
        list_move_tail(&task->cg_list, &cset->mg_tasks);
        if (list_empty(&cset->mg_node))
-               list_add_tail(&cset->mg_node, &tset->src_csets);
+               list_add_tail(&cset->mg_node,
+                             &mgctx->tset.src_csets);
        if (list_empty(&cset->mg_dst_cset->mg_node))
-               list_move_tail(&cset->mg_dst_cset->mg_node,
-                              &tset->dst_csets);
+               list_add_tail(&cset->mg_dst_cset->mg_node,
+                             &mgctx->tset.dst_csets);
 }
 
 /**
@@ -2521,17 +2015,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
 /**
  * cgroup_migrate_execute - migrate a taskset
- * @tset: taget taskset
- * @root: cgroup root the migration is taking place on
+ * @mgctx: migration context
  *
- * Migrate tasks in @tset as setup by migration preparation functions.
+ * Migrate tasks in @mgctx as setup by migration preparation functions.
  * This function fails iff one of the ->can_attach callbacks fails and
- * guarantees that either all or none of the tasks in @tset are migrated.
- * @tset is consumed regardless of success.
+ * guarantees that either all or none of the tasks in @mgctx are migrated.
+ * @mgctx is consumed regardless of success.
  */
-static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
-                                 struct cgroup_root *root)
+static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 {
+       struct cgroup_taskset *tset = &mgctx->tset;
        struct cgroup_subsys *ss;
        struct task_struct *task, *tmp_task;
        struct css_set *cset, *tmp_cset;
@@ -2542,7 +2035,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
                return 0;
 
        /* check that we can legitimately attach to the cgroup */
-       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+       do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                if (ss->can_attach) {
                        tset->ssid = ssid;
                        ret = ss->can_attach(tset);
@@ -2578,7 +2071,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
         */
        tset->csets = &tset->dst_csets;
 
-       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+       do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                if (ss->attach) {
                        tset->ssid = ssid;
                        ss->attach(tset);
@@ -2589,7 +2082,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
        goto out_release_tset;
 
 out_cancel_attach:
-       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+       do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                if (ssid == failed_ssid)
                        break;
                if (ss->cancel_attach) {
@@ -2616,7 +2109,7 @@ out_release_tset:
  * zero for migration destination cgroups with tasks so that child cgroups
  * don't compete against tasks.
  */
-static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 {
        return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
                !dst_cgrp->subtree_control;
@@ -2624,25 +2117,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 
 /**
  * cgroup_migrate_finish - cleanup after attach
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
  *
  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
  * those functions for details.
  */
-static void cgroup_migrate_finish(struct list_head *preloaded_csets)
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
 {
+       LIST_HEAD(preloaded);
        struct css_set *cset, *tmp_cset;
 
        lockdep_assert_held(&cgroup_mutex);
 
        spin_lock_irq(&css_set_lock);
-       list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+
+       list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
+       list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+
+       list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
                cset->mg_src_cgrp = NULL;
                cset->mg_dst_cgrp = NULL;
                cset->mg_dst_cset = NULL;
                list_del_init(&cset->mg_preload_node);
                put_css_set_locked(cset);
        }
+
        spin_unlock_irq(&css_set_lock);
 }
 
@@ -2650,10 +2149,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * cgroup_migrate_add_src - add a migration source css_set
  * @src_cset: the source css_set to add
  * @dst_cgrp: the destination cgroup
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
  *
  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
- * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * @src_cset and add it to @mgctx->preloaded_src_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
  * This function may be called without holding cgroup_threadgroup_rwsem
@@ -2662,9 +2161,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * into play and the preloaded css_sets are guaranteed to cover all
  * migrations.
  */
-static void cgroup_migrate_add_src(struct css_set *src_cset,
-                                  struct cgroup *dst_cgrp,
-                                  struct list_head *preloaded_csets)
+void cgroup_migrate_add_src(struct css_set *src_cset,
+                           struct cgroup *dst_cgrp,
+                           struct cgroup_mgctx *mgctx)
 {
        struct cgroup *src_cgrp;
 
@@ -2692,33 +2191,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
        src_cset->mg_src_cgrp = src_cgrp;
        src_cset->mg_dst_cgrp = dst_cgrp;
        get_css_set(src_cset);
-       list_add(&src_cset->mg_preload_node, preloaded_csets);
+       list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
 }
 
 /**
  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @preloaded_csets: list of preloaded source css_sets
+ * @mgctx: migration context
  *
  * Tasks are about to be moved and all the source css_sets have been
- * preloaded to @preloaded_csets.  This function looks up and pins all
- * destination css_sets, links each to its source, and append them to
- * @preloaded_csets.
+ * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
+ * pins all destination css_sets, links each to its source, and appends them
+ * to @mgctx->preloaded_dst_csets.
  *
  * This function must be called after cgroup_migrate_add_src() has been
  * called on each migration source css_set.  After migration is performed
  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
- * @preloaded_csets.
+ * @mgctx.
  */
-static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 {
-       LIST_HEAD(csets);
        struct css_set *src_cset, *tmp_cset;
 
        lockdep_assert_held(&cgroup_mutex);
 
        /* look up the dst cset for each src cset and link it to src */
-       list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+       list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
+                                mg_preload_node) {
                struct css_set *dst_cset;
+               struct cgroup_subsys *ss;
+               int ssid;
 
                dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
                if (!dst_cset)
@@ -2743,15 +2244,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
                src_cset->mg_dst_cset = dst_cset;
 
                if (list_empty(&dst_cset->mg_preload_node))
-                       list_add(&dst_cset->mg_preload_node, &csets);
+                       list_add_tail(&dst_cset->mg_preload_node,
+                                     &mgctx->preloaded_dst_csets);
                else
                        put_css_set(dst_cset);
+
+               for_each_subsys(ss, ssid)
+                       if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
+                               mgctx->ss_mask |= 1 << ssid;
        }
 
-       list_splice_tail(&csets, preloaded_csets);
        return 0;
 err:
-       cgroup_migrate_finish(&csets);
+       cgroup_migrate_finish(mgctx);
        return -ENOMEM;
 }
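The loop added at the end of cgroup_migrate_prepare_dst() records, for each src/dst css_set pair, which subsystems actually change; @mgctx->ss_mask then limits the ->can_attach()/->attach() calls in cgroup_migrate_execute() to exactly those subsystems. Reduced to its arithmetic, with the subsystem count and pointer values invented:

#include <stdio.h>

#define NSUBSYS 4

int main(void)
{
	/* Stand-ins for the csets' subsys[] pointers; values made up. */
	const void *src[NSUBSYS] = { (void *)0x1, (void *)0x2, (void *)0x3, (void *)0x4 };
	const void *dst[NSUBSYS] = { (void *)0x1, (void *)0x9, (void *)0x3, (void *)0x8 };
	unsigned int ss_mask = 0;

	for (int ssid = 0; ssid < NSUBSYS; ssid++)
		if (src[ssid] != dst[ssid])
			ss_mask |= 1u << ssid;

	printf("ss_mask=%#x\n", ss_mask);	/* 0xa: only subsystems 1 and 3 migrate */
	return 0;
}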
 
@@ -2759,7 +2264,7 @@ err:
  * cgroup_migrate - migrate a process or task to a cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
- * @root: cgroup root migration is taking place on
+ * @mgctx: migration context
  *
  * Migrate a process or task denoted by @leader.  If migrating a process,
  * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
@@ -2773,10 +2278,9 @@ err:
  * decided for all targets by invoking cgroup_migrate_prepare_dst() before
  * actually starting the migration.
  */
-static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
-                         struct cgroup_root *root)
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+                  struct cgroup_mgctx *mgctx)
 {
-       struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
        struct task_struct *task;
 
        /*
@@ -2788,14 +2292,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
        rcu_read_lock();
        task = leader;
        do {
-               cgroup_taskset_add(task, &tset);
+               cgroup_migrate_add_task(task, mgctx);
                if (!threadgroup)
                        break;
        } while_each_thread(leader, task);
        rcu_read_unlock();
        spin_unlock_irq(&css_set_lock);
 
-       return cgroup_taskset_migrate(&tset, root);
+       return cgroup_migrate_execute(mgctx);
 }
 
 /**
@@ -2806,10 +2310,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
  *
  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
-static int cgroup_attach_task(struct cgroup *dst_cgrp,
-                             struct task_struct *leader, bool threadgroup)
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+                      bool threadgroup)
 {
-       LIST_HEAD(preloaded_csets);
+       DEFINE_CGROUP_MGCTX(mgctx);
        struct task_struct *task;
        int ret;
 
@@ -2821,8 +2325,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
        rcu_read_lock();
        task = leader;
        do {
-               cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
-                                      &preloaded_csets);
+               cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
                if (!threadgroup)
                        break;
        } while_each_thread(leader, task);
@@ -2830,11 +2333,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
        spin_unlock_irq(&css_set_lock);
 
        /* prepare dst csets and commit */
-       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+       ret = cgroup_migrate_prepare_dst(&mgctx);
        if (!ret)
-               ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
+               ret = cgroup_migrate(leader, threadgroup, &mgctx);
 
-       cgroup_migrate_finish(&preloaded_csets);
+       cgroup_migrate_finish(&mgctx);
 
        if (!ret)
                trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
@@ -2846,20 +2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
                                         struct cgroup *dst_cgrp,
                                         struct kernfs_open_file *of)
 {
-       const struct cred *cred = current_cred();
-       const struct cred *tcred = get_task_cred(task);
        int ret = 0;
 
-       /*
-        * even if we're attaching all tasks in the thread group, we only
-        * need to check permissions on one of them.
-        */
-       if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-           !uid_eq(cred->euid, tcred->uid) &&
-           !uid_eq(cred->euid, tcred->suid))
-               ret = -EACCES;
-
-       if (!ret && cgroup_on_dfl(dst_cgrp)) {
+       if (cgroup_on_dfl(dst_cgrp)) {
                struct super_block *sb = of->file->f_path.dentry->d_sb;
                struct cgroup *cgrp;
                struct inode *inode;
@@ -2877,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task,
                        ret = inode_permission(inode, MAY_WRITE);
                        iput(inode);
                }
+       } else {
+               const struct cred *cred = current_cred();
+               const struct cred *tcred = get_task_cred(task);
+
+               /*
+                * even if we're attaching all tasks in the thread group,
+                * we only need to check permissions on one of them.
+                */
+               if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+                   !uid_eq(cred->euid, tcred->uid) &&
+                   !uid_eq(cred->euid, tcred->suid))
+                       ret = -EACCES;
+               put_cred(tcred);
        }
 
-       put_cred(tcred);
        return ret;
 }
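
The reordering above also documents the two delegation models: on the default
hierarchy, being able to write the destination's cgroup.procs file (checked
with inode_permission() above) is what authorizes a move; on legacy
hierarchies the writer's euid must be root or match the target task's
uid/suid. A sketch of the legacy rule as a predicate (the helper name is
illustrative, not part of this patch):

    static bool v1_may_attach(const struct cred *cred, const struct cred *tcred)
    {
            return uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
                   uid_eq(cred->euid, tcred->uid) ||
                   uid_eq(cred->euid, tcred->suid);
    }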
 
@@ -2888,8 +2392,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
  * function to attach either it or all tasks in its threadgroup. Will lock
  * cgroup_mutex and threadgroup.
  */
-static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
-                                   size_t nbytes, loff_t off, bool threadgroup)
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+                            size_t nbytes, loff_t off, bool threadgroup)
 {
        struct task_struct *tsk;
        struct cgroup_subsys *ss;
@@ -2950,86 +2454,12 @@ out_unlock_threadgroup:
        return ret ?: nbytes;
 }
 
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
-       struct cgroup_root *root;
-       int retval = 0;
-
-       mutex_lock(&cgroup_mutex);
-       percpu_down_write(&cgroup_threadgroup_rwsem);
-       for_each_root(root) {
-               struct cgroup *from_cgrp;
-
-               if (root == &cgrp_dfl_root)
-                       continue;
-
-               spin_lock_irq(&css_set_lock);
-               from_cgrp = task_cgroup_from_root(from, root);
-               spin_unlock_irq(&css_set_lock);
-
-               retval = cgroup_attach_task(from_cgrp, tsk, false);
-               if (retval)
-                       break;
-       }
-       percpu_up_write(&cgroup_threadgroup_rwsem);
-       mutex_unlock(&cgroup_mutex);
-
-       return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
-                                 char *buf, size_t nbytes, loff_t off)
-{
-       return __cgroup_procs_write(of, buf, nbytes, off, false);
-}
-
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
-                                 char *buf, size_t nbytes, loff_t off)
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+                          loff_t off)
 {
        return __cgroup_procs_write(of, buf, nbytes, off, true);
 }
 
-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
-                                         char *buf, size_t nbytes, loff_t off)
-{
-       struct cgroup *cgrp;
-
-       BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
-       cgrp = cgroup_kn_lock_live(of->kn, false);
-       if (!cgrp)
-               return -ENODEV;
-       spin_lock(&release_agent_path_lock);
-       strlcpy(cgrp->root->release_agent_path, strstrip(buf),
-               sizeof(cgrp->root->release_agent_path));
-       spin_unlock(&release_agent_path_lock);
-       cgroup_kn_unlock(of->kn);
-       return nbytes;
-}
-
-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
-{
-       struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-       spin_lock(&release_agent_path_lock);
-       seq_puts(seq, cgrp->root->release_agent_path);
-       spin_unlock(&release_agent_path_lock);
-       seq_putc(seq, '\n');
-       return 0;
-}
-
-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
-{
-       seq_puts(seq, "0\n");
-       return 0;
-}
-
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
 {
        struct cgroup_subsys *ss;
@@ -3075,8 +2505,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
  */
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
-       LIST_HEAD(preloaded_csets);
-       struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+       DEFINE_CGROUP_MGCTX(mgctx);
        struct cgroup_subsys_state *d_css;
        struct cgroup *dsct;
        struct css_set *src_cset;
@@ -3092,33 +2521,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
                struct cgrp_cset_link *link;
 
                list_for_each_entry(link, &dsct->cset_links, cset_link)
-                       cgroup_migrate_add_src(link->cset, dsct,
-                                              &preloaded_csets);
+                       cgroup_migrate_add_src(link->cset, dsct, &mgctx);
        }
        spin_unlock_irq(&css_set_lock);
 
        /* NULL dst indicates self on default hierarchy */
-       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+       ret = cgroup_migrate_prepare_dst(&mgctx);
        if (ret)
                goto out_finish;
 
        spin_lock_irq(&css_set_lock);
-       list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+       list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
                struct task_struct *task, *ntask;
 
-               /* src_csets precede dst_csets, break on the first dst_cset */
-               if (!src_cset->mg_src_cgrp)
-                       break;
-
                /* all tasks in src_csets need to be migrated */
                list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
-                       cgroup_taskset_add(task, &tset);
+                       cgroup_migrate_add_task(task, &mgctx);
        }
        spin_unlock_irq(&css_set_lock);
 
-       ret = cgroup_taskset_migrate(&tset, cgrp->root);
+       ret = cgroup_migrate_execute(&mgctx);
 out_finish:
-       cgroup_migrate_finish(&preloaded_csets);
+       cgroup_migrate_finish(&mgctx);
        percpu_up_write(&cgroup_threadgroup_rwsem);
        return ret;
 }
@@ -3131,7 +2555,7 @@ out_finish:
  * controller while the previous css is still around.  This function grabs
  * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
  */
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
        __acquires(&cgroup_mutex)
 {
        struct cgroup *dsct;
@@ -3503,6 +2927,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
        return 0;
 }
 
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+       struct cftype *cft = of->kn->priv;
+
+       if (cft->open)
+               return cft->open(of);
+       return 0;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+       struct cftype *cft = of->kn->priv;
+
+       if (cft->release)
+               cft->release(of);
+}
+
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
 {
@@ -3553,7 +2994,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 
 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
-       seq_cft(seq)->seq_stop(seq, v);
+       if (seq_cft(seq)->seq_stop)
+               seq_cft(seq)->seq_stop(seq, v);
 }
 
 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3017,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 
 static struct kernfs_ops cgroup_kf_single_ops = {
        .atomic_write_len       = PAGE_SIZE,
+       .open                   = cgroup_file_open,
+       .release                = cgroup_file_release,
        .write                  = cgroup_file_write,
        .seq_show               = cgroup_seqfile_show,
 };
 
 static struct kernfs_ops cgroup_kf_ops = {
        .atomic_write_len       = PAGE_SIZE,
+       .open                   = cgroup_file_open,
+       .release                = cgroup_file_release,
        .write                  = cgroup_file_write,
        .seq_start              = cgroup_seqfile_start,
        .seq_next               = cgroup_seqfile_next,
@@ -3588,48 +3034,6 @@ static struct kernfs_ops cgroup_kf_ops = {
        .seq_show               = cgroup_seqfile_show,
 };
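
The new .open/.release kernfs hooks simply forward to optional cftype
callbacks, letting a file keep per-open state in of->priv; the new
cgroup.procs iterator later in this patch stores its css_task_iter there and
frees it from .release. A hypothetical cftype pairing might look like this
(all names are illustrative):

    struct example_state { int cursor; };       /* hypothetical per-open state */

    static int example_open(struct kernfs_open_file *of)
    {
            of->priv = kzalloc(sizeof(struct example_state), GFP_KERNEL);
            return of->priv ? 0 : -ENOMEM;
    }

    static void example_release(struct kernfs_open_file *of)
    {
            kfree(of->priv);
    }

wired up via .open = example_open and .release = example_release in the
cftype entry.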
 
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
-                        const char *new_name_str)
-{
-       struct cgroup *cgrp = kn->priv;
-       int ret;
-
-       if (kernfs_type(kn) != KERNFS_DIR)
-               return -ENOTDIR;
-       if (kn->parent != new_parent)
-               return -EIO;
-
-       /*
-        * This isn't a proper migration and its usefulness is very
-        * limited.  Disallow on the default hierarchy.
-        */
-       if (cgroup_on_dfl(cgrp))
-               return -EPERM;
-
-       /*
-        * We're gonna grab cgroup_mutex which nests outside kernfs
-        * active_ref.  kernfs_rename() doesn't require active_ref
-        * protection.  Break them before grabbing cgroup_mutex.
-        */
-       kernfs_break_active_protection(new_parent);
-       kernfs_break_active_protection(kn);
-
-       mutex_lock(&cgroup_mutex);
-
-       ret = kernfs_rename(kn, new_parent, new_name_str);
-       if (!ret)
-               trace_cgroup_rename(cgrp);
-
-       mutex_unlock(&cgroup_mutex);
-
-       kernfs_unbreak_active_protection(kn);
-       kernfs_unbreak_active_protection(new_parent);
-       return ret;
-}
-
 /* set uid and gid of cgroup dirs and files to that of the creator */
 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 {
@@ -3926,26 +3330,6 @@ void cgroup_file_notify(struct cgroup_file *cfile)
 }
 
 /**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup.  The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
- */
-static int cgroup_task_count(const struct cgroup *cgrp)
-{
-       int count = 0;
-       struct cgrp_cset_link *link;
-
-       spin_lock_irq(&css_set_lock);
-       list_for_each_entry(link, &cgrp->cset_links, cset_link)
-               count += atomic_read(&link->cset->refcount);
-       spin_unlock_irq(&css_set_lock);
-       return count;
-}
-
-/**
  * css_next_child - find the next child of a given css
  * @pos: the current position (%NULL to initiate traversal)
  * @parent: css whose children to walk
@@ -4343,560 +3727,69 @@ void css_task_iter_end(struct css_task_iter *it)
                put_task_struct(it->cur_task);
 }
 
-/**
- * cgroup_transfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup.  No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+static void cgroup_procs_release(struct kernfs_open_file *of)
 {
-       LIST_HEAD(preloaded_csets);
-       struct cgrp_cset_link *link;
-       struct css_task_iter it;
-       struct task_struct *task;
-       int ret;
-
-       if (!cgroup_may_migrate_to(to))
-               return -EBUSY;
+       if (of->priv) {
+               css_task_iter_end(of->priv);
+               kfree(of->priv);
+       }
+}
 
-       mutex_lock(&cgroup_mutex);
+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct kernfs_open_file *of = s->private;
+       struct css_task_iter *it = of->priv;
+       struct task_struct *task;
 
-       percpu_down_write(&cgroup_threadgroup_rwsem);
+       do {
+               task = css_task_iter_next(it);
+       } while (task && !thread_group_leader(task));
 
-       /* all tasks in @from are being moved, all csets are source */
-       spin_lock_irq(&css_set_lock);
-       list_for_each_entry(link, &from->cset_links, cset_link)
-               cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-       spin_unlock_irq(&css_set_lock);
+       return task;
+}
 
-       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
-       if (ret)
-               goto out_err;
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+       struct kernfs_open_file *of = s->private;
+       struct cgroup *cgrp = seq_css(s)->cgroup;
+       struct css_task_iter *it = of->priv;
 
        /*
-        * Migrate tasks one-by-one until @from is empty.  This fails iff
-        * ->can_attach() fails.
+        * A seek on the seq_file always restarts the traversal from
+        * position 0, so we can simply keep iterating on !0 *pos.
         */
-       do {
-               css_task_iter_start(&from->self, &it);
-               task = css_task_iter_next(&it);
-               if (task)
-                       get_task_struct(task);
-               css_task_iter_end(&it);
-
-               if (task) {
-                       ret = cgroup_migrate(task, false, to->root);
-                       if (!ret)
-                               trace_cgroup_transfer_tasks(to, task, false);
-                       put_task_struct(task);
-               }
-       } while (task && !ret);
-out_err:
-       cgroup_migrate_finish(&preloaded_csets);
-       percpu_up_write(&cgroup_threadgroup_rwsem);
-       mutex_unlock(&cgroup_mutex);
-       return ret;
-}
+       if (!it) {
+               if (WARN_ON_ONCE((*pos)++))
+                       return ERR_PTR(-EINVAL);
 
-/*
- * Stuff for reading the 'tasks'/'procs' files.
- *
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- */
+               it = kzalloc(sizeof(*it), GFP_KERNEL);
+               if (!it)
+                       return ERR_PTR(-ENOMEM);
+               of->priv = it;
+               css_task_iter_start(&cgrp->self, it);
+       } else if (!(*pos)++) {
+               css_task_iter_end(it);
+               css_task_iter_start(&cgrp->self, it);
+       }
 
-/* which pidlist file are we talking about? */
-enum cgroup_filetype {
-       CGROUP_FILE_PROCS,
-       CGROUP_FILE_TASKS,
-};
+       return cgroup_procs_next(s, NULL, NULL);
+}
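
The replacement iterator walks live tasks directly through css_task_iter
instead of snapshotting pids, so no allocation proportional to the task count
is needed. A self-contained sketch of the same iteration pattern (a minimal
example with locking elided; it counts the thread-group leaders in a cgroup):

    static int count_procs(struct cgroup *cgrp)
    {
            struct css_task_iter it;
            struct task_struct *task;
            int n = 0;

            css_task_iter_start(&cgrp->self, &it);
            while ((task = css_task_iter_next(&it)))
                    if (thread_group_leader(task))
                            n++;
            css_task_iter_end(&it);
            return n;
    }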
 
-/*
- * A pidlist is a list of pids that virtually represents the contents of one
- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
- * a pair (one each for procs, tasks) for each pid namespace that's relevant
- * to the cgroup.
- */
-struct cgroup_pidlist {
-       /*
-        * used to find which pidlist is wanted. doesn't change as long as
-        * this particular list stays in the list.
-        */
-       struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
-       /* array of xids */
-       pid_t *list;
-       /* how many elements the above list has */
-       int length;
-       /* each of these stored in a list by its cgroup */
-       struct list_head links;
-       /* pointer to the cgroup we belong to, for list removal purposes */
-       struct cgroup *owner;
-       /* for delayed destruction */
-       struct delayed_work destroy_dwork;
-};
-
-/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
-       if (PIDLIST_TOO_LARGE(count))
-               return vmalloc(count * sizeof(pid_t));
-       else
-               return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
-       kvfree(p);
-}
-
-/*
- * Used to destroy all pidlists lingering waiting for destroy timer.  None
- * should be left afterwards.
- */
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
-{
-       struct cgroup_pidlist *l, *tmp_l;
-
-       mutex_lock(&cgrp->pidlist_mutex);
-       list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
-               mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
-       mutex_unlock(&cgrp->pidlist_mutex);
-
-       flush_workqueue(cgroup_pidlist_destroy_wq);
-       BUG_ON(!list_empty(&cgrp->pidlists));
-}
-
-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
-{
-       struct delayed_work *dwork = to_delayed_work(work);
-       struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
-                                               destroy_dwork);
-       struct cgroup_pidlist *tofree = NULL;
-
-       mutex_lock(&l->owner->pidlist_mutex);
-
-       /*
-        * Destroy iff we didn't get queued again.  The state won't change
-        * as destroy_dwork can only be queued while locked.
-        */
-       if (!delayed_work_pending(dwork)) {
-               list_del(&l->links);
-               pidlist_free(l->list);
-               put_pid_ns(l->key.ns);
-               tofree = l;
-       }
-
-       mutex_unlock(&l->owner->pidlist_mutex);
-       kfree(tofree);
-}
-
-/*
- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * Returns the number of unique elements.
- */
-static int pidlist_uniq(pid_t *list, int length)
-{
-       int src, dest = 1;
-
-       /*
-        * we presume the 0th element is unique, so src starts at 1. trivial
-        * edge cases first; no work needs to be done for either
-        */
-       if (length == 0 || length == 1)
-               return length;
-       /* src and dest walk down the list; dest counts unique elements */
-       for (src = 1; src < length; src++) {
-               /* find next unique element */
-               while (list[src] == list[src-1]) {
-                       src++;
-                       if (src == length)
-                               goto after;
-               }
-               /* dest always points to where the next unique element goes */
-               list[dest] = list[src];
-               dest++;
-       }
-after:
-       return dest;
-}
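
pidlist_uniq() assumes sorted input and compacts duplicates in place; e.g.
(a hypothetical run):

    pid_t list[] = { 3, 3, 5, 7, 7, 7 };    /* sorted input */
    int n = pidlist_uniq(list, 6);          /* n == 3; list now begins {3, 5, 7} */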
-
-/*
- * The two pid files - tasks and cgroup.procs - guaranteed that the result
- * is sorted, which forced this whole pidlist fiasco.  As pid order is
- * different per namespace, each namespace needs a differently sorted list,
- * making it impossible to use, for example, a single rbtree of member tasks
- * sorted by task pointer.  As pidlists can be fairly large, allocating one
- * per open file is dangerous, so cgroup had to implement a shared pool of
- * pidlists keyed by cgroup and namespace.
- *
- * All this extra complexity was caused by the original implementation
- * committing to an entirely unnecessary property.  In the long term, we
- * want to do away with it.  Explicitly scramble sort order if on the
- * default hierarchy so that no such expectation exists in the new
- * interface.
- *
- * Scrambling is done by swapping every two consecutive bits, which is a
- * non-identity one-to-one mapping that disturbs sort order sufficiently.
- */
-static pid_t pid_fry(pid_t pid)
-{
-       unsigned a = pid & 0x55555555;
-       unsigned b = pid & 0xAAAAAAAA;
-
-       return (a << 1) | (b >> 1);
-}
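
Because the scramble just swaps adjacent bit pairs, it is an involution:
applying it twice restores the original pid. Worked examples:

    pid_fry(5);    /* 0b0101 -> 0b1010 == 10 */
    pid_fry(6);    /* 0b0110 -> 0b1001 ==  9 */
    pid_fry(10);   /* 0b1010 -> 0b0101 ==  5, so pid_fry(pid_fry(p)) == p */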
-
-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
-{
-       if (cgroup_on_dfl(cgrp))
-               return pid_fry(pid);
-       else
-               return pid;
-}
-
-static int cmppid(const void *a, const void *b)
-{
-       return *(pid_t *)a - *(pid_t *)b;
-}
-
-static int fried_cmppid(const void *a, const void *b)
-{
-       return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
-}
-
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
-                                                 enum cgroup_filetype type)
-{
-       struct cgroup_pidlist *l;
-       /* don't need task_nsproxy() if we're looking at ourselves */
-       struct pid_namespace *ns = task_active_pid_ns(current);
-
-       lockdep_assert_held(&cgrp->pidlist_mutex);
-
-       list_for_each_entry(l, &cgrp->pidlists, links)
-               if (l->key.type == type && l->key.ns == ns)
-                       return l;
-       return NULL;
-}
-
-/*
- * find the appropriate pidlist for our purpose (given procs vs tasks)
- * returns with the lock on that pidlist already held, and takes care
- * of the use count, or returns NULL with no locks held if we're out of
- * memory.
- */
-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
-                                               enum cgroup_filetype type)
-{
-       struct cgroup_pidlist *l;
-
-       lockdep_assert_held(&cgrp->pidlist_mutex);
-
-       l = cgroup_pidlist_find(cgrp, type);
-       if (l)
-               return l;
-
-       /* entry not found; create a new one */
-       l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
-       if (!l)
-               return l;
-
-       INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
-       l->key.type = type;
-       /* don't need task_nsproxy() if we're looking at ourselves */
-       l->key.ns = get_pid_ns(task_active_pid_ns(current));
-       l->owner = cgrp;
-       list_add(&l->links, &cgrp->pidlists);
-       return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
-                             struct cgroup_pidlist **lp)
-{
-       pid_t *array;
-       int length;
-       int pid, n = 0; /* used for populating the array */
-       struct css_task_iter it;
-       struct task_struct *tsk;
-       struct cgroup_pidlist *l;
-
-       lockdep_assert_held(&cgrp->pidlist_mutex);
-
-       /*
-        * If cgroup gets more users after we read count, we won't have
-        * enough space - tough.  This race is indistinguishable to the
-        * caller from the case that the additional cgroup users didn't
-        * show up until sometime later on.
-        */
-       length = cgroup_task_count(cgrp);
-       array = pidlist_allocate(length);
-       if (!array)
-               return -ENOMEM;
-       /* now, populate the array */
-       css_task_iter_start(&cgrp->self, &it);
-       while ((tsk = css_task_iter_next(&it))) {
-               if (unlikely(n == length))
-                       break;
-               /* get tgid or pid for procs or tasks file respectively */
-               if (type == CGROUP_FILE_PROCS)
-                       pid = task_tgid_vnr(tsk);
-               else
-                       pid = task_pid_vnr(tsk);
-               if (pid > 0) /* make sure to only use valid results */
-                       array[n++] = pid;
-       }
-       css_task_iter_end(&it);
-       length = n;
-       /* now sort & (if procs) strip out duplicates */
-       if (cgroup_on_dfl(cgrp))
-               sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
-       else
-               sort(array, length, sizeof(pid_t), cmppid, NULL);
-       if (type == CGROUP_FILE_PROCS)
-               length = pidlist_uniq(array, length);
-
-       l = cgroup_pidlist_find_create(cgrp, type);
-       if (!l) {
-               pidlist_free(array);
-               return -ENOMEM;
-       }
-
-       /* store array, freeing old if necessary */
-       pidlist_free(l->list);
-       l->list = array;
-       l->length = length;
-       *lp = l;
-       return 0;
-}
-
-/**
- * cgroupstats_build - build and fill cgroupstats
- * @stats: cgroupstats to fill information into
- * @dentry: A dentry entry belonging to the cgroup for which stats have
- * been requested.
- *
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- */
-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
-{
-       struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
-       struct cgroup *cgrp;
-       struct css_task_iter it;
-       struct task_struct *tsk;
-
-       /* it should be kernfs_node belonging to cgroupfs and is a directory */
-       if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
-           kernfs_type(kn) != KERNFS_DIR)
-               return -EINVAL;
-
-       mutex_lock(&cgroup_mutex);
-
-       /*
-        * We aren't being called from kernfs and there's no guarantee on
-        * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
-        * @kn->priv is RCU safe.  Let's do the RCU dancing.
-        */
-       rcu_read_lock();
-       cgrp = rcu_dereference(kn->priv);
-       if (!cgrp || cgroup_is_dead(cgrp)) {
-               rcu_read_unlock();
-               mutex_unlock(&cgroup_mutex);
-               return -ENOENT;
-       }
-       rcu_read_unlock();
-
-       css_task_iter_start(&cgrp->self, &it);
-       while ((tsk = css_task_iter_next(&it))) {
-               switch (tsk->state) {
-               case TASK_RUNNING:
-                       stats->nr_running++;
-                       break;
-               case TASK_INTERRUPTIBLE:
-                       stats->nr_sleeping++;
-                       break;
-               case TASK_UNINTERRUPTIBLE:
-                       stats->nr_uninterruptible++;
-                       break;
-               case TASK_STOPPED:
-                       stats->nr_stopped++;
-                       break;
-               default:
-                       if (delayacct_is_task_waiting_on_io(tsk))
-                               stats->nr_io_wait++;
-                       break;
-               }
-       }
-       css_task_iter_end(&it);
-
-       mutex_unlock(&cgroup_mutex);
-       return 0;
-}
-
-
-/*
- * seq_file methods for the tasks/procs files. The seq_file position is the
- * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
- */
-
-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+static int cgroup_procs_show(struct seq_file *s, void *v)
 {
-       /*
-        * Initially we receive a position value that corresponds to
-        * one more than the last pid shown (or 0 on the first call or
-        * after a seek to the start). Use a binary-search to find the
-        * next pid to display, if any
-        */
-       struct kernfs_open_file *of = s->private;
-       struct cgroup *cgrp = seq_css(s)->cgroup;
-       struct cgroup_pidlist *l;
-       enum cgroup_filetype type = seq_cft(s)->private;
-       int index = 0, pid = *pos;
-       int *iter, ret;
-
-       mutex_lock(&cgrp->pidlist_mutex);
-
-       /*
-        * !NULL @of->priv indicates that this isn't the first start()
-        * after open.  If the matching pidlist is around, we can use that.
-        * Look for it.  Note that @of->priv can't be used directly.  It
-        * could already have been destroyed.
-        */
-       if (of->priv)
-               of->priv = cgroup_pidlist_find(cgrp, type);
-
-       /*
-        * Either this is the first start() after open or the matching
-        * pidlist has been destroyed in between.  Create a new one.
-        */
-       if (!of->priv) {
-               ret = pidlist_array_load(cgrp, type,
-                                        (struct cgroup_pidlist **)&of->priv);
-               if (ret)
-                       return ERR_PTR(ret);
-       }
-       l = of->priv;
-
-       if (pid) {
-               int end = l->length;
-
-               while (index < end) {
-                       int mid = (index + end) / 2;
-                       if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
-                               index = mid;
-                               break;
-                       } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
-                               index = mid + 1;
-                       else
-                               end = mid;
-               }
-       }
-       /* If we're off the end of the array, we're done */
-       if (index >= l->length)
-               return NULL;
-       /* Update the abstract position to be the actual pid that we found */
-       iter = l->list + index;
-       *pos = cgroup_pid_fry(cgrp, *iter);
-       return iter;
-}
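
A worked pass of the resume search on a legacy hierarchy, where
cgroup_pid_fry() is the identity (values are illustrative):

    /* l->list = {3, 5, 7}, *pos == 7 (set by cgroup_pidlist_next())
     *   index=0 end=3 -> mid=1: 5 <= 7, so index=2
     *   index=2 end=3 -> mid=2: 7 == 7, so break
     * start() returns &l->list[2] and pid 7 is shown next.
     */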
-
-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
-{
-       struct kernfs_open_file *of = s->private;
-       struct cgroup_pidlist *l = of->priv;
-
-       if (l)
-               mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
-                                CGROUP_PIDLIST_DESTROY_DELAY);
-       mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
-}
-
-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       struct kernfs_open_file *of = s->private;
-       struct cgroup_pidlist *l = of->priv;
-       pid_t *p = v;
-       pid_t *end = l->list + l->length;
-       /*
-        * Advance to the next pid in the array. If this goes off the
-        * end, we're done
-        */
-       p++;
-       if (p >= end) {
-               return NULL;
-       } else {
-               *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
-               return p;
-       }
-}
-
-static int cgroup_pidlist_show(struct seq_file *s, void *v)
-{
-       seq_printf(s, "%d\n", *(int *)v);
-
-       return 0;
-}
-
-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
-                                        struct cftype *cft)
-{
-       return notify_on_release(css->cgroup);
-}
-
-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
-                                         struct cftype *cft, u64 val)
-{
-       if (val)
-               set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
-       else
-               clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
-       return 0;
-}
-
-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
-                                     struct cftype *cft)
-{
-       return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-}
-
-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
-                                      struct cftype *cft, u64 val)
-{
-       if (val)
-               set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-       else
-               clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+       seq_printf(s, "%d\n", task_tgid_vnr(v));
        return 0;
 }
 
 /* cgroup core interface files for the default hierarchy */
-static struct cftype cgroup_dfl_base_files[] = {
+static struct cftype cgroup_base_files[] = {
        {
                .name = "cgroup.procs",
                .file_offset = offsetof(struct cgroup, procs_file),
-               .seq_start = cgroup_pidlist_start,
-               .seq_next = cgroup_pidlist_next,
-               .seq_stop = cgroup_pidlist_stop,
-               .seq_show = cgroup_pidlist_show,
-               .private = CGROUP_FILE_PROCS,
+               .release = cgroup_procs_release,
+               .seq_start = cgroup_procs_start,
+               .seq_next = cgroup_procs_next,
+               .seq_show = cgroup_procs_show,
                .write = cgroup_procs_write,
        },
        {
@@ -4917,51 +3810,6 @@ static struct cftype cgroup_dfl_base_files[] = {
        { }     /* terminate */
 };
 
-/* cgroup core interface files for the legacy hierarchies */
-static struct cftype cgroup_legacy_base_files[] = {
-       {
-               .name = "cgroup.procs",
-               .seq_start = cgroup_pidlist_start,
-               .seq_next = cgroup_pidlist_next,
-               .seq_stop = cgroup_pidlist_stop,
-               .seq_show = cgroup_pidlist_show,
-               .private = CGROUP_FILE_PROCS,
-               .write = cgroup_procs_write,
-       },
-       {
-               .name = "cgroup.clone_children",
-               .read_u64 = cgroup_clone_children_read,
-               .write_u64 = cgroup_clone_children_write,
-       },
-       {
-               .name = "cgroup.sane_behavior",
-               .flags = CFTYPE_ONLY_ON_ROOT,
-               .seq_show = cgroup_sane_behavior_show,
-       },
-       {
-               .name = "tasks",
-               .seq_start = cgroup_pidlist_start,
-               .seq_next = cgroup_pidlist_next,
-               .seq_stop = cgroup_pidlist_stop,
-               .seq_show = cgroup_pidlist_show,
-               .private = CGROUP_FILE_TASKS,
-               .write = cgroup_tasks_write,
-       },
-       {
-               .name = "notify_on_release",
-               .read_u64 = cgroup_read_notify_on_release,
-               .write_u64 = cgroup_write_notify_on_release,
-       },
-       {
-               .name = "release_agent",
-               .flags = CFTYPE_ONLY_ON_ROOT,
-               .seq_show = cgroup_release_agent_show,
-               .write = cgroup_release_agent_write,
-               .max_write_len = PATH_MAX - 1,
-       },
-       { }     /* terminate */
-};
-
 /*
  * css destruction is four-stage process.
  *
@@ -5007,7 +3855,7 @@ static void css_free_work_fn(struct work_struct *work)
        } else {
                /* cgroup free path */
                atomic_dec(&cgrp->root->nr_cgrps);
-               cgroup_pidlist_destroy_all(cgrp);
+               cgroup1_pidlist_destroy_all(cgrp);
                cancel_work_sync(&cgrp->release_agent_work);
 
                if (cgroup_parent(cgrp)) {
@@ -5302,8 +4150,7 @@ out_free_cgrp:
        return ERR_PTR(ret);
 }
 
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-                       umode_t mode)
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 {
        struct cgroup *parent, *cgrp;
        struct kernfs_node *kn;
@@ -5507,7 +4354,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         */
        kernfs_remove(cgrp->kn);
 
-       check_for_release(cgroup_parent(cgrp));
+       cgroup1_check_for_release(cgroup_parent(cgrp));
 
        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);
@@ -5515,7 +4362,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        return 0;
 };
 
-static int cgroup_rmdir(struct kernfs_node *kn)
+int cgroup_rmdir(struct kernfs_node *kn)
 {
        struct cgroup *cgrp;
        int ret = 0;
@@ -5535,10 +4382,8 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
        .remount_fs             = cgroup_remount,
-       .show_options           = cgroup_show_options,
        .mkdir                  = cgroup_mkdir,
        .rmdir                  = cgroup_rmdir,
-       .rename                 = cgroup_rename,
        .show_path              = cgroup_show_path,
 };
 
@@ -5646,8 +4491,8 @@ int __init cgroup_init(void)
 
        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
        BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
-       BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
-       BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+       BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+       BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
        /*
         * The latency of the synchronize_sched() is too high for cgroups,
@@ -5697,7 +4542,7 @@ int __init cgroup_init(void)
                        continue;
                }
 
-               if (cgroup_ssid_no_v1(ssid))
+               if (cgroup1_ssid_disabled(ssid))
                        printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
                               ss->name);
 
@@ -5744,15 +4589,6 @@ static int __init cgroup_wq_init(void)
         */
        cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
        BUG_ON(!cgroup_destroy_wq);
-
-       /*
-        * Used to destroy pidlists and separate to serve as flush domain.
-        * Cap @max_active to 1 too.
-        */
-       cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
-                                                   0, 1);
-       BUG_ON(!cgroup_pidlist_destroy_wq);
-
        return 0;
 }
 core_initcall(cgroup_wq_init);
@@ -5835,42 +4671,6 @@ out:
        return retval;
 }
 
-/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
-{
-       struct cgroup_subsys *ss;
-       int i;
-
-       seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
-       /*
-        * ideally we don't want subsystems moving around while we do this.
-        * cgroup_mutex is also necessary to guarantee an atomic snapshot of
-        * subsys/hierarchy state.
-        */
-       mutex_lock(&cgroup_mutex);
-
-       for_each_subsys(ss, i)
-               seq_printf(m, "%s\t%d\t%d\t%d\n",
-                          ss->legacy_name, ss->root->hierarchy_id,
-                          atomic_read(&ss->root->nr_cgrps),
-                          cgroup_ssid_enabled(i));
-
-       mutex_unlock(&cgroup_mutex);
-       return 0;
-}
-
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-static const struct file_operations proc_cgroupstats_operations = {
-       .open = cgroupstats_open,
-       .read = seq_read,
-       .llseek = seq_lseek,
-       .release = single_release,
-};
-
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
@@ -6050,76 +4850,6 @@ void cgroup_free(struct task_struct *task)
        put_css_set(cset);
 }
 
-static void check_for_release(struct cgroup *cgrp)
-{
-       if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
-           !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
-               schedule_work(&cgrp->release_agent_work);
-}
-
-/*
- * Notify userspace when a cgroup is released, by running the
- * configured release agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cgroup.
- *
- * This races with the possibility that some other task will be
- * attached to this cgroup before it is removed, or that some other
- * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
- * The presumed 'rmdir' will fail quietly if this cgroup is in use
- * again, and this cgroup will be reprieved from its death sentence,
- * to continue to serve a useful existence.  Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
- * means only wait until the task is successfully execve()'d.  The
- * separate release agent task is forked by call_usermodehelper(),
- * then control in this thread returns here, without waiting for the
- * release agent task.  We don't bother to wait because the caller of
- * this routine has no use for the exit status of the release agent
- * task, so no sense holding our caller up for that.
- */
-static void cgroup_release_agent(struct work_struct *work)
-{
-       struct cgroup *cgrp =
-               container_of(work, struct cgroup, release_agent_work);
-       char *pathbuf = NULL, *agentbuf = NULL;
-       char *argv[3], *envp[3];
-       int ret;
-
-       mutex_lock(&cgroup_mutex);
-
-       pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-       agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-       if (!pathbuf || !agentbuf)
-               goto out;
-
-       spin_lock_irq(&css_set_lock);
-       ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-       spin_unlock_irq(&css_set_lock);
-       if (ret < 0 || ret >= PATH_MAX)
-               goto out;
-
-       argv[0] = agentbuf;
-       argv[1] = pathbuf;
-       argv[2] = NULL;
-
-       /* minimal command environment */
-       envp[0] = "HOME=/";
-       envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-       envp[2] = NULL;
-
-       mutex_unlock(&cgroup_mutex);
-       call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-       goto out_free;
-out:
-       mutex_unlock(&cgroup_mutex);
-out_free:
-       kfree(agentbuf);
-       kfree(pathbuf);
-}
-
 static int __init cgroup_disable(char *str)
 {
        struct cgroup_subsys *ss;
@@ -6141,33 +4871,6 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
-static int __init cgroup_no_v1(char *str)
-{
-       struct cgroup_subsys *ss;
-       char *token;
-       int i;
-
-       while ((token = strsep(&str, ",")) != NULL) {
-               if (!*token)
-                       continue;
-
-               if (!strcmp(token, "all")) {
-                       cgroup_no_v1_mask = U16_MAX;
-                       break;
-               }
-
-               for_each_subsys(ss, i) {
-                       if (strcmp(token, ss->name) &&
-                           strcmp(token, ss->legacy_name))
-                               continue;
-
-                       cgroup_no_v1_mask |= 1 << i;
-               }
-       }
-       return 1;
-}
-__setup("cgroup_no_v1=", cgroup_no_v1);
-
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -6197,7 +4900,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
         * have been or be removed at any point.  @kn->priv is RCU
         * protected for this access.  See css_release_work_fn() for details.
         */
-       cgrp = rcu_dereference(kn->priv);
+       cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
        if (cgrp)
                css = cgroup_css(cgrp, ss);
 
@@ -6349,154 +5052,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 
 #endif /* CONFIG_SOCK_CGROUP_DATA */
 
-/* cgroup namespaces */
-
-static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
-{
-       return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
-}
-
-static void dec_cgroup_namespaces(struct ucounts *ucounts)
-{
-       dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
-}
-
-static struct cgroup_namespace *alloc_cgroup_ns(void)
-{
-       struct cgroup_namespace *new_ns;
-       int ret;
-
-       new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
-       if (!new_ns)
-               return ERR_PTR(-ENOMEM);
-       ret = ns_alloc_inum(&new_ns->ns);
-       if (ret) {
-               kfree(new_ns);
-               return ERR_PTR(ret);
-       }
-       atomic_set(&new_ns->count, 1);
-       new_ns->ns.ops = &cgroupns_operations;
-       return new_ns;
-}
-
-void free_cgroup_ns(struct cgroup_namespace *ns)
-{
-       put_css_set(ns->root_cset);
-       dec_cgroup_namespaces(ns->ucounts);
-       put_user_ns(ns->user_ns);
-       ns_free_inum(&ns->ns);
-       kfree(ns);
-}
-EXPORT_SYMBOL(free_cgroup_ns);
-
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
-                                       struct user_namespace *user_ns,
-                                       struct cgroup_namespace *old_ns)
-{
-       struct cgroup_namespace *new_ns;
-       struct ucounts *ucounts;
-       struct css_set *cset;
-
-       BUG_ON(!old_ns);
-
-       if (!(flags & CLONE_NEWCGROUP)) {
-               get_cgroup_ns(old_ns);
-               return old_ns;
-       }
-
-       /* Allow only sysadmin to create cgroup namespace. */
-       if (!ns_capable(user_ns, CAP_SYS_ADMIN))
-               return ERR_PTR(-EPERM);
-
-       ucounts = inc_cgroup_namespaces(user_ns);
-       if (!ucounts)
-               return ERR_PTR(-ENOSPC);
-
-       /* It is not safe to take cgroup_mutex here */
-       spin_lock_irq(&css_set_lock);
-       cset = task_css_set(current);
-       get_css_set(cset);
-       spin_unlock_irq(&css_set_lock);
-
-       new_ns = alloc_cgroup_ns();
-       if (IS_ERR(new_ns)) {
-               put_css_set(cset);
-               dec_cgroup_namespaces(ucounts);
-               return new_ns;
-       }
-
-       new_ns->user_ns = get_user_ns(user_ns);
-       new_ns->ucounts = ucounts;
-       new_ns->root_cset = cset;
-
-       return new_ns;
-}
-
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
-       return container_of(ns, struct cgroup_namespace, ns);
-}
-
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
-{
-       struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
-
-       if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
-           !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
-               return -EPERM;
-
-       /* Don't need to do anything if we are attaching to our own cgroupns. */
-       if (cgroup_ns == nsproxy->cgroup_ns)
-               return 0;
-
-       get_cgroup_ns(cgroup_ns);
-       put_cgroup_ns(nsproxy->cgroup_ns);
-       nsproxy->cgroup_ns = cgroup_ns;
-
-       return 0;
-}
-
-static struct ns_common *cgroupns_get(struct task_struct *task)
-{
-       struct cgroup_namespace *ns = NULL;
-       struct nsproxy *nsproxy;
-
-       task_lock(task);
-       nsproxy = task->nsproxy;
-       if (nsproxy) {
-               ns = nsproxy->cgroup_ns;
-               get_cgroup_ns(ns);
-       }
-       task_unlock(task);
-
-       return ns ? &ns->ns : NULL;
-}
-
-static void cgroupns_put(struct ns_common *ns)
-{
-       put_cgroup_ns(to_cg_ns(ns));
-}
-
-static struct user_namespace *cgroupns_owner(struct ns_common *ns)
-{
-       return to_cg_ns(ns)->user_ns;
-}
-
-const struct proc_ns_operations cgroupns_operations = {
-       .name           = "cgroup",
-       .type           = CLONE_NEWCGROUP,
-       .get            = cgroupns_get,
-       .put            = cgroupns_put,
-       .install        = cgroupns_install,
-       .owner          = cgroupns_owner,
-};
-
-static __init int cgroup_namespaces_init(void)
-{
-       return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
-
 #ifdef CONFIG_CGROUP_BPF
 int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
                      enum bpf_attach_type type, bool overridable)
@@ -6510,149 +5065,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
        return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
-       struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-       if (!css)
-               return ERR_PTR(-ENOMEM);
-
-       return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
-       kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
-                               struct cftype *cft)
-{
-       return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-                               struct cftype *cft)
-{
-       return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
-                                        struct cftype *cft)
-{
-       u64 count;
-
-       rcu_read_lock();
-       count = atomic_read(&task_css_set(current)->refcount);
-       rcu_read_unlock();
-       return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
-       struct cgrp_cset_link *link;
-       struct css_set *cset;
-       char *name_buf;
-
-       name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-       if (!name_buf)
-               return -ENOMEM;
-
-       spin_lock_irq(&css_set_lock);
-       rcu_read_lock();
-       cset = rcu_dereference(current->cgroups);
-       list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-               struct cgroup *c = link->cgrp;
-
-               cgroup_name(c, name_buf, NAME_MAX + 1);
-               seq_printf(seq, "Root %d group %s\n",
-                          c->root->hierarchy_id, name_buf);
-       }
-       rcu_read_unlock();
-       spin_unlock_irq(&css_set_lock);
-       kfree(name_buf);
-       return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
-       struct cgroup_subsys_state *css = seq_css(seq);
-       struct cgrp_cset_link *link;
-
-       spin_lock_irq(&css_set_lock);
-       list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
-               struct css_set *cset = link->cset;
-               struct task_struct *task;
-               int count = 0;
-
-               seq_printf(seq, "css_set %p\n", cset);
-
-               list_for_each_entry(task, &cset->tasks, cg_list) {
-                       if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-                               goto overflow;
-                       seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-               }
-
-               list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-                       if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-                               goto overflow;
-                       seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-               }
-               continue;
-       overflow:
-               seq_puts(seq, "  ...\n");
-       }
-       spin_unlock_irq(&css_set_lock);
-       return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
-       return (!cgroup_is_populated(css->cgroup) &&
-               !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] =  {
-       {
-               .name = "taskcount",
-               .read_u64 = debug_taskcount_read,
-       },
-
-       {
-               .name = "current_css_set",
-               .read_u64 = current_css_set_read,
-       },
-
-       {
-               .name = "current_css_set_refcount",
-               .read_u64 = current_css_set_refcount_read,
-       },
-
-       {
-               .name = "current_css_set_cg_links",
-               .seq_show = current_css_set_cg_links_read,
-       },
-
-       {
-               .name = "cgroup_css_links",
-               .seq_show = cgroup_css_links_read,
-       },
-
-       {
-               .name = "releasable",
-               .read_u64 = releasable_read,
-       },
-
-       { }     /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
-       .css_alloc = debug_css_alloc,
-       .css_free = debug_css_free,
-       .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
similarity index 100%
rename from kernel/cpuset.c
rename to kernel/cgroup/cpuset.c
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
new file mode 100644 (file)
index 0000000..cff7ea6
--- /dev/null
@@ -0,0 +1,155 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+
+/* cgroup namespaces */
+
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+       struct cgroup_namespace *new_ns;
+       int ret;
+
+       new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+       if (!new_ns)
+               return ERR_PTR(-ENOMEM);
+       ret = ns_alloc_inum(&new_ns->ns);
+       if (ret) {
+               kfree(new_ns);
+               return ERR_PTR(ret);
+       }
+       atomic_set(&new_ns->count, 1);
+       new_ns->ns.ops = &cgroupns_operations;
+       return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+       put_css_set(ns->root_cset);
+       dec_cgroup_namespaces(ns->ucounts);
+       put_user_ns(ns->user_ns);
+       ns_free_inum(&ns->ns);
+       kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+                                       struct user_namespace *user_ns,
+                                       struct cgroup_namespace *old_ns)
+{
+       struct cgroup_namespace *new_ns;
+       struct ucounts *ucounts;
+       struct css_set *cset;
+
+       BUG_ON(!old_ns);
+
+       if (!(flags & CLONE_NEWCGROUP)) {
+               get_cgroup_ns(old_ns);
+               return old_ns;
+       }
+
+       /* Allow only sysadmin to create cgroup namespace. */
+       if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       ucounts = inc_cgroup_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
+       /* It is not safe to take cgroup_mutex here */
+       spin_lock_irq(&css_set_lock);
+       cset = task_css_set(current);
+       get_css_set(cset);
+       spin_unlock_irq(&css_set_lock);
+
+       new_ns = alloc_cgroup_ns();
+       if (IS_ERR(new_ns)) {
+               put_css_set(cset);
+               dec_cgroup_namespaces(ucounts);
+               return new_ns;
+       }
+
+       new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->ucounts = ucounts;
+       new_ns->root_cset = cset;
+
+       return new_ns;
+}
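
copy_cgroup_ns() is what backs clone()/unshare() with CLONE_NEWCGROUP: the
new namespace pins the caller's current css_set as its root, so cgroup paths
reported to the task become relative to wherever it sat at unshare time. A
minimal userspace sketch (assumes a libc that defines CLONE_NEWCGROUP and a
caller with CAP_SYS_ADMIN):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            if (unshare(CLONE_NEWCGROUP) < 0) {     /* pins current css_set */
                    perror("unshare(CLONE_NEWCGROUP)");
                    return 1;
            }
            /* paths in /proc/self/cgroup now read as "/" */
            return system("cat /proc/self/cgroup") ? 1 : 0;
    }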
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+       return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+       struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+       if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+           !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* Don't need to do anything if we are attaching to our own cgroupns. */
+       if (cgroup_ns == nsproxy->cgroup_ns)
+               return 0;
+
+       get_cgroup_ns(cgroup_ns);
+       put_cgroup_ns(nsproxy->cgroup_ns);
+       nsproxy->cgroup_ns = cgroup_ns;
+
+       return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+       struct cgroup_namespace *ns = NULL;
+       struct nsproxy *nsproxy;
+
+       task_lock(task);
+       nsproxy = task->nsproxy;
+       if (nsproxy) {
+               ns = nsproxy->cgroup_ns;
+               get_cgroup_ns(ns);
+       }
+       task_unlock(task);
+
+       return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+       put_cgroup_ns(to_cg_ns(ns));
+}
+
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+       return to_cg_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+       .name           = "cgroup",
+       .type           = CLONE_NEWCGROUP,
+       .get            = cgroupns_get,
+       .put            = cgroupns_put,
+       .install        = cgroupns_install,
+       .owner          = cgroupns_owner,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+       return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
similarity index 100%
rename from kernel/cgroup_pids.c
rename to kernel/cgroup/pids.c
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644 (file)
index 0000000..defad3c
--- /dev/null
@@ -0,0 +1,619 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects the list of resource pools maintained on a per-cgroup basis
+ * and the rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+       RDMACG_RESOURCE_TYPE_MAX,
+       RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * Resource table definition as seen by the user. New entries
+ * need to be added here when more resources are defined at the
+ * IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+       [RDMACG_RESOURCE_HCA_HANDLE]    = "hca_handle",
+       [RDMACG_RESOURCE_HCA_OBJECT]    = "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+       int max;
+       int usage;
+};
+
+/*
+ * resource pool object which represents per-cgroup, per-device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within the rdma_cgroup structure. It
+ * is maintained as a list.
+ */
+struct rdmacg_resource_pool {
+       struct rdmacg_device    *device;
+       struct rdmacg_resource  resources[RDMACG_RESOURCE_MAX];
+
+       struct list_head        cg_node;
+       struct list_head        dev_node;
+
+       /* total outstanding charges across all resources of this pool */
+       u64                     usage_sum;
+       /* number of resources whose limit is currently set to max */
+       int                     num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+       return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+       return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+       return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+                              int index, int new_max)
+{
+       if (new_max == S32_MAX) {
+               if (rpool->resources[index].max != S32_MAX)
+                       rpool->num_max_cnt++;
+       } else {
+               if (rpool->resources[index].max == S32_MAX)
+                       rpool->num_max_cnt--;
+       }
+       rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+       int i;
+
+       for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+               set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+       lockdep_assert_held(&rdmacg_mutex);
+
+       list_del(&rpool->cg_node);
+       list_del(&rpool->dev_node);
+       kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+                    struct rdmacg_device *device)
+
+{
+       struct rdmacg_resource_pool *pool;
+
+       lockdep_assert_held(&rdmacg_mutex);
+
+       list_for_each_entry(pool, &cg->rpools, cg_node)
+               if (pool->device == device)
+                       return pool;
+
+       return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+       struct rdmacg_resource_pool *rpool;
+
+       rpool = find_cg_rpool_locked(cg, device);
+       if (rpool)
+               return rpool;
+
+       rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+       if (!rpool)
+               return ERR_PTR(-ENOMEM);
+
+       rpool->device = device;
+       set_all_resource_max_limit(rpool);
+
+       INIT_LIST_HEAD(&rpool->cg_node);
+       INIT_LIST_HEAD(&rpool->dev_node);
+       list_add_tail(&rpool->cg_node, &cg->rpools);
+       list_add_tail(&rpool->dev_node, &device->rpools);
+       return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to the cgroup to uncharge
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool, which was created as part of the
+ * charging operation, once no resources remain attached to it.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+                  struct rdmacg_device *device,
+                  enum rdmacg_resource_type index)
+{
+       struct rdmacg_resource_pool *rpool;
+
+       rpool = find_cg_rpool_locked(cg, device);
+
+       /*
+        * rpool cannot be NULL at this stage. Let the kernel keep operating
+        * in case there is a bug in the IB stack or the rdma controller,
+        * instead of crashing the system.
+        */
+       if (unlikely(!rpool)) {
+               pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
+               return;
+       }
+
+       rpool->resources[index].usage--;
+
+       /*
+        * A negative count (or overflow) is invalid,
+        * it indicates a bug in the rdma controller.
+        */
+       WARN_ON_ONCE(rpool->resources[index].usage < 0);
+       rpool->usage_sum--;
+       if (rpool->usage_sum == 0 &&
+           rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+               /*
+                * No user of the rpool and all entries are set to max, so
+                * safe to delete this rpool.
+                */
+               free_cg_rpool_locked(rpool);
+       }
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: cgroup from which to start uncharging
+ * @device: pointer to rdmacg device
+ * @stop_cg: cgroup at which to stop uncharging while traversing the
+ *           hierarchy
+ * @index: index of the resource to uncharge in the given resource pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+                                    struct rdmacg_device *device,
+                                    struct rdma_cgroup *stop_cg,
+                                    enum rdmacg_resource_type index)
+{
+       struct rdma_cgroup *p;
+
+       mutex_lock(&rdmacg_mutex);
+
+       for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+               uncharge_cg_locked(p, device, index);
+
+       mutex_unlock(&rdmacg_mutex);
+
+       css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: cgroup to uncharge
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in the given resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+                    struct rdmacg_device *device,
+                    enum rdmacg_resource_type index)
+{
+       if (index >= RDMACG_RESOURCE_MAX)
+               return;
+
+       rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function charges the resource hierarchically.  It fails if the
+ * charge would cause the new usage to exceed a limit anywhere in the
+ * hierarchy.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * On success, a pointer to the owning rdma cgroup is returned via @rdmacg.
+ *
+ * The charger needs to account resources on two criteria:
+ * (a) per cgroup and (b) per device resource usage.
+ * Per-cgroup accounting ensures that the tasks of a cgroup do not cross
+ * the configured limits.  Per-device accounting provides granular
+ * configuration when multiple devices are in use.  A resource pool is
+ * allocated in the hierarchy, for each parent encountered, on the first
+ * charge; later charges and uncharges find the pools already in place
+ * and are therefore much faster.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+                     struct rdmacg_device *device,
+                     enum rdmacg_resource_type index)
+{
+       struct rdma_cgroup *cg, *p;
+       struct rdmacg_resource_pool *rpool;
+       s64 new;
+       int ret = 0;
+
+       if (index >= RDMACG_RESOURCE_MAX)
+               return -EINVAL;
+
+       /*
+        * hold on to the css, as the cgroup can be removed while resource
+        * accounting still happens on the css.
+        */
+       cg = get_current_rdmacg();
+
+       mutex_lock(&rdmacg_mutex);
+       for (p = cg; p; p = parent_rdmacg(p)) {
+               rpool = get_cg_rpool_locked(p, device);
+               if (IS_ERR(rpool)) {
+                       ret = PTR_ERR(rpool);
+                       goto err;
+               } else {
+                       new = rpool->resources[index].usage + 1;
+                       if (new > rpool->resources[index].max) {
+                               ret = -EAGAIN;
+                               goto err;
+                       } else {
+                               rpool->resources[index].usage = new;
+                               rpool->usage_sum++;
+                       }
+               }
+       }
+       mutex_unlock(&rdmacg_mutex);
+
+       *rdmacg = cg;
+       return 0;
+
+err:
+       mutex_unlock(&rdmacg_mutex);
+       rdmacg_uncharge_hierarchy(cg, device, p, index);
+       return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
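A minimal sketch of the intended calling convention, assuming a hypothetical verbs-layer object; "struct foo" and ib_alloc_foo() are illustrative names, not existing kernel symbols:

	#include <linux/cgroup_rdma.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	struct foo {
		struct rdma_cgroup *cg;	/* owner recorded for the later uncharge */
	};

	static struct foo *ib_alloc_foo(struct rdmacg_device *device)
	{
		struct rdma_cgroup *cg;
		struct foo *obj;
		int ret;

		ret = rdmacg_try_charge(&cg, device, RDMACG_RESOURCE_HCA_OBJECT);
		if (ret)
			return ERR_PTR(ret);

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj) {
			/* undo the charge on any failure after a successful charge */
			rdmacg_uncharge(cg, device, RDMACG_RESOURCE_HCA_OBJECT);
			return ERR_PTR(-ENOMEM);
		}
		obj->cg = cg;
		return obj;
	}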
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If the IB stack wishes a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with the rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ * Returns 0 on success.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+       INIT_LIST_HEAD(&device->dev_node);
+       INIT_LIST_HEAD(&device->rpools);
+
+       mutex_lock(&rdmacg_mutex);
+       list_add_tail(&device->dev_node, &rdmacg_devices);
+       mutex_unlock(&rdmacg_mutex);
+       return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ *          controller using rdmacg_register_device().
+ *
+ * The IB stack must invoke this after all resources of the IB device
+ * have been destroyed and after ensuring that no more resources will be
+ * created once this API has been invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+       struct rdmacg_resource_pool *rpool, *tmp;
+
+       /*
+        * Synchronize with any active resource-setting or usage-query
+        * operation happening via the cgroup fs.
+        */
+       mutex_lock(&rdmacg_mutex);
+       list_del_init(&device->dev_node);
+
+       /*
+        * Now that this device is off the cgroup list, it's safe to free
+        * all the rpool resources.
+        */
+       list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+               free_cg_rpool_locked(rpool);
+
+       mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
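A minimal sketch of the device lifecycle, assuming a hypothetical driver structure; "struct my_ib_device" and the two callbacks are illustrative.  The embedded struct rdmacg_device is named after the hardware device so that rdma.max writes can address it:

	#include <linux/cgroup_rdma.h>

	struct my_ib_device {
		char name[64];
		struct rdmacg_device cg_device;	/* embedded cgroup handle */
	};

	static int my_ib_device_add(struct my_ib_device *dev)
	{
		dev->cg_device.name = dev->name;
		return rdmacg_register_device(&dev->cg_device);
	}

	static void my_ib_device_remove(struct my_ib_device *dev)
	{
		/* all HW resources must already be destroyed at this point */
		rdmacg_unregister_device(&dev->cg_device);
	}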
+
+static int parse_resource(char *c, int *intval)
+{
+       substring_t argstr;
+       const char **table = &rdmacg_resource_names[0];
+       char *name, *value = c;
+       size_t len;
+       int ret, i = 0;
+
+       name = strsep(&value, "=");
+       if (!name || !value)
+               return -EINVAL;
+
+       len = strlen(value);
+
+       for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+               if (strcmp(table[i], name))
+                       continue;
+
+               argstr.from = value;
+               argstr.to = value + len;
+
+               ret = match_int(&argstr, intval);
+               if (ret >= 0) {
+                       if (*intval < 0)
+                               break;
+                       return i;
+               }
+               if (strcmp(value, RDMACG_MAX_STR) == 0) {
+                       *intval = S32_MAX;
+                       return i;
+               }
+               break;
+       }
+       return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+                              int *new_limits, unsigned long *enables)
+{
+       char *c;
+       int err = -EINVAL;
+
+       /* parse resource options */
+       while ((c = strsep(&options, " ")) != NULL) {
+               int index, intval;
+
+               index = parse_resource(c, &intval);
+               if (index < 0)
+                       goto err;
+
+               new_limits[index] = intval;
+               *enables |= BIT(index);
+       }
+       return 0;
+
+err:
+       return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+       struct rdmacg_device *device;
+
+       lockdep_assert_held(&rdmacg_mutex);
+
+       list_for_each_entry(device, &rdmacg_devices, dev_node)
+               if (!strcmp(name, device->name))
+                       return device;
+
+       return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+                                      char *buf, size_t nbytes, loff_t off)
+{
+       struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+       const char *dev_name;
+       struct rdmacg_resource_pool *rpool;
+       struct rdmacg_device *device;
+       char *options = strstrip(buf);
+       int *new_limits;
+       unsigned long enables = 0;
+       int i = 0, ret = 0;
+
+       /* extract the device name first */
+       dev_name = strsep(&options, " ");
+       if (!dev_name) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+       if (!new_limits) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = rdmacg_parse_limits(options, new_limits, &enables);
+       if (ret)
+               goto parse_err;
+
+       /* acquire lock to synchronize with hot plug devices */
+       mutex_lock(&rdmacg_mutex);
+
+       device = rdmacg_get_device_locked(dev_name);
+       if (!device) {
+               ret = -ENODEV;
+               goto dev_err;
+       }
+
+       rpool = get_cg_rpool_locked(cg, device);
+       if (IS_ERR(rpool)) {
+               ret = PTR_ERR(rpool);
+               goto dev_err;
+       }
+
+       /* now set the new limits of the rpool */
+       for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+               set_resource_limit(rpool, i, new_limits[i]);
+
+       if (rpool->usage_sum == 0 &&
+           rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+               /*
+                * No user of the rpool and all entries are set to max, so
+                * safe to delete this rpool.
+                */
+               free_cg_rpool_locked(rpool);
+       }
+
+dev_err:
+       mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+       kfree(new_limits);
+
+err:
+       return ret ?: nbytes;
+}
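From user space, the accepted write format is "<device> <resource>=<value>", with the literal "max" releasing a limit.  A minimal sketch, assuming an illustrative device name and cgroup path:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	/* Device name "mlx4_0" and the cgroup path are illustrative. */
	static int set_rdma_max(void)
	{
		static const char buf[] = "mlx4_0 hca_handle=2 hca_object=max";
		int fd = open("/sys/fs/cgroup/rdma/app/rdma.max", O_WRONLY);
		int ret;

		if (fd < 0)
			return -1;
		ret = write(fd, buf, strlen(buf)) < 0 ? -1 : 0;
		close(fd);
		return ret;
	}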
+
+static void print_rpool_values(struct seq_file *sf,
+                              struct rdmacg_resource_pool *rpool)
+{
+       enum rdmacg_file_type sf_type;
+       int i;
+       u32 value;
+
+       sf_type = seq_cft(sf)->private;
+
+       for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+               seq_puts(sf, rdmacg_resource_names[i]);
+               seq_putc(sf, '=');
+               if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+                       if (rpool)
+                               value = rpool->resources[i].max;
+                       else
+                               value = S32_MAX;
+               } else {
+                       if (rpool)
+                               value = rpool->resources[i].usage;
+                       else
+                               value = 0;
+               }
+
+               if (value == S32_MAX)
+                       seq_puts(sf, RDMACG_MAX_STR);
+               else
+                       seq_printf(sf, "%d", value);
+               seq_putc(sf, ' ');
+       }
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+       struct rdmacg_device *device;
+       struct rdmacg_resource_pool *rpool;
+       struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+       mutex_lock(&rdmacg_mutex);
+
+       list_for_each_entry(device, &rdmacg_devices, dev_node) {
+               seq_printf(sf, "%s ", device->name);
+
+               rpool = find_cg_rpool_locked(cg, device);
+               print_rpool_values(sf, rpool);
+
+               seq_putc(sf, '\n');
+       }
+
+       mutex_unlock(&rdmacg_mutex);
+       return 0;
+}
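Reading rdma.current or rdma.max then yields one line per registered device in the format printed above, for example (device names and values illustrative):

	mlx4_0 hca_handle=2 hca_object=2000
	ocrdma1 hca_handle=3 hca_object=max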
+
+static struct cftype rdmacg_files[] = {
+       {
+               .name = "max",
+               .write = rdmacg_resource_set_max,
+               .seq_show = rdmacg_resource_read,
+               .private = RDMACG_RESOURCE_TYPE_MAX,
+               .flags = CFTYPE_NOT_ON_ROOT,
+       },
+       {
+               .name = "current",
+               .seq_show = rdmacg_resource_read,
+               .private = RDMACG_RESOURCE_TYPE_STAT,
+               .flags = CFTYPE_NOT_ON_ROOT,
+       },
+       { }     /* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+       struct rdma_cgroup *cg;
+
+       cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+       if (!cg)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&cg->rpools);
+       return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+       struct rdma_cgroup *cg = css_rdmacg(css);
+
+       kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and is responsible
+ * for shooting down all the rdmacg state associated with @css. As part of
+ * that it resets all resource pool limits to max, so that once the
+ * remaining resources are uncharged, the associated resource pools can be
+ * freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+       struct rdma_cgroup *cg = css_rdmacg(css);
+       struct rdmacg_resource_pool *rpool;
+
+       mutex_lock(&rdmacg_mutex);
+
+       list_for_each_entry(rpool, &cg->rpools, cg_node)
+               set_all_resource_max_limit(rpool);
+
+       mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+       .css_alloc      = rdmacg_css_alloc,
+       .css_free       = rdmacg_css_free,
+       .css_offline    = rdmacg_css_offline,
+       .legacy_cftypes = rdmacg_files,
+       .dfl_cftypes    = rdmacg_files,
+};
index 1a8f34f..26a06e0 100644 (file)
@@ -21,6 +21,7 @@ CONFIG_CP15_BARRIER_EMULATION=y
 CONFIG_DEFAULT_SECURITY_SELINUX=y
 CONFIG_EMBEDDED=y
 CONFIG_FB=y
+CONFIG_HARDENED_USERCOPY=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_INET6_AH=y
 CONFIG_INET6_ESP=y
@@ -129,6 +130,7 @@ CONFIG_PPP_DEFLATE=y
 CONFIG_PPP_MPPE=y
 CONFIG_PREEMPT=y
 CONFIG_QUOTA=y
+CONFIG_RANDOMIZE_BASE=y
 CONFIG_RTC_CLASS=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_SECCOMP=y
index 99127ed..28ee064 100644 (file)
@@ -1,4 +1,5 @@
 #  KEEP ALPHABETICALLY SORTED
+# CONFIG_AIO is not set
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_LEGACY_PTYS is not set
index b2eb354..1031bdf 100644 (file)
@@ -455,7 +455,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp,
                                loff_t *ppos)
 {
-       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
        if (ret || !write)
                return ret;
@@ -3522,6 +3522,8 @@ static void perf_event_enable_on_exec(int ctxn)
        if (enabled) {
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
+       } else {
+               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
        }
        perf_ctx_unlock(cpuctx, ctx);
 
@@ -9955,6 +9957,7 @@ SYSCALL_DEFINE5(perf_event_open,
                 * of swizzling perf_event::ctx.
                 */
                perf_remove_from_context(group_leader, 0);
+               put_ctx(gctx);
 
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
@@ -9993,13 +9996,6 @@ SYSCALL_DEFINE5(perf_event_open,
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
-
-               /*
-                * Now that all events are installed in @ctx, nothing
-                * references @gctx anymore, so drop the last reference we have
-                * on it.
-                */
-               put_ctx(gctx);
        }
 
        /*
@@ -10959,5 +10955,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc      = perf_cgroup_css_alloc,
        .css_free       = perf_cgroup_css_free,
        .attach         = perf_cgroup_attach,
+       /*
+        * Implicitly enable on dfl hierarchy so that perf events can
+        * always be filtered by cgroup2 path as long as perf_event
+        * controller is not mounted on a legacy hierarchy.
+        */
+       .implicit_on_dfl = true,
 };
 #endif /* CONFIG_CGROUP_PERF */
index 18c6b23..d630f8a 100644 (file)
@@ -747,7 +747,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
                        continue;
                }
 
-               if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+               if (!mmget_not_zero(vma->vm_mm))
                        continue;
 
                info = prev;
index 90b09ca..8a768a3 100644 (file)
@@ -539,7 +539,7 @@ static void exit_mm(void)
                __set_current_state(TASK_RUNNING);
                down_read(&mm->mmap_sem);
        }
-       atomic_inc(&mm->mm_count);
+       mmgrab(mm);
        BUG_ON(mm != current->active_mm);
        /* more a memory barrier than a real lock */
        task_lock(current);
index 348fe73..246bf9a 100644 (file)
@@ -1000,7 +1000,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
                if (task->flags & PF_KTHREAD)
                        mm = NULL;
                else
-                       atomic_inc(&mm->mm_users);
+                       mmget(mm);
        }
        task_unlock(task);
        return mm;
@@ -1188,7 +1188,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
        vmacache_flush(tsk);
 
        if (clone_flags & CLONE_VM) {
-               atomic_inc(&oldmm->mm_users);
+               mmget(oldmm);
                mm = oldmm;
                goto good_mm;
        }
index cdf3650..b687cb2 100644 (file)
@@ -338,7 +338,7 @@ static inline bool should_fail_futex(bool fshared)
 
 static inline void futex_get_mm(union futex_key *key)
 {
-       atomic_inc(&key->private.mm->mm_count);
+       mmgrab(key->private.mm);
        /*
         * Ensure futex_get_mm() implies a full barrier such that
         * get_futex_key() implies a full barrier. This is relied upon
index 6b66959..944d068 100644 (file)
@@ -353,7 +353,7 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
                return 0;
 
        /*
-        * Preserve the managed affinity setting and an userspace affinity
+        * Preserve the managed affinity setting and a userspace affinity
         * setup, but make sure that one of the targets is online.
         */
        if (irqd_affinity_is_managed(&desc->irq_data) ||
index 8f8dc91..0e413d9 100644 (file)
@@ -847,7 +847,7 @@ void relay_close(struct rchan *chan)
 
        if (chan->last_toobig)
                printk(KERN_WARNING "relay: one or more items not logged "
-                      "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+                      "[item size (%zd) > sub-buffer size (%zd)]\n",
                       chan->last_toobig, chan->subbuf_size);
 
        list_del(&chan->list);
index e1ae6ac..bbfb917 100644 (file)
@@ -1090,6 +1090,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        int ret = 0;
 
        rq = task_rq_lock(p, &rf);
+       update_rq_clock(rq);
 
        if (p->flags & PF_KTHREAD) {
                /*
@@ -2847,7 +2848,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
        if (!mm) {
                next->active_mm = oldmm;
-               atomic_inc(&oldmm->mm_count);
+               mmgrab(oldmm);
                enter_lazy_tlb(oldmm, next);
        } else
                switch_mm_irqs_off(oldmm, mm, next);
@@ -5560,7 +5561,7 @@ static void migrate_tasks(struct rq *dead_rq)
 {
        struct rq *rq = dead_rq;
        struct task_struct *next, *stop = rq->stop;
-       struct rq_flags rf, old_rf;
+       struct rq_flags rf;
        int dest_cpu;
 
        /*
@@ -5579,7 +5580,9 @@ static void migrate_tasks(struct rq *dead_rq)
         * class method both need to have an up-to-date
         * value of rq->clock[_task]
         */
+       rq_pin_lock(rq, &rf);
        update_rq_clock(rq);
+       rq_unpin_lock(rq, &rf);
 
        for (;;) {
                /*
@@ -5592,7 +5595,7 @@ static void migrate_tasks(struct rq *dead_rq)
                /*
                 * pick_next_task() assumes pinned rq->lock:
                 */
-               rq_pin_lock(rq, &rf);
+               rq_repin_lock(rq, &rf);
                next = pick_next_task(rq, &fake_task, &rf);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
@@ -5621,13 +5624,6 @@ static void migrate_tasks(struct rq *dead_rq)
                        continue;
                }
 
-               /*
-                * __migrate_task() may return with a different
-                * rq->lock held and a new cookie in 'rf', but we need
-                * to preserve rf::clock_update_flags for 'dead_rq'.
-                */
-               old_rf = rf;
-
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
@@ -5636,7 +5632,6 @@ static void migrate_tasks(struct rq *dead_rq)
                        raw_spin_unlock(&rq->lock);
                        rq = dead_rq;
                        raw_spin_lock(&rq->lock);
-                       rf = old_rf;
                }
                raw_spin_unlock(&next->pi_lock);
        }
@@ -6098,7 +6093,7 @@ void __init sched_init(void)
        /*
         * The boot idle thread does lazy MMU switching as well:
         */
-       atomic_inc(&init_mm.mm_count);
+       mmgrab(&init_mm);
        enter_lazy_tlb(&init_mm, current);
 
        /*
@@ -6819,11 +6814,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
 
-       sched_online_group(tg, parent);
-
        return &tg->css;
 }
 
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+       struct task_group *tg = css_tg(css);
+       struct task_group *parent = css_tg(css->parent);
+
+       if (parent)
+               sched_online_group(tg, parent);
+       return 0;
+}
+
 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
        struct task_group *tg = css_tg(css);
@@ -7229,6 +7233,7 @@ static struct cftype cpu_files[] = {
 
 struct cgroup_subsys cpu_cgrp_subsys = {
        .css_alloc      = cpu_cgroup_css_alloc,
+       .css_online     = cpu_cgroup_css_online,
        .css_released   = cpu_cgroup_css_released,
        .css_free       = cpu_cgroup_css_free,
        .fork           = cpu_cgroup_fork,
index 13f9def..214a8fe 100644 (file)
@@ -3239,10 +3239,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss)
 
 int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
 {
+       int err;
        struct task_struct *t = current;
-       return  __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
-               __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+       err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
+                        &uss->ss_sp) |
+               __put_user(t->sas_ss_flags, &uss->ss_flags) |
                __put_user(t->sas_ss_size, &uss->ss_size);
+       if (err)
+               return err;
+       if (t->sas_ss_flags & SS_AUTODISARM)
+               sas_ss_reset(t);
+       return 0;
 }
 #endif
 
index 0d887eb..01a9997 100644 (file)
@@ -311,7 +311,7 @@ EXPORT_SYMBOL_GPL(torture_random);
 /*
  * Variables for shuffling.  The idea is to ensure that each CPU stays
  * idle for an extended period to test interactions with dyntick idle,
- * as well as interactions with any per-CPU varibles.
+ * as well as interactions with any per-CPU variables.
  */
 struct shuffle_task {
        struct list_head st_l;
index 8f69579..0c8b78a 100644 (file)
@@ -559,7 +559,7 @@ config SBITMAP
        bool
 
 config PARMAN
-       tristate
+       tristate "parman" if COMPILE_TEST
 
 config PRIME_NUMBERS
        tristate
index 55735c9..97d62c2 100644 (file)
@@ -729,19 +729,6 @@ source "lib/Kconfig.kmemcheck"
 
 source "lib/Kconfig.kasan"
 
-config DEBUG_REFCOUNT
-       bool "Verbose refcount checks"
-       help
-         Say Y here if you want reference counters (refcount_t and kref) to
-         generate WARNs on dubious usage. Without this refcount_t will still
-         be a saturating counter and avoid Use-After-Free by turning it into
-         a resource leak Denial-Of-Service.
-
-         Use of this option will increase kernel text size but will alert the
-         admin of potential abuse.
-
-         If in doubt, say "N".
-
 endmenu # "Memory Debugging"
 
 config ARCH_HAS_KCOV
index c9023ef..320ac46 100644 (file)
@@ -25,6 +25,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
         earlycpio.o seq_buf.o siphash.o \
         nmi_backtrace.o nodemask.o win_minmax.o
 
+CFLAGS_radix-tree.o += -DCONFIG_SPARSE_RCU_POINTER
+CFLAGS_idr.o += -DCONFIG_SPARSE_RCU_POINTER
+
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 lib-$(CONFIG_DMA_NOOP_OPS) += dma-noop.o
@@ -38,7 +41,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
         gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
         bsearch.o find_bit.o llist.o memweight.o kfifo.o \
         percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-        once.o
+        once.o refcount.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
index e77dfe0..8fa0791 100644 (file)
@@ -87,6 +87,14 @@ config FONT_6x10
          embedded devices with a 320x240 screen, to get a reasonable number
          of characters (53x24) that are still at a readable size.
 
+config FONT_10x18
+       bool "console 10x18 font (not supported by all drivers)" if FONTS
+       depends on FRAMEBUFFER_CONSOLE
+       help
+         This is a high resolution console font for machines with very
+         big letters. It fits between the sun 12x22 and the normal 8x16 font.
+         If other fonts are too big or too small for you, say Y, otherwise say N.
+
 config FONT_SUN8x16
        bool "Sparc console 8x16 font"
        depends on FRAMEBUFFER_CONSOLE && (!SPARC && FONTS || SPARC)
@@ -101,14 +109,6 @@ config FONT_SUN12x22
          big letters (like the letters used in the SPARC PROM). If the
          standard font is unreadable for you, say Y, otherwise say N.
 
-config FONT_10x18
-       bool "console 10x18 font (not supported by all drivers)" if FONTS
-       depends on FRAMEBUFFER_CONSOLE
-       help
-         This is a high resolution console font for machines with very
-         big letters. It fits between the sun 12x22 and the normal 8x16 font.
-         If other fonts are too big or too small for you, say Y, otherwise say N.
-
 config FONT_AUTOSELECT
        def_bool y
        depends on !FONT_8x8
index 52d2979..b13682b 100644 (file)
--- a/lib/idr.c
+++ b/lib/idr.c
-/*
- * 2002-10-18  written by Jim Houston jim.houston@ccur.com
- *     Copyright (C) 2002 by Concurrent Computer Corporation
- *     Distributed under the GNU GPL license version 2.
- *
- * Modified by George Anzinger to reuse immediately and to use
- * find bit instructions.  Also removed _irq on spinlocks.
- *
- * Modified by Nadia Derbey to make it RCU safe.
- *
- * Small id to pointer translation service.
- *
- * It uses a radix tree like structure as a sparse array indexed
- * by the id to obtain the pointer.  The bitmap makes allocating
- * a new id quick.
- *
- * You call it to allocate an id (an int) an associate with that id a
- * pointer or what ever, we treat it as a (void *).  You can pass this
- * id to a user for him to pass back at a later time.  You then pass
- * that id to this code and it returns your pointer.
- */
-
-#ifndef TEST                        // to test in user space...
-#include <linux/slab.h>
-#include <linux/init.h>
+#include <linux/bitmap.h>
 #include <linux/export.h>
-#endif
-#include <linux/err.h>
-#include <linux/string.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/percpu.h>
-
-#define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
-#define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
-
-/* Leave the possibility of an incomplete final layer */
-#define MAX_IDR_LEVEL ((MAX_IDR_SHIFT + IDR_BITS - 1) / IDR_BITS)
 
-/* Number of id_layer structs to leave in free list */
-#define MAX_IDR_FREE (MAX_IDR_LEVEL * 2)
-
-static struct kmem_cache *idr_layer_cache;
-static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
-static DEFINE_PER_CPU(int, idr_preload_cnt);
+DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap);
 static DEFINE_SPINLOCK(simple_ida_lock);
 
-/* the maximum ID which can be allocated given idr->layers */
-static int idr_max(int layers)
-{
-       int bits = min_t(int, layers * IDR_BITS, MAX_IDR_SHIFT);
-
-       return (1 << bits) - 1;
-}
-
-/*
- * Prefix mask for an idr_layer at @layer.  For layer 0, the prefix mask is
- * all bits except for the lower IDR_BITS.  For layer 1, 2 * IDR_BITS, and
- * so on.
- */
-static int idr_layer_prefix_mask(int layer)
-{
-       return ~idr_max(layer + 1);
-}
-
-static struct idr_layer *get_from_free_list(struct idr *idp)
-{
-       struct idr_layer *p;
-       unsigned long flags;
-
-       spin_lock_irqsave(&idp->lock, flags);
-       if ((p = idp->id_free)) {
-               idp->id_free = p->ary[0];
-               idp->id_free_cnt--;
-               p->ary[0] = NULL;
-       }
-       spin_unlock_irqrestore(&idp->lock, flags);
-       return(p);
-}
-
 /**
- * idr_layer_alloc - allocate a new idr_layer
- * @gfp_mask: allocation mask
- * @layer_idr: optional idr to allocate from
- *
- * If @layer_idr is %NULL, directly allocate one using @gfp_mask or fetch
- * one from the per-cpu preload buffer.  If @layer_idr is not %NULL, fetch
- * an idr_layer from @idr->id_free.
- *
- * @layer_idr is to maintain backward compatibility with the old alloc
- * interface - idr_pre_get() and idr_get_new*() - and will be removed
- * together with per-pool preload buffer.
- */
-static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
-{
-       struct idr_layer *new;
-
-       /* this is the old path, bypass to get_from_free_list() */
-       if (layer_idr)
-               return get_from_free_list(layer_idr);
-
-       /*
-        * Try to allocate directly from kmem_cache.  We want to try this
-        * before preload buffer; otherwise, non-preloading idr_alloc()
-        * users will end up taking advantage of preloading ones.  As the
-        * following is allowed to fail for preloaded cases, suppress
-        * warning this time.
-        */
-       new = kmem_cache_zalloc(idr_layer_cache, gfp_mask | __GFP_NOWARN);
-       if (new)
-               return new;
-
-       /*
-        * Try to fetch one from the per-cpu preload buffer if in process
-        * context.  See idr_preload() for details.
-        */
-       if (!in_interrupt()) {
-               preempt_disable();
-               new = __this_cpu_read(idr_preload_head);
-               if (new) {
-                       __this_cpu_write(idr_preload_head, new->ary[0]);
-                       __this_cpu_dec(idr_preload_cnt);
-                       new->ary[0] = NULL;
-               }
-               preempt_enable();
-               if (new)
-                       return new;
-       }
-
-       /*
-        * Both failed.  Try kmem_cache again w/o adding __GFP_NOWARN so
-        * that memory allocation failure warning is printed as intended.
-        */
-       return kmem_cache_zalloc(idr_layer_cache, gfp_mask);
-}
-
-static void idr_layer_rcu_free(struct rcu_head *head)
-{
-       struct idr_layer *layer;
-
-       layer = container_of(head, struct idr_layer, rcu_head);
-       kmem_cache_free(idr_layer_cache, layer);
-}
-
-static inline void free_layer(struct idr *idr, struct idr_layer *p)
-{
-       if (idr->hint == p)
-               RCU_INIT_POINTER(idr->hint, NULL);
-       call_rcu(&p->rcu_head, idr_layer_rcu_free);
-}
-
-/* only called when idp->lock is held */
-static void __move_to_free_list(struct idr *idp, struct idr_layer *p)
-{
-       p->ary[0] = idp->id_free;
-       idp->id_free = p;
-       idp->id_free_cnt++;
-}
-
-static void move_to_free_list(struct idr *idp, struct idr_layer *p)
-{
-       unsigned long flags;
-
-       /*
-        * Depends on the return element being zeroed.
-        */
-       spin_lock_irqsave(&idp->lock, flags);
-       __move_to_free_list(idp, p);
-       spin_unlock_irqrestore(&idp->lock, flags);
-}
-
-static void idr_mark_full(struct idr_layer **pa, int id)
-{
-       struct idr_layer *p = pa[0];
-       int l = 0;
-
-       __set_bit(id & IDR_MASK, p->bitmap);
-       /*
-        * If this layer is full mark the bit in the layer above to
-        * show that this part of the radix tree is full.  This may
-        * complete the layer above and require walking up the radix
-        * tree.
-        */
-       while (bitmap_full(p->bitmap, IDR_SIZE)) {
-               if (!(p = pa[++l]))
-                       break;
-               id = id >> IDR_BITS;
-               __set_bit((id & IDR_MASK), p->bitmap);
-       }
-}
-
-static int __idr_pre_get(struct idr *idp, gfp_t gfp_mask)
-{
-       while (idp->id_free_cnt < MAX_IDR_FREE) {
-               struct idr_layer *new;
-               new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
-               if (new == NULL)
-                       return (0);
-               move_to_free_list(idp, new);
-       }
-       return 1;
-}
-
-/**
- * sub_alloc - try to allocate an id without growing the tree depth
- * @idp: idr handle
- * @starting_id: id to start search at
- * @pa: idr_layer[MAX_IDR_LEVEL] used as backtrack buffer
- * @gfp_mask: allocation mask for idr_layer_alloc()
- * @layer_idr: optional idr passed to idr_layer_alloc()
- *
- * Allocate an id in range [@starting_id, INT_MAX] from @idp without
- * growing its depth.  Returns
- *
- *  the allocated id >= 0 if successful,
- *  -EAGAIN if the tree needs to grow for allocation to succeed,
- *  -ENOSPC if the id space is exhausted,
- *  -ENOMEM if more idr_layers need to be allocated.
- */
-static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa,
-                    gfp_t gfp_mask, struct idr *layer_idr)
-{
-       int n, m, sh;
-       struct idr_layer *p, *new;
-       int l, id, oid;
-
-       id = *starting_id;
- restart:
-       p = idp->top;
-       l = idp->layers;
-       pa[l--] = NULL;
-       while (1) {
-               /*
-                * We run around this while until we reach the leaf node...
-                */
-               n = (id >> (IDR_BITS*l)) & IDR_MASK;
-               m = find_next_zero_bit(p->bitmap, IDR_SIZE, n);
-               if (m == IDR_SIZE) {
-                       /* no space available go back to previous layer. */
-                       l++;
-                       oid = id;
-                       id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1;
-
-                       /* if already at the top layer, we need to grow */
-                       if (id > idr_max(idp->layers)) {
-                               *starting_id = id;
-                               return -EAGAIN;
-                       }
-                       p = pa[l];
-                       BUG_ON(!p);
-
-                       /* If we need to go up one layer, continue the
-                        * loop; otherwise, restart from the top.
-                        */
-                       sh = IDR_BITS * (l + 1);
-                       if (oid >> sh == id >> sh)
-                               continue;
-                       else
-                               goto restart;
-               }
-               if (m != n) {
-                       sh = IDR_BITS*l;
-                       id = ((id >> sh) ^ n ^ m) << sh;
-               }
-               if ((id >= MAX_IDR_BIT) || (id < 0))
-                       return -ENOSPC;
-               if (l == 0)
-                       break;
-               /*
-                * Create the layer below if it is missing.
-                */
-               if (!p->ary[m]) {
-                       new = idr_layer_alloc(gfp_mask, layer_idr);
-                       if (!new)
-                               return -ENOMEM;
-                       new->layer = l-1;
-                       new->prefix = id & idr_layer_prefix_mask(new->layer);
-                       rcu_assign_pointer(p->ary[m], new);
-                       p->count++;
-               }
-               pa[l--] = p;
-               p = p->ary[m];
-       }
-
-       pa[l] = p;
-       return id;
-}
-
-static int idr_get_empty_slot(struct idr *idp, int starting_id,
-                             struct idr_layer **pa, gfp_t gfp_mask,
-                             struct idr *layer_idr)
-{
-       struct idr_layer *p, *new;
-       int layers, v, id;
-       unsigned long flags;
-
-       id = starting_id;
-build_up:
-       p = idp->top;
-       layers = idp->layers;
-       if (unlikely(!p)) {
-               if (!(p = idr_layer_alloc(gfp_mask, layer_idr)))
-                       return -ENOMEM;
-               p->layer = 0;
-               layers = 1;
-       }
-       /*
-        * Add a new layer to the top of the tree if the requested
-        * id is larger than the currently allocated space.
-        */
-       while (id > idr_max(layers)) {
-               layers++;
-               if (!p->count) {
-                       /* special case: if the tree is currently empty,
-                        * then we grow the tree by moving the top node
-                        * upwards.
-                        */
-                       p->layer++;
-                       WARN_ON_ONCE(p->prefix);
-                       continue;
-               }
-               if (!(new = idr_layer_alloc(gfp_mask, layer_idr))) {
-                       /*
-                        * The allocation failed.  If we built part of
-                        * the structure tear it down.
-                        */
-                       spin_lock_irqsave(&idp->lock, flags);
-                       for (new = p; p && p != idp->top; new = p) {
-                               p = p->ary[0];
-                               new->ary[0] = NULL;
-                               new->count = 0;
-                               bitmap_clear(new->bitmap, 0, IDR_SIZE);
-                               __move_to_free_list(idp, new);
-                       }
-                       spin_unlock_irqrestore(&idp->lock, flags);
-                       return -ENOMEM;
-               }
-               new->ary[0] = p;
-               new->count = 1;
-               new->layer = layers-1;
-               new->prefix = id & idr_layer_prefix_mask(new->layer);
-               if (bitmap_full(p->bitmap, IDR_SIZE))
-                       __set_bit(0, new->bitmap);
-               p = new;
-       }
-       rcu_assign_pointer(idp->top, p);
-       idp->layers = layers;
-       v = sub_alloc(idp, &id, pa, gfp_mask, layer_idr);
-       if (v == -EAGAIN)
-               goto build_up;
-       return(v);
-}
-
-/*
- * @id and @pa are from a successful allocation from idr_get_empty_slot().
- * Install the user pointer @ptr and mark the slot full.
- */
-static void idr_fill_slot(struct idr *idr, void *ptr, int id,
-                         struct idr_layer **pa)
-{
-       /* update hint used for lookup, cleared from free_layer() */
-       rcu_assign_pointer(idr->hint, pa[0]);
-
-       rcu_assign_pointer(pa[0]->ary[id & IDR_MASK], (struct idr_layer *)ptr);
-       pa[0]->count++;
-       idr_mark_full(pa, id);
-}
-
-
-/**
- * idr_preload - preload for idr_alloc()
- * @gfp_mask: allocation mask to use for preloading
- *
- * Preload per-cpu layer buffer for idr_alloc().  Can only be used from
- * process context and each idr_preload() invocation should be matched with
- * idr_preload_end().  Note that preemption is disabled while preloaded.
- *
- * The first idr_alloc() in the preloaded section can be treated as if it
- * were invoked with @gfp_mask used for preloading.  This allows using more
- * permissive allocation masks for idrs protected by spinlocks.
- *
- * For example, if idr_alloc() below fails, the failure can be treated as
- * if idr_alloc() were called with GFP_KERNEL rather than GFP_NOWAIT.
- *
- *     idr_preload(GFP_KERNEL);
- *     spin_lock(lock);
- *
- *     id = idr_alloc(idr, ptr, start, end, GFP_NOWAIT);
- *
- *     spin_unlock(lock);
- *     idr_preload_end();
- *     if (id < 0)
- *             error;
- */
-void idr_preload(gfp_t gfp_mask)
-{
-       /*
-        * Consuming preload buffer from non-process context breaks preload
-        * allocation guarantee.  Disallow usage from those contexts.
-        */
-       WARN_ON_ONCE(in_interrupt());
-       might_sleep_if(gfpflags_allow_blocking(gfp_mask));
-
-       preempt_disable();
-
-       /*
-        * idr_alloc() is likely to succeed w/o full idr_layer buffer and
-        * return value from idr_alloc() needs to be checked for failure
-        * anyway.  Silently give up if allocation fails.  The caller can
-        * treat failures from idr_alloc() as if idr_alloc() were called
-        * with @gfp_mask which should be enough.
-        */
-       while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
-               struct idr_layer *new;
-
-               preempt_enable();
-               new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
-               preempt_disable();
-               if (!new)
-                       break;
-
-               /* link the new one to per-cpu preload list */
-               new->ary[0] = __this_cpu_read(idr_preload_head);
-               __this_cpu_write(idr_preload_head, new);
-               __this_cpu_inc(idr_preload_cnt);
-       }
-}
-EXPORT_SYMBOL(idr_preload);
-
-/**
- * idr_alloc - allocate new idr entry
- * @idr: the (initialized) idr
+ * idr_alloc - allocate an id
+ * @idr: idr handle
  * @ptr: pointer to be associated with the new id
  * @start: the minimum id (inclusive)
- * @end: the maximum id (exclusive, <= 0 for max)
- * @gfp_mask: memory allocation flags
+ * @end: the maximum id (exclusive)
+ * @gfp: memory allocation flags
  *
- * Allocate an id in [start, end) and associate it with @ptr.  If no ID is
- * available in the specified range, returns -ENOSPC.  On memory allocation
- * failure, returns -ENOMEM.
+ * Allocates an unused ID in the range [start, end).  Returns -ENOSPC
+ * if there are no unused IDs in that range.
  *
  * Note that @end is treated as max when <= 0.  This is to always allow
  * using @start + N as @end as long as N is inside integer range.
  *
- * The user is responsible for exclusively synchronizing all operations
- * which may modify @idr.  However, read-only accesses such as idr_find()
- * or iteration can be performed under RCU read lock provided the user
- * destroys @ptr in RCU-safe way after removal from idr.
+ * Simultaneous modifications to the @idr are not allowed and should be
+ * prevented by the user, usually with a lock.  idr_alloc() may be called
+ * concurrently with read-only accesses to the @idr, such as idr_find() and
+ * idr_for_each_entry().
  */
-int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask)
+int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
 {
-       int max = end > 0 ? end - 1 : INT_MAX;  /* inclusive upper limit */
-       struct idr_layer *pa[MAX_IDR_LEVEL + 1];
-       int id;
+       void __rcu **slot;
+       struct radix_tree_iter iter;
 
-       might_sleep_if(gfpflags_allow_blocking(gfp_mask));
-
-       /* sanity checks */
        if (WARN_ON_ONCE(start < 0))
                return -EINVAL;
-       if (unlikely(max < start))
-               return -ENOSPC;
+       if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
+               return -EINVAL;
 
-       /* allocate id */
-       id = idr_get_empty_slot(idr, start, pa, gfp_mask, NULL);
-       if (unlikely(id < 0))
-               return id;
-       if (unlikely(id > max))
-               return -ENOSPC;
+       radix_tree_iter_init(&iter, start);
+       slot = idr_get_free(&idr->idr_rt, &iter, gfp, end);
+       if (IS_ERR(slot))
+               return PTR_ERR(slot);
 
-       idr_fill_slot(idr, ptr, id, pa);
-       return id;
+       radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
+       radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);
+       return iter.index;
 }
 EXPORT_SYMBOL_GPL(idr_alloc);
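A minimal sketch of the usual calling pattern, with a made-up my_idr/my_lock/my_register(): writers serialize with a spinlock, and idr_preload() lets the GFP_NOWAIT allocation inside the lock draw on preallocated nodes, while readers may use idr_find() concurrently under RCU:

	#include <linux/idr.h>
	#include <linux/spinlock.h>

	/* my_idr, my_lock and my_register() are illustrative names. */
	static DEFINE_IDR(my_idr);
	static DEFINE_SPINLOCK(my_lock);

	static int my_register(void *obj)
	{
		int id;

		idr_preload(GFP_KERNEL);
		spin_lock(&my_lock);
		id = idr_alloc(&my_idr, obj, 1, 0, GFP_NOWAIT);
		spin_unlock(&my_lock);
		idr_preload_end();

		return id;	/* >= 1 on success, -ENOMEM/-ENOSPC on failure */
	}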
 
 /**
  * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion
- * @idr: the (initialized) idr
+ * @idr: idr handle
  * @ptr: pointer to be associated with the new id
  * @start: the minimum id (inclusive)
- * @end: the maximum id (exclusive, <= 0 for max)
- * @gfp_mask: memory allocation flags
- *
- * Essentially the same as idr_alloc, but prefers to allocate progressively
- * higher ids if it can. If the "cur" counter wraps, then it will start again
- * at the "start" end of the range and allocate one that has already been used.
- */
-int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end,
-                       gfp_t gfp_mask)
-{
-       int id;
-
-       id = idr_alloc(idr, ptr, max(start, idr->cur), end, gfp_mask);
-       if (id == -ENOSPC)
-               id = idr_alloc(idr, ptr, start, end, gfp_mask);
-
-       if (likely(id >= 0))
-               idr->cur = id + 1;
-       return id;
-}
-EXPORT_SYMBOL(idr_alloc_cyclic);
-
-static void idr_remove_warning(int id)
-{
-       WARN(1, "idr_remove called for id=%d which is not allocated.\n", id);
-}
-
-static void sub_remove(struct idr *idp, int shift, int id)
-{
-       struct idr_layer *p = idp->top;
-       struct idr_layer **pa[MAX_IDR_LEVEL + 1];
-       struct idr_layer ***paa = &pa[0];
-       struct idr_layer *to_free;
-       int n;
-
-       *paa = NULL;
-       *++paa = &idp->top;
-
-       while ((shift > 0) && p) {
-               n = (id >> shift) & IDR_MASK;
-               __clear_bit(n, p->bitmap);
-               *++paa = &p->ary[n];
-               p = p->ary[n];
-               shift -= IDR_BITS;
-       }
-       n = id & IDR_MASK;
-       if (likely(p != NULL && test_bit(n, p->bitmap))) {
-               __clear_bit(n, p->bitmap);
-               RCU_INIT_POINTER(p->ary[n], NULL);
-               to_free = NULL;
-               while(*paa && ! --((**paa)->count)){
-                       if (to_free)
-                               free_layer(idp, to_free);
-                       to_free = **paa;
-                       **paa-- = NULL;
-               }
-               if (!*paa)
-                       idp->layers = 0;
-               if (to_free)
-                       free_layer(idp, to_free);
-       } else
-               idr_remove_warning(id);
-}
-
-/**
- * idr_remove - remove the given id and free its slot
- * @idp: idr handle
- * @id: unique key
- */
-void idr_remove(struct idr *idp, int id)
-{
-       struct idr_layer *p;
-       struct idr_layer *to_free;
-
-       if (id < 0)
-               return;
-
-       if (id > idr_max(idp->layers)) {
-               idr_remove_warning(id);
-               return;
-       }
-
-       sub_remove(idp, (idp->layers - 1) * IDR_BITS, id);
-       if (idp->top && idp->top->count == 1 && (idp->layers > 1) &&
-           idp->top->ary[0]) {
-               /*
-                * Single child at leftmost slot: we can shrink the tree.
-                * This level is not needed anymore since when layers are
-                * inserted, they are inserted at the top of the existing
-                * tree.
-                */
-               to_free = idp->top;
-               p = idp->top->ary[0];
-               rcu_assign_pointer(idp->top, p);
-               --idp->layers;
-               to_free->count = 0;
-               bitmap_clear(to_free->bitmap, 0, IDR_SIZE);
-               free_layer(idp, to_free);
-       }
-}
-EXPORT_SYMBOL(idr_remove);
-
-static void __idr_remove_all(struct idr *idp)
-{
-       int n, id, max;
-       int bt_mask;
-       struct idr_layer *p;
-       struct idr_layer *pa[MAX_IDR_LEVEL + 1];
-       struct idr_layer **paa = &pa[0];
-
-       n = idp->layers * IDR_BITS;
-       *paa = idp->top;
-       RCU_INIT_POINTER(idp->top, NULL);
-       max = idr_max(idp->layers);
-
-       id = 0;
-       while (id >= 0 && id <= max) {
-               p = *paa;
-               while (n > IDR_BITS && p) {
-                       n -= IDR_BITS;
-                       p = p->ary[(id >> n) & IDR_MASK];
-                       *++paa = p;
-               }
-
-               bt_mask = id;
-               id += 1 << n;
-               /* Get the highest bit that the above add changed from 0->1. */
-               while (n < fls(id ^ bt_mask)) {
-                       if (*paa)
-                               free_layer(idp, *paa);
-                       n += IDR_BITS;
-                       --paa;
-               }
-       }
-       idp->layers = 0;
-}
-
-/**
- * idr_destroy - release all cached layers within an idr tree
- * @idp: idr handle
- *
- * Free all id mappings and all idp_layers.  After this function, @idp is
- * completely unused and can be freed / recycled.  The caller is
- * responsible for ensuring that no one else accesses @idp during or after
- * idr_destroy().
+ * @end: the maximum id (exclusive)
+ * @gfp: memory allocation flags
  *
- * A typical clean-up sequence for objects stored in an idr tree will use
- * idr_for_each() to free all objects, if necessary, then idr_destroy() to
- * free up the id mappings and cached idr_layers.
+ * Allocates an ID larger than the last ID allocated if one is available.
+ * If not, it will attempt to allocate the smallest ID that is larger than
+ * or equal to @start.
  */
-void idr_destroy(struct idr *idp)
+int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
 {
-       __idr_remove_all(idp);
+       int id, curr = idr->idr_next;
 
-       while (idp->id_free_cnt) {
-               struct idr_layer *p = get_from_free_list(idp);
-               kmem_cache_free(idr_layer_cache, p);
-       }
-}
-EXPORT_SYMBOL(idr_destroy);
+       if (curr < start)
+               curr = start;
 
-void *idr_find_slowpath(struct idr *idp, int id)
-{
-       int n;
-       struct idr_layer *p;
-
-       if (id < 0)
-               return NULL;
-
-       p = rcu_dereference_raw(idp->top);
-       if (!p)
-               return NULL;
-       n = (p->layer+1) * IDR_BITS;
+       id = idr_alloc(idr, ptr, curr, end, gfp);
+       if ((id == -ENOSPC) && (curr > start))
+               id = idr_alloc(idr, ptr, start, curr, gfp);
 
-       if (id > idr_max(p->layer + 1))
-               return NULL;
-       BUG_ON(n == 0);
+       if (id >= 0)
+               idr->idr_next = id + 1U;
 
-       while (n > 0 && p) {
-               n -= IDR_BITS;
-               BUG_ON(n != p->layer*IDR_BITS);
-               p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]);
-       }
-       return((void *)p);
+       return id;
 }
-EXPORT_SYMBOL(idr_find_slowpath);
+EXPORT_SYMBOL(idr_alloc_cyclic);
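Cyclic allocation suits handles exposed to user space, where immediately recycling a freed ID could confuse stale references.  A sketch reusing the illustrative my_idr/my_lock from the idr_alloc() example above:

	/* Builds on the hypothetical my_idr/my_lock sketch above. */
	static int my_handle_new(void *obj)
	{
		int id;

		spin_lock(&my_lock);
		id = idr_alloc_cyclic(&my_idr, obj, 1, 0, GFP_ATOMIC);
		spin_unlock(&my_lock);
		return id;
	}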
 
 /**
  * idr_for_each - iterate through all stored pointers
- * @idp: idr handle
+ * @idr: idr handle
  * @fn: function to be called for each pointer
- * @data: data passed back to callback function
+ * @data: data passed to callback function
  *
- * Iterate over the pointers registered with the given idr.  The
- * callback function will be called for each pointer currently
- * registered, passing the id, the pointer and the data pointer passed
- * to this function.  It is not safe to modify the idr tree while in
- * the callback, so functions such as idr_get_new and idr_remove are
- * not allowed.
+ * The callback function will be called for each entry in @idr, passing
+ * the id, the pointer and the data pointer passed to this function.
  *
- * We check the return of @fn each time. If it returns anything other
- * than %0, we break out and return that value.
+ * If @fn returns anything other than %0, the iteration stops and that
+ * value is returned from this function.
  *
- * The caller must serialize idr_for_each() vs idr_get_new() and idr_remove().
+ * idr_for_each() can be called concurrently with idr_alloc() and
+ * idr_remove() if protected by RCU.  Newly added entries may not be
+ * seen and deleted entries may be seen, but adding and removing entries
+ * will not cause other entries to be skipped, nor spurious ones to be seen.
  */
-int idr_for_each(struct idr *idp,
-                int (*fn)(int id, void *p, void *data), void *data)
+int idr_for_each(const struct idr *idr,
+               int (*fn)(int id, void *p, void *data), void *data)
 {
-       int n, id, max, error = 0;
-       struct idr_layer *p;
-       struct idr_layer *pa[MAX_IDR_LEVEL + 1];
-       struct idr_layer **paa = &pa[0];
-
-       n = idp->layers * IDR_BITS;
-       *paa = rcu_dereference_raw(idp->top);
-       max = idr_max(idp->layers);
+       struct radix_tree_iter iter;
+       void __rcu **slot;
 
-       id = 0;
-       while (id >= 0 && id <= max) {
-               p = *paa;
-               while (n > 0 && p) {
-                       n -= IDR_BITS;
-                       p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]);
-                       *++paa = p;
-               }
-
-               if (p) {
-                       error = fn(id, (void *)p, data);
-                       if (error)
-                               break;
-               }
-
-               id += 1 << n;
-               while (n < fls(id)) {
-                       n += IDR_BITS;
-                       --paa;
-               }
+       radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
+               int ret = fn(iter.index, rcu_dereference_raw(*slot), data);
+               if (ret)
+                       return ret;
        }
 
-       return error;
+       return 0;
 }
 EXPORT_SYMBOL(idr_for_each);
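
A hedged sketch of an idr_for_each() callback; the seq_file output and the
"sessions" IDR are illustrative assumptions, not part of this patch:

	static int session_show(int id, void *p, void *data)
	{
		struct seq_file *m = data;

		seq_printf(m, "id %d -> %p\n", id, p);
		return 0;	/* non-zero stops the walk, per the doc above */
	}

	/* usage: idr_for_each(&sessions, session_show, m); */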
 
 /**
- * idr_get_next - lookup next object of id to given id.
- * @idp: idr handle
- * @nextidp:  pointer to lookup key
- *
- * Returns pointer to registered object with id, which is next number to
- * given id. After being looked up, *@nextidp will be updated for the next
- * iteration.
- *
- * This function can be called under rcu_read_lock(), given that the leaf
- * pointers lifetimes are correctly managed.
+ * idr_get_next - Find next populated entry
+ * @idr: idr handle
+ * @nextid: Pointer to lowest possible ID to return
+ *
+ * Returns the next populated entry in the tree with an ID greater than
+ * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
+ * to the ID of the found value.  To use in a loop, the value pointed to by
+ * @nextid must be incremented by the user.
  */
-void *idr_get_next(struct idr *idp, int *nextidp)
+void *idr_get_next(struct idr *idr, int *nextid)
 {
-       struct idr_layer *p, *pa[MAX_IDR_LEVEL + 1];
-       struct idr_layer **paa = &pa[0];
-       int id = *nextidp;
-       int n, max;
+       struct radix_tree_iter iter;
+       void __rcu **slot;
 
-       /* find first ent */
-       p = *paa = rcu_dereference_raw(idp->top);
-       if (!p)
+       slot = radix_tree_iter_find(&idr->idr_rt, &iter, *nextid);
+       if (!slot)
                return NULL;
-       n = (p->layer + 1) * IDR_BITS;
-       max = idr_max(p->layer + 1);
-
-       while (id >= 0 && id <= max) {
-               p = *paa;
-               while (n > 0 && p) {
-                       n -= IDR_BITS;
-                       p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]);
-                       *++paa = p;
-               }
-
-               if (p) {
-                       *nextidp = id;
-                       return p;
-               }
 
-               /*
-                * Proceed to the next layer at the current level.  Unlike
-                * idr_for_each(), @id isn't guaranteed to be aligned to
-                * layer boundary at this point and adding 1 << n may
-                * incorrectly skip IDs.  Make sure we jump to the
-                * beginning of the next layer using round_up().
-                */
-               id = round_up(id + 1, 1 << n);
-               while (n < fls(id)) {
-                       n += IDR_BITS;
-                       --paa;
-               }
-       }
-       return NULL;
+       *nextid = iter.index;
+       return rcu_dereference_raw(*slot);
 }
 EXPORT_SYMBOL(idr_get_next);
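
The increment the comment insists on looks like this in practice (a minimal
sketch against the hypothetical "sessions" IDR used above):

	int id = 0;
	void *entry;

	while ((entry = idr_get_next(&sessions, &id)) != NULL) {
		/* entry is the object registered under 'id' */
		id++;	/* step past the returned ID, or this loops forever */
	}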
 
-
 /**
  * idr_replace - replace pointer for given id
- * @idp: idr handle
- * @ptr: pointer you want associated with the id
- * @id: lookup key
+ * @idr: idr handle
+ * @ptr: New pointer to associate with the ID
+ * @id: Lookup key
  *
- * Replace the pointer registered with an id and return the old value.
- * A %-ENOENT return indicates that @id was not found.
- * A %-EINVAL return indicates that @id was not within valid constraints.
+ * Replace the pointer registered with an ID and return the old value.
+ * This function can be called under the RCU read lock concurrently with
+ * idr_alloc() and idr_remove() (as long as the ID being removed is not
+ * the one being replaced!).
  *
- * The caller must serialize with writers.
+ * Returns: the old value on success.  %-ENOENT indicates that @id was
+ * not found.  %-EINVAL indicates that @id or @ptr were not valid.
  */
-void *idr_replace(struct idr *idp, void *ptr, int id)
+void *idr_replace(struct idr *idr, void *ptr, int id)
 {
-       int n;
-       struct idr_layer *p, *old_p;
+       struct radix_tree_node *node;
+       void __rcu **slot = NULL;
+       void *entry;
 
-       if (id < 0)
+       if (WARN_ON_ONCE(id < 0))
+               return ERR_PTR(-EINVAL);
+       if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
                return ERR_PTR(-EINVAL);
 
-       p = idp->top;
-       if (!p)
-               return ERR_PTR(-ENOENT);
-
-       if (id > idr_max(p->layer + 1))
-               return ERR_PTR(-ENOENT);
-
-       n = p->layer * IDR_BITS;
-       while ((n > 0) && p) {
-               p = p->ary[(id >> n) & IDR_MASK];
-               n -= IDR_BITS;
-       }
-
-       n = id & IDR_MASK;
-       if (unlikely(p == NULL || !test_bit(n, p->bitmap)))
+       entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot);
+       if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE))
                return ERR_PTR(-ENOENT);
 
-       old_p = p->ary[n];
-       rcu_assign_pointer(p->ary[n], ptr);
+       __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL, NULL);
 
-       return old_p;
+       return entry;
 }
 EXPORT_SYMBOL(idr_replace);
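
A hedged usage sketch, assuming the stored object embeds an rcu_head (here
named "rcu") so the old value can be freed once concurrent RCU readers finish;
all names are illustrative:

	struct session *old;

	old = idr_replace(&sessions, new_session, id);
	if (IS_ERR(old))
		return PTR_ERR(old);	/* -ENOENT or -EINVAL, as documented */
	kfree_rcu(old, rcu);		/* 'rcu' is the assumed rcu_head member */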
 
-void __init idr_init_cache(void)
-{
-       idr_layer_cache = kmem_cache_create("idr_layer_cache",
-                               sizeof(struct idr_layer), 0, SLAB_PANIC, NULL);
-}
-
-/**
- * idr_init - initialize idr handle
- * @idp:       idr handle
- *
- * This function is use to set up the handle (@idp) that you will pass
- * to the rest of the functions.
- */
-void idr_init(struct idr *idp)
-{
-       memset(idp, 0, sizeof(struct idr));
-       spin_lock_init(&idp->lock);
-}
-EXPORT_SYMBOL(idr_init);
-
-static int idr_has_entry(int id, void *p, void *data)
-{
-       return 1;
-}
-
-bool idr_is_empty(struct idr *idp)
-{
-       return !idr_for_each(idp, idr_has_entry, NULL);
-}
-EXPORT_SYMBOL(idr_is_empty);
-
 /**
  * DOC: IDA description
- * IDA - IDR based ID allocator
  *
- * This is id allocator without id -> pointer translation.  Memory
- * usage is much lower than full blown idr because each id only
- * occupies a bit.  ida uses a custom leaf node which contains
- * IDA_BITMAP_BITS slots.
- *
- * 2007-04-25  written by Tejun Heo <htejun@gmail.com>
+ * The IDA is an ID allocator which does not provide the ability to
+ * associate an ID with a pointer.  As such, it only needs to store one
+ * bit per ID, and so is more space efficient than an IDR.  To use an IDA,
+ * define it using DEFINE_IDA() (or embed a &struct ida in a data structure,
+ * then initialise it using ida_init()).  To allocate a new ID, call
+ * ida_simple_get().  To free an ID, call ida_simple_remove().
+ *
+ * If you have more complex locking requirements, use a loop around
+ * ida_pre_get() and ida_get_new() to allocate a new ID.  Then use
+ * ida_remove() to free an ID.  You must make sure that ida_get_new() and
+ * ida_remove() cannot be called at the same time as each other for the
+ * same IDA.
+ *
+ * You can also use ida_get_new_above() if you need an ID to be allocated
+ * above a particular number.  ida_destroy() can be used to dispose of an
+ * IDA without needing to free the individual IDs in it.  You can use
+ * ida_is_empty() to find out whether the IDA has any IDs currently allocated.
+ *
+ * IDs are currently limited to the range [0-INT_MAX].  If this is an awkward
+ * limitation, it should be quite straightforward to raise the maximum.
  */
 
-static void free_bitmap(struct ida *ida, struct ida_bitmap *bitmap)
-{
-       unsigned long flags;
-
-       if (!ida->free_bitmap) {
-               spin_lock_irqsave(&ida->idr.lock, flags);
-               if (!ida->free_bitmap) {
-                       ida->free_bitmap = bitmap;
-                       bitmap = NULL;
-               }
-               spin_unlock_irqrestore(&ida->idr.lock, flags);
-       }
-
-       kfree(bitmap);
-}
-
-/**
- * ida_pre_get - reserve resources for ida allocation
- * @ida:       ida handle
- * @gfp_mask:  memory allocation flag
- *
- * This function should be called prior to locking and calling the
- * following function.  It preallocates enough memory to satisfy the
- * worst possible allocation.
- *
- * If the system is REALLY out of memory this function returns %0,
- * otherwise %1.
+/*
+ * Developer's notes:
+ *
+ * The IDA uses the functionality provided by the IDR & radix tree to store
+ * bitmaps in each entry.  The IDR_FREE tag means there is at least one bit
+ * free, unlike the IDR where it means at least one entry is free.
+ *
+ * I considered telling the radix tree that each slot is an order-10 node
+ * and storing the bit numbers in the radix tree, but the radix tree can't
+ * allow a single multiorder entry at index 0, which would significantly
+ * increase memory consumption for the IDA.  So instead we divide the index
+ * by the number of bits in the leaf bitmap before doing a radix tree lookup.
+ *
+ * As an optimisation, if there are only a few low bits set in any given
+ * leaf, instead of allocating a 128-byte bitmap, we use the 'exceptional
+ * entry' functionality of the radix tree to store BITS_PER_LONG - 2 bits
+ * directly in the entry.  By being really tricksy, we could store
+ * BITS_PER_LONG - 1 bits, but there are diminishing returns after optimising
+ * for 0-3 allocated IDs.
+ *
+ * We allow the radix tree 'exceptional' count to get out of date.  Nothing
+ * in the IDA nor the radix tree code checks it.  If it becomes important
+ * to maintain an accurate exceptional count, switch the rcu_assign_pointer()
+ * calls to radix_tree_iter_replace() which will correct the exceptional
+ * count.
+ *
+ * The IDA always requires a lock to alloc/free.  If we add a 'test_bit'
+ * equivalent, it will still need locking.  Going to RCU lookup would require
+ * using RCU to free bitmaps, and that's not trivial without embedding an
+ * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
+ * bitmap, which is excessive.
  */
-int ida_pre_get(struct ida *ida, gfp_t gfp_mask)
-{
-       /* allocate idr_layers */
-       if (!__idr_pre_get(&ida->idr, gfp_mask))
-               return 0;
 
-       /* allocate free_bitmap */
-       if (!ida->free_bitmap) {
-               struct ida_bitmap *bitmap;
-
-               bitmap = kmalloc(sizeof(struct ida_bitmap), gfp_mask);
-               if (!bitmap)
-                       return 0;
-
-               free_bitmap(ida, bitmap);
-       }
-
-       return 1;
-}
-EXPORT_SYMBOL(ida_pre_get);
+#define IDA_MAX (0x80000000U / IDA_BITMAP_BITS)
 
 /**
  * ida_get_new_above - allocate new ID above or equal to a start id
- * @ida:       ida handle
- * @starting_id: id to start search at
- * @p_id:      pointer to the allocated handle
+ * @ida: ida handle
+ * @start: id to start search at
+ * @id: pointer to the allocated handle
  *
- * Allocate new ID above or equal to @starting_id.  It should be called
- * with any required locks.
+ * Allocate new ID above or equal to @start.  It should be called
+ * with any required locks to ensure that concurrent calls to
+ * ida_get_new_above() / ida_get_new() / ida_remove() are not allowed.
+ * Consider using ida_simple_get() if you do not have complex locking
+ * requirements.
  *
 * If memory is required, it will return %-EAGAIN; you should unlock
  * and go back to the ida_pre_get() call.  If the ida is full, it will
- * return %-ENOSPC.
- *
- * Note that callers must ensure that concurrent access to @ida is not possible.
- * See ida_simple_get() for a varaint which takes care of locking.
+ * return %-ENOSPC.  On success, it will return 0.
  *
- * @p_id returns a value in the range @starting_id ... %0x7fffffff.
+ * @id returns a value in the range @start ... %0x7fffffff.
  */
-int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
+int ida_get_new_above(struct ida *ida, int start, int *id)
 {
-       struct idr_layer *pa[MAX_IDR_LEVEL + 1];
+       struct radix_tree_root *root = &ida->ida_rt;
+       void __rcu **slot;
+       struct radix_tree_iter iter;
        struct ida_bitmap *bitmap;
-       unsigned long flags;
-       int idr_id = starting_id / IDA_BITMAP_BITS;
-       int offset = starting_id % IDA_BITMAP_BITS;
-       int t, id;
-
- restart:
-       /* get vacant slot */
-       t = idr_get_empty_slot(&ida->idr, idr_id, pa, 0, &ida->idr);
-       if (t < 0)
-               return t == -ENOMEM ? -EAGAIN : t;
-
-       if (t * IDA_BITMAP_BITS >= MAX_IDR_BIT)
-               return -ENOSPC;
-
-       if (t != idr_id)
-               offset = 0;
-       idr_id = t;
-
-       /* if bitmap isn't there, create a new one */
-       bitmap = (void *)pa[0]->ary[idr_id & IDR_MASK];
-       if (!bitmap) {
-               spin_lock_irqsave(&ida->idr.lock, flags);
-               bitmap = ida->free_bitmap;
-               ida->free_bitmap = NULL;
-               spin_unlock_irqrestore(&ida->idr.lock, flags);
-
-               if (!bitmap)
-                       return -EAGAIN;
-
-               memset(bitmap, 0, sizeof(struct ida_bitmap));
-               rcu_assign_pointer(pa[0]->ary[idr_id & IDR_MASK],
-                               (void *)bitmap);
-               pa[0]->count++;
-       }
-
-       /* lookup for empty slot */
-       t = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, offset);
-       if (t == IDA_BITMAP_BITS) {
-               /* no empty slot after offset, continue to the next chunk */
-               idr_id++;
-               offset = 0;
-               goto restart;
-       }
-
-       id = idr_id * IDA_BITMAP_BITS + t;
-       if (id >= MAX_IDR_BIT)
-               return -ENOSPC;
+       unsigned long index;
+       unsigned bit, ebit;
+       int new;
+
+       index = start / IDA_BITMAP_BITS;
+       bit = start % IDA_BITMAP_BITS;
+       ebit = bit + RADIX_TREE_EXCEPTIONAL_SHIFT;
+
+       slot = radix_tree_iter_init(&iter, index);
+       for (;;) {
+               if (slot)
+                       slot = radix_tree_next_slot(slot, &iter,
+                                               RADIX_TREE_ITER_TAGGED);
+               if (!slot) {
+                       slot = idr_get_free(root, &iter, GFP_NOWAIT, IDA_MAX);
+                       if (IS_ERR(slot)) {
+                               if (slot == ERR_PTR(-ENOMEM))
+                                       return -EAGAIN;
+                               return PTR_ERR(slot);
+                       }
+               }
+               if (iter.index > index) {
+                       bit = 0;
+                       ebit = RADIX_TREE_EXCEPTIONAL_SHIFT;
+               }
+               new = iter.index * IDA_BITMAP_BITS;
+               bitmap = rcu_dereference_raw(*slot);
+               if (radix_tree_exception(bitmap)) {
+                       unsigned long tmp = (unsigned long)bitmap;
+                       ebit = find_next_zero_bit(&tmp, BITS_PER_LONG, ebit);
+                       if (ebit < BITS_PER_LONG) {
+                               tmp |= 1UL << ebit;
+                               rcu_assign_pointer(*slot, (void *)tmp);
+                               *id = new + ebit - RADIX_TREE_EXCEPTIONAL_SHIFT;
+                               return 0;
+                       }
+                       bitmap = this_cpu_xchg(ida_bitmap, NULL);
+                       if (!bitmap)
+                               return -EAGAIN;
+                       memset(bitmap, 0, sizeof(*bitmap));
+                       bitmap->bitmap[0] = tmp >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+                       rcu_assign_pointer(*slot, bitmap);
+               }
 
-       __set_bit(t, bitmap->bitmap);
-       if (++bitmap->nr_busy == IDA_BITMAP_BITS)
-               idr_mark_full(pa, idr_id);
+               if (bitmap) {
+                       bit = find_next_zero_bit(bitmap->bitmap,
+                                                       IDA_BITMAP_BITS, bit);
+                       new += bit;
+                       if (new < 0)
+                               return -ENOSPC;
+                       if (bit == IDA_BITMAP_BITS)
+                               continue;
 
-       *p_id = id;
+                       __set_bit(bit, bitmap->bitmap);
+                       if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
+                               radix_tree_iter_tag_clear(root, &iter,
+                                                               IDR_FREE);
+               } else {
+                       new += bit;
+                       if (new < 0)
+                               return -ENOSPC;
+                       if (ebit < BITS_PER_LONG) {
+                               bitmap = (void *)((1UL << ebit) |
+                                               RADIX_TREE_EXCEPTIONAL_ENTRY);
+                               radix_tree_iter_replace(root, &iter, slot,
+                                               bitmap);
+                               *id = new;
+                               return 0;
+                       }
+                       bitmap = this_cpu_xchg(ida_bitmap, NULL);
+                       if (!bitmap)
+                               return -EAGAIN;
+                       memset(bitmap, 0, sizeof(*bitmap));
+                       __set_bit(bit, bitmap->bitmap);
+                       radix_tree_iter_replace(root, &iter, slot, bitmap);
+               }
 
-       /* Each leaf node can handle nearly a thousand slots and the
-        * whole idea of ida is to have small memory foot print.
-        * Throw away extra resources one by one after each successful
-        * allocation.
-        */
-       if (ida->idr.id_free_cnt || ida->free_bitmap) {
-               struct idr_layer *p = get_from_free_list(&ida->idr);
-               if (p)
-                       kmem_cache_free(idr_layer_cache, p);
+               *id = new;
+               return 0;
        }
-
-       return 0;
 }
 EXPORT_SYMBOL(ida_get_new_above);
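
A hedged sketch of the retry loop this kerneldoc describes; "my_ida" and
"my_lock" are illustrative names:

	int id, err;

	do {
		if (!ida_pre_get(&my_ida, GFP_KERNEL))
			return -ENOMEM;
		spin_lock(&my_lock);
		err = ida_get_new_above(&my_ida, 1, &id);
		spin_unlock(&my_lock);
	} while (err == -EAGAIN);

	if (err)
		return err;	/* -ENOSPC once the ID space is exhausted */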
 
 /**
- * ida_remove - remove the given ID
- * @ida:       ida handle
- * @id:                ID to free
+ * ida_remove - Free the given ID
+ * @ida: ida handle
+ * @id: ID to free
+ *
+ * This function should not be called at the same time as ida_get_new_above().
  */
 void ida_remove(struct ida *ida, int id)
 {
-       struct idr_layer *p = ida->idr.top;
-       int shift = (ida->idr.layers - 1) * IDR_BITS;
-       int idr_id = id / IDA_BITMAP_BITS;
-       int offset = id % IDA_BITMAP_BITS;
-       int n;
+       unsigned long index = id / IDA_BITMAP_BITS;
+       unsigned offset = id % IDA_BITMAP_BITS;
        struct ida_bitmap *bitmap;
+       unsigned long *btmp;
+       struct radix_tree_iter iter;
+       void __rcu **slot;
 
-       if (idr_id > idr_max(ida->idr.layers))
+       slot = radix_tree_iter_lookup(&ida->ida_rt, &iter, index);
+       if (!slot)
                goto err;
 
-       /* clear full bits while looking up the leaf idr_layer */
-       while ((shift > 0) && p) {
-               n = (idr_id >> shift) & IDR_MASK;
-               __clear_bit(n, p->bitmap);
-               p = p->ary[n];
-               shift -= IDR_BITS;
+       bitmap = rcu_dereference_raw(*slot);
+       if (radix_tree_exception(bitmap)) {
+               btmp = (unsigned long *)slot;
+               offset += RADIX_TREE_EXCEPTIONAL_SHIFT;
+               if (offset >= BITS_PER_LONG)
+                       goto err;
+       } else {
+               btmp = bitmap->bitmap;
        }
-
-       if (p == NULL)
-               goto err;
-
-       n = idr_id & IDR_MASK;
-       __clear_bit(n, p->bitmap);
-
-       bitmap = (void *)p->ary[n];
-       if (!bitmap || !test_bit(offset, bitmap->bitmap))
+       if (!test_bit(offset, btmp))
                goto err;
 
-       /* update bitmap and remove it if empty */
-       __clear_bit(offset, bitmap->bitmap);
-       if (--bitmap->nr_busy == 0) {
-               __set_bit(n, p->bitmap);        /* to please idr_remove() */
-               idr_remove(&ida->idr, idr_id);
-               free_bitmap(ida, bitmap);
+       __clear_bit(offset, btmp);
+       radix_tree_iter_tag_set(&ida->ida_rt, &iter, IDR_FREE);
+       if (radix_tree_exception(bitmap)) {
+               if (rcu_dereference_raw(*slot) ==
+                                       (void *)RADIX_TREE_EXCEPTIONAL_ENTRY)
+                       radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
+       } else if (bitmap_empty(btmp, IDA_BITMAP_BITS)) {
+               kfree(bitmap);
+               radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
        }
-
        return;
-
  err:
        WARN(1, "ida_remove called for id=%d which is not allocated.\n", id);
 }
 EXPORT_SYMBOL(ida_remove);
 
 /**
- * ida_destroy - release all cached layers within an ida tree
- * @ida:               ida handle
+ * ida_destroy - Free the contents of an ida
+ * @ida: ida handle
+ *
+ * Calling this function releases all resources associated with an IDA.  When
+ * this call returns, the IDA is empty and can be reused or freed.  The caller
+ * should not allow ida_remove() or ida_get_new_above() to be called at the
+ * same time.
  */
 void ida_destroy(struct ida *ida)
 {
-       idr_destroy(&ida->idr);
-       kfree(ida->free_bitmap);
+       struct radix_tree_iter iter;
+       void __rcu **slot;
+
+       radix_tree_for_each_slot(slot, &ida->ida_rt, &iter, 0) {
+               struct ida_bitmap *bitmap = rcu_dereference_raw(*slot);
+               if (!radix_tree_exception(bitmap))
+                       kfree(bitmap);
+               radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
+       }
 }
 EXPORT_SYMBOL(ida_destroy);
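
A hedged teardown sketch, continuing the hypothetical "worker_ida" above:
since ida_destroy() empties the IDA itself, exit paths need not ida_remove()
each ID first:

	static void __exit my_module_exit(void)
	{
		ida_destroy(&worker_ida);
	}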
 
@@ -1141,18 +482,3 @@ void ida_simple_remove(struct ida *ida, unsigned int id)
        spin_unlock_irqrestore(&simple_ida_lock, flags);
 }
 EXPORT_SYMBOL(ida_simple_remove);
-
-/**
- * ida_init - initialize ida handle
- * @ida:       ida handle
- *
- * This function is use to set up the handle (@ida) that you will pass
- * to the rest of the functions.
- */
-void ida_init(struct ida *ida)
-{
-       memset(ida, 0, sizeof(struct ida));
-       idr_init(&ida->idr);
-
-}
-EXPORT_SYMBOL(ida_init);
index c8cebb1..9c21000 100644 (file)
@@ -176,13 +176,12 @@ static int percpu_counter_cpu_dead(unsigned int cpu)
        spin_lock_irq(&percpu_counters_lock);
        list_for_each_entry(fbc, &percpu_counters, list) {
                s32 *pcount;
-               unsigned long flags;
 
-               raw_spin_lock_irqsave(&fbc->lock, flags);
+               raw_spin_lock(&fbc->lock);
                pcount = per_cpu_ptr(fbc->counters, cpu);
                fbc->count += *pcount;
                *pcount = 0;
-               raw_spin_unlock_irqrestore(&fbc->lock, flags);
+               raw_spin_unlock(&fbc->lock);
        }
        spin_unlock_irq(&percpu_counters_lock);
 #endif
index 72fab49..5ed506d 100644 (file)
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/idr.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/radix-tree.h>
+#include <linux/kmemleak.h>
 #include <linux/percpu.h>
+#include <linux/preempt.h>             /* in_interrupt() */
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
 #include <linux/slab.h>
-#include <linux/kmemleak.h>
-#include <linux/cpu.h>
 #include <linux/string.h>
-#include <linux/bitops.h>
-#include <linux/rcupdate.h>
-#include <linux/preempt.h>             /* in_interrupt() */
 
 
 /* Number of nodes in fully populated tree of given height */
@@ -60,11 +61,28 @@ static struct kmem_cache *radix_tree_node_cachep;
 #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)
 
 /*
+ * The IDR does not have to be as high as the radix tree since it uses
+ * signed integers, not unsigned longs.
+ */
+#define IDR_INDEX_BITS         (8 /* CHAR_BIT */ * sizeof(int) - 1)
+#define IDR_MAX_PATH           (DIV_ROUND_UP(IDR_INDEX_BITS, \
+                                               RADIX_TREE_MAP_SHIFT))
+#define IDR_PRELOAD_SIZE       (IDR_MAX_PATH * 2 - 1)
+
+/*
+ * The IDA is even shorter since it uses a bitmap at the last level.
+ */
+#define IDA_INDEX_BITS         (8 * sizeof(int) - 1 - ilog2(IDA_BITMAP_BITS))
+#define IDA_MAX_PATH           (DIV_ROUND_UP(IDA_INDEX_BITS, \
+                                               RADIX_TREE_MAP_SHIFT))
+#define IDA_PRELOAD_SIZE       (IDA_MAX_PATH * 2 - 1)
+
+/*
  * Per-cpu pool of preloaded nodes
  */
 struct radix_tree_preload {
        unsigned nr;
-       /* nodes->private_data points to next preallocated node */
+       /* nodes->parent points to next preallocated node */
        struct radix_tree_node *nodes;
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
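
This per-cpu pool is filled and drained by the usual preload pattern, sketched
here under assumed names ("my_tree", "tree_lock"):

	if (radix_tree_preload(GFP_KERNEL))	/* may sleep; disables preemption on success */
		return -ENOMEM;
	spin_lock(&tree_lock);
	err = radix_tree_insert(&my_tree, index, item);
	spin_unlock(&tree_lock);
	radix_tree_preload_end();		/* re-enables preemption */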
@@ -83,35 +101,38 @@ static inline void *node_to_entry(void *ptr)
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
 /* Sibling slots point directly to another slot in the same node */
-static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+static inline
+bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
 {
-       void **ptr = node;
+       void __rcu **ptr = node;
        return (parent->slots <= ptr) &&
                        (ptr < parent->slots + RADIX_TREE_MAP_SIZE);
 }
 #else
-static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+static inline
+bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
 {
        return false;
 }
 #endif
 
-static inline unsigned long get_slot_offset(struct radix_tree_node *parent,
-                                                void **slot)
+static inline unsigned long
+get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
 {
        return slot - parent->slots;
 }
 
-static unsigned int radix_tree_descend(struct radix_tree_node *parent,
+static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
                        struct radix_tree_node **nodep, unsigned long index)
 {
        unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
-       void **entry = rcu_dereference_raw(parent->slots[offset]);
+       void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
        if (radix_tree_is_internal_node(entry)) {
                if (is_sibling_entry(parent, entry)) {
-                       void **sibentry = (void **) entry_to_node(entry);
+                       void __rcu **sibentry;
+                       sibentry = (void __rcu **) entry_to_node(entry);
                        offset = get_slot_offset(parent, sibentry);
                        entry = rcu_dereference_raw(*sibentry);
                }
@@ -122,7 +143,7 @@ static unsigned int radix_tree_descend(struct radix_tree_node *parent,
        return offset;
 }
 
-static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
+static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
 {
        return root->gfp_mask & __GFP_BITS_MASK;
 }
@@ -139,42 +160,48 @@ static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
        __clear_bit(offset, node->tags[tag]);
 }
 
-static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
                int offset)
 {
        return test_bit(offset, node->tags[tag]);
 }
 
-static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
+static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
 {
-       root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
+       root->gfp_mask |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
 }
 
 static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
 {
-       root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
+       root->gfp_mask &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
 }
 
 static inline void root_tag_clear_all(struct radix_tree_root *root)
 {
-       root->gfp_mask &= __GFP_BITS_MASK;
+       root->gfp_mask &= (1 << ROOT_TAG_SHIFT) - 1;
+}
+
+static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
+{
+       return (__force int)root->gfp_mask & (1 << (tag + ROOT_TAG_SHIFT));
 }
 
-static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
+static inline unsigned root_tags_get(const struct radix_tree_root *root)
 {
-       return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+       return (__force unsigned)root->gfp_mask >> ROOT_TAG_SHIFT;
 }
 
-static inline unsigned root_tags_get(struct radix_tree_root *root)
+static inline bool is_idr(const struct radix_tree_root *root)
 {
-       return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT;
+       return !!(root->gfp_mask & ROOT_IS_IDR);
 }
 
 /*
  * Returns 1 if any slot in the node has this tag set.
  * Otherwise returns 0.
  */
-static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+static inline int any_tag_set(const struct radix_tree_node *node,
+                                                       unsigned int tag)
 {
        unsigned idx;
        for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
@@ -184,6 +211,11 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
        return 0;
 }
 
+static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+       bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE);
+}
+
 /**
  * radix_tree_find_next_bit - find the next set bit in a memory region
  *
@@ -232,11 +264,18 @@ static inline unsigned long shift_maxindex(unsigned int shift)
        return (RADIX_TREE_MAP_SIZE << shift) - 1;
 }
 
-static inline unsigned long node_maxindex(struct radix_tree_node *node)
+static inline unsigned long node_maxindex(const struct radix_tree_node *node)
 {
        return shift_maxindex(node->shift);
 }
 
+static unsigned long next_index(unsigned long index,
+                               const struct radix_tree_node *node,
+                               unsigned long offset)
+{
+       return (index & ~node_maxindex(node)) + (offset << node->shift);
+}
+
 #ifndef __KERNEL__
 static void dump_node(struct radix_tree_node *node, unsigned long index)
 {
@@ -275,11 +314,59 @@ static void radix_tree_dump(struct radix_tree_root *root)
 {
        pr_debug("radix root: %p rnode %p tags %x\n",
                        root, root->rnode,
-                       root->gfp_mask >> __GFP_BITS_SHIFT);
+                       root->gfp_mask >> ROOT_TAG_SHIFT);
        if (!radix_tree_is_internal_node(root->rnode))
                return;
        dump_node(entry_to_node(root->rnode), 0);
 }
+
+static void dump_ida_node(void *entry, unsigned long index)
+{
+       unsigned long i;
+
+       if (!entry)
+               return;
+
+       if (radix_tree_is_internal_node(entry)) {
+               struct radix_tree_node *node = entry_to_node(entry);
+
+               pr_debug("ida node: %p offset %d indices %lu-%lu parent %p free %lx shift %d count %d\n",
+                       node, node->offset, index * IDA_BITMAP_BITS,
+                       ((index | node_maxindex(node)) + 1) *
+                               IDA_BITMAP_BITS - 1,
+                       node->parent, node->tags[0][0], node->shift,
+                       node->count);
+               for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
+                       dump_ida_node(node->slots[i],
+                                       index | (i << node->shift));
+       } else if (radix_tree_exceptional_entry(entry)) {
+               pr_debug("ida excp: %p offset %d indices %lu-%lu data %lx\n",
+                               entry, (int)(index & RADIX_TREE_MAP_MASK),
+                               index * IDA_BITMAP_BITS,
+                               index * IDA_BITMAP_BITS + BITS_PER_LONG -
+                                       RADIX_TREE_EXCEPTIONAL_SHIFT,
+                               (unsigned long)entry >>
+                                       RADIX_TREE_EXCEPTIONAL_SHIFT);
+       } else {
+               struct ida_bitmap *bitmap = entry;
+
+               pr_debug("ida btmp: %p offset %d indices %lu-%lu data", bitmap,
+                               (int)(index & RADIX_TREE_MAP_MASK),
+                               index * IDA_BITMAP_BITS,
+                               (index + 1) * IDA_BITMAP_BITS - 1);
+               for (i = 0; i < IDA_BITMAP_LONGS; i++)
+                       pr_cont(" %lx", bitmap->bitmap[i]);
+               pr_cont("\n");
+       }
+}
+
+static void ida_dump(struct ida *ida)
+{
+       struct radix_tree_root *root = &ida->ida_rt;
+       pr_debug("ida: %p node %p free %d\n", ida, root->rnode,
+                               root->gfp_mask >> ROOT_TAG_SHIFT);
+       dump_ida_node(root->rnode, 0);
+}
 #endif
 
 /*
@@ -287,13 +374,12 @@ static void radix_tree_dump(struct radix_tree_root *root)
  * that the caller has pinned this thread of control to the current CPU.
  */
 static struct radix_tree_node *
-radix_tree_node_alloc(struct radix_tree_root *root,
-                       struct radix_tree_node *parent,
+radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
+                       struct radix_tree_root *root,
                        unsigned int shift, unsigned int offset,
                        unsigned int count, unsigned int exceptional)
 {
        struct radix_tree_node *ret = NULL;
-       gfp_t gfp_mask = root_gfp_mask(root);
 
        /*
         * Preload code isn't irq safe and it doesn't make sense to use
@@ -321,8 +407,7 @@ radix_tree_node_alloc(struct radix_tree_root *root,
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr) {
                        ret = rtp->nodes;
-                       rtp->nodes = ret->private_data;
-                       ret->private_data = NULL;
+                       rtp->nodes = ret->parent;
                        rtp->nr--;
                }
                /*
@@ -336,11 +421,12 @@ radix_tree_node_alloc(struct radix_tree_root *root,
 out:
        BUG_ON(radix_tree_is_internal_node(ret));
        if (ret) {
-               ret->parent = parent;
                ret->shift = shift;
                ret->offset = offset;
                ret->count = count;
                ret->exceptional = exceptional;
+               ret->parent = parent;
+               ret->root = root;
        }
        return ret;
 }
@@ -399,7 +485,7 @@ static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
                preempt_disable();
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr < nr) {
-                       node->private_data = rtp->nodes;
+                       node->parent = rtp->nodes;
                        rtp->nodes = node;
                        rtp->nr++;
                } else {
@@ -510,7 +596,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
        return __radix_tree_preload(gfp_mask, nr_nodes);
 }
 
-static unsigned radix_tree_load_root(struct radix_tree_root *root,
+static unsigned radix_tree_load_root(const struct radix_tree_root *root,
                struct radix_tree_node **nodep, unsigned long *maxindex)
 {
        struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
@@ -530,10 +616,10 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root,
 /*
  *     Extend a radix tree so it can store key @index.
  */
-static int radix_tree_extend(struct radix_tree_root *root,
+static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
                                unsigned long index, unsigned int shift)
 {
-       struct radix_tree_node *slot;
+       void *entry;
        unsigned int maxshift;
        int tag;
 
@@ -542,32 +628,44 @@ static int radix_tree_extend(struct radix_tree_root *root,
        while (index > shift_maxindex(maxshift))
                maxshift += RADIX_TREE_MAP_SHIFT;
 
-       slot = root->rnode;
-       if (!slot)
+       entry = rcu_dereference_raw(root->rnode);
+       if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
                goto out;
 
        do {
-               struct radix_tree_node *node = radix_tree_node_alloc(root,
-                                                       NULL, shift, 0, 1, 0);
+               struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
+                                                       root, shift, 0, 1, 0);
                if (!node)
                        return -ENOMEM;
 
-               /* Propagate the aggregated tag info into the new root */
-               for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-                       if (root_tag_get(root, tag))
-                               tag_set(node, tag, 0);
+               if (is_idr(root)) {
+                       all_tag_set(node, IDR_FREE);
+                       if (!root_tag_get(root, IDR_FREE)) {
+                               tag_clear(node, IDR_FREE, 0);
+                               root_tag_set(root, IDR_FREE);
+                       }
+               } else {
+                       /* Propagate the aggregated tag info to the new child */
+                       for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+                               if (root_tag_get(root, tag))
+                                       tag_set(node, tag, 0);
+                       }
                }
 
                BUG_ON(shift > BITS_PER_LONG);
-               if (radix_tree_is_internal_node(slot)) {
-                       entry_to_node(slot)->parent = node;
-               } else if (radix_tree_exceptional_entry(slot)) {
+               if (radix_tree_is_internal_node(entry)) {
+                       entry_to_node(entry)->parent = node;
+               } else if (radix_tree_exceptional_entry(entry)) {
                        /* Moving an exceptional root->rnode to a node */
                        node->exceptional = 1;
                }
-               node->slots[0] = slot;
-               slot = node_to_entry(node);
-               rcu_assign_pointer(root->rnode, slot);
+               /*
+                * entry was already in the radix tree, so we do not need
+                * rcu_assign_pointer here
+                */
+               node->slots[0] = (void __rcu *)entry;
+               entry = node_to_entry(node);
+               rcu_assign_pointer(root->rnode, entry);
                shift += RADIX_TREE_MAP_SHIFT;
        } while (shift <= maxshift);
 out:
@@ -578,12 +676,14 @@ out:
  *     radix_tree_shrink    -    shrink radix tree to minimum height
  *     @root           radix tree root
  */
-static inline void radix_tree_shrink(struct radix_tree_root *root,
+static inline bool radix_tree_shrink(struct radix_tree_root *root,
                                     radix_tree_update_node_t update_node,
                                     void *private)
 {
+       bool shrunk = false;
+
        for (;;) {
-               struct radix_tree_node *node = root->rnode;
+               struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
                struct radix_tree_node *child;
 
                if (!radix_tree_is_internal_node(node))
@@ -597,7 +697,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root,
                 */
                if (node->count != 1)
                        break;
-               child = node->slots[0];
+               child = rcu_dereference_raw(node->slots[0]);
                if (!child)
                        break;
                if (!radix_tree_is_internal_node(child) && node->shift)
@@ -613,7 +713,9 @@ static inline void radix_tree_shrink(struct radix_tree_root *root,
                 * (node->slots[0]), it will be safe to dereference the new
                 * one (root->rnode) as far as dependent read barriers go.
                 */
-               root->rnode = child;
+               root->rnode = (void __rcu *)child;
+               if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
+                       root_tag_clear(root, IDR_FREE);
 
                /*
                 * We have a dilemma here. The node's slot[0] must not be
@@ -635,27 +737,34 @@ static inline void radix_tree_shrink(struct radix_tree_root *root,
                 */
                node->count = 0;
                if (!radix_tree_is_internal_node(child)) {
-                       node->slots[0] = RADIX_TREE_RETRY;
+                       node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
                        if (update_node)
                                update_node(node, private);
                }
 
                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
+               shrunk = true;
        }
+
+       return shrunk;
 }
 
-static void delete_node(struct radix_tree_root *root,
+static bool delete_node(struct radix_tree_root *root,
                        struct radix_tree_node *node,
                        radix_tree_update_node_t update_node, void *private)
 {
+       bool deleted = false;
+
        do {
                struct radix_tree_node *parent;
 
                if (node->count) {
-                       if (node == entry_to_node(root->rnode))
-                               radix_tree_shrink(root, update_node, private);
-                       return;
+                       if (node_to_entry(node) ==
+                                       rcu_dereference_raw(root->rnode))
+                               deleted |= radix_tree_shrink(root, update_node,
+                                                               private);
+                       return deleted;
                }
 
                parent = node->parent;
@@ -663,15 +772,23 @@ static void delete_node(struct radix_tree_root *root,
                        parent->slots[node->offset] = NULL;
                        parent->count--;
                } else {
-                       root_tag_clear_all(root);
+                       /*
+                        * Shouldn't the tags already have all been cleared
+                        * by the caller?
+                        */
+                       if (!is_idr(root))
+                               root_tag_clear_all(root);
                        root->rnode = NULL;
                }
 
                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
+               deleted = true;
 
                node = parent;
        } while (node);
+
+       return deleted;
 }
 
 /**
@@ -693,13 +810,14 @@ static void delete_node(struct radix_tree_root *root,
  */
 int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
                        unsigned order, struct radix_tree_node **nodep,
-                       void ***slotp)
+                       void __rcu ***slotp)
 {
        struct radix_tree_node *node = NULL, *child;
-       void **slot = (void **)&root->rnode;
+       void __rcu **slot = (void __rcu **)&root->rnode;
        unsigned long maxindex;
        unsigned int shift, offset = 0;
        unsigned long max = index | ((1UL << order) - 1);
+       gfp_t gfp = root_gfp_mask(root);
 
        shift = radix_tree_load_root(root, &child, &maxindex);
 
@@ -707,18 +825,18 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
        if (order > 0 && max == ((1UL << order) - 1))
                max++;
        if (max > maxindex) {
-               int error = radix_tree_extend(root, max, shift);
+               int error = radix_tree_extend(root, gfp, max, shift);
                if (error < 0)
                        return error;
                shift = error;
-               child = root->rnode;
+               child = rcu_dereference_raw(root->rnode);
        }
 
        while (shift > order) {
                shift -= RADIX_TREE_MAP_SHIFT;
                if (child == NULL) {
                        /* Have to add a child node.  */
-                       child = radix_tree_node_alloc(root, node, shift,
+                       child = radix_tree_node_alloc(gfp, node, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return -ENOMEM;
@@ -741,7 +859,6 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
        return 0;
 }
 
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
 /*
  * Free any nodes below this node.  The tree is presumed to not need
  * shrinking, and any user data in the tree is presumed to not need a
@@ -757,7 +874,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
        struct radix_tree_node *child = entry_to_node(node);
 
        for (;;) {
-               void *entry = child->slots[offset];
+               void *entry = rcu_dereference_raw(child->slots[offset]);
                if (radix_tree_is_internal_node(entry) &&
                                        !is_sibling_entry(child, entry)) {
                        child = entry_to_node(entry);
@@ -777,8 +894,9 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
        }
 }
 
-static inline int insert_entries(struct radix_tree_node *node, void **slot,
-                               void *item, unsigned order, bool replace)
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+static inline int insert_entries(struct radix_tree_node *node,
+               void __rcu **slot, void *item, unsigned order, bool replace)
 {
        struct radix_tree_node *child;
        unsigned i, n, tag, offset, tags = 0;
@@ -813,7 +931,7 @@ static inline int insert_entries(struct radix_tree_node *node, void **slot,
        }
 
        for (i = 0; i < n; i++) {
-               struct radix_tree_node *old = slot[i];
+               struct radix_tree_node *old = rcu_dereference_raw(slot[i]);
                if (i) {
                        rcu_assign_pointer(slot[i], child);
                        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
@@ -840,8 +958,8 @@ static inline int insert_entries(struct radix_tree_node *node, void **slot,
        return n;
 }
 #else
-static inline int insert_entries(struct radix_tree_node *node, void **slot,
-                               void *item, unsigned order, bool replace)
+static inline int insert_entries(struct radix_tree_node *node,
+               void __rcu **slot, void *item, unsigned order, bool replace)
 {
        if (*slot)
                return -EEXIST;
@@ -868,7 +986,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
                        unsigned order, void *item)
 {
        struct radix_tree_node *node;
-       void **slot;
+       void __rcu **slot;
        int error;
 
        BUG_ON(radix_tree_is_internal_node(item));
@@ -908,16 +1026,17 @@ EXPORT_SYMBOL(__radix_tree_insert);
  *     allocated and @root->rnode is used as a direct slot instead of
  *     pointing to a node, in which case *@nodep will be NULL.
  */
-void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
-                         struct radix_tree_node **nodep, void ***slotp)
+void *__radix_tree_lookup(const struct radix_tree_root *root,
+                         unsigned long index, struct radix_tree_node **nodep,
+                         void __rcu ***slotp)
 {
        struct radix_tree_node *node, *parent;
        unsigned long maxindex;
-       void **slot;
+       void __rcu **slot;
 
  restart:
        parent = NULL;
-       slot = (void **)&root->rnode;
+       slot = (void __rcu **)&root->rnode;
        radix_tree_load_root(root, &node, &maxindex);
        if (index > maxindex)
                return NULL;
@@ -952,9 +1071,10 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
  *     exclusive from other writers. Any dereference of the slot must be done
  *     using radix_tree_deref_slot.
  */
-void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
+                               unsigned long index)
 {
-       void **slot;
+       void __rcu **slot;
 
        if (!__radix_tree_lookup(root, index, NULL, &slot))
                return NULL;
@@ -974,75 +1094,76 @@ EXPORT_SYMBOL(radix_tree_lookup_slot);
  *     them safely). No RCU barriers are required to access or modify the
  *     returned item, however.
  */
-void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
 {
        return __radix_tree_lookup(root, index, NULL, NULL);
 }
 EXPORT_SYMBOL(radix_tree_lookup);
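
A hedged lookup sketch following the locking rules above; pinning the object
before rcu_read_unlock() is the caller's job, and "session_get" is an assumed
helper, not a real API:

	rcu_read_lock();
	s = radix_tree_lookup(&my_tree, index);
	if (s)
		session_get(s);	/* take a reference while still under RCU */
	rcu_read_unlock();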
 
-static inline int slot_count(struct radix_tree_node *node,
-                                               void **slot)
+static inline void replace_sibling_entries(struct radix_tree_node *node,
+                               void __rcu **slot, int count, int exceptional)
 {
-       int n = 1;
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
        void *ptr = node_to_entry(slot);
-       unsigned offset = get_slot_offset(node, slot);
-       int i;
+       unsigned offset = get_slot_offset(node, slot) + 1;
 
-       for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
-               if (node->slots[offset + i] != ptr)
+       while (offset < RADIX_TREE_MAP_SIZE) {
+               if (rcu_dereference_raw(node->slots[offset]) != ptr)
                        break;
-               n++;
+               if (count < 0) {
+                       node->slots[offset] = NULL;
+                       node->count--;
+               }
+               node->exceptional += exceptional;
+               offset++;
        }
 #endif
-       return n;
 }
 
-static void replace_slot(struct radix_tree_root *root,
-                        struct radix_tree_node *node,
-                        void **slot, void *item,
-                        bool warn_typeswitch)
+static void replace_slot(void __rcu **slot, void *item,
+               struct radix_tree_node *node, int count, int exceptional)
 {
-       void *old = rcu_dereference_raw(*slot);
-       int count, exceptional;
-
-       WARN_ON_ONCE(radix_tree_is_internal_node(item));
-
-       count = !!item - !!old;
-       exceptional = !!radix_tree_exceptional_entry(item) -
-                     !!radix_tree_exceptional_entry(old);
-
-       WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
+       if (WARN_ON_ONCE(radix_tree_is_internal_node(item)))
+               return;
 
-       if (node) {
+       if (node && (count || exceptional)) {
                node->count += count;
-               if (exceptional) {
-                       exceptional *= slot_count(node, slot);
-                       node->exceptional += exceptional;
-               }
+               node->exceptional += exceptional;
+               replace_sibling_entries(node, slot, count, exceptional);
        }
 
        rcu_assign_pointer(*slot, item);
 }
 
-static inline void delete_sibling_entries(struct radix_tree_node *node,
-                                               void **slot)
+static bool node_tag_get(const struct radix_tree_root *root,
+                               const struct radix_tree_node *node,
+                               unsigned int tag, unsigned int offset)
 {
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-       bool exceptional = radix_tree_exceptional_entry(*slot);
-       void *ptr = node_to_entry(slot);
-       unsigned offset = get_slot_offset(node, slot);
-       int i;
+       if (node)
+               return tag_get(node, tag, offset);
+       return root_tag_get(root, tag);
+}
 
-       for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
-               if (node->slots[offset + i] != ptr)
-                       break;
-               node->slots[offset + i] = NULL;
-               node->count--;
-               if (exceptional)
-                       node->exceptional--;
+/*
+ * IDR users want to be able to store NULL in the tree, so if the slot isn't
+ * free, don't adjust the count, even if it's transitioning between NULL and
+ * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
+ * have empty bits, but it only stores NULL in slots when they're being
+ * deleted.
+ */
+static int calculate_count(struct radix_tree_root *root,
+                               struct radix_tree_node *node, void __rcu **slot,
+                               void *item, void *old)
+{
+       if (is_idr(root)) {
+               unsigned offset = get_slot_offset(node, slot);
+               bool free = node_tag_get(root, node, IDR_FREE, offset);
+               if (!free)
+                       return 0;
+               if (!old)
+                       return 1;
        }
-#endif
+       return !!item - !!old;
 }
 
 /**
@@ -1059,18 +1180,22 @@ static inline void delete_sibling_entries(struct radix_tree_node *node,
  */
 void __radix_tree_replace(struct radix_tree_root *root,
                          struct radix_tree_node *node,
-                         void **slot, void *item,
+                         void __rcu **slot, void *item,
                          radix_tree_update_node_t update_node, void *private)
 {
-       if (!item)
-               delete_sibling_entries(node, slot);
+       void *old = rcu_dereference_raw(*slot);
+       int exceptional = !!radix_tree_exceptional_entry(item) -
+                               !!radix_tree_exceptional_entry(old);
+       int count = calculate_count(root, node, slot, item, old);
+
        /*
         * This function supports replacing exceptional entries and
         * deleting entries, but that needs accounting against the
         * node unless the slot is root->rnode.
         */
-       replace_slot(root, node, slot, item,
-                    !node && slot != (void **)&root->rnode);
+       WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->rnode) &&
+                       (count || exceptional));
+       replace_slot(slot, item, node, count, exceptional);
 
        if (!node)
                return;
@@ -1098,9 +1223,9 @@ void __radix_tree_replace(struct radix_tree_root *root,
  * radix_tree_iter_replace().
  */
 void radix_tree_replace_slot(struct radix_tree_root *root,
-                            void **slot, void *item)
+                            void __rcu **slot, void *item)
 {
-       replace_slot(root, NULL, slot, item, true);
+       __radix_tree_replace(root, NULL, slot, item, NULL, NULL);
 }
 EXPORT_SYMBOL(radix_tree_replace_slot);
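
A hedged sketch combining radix_tree_lookup_slot() with the replacement
helper, serialized by an assumed "tree_lock":

	void __rcu **slot;

	spin_lock(&tree_lock);
	slot = radix_tree_lookup_slot(&my_tree, index);
	if (slot)
		radix_tree_replace_slot(&my_tree, slot, new_item);
	spin_unlock(&tree_lock);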
 
@@ -1114,7 +1239,8 @@ EXPORT_SYMBOL(radix_tree_replace_slot);
  * Caller must hold tree write locked across split and replacement.
  */
 void radix_tree_iter_replace(struct radix_tree_root *root,
-               const struct radix_tree_iter *iter, void **slot, void *item)
+                               const struct radix_tree_iter *iter,
+                               void __rcu **slot, void *item)
 {
        __radix_tree_replace(root, iter->node, slot, item, NULL, NULL);
 }
@@ -1138,7 +1264,7 @@ int radix_tree_join(struct radix_tree_root *root, unsigned long index,
                        unsigned order, void *item)
 {
        struct radix_tree_node *node;
-       void **slot;
+       void __rcu **slot;
        int error;
 
        BUG_ON(radix_tree_is_internal_node(item));
@@ -1173,9 +1299,10 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
                                unsigned order)
 {
        struct radix_tree_node *parent, *node, *child;
-       void **slot;
+       void __rcu **slot;
        unsigned int offset, end;
        unsigned n, tag, tags = 0;
+       gfp_t gfp = root_gfp_mask(root);
 
        if (!__radix_tree_lookup(root, index, &parent, &slot))
                return -ENOENT;
@@ -1189,7 +1316,8 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
                        tags |= 1 << tag;
 
        for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) {
-               if (!is_sibling_entry(parent, parent->slots[end]))
+               if (!is_sibling_entry(parent,
+                               rcu_dereference_raw(parent->slots[end])))
                        break;
                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
                        if (tags & (1 << tag))
@@ -1213,14 +1341,15 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 
        for (;;) {
                if (node->shift > order) {
-                       child = radix_tree_node_alloc(root, node,
+                       child = radix_tree_node_alloc(gfp, node, root,
                                        node->shift - RADIX_TREE_MAP_SHIFT,
                                        offset, 0, 0);
                        if (!child)
                                goto nomem;
                        if (node != parent) {
                                node->count++;
-                               node->slots[offset] = node_to_entry(child);
+                               rcu_assign_pointer(node->slots[offset],
+                                                       node_to_entry(child));
                                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
                                        if (tags & (1 << tag))
                                                tag_set(node, tag, offset);
@@ -1262,6 +1391,22 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 }
 #endif
 
+static void node_tag_set(struct radix_tree_root *root,
+                               struct radix_tree_node *node,
+                               unsigned int tag, unsigned int offset)
+{
+       while (node) {
+               if (tag_get(node, tag, offset))
+                       return;
+               tag_set(node, tag, offset);
+               offset = node->offset;
+               node = node->parent;
+       }
+
+       if (!root_tag_get(root, tag))
+               root_tag_set(root, tag);
+}
+
 /**
  *     radix_tree_tag_set - set a tag on a radix tree node
  *     @root:          radix tree root
@@ -1303,6 +1448,18 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
 
+/**
+ * radix_tree_iter_tag_set - set a tag on the current iterator entry
+ * @root:      radix tree root
+ * @iter:      iterator state
+ * @tag:       tag to set
+ */
+void radix_tree_iter_tag_set(struct radix_tree_root *root,
+                       const struct radix_tree_iter *iter, unsigned int tag)
+{
+       node_tag_set(root, iter->node, tag, iter_offset(iter));
+}
+
 static void node_tag_clear(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
@@ -1323,34 +1480,6 @@ static void node_tag_clear(struct radix_tree_root *root,
                root_tag_clear(root, tag);
 }
 
-static void node_tag_set(struct radix_tree_root *root,
-                               struct radix_tree_node *node,
-                               unsigned int tag, unsigned int offset)
-{
-       while (node) {
-               if (tag_get(node, tag, offset))
-                       return;
-               tag_set(node, tag, offset);
-               offset = node->offset;
-               node = node->parent;
-       }
-
-       if (!root_tag_get(root, tag))
-               root_tag_set(root, tag);
-}
-
-/**
- * radix_tree_iter_tag_set - set a tag on the current iterator entry
- * @root:      radix tree root
- * @iter:      iterator state
- * @tag:       tag to set
- */
-void radix_tree_iter_tag_set(struct radix_tree_root *root,
-                       const struct radix_tree_iter *iter, unsigned int tag)
-{
-       node_tag_set(root, iter->node, tag, iter_offset(iter));
-}
-
 /**
  *     radix_tree_tag_clear - clear a tag on a radix tree node
  *     @root:          radix tree root
@@ -1391,6 +1520,18 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 EXPORT_SYMBOL(radix_tree_tag_clear);
 
 /**
+ * radix_tree_iter_tag_clear - clear a tag on the current iterator entry
+ * @root: radix tree root
+ * @iter: iterator state
+ * @tag: tag to clear
+ */
+void radix_tree_iter_tag_clear(struct radix_tree_root *root,
+                       const struct radix_tree_iter *iter, unsigned int tag)
+{
+       node_tag_clear(root, iter->node, tag, iter_offset(iter));
+}
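
As a usage sketch (my_tree, MY_TAG and MY_OTHER_TAG are illustrative names,
not from this patch), the iterator variants let a walk retag entries without
paying for a second lookup per index:

        struct radix_tree_iter iter;
        void __rcu **slot;

        radix_tree_for_each_tagged(slot, &my_tree, &iter, 0, MY_TAG) {
                radix_tree_iter_tag_clear(&my_tree, &iter, MY_TAG);
                radix_tree_iter_tag_set(&my_tree, &iter, MY_OTHER_TAG);
        }
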
+
+/**
  * radix_tree_tag_get - get a tag on a radix tree node
  * @root:              radix tree root
  * @index:             index key
@@ -1405,7 +1546,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
  * the RCU lock is held, unless tag modification and node deletion are excluded
  * from concurrency.
  */
-int radix_tree_tag_get(struct radix_tree_root *root,
+int radix_tree_tag_get(const struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
 {
        struct radix_tree_node *node, *parent;
@@ -1417,8 +1558,6 @@ int radix_tree_tag_get(struct radix_tree_root *root,
        radix_tree_load_root(root, &node, &maxindex);
        if (index > maxindex)
                return 0;
-       if (node == NULL)
-               return 0;
 
        while (radix_tree_is_internal_node(node)) {
                unsigned offset;
@@ -1426,8 +1565,6 @@ int radix_tree_tag_get(struct radix_tree_root *root,
                parent = entry_to_node(node);
                offset = radix_tree_descend(parent, &node, index);
 
-               if (!node)
-                       return 0;
                if (!tag_get(parent, tag, offset))
                        return 0;
                if (node == RADIX_TREE_RETRY)
@@ -1454,6 +1591,11 @@ static void set_iter_tags(struct radix_tree_iter *iter,
        unsigned tag_long = offset / BITS_PER_LONG;
        unsigned tag_bit  = offset % BITS_PER_LONG;
 
+       if (!node) {
+               iter->tags = 1;
+               return;
+       }
+
        iter->tags = node->tags[tag][tag_long] >> tag_bit;
 
        /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
@@ -1468,8 +1610,8 @@ static void set_iter_tags(struct radix_tree_iter *iter,
 }
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
-static void **skip_siblings(struct radix_tree_node **nodep,
-                       void **slot, struct radix_tree_iter *iter)
+static void __rcu **skip_siblings(struct radix_tree_node **nodep,
+                       void __rcu **slot, struct radix_tree_iter *iter)
 {
        void *sib = node_to_entry(slot - 1);
 
@@ -1486,8 +1628,8 @@ static void **skip_siblings(struct radix_tree_node **nodep,
        return NULL;
 }
 
-void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
-                                       unsigned flags)
+void __rcu **__radix_tree_next_slot(void __rcu **slot,
+                               struct radix_tree_iter *iter, unsigned flags)
 {
        unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
        struct radix_tree_node *node = rcu_dereference_raw(*slot);
@@ -1540,20 +1682,20 @@ void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
 }
 EXPORT_SYMBOL(__radix_tree_next_slot);
 #else
-static void **skip_siblings(struct radix_tree_node **nodep,
-                       void **slot, struct radix_tree_iter *iter)
+static void __rcu **skip_siblings(struct radix_tree_node **nodep,
+                       void __rcu **slot, struct radix_tree_iter *iter)
 {
        return slot;
 }
 #endif
 
-void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter)
+void __rcu **radix_tree_iter_resume(void __rcu **slot,
+                                       struct radix_tree_iter *iter)
 {
        struct radix_tree_node *node;
 
        slot++;
        iter->index = __radix_tree_iter_add(iter, 1);
        node = rcu_dereference_raw(*slot);
        skip_siblings(&node, slot, iter);
        iter->next_index = iter->index;
        iter->tags = 0;
@@ -1569,7 +1711,7 @@ EXPORT_SYMBOL(radix_tree_iter_resume);
  * @flags:     RADIX_TREE_ITER_* flags and tag index
  * Returns:    pointer to chunk first slot, or NULL if iteration is over
  */
-void **radix_tree_next_chunk(struct radix_tree_root *root,
+void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
                             struct radix_tree_iter *iter, unsigned flags)
 {
        unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
@@ -1606,7 +1748,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
                iter->tags = 1;
                iter->node = NULL;
                __set_iter_shift(iter, 0);
-               return (void **)&root->rnode;
+               return (void __rcu **)&root->rnode;
        }
 
        do {
@@ -1624,7 +1766,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
                                                offset + 1);
                        else
                                while (++offset < RADIX_TREE_MAP_SIZE) {
-                                       void *slot = node->slots[offset];
+                                       void *slot = rcu_dereference_raw(
+                                                       node->slots[offset]);
                                        if (is_sibling_entry(node, slot))
                                                continue;
                                        if (slot)
@@ -1680,11 +1823,11 @@ EXPORT_SYMBOL(radix_tree_next_chunk);
  *     stored in 'results'.
  */
 unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
                        unsigned long first_index, unsigned int max_items)
 {
        struct radix_tree_iter iter;
-       void **slot;
+       void __rcu **slot;
        unsigned int ret = 0;
 
        if (unlikely(!max_items))
@@ -1725,12 +1868,12 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
  *     protection, radix_tree_deref_slot may fail requiring a retry.
  */
 unsigned int
-radix_tree_gang_lookup_slot(struct radix_tree_root *root,
-                       void ***results, unsigned long *indices,
+radix_tree_gang_lookup_slot(const struct radix_tree_root *root,
+                       void __rcu ***results, unsigned long *indices,
                        unsigned long first_index, unsigned int max_items)
 {
        struct radix_tree_iter iter;
-       void **slot;
+       void __rcu **slot;
        unsigned int ret = 0;
 
        if (unlikely(!max_items))
@@ -1762,12 +1905,12 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
  *     returns the number of items which were placed at *@results.
  */
 unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
                unsigned long first_index, unsigned int max_items,
                unsigned int tag)
 {
        struct radix_tree_iter iter;
-       void **slot;
+       void __rcu **slot;
        unsigned int ret = 0;
 
        if (unlikely(!max_items))
@@ -1803,12 +1946,12 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
  *     returns the number of slots which were placed at *@results.
  */
 unsigned int
-radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
-               unsigned long first_index, unsigned int max_items,
-               unsigned int tag)
+radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
+               void __rcu ***results, unsigned long first_index,
+               unsigned int max_items, unsigned int tag)
 {
        struct radix_tree_iter iter;
-       void **slot;
+       void __rcu **slot;
        unsigned int ret = 0;
 
        if (unlikely(!max_items))
@@ -1843,59 +1986,83 @@ void __radix_tree_delete_node(struct radix_tree_root *root,
        delete_node(root, node, update_node, private);
 }
 
+static bool __radix_tree_delete(struct radix_tree_root *root,
+                               struct radix_tree_node *node, void __rcu **slot)
+{
+       void *old = rcu_dereference_raw(*slot);
+       int exceptional = radix_tree_exceptional_entry(old) ? -1 : 0;
+       unsigned offset = get_slot_offset(node, slot);
+       int tag;
+
+       if (is_idr(root))
+               node_tag_set(root, node, IDR_FREE, offset);
+       else
+               for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+                       node_tag_clear(root, node, tag, offset);
+
+       replace_slot(slot, NULL, node, -1, exceptional);
+       return node && delete_node(root, node, NULL, NULL);
+}
+
 /**
- *     radix_tree_delete_item    -    delete an item from a radix tree
- *     @root:          radix tree root
- *     @index:         index key
- *     @item:          expected item
+ * radix_tree_iter_delete - delete the entry at this iterator position
+ * @root: radix tree root
+ * @iter: iterator state
+ * @slot: pointer to slot
  *
- *     Remove @item at @index from the radix tree rooted at @root.
+ * Delete the entry at the position currently pointed to by the iterator.
+ * This may result in the current node being freed; if it is, the iterator
+ * is advanced so that it will not reference the freed memory.  This
+ * function may be called without any locking if there are no other threads
+ * which can access this tree.
+ */
+void radix_tree_iter_delete(struct radix_tree_root *root,
+                               struct radix_tree_iter *iter, void __rcu **slot)
+{
+       if (__radix_tree_delete(root, iter->node, slot))
+               iter->index = iter->next_index;
+}
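
An illustrative use (my_tree is a made-up name), emptying a tree that no
other thread can access:

        struct radix_tree_iter iter;
        void __rcu **slot;

        radix_tree_for_each_slot(slot, &my_tree, &iter, 0)
                radix_tree_iter_delete(&my_tree, &iter, slot);
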
+
+/**
+ * radix_tree_delete_item - delete an item from a radix tree
+ * @root: radix tree root
+ * @index: index key
+ * @item: expected item
  *
- *     Returns the address of the deleted item, or NULL if it was not present
- *     or the entry at the given @index was not @item.
+ * Remove @item at @index from the radix tree rooted at @root.
+ *
+ * Return: the deleted entry, or %NULL if it was not present
+ * or the entry at the given @index was not @item.
  */
 void *radix_tree_delete_item(struct radix_tree_root *root,
                             unsigned long index, void *item)
 {
-       struct radix_tree_node *node;
-       unsigned int offset;
-       void **slot;
+       struct radix_tree_node *node = NULL;
+       void __rcu **slot;
        void *entry;
-       int tag;
 
        entry = __radix_tree_lookup(root, index, &node, &slot);
-       if (!entry)
+       if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
+                                               get_slot_offset(node, slot))))
                return NULL;
 
        if (item && entry != item)
                return NULL;
 
-       if (!node) {
-               root_tag_clear_all(root);
-               root->rnode = NULL;
-               return entry;
-       }
-
-       offset = get_slot_offset(node, slot);
-
-       /* Clear all tags associated with the item to be deleted.  */
-       for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
-               node_tag_clear(root, node, tag, offset);
-
-       __radix_tree_replace(root, node, slot, NULL, NULL, NULL);
+       __radix_tree_delete(root, node, slot);
 
        return entry;
 }
 EXPORT_SYMBOL(radix_tree_delete_item);
 
 /**
- *     radix_tree_delete    -    delete an item from a radix tree
- *     @root:          radix tree root
- *     @index:         index key
+ * radix_tree_delete - delete an entry from a radix tree
+ * @root: radix tree root
+ * @index: index key
  *
- *     Remove the item at @index from the radix tree rooted at @root.
+ * Remove the entry at @index from the radix tree rooted at @root.
  *
- *     Returns the address of the deleted item, or NULL if it was not present.
+ * Return: The deleted entry, or %NULL if it was not present.
  */
 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
@@ -1905,15 +2072,14 @@ EXPORT_SYMBOL(radix_tree_delete);
 
 void radix_tree_clear_tags(struct radix_tree_root *root,
                           struct radix_tree_node *node,
-                          void **slot)
+                          void __rcu **slot)
 {
        if (node) {
                unsigned int tag, offset = get_slot_offset(node, slot);
                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
                        node_tag_clear(root, node, tag, offset);
        } else {
-               /* Clear root node tags */
-               root->gfp_mask &= __GFP_BITS_MASK;
+               root_tag_clear_all(root);
        }
 }
 
@@ -1922,12 +2088,147 @@ void radix_tree_clear_tags(struct radix_tree_root *root,
  *     @root:          radix tree root
  *     @tag:           tag to test
  */
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
+int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
 {
        return root_tag_get(root, tag);
 }
 EXPORT_SYMBOL(radix_tree_tagged);
 
+/**
+ * idr_preload - preload for idr_alloc()
+ * @gfp_mask: allocation mask to use for preloading
+ *
+ * Preallocate memory to use for the next call to idr_alloc().  This function
+ * returns with preemption disabled.  It will be enabled by idr_preload_end().
+ */
+void idr_preload(gfp_t gfp_mask)
+{
+       __radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE);
+}
+EXPORT_SYMBOL(idr_preload);
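
A minimal caller-side sketch (my_idr, my_lock and ptr are illustrative): the
preload stocks the per-cpu node cache so the GFP_NOWAIT allocation under the
spinlock does not need to sleep.

        int id;

        idr_preload(GFP_KERNEL);
        spin_lock(&my_lock);
        id = idr_alloc(&my_idr, ptr, 0, 0, GFP_NOWAIT);
        spin_unlock(&my_lock);
        idr_preload_end();
        if (id < 0)
                return id;      /* -ENOMEM or -ENOSPC */
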
+
+/**
+ * ida_pre_get - reserve resources for ida allocation
+ * @ida: ida handle
+ * @gfp: memory allocation flags
+ *
+ * This function should be called before calling ida_get_new_above().  If it
+ * is unable to allocate memory, it will return %0.  On success, it returns %1.
+ */
+int ida_pre_get(struct ida *ida, gfp_t gfp)
+{
+       __radix_tree_preload(gfp, IDA_PRELOAD_SIZE);
+       /*
+        * The IDA API has no preload_end() equivalent.  Instead,
+        * ida_get_new() can return -EAGAIN, prompting the caller
+        * to return to the ida_pre_get() step.
+        */
+       preempt_enable();
+
+       if (!this_cpu_read(ida_bitmap)) {
+               struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
+               if (!bitmap)
+                       return 0;
+               bitmap = this_cpu_cmpxchg(ida_bitmap, NULL, bitmap);
+               kfree(bitmap);
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(ida_pre_get);
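
The retry loop this comment describes, as a hedged caller sketch (my_ida and
my_lock are illustrative):

        int id, err;

again:
        if (!ida_pre_get(&my_ida, GFP_KERNEL))
                return -ENOMEM;
        spin_lock(&my_lock);
        err = ida_get_new(&my_ida, &id);
        spin_unlock(&my_lock);
        if (err == -EAGAIN)
                goto again;
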
+
+void __rcu **idr_get_free(struct radix_tree_root *root,
+                       struct radix_tree_iter *iter, gfp_t gfp, int end)
+{
+       struct radix_tree_node *node = NULL, *child;
+       void __rcu **slot = (void __rcu **)&root->rnode;
+       unsigned long maxindex, start = iter->next_index;
+       unsigned long max = end > 0 ? end - 1 : INT_MAX;
+       unsigned int shift, offset = 0;
+
+ grow:
+       shift = radix_tree_load_root(root, &child, &maxindex);
+       if (!radix_tree_tagged(root, IDR_FREE))
+               start = max(start, maxindex + 1);
+       if (start > max)
+               return ERR_PTR(-ENOSPC);
+
+       if (start > maxindex) {
+               int error = radix_tree_extend(root, gfp, start, shift);
+               if (error < 0)
+                       return ERR_PTR(error);
+               shift = error;
+               child = rcu_dereference_raw(root->rnode);
+       }
+
+       while (shift) {
+               shift -= RADIX_TREE_MAP_SHIFT;
+               if (child == NULL) {
+                       /* Have to add a child node.  */
+                       child = radix_tree_node_alloc(gfp, node, root, shift,
+                                                       offset, 0, 0);
+                       if (!child)
+                               return ERR_PTR(-ENOMEM);
+                       all_tag_set(child, IDR_FREE);
+                       rcu_assign_pointer(*slot, node_to_entry(child));
+                       if (node)
+                               node->count++;
+               } else if (!radix_tree_is_internal_node(child))
+                       break;
+
+               node = entry_to_node(child);
+               offset = radix_tree_descend(node, &child, start);
+               if (!tag_get(node, IDR_FREE, offset)) {
+                       offset = radix_tree_find_next_bit(node, IDR_FREE,
+                                                       offset + 1);
+                       start = next_index(start, node, offset);
+                       if (start > max)
+                               return ERR_PTR(-ENOSPC);
+                       while (offset == RADIX_TREE_MAP_SIZE) {
+                               offset = node->offset + 1;
+                               node = node->parent;
+                               if (!node)
+                                       goto grow;
+                               shift = node->shift;
+                       }
+                       child = rcu_dereference_raw(node->slots[offset]);
+               }
+               slot = &node->slots[offset];
+       }
+
+       iter->index = start;
+       if (node)
+               iter->next_index = 1 + min(max, (start | node_maxindex(node)));
+       else
+               iter->next_index = 1;
+       iter->node = node;
+       __set_iter_shift(iter, shift);
+       set_iter_tags(iter, node, offset, IDR_FREE);
+
+       return slot;
+}
+
+/**
+ * idr_destroy - release all internal memory from an IDR
+ * @idr: idr handle
+ *
+ * After this function is called, the IDR is empty, and may be reused or
+ * the data structure containing it may be freed.
+ *
+ * A typical clean-up sequence for objects stored in an idr tree will use
+ * idr_for_each() to free all objects, if necessary, then idr_destroy() to
+ * free the memory used to keep track of those objects.
+ */
+void idr_destroy(struct idr *idr)
+{
+       struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.rnode);
+       if (radix_tree_is_internal_node(node))
+               radix_tree_free_nodes(node);
+       idr->idr_rt.rnode = NULL;
+       root_tag_set(&idr->idr_rt, IDR_FREE);
+}
+EXPORT_SYMBOL(idr_destroy);
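
The typical clean-up sequence mentioned above, sketched with an illustrative
free_entry() callback:

        static int free_entry(int id, void *p, void *data)
        {
                kfree(p);
                return 0;
        }

        /* on teardown: */
        idr_for_each(&my_idr, free_entry, NULL);
        idr_destroy(&my_idr);
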
+
 static void
 radix_tree_node_ctor(void *arg)
 {
@@ -1971,10 +2272,12 @@ static int radix_tree_cpu_dead(unsigned int cpu)
        rtp = &per_cpu(radix_tree_preloads, cpu);
        while (rtp->nr) {
                node = rtp->nodes;
-               rtp->nodes = node->private_data;
+               rtp->nodes = node->parent;
                kmem_cache_free(radix_tree_node_cachep, node);
                rtp->nr--;
        }
+       kfree(per_cpu(ida_bitmap, cpu));
+       per_cpu(ida_bitmap, cpu) = NULL;
        return 0;
 }
 
diff --git a/lib/refcount.c b/lib/refcount.c
new file mode 100644 (file)
index 0000000..1d33366
--- /dev/null
@@ -0,0 +1,267 @@
+/*
+ * Variant of atomic_t specialized for reference counts.
+ *
+ * The interface matches the atomic_t interface (to aid in porting) but only
+ * provides the few functions one should use for reference counting.
+ *
+ * It differs in that the counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free issues.
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object we're increasing the
+ * reference count on will provide the ordering. For locked data structures,
+ * it's the lock acquire; for RCU/lockless data structures, it's the
+ * dependent load.
+ *
+ * Do note that inc_not_zero() provides a control dependency which will order
+ * future stores against the increment; this ensures we'll never modify the
+ * object if we did not in fact acquire a reference.
+ *
+ * The decrements will provide release order, such that all the prior loads and
+ * stores will be issued before; it also provides a control dependency, which
+ * will order us against the subsequent free().
+ *
+ * The control dependency is against the load of the cmpxchg (ll/sc) that
+ * succeeded. This means the stores aren't fully ordered, but this is fine
+ * because the 1->0 transition indicates no concurrency.
+ *
+ * Note that the allocator is responsible for ordering things between free()
+ * and alloc().
+ *
+ */
+
+#include <linux/refcount.h>
+#include <linux/bug.h>
+
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               if (!val)
+                       return false;
+
+               if (unlikely(val == UINT_MAX))
+                       return true;
+
+               new = val + i;
+               if (new < val)
+                       new = UINT_MAX;
+               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_add_not_zero);
+
+void refcount_add(unsigned int i, refcount_t *r)
+{
+       WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_add);
+
+/*
+ * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ */
+bool refcount_inc_not_zero(refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               new = val + 1;
+
+               if (!val)
+                       return false;
+
+               if (unlikely(!new))
+                       return true;
+
+               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_inc_not_zero);
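
For illustration (obj, table, key and find_object() are hypothetical, not
part of this file), the lookup-side pattern these semantics are built for:

        rcu_read_lock();
        obj = find_object(&table, key);
        /* Only take a reference if the count has not already hit zero. */
        if (obj && !refcount_inc_not_zero(&obj->ref))
                obj = NULL;
        rcu_read_unlock();

        /* On the put side, the 1->0 transition elects the thread that frees. */
        if (obj && refcount_dec_and_test(&obj->ref))
                kfree_rcu(obj, rcu);
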
+
+/*
+ * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller already has a
+ * reference on the object, will WARN when this is not so.
+ */
+void refcount_inc(refcount_t *r)
+{
+       WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_inc);
+
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               if (unlikely(val == UINT_MAX))
+                       return false;
+
+               new = val - i;
+               if (new > val) {
+                       WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+                       return false;
+               }
+
+               old = atomic_cmpxchg_release(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       return !new;
+}
+EXPORT_SYMBOL_GPL(refcount_sub_and_test);
+
+/*
+ * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_test(refcount_t *r)
+{
+       return refcount_sub_and_test(1, r);
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_test);
+
+/*
+ * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
+ * when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before.
+ */
+
+void refcount_dec(refcount_t *r)
+{
+       WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_dec);
+
+/*
+ * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
+ * success thereof.
+ *
+ * Like all decrement operations, it provides release memory order and provides
+ * a control dependency.
+ *
+ * It can be used like a try-delete operator; this explicit case is provided
+ * rather than a generic cmpxchg, because the latter would allow implementing
+ * unsafe operations.
+ */
+bool refcount_dec_if_one(refcount_t *r)
+{
+       return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_if_one);
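
A hedged sketch of the try-delete use (obj and its fields are illustrative;
assumes the containing list is otherwise synchronized): tear the object down
only if we hold the last reference.

        if (refcount_dec_if_one(&obj->ref)) {
                list_del(&obj->node);
                kfree(obj);
        }
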
+
+/*
+ * No atomic_t counterpart, it decrements unless the value is 1, in which case
+ * it will return false.
+ *
+ * Was often done like: atomic_add_unless(&var, -1, 1)
+ */
+bool refcount_dec_not_one(refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               if (unlikely(val == UINT_MAX))
+                       return true;
+
+               if (val == 1)
+                       return false;
+
+               new = val - 1;
+               if (new > val) {
+                       WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+                       return true;
+               }
+
+               old = atomic_cmpxchg_release(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_not_one);
+
+/*
+ * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
+ * to decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
+{
+       if (refcount_dec_not_one(r))
+               return false;
+
+       mutex_lock(lock);
+       if (!refcount_dec_and_test(r)) {
+               mutex_unlock(lock);
+               return false;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock);
+
+/*
+ * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
+{
+       if (refcount_dec_not_one(r))
+               return false;
+
+       spin_lock(lock);
+       if (!refcount_dec_and_test(r)) {
+               spin_unlock(lock);
+               return false;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_lock);
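
Usage mirrors atomic_dec_and_lock(): the lock is taken only for the final
reference, so unlinking and the 1->0 transition appear atomic to other
lookups. Illustrative names again:

        if (refcount_dec_and_lock(&obj->ref, &table_lock)) {
                list_del(&obj->node);
                spin_unlock(&table_lock);
                kfree(obj);
        }
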
+
index 172454e..c5b9b93 100644 (file)
@@ -146,9 +146,7 @@ static void bucket_table_free(const struct bucket_table *tbl)
        if (tbl->nest)
                nested_bucket_table_free(tbl);
 
-       if (tbl)
-               kvfree(tbl->locks);
-
+       kvfree(tbl->locks);
        kvfree(tbl);
 }
 
@@ -1123,12 +1121,13 @@ struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
        union nested_table *ntbl;
 
        ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
-       ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+       ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
        subhash >>= tbl->nest;
 
        while (ntbl && size > (1 << shift)) {
                index = subhash & ((1 << shift) - 1);
-               ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+               ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+                                                 tbl, hash);
                size >>= shift;
                subhash >>= shift;
        }
index 004fc70..c6cf822 100644 (file)
@@ -651,7 +651,6 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
 {
        unsigned int offset = 0;
        struct sg_mapping_iter miter;
-       unsigned long flags;
        unsigned int sg_flags = SG_MITER_ATOMIC;
 
        if (to_buffer)
@@ -664,9 +663,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
        if (!sg_miter_skip(&miter, skip))
                return false;
 
-       local_irq_save(flags);
-
-       while (sg_miter_next(&miter) && offset < buflen) {
+       while ((offset < buflen) && sg_miter_next(&miter)) {
                unsigned int len;
 
                len = min(miter.length, buflen - offset);
@@ -681,7 +678,6 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
 
        sg_miter_stop(&miter);
 
-       local_irq_restore(flags);
        return offset;
 }
 EXPORT_SYMBOL(sg_copy_buffer);
index fe9f3a7..35e3224 100644 (file)
@@ -334,7 +334,7 @@ static int test_parman_check_array(struct test_parman *test_parman,
                last_priority = item->prio->priority;
 
                if (item->parman_item.index != i) {
-                       pr_err("Item has different index in compare to where it actualy is (%lu != %d)\n",
+                       pr_err("Item has different index in compare to where it actually is (%lu != %d)\n",
                               item->parman_item.index, i);
                        return -EINVAL;
                }
index 0967771..e3bf4e0 100644 (file)
@@ -1739,6 +1739,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
  * 'h', 'l', or 'L' for integer fields
  * 'z' support added 23/7/1999 S.H.
  * 'z' changed to 'Z' --davidm 1/25/99
+ * 'Z' changed to 'z' --adobriyan 2017-01-25
  * 't' added for ptrdiff_t
  *
  * @fmt: the format string
@@ -1838,7 +1839,7 @@ qualifier:
        /* get the conversion qualifier */
        qualifier = 0;
        if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
-           _tolower(*fmt) == 'z' || *fmt == 't') {
+           *fmt == 'z' || *fmt == 't') {
                qualifier = *fmt++;
                if (unlikely(qualifier == *fmt)) {
                        if (qualifier == 'l') {
@@ -1907,7 +1908,7 @@ qualifier:
        else if (qualifier == 'l') {
                BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
                spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN);
-       } else if (_tolower(qualifier) == 'z') {
+       } else if (qualifier == 'z') {
                spec->type = FORMAT_TYPE_SIZE_T;
        } else if (qualifier == 't') {
                spec->type = FORMAT_TYPE_PTRDIFF;
@@ -2657,7 +2658,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
                /* get conversion qualifier */
                qualifier = -1;
                if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
-                   _tolower(*fmt) == 'z') {
+                   *fmt == 'z') {
                        qualifier = *fmt++;
                        if (unlikely(qualifier == *fmt)) {
                                if (qualifier == 'h') {
@@ -2851,7 +2852,6 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
                        else
                                *va_arg(args, unsigned long long *) = val.u;
                        break;
-               case 'Z':
                case 'z':
                        *va_arg(args, size_t *) = val.u;
                        break;
index afcc550..79d0fd1 100644 (file)
@@ -90,3 +90,9 @@ config DEBUG_PAGE_REF
          careful when enabling this feature because it adds about 30 KB to the
          kernel code.  However the runtime performance overhead is virtually
          nil until the tracepoints are actually enabled.
+
+config DEBUG_RODATA_TEST
+    bool "Testcase for the marking rodata read-only"
+    depends on STRICT_KERNEL_RWX
+    ---help---
+      This option enables a testcase for the setting rodata read-only.
index aa0aa17..026f6a8 100644 (file)
@@ -85,6 +85,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
index cef82b8..4d90a64 100644 (file)
@@ -93,7 +93,7 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf)
                spin_unlock_irq(&pool->lock);
 
                /* per-pool info, no real statistics yet */
-               temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
+               temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n",
                                 pool->name, blocks,
                                 pages * (pool->allocation / pool->size),
                                 pool->size, pages);
index 77ae323..34bce5c 100644 (file)
@@ -420,7 +420,7 @@ int __khugepaged_enter(struct mm_struct *mm)
        list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
        spin_unlock(&khugepaged_mm_lock);
 
-       atomic_inc(&mm->mm_count);
+       mmgrab(mm);
        if (wakeup)
                wake_up_interruptible(&khugepaged_wait);
 
index cf211c0..520e4c3 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1854,7 +1854,7 @@ int __ksm_enter(struct mm_struct *mm)
        spin_unlock(&ksm_mmlist_lock);
 
        set_bit(MMF_VM_MERGEABLE, &mm->flags);
-       atomic_inc(&mm->mm_count);
+       mmgrab(mm);
 
        if (needs_wakeup)
                wake_up_interruptible(&ksm_thread_wait);
index 6f4d27c..daf67bb 100644 (file)
@@ -25,7 +25,7 @@ void use_mm(struct mm_struct *mm)
        task_lock(tsk);
        active_mm = tsk->active_mm;
        if (active_mm != mm) {
-               atomic_inc(&mm->mm_count);
+               mmgrab(mm);
                tsk->active_mm = mm;
        }
        tsk->mm = mm;
index f4259e4..32bc9f2 100644 (file)
@@ -275,7 +275,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
                mm->mmu_notifier_mm = mmu_notifier_mm;
                mmu_notifier_mm = NULL;
        }
-       atomic_inc(&mm->mm_count);
+       mmgrab(mm);
 
        /*
         * Serialize the update against mmu_notifier_unregister. A
index 578321f..51c0918 100644 (file)
@@ -653,7 +653,7 @@ static void mark_oom_victim(struct task_struct *tsk)
 
        /* oom_mm is bound to the signal struct life time. */
        if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
-               atomic_inc(&tsk->signal->oom_mm->mm_count);
+               mmgrab(tsk->signal->oom_mm);
 
        /*
         * Make sure that the task is woken up from uninterruptible sleep
@@ -870,7 +870,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 
        /* Get a reference to safely compare mm after task_unlock(victim) */
        mm = victim->mm;
-       atomic_inc(&mm->mm_count);
+       mmgrab(mm);
        /*
         * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
         * the OOM victim from depleting the memory reserves from the user
index ae6e601..26a6081 100644 (file)
@@ -1797,7 +1797,7 @@ pause:
                 * pages exceeds dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
-                * In theory 1 page is enough to keep the comsumer-producer
+                * In theory 1 page is enough to keep the consumer-producer
                 * pipe going: the flusher cleans 1 page => the task dirties 1
                 * more page. However wb_dirty has accounting errors.  So use
                 * the larger and more IO friendly wb_stat_error.
index 9f9623d..a7a6aac 100644 (file)
@@ -5925,7 +5925,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
         * the zone and SPARSEMEM is in use. If there are holes within the
         * zone, each populated memory region may cost us one or two extra
         * memmap pages due to alignment because memmap pages for each
-        * populated regions may not naturally algined on page boundary.
+        * populated regions may not be naturally aligned on page boundary.
         * So the (present_pages >> 4) heuristic is a tradeoff for that.
         */
        if (spanned_pages > present_pages + (present_pages >> 4) &&
index 0686f56..5696039 100644 (file)
@@ -43,7 +43,7 @@
  * Chunks can be determined from the address using the index field
  * in the page struct. The index field contains a pointer to the chunk.
  *
- * To use this allocator, arch code should do the followings.
+ * To use this allocator, arch code should do the following:
  *
  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  *   regular address to percpu pointer and back if they need to be
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
new file mode 100644 (file)
index 0000000..0fd2167
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * rodata_test.c: functional test for mark_rodata_ro function
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/uaccess.h>
+#include <asm/sections.h>
+
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+void rodata_test(void)
+{
+       unsigned long start, end;
+       int zero = 0;
+
+       /* test 1: read the value */
+       /* If this test fails, some previous testrun has clobbered the state */
+       if (!rodata_test_data) {
+               pr_err("rodata_test: test 1 fails (start data)\n");
+               return;
+       }
+
+       /* test 2: write to the variable; this should fault */
+       if (!probe_kernel_write((void *)&rodata_test_data,
+                                               (void *)&zero, sizeof(zero))) {
+               pr_err("rodata_test: test data was not read only\n");
+               return;
+       }
+
+       /* test 3: check the value hasn't changed */
+       if (rodata_test_data == zero) {
+               pr_err("rodata_test: test data was changed\n");
+               return;
+       }
+
+       /* test 4: check if the rodata section is PAGE_SIZE aligned */
+       start = (unsigned long)__start_rodata;
+       end = (unsigned long)__end_rodata;
+       if (start & (PAGE_SIZE - 1)) {
+               pr_err("rodata_test: start of .rodata is not page size aligned\n");
+               return;
+       }
+       if (end & (PAGE_SIZE - 1)) {
+               pr_err("rodata_test: end of .rodata is not page size aligned\n");
+               return;
+       }
+
+       pr_info("rodata_test: all tests were successful\n");
+}
index 2cac12c..fadc6a1 100644 (file)
@@ -1671,7 +1671,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
         * that.
         */
        start_mm = &init_mm;
-       atomic_inc(&init_mm.mm_users);
+       mmget(&init_mm);
 
        /*
         * Keep on scanning until all entries have gone.  Usually,
@@ -1720,7 +1720,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
                if (atomic_read(&start_mm->mm_users) == 1) {
                        mmput(start_mm);
                        start_mm = &init_mm;
-                       atomic_inc(&init_mm.mm_users);
+                       mmget(&init_mm);
                }
 
                /*
@@ -1757,13 +1757,13 @@ int try_to_unuse(unsigned int type, bool frontswap,
                        struct mm_struct *prev_mm = start_mm;
                        struct mm_struct *mm;
 
-                       atomic_inc(&new_start_mm->mm_users);
-                       atomic_inc(&prev_mm->mm_users);
+                       mmget(new_start_mm);
+                       mmget(prev_mm);
                        spin_lock(&mmlist_lock);
                        while (swap_count(*swap_map) && !retval &&
                                        (p = p->next) != &start_mm->mmlist) {
                                mm = list_entry(p, struct mm_struct, mmlist);
-                               if (!atomic_inc_not_zero(&mm->mm_users))
+                               if (!mmget_not_zero(mm))
                                        continue;
                                spin_unlock(&mmlist_lock);
                                mmput(prev_mm);
@@ -1781,7 +1781,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 
                                if (set_start_mm && *swap_map < swcount) {
                                        mmput(new_start_mm);
-                                       atomic_inc(&mm->mm_users);
+                                       mmget(mm);
                                        new_start_mm = mm;
                                        set_start_mm = 0;
                                }
index f2db674..6263aff 100644 (file)
@@ -786,7 +786,7 @@ EXPORT_SYMBOL(truncate_setsize);
  */
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
 {
-       int bsize = 1 << inode->i_blkbits;
+       int bsize = i_blocksize(inode);
        loff_t rounded_from;
        struct page *page;
        pgoff_t index;
index 79ed536..ac839fc 100644 (file)
@@ -355,10 +355,8 @@ void workingset_update_node(struct radix_tree_node *node, void *private)
         * as node->private_list is protected by &mapping->tree_lock.
         */
        if (node->count && node->count == node->exceptional) {
-               if (list_empty(&node->private_list)) {
-                       node->private_data = mapping;
+               if (list_empty(&node->private_list))
                        list_lru_add(&shadow_nodes, &node->private_list);
-               }
        } else {
                if (!list_empty(&node->private_list))
                        list_lru_del(&shadow_nodes, &node->private_list);
@@ -436,7 +434,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
         */
 
        node = container_of(item, struct radix_tree_node, private_list);
-       mapping = node->private_data;
+       mapping = container_of(node->root, struct address_space, page_tree);
 
        /* Coming from the list, invert the lock order */
        if (!spin_trylock(&mapping->tree_lock)) {
index cabf09e..eedc278 100644 (file)
@@ -76,6 +76,8 @@ static u64 zswap_duplicate_entry;
 * tunables
 **********************************/
 
+#define ZSWAP_PARAM_UNSET ""
+
 /* Enable/disable zswap (disabled by default) */
 static bool zswap_enabled;
 static int zswap_enabled_param_set(const char *,
@@ -185,6 +187,9 @@ static bool zswap_init_started;
 /* fatal error during init */
 static bool zswap_init_failed;
 
+/* init completed, but couldn't create the initial pool */
+static bool zswap_has_pool;
+
 /*********************************
 * helpers and fwd declarations
 **********************************/
@@ -424,7 +429,8 @@ static struct zswap_pool *__zswap_pool_current(void)
        struct zswap_pool *pool;
 
        pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
-       WARN_ON(!pool);
+       WARN_ONCE(!pool && zswap_has_pool,
+                 "%s: no page storage pool!\n", __func__);
 
        return pool;
 }
@@ -443,7 +449,7 @@ static struct zswap_pool *zswap_pool_current_get(void)
        rcu_read_lock();
 
        pool = __zswap_pool_current();
-       if (!pool || !zswap_pool_get(pool))
+       if (!zswap_pool_get(pool))
                pool = NULL;
 
        rcu_read_unlock();
@@ -459,7 +465,9 @@ static struct zswap_pool *zswap_pool_last_get(void)
 
        list_for_each_entry_rcu(pool, &zswap_pools, list)
                last = pool;
-       if (!WARN_ON(!last) && !zswap_pool_get(last))
+       WARN_ONCE(!last && zswap_has_pool,
+                 "%s: no page storage pool!\n", __func__);
+       if (!zswap_pool_get(last))
                last = NULL;
 
        rcu_read_unlock();
@@ -495,6 +503,17 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
        gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
        int ret;
 
+       if (!zswap_has_pool) {
+               /* if either is unset, pool initialization failed, and we
+                * need both params to be set correctly before trying to
+                * create a pool.
+                */
+               if (!strcmp(type, ZSWAP_PARAM_UNSET))
+                       return NULL;
+               if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
+                       return NULL;
+       }
+
        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
        if (!pool) {
                pr_err("pool alloc failed\n");
@@ -544,29 +563,41 @@ error:
 
 static __init struct zswap_pool *__zswap_pool_create_fallback(void)
 {
-       if (!crypto_has_comp(zswap_compressor, 0, 0)) {
-               if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
-                       pr_err("default compressor %s not available\n",
-                              zswap_compressor);
-                       return NULL;
-               }
+       bool has_comp, has_zpool;
+
+       has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+       if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
                pr_err("compressor %s not available, using default %s\n",
                       zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
                param_free_charp(&zswap_compressor);
                zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+               has_comp = crypto_has_comp(zswap_compressor, 0, 0);
        }
-       if (!zpool_has_pool(zswap_zpool_type)) {
-               if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
-                       pr_err("default zpool %s not available\n",
-                              zswap_zpool_type);
-                       return NULL;
-               }
+       if (!has_comp) {
+               pr_err("default compressor %s not available\n",
+                      zswap_compressor);
+               param_free_charp(&zswap_compressor);
+               zswap_compressor = ZSWAP_PARAM_UNSET;
+       }
+
+       has_zpool = zpool_has_pool(zswap_zpool_type);
+       if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
                pr_err("zpool %s not available, using default %s\n",
                       zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
                param_free_charp(&zswap_zpool_type);
                zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+               has_zpool = zpool_has_pool(zswap_zpool_type);
+       }
+       if (!has_zpool) {
+               pr_err("default zpool %s not available\n",
+                      zswap_zpool_type);
+               param_free_charp(&zswap_zpool_type);
+               zswap_zpool_type = ZSWAP_PARAM_UNSET;
        }
 
+       if (!has_comp || !has_zpool)
+               return NULL;
+
        return zswap_pool_create(zswap_zpool_type, zswap_compressor);
 }
 
@@ -582,6 +613,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 
 static int __must_check zswap_pool_get(struct zswap_pool *pool)
 {
+       if (!pool)
+               return 0;
+
        return kref_get_unless_zero(&pool->kref);
 }
 
@@ -639,7 +673,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
        }
 
        /* no change required */
-       if (!strcmp(s, *(char **)kp->arg))
+       if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
                return 0;
 
        /* if this is load-time (pre-init) param setting,
@@ -670,21 +704,26 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
        pool = zswap_pool_find_get(type, compressor);
        if (pool) {
                zswap_pool_debug("using existing", pool);
+               WARN_ON(pool == zswap_pool_current());
                list_del_rcu(&pool->list);
-       } else {
-               spin_unlock(&zswap_pools_lock);
-               pool = zswap_pool_create(type, compressor);
-               spin_lock(&zswap_pools_lock);
        }
 
+       spin_unlock(&zswap_pools_lock);
+
+       if (!pool)
+               pool = zswap_pool_create(type, compressor);
+
        if (pool)
                ret = param_set_charp(s, kp);
        else
                ret = -EINVAL;
 
+       spin_lock(&zswap_pools_lock);
+
        if (!ret) {
                put_pool = zswap_pool_current();
                list_add_rcu(&pool->list, &zswap_pools);
+               zswap_has_pool = true;
        } else if (pool) {
                /* add the possibly pre-existing pool to the end of the pools
                 * list; if it's new (and empty) then it'll be removed and
@@ -696,6 +735,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 
        spin_unlock(&zswap_pools_lock);
 
+       if (!zswap_has_pool && !pool) {
+               /* if initial pool creation failed, and this pool creation also
+                * failed, maybe both compressor and zpool params were bad.
+                * Allow changing this param, so pool creation will succeed
+                * when the other param is changed. We already verified this
+                * param is ok in the zpool_has_pool() or crypto_has_comp()
+                * checks above.
+                */
+               ret = param_set_charp(s, kp);
+       }
+
        /* drop the ref from either the old current pool,
         * or the new pool we failed to add
         */
@@ -724,6 +774,10 @@ static int zswap_enabled_param_set(const char *val,
                pr_err("can't enable, initialization failed\n");
                return -ENODEV;
        }
+       if (!zswap_has_pool && zswap_init_started) {
+               pr_err("can't enable, no pool configured\n");
+               return -ENODEV;
+       }
 
        return param_set_bool(val, kp);
 }
@@ -1205,22 +1259,21 @@ static int __init init_zswap(void)
                goto hp_fail;
 
        pool = __zswap_pool_create_fallback();
-       if (!pool) {
+       if (pool) {
+               pr_info("loaded using pool %s/%s\n", pool->tfm_name,
+                       zpool_get_type(pool->zpool));
+               list_add(&pool->list, &zswap_pools);
+               zswap_has_pool = true;
+       } else {
                pr_err("pool creation failed\n");
-               goto pool_fail;
+               zswap_enabled = false;
        }
-       pr_info("loaded using pool %s/%s\n", pool->tfm_name,
-               zpool_get_type(pool->zpool));
-
-       list_add(&pool->list, &zswap_pools);
 
        frontswap_register_ops(&zswap_frontswap_ops);
        if (zswap_debugfs_init())
                pr_warn("debugfs initialization failed\n");
        return 0;
 
-pool_fail:
-       cpuhp_remove_state_nocalls(CPUHP_MM_ZSWP_POOL_PREPARE);
 hp_fail:
        cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
 dstmem_fail:
index 10d2bdc..465cc24 100644 (file)
@@ -1656,7 +1656,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
        ddp->deh_dport = usat->sat_port;
        ddp->deh_sport = at->src_port;
 
-       SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
+       SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len);
 
        err = memcpy_from_msg(skb_put(skb, len), msg, len);
        if (err) {
@@ -1720,7 +1720,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
                 */
                aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
        }
-       SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len);
+       SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len);
 
 out:
        release_sock(sk);
index 3b3b1a2..a190800 100644 (file)
@@ -451,7 +451,7 @@ static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr,
                        return;
        }
        if (end_of_tlvs - tlvs != 0)
-               pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n",
+               pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n",
                        dev->name, end_of_tlvs - tlvs);
 }
 
index 48f9471..f64d656 100644 (file)
@@ -851,7 +851,7 @@ static int hci_sock_release(struct socket *sock)
 
        if (hdev) {
                if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
-                       /* When releasing an user channel exclusive access,
+                       /* When releasing a user channel exclusive access,
                         * call hci_dev_do_close directly instead of calling
                         * hci_dev_close to ensure the exclusive access will
                         * be released and the controller brought back down.
@@ -1172,7 +1172,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
                                /* In case the transport is already up and
                                 * running, clear the error here.
                                 *
-                                * This can happen when opening an user
+                                * This can happen when opening a user
                                 * channel and HCI_AUTO_OFF grace period
                                 * is still active.
                                 */
@@ -1190,7 +1190,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
                if (!hci_sock_gen_cookie(sk)) {
                        /* In the case when a cookie has already been assigned,
                         * this socket will transition from a raw socket into
-                        * an user channel socket. For a clean transition, send
+                        * a user channel socket. For a clean transition, send
                         * the close notification first.
                         */
                        skb = create_monitor_ctrl_close(sk);
index 9024283..279527f 100644 (file)
@@ -187,7 +187,7 @@ static int ebt_among_mt_check(const struct xt_mtchk_param *par)
        expected_length += ebt_mac_wormhash_size(wh_src);
 
        if (em->match_size != EBT_ALIGN(expected_length)) {
-               pr_info("wrong size: %d against expected %d, rounded to %Zd\n",
+               pr_info("wrong size: %d against expected %d, rounded to %zd\n",
                        em->match_size, expected_length,
                        EBT_ALIGN(expected_length));
                return -EINVAL;
index 50f040f..b9233b9 100644 (file)
@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
        dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
             __func__, lock_name, type, cookie, tag, desc, flags);
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            lock_op_page, lock_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, lock_op_page,
+                            lock_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(lock_op_page);
@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc,
 
        dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            unlock_op_page, unlock_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, unlock_op_page,
+                            unlock_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(unlock_op_page);
@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
        dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
             cookie, ENTITY_NAME(*locker));
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            break_op_page, break_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, break_op_page,
+                            break_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(break_op_page);
@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
        int get_info_op_buf_size;
        int name_len = strlen(lock_name);
        struct page *get_info_op_page, *reply_page;
-       size_t reply_len;
+       size_t reply_len = PAGE_SIZE;
        void *p, *end;
        int ret;
 
index 80d7c3a..5bf94c0 100644 (file)
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 
 void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
 {
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
 {
        kfree(b->item_weights);
        kfree(b->sum_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
 
 void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
 {
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b->node_weights);
        kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 {
        kfree(b->straws);
        kfree(b->item_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
 {
        kfree(b->item_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
index 130ab40..b5cd8c2 100644 (file)
@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
        return -1;
 }
 
-
 /*
  * bucket choose methods
  *
@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
  * Since this is expensive, we optimize for the r=0 case, which
  * captures the vast majority of calls.
  */
-static int bucket_perm_choose(struct crush_bucket *bucket,
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+                             struct crush_work_bucket *work,
                              int x, int r)
 {
        unsigned int pr = r % bucket->size;
        unsigned int i, s;
 
        /* start a new permutation if @x has changed */
-       if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
+       if (work->perm_x != (__u32)x || work->perm_n == 0) {
                dprintk("bucket %d new x=%d\n", bucket->id, x);
-               bucket->perm_x = x;
+               work->perm_x = x;
 
                /* optimize common r=0 case */
                if (pr == 0) {
                        s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
                                bucket->size;
-                       bucket->perm[0] = s;
-                       bucket->perm_n = 0xffff;   /* magic value, see below */
+                       work->perm[0] = s;
+                       work->perm_n = 0xffff;   /* magic value, see below */
                        goto out;
                }
 
                for (i = 0; i < bucket->size; i++)
-                       bucket->perm[i] = i;
-               bucket->perm_n = 0;
-       } else if (bucket->perm_n == 0xffff) {
+                       work->perm[i] = i;
+               work->perm_n = 0;
+       } else if (work->perm_n == 0xffff) {
                /* clean up after the r=0 case above */
                for (i = 1; i < bucket->size; i++)
-                       bucket->perm[i] = i;
-               bucket->perm[bucket->perm[0]] = 0;
-               bucket->perm_n = 1;
+                       work->perm[i] = i;
+               work->perm[work->perm[0]] = 0;
+               work->perm_n = 1;
        }
 
        /* calculate permutation up to pr */
-       for (i = 0; i < bucket->perm_n; i++)
-               dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-       while (bucket->perm_n <= pr) {
-               unsigned int p = bucket->perm_n;
+       for (i = 0; i < work->perm_n; i++)
+               dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+       while (work->perm_n <= pr) {
+               unsigned int p = work->perm_n;
                /* no point in swapping the final entry */
                if (p < bucket->size - 1) {
                        i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
                                (bucket->size - p);
                        if (i) {
-                               unsigned int t = bucket->perm[p + i];
-                               bucket->perm[p + i] = bucket->perm[p];
-                               bucket->perm[p] = t;
+                               unsigned int t = work->perm[p + i];
+                               work->perm[p + i] = work->perm[p];
+                               work->perm[p] = t;
                        }
                        dprintk(" perm_choose swap %d with %d\n", p, p+i);
                }
-               bucket->perm_n++;
+               work->perm_n++;
        }
        for (i = 0; i < bucket->size; i++)
-               dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+               dprintk(" perm_choose  %d: %d\n", i, work->perm[i]);
 
-       s = bucket->perm[pr];
+       s = work->perm[pr];
 out:
        dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
                bucket->size, x, r, pr, s);
@@ -132,14 +132,14 @@ out:
 }
 
 /* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-                                int x, int r)
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+                                struct crush_work_bucket *work, int x, int r)
 {
-       return bucket_perm_choose(&bucket->h, x, r);
+       return bucket_perm_choose(&bucket->h, work, x, r);
 }
 
 /* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
                              int x, int r)
 {
        int i;
@@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
                w *= bucket->sum_weights[i];
                w = w >> 16;
                /*dprintk(" scaled %llx\n", w);*/
-               if (w < bucket->item_weights[i])
+               if (w < bucket->item_weights[i]) {
                        return bucket->h.items[i];
+               }
        }
 
        dprintk("bad list sums for bucket %d\n", bucket->h.id);
@@ -192,7 +193,7 @@ static int terminal(int x)
        return x & 1;
 }
 
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
                              int x, int r)
 {
        int n;
@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 
 /* straw */
 
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
                               int x, int r)
 {
        __u32 i;
@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
  *
  */
 
-static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
                                int x, int r)
 {
        unsigned int i, high = 0;
@@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
                        high_draw = draw;
                }
        }
+
        return bucket->h.items[high];
 }
 
 
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+static int crush_bucket_choose(const struct crush_bucket *in,
+                              struct crush_work_bucket *work,
+                              int x, int r)
 {
        dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
        BUG_ON(in->size == 0);
        switch (in->alg) {
        case CRUSH_BUCKET_UNIFORM:
-               return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-                                         x, r);
+               return bucket_uniform_choose(
+                       (const struct crush_bucket_uniform *)in,
+                       work, x, r);
        case CRUSH_BUCKET_LIST:
-               return bucket_list_choose((struct crush_bucket_list *)in,
+               return bucket_list_choose((const struct crush_bucket_list *)in,
                                          x, r);
        case CRUSH_BUCKET_TREE:
-               return bucket_tree_choose((struct crush_bucket_tree *)in,
+               return bucket_tree_choose((const struct crush_bucket_tree *)in,
                                          x, r);
        case CRUSH_BUCKET_STRAW:
-               return bucket_straw_choose((struct crush_bucket_straw *)in,
-                                          x, r);
+               return bucket_straw_choose(
+                       (const struct crush_bucket_straw *)in,
+                       x, r);
        case CRUSH_BUCKET_STRAW2:
-               return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
-                                           x, r);
+               return bucket_straw2_choose(
+                       (const struct crush_bucket_straw2 *)in,
+                       x, r);
        default:
                dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
                return in->items[0];
        }
 }
 
-
 /*
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map,
  * @parent_r: r value passed from the parent
  */
 static int crush_choose_firstn(const struct crush_map *map,
-                              struct crush_bucket *bucket,
+                              struct crush_work *work,
+                              const struct crush_bucket *bucket,
                               const __u32 *weight, int weight_max,
                               int x, int numrep, int type,
                               int *out, int outpos,
@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
        int rep;
        unsigned int ftotal, flocal;
        int retry_descent, retry_bucket, skip_rep;
-       struct crush_bucket *in = bucket;
+       const struct crush_bucket *in = bucket;
        int r;
        int i;
        int item = 0;
@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
                                if (local_fallback_retries > 0 &&
                                    flocal >= (in->size>>1) &&
                                    flocal > local_fallback_retries)
-                                       item = bucket_perm_choose(in, x, r);
+                                       item = bucket_perm_choose(
+                                               in, work->work[-1-in->id],
+                                               x, r);
                                else
-                                       item = crush_bucket_choose(in, x, r);
+                                       item = crush_bucket_choose(
+                                               in, work->work[-1-in->id],
+                                               x, r);
                                if (item >= map->max_devices) {
                                        dprintk("   bad item %d\n", item);
                                        skip_rep = 1;
@@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
                                                        sub_r = r >> (vary_r-1);
                                                else
                                                        sub_r = 0;
-                                               if (crush_choose_firstn(map,
-                                                        map->buckets[-1-item],
-                                                        weight, weight_max,
-                                                        x, stable ? 1 : outpos+1, 0,
-                                                        out2, outpos, count,
-                                                        recurse_tries, 0,
-                                                        local_retries,
-                                                        local_fallback_retries,
-                                                        0,
-                                                        vary_r,
-                                                        stable,
-                                                        NULL,
-                                                        sub_r) <= outpos)
+                                               if (crush_choose_firstn(
+                                                           map,
+                                                           work,
+                                                           map->buckets[-1-item],
+                                                           weight, weight_max,
+                                                           x, stable ? 1 : outpos+1, 0,
+                                                           out2, outpos, count,
+                                                           recurse_tries, 0,
+                                                           local_retries,
+                                                           local_fallback_retries,
+                                                           0,
+                                                           vary_r,
+                                                           stable,
+                                                           NULL,
+                                                           sub_r) <= outpos)
                                                        /* didn't get leaf */
                                                        reject = 1;
                                        } else {
@@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map,
                                        }
                                }
 
-                               if (!reject) {
+                               if (!reject && !collide) {
                                        /* out? */
                                        if (itemtype == 0)
                                                reject = is_out(map, weight,
                                                                weight_max,
                                                                item, x);
-                                       else
-                                               reject = 0;
                                }
 
 reject:
@@ -600,7 +611,8 @@ reject:
  *
  */
 static void crush_choose_indep(const struct crush_map *map,
-                              struct crush_bucket *bucket,
+                              struct crush_work *work,
+                              const struct crush_bucket *bucket,
                               const __u32 *weight, int weight_max,
                               int x, int left, int numrep, int type,
                               int *out, int outpos,
@@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map,
                               int *out2,
                               int parent_r)
 {
-       struct crush_bucket *in = bucket;
+       const struct crush_bucket *in = bucket;
        int endpos = outpos + left;
        int rep;
        unsigned int ftotal;
@@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map,
                                        break;
                                }
 
-                               item = crush_bucket_choose(in, x, r);
+                               item = crush_bucket_choose(
+                                       in, work->work[-1-in->id],
+                                       x, r);
                                if (item >= map->max_devices) {
                                        dprintk("   bad item %d\n", item);
                                        out[rep] = CRUSH_ITEM_NONE;
@@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map,
 
                                if (recurse_to_leaf) {
                                        if (item < 0) {
-                                               crush_choose_indep(map,
-                                                  map->buckets[-1-item],
-                                                  weight, weight_max,
-                                                  x, 1, numrep, 0,
-                                                  out2, rep,
-                                                  recurse_tries, 0,
-                                                  0, NULL, r);
+                                               crush_choose_indep(
+                                                       map,
+                                                       work,
+                                                       map->buckets[-1-item],
+                                                       weight, weight_max,
+                                                       x, 1, numrep, 0,
+                                                       out2, rep,
+                                                       recurse_tries, 0,
+                                                       0, NULL, r);
                                                if (out2[rep] == CRUSH_ITEM_NONE) {
                                                        /* placed nothing; no leaf */
                                                        break;
@@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map,
 #endif
 }
 
+
+/*
+ * This takes a chunk of memory and sets it up to be a shiny new
+ * working area for a CRUSH placement computation. It must be called
+ * on any newly allocated memory before passing it in to
+ * crush_do_rule. It may be used repeatedly after that, so long as the
+ * map has not changed. If the map /has/ changed, you must make sure
+ * the buffer is no smaller than the new map's working size, and
+ * re-run crush_init_workspace.
+ *
+ * If you do retain the working space between calls to crush, make it
+ * thread-local.
+ */
+void crush_init_workspace(const struct crush_map *map, void *v)
+{
+       struct crush_work *w = v;
+       __s32 b;
+
+       /*
+        * We work by moving through the available space and setting
+        * values and pointers as we go.
+        *
+        * It's a bit like Forth's use of the 'allot' word since we
+        * set the pointer first and then reserve the space for it to
+        * point to by incrementing the pointer.
+        */
+       v += sizeof(struct crush_work *);
+       w->work = v;
+       v += map->max_buckets * sizeof(struct crush_work_bucket *);
+       for (b = 0; b < map->max_buckets; ++b) {
+               if (!map->buckets[b])
+                       continue;
+
+               w->work[b] = v;
+               switch (map->buckets[b]->alg) {
+               default:
+                       v += sizeof(struct crush_work_bucket);
+                       break;
+               }
+               w->work[b]->perm_x = 0;
+               w->work[b]->perm_n = 0;
+               w->work[b]->perm = v;
+               v += map->buckets[b]->size * sizeof(__u32);
+       }
+       BUG_ON(v - (void *)w != map->working_size);
+}
+
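
As a usage sketch: allocate crush_work_size() bytes, initialize them once, then hand the buffer to crush_do_rule(). This hypothetical helper follows the pattern osdmap_set_crush() and do_crush() use later in this diff; the function name and GFP flag choice are illustrative:

#include <linux/slab.h>
#include <linux/crush/crush.h>
#include <linux/crush/mapper.h>

static int example_crush_lookup(const struct crush_map *map,
				int ruleno, int x,
				int *result, int result_max,
				const __u32 *weight, int weight_max)
{
	void *work = kmalloc(crush_work_size(map, result_max), GFP_NOIO);
	int len;

	if (!work)
		return -ENOMEM;

	crush_init_workspace(map, work);	/* once per (map, buffer) */
	len = crush_do_rule(map, ruleno, x, result, result_max,
			    weight, weight_max, work);
	kfree(work);
	return len;
}

The osdmap code later in this series avoids the per-lookup allocation: it sizes one buffer for the map up front and serializes lookups on it with a mutex, per the thread-locality warning above.
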
 /**
  * crush_do_rule - calculate a mapping with the given input and rule
  * @map: the crush_map
@@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map,
  * @result_max: maximum result size
  * @weight: weight vector (for map leaves)
  * @weight_max: size of weight vector
- * @scratch: scratch vector for private use; must be >= 3 * result_max
+ * @cwin: pointer to at least crush_work_size() bytes of memory
  */
 int crush_do_rule(const struct crush_map *map,
                  int ruleno, int x, int *result, int result_max,
                  const __u32 *weight, int weight_max,
-                 int *scratch)
+                 void *cwin)
 {
        int result_len;
-       int *a = scratch;
-       int *b = scratch + result_max;
-       int *c = scratch + result_max*2;
+       struct crush_work *cw = cwin;
+       int *a = cwin + map->working_size;
+       int *b = a + result_max;
+       int *c = b + result_max;
+       int *w = a;
+       int *o = b;
        int recurse_to_leaf;
-       int *w;
        int wsize = 0;
-       int *o;
        int osize;
        int *tmp;
-       struct crush_rule *rule;
+       const struct crush_rule *rule;
        __u32 step;
        int i, j;
        int numrep;
@@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map,
 
        rule = map->rules[ruleno];
        result_len = 0;
-       w = a;
-       o = b;
 
        for (step = 0; step < rule->len; step++) {
                int firstn = 0;
-               struct crush_rule_step *curstep = &rule->steps[step];
+               const struct crush_rule_step *curstep = &rule->steps[step];
 
                switch (curstep->op) {
                case CRUSH_RULE_TAKE:
@@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map,
                                                recurse_tries = choose_tries;
                                        osize += crush_choose_firstn(
                                                map,
+                                               cw,
                                                map->buckets[bno],
                                                weight, weight_max,
                                                x, numrep,
@@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map,
                                                    numrep : (result_max-osize));
                                        crush_choose_indep(
                                                map,
+                                               cw,
                                                map->buckets[bno],
                                                weight, weight_max,
                                                x, out_size, numrep,
@@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map,
                        break;
                }
        }
+
        return result_len;
 }
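
Taken together with crush_init_workspace() above, the cwin layout is: the struct crush_work bookkeeping plus per-bucket permutation state fill the first map->working_size bytes, and the three result_max-sized vectors a, b and c, which used to live in the caller-supplied scratch array, follow immediately after. crush_work_size() covers both parts in a single allocation.
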
index 292e33b..85747b7 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/err.h>
 #include <linux/scatterlist.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <crypto/aes.h>
 #include <crypto/skcipher.h>
index f3378ba..b65bbf9 100644 (file)
@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
 
        kref_init(&req->r_kref);
        init_completion(&req->r_completion);
-       init_completion(&req->r_done_completion);
        RB_CLEAR_NODE(&req->r_node);
        RB_CLEAR_NODE(&req->r_mc_node);
        INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
        BUG_ON(length > previous);
 
        op->extent.length = length;
-       op->indata_len -= previous - length;
+       if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+               op->indata_len -= previous - length;
 }
 EXPORT_SYMBOL(osd_req_op_extent_update);
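
The new guard reflects that ->indata_len counts outgoing request payload: only WRITE/WRITEFULL extent ops carry data whose length tracks the extent, so trimming the extent of a read must not touch indata_len. That reading is inferred from the surrounding helpers rather than stated in the hunk itself.
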
 
@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
        bool need_send = false;
        bool promoted = false;
 
-       WARN_ON(req->r_tid || req->r_got_reply);
+       WARN_ON(req->r_tid);
        dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
@@ -1704,17 +1704,10 @@ promote:
 
 static void account_request(struct ceph_osd_request *req)
 {
-       unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+       WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
+       WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
 
-       if (req->r_flags & CEPH_OSD_FLAG_READ) {
-               WARN_ON(req->r_flags & mask);
-               req->r_flags |= CEPH_OSD_FLAG_ACK;
-       } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-               WARN_ON(!(req->r_flags & mask));
-       else
-               WARN_ON(1);
-
-       WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+       req->r_flags |= CEPH_OSD_FLAG_ONDISK;
        atomic_inc(&req->r_osdc->num_requests);
 }
 
@@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req)
 
 static void __complete_request(struct ceph_osd_request *req)
 {
-       if (req->r_callback)
+       if (req->r_callback) {
+               dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+                    req->r_tid, req->r_callback, req->r_result);
                req->r_callback(req);
-       else
-               complete_all(&req->r_completion);
+       }
 }
 
 /*
- * Note that this is open-coded in handle_reply(), which has to deal
- * with ack vs commit, dup acks, etc.
+ * This is open-coded in handle_reply().
  */
 static void complete_request(struct ceph_osd_request *req, int err)
 {
@@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err)
        req->r_result = err;
        finish_request(req);
        __complete_request(req);
-       complete_all(&req->r_done_completion);
+       complete_all(&req->r_completion);
        ceph_osdc_put_request(req);
 }
 
@@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req)
 
        cancel_map_check(req);
        finish_request(req);
-       complete_all(&req->r_done_completion);
+       complete_all(&req->r_completion);
        ceph_osdc_put_request(req);
 }
 
@@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
        mutex_lock(&lreq->lock);
        dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
             lreq->linger_id, req->r_result);
-       WARN_ON(!__linger_registered(lreq));
        linger_reg_commit_complete(lreq, req->r_result);
        lreq->committed = true;
 
@@ -2785,31 +2777,8 @@ e_inval:
 }
 
 /*
- * We are done with @req if
- *   - @m is a safe reply, or
- *   - @m is an unsafe reply and we didn't want a safe one
- */
-static bool done_request(const struct ceph_osd_request *req,
-                        const struct MOSDOpReply *m)
-{
-       return (m->result < 0 ||
-               (m->flags & CEPH_OSD_FLAG_ONDISK) ||
-               !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- *
- * ->r_unsafe_callback is set? yes                     no
- *
- * first reply is OK (needed   r_cb/r_completion,      r_cb/r_completion,
- * any or needed/got safe)     r_done_completion       r_done_completion
- *
- * first reply is unsafe       r_unsafe_cb(true)       (nothing)
- *
- * when we get the safe reply  r_unsafe_cb(false),     r_cb/r_completion,
- *                             r_done_completion       r_done_completion
+ * Handle MOSDOpReply.  Set ->r_result and call the callback if it is
+ * specified.
  */
 static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 {
@@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
        struct MOSDOpReply m;
        u64 tid = le64_to_cpu(msg->hdr.tid);
        u32 data_len = 0;
-       bool already_acked;
        int ret;
        int i;
 
@@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
                       le32_to_cpu(msg->hdr.data_len), req->r_tid);
                goto fail_request;
        }
-       dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
-            req, req->r_tid, req->r_got_reply, m.result, data_len);
-
-       already_acked = req->r_got_reply;
-       if (!already_acked) {
-               req->r_result = m.result ?: data_len;
-               req->r_replay_version = m.replay_version; /* struct */
-               req->r_got_reply = true;
-       } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
-               dout("req %p tid %llu dup ack\n", req, req->r_tid);
-               goto out_unlock_session;
-       }
-
-       if (done_request(req, &m)) {
-               finish_request(req);
-               if (req->r_linger) {
-                       WARN_ON(req->r_unsafe_callback);
-                       dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
-                       __complete_request(req);
-               }
-       }
+       dout("%s req %p tid %llu result %d data_len %u\n", __func__,
+            req, req->r_tid, m.result, data_len);
 
+       /*
+        * Since we only ever request ONDISK, we should only ever get
+        * one (type of) reply back.
+        */
+       WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+       req->r_result = m.result ?: data_len;
+       finish_request(req);
        mutex_unlock(&osd->lock);
        up_read(&osdc->lock);
 
-       if (done_request(req, &m)) {
-               if (already_acked && req->r_unsafe_callback) {
-                       dout("req %p tid %llu safe-cb\n", req, req->r_tid);
-                       req->r_unsafe_callback(req, false);
-               } else if (!req->r_linger) {
-                       dout("req %p tid %llu cb\n", req, req->r_tid);
-                       __complete_request(req);
-               }
-               complete_all(&req->r_done_completion);
-               ceph_osdc_put_request(req);
-       } else {
-               if (req->r_unsafe_callback) {
-                       dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
-                       req->r_unsafe_callback(req, true);
-               } else {
-                       WARN_ON(1);
-               }
-       }
-
+       __complete_request(req);
+       complete_all(&req->r_completion);
+       ceph_osdc_put_request(req);
        return;
 
 fail_request:
@@ -3540,7 +3480,7 @@ again:
                        up_read(&osdc->lock);
                        dout("%s waiting on req %p tid %llu last_tid %llu\n",
                             __func__, req, req->r_tid, last_tid);
-                       wait_for_completion(&req->r_done_completion);
+                       wait_for_completion(&req->r_completion);
                        ceph_osdc_put_request(req);
                        goto again;
                }
@@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
 
        ceph_oid_copy(&lreq->t.base_oid, oid);
        ceph_oloc_copy(&lreq->t.base_oloc, oloc);
-       lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       lreq->t.flags = CEPH_OSD_FLAG_WRITE;
        lreq->mtime = CURRENT_TIME;
 
        lreq->reg_req = alloc_linger_request(lreq);
@@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 
        ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
        ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
-       req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       req->r_flags = CEPH_OSD_FLAG_WRITE;
        req->r_mtime = CURRENT_TIME;
        osd_req_op_watch_init(req, 0, lreq->linger_id,
                              CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
  * Execute an OSD class method on an object.
  *
  * @flags: CEPH_OSD_FLAG_*
- * @resp_len: out param for reply length
+ * @resp_len: in/out param for reply length
  */
 int ceph_osdc_call(struct ceph_osd_client *osdc,
                   struct ceph_object_id *oid,
@@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
        struct ceph_osd_request *req;
        int ret;
 
+       if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+               return -E2BIG;
+
        req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
        if (!req)
                return -ENOMEM;
@@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
                                                  0, false, false);
        if (resp_page)
                osd_req_op_cls_response_data_pages(req, 0, &resp_page,
-                                                  PAGE_SIZE, 0, false, false);
+                                                  *resp_len, 0, false, false);
 
        ceph_osdc_start_request(osdc, req, false);
        ret = ceph_osdc_wait_request(osdc, req);
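
With resp_len now in/out, callers must prime it with the capacity of the reply page, exactly as the ceph_cls_lock_info() change near the top of this diff does with reply_len = PAGE_SIZE. A hypothetical caller sketch (the example_ name is invented; the "lock"/"get_info" class/method strings mirror the cls_lock code, not a new API):

#include <linux/ceph/osd_client.h>

static int example_cls_call(struct ceph_osd_client *osdc,
			    struct ceph_object_id *oid,
			    struct ceph_object_locator *oloc,
			    struct page *req_page, size_t req_len)
{
	struct page *reply_page;
	size_t reply_len = PAGE_SIZE;	/* in: capacity of reply_page */
	int ret;

	reply_page = alloc_page(GFP_NOIO);
	if (!reply_page)
		return -ENOMEM;

	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
			     CEPH_OSD_FLAG_READ, req_page, req_len,
			     reply_page, &reply_len);
	/* out: on success, reply_len holds the bytes actually returned */

	__free_page(reply_page);
	return ret;
}
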
@@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
        int page_align = off & ~PAGE_MASK;
 
        req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
-                                   CEPH_OSD_OP_WRITE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
                                    snapc, truncate_seq, truncate_size,
                                    true);
        if (IS_ERR(req))
index d243688..6824c0e 100644 (file)
@@ -153,6 +153,32 @@ bad:
         return -EINVAL;
 }
 
+static void crush_finalize(struct crush_map *c)
+{
+       __s32 b;
+
+       /* Space for the array of pointers to per-bucket workspace */
+       c->working_size = sizeof(struct crush_work) +
+           c->max_buckets * sizeof(struct crush_work_bucket *);
+
+       for (b = 0; b < c->max_buckets; b++) {
+               if (!c->buckets[b])
+                       continue;
+
+               switch (c->buckets[b]->alg) {
+               default:
+                       /*
+                        * The base case, permutation variables and
+                        * the pointer to the permutation array.
+                        */
+                       c->working_size += sizeof(struct crush_work_bucket);
+                       break;
+               }
+               /* Every bucket has a permutation array. */
+               c->working_size += c->buckets[b]->size * sizeof(__u32);
+       }
+}
+
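
A concrete sizing example, assuming 8-byte pointers and the crush_work_bucket layout added by this series (two __u32s plus a padded pointer, 16 bytes): a map with two populated buckets of four items each ends up with working_size = 8 (struct crush_work) + 2 * 8 (the work[] pointer array) + 2 * 16 (per-bucket bookkeeping) + 2 * 4 * 4 (permutation arrays) = 88 bytes.
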
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
        struct crush_map *c;
@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
                b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
                if (b->items == NULL)
                        goto badmem;
-               b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-               if (b->perm == NULL)
-                       goto badmem;
-               b->perm_n = 0;
 
                ceph_decode_need(p, end, b->size*sizeof(u32), bad);
                for (j = 0; j < b->size; j++)
@@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
        dout("crush decode tunable chooseleaf_stable = %d\n",
             c->chooseleaf_stable);
 
+       crush_finalize(c);
+
 done:
        dout("crush_decode success\n");
        return c;
@@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
        map->pool_max = -1;
        map->pg_temp = RB_ROOT;
        map->primary_temp = RB_ROOT;
-       mutex_init(&map->crush_scratch_mutex);
+       mutex_init(&map->crush_workspace_mutex);
 
        return map;
 }
@@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
        kfree(map->osd_weight);
        kfree(map->osd_addr);
        kfree(map->osd_primary_affinity);
+       kfree(map->crush_workspace);
        kfree(map);
 }
 
@@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
        return 0;
 }
 
+static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
+{
+       void *workspace;
+       size_t work_size;
+
+       if (IS_ERR(crush))
+               return PTR_ERR(crush);
+
+       work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
+       dout("%s work_size %zu bytes\n", __func__, work_size);
+       workspace = kmalloc(work_size, GFP_NOIO);
+       if (!workspace) {
+               crush_destroy(crush);
+               return -ENOMEM;
+       }
+       crush_init_workspace(crush, workspace);
+
+       if (map->crush)
+               crush_destroy(map->crush);
+       kfree(map->crush_workspace);
+       map->crush = crush;
+       map->crush_workspace = workspace;
+       return 0;
+}
+
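
Sizing the workspace with CEPH_PG_MAX_SIZE up front works because do_crush() below asserts result_max <= CEPH_PG_MAX_SIZE, so this single buffer, serialized by crush_workspace_mutex, can back every lookup against the map.
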
 #define OSDMAP_WRAPPER_COMPAT_VER      7
 #define OSDMAP_CLIENT_DATA_COMPAT_VER  1
 
@@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 
        /* crush */
        ceph_decode_32_safe(p, end, len, e_inval);
-       map->crush = crush_decode(*p, min(*p + len, end));
-       if (IS_ERR(map->crush)) {
-               err = PTR_ERR(map->crush);
-               map->crush = NULL;
+       err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
+       if (err)
                goto bad;
-       }
-       *p += len;
 
        /* ignore the rest */
        *p = end;
@@ -1375,7 +1421,6 @@ e_inval:
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                                             struct ceph_osdmap *map)
 {
-       struct crush_map *newcrush = NULL;
        struct ceph_fsid fsid;
        u32 epoch = 0;
        struct ceph_timespec modified;
@@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        /* new crush? */
        ceph_decode_32_safe(p, end, len, e_inval);
        if (len > 0) {
-               newcrush = crush_decode(*p, min(*p+len, end));
-               if (IS_ERR(newcrush)) {
-                       err = PTR_ERR(newcrush);
-                       newcrush = NULL;
+               err = osdmap_set_crush(map,
+                                      crush_decode(*p, min(*p + len, end)));
+               if (err)
                        goto bad;
-               }
                *p += len;
        }
 
@@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
        map->epoch++;
        map->modified = modified;
-       if (newcrush) {
-               if (map->crush)
-                       crush_destroy(map->crush);
-               map->crush = newcrush;
-               newcrush = NULL;
-       }
 
        /* new_pools */
        err = decode_new_pools(p, end, map);
@@ -1505,8 +1542,6 @@ bad:
        print_hex_dump(KERN_DEBUG, "osdmap: ",
                       DUMP_PREFIX_OFFSET, 16, 1,
                       start, end - start, true);
-       if (newcrush)
-               crush_destroy(newcrush);
        return ERR_PTR(err);
 }
 
@@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
        BUG_ON(result_max > CEPH_PG_MAX_SIZE);
 
-       mutex_lock(&map->crush_scratch_mutex);
+       mutex_lock(&map->crush_workspace_mutex);
        r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-                         weight, weight_max, map->crush_scratch_ary);
-       mutex_unlock(&map->crush_scratch_mutex);
+                         weight, weight_max, map->crush_workspace);
+       mutex_unlock(&map->crush_workspace_mutex);
 
        return r;
 }
@@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
                return;
        }
 
-       len = do_crush(osdmap, ruleno, pps, raw->osds,
-                      min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
+       if (pi->size > ARRAY_SIZE(raw->osds)) {
+               pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
+                      pi->id, pi->crush_ruleset, pi->type, pi->size,
+                      ARRAY_SIZE(raw->osds));
+               return;
+       }
+
+       len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
                       osdmap->osd_weight, osdmap->max_osd);
        if (len < 0) {
                pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
index 154683f..705414e 100644 (file)
@@ -18,8 +18,6 @@
  * 02110-1301, USA.
  */
 
-#include <stddef.h>
-
 #include <linux/types.h>
 #include <linux/export.h>
 #include <linux/ceph/libceph.h>
index e0bd013..eedba76 100644 (file)
@@ -279,7 +279,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
        pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
 
        if (size > mtu) {
-               pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+               pr_debug("size = %zu, mtu = %u\n", size, mtu);
                err = -EMSGSIZE;
                goto out_dev;
        }
@@ -645,7 +645,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
        pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
 
        if (size > mtu) {
-               pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+               pr_debug("size = %zu, mtu = %u\n", size, mtu);
                err = -EMSGSIZE;
                goto out_dev;
        }
index 7db2ad2..b39a791 100644 (file)
@@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
        int ret, no_addr;
        struct fib_result res;
        struct flowi4 fl4;
-       struct net *net;
+       struct net *net = dev_net(dev);
        bool dev_match;
 
        fl4.flowi4_oif = 0;
@@ -332,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_tun_key.tun_id = 0;
        fl4.flowi4_flags = 0;
+       fl4.flowi4_uid = sock_net_uid(net, NULL);
 
        no_addr = idev->ifa_list == NULL;
 
@@ -339,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 
        trace_fib_validate_source(dev, &fl4);
 
-       net = dev_net(dev);
        if (fib_lookup(net, &fl4, &res, 0))
                goto last_resort;
        if (res.type != RTN_UNICAST &&
            (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
                goto e_inval;
-       if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+       if (!rpf && !fib_num_tclassid_users(net) &&
            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
                goto last_resort;
        fib_combine_itag(itag, &res);
index d8cea21..2f0d823 100644 (file)
@@ -2388,7 +2388,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 
        seq_printf(seq,
                   "Basic info: size of leaf:"
-                  " %Zd bytes, size of tnode: %Zd bytes.\n",
+                  " %zd bytes, size of tnode: %zd bytes.\n",
                   LEAF_SIZE, TNODE_SIZE(0));
 
        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
index beacd02..c0317c9 100644 (file)
@@ -2596,7 +2596,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
                const char *name =  vif->dev ? vif->dev->name : "none";
 
                seq_printf(seq,
-                          "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
+                          "%2zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
                           vif - mrt->vif_table,
                           name, vif->bytes_in, vif->pkt_in,
                           vif->bytes_out, vif->pkt_out,
index fcfd071..bc1486f 100644 (file)
@@ -235,7 +235,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
        }
 
        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
-               pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
+               pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
                         *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }
index f6f7133..2f3895d 100644 (file)
@@ -69,7 +69,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
 
        ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
        if (ap == NULL) {
-               nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+               nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
                               skb->len - sizeof(_arph));
                return;
        }
index cb494a5..8471dd1 100644 (file)
@@ -1876,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        fl4.flowi4_flags = 0;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
+       fl4.flowi4_uid = sock_net_uid(net, NULL);
        err = fib_lookup(net, &fl4, &res, 0);
        if (err != 0) {
                if (!IN_DEV_FORWARD(in_dev))
@@ -2008,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
        int res;
 
+       tos &= IPTOS_RT_MASK;
        rcu_read_lock();
 
        /* Multicast recognition logic is moved from route cache to here.
index c795fee..644ba59 100644 (file)
@@ -693,6 +693,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
        u->link = p->link;
        u->i_key = p->i_key;
        u->o_key = p->o_key;
+       if (u->i_key)
+               u->i_flags |= GRE_KEY;
+       if (u->o_key)
+               u->o_flags |= GRE_KEY;
        u->proto = p->proto;
 
        memcpy(u->name, p->name, sizeof(u->name));
index babaf3e..6ba6c90 100644 (file)
@@ -1666,6 +1666,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
        struct net *net = sock_net(sk);
        struct mr6_table *mrt;
 
+       if (sk->sk_type != SOCK_RAW ||
+           inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+               return -EOPNOTSUPP;
+
        mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
        if (!mrt)
                return -ENOENT;
@@ -1677,9 +1681,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 
        switch (optname) {
        case MRT6_INIT:
-               if (sk->sk_type != SOCK_RAW ||
-                   inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
-                       return -EOPNOTSUPP;
                if (optlen < sizeof(int))
                        return -EINVAL;
 
@@ -1815,6 +1816,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
        struct net *net = sock_net(sk);
        struct mr6_table *mrt;
 
+       if (sk->sk_type != SOCK_RAW ||
+           inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+               return -EOPNOTSUPP;
+
        mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
        if (!mrt)
                return -ENOENT;
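
Hoisting the socket-type check to the top of both the setsockopt and getsockopt paths matters because ip6mr_get_table() immediately dereferences raw6_sk(sk)->ip6mr_table, which is only meaningful for raw ICMPv6 sockets; validating first keeps other socket types from being reinterpreted as raw sockets.
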
index 055c51b..97c7242 100644 (file)
@@ -64,7 +64,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
        nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
 
        /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
-       nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+       nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
               ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
               (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
               ih->hop_limit,
index 1215693..35dbf3d 100644 (file)
@@ -51,7 +51,7 @@ irnet_ctrl_write(irnet_socket *       ap,
   char *       next;           /* Next command to process */
   int          length;         /* Length of current command */
 
-  DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+  DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
 
   /* Check for overflow... */
   DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM,
@@ -66,7 +66,7 @@ irnet_ctrl_write(irnet_socket *       ap,
 
   /* Safe terminate the string */
   command[count] = '\0';
-  DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n",
+  DEBUG(CTRL_INFO, "Command line received is ``%s'' (%zd).\n",
        command, count);
 
   /* Check every commands in the command line */
@@ -285,7 +285,7 @@ irnet_ctrl_read(irnet_socket *      ap,
   char         event[75];
   ssize_t      ret = 0;
 
-  DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+  DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
 
 #ifdef INITIAL_DISCOVERY
   /* Check if we have read the log */
@@ -328,7 +328,7 @@ irnet_ctrl_read(irnet_socket *      ap,
   if(ret != 0)
     {
       /* No, return the error code */
-      DEXIT(CTRL_TRACE, " - ret %Zd\n", ret);
+      DEXIT(CTRL_TRACE, " - ret %zd\n", ret);
       return ret;
     }
 
@@ -568,7 +568,7 @@ dev_irnet_write(struct file *       file,
 {
   irnet_socket *       ap = file->private_data;
 
-  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
        file, ap, count);
   DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
 
@@ -592,7 +592,7 @@ dev_irnet_read(struct file *        file,
 {
   irnet_socket *       ap = file->private_data;
 
-  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
        file, ap, count);
   DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
 
index b58000e..8adab63 100644 (file)
@@ -1058,10 +1058,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
 
        /* Debug */
        if (session->send_seq)
-               l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes, ns=%u\n",
+               l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n",
                         session->name, data_len, session->ns - 1);
        else
-               l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes\n",
+               l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n",
                         session->name, data_len);
 
        if (session->debug & L2TP_MSG_DATA) {
index c597120..d25038c 100644 (file)
@@ -388,7 +388,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
 drop:
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
        kfree_skb(skb);
-       return -1;
+       return 0;
 }
 
 /* Userspace will call sendmsg() on the tunnel socket to send L2TP
index c28b0af..6e7b6a0 100644 (file)
@@ -681,7 +681,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
                   2 + /* NULL SSID */
                   /* Channel Switch Announcement */
                   2 + sizeof(struct ieee80211_channel_sw_ie) +
-                  /* Mesh Channel Swith Parameters */
+                  /* Mesh Channel Switch Parameters */
                   2 + sizeof(struct ieee80211_mesh_chansw_params_ie) +
                   2 + 8 + /* supported rates */
                   2 + 3; /* DS params */
index a3af6e1..0dd7c35 100644 (file)
@@ -462,9 +462,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
        unsigned long flags;
 
        spin_lock_irqsave(&local->ack_status_lock, flags);
-       skb = idr_find(&local->ack_status_frames, info->ack_frame_id);
-       if (skb)
-               idr_remove(&local->ack_status_frames, info->ack_frame_id);
+       skb = idr_remove(&local->ack_status_frames, info->ack_frame_id);
        spin_unlock_irqrestore(&local->ack_status_lock, flags);
 
        if (!skb)
index 096a451..e6a2753 100644 (file)
@@ -1429,7 +1429,7 @@ int __init ip_vs_conn_init(void)
                "(size=%d, memory=%ldKbytes)\n",
                ip_vs_conn_tab_size,
                (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
-       IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+       IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
                  sizeof(struct ip_vs_conn));
 
        for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
index 6be5c53..75f798f 100644 (file)
@@ -163,7 +163,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
                return -ENOMEM;
 
        svc->sched_data = s;
-       IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+       IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for "
                  "current service\n",
                  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
 
@@ -183,7 +183,7 @@ static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
 
        /* release the table itself */
        kfree_rcu(s, rcu_head);
-       IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+       IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n",
                  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
 }
 
index cccf4d6..5824927 100644 (file)
@@ -356,7 +356,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
                return -ENOMEM;
 
        svc->sched_data = tbl;
-       IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+       IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
                  "current service\n", sizeof(*tbl));
 
        /*
@@ -393,7 +393,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 
        /* release the table itself */
        kfree_rcu(tbl, rcu_head);
-       IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+       IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n",
                  sizeof(*tbl));
 }
 
index 796d70e..703f118 100644 (file)
@@ -519,7 +519,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
                return -ENOMEM;
 
        svc->sched_data = tbl;
-       IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+       IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for "
                  "current service\n", sizeof(*tbl));
 
        /*
@@ -556,7 +556,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 
        /* release the table itself */
        kfree_rcu(tbl, rcu_head);
-       IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+       IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n",
                  sizeof(*tbl));
 }
 
index 1e373a5..16aaac6 100644 (file)
@@ -239,7 +239,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
                return -ENOMEM;
 
        svc->sched_data = s;
-       IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+       IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for "
                  "current service\n",
                  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 
@@ -259,7 +259,7 @@ static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
 
        /* release the table itself */
        kfree_rcu(s, rcu_head);
-       IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+       IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n",
                  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 }
 
index 9350530..b03c280 100644 (file)
@@ -1791,7 +1791,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
        u16 mtu, min_mtu;
 
        IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
-       IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+       IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
                  sizeof(struct ip_vs_sync_conn_v0));
 
        if (!ipvs->sync_state) {
index e19a697..4b2e1fb 100644 (file)
@@ -410,7 +410,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
        struct net *net = nf_ct_exp_net(expect);
        struct hlist_node *next;
        unsigned int h;
-       int ret = 1;
+       int ret = 0;
 
        if (!master_help) {
                ret = -ESHUTDOWN;
@@ -460,14 +460,14 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
 
        spin_lock_bh(&nf_conntrack_expect_lock);
        ret = __nf_ct_expect_check(expect);
-       if (ret <= 0)
+       if (ret < 0)
                goto out;
 
        nf_ct_expect_insert(expect);
 
        spin_unlock_bh(&nf_conntrack_expect_lock);
        nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
-       return ret;
+       return 0;
 out:
        spin_unlock_bh(&nf_conntrack_expect_lock);
        return ret;
index e3ed200..4aecef4 100644 (file)
@@ -300,7 +300,7 @@ static int find_pattern(const char *data, size_t dlen,
 {
        size_t i = plen;
 
-       pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
+       pr_debug("find_pattern `%s': dlen = %zu\n", pattern, dlen);
 
        if (dlen <= plen) {
                /* Short packet: try for partial? */
index 3b79f34..de87823 100644 (file)
@@ -48,7 +48,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
        if (helper == NULL)
                return NF_DROP;
 
-       /* This is an user-space helper not yet configured, skip. */
+       /* This is a user-space helper not yet configured, skip. */
        if ((helper->flags &
            (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) ==
             NF_CT_HELPER_F_USERSPACE)
index c6b8022..bf548a7 100644 (file)
@@ -528,6 +528,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
                if (!nft_ct_tmpl_alloc_pcpu())
                        return -ENOMEM;
                nft_ct_pcpu_template_refcnt++;
+               len = sizeof(u16);
                break;
 #endif
        default:
index 97f9649..152d226 100644 (file)
@@ -258,7 +258,7 @@ static int nft_bitmap_init(const struct nft_set *set,
 {
        struct nft_bitmap *priv = nft_set_priv(set);
 
-       priv->bitmap_size = nft_bitmap_total_size(set->klen);
+       priv->bitmap_size = nft_bitmap_size(set->klen);
 
        return 0;
 }
index 016db6b..14857af 100644 (file)
@@ -667,7 +667,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
            COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
                return -EINVAL;
 
-       /* compat_xt_entry match has less strict aligment requirements,
+       /* compat_xt_entry match has less strict alignment requirements,
         * otherwise they are identical.  In case of padding differences
         * we need to add compat version of xt_check_entry_match.
         */
index 8d70884..91fe46f 100644 (file)
@@ -111,8 +111,7 @@ static void rds_ib_dev_free(struct work_struct *work)
                kfree(i_ipaddr);
        }
 
-       if (rds_ibdev->vector_load)
-               kfree(rds_ibdev->vector_load);
+       kfree(rds_ibdev->vector_load);
 
        kfree(rds_ibdev);
 }
index 57bb523..5438f67 100644 (file)
@@ -641,12 +641,12 @@ static int rds_tcp_init(void)
        ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
        if (ret) {
                pr_warn("could not register rds_tcp_dev_notifier\n");
-               goto out;
+               goto out_slab;
        }
 
        ret = register_pernet_subsys(&rds_tcp_net_ops);
        if (ret)
-               goto out_slab;
+               goto out_notifier;
 
        ret = rds_tcp_recv_init();
        if (ret)
@@ -664,9 +664,10 @@ out_recv:
        rds_tcp_recv_exit();
 out_pernet:
        unregister_pernet_subsys(&rds_tcp_net_ops);
-out_slab:
+out_notifier:
        if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
                pr_warn("could not unregister rds_tcp_dev_notifier\n");
+out_slab:
        kmem_cache_destroy(rds_tcp_conn_slab);
 out:
        return ret;
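
The relabelled exit path restores the usual goto-unwind discipline: cleanup labels are listed in reverse order of the setup steps, so a failure at step N falls through the undo actions for steps N-1 down to 1 and nothing else. The convention in miniature (hypothetical helpers, mirroring the rds_tcp ordering above):

	static int example_init(void)
	{
		int ret;

		ret = create_slab();		/* step 1 */
		if (ret)
			goto out;
		ret = register_notifier();	/* step 2 */
		if (ret)
			goto out_slab;
		ret = register_pernet();	/* step 3 */
		if (ret)
			goto out_notifier;
		return 0;

	out_notifier:
		unregister_notifier();		/* undo step 2 */
	out_slab:
		destroy_slab();			/* undo step 1 */
	out:
		return ret;
	}
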
index 18c737a..0a4e284 100644 (file)
@@ -1065,7 +1065,7 @@ static long rxrpc_read(const struct key *key,
 
                switch (token->security_index) {
                case RXRPC_SECURITY_RXKAD:
-                       toksize += 8 * 4;       /* viceid, kvno, key*2, begin,
+                       toksize += 9 * 4;       /* viceid, kvno, key*2 + len, begin,
                                                 * end, primary, tktlen */
                        toksize += RND(token->kad->ticket_len);
                        break;
index c29362d..f3a688e 100644 (file)
@@ -320,8 +320,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 
        /* Barriers against rxrpc_input_data(). */
        hard_ack = call->rx_hard_ack;
-       top = smp_load_acquire(&call->rx_top);
-       for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
+       seq = hard_ack + 1;
+       while (top = smp_load_acquire(&call->rx_top),
+              before_eq(seq, top)
+              ) {
                ix = seq & RXRPC_RXTX_BUFF_MASK;
                skb = call->rxtx_buffer[ix];
                if (!skb) {
@@ -394,6 +396,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
                        ret = 1;
                        goto out;
                }
+
+               seq++;
        }
 
 out:
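
Caching rx_top once before the loop could strand packets that arrive while earlier ring slots are being copied out; moving the smp_load_acquire() into the loop condition re-reads the bound on every pass, pairing with the producer's release store so newly published slots are walked too. The pattern reduced to essentials (hypothetical ring helpers):

	/* Producer side publishes slot contents before advancing the bound:
	 *	ring->slot[seq & RING_MASK] = skb;
	 *	smp_store_release(&ring->top, seq);
	 */
	seq = hard_ack + 1;
	while (top = smp_load_acquire(&ring->top), seq_before_eq(seq, top)) {
		consume(ring->slot[seq & RING_MASK]);
		seq++;
	}
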
index f219ff3..b70aa57 100644 (file)
@@ -613,8 +613,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
                        goto err_mod;
                }
 
-               err = nla_memdup_cookie(a, tb);
-               if (err < 0) {
+               if (nla_memdup_cookie(a, tb) < 0) {
+                       err = -ENOMEM;
                        tcf_hash_release(a, bind);
                        goto err_mod;
                }
@@ -859,10 +859,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
                goto out_module_put;
 
        err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
-       if (err < 0)
+       if (err <= 0)
                goto out_module_put;
-       if (err == 0)
-               goto noflush_out;
 
        nla_nest_end(skb, nest);
 
@@ -879,7 +877,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 out_module_put:
        module_put(ops->owner);
 err_out:
-noflush_out:
        kfree_skb(skb);
        return err;
 }
index 85406d5..71ce6b9 100644 (file)
@@ -177,7 +177,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
 {
        sctp_xmit_t retval;
 
-       pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__,
+       pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__,
                 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
 
        switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
index 8227bbb..1b6d457 100644 (file)
@@ -199,6 +199,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
                              sctp_scope_t scope, gfp_t gfp, int copy_flags)
 {
        struct sctp_sockaddr_entry *addr;
+       union sctp_addr laddr;
        int error = 0;
 
        rcu_read_lock();
@@ -220,7 +221,10 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
                     !(copy_flags & SCTP_ADDR6_PEERSUPP)))
                        continue;
 
-               if (sctp_bind_addr_state(bp, &addr->a) != -1)
+               laddr = addr->a;
+               /* also works for setting ipv6 address port */
+               laddr.v4.sin_port = htons(bp->port);
+               if (sctp_bind_addr_state(bp, &laddr) != -1)
                        continue;
 
                error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
index b532148..465a9c8 100644 (file)
@@ -4862,6 +4862,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
        if (!asoc)
                return -EINVAL;
 
+       /* If there is a thread waiting on more sndbuf space for
+        * sending on this asoc, it cannot be peeled.
+        */
+       if (waitqueue_active(&asoc->wait))
+               return -EBUSY;
+
        /* An association cannot be branched off from an already peeled-off
         * socket, nor is this supported for tcp style sockets.
         */
@@ -7599,8 +7605,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
                 */
                release_sock(sk);
                current_timeo = schedule_timeout(current_timeo);
-               if (sk != asoc->base.sk)
-                       goto do_error;
                lock_sock(sk);
 
                *timeo_p = current_timeo;
index 5b63ceb..3379668 100644 (file)
@@ -643,9 +643,7 @@ void sctp_transport_reset(struct sctp_transport *t)
        t->srtt = 0;
        t->rttvar = 0;
 
-       /* Reset these additional varibles so that we have a clean
-        * slate.
-        */
+       /* Reset these additional variables so that we have a clean slate. */
        t->partial_bytes_acked = 0;
        t->flight_size = 0;
        t->error_count = 0;
index cdeb1d8..4f16953 100644 (file)
@@ -763,7 +763,7 @@ err_put_ctx:
 err:
        kfree(buf);
 out:
-       dprintk("RPC:       %s returning %Zd\n", __func__, err);
+       dprintk("RPC:       %s returning %zd\n", __func__, err);
        return err;
 }
 
index 1530825..a54a7a3 100644 (file)
@@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
        case RPC_GSS_PROC_DESTROY:
                if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
                        goto auth_err;
-               rsci->h.expiry_time = seconds_since_boot();
-               set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+               /* Delete the entry from the cache_list and call cache_put */
+               sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
                if (resv->iov_len + 4 > PAGE_SIZE)
                        goto drop;
                svc_putnl(resv, RPC_SUCCESS);
index f39e3e1..d8639da 100644 (file)
@@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
        cache_purge(cd);
        spin_lock(&cache_list_lock);
        write_lock(&cd->hash_lock);
-       if (cd->entries) {
-               write_unlock(&cd->hash_lock);
-               spin_unlock(&cache_list_lock);
-               goto out;
-       }
        if (current_detail == cd)
                current_detail = NULL;
        list_del_init(&cd->others);
@@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
                /* module must be being unloaded so it's safe to kill the worker */
                cancel_delayed_work_sync(&cache_cleaner);
        }
-       return;
-out:
-       printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
 }
 EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
 
@@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush);
 
 void cache_purge(struct cache_detail *detail)
 {
-       time_t now = seconds_since_boot();
-       if (detail->flush_time >= now)
-               now = detail->flush_time + 1;
-       /* 'now' is the maximum value any 'last_refresh' can have */
-       detail->flush_time = now;
-       detail->nextcheck = seconds_since_boot();
-       cache_flush();
+       struct cache_head *ch = NULL;
+       struct hlist_head *head = NULL;
+       struct hlist_node *tmp = NULL;
+       int i = 0;
+
+       write_lock(&detail->hash_lock);
+       if (!detail->entries) {
+               write_unlock(&detail->hash_lock);
+               return;
+       }
+
+       dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
+       for (i = 0; i < detail->hash_size; i++) {
+               head = &detail->hash_table[i];
+               hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
+                       hlist_del_init(&ch->cache_list);
+                       detail->entries--;
+
+                       set_bit(CACHE_CLEANED, &ch->flags);
+                       write_unlock(&detail->hash_lock);
+                       cache_fresh_unlocked(ch, detail);
+                       cache_put(ch, detail);
+                       write_lock(&detail->hash_lock);
+               }
+       }
+       write_unlock(&detail->hash_lock);
 }
 EXPORT_SYMBOL_GPL(cache_purge);
 
@@ -1855,3 +1866,15 @@ void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
 
+void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
+{
+       write_lock(&cd->hash_lock);
+       if (!hlist_unhashed(&h->cache_list)) {
+               hlist_del_init(&h->cache_list);
+               cd->entries--;
+               write_unlock(&cd->hash_lock);
+               cache_put(h, cd);
+       } else
+               write_unlock(&cd->hash_lock);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
index 75f290b..b94efd9 100644 (file)
@@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
                for (i = 0; i < progp->pg_nvers; i++) {
                        if (progp->pg_vers[i] == NULL)
                                continue;
-                       if (progp->pg_vers[i]->vs_hidden == 0)
+                       if (!progp->pg_vers[i]->vs_hidden)
                                return 1;
                }
        }
@@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net,
                        if (vers->vs_hidden)
                                continue;
 
+                       /*
+                        * Don't register a UDP port if we need congestion
+                        * control.
+                        */
+                       if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
+                               continue;
+
                        error = __svc_register(net, progp->pg_name, progp->pg_prog,
                                                i, family, proto, port);
 
@@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
          !(versp = progp->pg_vers[vers]))
                goto err_bad_vers;
 
+       /*
+        * Some protocol versions (namely NFSv4) require some form of
+        * congestion control.  (See RFC 7530 section 3.1 paragraph 2)
+        * In other words, UDP is not allowed. We mark those when setting
+        * up the svc_xprt, and verify that here.
+        *
+        * The spec is not very clear about what error should be returned
+        * when someone tries to access a server that is listening on UDP
+        * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+        * fit.
+        */
+       if (versp->vs_need_cong_ctrl &&
+           !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+               goto err_bad_vers;
+
        procp = versp->vs_proc + proc;
        if (proc >= versp->vs_nproc || !procp->pc_func)
                goto err_bad_proc;
@@ -1260,7 +1282,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
        return 0;
 
 err_short_len:
-       svc_printk(rqstp, "short len %Zd, dropping request\n",
+       svc_printk(rqstp, "short len %zd, dropping request\n",
                        argv->iov_len);
        goto close;
 
index de066ac..8931e33 100644 (file)
@@ -278,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
                               rqstp->rq_respages[0], tailoff);
 
 out:
-       dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
+       dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
                svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
                xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
 
@@ -346,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
        if (len == buflen)
                set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
-       dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+       dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n",
                svsk, iov[0].iov_base, iov[0].iov_len, len);
        return len;
 }
@@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
        svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
                      &svsk->sk_xprt, serv);
        set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+       set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
        if (sk->sk_state == TCP_LISTEN) {
                dprintk("setting up TCP socket for listening\n");
                set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
index cb1e48e..ff1df40 100644 (file)
@@ -201,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
 {
        struct rpc_xprt *xprt = rqst->rq_xprt;
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-       struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+       __be32 *p;
        int rc;
 
        /* Space in the send buffer for an RPC/RDMA header is reserved
         * via xprt->tsh_size.
         */
-       headerp->rm_xid = rqst->rq_xid;
-       headerp->rm_vers = rpcrdma_version;
-       headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
-       headerp->rm_type = rdma_msg;
-       headerp->rm_body.rm_chunks[0] = xdr_zero;
-       headerp->rm_body.rm_chunks[1] = xdr_zero;
-       headerp->rm_body.rm_chunks[2] = xdr_zero;
+       p = rqst->rq_buffer;
+       *p++ = rqst->rq_xid;
+       *p++ = rpcrdma_version;
+       *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+       *p++ = rdma_msg;
+       *p++ = xdr_zero;
+       *p++ = xdr_zero;
+       *p   = xdr_zero;
 
 #ifdef SVCRDMA_BACKCHANNEL_DEBUG
        pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
index 0ba9887..1c4aabf 100644 (file)
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2016 Oracle. All rights reserved.
  * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
 
 #define RPCDBG_FACILITY        RPCDBG_SVCXPRT
 
-/*
- * Decodes a read chunk list. The expected format is as follows:
- *    descrim  : xdr_one
- *    position : __be32 offset into XDR stream
- *    handle   : __be32 RKEY
- *    . . .
- *  end-of-list: xdr_zero
- */
-static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_read_list(__be32 *p, __be32 *end)
 {
-       struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+       __be32 *next;
 
-       while (ch->rc_discrim != xdr_zero) {
-               if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
-                   (unsigned long)vaend) {
-                       dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+       while (*p++ != xdr_zero) {
+               next = p + rpcrdma_readchunk_maxsz - 1;
+               if (next > end)
                        return NULL;
-               }
-               ch++;
+               p = next;
        }
-       return &ch->rc_position;
+       return p;
 }
 
-/*
- * Decodes a write chunk list. The expected format is as follows:
- *    descrim  : xdr_one
- *    nchunks  : <count>
- *       handle   : __be32 RKEY           ---+
- *       length   : __be32 <len of segment>  |
- *       offset   : remove va                + <count>
- *       . . .                               |
- *                                        ---+
- */
-static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_write_list(__be32 *p, __be32 *end)
 {
-       unsigned long start, end;
-       int nchunks;
-
-       struct rpcrdma_write_array *ary =
-               (struct rpcrdma_write_array *)va;
+       __be32 *next;
 
-       /* Check for not write-array */
-       if (ary->wc_discrim == xdr_zero)
-               return &ary->wc_nchunks;
-
-       if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
-           (unsigned long)vaend) {
-               dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
-               return NULL;
-       }
-       nchunks = be32_to_cpu(ary->wc_nchunks);
-
-       start = (unsigned long)&ary->wc_array[0];
-       end = (unsigned long)vaend;
-       if (nchunks < 0 ||
-           nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
-           (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
-               dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
-                       ary, nchunks, vaend);
-               return NULL;
+       while (*p++ != xdr_zero) {
+               next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+               if (next > end)
+                       return NULL;
+               p = next;
        }
-       /*
-        * rs_length is the 2nd 4B field in wc_target and taking its
-        * address skips the list terminator
-        */
-       return &ary->wc_array[nchunks].wc_target.rs_length;
+       return p;
 }
 
-static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end)
 {
-       unsigned long start, end;
-       int nchunks;
-       struct rpcrdma_write_array *ary =
-               (struct rpcrdma_write_array *)va;
-
-       /* Check for no reply-array */
-       if (ary->wc_discrim == xdr_zero)
-               return &ary->wc_nchunks;
-
-       if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
-           (unsigned long)vaend) {
-               dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
-               return NULL;
-       }
-       nchunks = be32_to_cpu(ary->wc_nchunks);
-
-       start = (unsigned long)&ary->wc_array[0];
-       end = (unsigned long)vaend;
-       if (nchunks < 0 ||
-           nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
-           (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
-               dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
-                       ary, nchunks, vaend);
-               return NULL;
+       __be32 *next;
+
+       if (*p++ != xdr_zero) {
+               next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+               if (next > end)
+                       return NULL;
+               p = next;
        }
-       return (__be32 *)&ary->wc_array[nchunks];
+       return p;
 }
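
The three xdr_check_* helpers above share one skeleton: advance a cursor through the header, compute where the next item would end, and reject the header if that position passes the end of the receive buffer, so no on-the-wire count is trusted before a bounds check. The common shape, with the per-item size in 32-bit XDR words as a parameter (a sketch, not the exact kernel helpers):

	static __be32 *xdr_check_counted_list(__be32 *p, __be32 *end,
					      unsigned int segment_words)
	{
		__be32 *next;

		while (*p++ != xdr_zero) {	/* discriminator: another entry */
			/* one count word plus 'count' fixed-size segments */
			next = p + 1 + be32_to_cpup(p) * segment_words;
			if (next > end)
				return NULL;	/* would overrun the buffer */
			p = next;
		}
		return p;			/* first word past the terminator */
	}
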
 
 /**
@@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
  */
 int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
 {
-       struct rpcrdma_msg *rmsgp;
-       __be32 *va, *vaend;
-       unsigned int len;
-       u32 hdr_len;
+       __be32 *p, *end, *rdma_argp;
+       unsigned int hdr_len;
 
        /* Verify that there are enough bytes for header + something */
-       if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
-               dprintk("svcrdma: header too short = %d\n",
-                       rq_arg->len);
-               return -EINVAL;
-       }
+       if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
+               goto out_short;
 
-       rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
-       if (rmsgp->rm_vers != rpcrdma_version) {
-               dprintk("%s: bad version %u\n", __func__,
-                       be32_to_cpu(rmsgp->rm_vers));
-               return -EPROTONOSUPPORT;
-       }
+       rdma_argp = rq_arg->head[0].iov_base;
+       if (*(rdma_argp + 1) != rpcrdma_version)
+               goto out_version;
 
-       switch (be32_to_cpu(rmsgp->rm_type)) {
-       case RDMA_MSG:
-       case RDMA_NOMSG:
+       switch (*(rdma_argp + 3)) {
+       case rdma_msg:
+       case rdma_nomsg:
                break;
 
-       case RDMA_DONE:
-               /* Just drop it */
-               dprintk("svcrdma: dropping RDMA_DONE message\n");
-               return 0;
-
-       case RDMA_ERROR:
-               /* Possible if this is a backchannel reply.
-                * XXX: We should cancel this XID, though.
-                */
-               dprintk("svcrdma: dropping RDMA_ERROR message\n");
-               return 0;
-
-       case RDMA_MSGP:
-               /* Pull in the extra for the padded case, bump our pointer */
-               rmsgp->rm_body.rm_padded.rm_align =
-                       be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
-               rmsgp->rm_body.rm_padded.rm_thresh =
-                       be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
-
-               va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
-               rq_arg->head[0].iov_base = va;
-               len = (u32)((unsigned long)va - (unsigned long)rmsgp);
-               rq_arg->head[0].iov_len -= len;
-               if (len > rq_arg->len)
-                       return -EINVAL;
-               return len;
-       default:
-               dprintk("svcrdma: bad rdma procedure (%u)\n",
-                       be32_to_cpu(rmsgp->rm_type));
-               return -EINVAL;
-       }
+       case rdma_done:
+               goto out_drop;
 
-       /* The chunk list may contain either a read chunk list or a write
-        * chunk list and a reply chunk list.
-        */
-       va = &rmsgp->rm_body.rm_chunks[0];
-       vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
-       va = decode_read_list(va, vaend);
-       if (!va) {
-               dprintk("svcrdma: failed to decode read list\n");
-               return -EINVAL;
-       }
-       va = decode_write_list(va, vaend);
-       if (!va) {
-               dprintk("svcrdma: failed to decode write list\n");
-               return -EINVAL;
-       }
-       va = decode_reply_array(va, vaend);
-       if (!va) {
-               dprintk("svcrdma: failed to decode reply chunk\n");
-               return -EINVAL;
+       case rdma_error:
+               goto out_drop;
+
+       default:
+               goto out_proc;
        }
 
-       rq_arg->head[0].iov_base = va;
-       hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+       end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
+       p = xdr_check_read_list(rdma_argp + 4, end);
+       if (!p)
+               goto out_inval;
+       p = xdr_check_write_list(p, end);
+       if (!p)
+               goto out_inval;
+       p = xdr_check_reply_chunk(p, end);
+       if (!p)
+               goto out_inval;
+       if (p > end)
+               goto out_inval;
+
+       rq_arg->head[0].iov_base = p;
+       hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
        rq_arg->head[0].iov_len -= hdr_len;
        return hdr_len;
+
+out_short:
+       dprintk("svcrdma: header too short = %d\n", rq_arg->len);
+       return -EINVAL;
+
+out_version:
+       dprintk("svcrdma: bad xprt version: %u\n",
+               be32_to_cpup(rdma_argp + 1));
+       return -EPROTONOSUPPORT;
+
+out_drop:
+       dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n");
+       return 0;
+
+out_proc:
+       dprintk("svcrdma: bad rdma procedure (%u)\n",
+               be32_to_cpup(rdma_argp + 3));
+       return -EINVAL;
+
+out_inval:
+       dprintk("svcrdma: failed to parse transport header\n");
+       return -EINVAL;
 }
 
 int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
@@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
 
        *va++ = rmsgp->rm_xid;
        *va++ = rmsgp->rm_vers;
-       *va++ = cpu_to_be32(xprt->sc_max_requests);
+       *va++ = xprt->sc_fc_credits;
        *va++ = rdma_error;
        *va++ = cpu_to_be32(err);
        if (err == ERR_VERS) {
@@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
        return (int)((unsigned long)va - (unsigned long)startp);
 }
 
-int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+/**
+ * svc_rdma_xdr_get_reply_hdr_len - Get length of Reply transport header
+ * @rdma_resp: buffer containing Reply transport header
+ *
+ * Returns length of transport header, in bytes.
+ */
+unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
 {
-       struct rpcrdma_write_array *wr_ary;
+       unsigned int nsegs;
+       __be32 *p;
 
-       /* There is no read-list in a reply */
+       p = rdma_resp;
 
-       /* skip write list */
-       wr_ary = (struct rpcrdma_write_array *)
-               &rmsgp->rm_body.rm_chunks[1];
-       if (wr_ary->wc_discrim)
-               wr_ary = (struct rpcrdma_write_array *)
-                       &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
-                       wc_target.rs_length;
-       else
-               wr_ary = (struct rpcrdma_write_array *)
-                       &wr_ary->wc_nchunks;
-
-       /* skip reply array */
-       if (wr_ary->wc_discrim)
-               wr_ary = (struct rpcrdma_write_array *)
-                       &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
-       else
-               wr_ary = (struct rpcrdma_write_array *)
-                       &wr_ary->wc_nchunks;
-
-       return (unsigned long) wr_ary - (unsigned long) rmsgp;
+       /* RPC-over-RDMA V1 replies never have a Read list. */
+       p += rpcrdma_fixed_maxsz + 1;
+
+       /* Skip Write list. */
+       while (*p++ != xdr_zero) {
+               nsegs = be32_to_cpup(p++);
+               p += nsegs * rpcrdma_segment_maxsz;
+       }
+
+       /* Skip Reply chunk. */
+       if (*p++ != xdr_zero) {
+               nsegs = be32_to_cpup(p++);
+               p += nsegs * rpcrdma_segment_maxsz;
+       }
+
+       return (unsigned long)p - (unsigned long)rdma_resp;
 }
 
 void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
@@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
        seg->rs_offset = rs_offset;
        seg->rs_length = cpu_to_be32(write_len);
 }
-
-void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
-                                 struct rpcrdma_msg *rdma_argp,
-                                 struct rpcrdma_msg *rdma_resp,
-                                 enum rpcrdma_proc rdma_type)
-{
-       rdma_resp->rm_xid = rdma_argp->rm_xid;
-       rdma_resp->rm_vers = rdma_argp->rm_vers;
-       rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
-       rdma_resp->rm_type = cpu_to_be32(rdma_type);
-
-       /* Encode <nul> chunks lists */
-       rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
-       rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
-       rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
-}
index 172b537..f7b2daf 100644 (file)
@@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
        dprintk("svcrdma: rqstp=%p\n", rqstp);
 
-       spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+       spin_lock(&rdma_xprt->sc_rq_dto_lock);
        if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
-               ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
-                                 struct svc_rdma_op_ctxt,
-                                 dto_q);
-               list_del_init(&ctxt->dto_q);
-               spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+               ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
+               spin_unlock(&rdma_xprt->sc_rq_dto_lock);
                rdma_read_complete(rqstp, ctxt);
                goto complete;
        } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
-               ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
-                                 struct svc_rdma_op_ctxt,
-                                 dto_q);
-               list_del_init(&ctxt->dto_q);
+               ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
        } else {
                atomic_inc(&rdma_stat_rq_starve);
                clear_bit(XPT_DATA, &xprt->xpt_flags);
                ctxt = NULL;
        }
-       spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+       spin_unlock(&rdma_xprt->sc_rq_dto_lock);
        if (!ctxt) {
                /* This is the EAGAIN path. The svc_recv routine will
                 * return -EAGAIN, the nfsd thread will go to call into
index ad4d286..515221b 100644 (file)
@@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
 
        /* Prepare the SGE for the RPCRDMA Header */
        ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
-       ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+       ctxt->sge[0].length =
+           svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
        ctxt->sge[0].addr =
            ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
                            ctxt->sge[0].length, DMA_TO_DEVICE);
@@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        struct rpcrdma_msg *rdma_argp;
        struct rpcrdma_msg *rdma_resp;
        struct rpcrdma_write_array *wr_ary, *rp_ary;
-       enum rpcrdma_proc reply_type;
        int ret;
        int inline_bytes;
        struct page *res_page;
        struct svc_rdma_req_map *vec;
        u32 inv_rkey;
+       __be32 *p;
 
        dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
@@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        if (!res_page)
                goto err0;
        rdma_resp = page_address(res_page);
-       if (rp_ary)
-               reply_type = RDMA_NOMSG;
-       else
-               reply_type = RDMA_MSG;
-       svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
-                                        rdma_resp, reply_type);
+
+       p = &rdma_resp->rm_xid;
+       *p++ = rdma_argp->rm_xid;
+       *p++ = rdma_argp->rm_vers;
+       *p++ = rdma->sc_fc_credits;
+       *p++ = rp_ary ? rdma_nomsg : rdma_msg;
+
+       /* Start with empty chunks */
+       *p++ = xdr_zero;
+       *p++ = xdr_zero;
+       *p   = xdr_zero;
 
        /* Send any write-chunk data and build resp write-list */
        if (wr_ary) {
index 39652d3..c13a5c3 100644 (file)
@@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
        ctxt = kmalloc(sizeof(*ctxt), flags);
        if (ctxt) {
                ctxt->xprt = xprt;
-               INIT_LIST_HEAD(&ctxt->free);
-               INIT_LIST_HEAD(&ctxt->dto_q);
+               INIT_LIST_HEAD(&ctxt->list);
        }
        return ctxt;
 }
@@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
                        dprintk("svcrdma: No memory for RDMA ctxt\n");
                        return false;
                }
-               list_add(&ctxt->free, &xprt->sc_ctxts);
+               list_add(&ctxt->list, &xprt->sc_ctxts);
        }
        return true;
 }
@@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 {
        struct svc_rdma_op_ctxt *ctxt = NULL;
 
-       spin_lock_bh(&xprt->sc_ctxt_lock);
+       spin_lock(&xprt->sc_ctxt_lock);
        xprt->sc_ctxt_used++;
        if (list_empty(&xprt->sc_ctxts))
                goto out_empty;
 
        ctxt = list_first_entry(&xprt->sc_ctxts,
-                               struct svc_rdma_op_ctxt, free);
-       list_del_init(&ctxt->free);
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+                               struct svc_rdma_op_ctxt, list);
+       list_del(&ctxt->list);
+       spin_unlock(&xprt->sc_ctxt_lock);
 
 out:
        ctxt->count = 0;
@@ -209,15 +208,15 @@ out_empty:
        /* Either pre-allocation missed the mark, or send
         * queue accounting is broken.
         */
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+       spin_unlock(&xprt->sc_ctxt_lock);
 
        ctxt = alloc_ctxt(xprt, GFP_NOIO);
        if (ctxt)
                goto out;
 
-       spin_lock_bh(&xprt->sc_ctxt_lock);
+       spin_lock(&xprt->sc_ctxt_lock);
        xprt->sc_ctxt_used--;
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+       spin_unlock(&xprt->sc_ctxt_lock);
        WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
        return NULL;
 }
@@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
                for (i = 0; i < ctxt->count; i++)
                        put_page(ctxt->pages[i]);
 
-       spin_lock_bh(&xprt->sc_ctxt_lock);
+       spin_lock(&xprt->sc_ctxt_lock);
        xprt->sc_ctxt_used--;
-       list_add(&ctxt->free, &xprt->sc_ctxts);
-       spin_unlock_bh(&xprt->sc_ctxt_lock);
+       list_add(&ctxt->list, &xprt->sc_ctxts);
+       spin_unlock(&xprt->sc_ctxt_lock);
 }
 
 static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
@@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
                struct svc_rdma_op_ctxt *ctxt;
 
                ctxt = list_first_entry(&xprt->sc_ctxts,
-                                       struct svc_rdma_op_ctxt, free);
-               list_del(&ctxt->free);
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
                kfree(ctxt);
        }
 }
@@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        /* All wc fields are now known to be valid */
        ctxt->byte_len = wc->byte_len;
        spin_lock(&xprt->sc_rq_dto_lock);
-       list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+       list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
        spin_unlock(&xprt->sc_rq_dto_lock);
 
        set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
@@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
 
                read_hdr = ctxt->read_hdr;
                spin_lock(&xprt->sc_rq_dto_lock);
-               list_add_tail(&read_hdr->dto_q,
+               list_add_tail(&read_hdr->list,
                              &xprt->sc_read_complete_q);
                spin_unlock(&xprt->sc_rq_dto_lock);
 
@@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
                return NULL;
        svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
        INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
-       INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
        INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
        INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
        INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
@@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
        spin_lock_init(&cma_xprt->sc_ctxt_lock);
        spin_lock_init(&cma_xprt->sc_map_lock);
 
+       /*
+        * Note that this implies that the underlying transport supports
+        * some form of congestion control (see RFC 7530 section 3.1
+        * paragraph 2). For now, we assume that all supported RDMA
+        * transports are suitable here.
+        */
+       set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
        if (listener)
                set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
 
@@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
 {
        struct svc_rdma_fastreg_mr *frmr = NULL;
 
-       spin_lock_bh(&rdma->sc_frmr_q_lock);
+       spin_lock(&rdma->sc_frmr_q_lock);
        if (!list_empty(&rdma->sc_frmr_q)) {
                frmr = list_entry(rdma->sc_frmr_q.next,
                                  struct svc_rdma_fastreg_mr, frmr_list);
                list_del_init(&frmr->frmr_list);
                frmr->sg_nents = 0;
        }
-       spin_unlock_bh(&rdma->sc_frmr_q_lock);
+       spin_unlock(&rdma->sc_frmr_q_lock);
        if (frmr)
                return frmr;
 
@@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
        if (frmr) {
                ib_dma_unmap_sg(rdma->sc_cm_id->device,
                                frmr->sg, frmr->sg_nents, frmr->direction);
-               spin_lock_bh(&rdma->sc_frmr_q_lock);
+               spin_lock(&rdma->sc_frmr_q_lock);
                WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
                list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
-               spin_unlock_bh(&rdma->sc_frmr_q_lock);
+               spin_unlock(&rdma->sc_frmr_q_lock);
        }
 }
 
@@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
        newxprt->sc_max_req_size = svcrdma_max_req_size;
        newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
                                         svcrdma_max_requests);
+       newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
        newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
                                            svcrdma_max_bc_requests);
        newxprt->sc_rq_depth = newxprt->sc_max_requests +
@@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                goto errout;
        }
        newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
-                                       0, IB_POLL_SOFTIRQ);
+                                       0, IB_POLL_WORKQUEUE);
        if (IS_ERR(newxprt->sc_sq_cq)) {
                dprintk("svcrdma: error creating SQ CQ for connect request\n");
                goto errout;
        }
        newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
-                                       0, IB_POLL_SOFTIRQ);
+                                       0, IB_POLL_WORKQUEUE);
        if (IS_ERR(newxprt->sc_rq_cq)) {
                dprintk("svcrdma: error creating RQ CQ for connect request\n");
                goto errout;
@@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work)
         */
        while (!list_empty(&rdma->sc_read_complete_q)) {
                struct svc_rdma_op_ctxt *ctxt;
-               ctxt = list_entry(rdma->sc_read_complete_q.next,
-                                 struct svc_rdma_op_ctxt,
-                                 dto_q);
-               list_del_init(&ctxt->dto_q);
+               ctxt = list_first_entry(&rdma->sc_read_complete_q,
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
                svc_rdma_put_context(ctxt, 1);
        }
 
        /* Destroy queued, but not processed recv completions */
        while (!list_empty(&rdma->sc_rq_dto_q)) {
                struct svc_rdma_op_ctxt *ctxt;
-               ctxt = list_entry(rdma->sc_rq_dto_q.next,
-                                 struct svc_rdma_op_ctxt,
-                                 dto_q);
-               list_del_init(&ctxt->dto_q);
+               ctxt = list_first_entry(&rdma->sc_rq_dto_q,
+                                       struct svc_rdma_op_ctxt, list);
+               list_del(&ctxt->list);
                svc_rdma_put_context(ctxt, 1);
        }
 
index af392d9..956c7bc 100644 (file)
@@ -1188,7 +1188,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r
        char *p;
 
        len = sizeof(transport->tcp_xid) - transport->tcp_offset;
-       dprintk("RPC:       reading XID (%Zu bytes)\n", len);
+       dprintk("RPC:       reading XID (%zu bytes)\n", len);
        p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
        used = xdr_skb_read_bits(desc, p, len);
        transport->tcp_offset += used;
@@ -1219,7 +1219,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
         */
        offset = transport->tcp_offset - sizeof(transport->tcp_xid);
        len = sizeof(transport->tcp_calldir) - offset;
-       dprintk("RPC:       reading CALL/REPLY flag (%Zu bytes)\n", len);
+       dprintk("RPC:       reading CALL/REPLY flag (%zu bytes)\n", len);
        p = ((char *) &transport->tcp_calldir) + offset;
        used = xdr_skb_read_bits(desc, p, len);
        transport->tcp_offset += used;
@@ -1310,7 +1310,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
                return;
        }
 
-       dprintk("RPC:       XID %08x read %Zd bytes\n",
+       dprintk("RPC:       XID %08x read %zd bytes\n",
                        ntohl(transport->tcp_xid), r);
        dprintk("RPC:       xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
                        "tcp_reclen = %u\n", xprt, transport->tcp_copied,
@@ -1456,7 +1456,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_s
        desc->count -= len;
        desc->offset += len;
        transport->tcp_offset += len;
-       dprintk("RPC:       discarded %Zu bytes\n", len);
+       dprintk("RPC:       discarded %zu bytes\n", len);
        xs_tcp_check_fraghdr(transport);
 }
 
index e9295fa..4512e83 100644 (file)
@@ -1505,19 +1505,21 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 {
        struct sk_buff_head xmitq;
        struct tipc_node *n;
-       struct tipc_msg *hdr = buf_msg(skb);
-       int usr = msg_user(hdr);
+       struct tipc_msg *hdr;
        int bearer_id = b->identity;
        struct tipc_link_entry *le;
-       u16 bc_ack = msg_bcast_ack(hdr);
        u32 self = tipc_own_addr(net);
-       int rc = 0;
+       int usr, rc = 0;
+       u16 bc_ack;
 
        __skb_queue_head_init(&xmitq);
 
-       /* Ensure message is well-formed */
+       /* Ensure message is well-formed before touching the header */
        if (unlikely(!tipc_msg_validate(skb)))
                goto discard;
+       hdr = buf_msg(skb);
+       usr = msg_user(hdr);
+       bc_ack = msg_bcast_ack(hdr);
 
        /* Handle arrival of discovery or broadcast packet */
        if (unlikely(msg_non_seq(hdr))) {
index 5f3e878..0806dcc 100644 (file)
@@ -2836,14 +2836,8 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst)
        return mtu ? : dst_mtu(dst->path);
 }
 
-static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
-                                          struct sk_buff *skb,
-                                          const void *daddr)
-{
-       return dst->path->ops->neigh_lookup(dst, skb, daddr);
-}
-
-static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
+                                       const void *daddr)
 {
        const struct dst_entry *path = dst->path;
 
@@ -2857,6 +2851,25 @@ static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
                else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
                        daddr = &xfrm->id.daddr;
        }
+       return daddr;
+}
+
+static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
+                                          struct sk_buff *skb,
+                                          const void *daddr)
+{
+       const struct dst_entry *path = dst->path;
+
+       if (!skb)
+               daddr = xfrm_get_dst_nexthop(dst, daddr);
+       return path->ops->neigh_lookup(path, skb, daddr);
+}
+
+static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+       const struct dst_entry *path = dst->path;
+
+       daddr = xfrm_get_dst_nexthop(dst, daddr);
        path->ops->confirm_neigh(path, daddr);
 }
 
index 918259a..baa3c7b 100755 (executable)
@@ -1848,6 +1848,8 @@ my $prefix = '';
 sub show_type {
        my ($type) = @_;
 
+       $type =~ tr/[a-z]/[A-Z]/;
+
        return defined $use_type{$type} if (scalar keys %use_type > 0);
 
        return !defined $ignore_type{$type};
@@ -5204,18 +5206,27 @@ sub process {
                             "Consecutive strings are generally better as a single string\n" . $herecurr);
                }
 
-# check for %L{u,d,i} and 0x%[udi] in strings
-               my $string;
+# check for non-standard and hex prefixed decimal printf formats
+               my $show_L = 1; #don't show the same defect twice
+               my $show_Z = 1;
                while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) {
-                       $string = substr($rawline, $-[1], $+[1] - $-[1]);
+                       my $string = substr($rawline, $-[1], $+[1] - $-[1]);
                        $string =~ s/%%/__/g;
-                       if ($string =~ /(?<!%)%[\*\d\.\$]*L[udi]/) {
+                       # check for %L
+                       if ($show_L && $string =~ /%[\*\d\.\$]*L([diouxX])/) {
                                WARN("PRINTF_L",
-                                    "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr);
-                               last;
-                       }
-                       if ($string =~ /0x%[\*\d\.\$\Llzth]*[udi]/) {
-                               ERROR("PRINTF_0xDECIMAL",
+                                    "\%L$1 is non-standard C, use %ll$1\n" . $herecurr);
+                               $show_L = 0;
+                       }
+                       # check for %Z
+                       if ($show_Z && $string =~ /%[\*\d\.\$]*Z([diouxX])/) {
+                               WARN("PRINTF_Z",
+                                    "%Z$1 is non-standard C, use %z$1\n" . $herecurr);
+                               $show_Z = 0;
+                       }
+                       # check for 0x<decimal>
+                       if ($string =~ /0x%[\*\d\.\$\Llzth]*[diou]/) {
+                               ERROR("PRINTF_0XDECIMAL",
                                      "Prefixing 0x with decimal output is defective\n" . $herecurr);
                        }
                }
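
The new PRINTF_Z warning is the checkpatch side of the tree-wide %Z conversions earlier in this series; on a C hunk it would fire roughly like this (illustrative):

	/* flagged: WARNING: %Zu is non-standard C, use %zu */
	pr_debug("buffer holds %Zu bytes\n", len);

	/* clean: C99 length modifier */
	pr_debug("buffer holds %zu bytes\n", len);
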
index faac4b1..0b6002b 100755 (executable)
@@ -318,7 +318,7 @@ if ($arch eq "x86_64") {
     # instruction or the addiu one. herein, we record the address of the
     # first one, and then we can replace this instruction by a branch
     # instruction to jump over the profiling function to filter the
-    # indicated functions, or swith back to the lui instruction to trace
+    # indicated functions, or switch back to the lui instruction to trace
     # them, which means dynamic tracing.
     #
     #       c: 3c030000        lui     v1,0x0
index b3a1994..0458b03 100644 (file)
@@ -62,15 +62,19 @@ adress||address
 adresses||addresses
 adviced||advised
 afecting||affecting
+againt||against
 agaist||against
 albumns||albums
 alegorical||allegorical
+algined||aligned
 algorith||algorithm
 algorithmical||algorithmically
 algoritm||algorithm
 algoritms||algorithms
 algorrithm||algorithm
 algorritm||algorithm
+aligment||alignment
+alignement||alignment
 allign||align
 allocatrd||allocated
 allocte||allocate
@@ -86,6 +90,10 @@ alue||value
 ambigious||ambiguous
 amoung||among
 amout||amount
+an union||a union
+an user||a user
+an userspace||a userspace
+an one||a one
 analysator||analyzer
 ang||and
 anniversery||anniversary
@@ -98,6 +106,7 @@ appearence||appearance
 applicaion||application
 appliction||application
 applictions||applications
+applys||applies
 appplications||applications
 appropiate||appropriate
 appropriatly||appropriately
@@ -237,6 +246,9 @@ commited||committed
 commiting||committing
 committ||commit
 commoditiy||commodity
+comsume||consume
+comsumer||consumer
+comsuming||consuming
 compability||compatibility
 compaibility||compatibility
 compatability||compatibility
@@ -258,6 +270,7 @@ comunication||communication
 conbination||combination
 conditionaly||conditionally
 conected||connected
+configuartion||configuration
 configuratoin||configuration
 configuraton||configuration
 configuretion||configuration
@@ -310,6 +323,9 @@ defintion||definition
 defintions||definitions
 defualt||default
 defult||default
+deintializing||deinitializing
+deintialize||deinitialize
+deintialized||deinitialized
 deivce||device
 delared||declared
 delare||declare
@@ -352,6 +368,7 @@ differrence||difference
 difinition||definition
 diplay||display
 direectly||directly
+disassocation||disassociation
 disapear||disappear
 disapeared||disappeared
 disappared||disappeared
@@ -375,10 +392,12 @@ easilly||easily
 ecspecially||especially
 edditable||editable
 editting||editing
+efective||effective
 efficently||efficiently
 ehther||ether
 eigth||eight
 eletronic||electronic
+embeded||embedded
 enabledi||enabled
 enchanced||enhanced
 encorporating||incorporating
@@ -414,6 +433,7 @@ expecially||especially
 explicite||explicit
 explicitely||explicitly
 explict||explicit
+explictely||explicitly
 explictly||explicitly
 expresion||expression
 exprimental||experimental
@@ -445,6 +465,7 @@ finsih||finish
 flusing||flushing
 folloing||following
 followign||following
+followings||following
 follwing||following
 forseeable||foreseeable
 forse||force
@@ -537,6 +558,7 @@ initalise||initialize
 initalize||initialize
 initation||initiation
 initators||initiators
+initialiazation||initialization
 initializiation||initialization
 initialzed||initialized
 initilization||initialization
@@ -566,6 +588,7 @@ interruptted||interrupted
 interupted||interrupted
 interupt||interrupt
 intial||initial
+intialization||initialization
 intialized||initialized
 intialize||initialize
 intregral||integral
@@ -666,6 +689,7 @@ neccecary||necessary
 neccesary||necessary
 neccessary||necessary
 necesary||necessary
+neded||needed
 negaive||negative
 negoitation||negotiation
 negotation||negotiation
@@ -688,6 +712,8 @@ occure||occurred
 occured||occurred
 occuring||occurring
 offet||offset
+omited||omitted
+omiting||omitting
 omitt||omit
 ommiting||omitting
 ommitted||omitted
@@ -706,8 +732,11 @@ oustanding||outstanding
 overaall||overall
 overhread||overhead
 overlaping||overlapping
+overrided||overridden
 overriden||overridden
 overun||overrun
+overwritting||overwriting
+overwriten||overwritten
 pacakge||package
 pachage||package
 packacge||package
@@ -718,6 +747,7 @@ pakage||package
 pallette||palette
 paln||plan
 paramameters||parameters
+paramaters||parameters
 paramater||parameter
 parametes||parameters
 parametised||parametrised
@@ -962,6 +992,7 @@ straming||streaming
 struc||struct
 structres||structures
 stuct||struct
+strucuture||structure
 stucture||structure
 sturcture||structure
 subdirectoires||subdirectories
@@ -991,6 +1022,13 @@ suspeneded||suspended
 suspicously||suspiciously
 swaping||swapping
 switchs||switches
+swith||switch
+swithable||switchable
+swithc||switch
+swithced||switched
+swithcing||switching
+swithed||switched
+swithing||switching
 symetric||symmetric
 synax||syntax
 synchonized||synchronized
@@ -1007,6 +1045,7 @@ targetting||targeting
 teh||the
 temorary||temporary
 temproarily||temporarily
+therfore||therefore
 thier||their
 threds||threads
 threshhold||threshold
@@ -1050,6 +1089,7 @@ unkmown||unknown
 unknonw||unknown
 unknow||unknown
 unkown||unknown
+unneded||unneeded
 unneedingly||unnecessarily
 unnsupported||unsupported
 unmached||unmatched
@@ -1078,6 +1118,7 @@ vaid||valid
 vaild||valid
 valide||valid
 variantions||variations
+varible||variable
 varient||variant
 vaule||value
 verbse||verbose
index 7d10e5d..9db4709 100644 (file)
@@ -360,7 +360,7 @@ int ebitmap_read(struct ebitmap *e, void *fp)
 
        if (mapunit != BITS_PER_U64) {
                printk(KERN_ERR "SELinux: ebitmap: map size %u does not "
-                      "match my size %Zd (high bit was %d)\n",
+                      "match my size %zd (high bit was %d)\n",
                       mapunit, BITS_PER_U64, e->highbit);
                goto bad;
        }
index d719db4..9c92f29 100644 (file)
@@ -2266,7 +2266,7 @@ int policydb_read(struct policydb *p, void *fp)
        len = le32_to_cpu(buf[1]);
        if (len != strlen(POLICYDB_STRING)) {
                printk(KERN_ERR "SELinux:  policydb string length %d does not "
-                      "match expected length %Zu\n",
+                      "match expected length %zu\n",
                       len, strlen(POLICYDB_STRING));
                goto bad;
        }
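
Both hunks above (and several sound and perf hunks below) replace %Z, an old glibc-only printf length modifier, with the standard C99 %z modifier for size_t-sized arguments, which newer compilers warn about. A minimal illustration:

    #include <stdio.h>

    int main(void)
    {
            size_t n = sizeof(long);

            /* C99: %zu for size_t, %zd for the signed counterpart */
            printf("size = %zu bytes\n", n);
            return 0;
    }
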
index f4234ed..8cf0dc7 100644 (file)
@@ -3093,7 +3093,7 @@ static int patch_cm9739(struct snd_ac97 * ac97)
        /* set-up multi channel */
        /* bit 14: 0 = SPDIF, 1 = EAPD */
        /* bit 13: enable internal vref output for mic */
-       /* bit 12: disable center/lfe (swithable) */
+       /* bit 12: disable center/lfe (switchable) */
        /* bit 10: disable surround/line (switchable) */
        /* bit 9: mix 2 surround off */
        /* bit 4: undocumented; 0 mutes the CM9739A, which defaults to 1 */
index 5cf920b..be56947 100644 (file)
@@ -203,7 +203,7 @@ struct dsp_task_tree_context_block {
 
        u32       saverfe;                                      
 
-       /* Value may be overwriten by stack save algorithm.
+       /* Value may be overwritten by stack save algorithm.
           Retain the size of the stack data saved here if used */
        ___DSP_DUAL_16BIT_ALLOC(
              reserved1,        
index 9ec4dba..07a9deb 100644 (file)
@@ -2866,7 +2866,7 @@ static unsigned int ca0132_capture_pcm_delay(struct hda_pcm_stream *info,
 #define CA0132_CODEC_MUTE(xname, nid, dir) \
        CA0132_CODEC_MUTE_MONO(xname, nid, 3, dir)
 
-/* The followings are for tuning of products */
+/* The following are for tuning of products */
 #ifdef ENABLE_TUNING_CONTROLS
 
 static unsigned int voice_focus_vals_lookup[] = {
index f7ac8d5..27c03e4 100644 (file)
@@ -254,7 +254,7 @@ static int snd_wm8766_ctl_put(struct snd_kcontrol *kcontrol,
        int n = kcontrol->private_value;
        u16 val, regval1, regval2;
 
-       /* this also works for enum because value is an union */
+       /* this also works for enum because value is a union */
        regval1 = ucontrol->value.integer.value[0];
        regval2 = ucontrol->value.integer.value[1];
        if (wm->ctl[n].flags & WM8766_FLAG_INVERT) {
index ebd2fe4..553669b 100644 (file)
@@ -528,7 +528,7 @@ static int snd_wm8776_ctl_put(struct snd_kcontrol *kcontrol,
        int n = kcontrol->private_value;
        u16 val, regval1, regval2;
 
-       /* this also works for enum because value is an union */
+       /* this also works for enum because value is a union */
        regval1 = ucontrol->value.integer.value[0];
        regval2 = ucontrol->value.integer.value[1];
        if (wm->ctl[n].flags & WM8776_FLAG_INVERT) {
index 565f7f5..1e25095 100644 (file)
@@ -2051,7 +2051,7 @@ static void snd_korg1212_proc_read(struct snd_info_entry *entry,
        snd_iprintf(buffer, korg1212->card->longname);
        snd_iprintf(buffer, " (index #%d)\n", korg1212->card->number + 1);
        snd_iprintf(buffer, "\nGeneral settings\n");
-       snd_iprintf(buffer, "    period size: %Zd bytes\n", K1212_PERIOD_BYTES);
+       snd_iprintf(buffer, "    period size: %zd bytes\n", K1212_PERIOD_BYTES);
        snd_iprintf(buffer, "     clock mode: %s\n", clockSourceName[korg1212->clkSrcRate] );
        snd_iprintf(buffer, "  left ADC Sens: %d\n", korg1212->leftADCInSens );
        snd_iprintf(buffer, " right ADC Sens: %d\n", korg1212->rightADCInSens );
@@ -2276,7 +2276,7 @@ static int snd_korg1212_create(struct snd_card *card, struct pci_dev *pci,
 
        if (snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV, snd_dma_pci_data(pci),
                                sizeof(struct KorgSharedBuffer), &korg1212->dma_shared) < 0) {
-               snd_printk(KERN_ERR "korg1212: can not allocate shared buffer memory (%Zd bytes)\n", sizeof(struct KorgSharedBuffer));
+               snd_printk(KERN_ERR "korg1212: can not allocate shared buffer memory (%zd bytes)\n", sizeof(struct KorgSharedBuffer));
                 snd_korg1212_free(korg1212);
                 return -ENOMEM;
         }
index 8063305..a99808a 100644 (file)
@@ -292,7 +292,7 @@ static int pcxhr_dsp_load(struct pcxhr_mgr *mgr, int index,
        int err, card_index;
 
        dev_dbg(&mgr->pci->dev,
-               "loading dsp [%d] size = %Zd\n", index, dsp->size);
+               "loading dsp [%d] size = %zd\n", index, dsp->size);
 
        switch (index) {
        case PCXHR_FIRMWARE_XLX_INT_INDEX:
index 56aa1ba..5f97791 100644 (file)
@@ -201,7 +201,7 @@ static int vxp_load_xilinx_binary(struct vx_core *_chip, const struct firmware *
        c |= (int)vx_inb(chip, RXM) << 8;
        c |= vx_inb(chip, RXL);
 
-       snd_printdd(KERN_DEBUG "xilinx: dsp size received 0x%x, orig 0x%Zx\n", c, fw->size);
+       snd_printdd(KERN_DEBUG "xilinx: dsp size received 0x%x, orig 0x%zx\n", c, fw->size);
 
        vx_outb(chip, ICR, ICR_HF0);
 
index b84d7d3..cdd44ab 100644 (file)
@@ -883,7 +883,7 @@ static void snd_ps3_audio_set_base_addr(uint64_t ioaddr_start)
 static void snd_ps3_audio_fixup(struct snd_ps3_card_info *card)
 {
        /*
-        * avsetting driver seems to never change the followings
+        * avsetting driver seems to never change the following
         * so, init them here once
         */
 
index 818b052..ec1067a 100644 (file)
@@ -506,7 +506,7 @@ static int acp_init(void __iomem *acp_mmio)
        return 0;
 }
 
-/* Deintialize ACP */
+/* Deinitialize ACP */
 static int acp_deinit(void __iomem *acp_mmio)
 {
        u32 val;
index 624b3b9..63b2745 100644 (file)
@@ -1269,7 +1269,7 @@ void wm_hubs_set_bias_level(struct snd_soc_codec *codec,
                break;
 
        case SND_SOC_BIAS_ON:
-               /* Turn off any unneded single ended outputs */
+               /* Turn off any unneeded single ended outputs */
                val = 0;
                mask = 0;
 
index 1d82f68..8cfffa7 100644 (file)
@@ -368,7 +368,7 @@ static int fsl_asrc_config_pair(struct fsl_asrc_pair *pair)
        fsl_asrc_set_watermarks(pair, ASRC_INPUTFIFO_THRESHOLD,
                                ASRC_INPUTFIFO_THRESHOLD);
 
-       /* Configure the followings only for Ideal Ratio mode */
+       /* Configure the following only for Ideal Ratio mode */
        if (!ideal)
                return 0;
 
index 924971b..9b03135 100644 (file)
@@ -82,7 +82,7 @@ struct lpass_variant {
         **/
        u32     dmactl_audif_start;
        u32     wrdma_channel_start;
-       /* SOC specific intialization like clocks */
+       /* SOC specific initialization like clocks */
        int (*init)(struct platform_device *pdev);
        int (*exit)(struct platform_device *pdev);
        int (*alloc_dma_channel)(struct lpass_data *data, int direction);
index a110d39..6dca408 100644 (file)
@@ -3041,7 +3041,7 @@ static int snd_soc_register_dais(struct snd_soc_component *component,
        unsigned int i;
        int ret;
 
-       dev_dbg(dev, "ASoC: dai register %s #%Zu\n", dev_name(dev), count);
+       dev_dbg(dev, "ASoC: dai register %s #%zu\n", dev_name(dev), count);
 
        component->dai_drv = dai_drv;
 
index aff3d81..3e9b1c0 100644 (file)
@@ -344,7 +344,7 @@ static int soc_tplg_widget_load(struct soc_tplg *tplg,
        return 0;
 }
 
-/* pass DAI configurations to component driver for extra intialization */
+/* pass DAI configurations to component driver for extra initialization */
 static int soc_tplg_dai_load(struct soc_tplg *tplg,
        struct snd_soc_dai_driver *dai_drv)
 {
@@ -354,7 +354,7 @@ static int soc_tplg_dai_load(struct soc_tplg *tplg,
        return 0;
 }
 
-/* pass link configurations to component driver for extra intialization */
+/* pass link configurations to component driver for extra initialization */
 static int soc_tplg_dai_link_load(struct soc_tplg *tplg,
        struct snd_soc_dai_link *link)
 {
index aaf7ed3..477f00e 100644 (file)
@@ -35,8 +35,8 @@ all: $(OUTPUT)fixdep
 
 clean:
        $(call QUIET_CLEAN, fixdep)
-       $(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
-       $(Q)rm -f fixdep
+       $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+       $(Q)rm -f $(OUTPUT)fixdep
 
 $(OUTPUT)fixdep-in.o: FORCE
        $(Q)$(MAKE) $(build)=fixdep
index ad22e4e..d360f39 100644 (file)
@@ -3,4 +3,7 @@ build := -f $(srctree)/tools/build/Makefile.build dir=. obj
 fixdep:
        $(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep
 
+fixdep-clean:
+       $(Q)$(MAKE) -C $(srctree)/tools/build clean
+
 .PHONY: fixdep
index 18663f5..68b8c15 100644 (file)
@@ -20,4 +20,7 @@ static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
                (((unsigned long *)addr)[nr / __BITS_PER_LONG])) != 0;
 }
 
+#define __set_bit(nr, addr)    set_bit(nr, addr)
+#define __clear_bit(nr, addr)  clear_bit(nr, addr)
+
 #endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS_ATOMIC_H_ */
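
In the kernel proper, __set_bit() is the non-atomic variant of set_bit(); in this userspace tools copy the plain implementation already suffices, so the double-underscore names can simply alias the existing ones. A self-contained sketch of the pattern (the helper definitions are illustrative, not the tools headers themselves):

    #include <limits.h>
    #include <stdio.h>

    #define BITS_PER_LONG   (CHAR_BIT * sizeof(unsigned long))
    #define BIT_MASK(nr)    (1UL << ((nr) % BITS_PER_LONG))
    #define BIT_WORD(nr)    ((nr) / BITS_PER_LONG)

    static void set_bit(unsigned int nr, unsigned long *addr)
    {
            addr[BIT_WORD(nr)] |= BIT_MASK(nr);
    }

    /* as in the hunk above: userspace needs no atomic/non-atomic split */
    #define __set_bit(nr, addr) set_bit(nr, addr)

    int main(void)
    {
            unsigned long map[1] = { 0 };

            __set_bit(3, map);
            printf("map = 0x%lx\n", map[0]);        /* prints map = 0x8 */
            return 0;
    }
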
index beda1a8..4790f04 100644 (file)
        unlikely(__ret_warn_on);                \
 })
 
+#define WARN_ON(condition) ({                                  \
+       int __ret_warn_on = !!(condition);                      \
+       if (unlikely(__ret_warn_on))                            \
+               __WARN_printf("assertion failed at %s:%d\n",    \
+                               __FILE__, __LINE__);            \
+       unlikely(__ret_warn_on);                                \
+})
+
 #define WARN_ON_ONCE(condition) ({                     \
        static int __warned;                            \
        int __ret_warn_once = !!(condition);            \
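
The new WARN_ON() mirrors the kernel's: it evaluates to the normalized condition, so callers can warn and handle the error in one expression. A userspace sketch using GNU C statement expressions, assuming a printf-style __WARN_printf as in the surrounding header:

    #include <stdio.h>

    #define __WARN_printf(...) fprintf(stderr, __VA_ARGS__)

    #define WARN_ON(condition) ({                                   \
            int __ret_warn_on = !!(condition);                      \
            if (__ret_warn_on)                                      \
                    __WARN_printf("assertion failed at %s:%d\n",    \
                                  __FILE__, __LINE__);              \
            __ret_warn_on;                                          \
    })

    int main(void)
    {
            int fd = -1;

            if (WARN_ON(fd < 0))    /* warns, then lets the caller recover */
                    return 1;
            return 0;
    }
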
index eef41d5..e8b9f51 100644 (file)
@@ -4,6 +4,7 @@
 #include <string.h>
 #include <linux/bitops.h>
 #include <stdlib.h>
+#include <linux/kernel.h>
 
 #define DECLARE_BITMAP(name,bits) \
        unsigned long name[BITS_TO_LONGS(bits)]
index fc44634..1aecad3 100644 (file)
@@ -2,7 +2,6 @@
 #define _TOOLS_LINUX_BITOPS_H_
 
 #include <asm/types.h>
-#include <linux/kernel.h>
 #include <linux/compiler.h>
 
 #ifndef __WORDSIZE
index 6326ede..8de163b 100644 (file)
@@ -25,6 +25,8 @@
 #endif
 
 #define __user
+#define __rcu
+#define __read_mostly
 
 #ifndef __attribute_const__
 # define __attribute_const__
@@ -54,6 +56,8 @@
 # define unlikely(x)           __builtin_expect(!!(x), 0)
 #endif
 
+#define uninitialized_var(x) x = *(&(x))
+
 #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
 
 #include <linux/types.h>
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
new file mode 100644 (file)
index 0000000..58397dc
--- /dev/null
@@ -0,0 +1,5 @@
+#define spinlock_t             pthread_mutex_t
+#define DEFINE_SPINLOCK(x)     pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;
+
+#define spin_lock_irqsave(x, f)                (void)f, pthread_mutex_lock(x)
+#define spin_unlock_irqrestore(x, f)   (void)f, pthread_mutex_unlock(x)
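
The new tools/include/linux/spinlock.h lets kernel code under test compile unchanged in userspace by mapping the spinlock API onto pthreads; the irq-flags argument is simply swallowed. A usage sketch reusing the same mapping (lightly reformatted), built with -pthread:

    #include <pthread.h>
    #include <stdio.h>

    #define spinlock_t                    pthread_mutex_t
    #define DEFINE_SPINLOCK(x)            pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER
    #define spin_lock_irqsave(x, f)       ((void)(f), pthread_mutex_lock(x))
    #define spin_unlock_irqrestore(x, f)  ((void)(f), pthread_mutex_unlock(x))

    static DEFINE_SPINLOCK(lock);

    int main(void)
    {
            unsigned long flags = 0;

            spin_lock_irqsave(&lock, flags);        /* flags is ignored */
            puts("in critical section");
            spin_unlock_irqrestore(&lock, flags);
            return 0;
    }
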
index d48b70c..207c2ee 100644 (file)
@@ -27,7 +27,7 @@
 #include "bpf.h"
 
 /*
- * When building perf, unistd.h is overrided. __NR_bpf is
+ * When building perf, unistd.h is overridden. __NR_bpf is
  * required to be defined explicitly.
  */
 #ifndef __NR_bpf
index f2ea780..7ce724f 100644 (file)
@@ -5225,13 +5225,13 @@ int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec)
 }
 
 /**
- * pevent_data_prempt_count - parse the preempt count from the record
+ * pevent_data_preempt_count - parse the preempt count from the record
  * @pevent: a handle to the pevent
  * @rec: the record to parse
  *
  * This returns the preempt count from a record.
  */
-int pevent_data_prempt_count(struct pevent *pevent, struct pevent_record *rec)
+int pevent_data_preempt_count(struct pevent *pevent, struct pevent_record *rec)
 {
        return parse_common_pc(pevent, rec->data);
 }
index 74cecba..6634280 100644 (file)
@@ -710,7 +710,7 @@ void pevent_data_lat_fmt(struct pevent *pevent,
 int pevent_data_type(struct pevent *pevent, struct pevent_record *rec);
 struct event_format *pevent_data_event_from_type(struct pevent *pevent, int type);
 int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec);
-int pevent_data_prempt_count(struct pevent *pevent, struct pevent_record *rec);
+int pevent_data_preempt_count(struct pevent *pevent, struct pevent_record *rec);
 int pevent_data_flags(struct pevent *pevent, struct pevent_record *rec);
 const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid);
 struct cmdline;
index f7350fc..a59e061 100644 (file)
@@ -31,9 +31,8 @@
 #define INSN_CALL_DYNAMIC      8
 #define INSN_RETURN            9
 #define INSN_CONTEXT_SWITCH    10
-#define INSN_BUG               11
-#define INSN_NOP               12
-#define INSN_OTHER             13
+#define INSN_NOP               11
+#define INSN_OTHER             12
 #define INSN_LAST              INSN_OTHER
 
 int arch_decode_instruction(struct elf *elf, struct section *sec,
index 039636f..6ac99e3 100644 (file)
@@ -118,9 +118,6 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
                         op2 == 0x35)
                        /* sysenter, sysret */
                        *type = INSN_CONTEXT_SWITCH;
-               else if (op2 == 0x0b || op2 == 0xb9)
-                       /* ud2 */
-                       *type = INSN_BUG;
                else if (op2 == 0x0d || op2 == 0x1f)
                        /* nopl/nopw */
                        *type = INSN_NOP;
index e8a1f69..5fc52ee 100644 (file)
@@ -51,7 +51,7 @@ struct instruction {
        unsigned int len, state;
        unsigned char type;
        unsigned long immediate;
-       bool alt_group, visited;
+       bool alt_group, visited, dead_end;
        struct symbol *call_dest;
        struct instruction *jump_dest;
        struct list_head alts;
@@ -330,6 +330,54 @@ static int decode_instructions(struct objtool_file *file)
 }
 
 /*
+ * Find all uses of the unreachable() macro, which are code path dead ends.
+ */
+static int add_dead_ends(struct objtool_file *file)
+{
+       struct section *sec;
+       struct rela *rela;
+       struct instruction *insn;
+       bool found;
+
+       sec = find_section_by_name(file->elf, ".rela__unreachable");
+       if (!sec)
+               return 0;
+
+       list_for_each_entry(rela, &sec->rela_list, list) {
+               if (rela->sym->type != STT_SECTION) {
+                       WARN("unexpected relocation symbol type in .rela__unreachable");
+                       return -1;
+               }
+               insn = find_insn(file, rela->sym->sec, rela->addend);
+               if (insn)
+                       insn = list_prev_entry(insn, list);
+               else if (rela->addend == rela->sym->sec->len) {
+                       found = false;
+                       list_for_each_entry_reverse(insn, &file->insn_list, list) {
+                               if (insn->sec == rela->sym->sec) {
+                                       found = true;
+                                       break;
+                               }
+                       }
+
+                       if (!found) {
+                               WARN("can't find unreachable insn at %s+0x%x",
+                                    rela->sym->sec->name, rela->addend);
+                               return -1;
+                       }
+               } else {
+                       WARN("can't find unreachable insn at %s+0x%x",
+                            rela->sym->sec->name, rela->addend);
+                       return -1;
+               }
+
+               insn->dead_end = true;
+       }
+
+       return 0;
+}
+
+/*
  * Warnings shouldn't be reported for ignored functions.
  */
 static void add_ignores(struct objtool_file *file)
@@ -843,6 +891,10 @@ static int decode_sections(struct objtool_file *file)
        if (ret)
                return ret;
 
+       ret = add_dead_ends(file);
+       if (ret)
+               return ret;
+
        add_ignores(file);
 
        ret = add_jump_destinations(file);
@@ -1037,13 +1089,13 @@ static int validate_branch(struct objtool_file *file,
 
                        return 0;
 
-               case INSN_BUG:
-                       return 0;
-
                default:
                        break;
                }
 
+               if (insn->dead_end)
+                       return 0;
+
                insn = next_insn_same_sec(file, insn);
                if (!insn) {
                        WARN("%s: unexpected end of section", sec->name);
index 8ffbd27..a89273d 100644 (file)
@@ -39,6 +39,10 @@ OPTIONS
 --verbose::
         Be more verbose. (Show symbol address, etc)
 
+-q::
+--quiet::
+       Do not show any message.  (Suppress -v)
+
 -D::
 --dump-raw-trace::
         Dump raw trace in ASCII.
index 66dbe3d..a79c84a 100644 (file)
@@ -73,6 +73,10 @@ OPTIONS
        Be verbose, for instance, show the raw counts in addition to the
        diff.
 
+-q::
+--quiet::
+       Do not show any message.  (Suppress -v)
+
 -f::
 --force::
         Don't do ownership validation.
index 27256bc..b16003e 100644 (file)
@@ -157,7 +157,7 @@ OPTIONS
 
 -a::
 --all-cpus::
-        System-wide collection from all CPUs.
+        System-wide collection from all CPUs (default if no target is specified).
 
 -p::
 --pid=::
index f2914f0..c04cc06 100644 (file)
@@ -25,6 +25,10 @@ OPTIONS
 --verbose::
         Be more verbose. (show symbol address, etc)
 
+-q::
+--quiet::
+       Do not show any message.  (Suppress -v)
+
 -n::
 --show-nr-samples::
        Show the number of samples for each symbol
index d96ccd4..aecf2a8 100644 (file)
@@ -63,7 +63,7 @@ report::
 
 -a::
 --all-cpus::
-        system-wide collection from all CPUs
+        system-wide collection from all CPUs (default if no target is specified)
 
 -c::
 --scale::
index 8a6479c..170b028 100644 (file)
@@ -22,7 +22,7 @@ If you have debuginfo enabled, try: perf report -s sym,srcline
 For memory address profiling, try: perf mem record / perf mem report
 For tracepoint events, try: perf report -s trace_fields
 To record callchains for each sample: perf record -g
-To record every process run by an user: perf record -u <user>
+To record every process run by a user: perf record -u <user>
 Skip collecting build-id when recording: perf record -B
 To change sampling frequency to 100 Hz: perf record -F 100
 See assembly instructions with percentage: perf annotate <symbol>
index 2b941ef..27c9fbc 100644 (file)
@@ -175,6 +175,10 @@ PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
 PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
 PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
 
+ifeq ($(CC), clang)
+  PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
+endif
+
 FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
 FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
 FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
@@ -601,6 +605,9 @@ else
       PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
       PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
       PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+      ifeq ($(CC), clang)
+        PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
+      endif
       FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 
       ifneq ($(feature-libpython), 1)
index 4da19b6..79fe31f 100644 (file)
@@ -726,13 +726,13 @@ config-clean:
        $(call QUIET_CLEAN, config)
        $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null
 
-clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean
+clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean fixdep-clean
        $(call QUIET_CLEAN, core-objs)  $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS)
        $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
        $(Q)$(RM) $(OUTPUT).config-detected
        $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so
        $(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
-               $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \
+               $(OUTPUT)util/intel-pt-decoder/inat-tables.c \
                $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \
                $(OUTPUT)pmu-events/pmu-events.c
        $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
index ebb6283..4f52d85 100644 (file)
@@ -410,6 +410,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('q', "quiet", &quiet, "do now show any message"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
        OPT_BOOLEAN(0, "gtk", &annotate.use_gtk, "Use the GTK interface"),
@@ -463,6 +464,9 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
                annotate.sym_hist_filter = argv[0];
        }
 
+       if (quiet)
+               perf_quiet_option();
+
        file.path  = input_name;
 
        annotate.session = perf_session__new(&file, false, &annotate.tool);
index 70a2893..1b96a31 100644 (file)
@@ -691,7 +691,7 @@ static void hists__process(struct hists *hists)
        hists__precompute(hists);
        hists__output_resort(hists, NULL);
 
-       hists__fprintf(hists, true, 0, 0, 0, stdout,
+       hists__fprintf(hists, !quiet, 0, 0, 0, stdout,
                       symbol_conf.use_callchain);
 }
 
@@ -739,12 +739,14 @@ static void data_process(void)
                                hists__link(hists_base, hists);
                }
 
-               fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n",
-                       perf_evsel__name(evsel_base));
+               if (!quiet) {
+                       fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n",
+                               perf_evsel__name(evsel_base));
+               }
 
                first = false;
 
-               if (verbose || data__files_cnt > 2)
+               if (verbose > 0 || ((data__files_cnt > 2) && !quiet))
                        data__fprintf();
 
                /* Don't sort callchain for perf diff */
@@ -807,6 +809,7 @@ static const char * const diff_usage[] = {
 static const struct option options[] = {
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
        OPT_BOOLEAN('b', "baseline-only", &show_baseline_only,
                    "Show only items with match in baseline"),
        OPT_CALLBACK('c', "compute", &compute,
@@ -1328,6 +1331,9 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
 
        argc = parse_options(argc, argv, options, diff_usage, 0);
 
+       if (quiet)
+               perf_quiet_option();
+
        if (symbol__init(NULL) < 0)
                return -1;
 
index cd7bc4d..6114e07 100644 (file)
@@ -42,8 +42,8 @@ static int parse_record_events(const struct option *opt,
 
                fprintf(stderr, "%-13s%-*s%s\n",
                        e->tag,
-                       verbose ? 25 : 0,
-                       verbose ? perf_mem_events__name(j) : "",
+                       verbose > 0 ? 25 : 0,
+                       verbose > 0 ? perf_mem_events__name(j) : "",
                        e->supported ? ": available" : "");
        }
        exit(0);
index 6cd6776..bc84a37 100644 (file)
@@ -432,7 +432,7 @@ static int record__open(struct record *rec)
 try_again:
                if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
                        if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
-                               if (verbose)
+                               if (verbose > 0)
                                        ui__warning("%s\n", msg);
                                goto try_again;
                        }
@@ -1677,8 +1677,12 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 
        argc = parse_options(argc, argv, record_options, record_usage,
                            PARSE_OPT_STOP_AT_NON_OPTION);
+       if (quiet)
+               perf_quiet_option();
+
+       /* Make system wide (-a) the default target. */
        if (!argc && target__none(&rec->opts.target))
-               usage_with_options(record_usage, record_options);
+               rec->opts.target.system_wide = true;
 
        if (nr_cgroups && !rec->opts.target.system_wide) {
                usage_with_options_msg(record_usage, record_options,
index dbd7fa0..0a88670 100644 (file)
@@ -320,6 +320,9 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report
        size_t size = sizeof(buf);
        int socked_id = hists->socket_filter;
 
+       if (quiet)
+               return 0;
+
        if (symbol_conf.filter_relative) {
                nr_samples = hists->stats.nr_non_filtered_samples;
                nr_events = hists->stats.total_non_filtered_period;
@@ -372,7 +375,11 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
 {
        struct perf_evsel *pos;
 
-       fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n", evlist->stats.total_lost_samples);
+       if (!quiet) {
+               fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n",
+                       evlist->stats.total_lost_samples);
+       }
+
        evlist__for_each_entry(evlist, pos) {
                struct hists *hists = evsel__hists(pos);
                const char *evname = perf_evsel__name(pos);
@@ -382,7 +389,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
                        continue;
 
                hists__fprintf_nr_sample_events(hists, rep, evname, stdout);
-               hists__fprintf(hists, true, 0, 0, rep->min_percent, stdout,
+               hists__fprintf(hists, !quiet, 0, 0, rep->min_percent, stdout,
                               symbol_conf.use_callchain);
                fprintf(stdout, "\n\n");
        }
@@ -716,6 +723,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                    "input file name"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
        OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
@@ -863,6 +871,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                report.symbol_filter_str = argv[0];
        }
 
+       if (quiet)
+               perf_quiet_option();
+
        if (symbol_conf.vmlinux_name &&
            access(symbol_conf.vmlinux_name, R_OK)) {
                pr_err("Invalid file: %s\n", symbol_conf.vmlinux_name);
@@ -983,14 +994,14 @@ repeat:
                goto error;
        }
 
-       if (report.header || report.header_only) {
+       if ((report.header || report.header_only) && !quiet) {
                perf_session__fprintf_info(session, stdout,
                                           report.show_full_info);
                if (report.header_only) {
                        ret = 0;
                        goto error;
                }
-       } else if (use_browser == 0) {
+       } else if (use_browser == 0 && !quiet) {
                fputs("# To display the perf.data header info, please use --header/--header-only options.\n#\n",
                      stdout);
        }
@@ -1009,7 +1020,7 @@ repeat:
                 * providing it only in verbose mode not to bloat too
                 * much struct symbol.
                 */
-               if (verbose) {
+               if (verbose > 0) {
                        /*
                         * XXX: Need to provide a less kludgy way to ask for
                         * more space per symbol, the u32 is for the index on
index 270eb2d..b94cf0d 100644 (file)
@@ -460,7 +460,7 @@ static struct task_desc *register_pid(struct perf_sched *sched,
        BUG_ON(!sched->tasks);
        sched->tasks[task->nr] = task;
 
-       if (verbose)
+       if (verbose > 0)
                printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
 
        return task;
@@ -794,7 +794,7 @@ replay_wakeup_event(struct perf_sched *sched,
        const u32 pid    = perf_evsel__intval(evsel, sample, "pid");
        struct task_desc *waker, *wakee;
 
-       if (verbose) {
+       if (verbose > 0) {
                printf("sched_wakeup event %p\n", evsel);
 
                printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
@@ -822,7 +822,7 @@ static int replay_switch_event(struct perf_sched *sched,
        int cpu = sample->cpu;
        s64 delta;
 
-       if (verbose)
+       if (verbose > 0)
                printf("sched_switch event %p\n", evsel);
 
        if (cpu >= MAX_CPUS || cpu < 0)
@@ -870,7 +870,7 @@ static int replay_fork_event(struct perf_sched *sched,
                goto out_put;
        }
 
-       if (verbose) {
+       if (verbose > 0) {
                printf("fork event\n");
                printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
                printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
@@ -1573,7 +1573,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
 
        timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
        color_fprintf(stdout, color, "  %12s secs ", stimestamp);
-       if (new_shortname || (verbose && sched_in->tid)) {
+       if (new_shortname || (verbose > 0 && sched_in->tid)) {
                const char *pid_color = color;
 
                if (thread__has_color(sched_in))
@@ -2050,7 +2050,7 @@ static void save_task_callchain(struct perf_sched *sched,
 
        if (thread__resolve_callchain(thread, cursor, evsel, sample,
                                      NULL, NULL, sched->max_stack + 2) != 0) {
-               if (verbose)
+               if (verbose > 0)
                        error("Failed to resolve callchain. Skipping\n");
 
                return;
index f287191..13b5499 100644 (file)
@@ -573,7 +573,7 @@ try_again:
                        if (errno == EINVAL || errno == ENOSYS ||
                            errno == ENOENT || errno == EOPNOTSUPP ||
                            errno == ENXIO) {
-                               if (verbose)
+                               if (verbose > 0)
                                        ui__warning("%s event is not supported by the kernel.\n",
                                                    perf_evsel__name(counter));
                                counter->supported = false;
@@ -582,7 +582,7 @@ try_again:
                                    !(counter->leader->nr_members > 1))
                                        continue;
                        } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
-                                if (verbose)
+                                if (verbose > 0)
                                         ui__warning("%s\n", msg);
                                 goto try_again;
                         }
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i
 
        cpu = map->map[idx];
 
-       if (cpu >= env->nr_cpus_online)
+       if (cpu >= env->nr_cpus_avail)
                return -1;
 
        return cpu;
@@ -2445,8 +2445,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
        } else if (big_num_opt == 0) /* User passed --no-big-num */
                big_num = false;
 
+       /* Make system wide (-a) the default target. */
        if (!argc && target__none(&target))
-               usage_with_options(stat_usage, stat_options);
+               target.system_wide = true;
 
        if (run_count < 0) {
                pr_err("Run count must be a positive number\n");
@@ -2538,7 +2539,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 
        status = 0;
        for (run_idx = 0; forever || run_idx < run_count; run_idx++) {
-               if (run_count != 1 && verbose)
+               if (run_count != 1 && verbose > 0)
                        fprintf(output, "[ perf stat: executing run #%d ... ]\n",
                                run_idx + 1);
 
index 5a7fd7a..ab90779 100644 (file)
@@ -871,7 +871,7 @@ try_again:
                if (perf_evsel__open(counter, top->evlist->cpus,
                                     top->evlist->threads) < 0) {
                        if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
-                               if (verbose)
+                               if (verbose > 0)
                                        ui__warning("%s\n", msg);
                                goto try_again;
                        }
index 40ef9b2..256f1fa 100644 (file)
@@ -1399,7 +1399,7 @@ static struct syscall *trace__syscall_info(struct trace *trace,
        return &trace->syscalls.table[id];
 
 out_cant_read:
-       if (verbose) {
+       if (verbose > 0) {
                fprintf(trace->output, "Problems reading syscall %d", id);
                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
@@ -1801,10 +1801,10 @@ static void print_location(FILE *f, struct perf_sample *sample,
                           bool print_dso, bool print_sym)
 {
 
-       if ((verbose || print_dso) && al->map)
+       if ((verbose > 0 || print_dso) && al->map)
                fprintf(f, "%s@", al->map->dso->long_name);
 
-       if ((verbose || print_sym) && al->sym)
+       if ((verbose > 0 || print_sym) && al->sym)
                fprintf(f, "%s+0x%" PRIx64, al->sym->name,
                        al->addr - al->sym->start);
        else if (al->map)
index f67bbb0..0544398 100644 (file)
@@ -49,7 +49,7 @@ static char *mapfile(const char *fn, size_t *size)
        int err;
        int fd = open(fn, O_RDONLY);
 
-       if (fd < 0 && verbose && fn) {
+       if (fd < 0 && verbose > 0 && fn) {
                pr_err("Error opening events file '%s': %s\n", fn,
                                strerror(errno));
        }
index 28d1605..88dc51f 100644 (file)
@@ -144,7 +144,7 @@ static int run_dir(const char *d, const char *perf)
        int vcnt = min(verbose, (int) sizeof(v) - 1);
        char cmd[3*PATH_MAX];
 
-       if (verbose)
+       if (verbose > 0)
                vcnt++;
 
        snprintf(cmd, 3*PATH_MAX, PYTHON " %s/attr.py -d %s/attr/ -p %s %.*s",
index 37e326b..83c4669 100644 (file)
@@ -299,7 +299,7 @@ static int run_test(struct test *test, int subtest)
                if (!dont_fork) {
                        pr_debug("test child forked, pid %d\n", getpid());
 
-                       if (!verbose) {
+                       if (verbose <= 0) {
                                int nullfd = open("/dev/null", O_WRONLY);
 
                                if (nullfd >= 0) {
index ff5bc63..d1f6930 100644 (file)
@@ -599,7 +599,7 @@ static int do_test_code_reading(bool try_kcore)
                                continue;
                        }
 
-                       if (verbose) {
+                       if (verbose > 0) {
                                char errbuf[512];
                                perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
                                pr_debug("perf_evlist__open() failed!\n%s\n", errbuf);
index a2b5ff9..bc5982f 100644 (file)
@@ -19,7 +19,7 @@ static int fdarray__fprintf_prefix(struct fdarray *fda, const char *prefix, FILE
 {
        int printed = 0;
 
-       if (!verbose)
+       if (verbose <= 0)
                return 0;
 
        printed += fprintf(fp, "\n%s: ", prefix);
index d357dab..482b536 100644 (file)
@@ -76,7 +76,7 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf,
         * Skip this test if user's .perfconfig doesn't set [llvm] section
         * and clang is not found in $PATH, and this is not perf test -v
         */
-       if (!force && (verbose == 0 &&
+       if (!force && (verbose <= 0 &&
                       !llvm_param.user_set_param &&
                       llvm__search_clang())) {
                pr_debug("No clang and no verbosive, skip this test\n");
index aa9276b..1dc8380 100644 (file)
@@ -1808,7 +1808,7 @@ static void debug_warn(const char *warn, va_list params)
 {
        char msg[1024];
 
-       if (!verbose)
+       if (verbose <= 0)
                return;
 
        vsnprintf(msg, sizeof(msg), warn, params);
index 541da7a..87893f3 100644 (file)
@@ -172,13 +172,13 @@ int test__PERF_RECORD(int subtest __maybe_unused)
 
                                err = perf_evlist__parse_sample(evlist, event, &sample);
                                if (err < 0) {
-                                       if (verbose)
+                                       if (verbose > 0)
                                                perf_event__fprintf(event, stderr);
                                        pr_debug("Couldn't parse sample\n");
                                        goto out_delete_evlist;
                                }
 
-                               if (verbose) {
+                               if (verbose > 0) {
                                        pr_info("%" PRIu64" %d ", sample.time, sample.cpu);
                                        perf_event__fprintf(event, stderr);
                                }
index 7a52834..fa79509 100644 (file)
@@ -15,7 +15,7 @@ int test__python_use(int subtest __maybe_unused)
        int ret;
 
        if (asprintf(&cmd, "echo \"import sys ; sys.path.append('%s'); import perf\" | %s %s",
-                    PYTHONPATH, PYTHON, verbose ? "" : "2> /dev/null") < 0)
+                    PYTHONPATH, PYTHON, verbose > 0 ? "" : "2> /dev/null") < 0)
                return -1;
 
        ret = system(cmd) ? -1 : 0;
index a4a4b46..f2d2e54 100644 (file)
@@ -109,7 +109,7 @@ int test__thread_map_remove(int subtest __maybe_unused)
        TEST_ASSERT_VAL("failed to allocate thread_map",
                        threads);
 
-       if (verbose)
+       if (verbose > 0)
                thread_map__fprintf(threads, stderr);
 
        TEST_ASSERT_VAL("failed to remove thread",
@@ -117,7 +117,7 @@ int test__thread_map_remove(int subtest __maybe_unused)
 
        TEST_ASSERT_VAL("thread_map count != 1", threads->nr == 1);
 
-       if (verbose)
+       if (verbose > 0)
                thread_map__fprintf(threads, stderr);
 
        TEST_ASSERT_VAL("failed to remove thread",
@@ -125,7 +125,7 @@ int test__thread_map_remove(int subtest __maybe_unused)
 
        TEST_ASSERT_VAL("thread_map count != 0", threads->nr == 0);
 
-       if (verbose)
+       if (verbose > 0)
                thread_map__fprintf(threads, stderr);
 
        TEST_ASSERT_VAL("failed to not remove thread",
index 98fe69a..803f893 100644 (file)
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
        session = perf_session__new(&file, false, NULL);
        TEST_ASSERT_VAL("can't get session", session);
 
-       for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+       for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+               if (!cpu_map__has(map, i))
+                       continue;
                pr_debug("CPU %d, core %d, socket %d\n", i,
                         session->header.env.cpu[i].core_id,
                         session->header.env.cpu[i].socket_id);
index a508233..862b043 100644 (file)
@@ -168,7 +168,7 @@ next_pair:
                err = -1;
        }
 
-       if (!verbose)
+       if (verbose <= 0)
                goto out;
 
        header_printed = false;
index 98a3466..9ce142d 100644 (file)
@@ -73,7 +73,7 @@ static int map_browser__run(struct map_browser *browser)
 
        if (ui_browser__show(&browser->b, browser->map->dso->long_name,
                             "Press ESC to exit, %s / to search",
-                            verbose ? "" : "restart with -v to use") < 0)
+                            verbose > 0 ? "" : "restart with -v to use") < 0)
                return -1;
 
        while (1) {
@@ -81,7 +81,7 @@ static int map_browser__run(struct map_browser *browser)
 
                switch (key) {
                case '/':
-                       if (verbose)
+                       if (verbose > 0)
                                map_browser__search(browser);
                default:
                        break;
@@ -117,7 +117,7 @@ int map__browse(struct map *map)
 
                if (maxaddr < pos->end)
                        maxaddr = pos->end;
-               if (verbose) {
+               if (verbose > 0) {
                        u32 *idx = symbol__browser_index(pos);
                        *idx = mb.b.nr_entries;
                }
index 18cfcdc..5d632dc 100644 (file)
@@ -648,7 +648,7 @@ unsigned int hists__sort_list_width(struct hists *hists)
                ret += fmt->width(fmt, &dummy_hpp, hists);
        }
 
-       if (verbose && hists__has(hists, sym)) /* Addr + origin */
+       if (verbose > 0 && hists__has(hists, sym)) /* Addr + origin */
                ret += 3 + BITS_PER_LONG / 4;
 
        return ret;
index 06cc04e..273f21f 100644 (file)
@@ -1768,7 +1768,7 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
        printf("%-*.*s----\n",
               graph_dotted_len, graph_dotted_len, graph_dotted_line);
 
-       if (verbose)
+       if (verbose > 0)
                symbol__annotate_hits(sym, evsel);
 
        list_for_each_entry(pos, &notes->src->source, node) {
index 8fdee24..eafbf11 100644 (file)
@@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 {
        FILE *fp;
        char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
+       char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path;
        char *token, *saved_ptr = NULL;
-       int found = 0;
 
        fp = fopen("/proc/mounts", "r");
        if (!fp)
@@ -24,31 +24,43 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
         * and inspect every cgroupfs mount point to find one that has
         * perf_event subsystem
         */
+       path_v1[0] = '\0';
+       path_v2[0] = '\0';
+
        while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
                                STR(PATH_MAX)"s %*d %*d\n",
                                mountpoint, type, tokens) == 3) {
 
-               if (!strcmp(type, "cgroup")) {
+               if (!path_v1[0] && !strcmp(type, "cgroup")) {
 
                        token = strtok_r(tokens, ",", &saved_ptr);
 
                        while (token != NULL) {
                                if (!strcmp(token, "perf_event")) {
-                                       found = 1;
+                                       strcpy(path_v1, mountpoint);
                                        break;
                                }
                                token = strtok_r(NULL, ",", &saved_ptr);
                        }
                }
-               if (found)
+
+               if (!path_v2[0] && !strcmp(type, "cgroup2"))
+                       strcpy(path_v2, mountpoint);
+
+               if (path_v1[0] && path_v2[0])
                        break;
        }
        fclose(fp);
-       if (!found)
+
+       if (path_v1[0])
+               path = path_v1;
+       else if (path_v2[0])
+               path = path_v2;
+       else
                return -1;
 
-       if (strlen(mountpoint) < maxlen) {
-               strcpy(buf, mountpoint);
+       if (strlen(path) < maxlen) {
+               strcpy(buf, path);
                return 0;
        }
        return -1;
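
The rewritten scanner records the first cgroup v1 hierarchy whose option tokens include perf_event and, independently, the first cgroup2 mount, preferring v1 when both exist. Representative /proc/mounts input (device, mountpoint, type, options, dump, pass; paths illustrative):

    cgroup /sys/fs/cgroup/perf_event cgroup rw,nosuid,nodev,noexec,relatime,perf_event 0 0
    cgroup2 /sys/fs/cgroup/unified cgroup2 rw,nosuid,nodev,noexec,relatime 0 0

The fscanf() above skips the device field and matches on the mountpoint, type, and options columns; with both lines present, buf receives /sys/fs/cgroup/perf_event.
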
index 2c0b522..8c75049 100644 (file)
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
        /* set up default */
        max_cpu_num = 4096;
+       max_present_cpu_num = 4096;
 
        mnt = sysfs__mountpoint();
        if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
        }
 
        ret = get_max_num(path, &max_cpu_num);
+       if (ret)
+               goto out;
+
+       /* get the highest present cpu number for a sparse allocation */
+       ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+       if (ret == PATH_MAX) {
+               pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+               goto out;
+       }
+
+       ret = get_max_num(path, &max_present_cpu_num);
 
 out:
        if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
        return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+       if (unlikely(!max_present_cpu_num))
+               set_max_cpu_num();
+
+       return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
        if (unlikely(cpunode_map == NULL)) {
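
cpu__max_present_cpu() reads /sys/devices/system/cpu/present, a range list such as "0-3,5,8-9", so sparse topologies size allocations by the highest present CPU rather than the configured count. A sketch of extracting the maximum from such a list (an illustrative stand-in for the get_max_num() helper used above):

    #include <stdio.h>
    #include <stdlib.h>

    /* highest CPU number named in a sysfs list like "0-3,5,8-9" */
    static int max_cpu_from_list(const char *list)
    {
            char *end;
            int max = -1;

            while (*list) {
                    long n = strtol(list, &end, 10);

                    if (end == list)
                            break;
                    if (n > max)
                            max = (int)n;
                    list = end;
                    while (*list == '-' || *list == ',')
                            list++;
            }
            return max;
    }

    int main(void)
    {
            printf("%d\n", max_cpu_from_list("0-3,5,8-9") + 1);  /* 10 */
            return 0;
    }
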
index 06bd689..1a0549a 100644 (file)
@@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void);
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
index c1838b6..03eb81f 100644 (file)
@@ -203,11 +203,28 @@ int perf_debug_option(const char *str)
                v = (v < 0) || (v > 10) ? 0 : v;
        }
 
+       if (quiet)
+               v = -1;
+
        *var->ptr = v;
        free(s);
        return 0;
 }
 
+int perf_quiet_option(void)
+{
+       struct debug_variable *var = &debug_variables[0];
+
+       /* disable all debug messages */
+       while (var->name) {
+               *var->ptr = -1;
+               var++;
+       }
+
+       quiet = true;
+       return 0;
+}
+
 #define DEBUG_WRAPPER(__n, __l)                                \
 static int pr_ ## __n ## _wrapper(const char *fmt, ...)        \
 {                                                      \
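
perf_quiet_option() silences every debug variable by setting it to -1, which is why the many `if (verbose)` tests in the hunks above and below become `if (verbose > 0)`: a plain truth test would treat quiet (-1) the same as verbose. A minimal illustration:

    #include <stdio.h>

    static int verbose;     /* -1 = quiet, 0 = normal, >0 = verbose */

    int main(void)
    {
            verbose = -1;           /* what perf_quiet_option() does */

            if (verbose)            /* wrong: -1 is truthy, so this fires */
                    puts("noisy (bug)");
            if (verbose > 0)        /* right: silent in quiet mode */
                    puts("noisy");
            return 0;
    }
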
index d242adc..98832f5 100644 (file)
@@ -54,5 +54,6 @@ int veprintf(int level, int var, const char *fmt, va_list args);
 
 int perf_debug_option(const char *str);
 void perf_debug_setup(void);
+int perf_quiet_option(void);
 
 #endif /* __PERF_DEBUG_H */
index 28d41e7..d38b62a 100644 (file)
@@ -951,7 +951,7 @@ static struct dso *__dso__findlink_by_longname(struct rb_root *root,
                if (rc == 0) {
                        /*
                         * In case the new DSO is a duplicate of an existing
-                        * one, print an one-time warning & put the new entry
+                        * one, print a one-time warning & put the new entry
                         * at the end of the list of duplicates.
                         */
                        if (!dso || (dso == this))
@@ -1058,7 +1058,7 @@ int dso__name_len(const struct dso *dso)
 {
        if (!dso)
                return strlen("[unknown]");
-       if (verbose)
+       if (verbose > 0)
                return dso->long_name_len;
 
        return dso->short_name_len;
index bb964e8..075fc77 100644 (file)
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
                return 0;
 
        if (env->nr_cpus_avail == 0)
-               env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+               env->nr_cpus_avail = cpu__max_present_cpu();
 
        nr_cpus = env->nr_cpus_avail;
        if (nr_cpus == -1)
index 3d12c16..05714d5 100644 (file)
@@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused,
        u32 nrc, nra;
        int ret;
 
-       nr = sysconf(_SC_NPROCESSORS_CONF);
-       if (nr < 0)
-               return -1;
-
-       nrc = (u32)(nr & UINT_MAX);
+       nrc = cpu__max_present_cpu();
 
        nr = sysconf(_SC_NPROCESSORS_ONLN);
        if (nr < 0)
@@ -505,24 +501,29 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-       struct cpu_topo *tp;
+       struct cpu_topo *tp = NULL;
        void *addr;
        u32 nr, i;
        size_t sz;
        long ncpus;
        int ret = -1;
+       struct cpu_map *map;
 
-       ncpus = sysconf(_SC_NPROCESSORS_CONF);
-       if (ncpus < 0)
+       ncpus = cpu__max_present_cpu();
+
+       /* build online CPU map */
+       map = cpu_map__new(NULL);
+       if (map == NULL) {
+               pr_debug("failed to get system cpumap\n");
                return NULL;
+       }
 
        nr = (u32)(ncpus & UINT_MAX);
 
        sz = nr * sizeof(char *);
-
        addr = calloc(1, sizeof(*tp) + 2 * sz);
        if (!addr)
-               return NULL;
+               goto out_free;
 
        tp = addr;
        tp->cpu_nr = nr;
@@ -532,10 +533,16 @@ static struct cpu_topo *build_cpu_topology(void)
        tp->thread_siblings = addr;
 
        for (i = 0; i < nr; i++) {
+               if (!cpu_map__has(map, i))
+                       continue;
+
                ret = build_cpu_topo(tp, i);
                if (ret < 0)
                        break;
        }
+
+out_free:
+       cpu_map__put(map);
        if (ret) {
                free_cpu_topo(tp);
                tp = NULL;
@@ -1126,7 +1133,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused,
 {
        int nr, i;
        char *str;
-       int cpu_nr = ph->env.nr_cpus_online;
+       int cpu_nr = ph->env.nr_cpus_avail;
 
        nr = ph->env.nr_sibling_cores;
        str = ph->env.sibling_cores;
@@ -1781,7 +1788,7 @@ static int process_cpu_topology(struct perf_file_section *section,
        u32 nr, i;
        char *str;
        struct strbuf sb;
-       int cpu_nr = ph->env.nr_cpus_online;
+       int cpu_nr = ph->env.nr_cpus_avail;
        u64 size = 0;
 
        ph->env.cpu = calloc(cpu_nr, sizeof(*ph->env.cpu));
@@ -1862,7 +1869,7 @@ static int process_cpu_topology(struct perf_file_section *section,
                if (ph->needs_swap)
                        nr = bswap_32(nr);
 
-               if (nr > (u32)cpu_nr) {
+               if (nr != (u32)-1 && nr > (u32)cpu_nr) {
                        pr_debug("socket_id number is too big."
                                 "You may need to upgrade the perf tool.\n");
                        goto free_cpu;
index 32c6a93..eaf72a9 100644 (file)
@@ -69,7 +69,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
         */
        if (h->ms.sym) {
                symlen = h->ms.sym->namelen + 4;
-               if (verbose)
+               if (verbose > 0)
                        symlen += BITS_PER_LONG / 4 + 2 + 3;
                hists__new_col_len(hists, HISTC_SYMBOL, symlen);
        } else {
@@ -93,7 +93,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
        if (h->branch_info) {
                if (h->branch_info->from.sym) {
                        symlen = (int)h->branch_info->from.sym->namelen + 4;
-                       if (verbose)
+                       if (verbose > 0)
                                symlen += BITS_PER_LONG / 4 + 2 + 3;
                        hists__new_col_len(hists, HISTC_SYMBOL_FROM, symlen);
 
@@ -107,7 +107,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 
                if (h->branch_info->to.sym) {
                        symlen = (int)h->branch_info->to.sym->namelen + 4;
-                       if (verbose)
+                       if (verbose > 0)
                                symlen += BITS_PER_LONG / 4 + 2 + 3;
                        hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
 
index 281e44a..67a8aeb 100644 (file)
@@ -2318,24 +2318,20 @@ int parse_events__is_hardcoded_term(struct parse_events_term *term)
        return term->type_term != PARSE_EVENTS__TERM_TYPE_USER;
 }
 
-static int new_term(struct parse_events_term **_term, int type_val,
-                   int type_term, char *config,
-                   char *str, u64 num, int err_term, int err_val)
+static int new_term(struct parse_events_term **_term,
+                   struct parse_events_term *temp,
+                   char *str, u64 num)
 {
        struct parse_events_term *term;
 
-       term = zalloc(sizeof(*term));
+       term = malloc(sizeof(*term));
        if (!term)
                return -ENOMEM;
 
+       *term = *temp;
        INIT_LIST_HEAD(&term->list);
-       term->type_val  = type_val;
-       term->type_term = type_term;
-       term->config = config;
-       term->err_term = err_term;
-       term->err_val  = err_val;
 
-       switch (type_val) {
+       switch (term->type_val) {
        case PARSE_EVENTS__TERM_TYPE_NUM:
                term->val.num = num;
                break;
@@ -2353,15 +2349,22 @@ static int new_term(struct parse_events_term **_term, int type_val,
 
 int parse_events_term__num(struct parse_events_term **term,
                           int type_term, char *config, u64 num,
+                          bool no_value,
                           void *loc_term_, void *loc_val_)
 {
        YYLTYPE *loc_term = loc_term_;
        YYLTYPE *loc_val = loc_val_;
 
-       return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
-                       config, NULL, num,
-                       loc_term ? loc_term->first_column : 0,
-                       loc_val ? loc_val->first_column : 0);
+       struct parse_events_term temp = {
+               .type_val  = PARSE_EVENTS__TERM_TYPE_NUM,
+               .type_term = type_term,
+               .config    = config,
+               .no_value  = no_value,
+               .err_term  = loc_term ? loc_term->first_column : 0,
+               .err_val   = loc_val  ? loc_val->first_column  : 0,
+       };
+
+       return new_term(term, &temp, NULL, num);
 }
 
 int parse_events_term__str(struct parse_events_term **term,
@@ -2371,37 +2374,45 @@ int parse_events_term__str(struct parse_events_term **term,
        YYLTYPE *loc_term = loc_term_;
        YYLTYPE *loc_val = loc_val_;
 
-       return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term,
-                       config, str, 0,
-                       loc_term ? loc_term->first_column : 0,
-                       loc_val ? loc_val->first_column : 0);
+       struct parse_events_term temp = {
+               .type_val  = PARSE_EVENTS__TERM_TYPE_STR,
+               .type_term = type_term,
+               .config    = config,
+               .err_term  = loc_term ? loc_term->first_column : 0,
+               .err_val   = loc_val  ? loc_val->first_column  : 0,
+       };
+
+       return new_term(term, &temp, str, 0);
 }
 
 int parse_events_term__sym_hw(struct parse_events_term **term,
                              char *config, unsigned idx)
 {
        struct event_symbol *sym;
+       struct parse_events_term temp = {
+               .type_val  = PARSE_EVENTS__TERM_TYPE_STR,
+               .type_term = PARSE_EVENTS__TERM_TYPE_USER,
+               .config    = config ?: (char *) "event",
+       };
 
        BUG_ON(idx >= PERF_COUNT_HW_MAX);
        sym = &event_symbols_hw[idx];
 
-       if (config)
-               return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
-                               PARSE_EVENTS__TERM_TYPE_USER, config,
-                               (char *) sym->symbol, 0, 0, 0);
-       else
-               return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
-                               PARSE_EVENTS__TERM_TYPE_USER,
-                               (char *) "event", (char *) sym->symbol,
-                               0, 0, 0);
+       return new_term(term, &temp, (char *) sym->symbol, 0);
 }
 
 int parse_events_term__clone(struct parse_events_term **new,
                             struct parse_events_term *term)
 {
-       return new_term(new, term->type_val, term->type_term, term->config,
-                       term->val.str, term->val.num,
-                       term->err_term, term->err_val);
+       struct parse_events_term temp = {
+               .type_val  = term->type_val,
+               .type_term = term->type_term,
+               .config    = term->config,
+               .err_term  = term->err_term,
+               .err_val   = term->err_val,
+       };
+
+       return new_term(new, &temp, term->val.str, term->val.num);
 }
 
 void parse_events_terms__purge(struct list_head *terms)
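With the hunks above applied, new_term() collapses to copying a caller-built
template and then filling in the value union. A minimal sketch of the
resulting function, reconstructed from the visible context and therefore
approximate (it assumes parse-events.h definitions and the usual malloc
error path):

    static int new_term(struct parse_events_term **_term,
                        struct parse_events_term *temp,
                        char *str, u64 num)
    {
            struct parse_events_term *term = malloc(sizeof(*term));

            if (!term)
                    return -ENOMEM;

            *term = *temp;                  /* type_val, type_term, config, err_* */
            INIT_LIST_HEAD(&term->list);

            switch (term->type_val) {
            case PARSE_EVENTS__TERM_TYPE_NUM:
                    term->val.num = num;
                    break;
            case PARSE_EVENTS__TERM_TYPE_STR:
                    term->val.str = str;
                    break;
            default:
                    free(term);
                    return -EINVAL;
            }

            *_term = term;
            return 0;
    }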
index da246a3..1af6a26 100644 (file)
@@ -94,6 +94,7 @@ struct parse_events_term {
        int type_term;
        struct list_head list;
        bool used;
+       bool no_value;
 
        /* error string indexes for within parsed string */
        int err_term;
@@ -122,6 +123,7 @@ void parse_events__shrink_config_terms(void);
 int parse_events__is_hardcoded_term(struct parse_events_term *term);
 int parse_events_term__num(struct parse_events_term **term,
                           int type_term, char *config, u64 num,
+                          bool no_value,
                           void *loc_term, void *loc_val);
 int parse_events_term__str(struct parse_events_term **term,
                           int type_term, char *config, char *str,
index a14b47a..30f018e 100644 (file)
@@ -252,7 +252,7 @@ PE_KERNEL_PMU_EVENT sep_dc
                        if (!strcasecmp(alias->name, $1)) {
                                ALLOC_LIST(head);
                                ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, 1, &@1, NULL));
+                                       $1, 1, false, &@1, NULL));
                                list_add_tail(&term->list, head);
 
                                if (!parse_events_add_pmu(data, list,
@@ -282,7 +282,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 
        ALLOC_LIST(head);
        ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       &pmu_name, 1, &@1, NULL));
+                                       &pmu_name, 1, false, &@1, NULL));
        list_add_tail(&term->list, head);
 
        ALLOC_LIST(list);
@@ -548,7 +548,7 @@ PE_NAME '=' PE_VALUE
        struct parse_events_term *term;
 
        ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, $3, &@1, &@3));
+                                       $1, $3, false, &@1, &@3));
        $$ = term;
 }
 |
@@ -566,7 +566,7 @@ PE_NAME
        struct parse_events_term *term;
 
        ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, 1, &@1, NULL));
+                                       $1, 1, true, &@1, NULL));
        $$ = term;
 }
 |
@@ -591,7 +591,7 @@ PE_TERM '=' PE_VALUE
 {
        struct parse_events_term *term;
 
-       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, &@1, &@3));
+       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, false, &@1, &@3));
        $$ = term;
 }
 |
@@ -599,7 +599,7 @@ PE_TERM
 {
        struct parse_events_term *term;
 
-       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
+       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, true, &@1, NULL));
        $$ = term;
 }
 |
@@ -620,7 +620,7 @@ PE_NAME array '=' PE_VALUE
        struct parse_events_term *term;
 
        ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, $4, &@1, &@4));
+                                       $1, $4, false, &@1, &@4));
        term->array = $2;
        $$ = term;
 }
index 49bfee0..12f84dd 100644 (file)
@@ -745,7 +745,7 @@ static int pmu_resolve_param_term(struct parse_events_term *term,
                }
        }
 
-       if (verbose)
+       if (verbose > 0)
                printf("Required parameter '%s' not specified\n", term->config);
 
        return -1;
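The verbose comparisons rewritten here (and in sort.c, stat.c, symbol-elf.c
and the other hunks in this series) assume verbose is a signed level that a
quiet option can push below zero, so a plain truthiness test would still
print in quiet mode. A standalone illustration:

    #include <stdio.h>

    static int verbose = -1;        /* assumed effect of a quiet option */

    int main(void)
    {
            if (verbose)            /* old test: -1 is truthy, fires when quiet */
                    puts("old check fires");
            if (verbose > 0)        /* new test: silent unless -v raised the level */
                    puts("new check fires");
            return 0;
    }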
@@ -803,7 +803,7 @@ static int pmu_config_term(struct list_head *formats,
 
        format = pmu_find_format(formats, term->config);
        if (!format) {
-               if (verbose)
+               if (verbose > 0)
                        printf("Invalid event/parameter '%s'\n", term->config);
                if (err) {
                        char *pmu_term = pmu_formats_string(formats);
@@ -834,11 +834,20 @@ static int pmu_config_term(struct list_head *formats,
         * Either directly use a numeric term, or try to translate string terms
         * using event parameters.
         */
-       if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM)
+       if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) {
+               if (term->no_value &&
+                   bitmap_weight(format->bits, PERF_PMU_FORMAT_BITS) > 1) {
+                       if (err) {
+                               err->idx = term->err_val;
+                               err->str = strdup("no value assigned for term");
+                       }
+                       return -EINVAL;
+               }
+
                val = term->val.num;
-       else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
+       } else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
                if (strcmp(term->val.str, "?")) {
-                       if (verbose) {
+                       if (verbose > 0) {
                                pr_info("Invalid sysfs entry %s=%s\n",
                                                term->config, term->val.str);
                        }
@@ -1223,7 +1232,7 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
                        printf("%*s", 8, "[");
                        wordwrap(aliases[j].desc, 8, columns, 0);
                        printf("]\n");
-                       if (verbose)
+                       if (verbose > 0)
                                printf("%*s%s/%s/\n", 8, "", aliases[j].pmu, aliases[j].str);
                } else
                        printf("  %-50s [Kernel PMU event]\n", aliases[j].name);
index 35f5b7b..28fb62c 100644 (file)
@@ -594,7 +594,7 @@ static int find_perf_probe_point_from_dwarf(struct probe_trace_point *tp,
        pr_debug("try to find information at %" PRIx64 " in %s\n", addr,
                 tp->module ? : "kernel");
 
-       dinfo = debuginfo_cache__open(tp->module, verbose == 0);
+       dinfo = debuginfo_cache__open(tp->module, verbose <= 0);
        if (dinfo)
                ret = debuginfo__find_probe_point(dinfo,
                                                 (unsigned long)addr, pp);
index 0d9d6e0..57cd268 100644 (file)
@@ -464,7 +464,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
                /* Verify it is a data structure  */
                tag = dwarf_tag(&type);
                if (tag != DW_TAG_structure_type && tag != DW_TAG_union_type) {
-                       pr_warning("%s is not a data structure nor an union.\n",
+                       pr_warning("%s is not a data structure nor a union.\n",
                                   varname);
                        return -EINVAL;
                }
@@ -479,7 +479,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
        } else {
                /* Verify it is a data structure  */
                if (tag != DW_TAG_structure_type && tag != DW_TAG_union_type) {
-                       pr_warning("%s is not a data structure nor an union.\n",
+                       pr_warning("%s is not a data structure nor a union.\n",
                                   varname);
                        return -EINVAL;
                }
index 581e0ef..783326c 100644 (file)
@@ -369,10 +369,10 @@ static PyObject *python_process_callchain(struct perf_sample *sample,
                if (node->map) {
                        struct map *map = node->map;
                        const char *dsoname = "[unknown]";
-                       if (map && map->dso && (map->dso->name || map->dso->long_name)) {
+                       if (map && map->dso) {
                                if (symbol_conf.show_kernel_path && map->dso->long_name)
                                        dsoname = map->dso->long_name;
-                               else if (map->dso->name)
+                               else
                                        dsoname = map->dso->name;
                        }
                        pydict_set_item_string_decref(pyelem, "dso",
index 4cdbc8f..1dd617d 100644 (file)
@@ -932,7 +932,7 @@ static void branch_stack__printf(struct perf_sample *sample)
 
                printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n",
                        i, e->from, e->to,
-                       e->flags.cycles,
+                       (unsigned short)e->flags.cycles,
                        e->flags.mispred ? "M" : " ",
                        e->flags.predicted ? "P" : " ",
                        e->flags.abort ? "A" : " ",
index c868098..af415fe 100644 (file)
@@ -1,8 +1,15 @@
 #!/usr/bin/python2
 
-from distutils.core import setup, Extension
 from os import getenv
 
+cc = getenv("CC")
+if cc == "clang":
+    from _sysconfigdata import build_time_vars
+    from re import sub
+    build_time_vars["CFLAGS"] = sub("-specs=[^ ]+", "", build_time_vars["CFLAGS"])
+
+from distutils.core import setup, Extension
+
 from distutils.command.build_ext   import build_ext   as _build_ext
 from distutils.command.install_lib import install_lib as _install_lib
 
index df622f4..0ff6222 100644 (file)
@@ -151,7 +151,7 @@ static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
        if (!dso_l || !dso_r)
                return cmp_null(dso_r, dso_l);
 
-       if (verbose) {
+       if (verbose > 0) {
                dso_name_l = dso_l->long_name;
                dso_name_r = dso_r->long_name;
        } else {
@@ -172,8 +172,8 @@ static int _hist_entry__dso_snprintf(struct map *map, char *bf,
                                     size_t size, unsigned int width)
 {
        if (map && map->dso) {
-               const char *dso_name = !verbose ? map->dso->short_name :
-                       map->dso->long_name;
+               const char *dso_name = verbose > 0 ? map->dso->long_name :
+                       map->dso->short_name;
                return repsep_snprintf(bf, size, "%-*.*s", width, width, dso_name);
        }
 
@@ -261,7 +261,7 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
 {
        size_t ret = 0;
 
-       if (verbose) {
+       if (verbose > 0) {
                char o = map ? dso__symtab_origin(map->dso) : '!';
                ret += repsep_snprintf(bf, size, "%-#*llx %c ",
                                       BITS_PER_LONG / 4 + 2, ip, o);
index 7aff317..796c847 100644 (file)
@@ -108,7 +108,7 @@ struct hist_entry {
                /*
                 * Since perf diff only supports the stdio output, TUI
                 * fields are only accessed from perf report (or perf
-                * top).  So make it an union to reduce memory usage.
+                * top).  So make it a union to reduce memory usage.
                 */
                struct hist_entry_diff  diff;
                struct /* for TUI */ {
index 39345c2..0d51334 100644 (file)
@@ -344,7 +344,7 @@ int perf_stat_process_counter(struct perf_stat_config *config,
        for (i = 0; i < 3; i++)
                update_stats(&ps->res_stats[i], count[i]);
 
-       if (verbose) {
+       if (verbose > 0) {
                fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
                        perf_evsel__name(counter), count[0], count[1], count[2]);
        }
index adbc6c0..4e59dde 100644 (file)
@@ -213,7 +213,7 @@ static bool want_demangle(bool is_kernel_sym)
 
 static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name)
 {
-       int demangle_flags = verbose ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS;
+       int demangle_flags = verbose > 0 ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS;
        char *demangled = NULL;
 
        /*
index be93ab0..6e4eb2f 100755 (executable)
@@ -179,6 +179,7 @@ my $localversion;
 my $iteration = 0;
 my $successes = 0;
 my $stty_orig;
+my $run_command_status = 0;
 
 my $bisect_good;
 my $bisect_bad;
@@ -1325,26 +1326,44 @@ sub wait_for_monitor;
 
 sub reboot {
     my ($time) = @_;
+    my $powercycle = 0;
 
-    # Make sure everything has been written to disk
-    run_ssh("sync");
+    # test whether the machine can be reached over ssh within 5 seconds
+    my $stat = run_ssh("echo check machine status", 5);
+    if (!$stat) {
+       doprint("power cycle\n");
+       $powercycle = 1;
+    }
+
+    if ($powercycle) {
+       run_command "$power_cycle";
 
-    if (defined($time)) {
        start_monitor;
        # flush out current monitor
        # May contain the reboot success line
        wait_for_monitor 1;
-    }
 
-    # try to reboot normally
-    if (run_command $reboot) {
-       if (defined($powercycle_after_reboot)) {
-           sleep $powercycle_after_reboot;
+    } else {
+       # Make sure everything has been written to disk
+       run_ssh("sync");
+
+       if (defined($time)) {
+           start_monitor;
+           # flush out current monitor
+           # May contain the reboot success line
+           wait_for_monitor 1;
+       }
+
+       # try to reboot normally
+       if (run_command $reboot) {
+           if (defined($powercycle_after_reboot)) {
+               sleep $powercycle_after_reboot;
+               run_command "$power_cycle";
+           }
+       } else {
+           # nope? power cycle it.
            run_command "$power_cycle";
        }
-    } else {
-       # nope? power cycle it.
-       run_command "$power_cycle";
     }
 
     if (defined($time)) {
@@ -1412,6 +1431,10 @@ sub dodie {
            system("stty $stty_orig");
     }
 
+    if (defined($post_test)) {
+       run_command $post_test;
+    }
+
     die @_, "\n";
 }
 
@@ -1624,10 +1647,6 @@ sub save_logs {
 
 sub fail {
 
-       if (defined($post_test)) {
-               run_command $post_test;
-       }
-
        if ($die_on_failure) {
                dodie @_;
        }
@@ -1660,23 +1679,26 @@ sub fail {
            save_logs "fail", $store_failures;
         }
 
+       if (defined($post_test)) {
+               run_command $post_test;
+       }
+
        return 1;
 }
 
 sub run_command {
-    my ($command, $redirect) = @_;
+    my ($command, $redirect, $timeout) = @_;
     my $start_time;
     my $end_time;
     my $dolog = 0;
     my $dord = 0;
     my $pid;
 
-    $start_time = time;
-
     $command =~ s/\$SSH_USER/$ssh_user/g;
     $command =~ s/\$MACHINE/$machine/g;
 
     doprint("$command ... ");
+    $start_time = time;
 
     $pid = open(CMD, "$command 2>&1 |") or
        (fail "unable to exec $command" and return 0);
@@ -1693,13 +1715,30 @@ sub run_command {
        $dord = 1;
     }
 
-    while (<CMD>) {
-       print LOG if ($dolog);
-       print RD  if ($dord);
+    my $hit_timeout = 0;
+
+    while (1) {
+       my $fp = \*CMD;
+       if (defined($timeout)) {
+           doprint "timeout = $timeout\n";
+       }
+       my $line = wait_for_input($fp, $timeout);
+       if (!defined($line)) {
+           my $now = time;
+           if (defined($timeout) && (($now - $start_time) >= $timeout)) {
+               doprint "Hit timeout of $timeout, killing process\n";
+               $hit_timeout = 1;
+               kill 9, $pid;
+           }
+           last;
+       }
+       print LOG $line if ($dolog);
+       print RD $line if ($dord);
     }
 
     waitpid($pid, 0);
-    my $failed = $?;
+    # shift 8 for real exit status
+    $run_command_status = $? >> 8;
 
     close(CMD);
     close(LOG) if ($dolog);
@@ -1714,21 +1753,25 @@ sub run_command {
        doprint "[$delta seconds] ";
     }
 
-    if ($failed) {
+    if ($hit_timeout) {
+       $run_command_status = 1;
+    }
+
+    if ($run_command_status) {
        doprint "FAILED!\n";
     } else {
        doprint "SUCCESS\n";
     }
 
-    return !$failed;
+    return !$run_command_status;
 }
 
 sub run_ssh {
-    my ($cmd) = @_;
+    my ($cmd, $timeout) = @_;
     my $cp_exec = $ssh_exec;
 
     $cp_exec =~ s/\$SSH_COMMAND/$cmd/g;
-    return run_command "$cp_exec";
+    return run_command "$cp_exec", undef , $timeout;
 }
 
 sub run_scp {
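wait_for_input() used in the loop above is assumed to be ktest's
select()-style timed read on the command's pipe; returning nothing on
timeout is what lets run_command() kill the child. The equivalent
pattern, sketched in C:

    #include <poll.h>
    #include <unistd.h>

    /* read up to len bytes, giving up after timeout_ms; 0 on timeout
     * or EOF tells the caller the command produced nothing in time */
    static ssize_t read_with_timeout(int fd, char *buf, size_t len,
                                     int timeout_ms)
    {
            struct pollfd pfd = { .fd = fd, .events = POLLIN };

            if (poll(&pfd, 1, timeout_ms) <= 0)
                    return 0;
            return read(fd, buf, len);
    }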
@@ -2489,10 +2532,6 @@ sub halt {
 sub success {
     my ($i) = @_;
 
-    if (defined($post_test)) {
-       run_command $post_test;
-    }
-
     $successes++;
 
     my $name = "";
@@ -2517,6 +2556,10 @@ sub success {
        doprint "Reboot and wait $sleep_time seconds\n";
        reboot_to_good $sleep_time;
     }
+
+    if (defined($post_test)) {
+       run_command $post_test;
+    }
 }
 
 sub answer_bisect {
@@ -2537,16 +2580,15 @@ sub answer_bisect {
 }
 
 sub child_run_test {
-    my $failed = 0;
 
     # child should have no power
     $reboot_on_error = 0;
     $poweroff_on_error = 0;
     $die_on_failure = 1;
 
-    run_command $run_test, $testlog or $failed = 1;
+    run_command $run_test, $testlog;
 
-    exit $failed;
+    exit $run_command_status;
 }
 
 my $child_done;
@@ -2629,7 +2671,7 @@ sub do_run_test {
     }
 
     waitpid $child_pid, 0;
-    $child_exit = $?;
+    $child_exit = $? >> 8;
 
     my $end_time = time;
     $test_time = $end_time - $start_time;
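The "$? >> 8" extractions above follow POSIX wait-status encoding, where
the exit code lives in the high byte of the raw status. In C the same
decoding is spelled WEXITSTATUS():

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/wait.h>

    int main(void)
    {
            int status = system("exit 3");  /* raw wait status, not the code */

            if (WIFEXITED(status))
                    /* prints 3; equivalent to (status >> 8) & 0xff on Linux */
                    printf("exit code = %d\n", WEXITSTATUS(status));
            return 0;
    }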
@@ -3330,7 +3372,6 @@ sub config_bisect {
     save_config \%good_configs, $good_config;
     save_config \%bad_configs, $bad_config;
 
-
     if (defined($config_bisect_check) && $config_bisect_check ne "0") {
        if ($config_bisect_check ne "good") {
            doprint "Testing bad config\n";
index 11d888c..d4706c0 100644 (file)
@@ -1,2 +1,6 @@
+generated/map-shift.h
+idr.c
+idr-test
 main
+multiorder
 radix-tree.c
index 3635e4d..f11315b 100644 (file)
@@ -1,29 +1,47 @@
 
-CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE
+CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE -fsanitize=address
 LDFLAGS += -lpthread -lurcu
-TARGETS = main
-OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \
-        regression1.o regression2.o regression3.o multiorder.o \
-        iteration_check.o benchmark.o
+TARGETS = main idr-test multiorder
+CORE_OFILES := radix-tree.o idr.o linux.o test.o find_bit.o
+OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
+        tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o
 
-ifdef BENCHMARK
-       CFLAGS += -DBENCHMARK=1
+ifndef SHIFT
+       SHIFT=3
 endif
 
-targets: $(TARGETS)
+targets: mapshift $(TARGETS)
 
 main:  $(OFILES)
-       $(CC) $(CFLAGS) $(LDFLAGS) $(OFILES) -o main
+       $(CC) $(CFLAGS) $(LDFLAGS) $^ -o main
+
+idr-test: idr-test.o $(CORE_OFILES)
+       $(CC) $(CFLAGS) $(LDFLAGS) $^ -o idr-test
+
+multiorder: multiorder.o $(CORE_OFILES)
+       $(CC) $(CFLAGS) $(LDFLAGS) $^ -o multiorder
 
 clean:
-       $(RM) -f $(TARGETS) *.o radix-tree.c
+       $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h
 
-find_next_bit.o: ../../lib/find_bit.c
-       $(CC) $(CFLAGS) -c -o $@ $<
+vpath %.c ../../lib
 
-$(OFILES): *.h */*.h \
+$(OFILES): *.h */*.h generated/map-shift.h \
        ../../include/linux/*.h \
-       ../../../include/linux/radix-tree.h
+       ../../include/asm/*.h \
+       ../../../include/linux/radix-tree.h \
+       ../../../include/linux/idr.h
 
 radix-tree.c: ../../../lib/radix-tree.c
        sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
+
+idr.c: ../../../lib/idr.c
+       sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
+
+.PHONY: mapshift
+
+mapshift:
+       @if ! grep -qws $(SHIFT) generated/map-shift.h; then            \
+               echo "#define RADIX_TREE_MAP_SHIFT $(SHIFT)" >          \
+                               generated/map-shift.h;                  \
+       fi
index 215ca86..9b09ddf 100644 (file)
@@ -71,7 +71,7 @@ static void benchmark_size(unsigned long size, unsigned long step, int order)
        tagged = benchmark_iter(&tree, true);
        normal = benchmark_iter(&tree, false);
 
-       printf("Size %ld, step %6ld, order %d tagged %10lld ns, normal %10lld ns\n",
+       printv(2, "Size %ld, step %6ld, order %d tagged %10lld ns, normal %10lld ns\n",
                size, step, order, tagged, normal);
 
        item_kill_tree(&tree);
@@ -85,8 +85,8 @@ void benchmark(void)
                                128, 256, 512, 12345, 0};
        int c, s;
 
-       printf("starting benchmarks\n");
-       printf("RADIX_TREE_MAP_SHIFT = %d\n", RADIX_TREE_MAP_SHIFT);
+       printv(1, "starting benchmarks\n");
+       printv(1, "RADIX_TREE_MAP_SHIFT = %d\n", RADIX_TREE_MAP_SHIFT);
 
        for (c = 0; size[c]; c++)
                for (s = 0; step[s]; s++)
index ad18cf5..cf88dc5 100644 (file)
@@ -1,3 +1 @@
 #define CONFIG_RADIX_TREE_MULTIORDER 1
-#define CONFIG_SHMEM 1
-#define CONFIG_SWAP 1
diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c
new file mode 100644 (file)
index 0000000..a26098c
--- /dev/null
@@ -0,0 +1,444 @@
+/*
+ * idr-test.c: Test the IDR API
+ * Copyright (c) 2016 Matthew Wilcox <willy@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/bitmap.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include "test.h"
+
+#define DUMMY_PTR      ((void *)0x12)
+
+int item_idr_free(int id, void *p, void *data)
+{
+       struct item *item = p;
+       assert(item->index == id);
+       free(p);
+
+       return 0;
+}
+
+void item_idr_remove(struct idr *idr, int id)
+{
+       struct item *item = idr_find(idr, id);
+       assert(item->index == id);
+       idr_remove(idr, id);
+       free(item);
+}
+
+void idr_alloc_test(void)
+{
+       unsigned long i;
+       DEFINE_IDR(idr);
+
+       assert(idr_alloc_cyclic(&idr, DUMMY_PTR, 0, 0x4000, GFP_KERNEL) == 0);
+       assert(idr_alloc_cyclic(&idr, DUMMY_PTR, 0x3ffd, 0x4000, GFP_KERNEL) == 0x3ffd);
+       idr_remove(&idr, 0x3ffd);
+       idr_remove(&idr, 0);
+
+       for (i = 0x3ffe; i < 0x4003; i++) {
+               int id;
+               struct item *item;
+
+               if (i < 0x4000)
+                       item = item_create(i, 0);
+               else
+                       item = item_create(i - 0x3fff, 0);
+
+               id = idr_alloc_cyclic(&idr, item, 1, 0x4000, GFP_KERNEL);
+               assert(id == item->index);
+       }
+
+       idr_for_each(&idr, item_idr_free, &idr);
+       idr_destroy(&idr);
+}
+
+void idr_replace_test(void)
+{
+       DEFINE_IDR(idr);
+
+       idr_alloc(&idr, (void *)-1, 10, 11, GFP_KERNEL);
+       idr_replace(&idr, &idr, 10);
+
+       idr_destroy(&idr);
+}
+
+/*
+ * Unlike the radix tree, you can put a NULL pointer -- with care -- into
+ * the IDR.  Some interfaces, like idr_find(), do not distinguish between
+ * "present, value is NULL" and "not present", but that's exactly what some
+ * users want.
+ */
+void idr_null_test(void)
+{
+       int i;
+       DEFINE_IDR(idr);
+
+       assert(idr_is_empty(&idr));
+
+       assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == 0);
+       assert(!idr_is_empty(&idr));
+       idr_remove(&idr, 0);
+       assert(idr_is_empty(&idr));
+
+       assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == 0);
+       assert(!idr_is_empty(&idr));
+       idr_destroy(&idr);
+       assert(idr_is_empty(&idr));
+
+       for (i = 0; i < 10; i++) {
+               assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == i);
+       }
+
+       assert(idr_replace(&idr, DUMMY_PTR, 3) == NULL);
+       assert(idr_replace(&idr, DUMMY_PTR, 4) == NULL);
+       assert(idr_replace(&idr, NULL, 4) == DUMMY_PTR);
+       assert(idr_replace(&idr, DUMMY_PTR, 11) == ERR_PTR(-ENOENT));
+       idr_remove(&idr, 5);
+       assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == 5);
+       idr_remove(&idr, 5);
+
+       for (i = 0; i < 9; i++) {
+               idr_remove(&idr, i);
+               assert(!idr_is_empty(&idr));
+       }
+       idr_remove(&idr, 8);
+       assert(!idr_is_empty(&idr));
+       idr_remove(&idr, 9);
+       assert(idr_is_empty(&idr));
+
+       assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == 0);
+       assert(idr_replace(&idr, DUMMY_PTR, 3) == ERR_PTR(-ENOENT));
+       assert(idr_replace(&idr, DUMMY_PTR, 0) == NULL);
+       assert(idr_replace(&idr, NULL, 0) == DUMMY_PTR);
+
+       idr_destroy(&idr);
+       assert(idr_is_empty(&idr));
+
+       for (i = 1; i < 10; i++) {
+               assert(idr_alloc(&idr, NULL, 1, 0, GFP_KERNEL) == i);
+       }
+
+       idr_destroy(&idr);
+       assert(idr_is_empty(&idr));
+}
+
+void idr_nowait_test(void)
+{
+       unsigned int i;
+       DEFINE_IDR(idr);
+
+       idr_preload(GFP_KERNEL);
+
+       for (i = 0; i < 3; i++) {
+               struct item *item = item_create(i, 0);
+               assert(idr_alloc(&idr, item, i, i + 1, GFP_NOWAIT) == i);
+       }
+
+       idr_preload_end();
+
+       idr_for_each(&idr, item_idr_free, &idr);
+       idr_destroy(&idr);
+}
+
+void idr_checks(void)
+{
+       unsigned long i;
+       DEFINE_IDR(idr);
+
+       for (i = 0; i < 10000; i++) {
+               struct item *item = item_create(i, 0);
+               assert(idr_alloc(&idr, item, 0, 20000, GFP_KERNEL) == i);
+       }
+
+       assert(idr_alloc(&idr, DUMMY_PTR, 5, 30, GFP_KERNEL) < 0);
+
+       for (i = 0; i < 5000; i++)
+               item_idr_remove(&idr, i);
+
+       idr_remove(&idr, 3);
+
+       idr_for_each(&idr, item_idr_free, &idr);
+       idr_destroy(&idr);
+
+       assert(idr_is_empty(&idr));
+
+       idr_remove(&idr, 3);
+       idr_remove(&idr, 0);
+
+       for (i = INT_MAX - 3UL; i < INT_MAX + 1UL; i++) {
+               struct item *item = item_create(i, 0);
+               assert(idr_alloc(&idr, item, i, i + 10, GFP_KERNEL) == i);
+       }
+       assert(idr_alloc(&idr, DUMMY_PTR, i - 2, i, GFP_KERNEL) == -ENOSPC);
+
+       idr_for_each(&idr, item_idr_free, &idr);
+       idr_destroy(&idr);
+       idr_destroy(&idr);
+
+       assert(idr_is_empty(&idr));
+
+       for (i = 1; i < 10000; i++) {
+               struct item *item = item_create(i, 0);
+               assert(idr_alloc(&idr, item, 1, 20000, GFP_KERNEL) == i);
+       }
+
+       idr_for_each(&idr, item_idr_free, &idr);
+       idr_destroy(&idr);
+
+       idr_replace_test();
+       idr_alloc_test();
+       idr_null_test();
+       idr_nowait_test();
+}
+
+/*
+ * Check that we get the correct error when we run out of memory doing
+ * allocations.  To ensure we run out of memory, just "forget" to preload.
+ * The first test is for not having a bitmap available, and the second test
+ * is for not being able to allocate a level of the radix tree.
+ */
+void ida_check_nomem(void)
+{
+       DEFINE_IDA(ida);
+       int id, err;
+
+       err = ida_get_new_above(&ida, 256, &id);
+       assert(err == -EAGAIN);
+       err = ida_get_new_above(&ida, 1UL << 30, &id);
+       assert(err == -EAGAIN);
+}
+
+/*
+ * Check what happens when we fill a leaf and then delete it.  This may
+ * discover mishandling of IDR_FREE.
+ */
+void ida_check_leaf(void)
+{
+       DEFINE_IDA(ida);
+       int id;
+       unsigned long i;
+
+       for (i = 0; i < IDA_BITMAP_BITS; i++) {
+               assert(ida_pre_get(&ida, GFP_KERNEL));
+               assert(!ida_get_new(&ida, &id));
+               assert(id == i);
+       }
+
+       ida_destroy(&ida);
+       assert(ida_is_empty(&ida));
+
+       assert(ida_pre_get(&ida, GFP_KERNEL));
+       assert(!ida_get_new(&ida, &id));
+       assert(id == 0);
+       ida_destroy(&ida);
+       assert(ida_is_empty(&ida));
+}
+
+/*
+ * Check handling of conversions between exceptional entries and full bitmaps.
+ */
+void ida_check_conv(void)
+{
+       DEFINE_IDA(ida);
+       int id;
+       unsigned long i;
+
+       for (i = 0; i < IDA_BITMAP_BITS * 2; i += IDA_BITMAP_BITS) {
+               assert(ida_pre_get(&ida, GFP_KERNEL));
+               assert(!ida_get_new_above(&ida, i + 1, &id));
+               assert(id == i + 1);
+               assert(!ida_get_new_above(&ida, i + BITS_PER_LONG, &id));
+               assert(id == i + BITS_PER_LONG);
+               ida_remove(&ida, i + 1);
+               ida_remove(&ida, i + BITS_PER_LONG);
+               assert(ida_is_empty(&ida));
+       }
+
+       assert(ida_pre_get(&ida, GFP_KERNEL));
+
+       for (i = 0; i < IDA_BITMAP_BITS * 2; i++) {
+               assert(ida_pre_get(&ida, GFP_KERNEL));
+               assert(!ida_get_new(&ida, &id));
+               assert(id == i);
+       }
+
+       for (i = IDA_BITMAP_BITS * 2; i > 0; i--) {
+               ida_remove(&ida, i - 1);
+       }
+       assert(ida_is_empty(&ida));
+
+       for (i = 0; i < IDA_BITMAP_BITS + BITS_PER_LONG - 4; i++) {
+               assert(ida_pre_get(&ida, GFP_KERNEL));
+               assert(!ida_get_new(&ida, &id));
+               assert(id == i);
+       }
+
+       for (i = IDA_BITMAP_BITS + BITS_PER_LONG - 4; i > 0; i--) {
+               ida_remove(&ida, i - 1);
+       }
+       assert(ida_is_empty(&ida));
+
+       radix_tree_cpu_dead(1);
+       for (i = 0; i < 1000000; i++) {
+               int err = ida_get_new(&ida, &id);
+               if (err == -EAGAIN) {
+                       assert((i % IDA_BITMAP_BITS) == (BITS_PER_LONG - 2));
+                       assert(ida_pre_get(&ida, GFP_KERNEL));
+                       err = ida_get_new(&ida, &id);
+               } else {
+                       assert((i % IDA_BITMAP_BITS) != (BITS_PER_LONG - 2));
+               }
+               assert(!err);
+               assert(id == i);
+       }
+       ida_destroy(&ida);
+}
+
+/*
+ * Check allocations up to and slightly above the maximum allowed (2^31-1) ID.
+ * Allocating up to 2^31-1 should succeed, and then allocating the next one
+ * should fail.
+ */
+void ida_check_max(void)
+{
+       DEFINE_IDA(ida);
+       int id, err;
+       unsigned long i, j;
+
+       for (j = 1; j < 65537; j *= 2) {
+               unsigned long base = (1UL << 31) - j;
+               for (i = 0; i < j; i++) {
+                       assert(ida_pre_get(&ida, GFP_KERNEL));
+                       assert(!ida_get_new_above(&ida, base, &id));
+                       assert(id == base + i);
+               }
+               assert(ida_pre_get(&ida, GFP_KERNEL));
+               err = ida_get_new_above(&ida, base, &id);
+               assert(err == -ENOSPC);
+               ida_destroy(&ida);
+               assert(ida_is_empty(&ida));
+               rcu_barrier();
+       }
+}
+
+void ida_check_random(void)
+{
+       DEFINE_IDA(ida);
+       DECLARE_BITMAP(bitmap, 2048);
+       int id;
+       unsigned int i;
+       time_t s = time(NULL);
+
+ repeat:
+       memset(bitmap, 0, sizeof(bitmap));
+       for (i = 0; i < 100000; i++) {
+               int i = rand();
+               int bit = i & 2047;
+               if (test_bit(bit, bitmap)) {
+                       __clear_bit(bit, bitmap);
+                       ida_remove(&ida, bit);
+               } else {
+                       __set_bit(bit, bitmap);
+                       ida_pre_get(&ida, GFP_KERNEL);
+                       assert(!ida_get_new_above(&ida, bit, &id));
+                       assert(id == bit);
+               }
+       }
+       ida_destroy(&ida);
+       if (time(NULL) < s + 10)
+               goto repeat;
+}
+
+void ida_checks(void)
+{
+       DEFINE_IDA(ida);
+       int id;
+       unsigned long i;
+
+       radix_tree_cpu_dead(1);
+       ida_check_nomem();
+
+       for (i = 0; i < 10000; i++) {
+               assert(ida_pre_get(&ida, GFP_KERNEL));
+               assert(!ida_get_new(&ida, &id));
+               assert(id == i);
+       }
+
+       ida_remove(&ida, 20);
+       ida_remove(&ida, 21);
+       for (i = 0; i < 3; i++) {
+               assert(ida_pre_get(&ida, GFP_KERNEL));
+               assert(!ida_get_new(&ida, &id));
+               if (i == 2)
+                       assert(id == 10000);
+       }
+
+       for (i = 0; i < 5000; i++)
+               ida_remove(&ida, i);
+
+       assert(ida_pre_get(&ida, GFP_KERNEL));
+       assert(!ida_get_new_above(&ida, 5000, &id));
+       assert(id == 10001);
+
+       ida_destroy(&ida);
+
+       assert(ida_is_empty(&ida));
+
+       assert(ida_pre_get(&ida, GFP_KERNEL));
+       assert(!ida_get_new_above(&ida, 1, &id));
+       assert(id == 1);
+
+       ida_remove(&ida, id);
+       assert(ida_is_empty(&ida));
+       ida_destroy(&ida);
+       assert(ida_is_empty(&ida));
+
+       assert(ida_pre_get(&ida, GFP_KERNEL));
+       assert(!ida_get_new_above(&ida, 1, &id));
+       ida_destroy(&ida);
+       assert(ida_is_empty(&ida));
+
+       assert(ida_pre_get(&ida, GFP_KERNEL));
+       assert(!ida_get_new_above(&ida, 1, &id));
+       assert(id == 1);
+       assert(ida_pre_get(&ida, GFP_KERNEL));
+       assert(!ida_get_new_above(&ida, 1025, &id));
+       assert(id == 1025);
+       assert(ida_pre_get(&ida, GFP_KERNEL));
+       assert(!ida_get_new_above(&ida, 10000, &id));
+       assert(id == 10000);
+       ida_remove(&ida, 1025);
+       ida_destroy(&ida);
+       assert(ida_is_empty(&ida));
+
+       ida_check_leaf();
+       ida_check_max();
+       ida_check_conv();
+       ida_check_random();
+
+       radix_tree_cpu_dead(1);
+}
+
+int __weak main(void)
+{
+       radix_tree_init();
+       idr_checks();
+       ida_checks();
+       rcu_barrier();
+       if (nr_allocated)
+               printf("nr_allocated = %d\n", nr_allocated);
+       return 0;
+}
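For orientation, the API this new test file exercises is typically driven
like so in kernel code; a hedged sketch with a hypothetical session type,
error handling elided:

    #include <linux/idr.h>

    static DEFINE_IDR(session_idr);         /* hypothetical user */

    struct session { int id; };

    static int session_register(struct session *s)
    {
            /* start at 1, end=0 means no upper bound */
            int id = idr_alloc(&session_idr, s, 1, 0, GFP_KERNEL);

            if (id < 0)
                    return id;              /* -ENOMEM or -ENOSPC */
            s->id = id;
            return 0;
    }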
index 7572b7e..a92bab5 100644 (file)
@@ -177,7 +177,7 @@ void iteration_test(unsigned order, unsigned test_duration)
 {
        int i;
 
-       printf("Running %siteration tests for %d seconds\n",
+       printv(1, "Running %siteration tests for %d seconds\n",
                        order > 0 ? "multiorder " : "", test_duration);
 
        max_order = order;
index d31ea7c..cf48c84 100644 (file)
@@ -5,7 +5,7 @@
 #include <unistd.h>
 #include <assert.h>
 
-#include <linux/mempool.h>
+#include <linux/gfp.h>
 #include <linux/poison.h>
 #include <linux/slab.h>
 #include <linux/radix-tree.h>
@@ -13,6 +13,8 @@
 
 int nr_allocated;
 int preempt_count;
+int kmalloc_verbose;
+int test_verbose;
 
 struct kmem_cache {
        pthread_mutex_t lock;
@@ -22,27 +24,6 @@ struct kmem_cache {
        void (*ctor)(void *);
 };
 
-void *mempool_alloc(mempool_t *pool, int gfp_mask)
-{
-       return pool->alloc(gfp_mask, pool->data);
-}
-
-void mempool_free(void *element, mempool_t *pool)
-{
-       pool->free(element, pool->data);
-}
-
-mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
-                       mempool_free_t *free_fn, void *pool_data)
-{
-       mempool_t *ret = malloc(sizeof(*ret));
-
-       ret->alloc = alloc_fn;
-       ret->free = free_fn;
-       ret->data = pool_data;
-       return ret;
-}
-
 void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
 {
        struct radix_tree_node *node;
@@ -54,9 +35,9 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
        if (cachep->nr_objs) {
                cachep->nr_objs--;
                node = cachep->objs;
-               cachep->objs = node->private_data;
+               cachep->objs = node->parent;
                pthread_mutex_unlock(&cachep->lock);
-               node->private_data = NULL;
+               node->parent = NULL;
        } else {
                pthread_mutex_unlock(&cachep->lock);
                node = malloc(cachep->size);
@@ -65,6 +46,8 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
        }
 
        uatomic_inc(&nr_allocated);
+       if (kmalloc_verbose)
+               printf("Allocating %p from slab\n", node);
        return node;
 }
 
@@ -72,6 +55,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 {
        assert(objp);
        uatomic_dec(&nr_allocated);
+       if (kmalloc_verbose)
+               printf("Freeing %p to slab\n", objp);
        pthread_mutex_lock(&cachep->lock);
        if (cachep->nr_objs > 10) {
                memset(objp, POISON_FREE, cachep->size);
@@ -79,7 +64,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
        } else {
                struct radix_tree_node *node = objp;
                cachep->nr_objs++;
-               node->private_data = cachep->objs;
+               node->parent = cachep->objs;
                cachep->objs = node;
        }
        pthread_mutex_unlock(&cachep->lock);
@@ -89,6 +74,8 @@ void *kmalloc(size_t size, gfp_t gfp)
 {
        void *ret = malloc(size);
        uatomic_inc(&nr_allocated);
+       if (kmalloc_verbose)
+               printf("Allocating %p from malloc\n", ret);
        return ret;
 }
 
@@ -97,6 +84,8 @@ void kfree(void *p)
        if (!p)
                return;
        uatomic_dec(&nr_allocated);
+       if (kmalloc_verbose)
+               printf("Freeing %p to malloc\n", p);
        free(p);
 }
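The private_data to parent switch above keeps the fake slab's free list
threaded through the freed nodes themselves, now reusing the parent
pointer as the link. The pattern in isolation:

    #include <stdlib.h>

    struct obj {
            struct obj *link;       /* storage reused while the object is free */
    };

    static struct obj *free_list;

    static struct obj *obj_alloc(void)
    {
            struct obj *o = free_list;

            if (o)
                    free_list = o->link;
            else
                    o = malloc(sizeof(*o));
            return o;
    }

    static void obj_free(struct obj *o)
    {
            o->link = free_list;
            free_list = o;
    }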
 
diff --git a/tools/testing/radix-tree/linux/bitops.h b/tools/testing/radix-tree/linux/bitops.h
deleted file mode 100644 (file)
index a13e9bc..0000000
+++ /dev/null
@@ -1,160 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
-#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
-
-#include <linux/types.h>
-#include <linux/bitops/find.h>
-#include <linux/bitops/hweight.h>
-#include <linux/kernel.h>
-
-#define BIT_MASK(nr)           (1UL << ((nr) % BITS_PER_LONG))
-#define BIT_WORD(nr)           ((nr) / BITS_PER_LONG)
-#define BITS_PER_BYTE          8
-#define BITS_TO_LONGS(nr)      DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
-
-/**
- * __set_bit - Set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * Unlike set_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __set_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-
-       *p  |= mask;
-}
-
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-
-       *p &= ~mask;
-}
-
-/**
- * __change_bit - Toggle a bit in memory
- * @nr: the bit to change
- * @addr: the address to start counting from
- *
- * Unlike change_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __change_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-
-       *p ^= mask;
-}
-
-/**
- * __test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-       unsigned long old = *p;
-
-       *p = old | mask;
-       return (old & mask) != 0;
-}
-
-/**
- * __test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-       unsigned long old = *p;
-
-       *p = old & ~mask;
-       return (old & mask) != 0;
-}
-
-/* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr,
-                                           volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-       unsigned long old = *p;
-
-       *p = old ^ mask;
-       return (old & mask) != 0;
-}
-
-/**
- * test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static inline int test_bit(int nr, const volatile unsigned long *addr)
-{
-       return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
-}
-
-/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
-       int num = 0;
-
-       if ((word & 0xffffffff) == 0) {
-               num += 32;
-               word >>= 32;
-       }
-       if ((word & 0xffff) == 0) {
-               num += 16;
-               word >>= 16;
-       }
-       if ((word & 0xff) == 0) {
-               num += 8;
-               word >>= 8;
-       }
-       if ((word & 0xf) == 0) {
-               num += 4;
-               word >>= 4;
-       }
-       if ((word & 0x3) == 0) {
-               num += 2;
-               word >>= 2;
-       }
-       if ((word & 0x1) == 0)
-               num += 1;
-       return num;
-}
-
-unsigned long find_next_bit(const unsigned long *addr,
-                           unsigned long size,
-                           unsigned long offset);
-
-static inline unsigned long hweight_long(unsigned long w)
-{
-       return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
-}
-
-#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/__ffs.h b/tools/testing/radix-tree/linux/bitops/__ffs.h
deleted file mode 100644 (file)
index 9a3274a..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS___FFS_H_
-#define _ASM_GENERIC_BITOPS___FFS_H_
-
-#include <asm/types.h>
-
-/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
-       int num = 0;
-
-#if BITS_PER_LONG == 64
-       if ((word & 0xffffffff) == 0) {
-               num += 32;
-               word >>= 32;
-       }
-#endif
-       if ((word & 0xffff) == 0) {
-               num += 16;
-               word >>= 16;
-       }
-       if ((word & 0xff) == 0) {
-               num += 8;
-               word >>= 8;
-       }
-       if ((word & 0xf) == 0) {
-               num += 4;
-               word >>= 4;
-       }
-       if ((word & 0x3) == 0) {
-               num += 2;
-               word >>= 2;
-       }
-       if ((word & 0x1) == 0)
-               num += 1;
-       return num;
-}
-
-#endif /* _ASM_GENERIC_BITOPS___FFS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/ffs.h b/tools/testing/radix-tree/linux/bitops/ffs.h
deleted file mode 100644 (file)
index fbbb43a..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_FFS_H_
-#define _ASM_GENERIC_BITOPS_FFS_H_
-
-/**
- * ffs - find first bit set
- * @x: the word to search
- *
- * This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
- */
-static inline int ffs(int x)
-{
-       int r = 1;
-
-       if (!x)
-               return 0;
-       if (!(x & 0xffff)) {
-               x >>= 16;
-               r += 16;
-       }
-       if (!(x & 0xff)) {
-               x >>= 8;
-               r += 8;
-       }
-       if (!(x & 0xf)) {
-               x >>= 4;
-               r += 4;
-       }
-       if (!(x & 3)) {
-               x >>= 2;
-               r += 2;
-       }
-       if (!(x & 1)) {
-               x >>= 1;
-               r += 1;
-       }
-       return r;
-}
-
-#endif /* _ASM_GENERIC_BITOPS_FFS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/ffz.h b/tools/testing/radix-tree/linux/bitops/ffz.h
deleted file mode 100644 (file)
index 6744bd4..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_FFZ_H_
-#define _ASM_GENERIC_BITOPS_FFZ_H_
-
-/*
- * ffz - find first zero in word.
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-#define ffz(x)  __ffs(~(x))
-
-#endif /* _ASM_GENERIC_BITOPS_FFZ_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/find.h b/tools/testing/radix-tree/linux/bitops/find.h
deleted file mode 100644 (file)
index 72a51e5..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_FIND_H_
-#define _ASM_GENERIC_BITOPS_FIND_H_
-
-extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
-               size, unsigned long offset);
-
-extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
-               long size, unsigned long offset);
-
-#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
-#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
-
-#endif /*_ASM_GENERIC_BITOPS_FIND_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/fls.h b/tools/testing/radix-tree/linux/bitops/fls.h
deleted file mode 100644 (file)
index 850859b..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_FLS_H_
-#define _ASM_GENERIC_BITOPS_FLS_H_
-
-/**
- * fls - find last (most-significant) bit set
- * @x: the word to search
- *
- * This is defined the same way as ffs.
- * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
- */
-
-static inline int fls(int x)
-{
-       int r = 32;
-
-       if (!x)
-               return 0;
-       if (!(x & 0xffff0000u)) {
-               x <<= 16;
-               r -= 16;
-       }
-       if (!(x & 0xff000000u)) {
-               x <<= 8;
-               r -= 8;
-       }
-       if (!(x & 0xf0000000u)) {
-               x <<= 4;
-               r -= 4;
-       }
-       if (!(x & 0xc0000000u)) {
-               x <<= 2;
-               r -= 2;
-       }
-       if (!(x & 0x80000000u)) {
-               x <<= 1;
-               r -= 1;
-       }
-       return r;
-}
-
-#endif /* _ASM_GENERIC_BITOPS_FLS_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/fls64.h b/tools/testing/radix-tree/linux/bitops/fls64.h
deleted file mode 100644 (file)
index 1b6b17c..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_FLS64_H_
-#define _ASM_GENERIC_BITOPS_FLS64_H_
-
-#include <asm/types.h>
-
-static inline int fls64(__u64 x)
-{
-       __u32 h = x >> 32;
-       if (h)
-               return fls(h) + 32;
-       return fls(x);
-}
-
-#endif /* _ASM_GENERIC_BITOPS_FLS64_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/hweight.h b/tools/testing/radix-tree/linux/bitops/hweight.h
deleted file mode 100644 (file)
index fbbc383..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_HWEIGHT_H_
-#define _ASM_GENERIC_BITOPS_HWEIGHT_H_
-
-#include <asm/types.h>
-
-extern unsigned int hweight32(unsigned int w);
-extern unsigned int hweight16(unsigned int w);
-extern unsigned int hweight8(unsigned int w);
-extern unsigned long hweight64(__u64 w);
-
-#endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/le.h b/tools/testing/radix-tree/linux/bitops/le.h
deleted file mode 100644 (file)
index b9c7e5d..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_LE_H_
-#define _ASM_GENERIC_BITOPS_LE_H_
-
-#include <asm/types.h>
-#include <asm/byteorder.h>
-
-#define BITOP_WORD(nr)         ((nr) / BITS_PER_LONG)
-#define BITOP_LE_SWIZZLE       ((BITS_PER_LONG-1) & ~0x7)
-
-#if defined(__LITTLE_ENDIAN)
-
-#define generic_test_le_bit(nr, addr) test_bit(nr, addr)
-#define generic___set_le_bit(nr, addr) __set_bit(nr, addr)
-#define generic___clear_le_bit(nr, addr) __clear_bit(nr, addr)
-
-#define generic_test_and_set_le_bit(nr, addr) test_and_set_bit(nr, addr)
-#define generic_test_and_clear_le_bit(nr, addr) test_and_clear_bit(nr, addr)
-
-#define generic___test_and_set_le_bit(nr, addr) __test_and_set_bit(nr, addr)
-#define generic___test_and_clear_le_bit(nr, addr) __test_and_clear_bit(nr, addr)
-
-#define generic_find_next_zero_le_bit(addr, size, offset) find_next_zero_bit(addr, size, offset)
-
-#elif defined(__BIG_ENDIAN)
-
-#define generic_test_le_bit(nr, addr) \
-       test_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
-#define generic___set_le_bit(nr, addr) \
-       __set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
-#define generic___clear_le_bit(nr, addr) \
-       __clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
-
-#define generic_test_and_set_le_bit(nr, addr) \
-       test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
-#define generic_test_and_clear_le_bit(nr, addr) \
-       test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
-
-#define generic___test_and_set_le_bit(nr, addr) \
-       __test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
-#define generic___test_and_clear_le_bit(nr, addr) \
-       __test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
-
-extern unsigned long generic_find_next_zero_le_bit(const unsigned long *addr,
-               unsigned long size, unsigned long offset);
-
-#else
-#error "Please fix <asm/byteorder.h>"
-#endif
-
-#define generic_find_first_zero_le_bit(addr, size) \
-        generic_find_next_zero_le_bit((addr), (size), 0)
-
-#endif /* _ASM_GENERIC_BITOPS_LE_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/non-atomic.h b/tools/testing/radix-tree/linux/bitops/non-atomic.h
deleted file mode 100644 (file)
index 6a1bcb9..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
-#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
-
-#include <asm/types.h>
-
-#define BITOP_WORD(nr)         ((nr) / BITS_PER_LONG)
-
-/**
- * __set_bit - Set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * Unlike set_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __set_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
-
-       *p  |= mask;
-}
-
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
-
-       *p &= ~mask;
-}
-
-/**
- * __change_bit - Toggle a bit in memory
- * @nr: the bit to change
- * @addr: the address to start counting from
- *
- * Unlike change_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __change_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
-
-       *p ^= mask;
-}
-
-/**
- * __test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
-       unsigned long old = *p;
-
-       *p = old | mask;
-       return (old & mask) != 0;
-}
-
-/**
- * __test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
-       unsigned long old = *p;
-
-       *p = old & ~mask;
-       return (old & mask) != 0;
-}
-
-/* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr,
-                                           volatile unsigned long *addr)
-{
-       unsigned long mask = BIT_MASK(nr);
-       unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
-       unsigned long old = *p;
-
-       *p = old ^ mask;
-       return (old & mask) != 0;
-}
-
-/**
- * test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static inline int test_bit(int nr, const volatile unsigned long *addr)
-{
-       return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
-}
-
-#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/testing/radix-tree/linux/export.h b/tools/testing/radix-tree/linux/export.h
deleted file mode 100644 (file)
index b6afd13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-
-#define EXPORT_SYMBOL(sym)
index 5b09b2c..39a0dcb 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _GFP_H
 #define _GFP_H
 
+#include <linux/types.h>
+
 #define __GFP_BITS_SHIFT 26
 #define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 #define __GFP_DIRECT_RECLAIM   0x400000u
 #define __GFP_KSWAPD_RECLAIM   0x2000000u
 
-#define __GFP_RECLAIM          (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
+#define __GFP_RECLAIM  (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
+
+#define GFP_ATOMIC     (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_KERNEL     (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+#define GFP_NOWAIT     (__GFP_KSWAPD_RECLAIM)
 
-#define GFP_ATOMIC             (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
-#define GFP_KERNEL             (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
 
 static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
 {
diff --git a/tools/testing/radix-tree/linux/idr.h b/tools/testing/radix-tree/linux/idr.h
new file mode 100644 (file)
index 0000000..4e342f2
--- /dev/null
@@ -0,0 +1 @@
+#include "../../../../include/linux/idr.h"
index 360cabb..1bb0afc 100644 (file)
@@ -1 +1 @@
-/* An empty file stub that allows radix-tree.c to compile. */
+#define __init
index 9b43b49..b21a77f 100644 (file)
@@ -1,64 +1,21 @@
 #ifndef _KERNEL_H
 #define _KERNEL_H
 
-#include <assert.h>
+#include "../../include/linux/kernel.h"
 #include <string.h>
 #include <stdio.h>
-#include <stddef.h>
 #include <limits.h>
 
-#include "../../include/linux/compiler.h"
-#include "../../include/linux/err.h"
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
 #include "../../../include/linux/kconfig.h"
 
-#ifdef BENCHMARK
-#define RADIX_TREE_MAP_SHIFT   6
-#else
-#define RADIX_TREE_MAP_SHIFT   3
-#endif
-
-#ifndef NULL
-#define NULL   0
-#endif
-
-#define BUG_ON(expr)   assert(!(expr))
-#define WARN_ON(expr)  assert(!(expr))
-#define __init
-#define __must_check
-#define panic(expr)
 #define printk printf
-#define __force
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 #define pr_debug printk
-
-#define smp_rmb()      barrier()
-#define smp_wmb()      barrier()
-#define cpu_relax()    barrier()
+#define pr_cont printk
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 
-#define container_of(ptr, type, member) ({                      \
-       const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
-       (type *)( (char *)__mptr - offsetof(type, member) );})
-#define min(a, b) ((a) < (b) ? (a) : (b))
-
-#define cond_resched() sched_yield()
-
-static inline int in_interrupt(void)
-{
-       return 0;
-}
-
-/*
- * This looks more complex than it should be. But we need to
- * get the type for the ~ right in round_down (it needs to be
- * as wide as the result!), and we want to evaluate the macro
- * arguments just once each.
- */
-#define __round_mask(x, y) ((__typeof__(x))((y)-1))
-#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
-#define round_down(x, y) ((x) & ~__round_mask(x, y))
-
-#define xchg(ptr, x)   uatomic_xchg(ptr, x)
-
 #endif /* _KERNEL_H */
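
The bulk of kernel.h's local stubs disappear because the shim now includes the shared tools copy of the kernel header. The deleted container_of() is representative of what moved: it recovers a containing struct from a pointer to one of its members. A self-contained demonstration, using the macro exactly as deleted above (GNU C, for typeof and statement expressions):

    #include <assert.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) ({                      \
            const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
            (type *)( (char *)__mptr - offsetof(type, member) );})

    struct item {
            unsigned long index;
            unsigned int order;
    };

    int main(void)
    {
            struct item it = { .index = 42, .order = 3 };
            unsigned int *member = &it.order;       /* only the member is known */
            struct item *whole = container_of(member, struct item, order);

            assert(whole == &it);                   /* container recovered */
            assert(whole->index == 42);
            return 0;
    }
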
diff --git a/tools/testing/radix-tree/linux/mempool.h b/tools/testing/radix-tree/linux/mempool.h
deleted file mode 100644 (file)
index 6a2dc55..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-
-#include <linux/slab.h>
-
-typedef void *(mempool_alloc_t)(int gfp_mask, void *pool_data);
-typedef void (mempool_free_t)(void *element, void *pool_data);
-
-typedef struct {
-       mempool_alloc_t *alloc;
-       mempool_free_t *free;
-       void *data;
-} mempool_t;
-
-void *mempool_alloc(mempool_t *pool, int gfp_mask);
-void mempool_free(void *element, mempool_t *pool);
-mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
-                       mempool_free_t *free_fn, void *pool_data);
diff --git a/tools/testing/radix-tree/linux/percpu.h b/tools/testing/radix-tree/linux/percpu.h
index 5837f1d..3ea01a1 100644 (file)
@@ -1,7 +1,10 @@
-
+#define DECLARE_PER_CPU(type, val) extern type val
 #define DEFINE_PER_CPU(type, val) type val
 
 #define __get_cpu_var(var)     var
 #define this_cpu_ptr(var)      var
+#define this_cpu_read(var)     var
+#define this_cpu_xchg(var, val)                uatomic_xchg(&var, val)
+#define this_cpu_cmpxchg(var, old, new)        uatomic_cmpxchg(&var, old, new)
 #define per_cpu_ptr(ptr, cpu)   ({ (void)(cpu); (ptr); })
 #define per_cpu(var, cpu)      (*per_cpu_ptr(&(var), cpu))
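
The new this_cpu_xchg()/this_cpu_cmpxchg() shims lean on liburcu's uatomic primitives, which is sound here because the single test process stands in for a single CPU. A sketch of the cmpxchg semantics, assuming liburcu's <urcu/uatomic.h> is installed (the harness already depends on it); uatomic_cmpxchg() returns the value the location held before the attempt, so comparing the return against 'old' reveals whether the swap happened:

    #include <assert.h>
    #include <urcu/uatomic.h>

    static int counter;

    #define this_cpu_cmpxchg(var, old, new) uatomic_cmpxchg(&var, old, new)
    #define this_cpu_xchg(var, val)         uatomic_xchg(&var, val)

    int main(void)
    {
            counter = 5;
            assert(this_cpu_cmpxchg(counter, 5, 9) == 5);   /* swap succeeded */
            assert(counter == 9);
            assert(this_cpu_cmpxchg(counter, 5, 1) == 9);   /* stale 'old': no-op */
            assert(counter == 9);
            assert(this_cpu_xchg(counter, 0) == 9);         /* unconditional */
            assert(counter == 0);
            return 0;
    }
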
diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h
index 65c04c2..35c5ac8 100644 (file)
@@ -1,4 +1,14 @@
+#ifndef __LINUX_PREEMPT_H
+#define __LINUX_PREEMPT_H
+
 extern int preempt_count;
 
 #define preempt_disable()      uatomic_inc(&preempt_count)
 #define preempt_enable()       uatomic_dec(&preempt_count)
+
+static inline int in_interrupt(void)
+{
+       return 0;
+}
+
+#endif /* __LINUX_PREEMPT_H */
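
preempt.h gains include guards and inherits the in_interrupt() stub that kernel.h just dropped. The counter exists so that kernel code built in userspace can still balance preempt_disable()/preempt_enable(), which main.c's checkpoints verify by printing "preempt %d". A deliberately simplified single-threaded model; the real shim uses liburcu's uatomic_inc()/uatomic_dec() so the pthread-based regression tests stay race-free:

    #include <assert.h>

    static int preempt_count;

    #define preempt_disable()       (preempt_count++)
    #define preempt_enable()        (preempt_count--)

    static inline int in_interrupt(void)
    {
            return 0;       /* the suite never runs in interrupt context */
    }

    static void critical_section(void)
    {
            preempt_disable();
            /* ... work that kernel code would protect ... */
            preempt_enable();
    }

    int main(void)
    {
            critical_section();
            assert(preempt_count == 0);     /* what main.c's checkpoints show */
            assert(!in_interrupt());
            return 0;
    }
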
diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/radix-tree/linux/radix-tree.h
index ce694dd..bf1bb23 100644 (file)
@@ -1 +1,26 @@
+#ifndef _TEST_RADIX_TREE_H
+#define _TEST_RADIX_TREE_H
+
+#include "generated/map-shift.h"
 #include "../../../../include/linux/radix-tree.h"
+
+extern int kmalloc_verbose;
+extern int test_verbose;
+
+static inline void trace_call_rcu(struct rcu_head *head,
+               void (*func)(struct rcu_head *head))
+{
+       if (kmalloc_verbose)
+               printf("Delaying free of %p to slab\n", (char *)head -
+                               offsetof(struct radix_tree_node, rcu_head));
+       call_rcu(head, func);
+}
+
+#define printv(verbosity_level, fmt, ...) \
+       if(test_verbose >= verbosity_level) \
+               printf(fmt, ##__VA_ARGS__)
+
+#undef call_rcu
+#define call_rcu(x, y) trace_call_rcu(x, y)
+
+#endif /* _TEST_RADIX_TREE_H */
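
Two tricks land in this header: printv() gates chatter on the -v count collected in main(), and the #undef/#define pair reroutes every call_rcu() in the code under test through trace_call_rcu() without editing that code. A standalone sketch of the same interposition technique, wrapping libc's free() instead so it runs without liburcu (shadowing a standard function name with a macro is formally dubious but works with common toolchains, and it is exactly the trick used above):

    #include <stdio.h>
    #include <stdlib.h>

    static int free_verbose = 1;            /* stands in for kmalloc_verbose */

    static inline void trace_free(void *ptr)
    {
            if (free_verbose)
                    printf("freeing %p\n", ptr);
            free(ptr);      /* still libc free(): the macro below is not defined yet */
    }

    #define free(x) trace_free(x)

    int main(void)
    {
            char *p = malloc(16);

            free(p);        /* expands to trace_free(p) */
            return 0;
    }

Note that printv() expands to a bare if statement, so an else after a brace-less printv() in caller code would bind to the macro's if; wrapping the expansion in do { ... } while (0) is the usual guard.
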
diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h
deleted file mode 100644 (file)
index 8491d89..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _TYPES_H
-#define _TYPES_H
-
-#include "../../include/linux/types.h"
-
-#define __rcu
-#define __read_mostly
-
-static inline void INIT_LIST_HEAD(struct list_head *list)
-{
-       list->next = list;
-       list->prev = list;
-}
-
-typedef struct {
-       unsigned int x;
-} spinlock_t;
-
-#define uninitialized_var(x) x = x
-
-#include <linux/gfp.h>
-
-#endif
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index f7e9801..b829127 100644 (file)
@@ -3,6 +3,7 @@
 #include <unistd.h>
 #include <time.h>
 #include <assert.h>
+#include <limits.h>
 
 #include <linux/slab.h>
 #include <linux/radix-tree.h>
@@ -67,7 +68,7 @@ void big_gang_check(bool long_run)
 
        for (i = 0; i < (long_run ? 1000 : 3); i++) {
                __big_gang_check();
-               printf("%d ", i);
+               printv(2, "%d ", i);
                fflush(stdout);
        }
 }
@@ -128,14 +129,19 @@ void check_copied_tags(struct radix_tree_root *tree, unsigned long start, unsign
                        putchar('.'); */
                if (idx[i] < start || idx[i] > end) {
                        if (item_tag_get(tree, idx[i], totag)) {
-                               printf("%lu-%lu: %lu, tags %d-%d\n", start, end, idx[i], item_tag_get(tree, idx[i], fromtag), item_tag_get(tree, idx[i], totag));
+                               printv(2, "%lu-%lu: %lu, tags %d-%d\n", start,
+                                      end, idx[i], item_tag_get(tree, idx[i],
+                                                                fromtag),
+                                      item_tag_get(tree, idx[i], totag));
                        }
                        assert(!item_tag_get(tree, idx[i], totag));
                        continue;
                }
                if (item_tag_get(tree, idx[i], fromtag) ^
                        item_tag_get(tree, idx[i], totag)) {
-                       printf("%lu-%lu: %lu, tags %d-%d\n", start, end, idx[i], item_tag_get(tree, idx[i], fromtag), item_tag_get(tree, idx[i], totag));
+                       printv(2, "%lu-%lu: %lu, tags %d-%d\n", start, end,
+                              idx[i], item_tag_get(tree, idx[i], fromtag),
+                              item_tag_get(tree, idx[i], totag));
                }
                assert(!(item_tag_get(tree, idx[i], fromtag) ^
                         item_tag_get(tree, idx[i], totag)));
@@ -237,7 +243,7 @@ static void __locate_check(struct radix_tree_root *tree, unsigned long index,
        item = item_lookup(tree, index);
        index2 = find_item(tree, item);
        if (index != index2) {
-               printf("index %ld order %d inserted; found %ld\n",
+               printv(2, "index %ld order %d inserted; found %ld\n",
                        index, order, index2);
                abort();
        }
@@ -288,43 +294,48 @@ static void single_thread_tests(bool long_run)
 {
        int i;
 
-       printf("starting single_thread_tests: %d allocated, preempt %d\n",
+       printv(1, "starting single_thread_tests: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        multiorder_checks();
        rcu_barrier();
-       printf("after multiorder_check: %d allocated, preempt %d\n",
+       printv(2, "after multiorder_check: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        locate_check();
        rcu_barrier();
-       printf("after locate_check: %d allocated, preempt %d\n",
+       printv(2, "after locate_check: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        tag_check();
        rcu_barrier();
-       printf("after tag_check: %d allocated, preempt %d\n",
+       printv(2, "after tag_check: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        gang_check();
        rcu_barrier();
-       printf("after gang_check: %d allocated, preempt %d\n",
+       printv(2, "after gang_check: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        add_and_check();
        rcu_barrier();
-       printf("after add_and_check: %d allocated, preempt %d\n",
+       printv(2, "after add_and_check: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        dynamic_height_check();
        rcu_barrier();
-       printf("after dynamic_height_check: %d allocated, preempt %d\n",
+       printv(2, "after dynamic_height_check: %d allocated, preempt %d\n",
+               nr_allocated, preempt_count);
+       idr_checks();
+       ida_checks();
+       rcu_barrier();
+       printv(2, "after idr_checks: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        big_gang_check(long_run);
        rcu_barrier();
-       printf("after big_gang_check: %d allocated, preempt %d\n",
+       printv(2, "after big_gang_check: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        for (i = 0; i < (long_run ? 2000 : 3); i++) {
                copy_tag_check();
-               printf("%d ", i);
+               printv(2, "%d ", i);
                fflush(stdout);
        }
        rcu_barrier();
-       printf("after copy_tag_check: %d allocated, preempt %d\n",
+       printv(2, "after copy_tag_check: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
 }
 
@@ -334,24 +345,28 @@ int main(int argc, char **argv)
        int opt;
        unsigned int seed = time(NULL);
 
-       while ((opt = getopt(argc, argv, "ls:")) != -1) {
+       while ((opt = getopt(argc, argv, "ls:v")) != -1) {
                if (opt == 'l')
                        long_run = true;
                else if (opt == 's')
                        seed = strtoul(optarg, NULL, 0);
+               else if (opt == 'v')
+                       test_verbose++;
        }
 
        printf("random seed %u\n", seed);
        srand(seed);
 
+       printf("running tests\n");
+
        rcu_register_thread();
        radix_tree_init();
 
        regression1_test();
        regression2_test();
        regression3_test();
-       iteration_test(0, 10);
-       iteration_test(7, 20);
+       iteration_test(0, 10 + 90 * long_run);
+       iteration_test(7, 10 + 90 * long_run);
        single_thread_tests(long_run);
 
        /* Free any remaining preallocated nodes */
@@ -360,9 +375,11 @@ int main(int argc, char **argv)
        benchmark();
 
        rcu_barrier();
-       printf("after rcu_barrier: %d allocated, preempt %d\n",
+       printv(2, "after rcu_barrier: %d allocated, preempt %d\n",
                nr_allocated, preempt_count);
        rcu_unregister_thread();
 
+       printf("tests completed\n");
+
        exit(0);
 }
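
main() now accepts a stackable -v flag; each occurrence bumps test_verbose, which printv() compares against its verbosity_level argument. The getopt pattern in isolation (run as, say, "./a.out -l -s 123 -v -v"):

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int opt, long_run = 0, test_verbose = 0;
            unsigned int seed = 0;

            while ((opt = getopt(argc, argv, "ls:v")) != -1) {
                    if (opt == 'l')
                            long_run = 1;
                    else if (opt == 's')
                            seed = strtoul(optarg, NULL, 0);
                    else if (opt == 'v')
                            test_verbose++;         /* -v -v => level 2 */
            }

            printf("long_run=%d seed=%u verbosity=%d\n",
                   long_run, seed, test_verbose);
            return 0;
    }
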
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index f79812a..06c7117 100644 (file)
@@ -30,7 +30,7 @@ static void __multiorder_tag_test(int index, int order)
        /* our canonical entry */
        base = index & ~((1 << order) - 1);
 
-       printf("Multiorder tag test with index %d, canonical entry %d\n",
+       printv(2, "Multiorder tag test with index %d, canonical entry %d\n",
                        index, base);
 
        err = item_insert_order(&tree, index, order);
@@ -150,7 +150,7 @@ static void multiorder_check(unsigned long index, int order)
        struct item *item2 = item_create(min, order);
        RADIX_TREE(tree, GFP_KERNEL);
 
-       printf("Multiorder index %ld, order %d\n", index, order);
+       printv(2, "Multiorder index %ld, order %d\n", index, order);
 
        assert(item_insert_order(&tree, index, order) == 0);
 
@@ -188,7 +188,7 @@ static void multiorder_shrink(unsigned long index, int order)
        RADIX_TREE(tree, GFP_KERNEL);
        struct radix_tree_node *node;
 
-       printf("Multiorder shrink index %ld, order %d\n", index, order);
+       printv(2, "Multiorder shrink index %ld, order %d\n", index, order);
 
        assert(item_insert_order(&tree, 0, order) == 0);
 
@@ -209,7 +209,8 @@ static void multiorder_shrink(unsigned long index, int order)
                item_check_absent(&tree, i);
 
        if (!item_delete(&tree, 0)) {
-               printf("failed to delete index %ld (order %d)\n", index, order);                abort();
+               printv(2, "failed to delete index %ld (order %d)\n", index, order);
+               abort();
        }
 
        for (i = 0; i < 2*max; i++)
@@ -234,7 +235,7 @@ void multiorder_iteration(void)
        void **slot;
        int i, j, err;
 
-       printf("Multiorder iteration test\n");
+       printv(1, "Multiorder iteration test\n");
 
 #define NUM_ENTRIES 11
        int index[NUM_ENTRIES] = {0, 2, 4, 8, 16, 32, 34, 36, 64, 72, 128};
@@ -275,7 +276,7 @@ void multiorder_tagged_iteration(void)
        void **slot;
        int i, j;
 
-       printf("Multiorder tagged iteration test\n");
+       printv(1, "Multiorder tagged iteration test\n");
 
 #define MT_NUM_ENTRIES 9
        int index[MT_NUM_ENTRIES] = {0, 2, 4, 16, 32, 40, 64, 72, 128};
@@ -355,6 +356,10 @@ void multiorder_tagged_iteration(void)
        item_kill_tree(&tree);
 }
 
+/*
+ * Basic join checks: make sure we can't find an entry in the tree after
+ * a larger entry has replaced it
+ */
 static void multiorder_join1(unsigned long index,
                                unsigned order1, unsigned order2)
 {
@@ -373,6 +378,10 @@ static void multiorder_join1(unsigned long index,
        item_kill_tree(&tree);
 }
 
+/*
+ * Check that the accounting of exceptional entries is handled correctly
+ * by joining an exceptional entry to a normal pointer.
+ */
 static void multiorder_join2(unsigned order1, unsigned order2)
 {
        RADIX_TREE(tree, GFP_KERNEL);
@@ -386,6 +395,9 @@ static void multiorder_join2(unsigned order1, unsigned order2)
        assert(item2 == (void *)0x12UL);
        assert(node->exceptional == 1);
 
+       item2 = radix_tree_lookup(&tree, 0);
+       free(item2);
+
        radix_tree_join(&tree, 0, order1, item1);
        item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
        assert(item2 == item1);
@@ -453,7 +465,7 @@ static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc)
 {
        struct radix_tree_preload *rtp = &radix_tree_preloads;
        if (rtp->nr != 0)
-               printf("split(%u %u) remaining %u\n", old_order, new_order,
+               printv(2, "split(%u %u) remaining %u\n", old_order, new_order,
                                                        rtp->nr);
        /*
         * Can't check for equality here as some nodes may have been
@@ -461,7 +473,7 @@ static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc)
         * nodes allocated since they should have all been preloaded.
         */
        if (nr_allocated > alloc)
-               printf("split(%u %u) allocated %u %u\n", old_order, new_order,
+               printv(2, "split(%u %u) allocated %u %u\n", old_order, new_order,
                                                        alloc, nr_allocated);
 }
 
@@ -471,6 +483,7 @@ static void __multiorder_split(int old_order, int new_order)
        void **slot;
        struct radix_tree_iter iter;
        unsigned alloc;
+       struct item *item;
 
        radix_tree_preload(GFP_KERNEL);
        assert(item_insert_order(&tree, 0, old_order) == 0);
@@ -479,7 +492,7 @@ static void __multiorder_split(int old_order, int new_order)
        /* Wipe out the preloaded cache or it'll confuse check_mem() */
        radix_tree_cpu_dead(0);
 
-       radix_tree_tag_set(&tree, 0, 2);
+       item = radix_tree_tag_set(&tree, 0, 2);
 
        radix_tree_split_preload(old_order, new_order, GFP_KERNEL);
        alloc = nr_allocated;
@@ -492,6 +505,7 @@ static void __multiorder_split(int old_order, int new_order)
        radix_tree_preload_end();
 
        item_kill_tree(&tree);
+       free(item);
 }
 
 static void __multiorder_split2(int old_order, int new_order)
@@ -633,3 +647,10 @@ void multiorder_checks(void)
 
        radix_tree_cpu_dead(0);
 }
+
+int __weak main(void)
+{
+       radix_tree_init();
+       multiorder_checks();
+       return 0;
+}
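
The new __weak main() lets multiorder.c link as a standalone test binary while quietly yielding to the suite's strong main() in the combined build. A sketch of the weak-symbol mechanics, assuming __weak expands to __attribute__((weak)) as in the kernel's compiler headers (gcc/clang on ELF):

    /* weak_default.c: linked alone, the weak main() runs; link in another
     * object with a strong main() and the linker prefers it, no #ifdefs:
     *
     *   cc weak_default.c            && ./a.out   -> "standalone run"
     *   cc weak_default.c strong.c   && ./a.out   -> strong.c's main()
     */
    #include <stdio.h>

    #define __weak __attribute__((weak))

    int __weak main(void)
    {
            puts("standalone run");
            return 0;
    }
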
diff --git a/tools/testing/radix-tree/regression1.c b/tools/testing/radix-tree/regression1.c
index 0d6813a..bf97742 100644 (file)
@@ -193,7 +193,7 @@ void regression1_test(void)
        long arg;
 
        /* Regression #1 */
-       printf("running regression test 1, should finish in under a minute\n");
+       printv(1, "running regression test 1, should finish in under a minute\n");
        nr_threads = 2;
        pthread_barrier_init(&worker_barrier, NULL, nr_threads);
 
@@ -216,5 +216,5 @@ void regression1_test(void)
 
        free(threads);
 
-       printf("regression test 1, done\n");
+       printv(1, "regression test 1, done\n");
 }
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
index a41325d..42dd2a3 100644 (file)
@@ -80,7 +80,7 @@ void regression2_test(void)
        unsigned long int start, end;
        struct page *pages[1];
 
-       printf("running regression test 2 (should take milliseconds)\n");
+       printv(1, "running regression test 2 (should take milliseconds)\n");
        /* 0. */
        for (i = 0; i <= max_slots - 1; i++) {
                p = page_alloc();
@@ -103,7 +103,7 @@ void regression2_test(void)
 
        /* 4. */
        for (i = max_slots - 1; i >= 0; i--)
-               radix_tree_delete(&mt_tree, i);
+               free(radix_tree_delete(&mt_tree, i));
 
        /* 5. */
        // NOTE: start should not be 0 because radix_tree_gang_lookup_tag_slot
@@ -114,7 +114,9 @@ void regression2_test(void)
                PAGECACHE_TAG_TOWRITE);
 
        /* We remove all the remained nodes */
-       radix_tree_delete(&mt_tree, max_slots);
+       free(radix_tree_delete(&mt_tree, max_slots));
 
-       printf("regression test 2, done\n");
+       BUG_ON(!radix_tree_empty(&mt_tree));
+
+       printv(1, "regression test 2, done\n");
 }
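
The leak fixes above work because radix_tree_delete() returns whatever pointer was stored at the index (or NULL), so deletion and reclamation fuse into one expression, and the new BUG_ON(!radix_tree_empty(...)) then asserts nothing survived. The same return-the-old-entry contract, sketched with a hypothetical fixed-size slot table standing in for the tree:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdlib.h>

    #define NSLOTS 64
    static void *slots[NSLOTS];

    static void *slot_delete(unsigned long index)
    {
            void *old = slots[index];

            slots[index] = NULL;
            return old;             /* caller decides how to reclaim it */
    }

    static bool slots_empty(void)
    {
            for (unsigned long i = 0; i < NSLOTS; i++)
                    if (slots[i])
                            return false;
            return true;
    }

    int main(void)
    {
            slots[3] = malloc(32);
            free(slot_delete(3));   /* fused delete-and-free */
            free(slot_delete(7));   /* never populated: frees NULL, a no-op */
            assert(slots_empty());  /* mirrors the new BUG_ON() check */
            return 0;
    }
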
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
index b594841..670c3d2 100644 (file)
@@ -34,21 +34,21 @@ void regression3_test(void)
        void **slot;
        bool first;
 
-       printf("running regression test 3 (should take milliseconds)\n");
+       printv(1, "running regression test 3 (should take milliseconds)\n");
 
        radix_tree_insert(&root, 0, ptr0);
        radix_tree_tag_set(&root, 0, 0);
 
        first = true;
        radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) {
-               printf("tagged %ld %p\n", iter.index, *slot);
+               printv(2, "tagged %ld %p\n", iter.index, *slot);
                if (first) {
                        radix_tree_insert(&root, 1, ptr);
                        radix_tree_tag_set(&root, 1, 0);
                        first = false;
                }
                if (radix_tree_deref_retry(*slot)) {
-                       printf("retry at %ld\n", iter.index);
+                       printv(2, "retry at %ld\n", iter.index);
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
@@ -57,13 +57,13 @@ void regression3_test(void)
 
        first = true;
        radix_tree_for_each_slot(slot, &root, &iter, 0) {
-               printf("slot %ld %p\n", iter.index, *slot);
+               printv(2, "slot %ld %p\n", iter.index, *slot);
                if (first) {
                        radix_tree_insert(&root, 1, ptr);
                        first = false;
                }
                if (radix_tree_deref_retry(*slot)) {
-                       printk("retry at %ld\n", iter.index);
+                       printv(2, "retry at %ld\n", iter.index);
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
@@ -72,30 +72,30 @@ void regression3_test(void)
 
        first = true;
        radix_tree_for_each_contig(slot, &root, &iter, 0) {
-               printk("contig %ld %p\n", iter.index, *slot);
+               printv(2, "contig %ld %p\n", iter.index, *slot);
                if (first) {
                        radix_tree_insert(&root, 1, ptr);
                        first = false;
                }
                if (radix_tree_deref_retry(*slot)) {
-                       printk("retry at %ld\n", iter.index);
+                       printv(2, "retry at %ld\n", iter.index);
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
        }
 
        radix_tree_for_each_slot(slot, &root, &iter, 0) {
-               printf("slot %ld %p\n", iter.index, *slot);
+               printv(2, "slot %ld %p\n", iter.index, *slot);
                if (!iter.index) {
-                       printf("next at %ld\n", iter.index);
+                       printv(2, "next at %ld\n", iter.index);
                        slot = radix_tree_iter_resume(slot, &iter);
                }
        }
 
        radix_tree_for_each_contig(slot, &root, &iter, 0) {
-               printf("contig %ld %p\n", iter.index, *slot);
+               printv(2, "contig %ld %p\n", iter.index, *slot);
                if (!iter.index) {
-                       printf("next at %ld\n", iter.index);
+                       printv(2, "next at %ld\n", iter.index);
                        slot = radix_tree_iter_resume(slot, &iter);
                }
        }
@@ -103,9 +103,9 @@ void regression3_test(void)
        radix_tree_tag_set(&root, 0, 0);
        radix_tree_tag_set(&root, 1, 0);
        radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) {
-               printf("tagged %ld %p\n", iter.index, *slot);
+               printv(2, "tagged %ld %p\n", iter.index, *slot);
                if (!iter.index) {
-                       printf("next at %ld\n", iter.index);
+                       printv(2, "next at %ld\n", iter.index);
                        slot = radix_tree_iter_resume(slot, &iter);
                }
        }
@@ -113,5 +113,5 @@ void regression3_test(void)
        radix_tree_delete(&root, 0);
        radix_tree_delete(&root, 1);
 
-       printf("regression test 3 passed\n");
+       printv(1, "regression test 3 passed\n");
 }
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c
index fd98c13..d4ff009 100644 (file)
@@ -49,10 +49,10 @@ void simple_checks(void)
        }
        verify_tag_consistency(&tree, 0);
        verify_tag_consistency(&tree, 1);
-       printf("before item_kill_tree: %d allocated\n", nr_allocated);
+       printv(2, "before item_kill_tree: %d allocated\n", nr_allocated);
        item_kill_tree(&tree);
        rcu_barrier();
-       printf("after item_kill_tree: %d allocated\n", nr_allocated);
+       printv(2, "after item_kill_tree: %d allocated\n", nr_allocated);
 }
 
 /*
@@ -257,7 +257,7 @@ static void do_thrash(struct radix_tree_root *tree, char *thrash_state, int tag)
 
                gang_check(tree, thrash_state, tag);
 
-               printf("%d(%d) %d(%d) %d(%d) %d(%d) / "
+               printv(2, "%d(%d) %d(%d) %d(%d) %d(%d) / "
                                "%d(%d) present, %d(%d) tagged\n",
                        insert_chunk, nr_inserted,
                        delete_chunk, nr_deleted,
@@ -296,13 +296,13 @@ static void __leak_check(void)
 {
        RADIX_TREE(tree, GFP_KERNEL);
 
-       printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+       printv(2, "%d: nr_allocated=%d\n", __LINE__, nr_allocated);
        item_insert(&tree, 1000000);
-       printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+       printv(2, "%d: nr_allocated=%d\n", __LINE__, nr_allocated);
        item_delete(&tree, 1000000);
-       printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+       printv(2, "%d: nr_allocated=%d\n", __LINE__, nr_allocated);
        item_kill_tree(&tree);
-       printf("%d: nr_allocated=%d\n", __LINE__, nr_allocated);
+       printv(2, "%d: nr_allocated=%d\n", __LINE__, nr_allocated);
 }
 
 static void single_check(void)
@@ -336,15 +336,15 @@ void tag_check(void)
        extend_checks();
        contract_checks();
        rcu_barrier();
-       printf("after extend_checks: %d allocated\n", nr_allocated);
+       printv(2, "after extend_checks: %d allocated\n", nr_allocated);
        __leak_check();
        leak_check();
        rcu_barrier();
-       printf("after leak_check: %d allocated\n", nr_allocated);
+       printv(2, "after leak_check: %d allocated\n", nr_allocated);
        simple_checks();
        rcu_barrier();
-       printf("after simple_checks: %d allocated\n", nr_allocated);
+       printv(2, "after simple_checks: %d allocated\n", nr_allocated);
        thrash_tags();
        rcu_barrier();
-       printf("after thrash_tags: %d allocated\n", nr_allocated);
+       printv(2, "after thrash_tags: %d allocated\n", nr_allocated);
 }
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index e5726e3..1a257d7 100644 (file)
@@ -29,15 +29,28 @@ int __item_insert(struct radix_tree_root *root, struct item *item)
        return __radix_tree_insert(root, item->index, item->order, item);
 }
 
-int item_insert(struct radix_tree_root *root, unsigned long index)
+struct item *item_create(unsigned long index, unsigned int order)
 {
-       return __item_insert(root, item_create(index, 0));
+       struct item *ret = malloc(sizeof(*ret));
+
+       ret->index = index;
+       ret->order = order;
+       return ret;
 }
 
 int item_insert_order(struct radix_tree_root *root, unsigned long index,
                        unsigned order)
 {
-       return __item_insert(root, item_create(index, order));
+       struct item *item = item_create(index, order);
+       int err = __item_insert(root, item);
+       if (err)
+               free(item);
+       return err;
+}
+
+int item_insert(struct radix_tree_root *root, unsigned long index)
+{
+       return item_insert_order(root, index, 0);
 }
 
 void item_sanity(struct item *item, unsigned long index)
@@ -61,15 +74,6 @@ int item_delete(struct radix_tree_root *root, unsigned long index)
        return 0;
 }
 
-struct item *item_create(unsigned long index, unsigned int order)
-{
-       struct item *ret = malloc(sizeof(*ret));
-
-       ret->index = index;
-       ret->order = order;
-       return ret;
-}
-
 void item_check_present(struct radix_tree_root *root, unsigned long index)
 {
        struct item *item;
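
item_insert_order() now follows the classic ownership-transfer rule: the callee allocates, tries to publish, and frees on failure instead of leaking; item_insert() becomes a thin wrapper so both paths share that handling. The rule in miniature, with a hypothetical one-slot publish() standing in for __item_insert():

    #include <stdlib.h>
    #include <string.h>

    static void *published;                 /* a one-slot "container" */

    static int publish(void *obj)
    {
            if (published)
                    return -1;              /* rejected: caller still owns obj */
            published = obj;
            return 0;                       /* accepted: container owns obj */
    }

    static int create_and_publish(const char *name)
    {
            char *obj = strdup(name);
            int err;

            if (!obj)
                    return -1;
            err = publish(obj);
            if (err)
                    free(obj);              /* the fix made in test.c above */
            return err;
    }

    int main(void)
    {
            create_and_publish("first");    /* accepted */
            create_and_publish("second");   /* rejected, freed, not leaked */
            free(published);
            return 0;
    }
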
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 056a23b..b30e11d 100644 (file)
@@ -34,6 +34,8 @@ void tag_check(void);
 void multiorder_checks(void);
 void iteration_test(unsigned order, unsigned duration);
 void benchmark(void);
+void idr_checks(void);
+void ida_checks(void);
 
 struct item *
 item_tag_set(struct radix_tree_root *root, unsigned long index, int tag);
diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c
index 1bb0125..ccd0734 100644 (file)
@@ -57,7 +57,7 @@ void my_usr1(int sig, siginfo_t *si, void *u)
                exit(EXIT_FAILURE);
        }
        if (stk.ss_flags != SS_DISABLE)
-               printf("[FAIL]\tss_flags=%i, should be SS_DISABLE\n",
+               printf("[FAIL]\tss_flags=%x, should be SS_DISABLE\n",
                                stk.ss_flags);
        else
                printf("[OK]\tsigaltstack is disabled in sighandler\n");
@@ -122,7 +122,8 @@ int main(void)
        if (stk.ss_flags == SS_DISABLE) {
                printf("[OK]\tInitial sigaltstack state was SS_DISABLE\n");
        } else {
-               printf("[FAIL]\tInitial sigaltstack state was %i; should have been SS_DISABLE\n", stk.ss_flags);
+               printf("[FAIL]\tInitial sigaltstack state was %x; "
+                      "should have been SS_DISABLE\n", stk.ss_flags);
                return EXIT_FAILURE;
        }
 
@@ -165,7 +166,7 @@ int main(void)
                exit(EXIT_FAILURE);
        }
        if (stk.ss_flags != SS_AUTODISARM) {
-               printf("[FAIL]\tss_flags=%i, should be SS_AUTODISARM\n",
+               printf("[FAIL]\tss_flags=%x, should be SS_AUTODISARM\n",
                                stk.ss_flags);
                exit(EXIT_FAILURE);
        }
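
The selftest switches ss_flags from %i to %x because the field is a bitmask: SS_AUTODISARM lives in bit 31 (1U << 31 in the uapi headers), so a signed decimal rendering comes out as a baffling negative number on two's-complement targets, while hex shows the bits directly:

    #include <stdio.h>

    #define SS_AUTODISARM (1U << 31)        /* value as in the uapi definition */

    int main(void)
    {
            int ss_flags = SS_AUTODISARM;   /* the kernel struct field is int */

            printf("%%i renders as: %i\n", ss_flags);   /* -2147483648 */
            printf("%%x renders as: %x\n", ss_flags);   /* 80000000 */
            return 0;
    }
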
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 3815e94..2366177 100644 (file)
@@ -204,7 +204,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
        work->addr = hva;
        work->arch = *arch;
        work->mm = current->mm;
-       atomic_inc(&work->mm->mm_users);
+       mmget(work->mm);
        kvm_get_kvm(work->vcpu->kvm);
 
        /* this can't really happen otherwise gfn_to_pfn_async
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b0dd4a..35f7140 100644 (file)
@@ -611,7 +611,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
                return ERR_PTR(-ENOMEM);
 
        spin_lock_init(&kvm->mmu_lock);
-       atomic_inc(&current->mm->mm_count);
+       mmgrab(current->mm);
        kvm->mm = current->mm;
        kvm_eventfd_init(kvm);
        mutex_init(&kvm->lock);
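
Both KVM hunks replace open-coded atomic increments with the then-new helpers: mmget() bumps mm_users, keeping the whole address space usable (paired with mmput()), while mmgrab() bumps mm_count, pinning only the mm_struct itself (paired with mmdrop()). Async page faults must later walk the address space, hence mmget(); a VM merely needs the struct to outlive it, hence mmgrab(). A runnable sketch with userspace stand-ins for atomic_t and mm_struct; the real one-line definitions live in include/linux/sched.h:

    #include <assert.h>

    typedef struct { int counter; } atomic_t;

    static inline void atomic_inc(atomic_t *v)
    {
            __atomic_fetch_add(&v->counter, 1, __ATOMIC_SEQ_CST);
    }

    struct mm_struct {
            atomic_t mm_users;      /* users of the address space */
            atomic_t mm_count;      /* references to the struct itself */
    };

    static inline void mmget(struct mm_struct *mm)  { atomic_inc(&mm->mm_users); }
    static inline void mmgrab(struct mm_struct *mm) { atomic_inc(&mm->mm_count); }

    int main(void)
    {
            struct mm_struct mm = { .mm_users = { 1 }, .mm_count = { 1 } };

            mmget(&mm);     /* async_pf.c: the fault path will walk this mm */
            mmgrab(&mm);    /* kvm_main.c: the VM only pins the struct */
            assert(mm.mm_users.counter == 2);
            assert(mm.mm_count.counter == 2);
            return 0;
    }
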