Merge tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
Pull btrfs updates from David Sterba:
 "The updates this time are more under the hood and enhancing existing
  features (subpage with compression and zoned namespaces).

  Performance related:

   - misc small inode logging improvements (+3% throughput, -11% latency
     on sample dbench workload)

   - more efficient directory logging: bulk item insertion, fewer tree
     searches and less locking

   - speed up bulk insertion of items into a b-tree, which is used when
     logging directories, when running delayed items for directories
     (fsync and transaction commits) and when running the slow path
     (full sync) of an fsync (bulk creation run time -4%, deletion -12%)

  Core:

   - continued subpage support
      - make defragmentation work
      - make compression write work

   - zoned mode
      - support ZNS (zoned namespaces); zone capacity is the number of
        usable blocks in each zone
      - add dedicated block group (zoned) for relocation, to prevent
        out of order writes in some cases
      - greedy block group reclaim, pick the ones with least usable
        space first

   - preparatory work for send protocol updates

   - error handling improvements

   - cleanups and refactoring

  Fixes:

   - lockdep warnings
      - in show_devname callback, on seeding device
      - device delete on loop device due to conversions to workqueues

   - fix deadlock between chunk allocation and chunk btree modifications

   - fix tracking of missing device count and status"

* tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (140 commits)
  btrfs: remove root argument from check_item_in_log()
  btrfs: remove root argument from add_link()
  btrfs: remove root argument from btrfs_unlink_inode()
  btrfs: remove root argument from drop_one_dir_item()
  btrfs: clear MISSING device status bit in btrfs_close_one_device
  btrfs: call btrfs_check_rw_degradable only if there is a missing device
  btrfs: send: prepare for v2 protocol
  btrfs: fix comment about sector sizes supported in 64K systems
  btrfs: update device path inode time instead of bd_inode
  fs: export an inode_update_time helper
  btrfs: fix deadlock when defragging transparent huge pages
  btrfs: sysfs: convert scnprintf and snprintf to sysfs_emit
  btrfs: make btrfs_super_block size match BTRFS_SUPER_INFO_SIZE
  btrfs: update comments for chunk allocation -ENOSPC cases
  btrfs: fix deadlock between chunk allocation and chunk btree modifications
  btrfs: zoned: use greedy gc for auto reclaim
  btrfs: check-integrity: stop storing the block device name in btrfsic_dev_state
  btrfs: use btrfs_get_dev_args_from_path in dev removal ioctls
  btrfs: add a btrfs_get_dev_args_from_path helper
  btrfs: handle device lookup with btrfs_dev_lookup_args
  ...

607 files changed:
Documentation/block/inline-encryption.rst
Documentation/block/queue-sysfs.rst
Documentation/cdrom/cdrom-standard.rst
Documentation/core-api/cachetlb.rst
Documentation/core-api/mm-api.rst
Documentation/filesystems/erofs.rst
Documentation/filesystems/fscrypt.rst
Documentation/filesystems/index.rst
Documentation/filesystems/locks.rst
Documentation/filesystems/netfs_library.rst
Documentation/userspace-api/ioctl/cdrom.rst
Documentation/userspace-api/ioctl/ioctl-number.rst
MAINTAINERS
Makefile
arch/arc/include/asm/cacheflush.h
arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts
arch/arm/include/asm/cacheflush.h
arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts
arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-s.dts
arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi
arch/arm64/boot/dts/qcom/sm8250.dtsi
arch/arm64/net/bpf_jit_comp.c
arch/m68k/emu/nfblock.c
arch/m68k/include/asm/cacheflush_mm.h
arch/mips/include/asm/cacheflush.h
arch/mips/rb532/prom.c
arch/mips/sibyte/common/cfe.c
arch/mips/sibyte/swarm/setup.c
arch/nds32/include/asm/cacheflush.h
arch/nds32/kernel/ftrace.c
arch/nios2/include/asm/cacheflush.h
arch/nios2/platform/Kconfig.platform
arch/openrisc/mm/init.c
arch/parisc/include/asm/cacheflush.h
arch/powerpc/platforms/cell/spufs/inode.c
arch/powerpc/platforms/pseries/iommu.c
arch/riscv/Kconfig
arch/riscv/include/asm/kasan.h
arch/riscv/kernel/head.S
arch/riscv/mm/kasan_init.c
arch/riscv/net/bpf_jit_core.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/sh/include/asm/cacheflush.h
arch/um/drivers/ubd_kern.c
arch/x86/crypto/sm4-aesni-avx-asm_64.S
arch/x86/crypto/sm4-aesni-avx2-asm_64.S
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/svm/sev.c
arch/x86/kvm/x86.c
arch/x86/kvm/xen.c
arch/xtensa/include/asm/cacheflush.h
arch/xtensa/platforms/iss/simdisk.c
block/Kconfig
block/Kconfig.iosched
block/Makefile
block/bdev.c
block/bfq-cgroup.c
block/bfq-iosched.c
block/bio-integrity.c
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-crypto-fallback.c
block/blk-crypto-internal.h
block/blk-crypto-profile.c [new file with mode: 0644]
block/blk-crypto.c
block/blk-exec.c
block/blk-flush.c
block/blk-ia-ranges.c [new file with mode: 0644]
block/blk-integrity.c
block/blk-iocost.c
block/blk-iolatency.c
block/blk-merge.c
block/blk-mq-debugfs.c
block/blk-mq-sched.c
block/blk-mq-sched.h
block/blk-mq-tag.c
block/blk-mq-tag.h
block/blk-mq.c
block/blk-mq.h
block/blk-rq-qos.h
block/blk-settings.c
block/blk-sysfs.c
block/blk-throttle.c
block/blk-throttle.h [new file with mode: 0644]
block/blk-wbt.c
block/blk.h
block/bounce.c
block/bsg-lib.c
block/elevator.c
block/elevator.h [moved from include/linux/elevator.h with 92% similarity]
block/fops.c
block/genhd.c
block/holder.c
block/ioctl.c
block/keyslot-manager.c [deleted file]
block/kyber-iosched.c
block/mq-deadline.c
block/partitions/Kconfig
block/partitions/core.c
block/partitions/efi.c
block/partitions/ibm.c
block/t10-pi.c
crypto/af_alg.c
drivers/ata/libata-core.c
drivers/ata/libata-scsi.c
drivers/base/regmap/regcache-rbtree.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/amiflop.c
drivers/block/aoe/aoeblk.c
drivers/block/ataflop.c
drivers/block/brd.c
drivers/block/cryptoloop.c [deleted file]
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_req.c
drivers/block/floppy.c
drivers/block/loop.c
drivers/block/loop.h
drivers/block/mtip32xx/mtip32xx.c
drivers/block/n64cart.c
drivers/block/nbd.c
drivers/block/null_blk/main.c
drivers/block/null_blk/null_blk.h
drivers/block/paride/pcd.c
drivers/block/paride/pd.c
drivers/block/paride/pf.c
drivers/block/pktcdvd.c
drivers/block/ps3vram.c
drivers/block/rbd.c
drivers/block/rnbd/rnbd-clt.c
drivers/block/rnbd/rnbd-proto.h
drivers/block/rsxx/core.c
drivers/block/rsxx/dev.c
drivers/block/swim.c
drivers/block/swim3.c
drivers/block/sx8.c
drivers/block/virtio_blk.c
drivers/block/xen-blkback/xenbus.c
drivers/block/xen-blkfront.c
drivers/block/zram/zram_drv.c
drivers/cdrom/cdrom.c
drivers/cdrom/gdrom.c
drivers/char/tpm/Kconfig
drivers/char/tpm/tpm2-space.c
drivers/char/tpm/tpm_tis_core.c
drivers/char/tpm/tpm_tis_core.h
drivers/char/tpm/tpm_tis_spi_main.c
drivers/clk/clk-composite.c
drivers/gpio/gpio-mlxbf2.c
drivers/gpio/gpio-xgs-iproc.c
drivers/gpu/drm/amd/amdgpu/nv.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c
drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c
drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c
drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c
drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
drivers/gpu/drm/amd/display/include/dal_asic_id.h
drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
drivers/gpu/drm/drm_panel_orientation_quirks.c
drivers/gpu/drm/i915/display/intel_dp.c
drivers/gpu/drm/i915/gt/intel_timeline.c
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/i915_trace.h
drivers/gpu/drm/i915/i915_utils.h
drivers/gpu/drm/i915/intel_dram.c
drivers/gpu/drm/selftests/test-drm_damage_helper.c
drivers/gpu/drm/ttm/ttm_bo_util.c
drivers/infiniband/core/sa_query.c
drivers/infiniband/hw/hfi1/pio.c
drivers/infiniband/hw/irdma/uk.c
drivers/infiniband/hw/irdma/verbs.c
drivers/infiniband/hw/irdma/ws.c
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/qedr/qedr.h
drivers/infiniband/hw/qedr/qedr_iw_cm.c
drivers/infiniband/hw/qedr/verbs.c
drivers/infiniband/hw/qib/qib_user_sdma.c
drivers/infiniband/sw/rdmavt/qp.c
drivers/md/bcache/bcache.h
drivers/md/bcache/bcache_ondisk.h [moved from include/uapi/linux/bcache.h with 99% similarity]
drivers/md/bcache/bset.h
drivers/md/bcache/btree.c
drivers/md/bcache/debug.c
drivers/md/bcache/features.c
drivers/md/bcache/features.h
drivers/md/bcache/io.c
drivers/md/bcache/request.c
drivers/md/bcache/request.h
drivers/md/bcache/super.c
drivers/md/bcache/sysfs.c
drivers/md/bcache/sysfs.h
drivers/md/bcache/util.h
drivers/md/bcache/writeback.c
drivers/md/dm-bio-record.h
drivers/md/dm-bufio.c
drivers/md/dm-cache-metadata.c
drivers/md/dm-cache-target.c
drivers/md/dm-clone-target.c
drivers/md/dm-core.h
drivers/md/dm-crypt.c
drivers/md/dm-dust.c
drivers/md/dm-ebs-target.c
drivers/md/dm-era-target.c
drivers/md/dm-exception-store.h
drivers/md/dm-flakey.c
drivers/md/dm-ima.c
drivers/md/dm-integrity.c
drivers/md/dm-linear.c
drivers/md/dm-log-writes.c
drivers/md/dm-log.c
drivers/md/dm-mpath.c
drivers/md/dm-ps-historical-service-time.c
drivers/md/dm-raid.c
drivers/md/dm-rq.c
drivers/md/dm-switch.c
drivers/md/dm-table.c
drivers/md/dm-thin-metadata.c
drivers/md/dm-thin.c
drivers/md/dm-verity-target.c
drivers/md/dm-writecache.c
drivers/md/dm-zoned-target.c
drivers/md/dm.c
drivers/md/md.c
drivers/md/md.h
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/mmc/core/block.c
drivers/mmc/core/crypto.c
drivers/mmc/core/sd.c
drivers/mmc/host/Kconfig
drivers/mmc/host/cqhci-core.c
drivers/mmc/host/cqhci-crypto.c
drivers/mmc/host/dw_mmc-exynos.c
drivers/mmc/host/mtk-sd.c
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/mmc/host/sdhci-pci-core.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/tmio_mmc_core.c
drivers/mmc/host/vub300.c
drivers/mtd/mtd_blkdevs.c
drivers/mtd/mtdsuper.c
drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
drivers/net/ethernet/intel/ice/ice_lag.c
drivers/net/ethernet/intel/ice/ice_ptp.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
drivers/net/ethernet/mellanox/mlxsw/pci.c
drivers/net/ethernet/microchip/lan743x_main.c
drivers/net/ethernet/netronome/nfp/bpf/main.c
drivers/net/ethernet/netronome/nfp/bpf/main.h
drivers/net/ethernet/netronome/nfp/bpf/offload.c
drivers/net/ethernet/nxp/lpc_eth.c
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/phy/phy.c
drivers/net/usb/lan78xx.c
drivers/net/usb/usbnet.c
drivers/net/vmxnet3/vmxnet3_drv.c
drivers/net/xen-netfront.c
drivers/nfc/port100.c
drivers/nvdimm/blk.c
drivers/nvdimm/btt.c
drivers/nvdimm/core.c
drivers/nvdimm/pmem.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fabrics.h
drivers/nvme/host/fc.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/nvme/host/zns.c
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/configfs.c
drivers/nvme/target/core.c
drivers/nvme/target/discovery.c
drivers/nvme/target/fabrics-cmd.c
drivers/nvme/target/io-cmd-bdev.c
drivers/nvme/target/io-cmd-file.c
drivers/nvme/target/loop.c
drivers/nvme/target/nvmet.h
drivers/nvme/target/rdma.c
drivers/nvme/target/tcp.c
drivers/reset/Kconfig
drivers/reset/reset-brcmstb-rescal.c
drivers/reset/reset-socfpga.c
drivers/reset/tegra/reset-bpmp.c
drivers/s390/block/dasd.c
drivers/s390/block/dasd_3990_erp.c
drivers/s390/block/dasd_eckd.c
drivers/s390/block/dasd_eckd.h
drivers/s390/block/dasd_erp.c
drivers/s390/block/dasd_genhd.c
drivers/s390/block/dasd_int.h
drivers/s390/block/dasd_ioctl.c
drivers/s390/block/dcssblk.c
drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
drivers/scsi/ibmvscsi/ibmvfc.c
drivers/scsi/lpfc/lpfc.h
drivers/scsi/mpt3sas/mpt3sas_scsih.c
drivers/scsi/qla2xxx/qla_nvme.c
drivers/scsi/scsi_bsg.c
drivers/scsi/scsi_debug.c
drivers/scsi/scsi_error.c
drivers/scsi/scsi_ioctl.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_scan.c
drivers/scsi/sd.c
drivers/scsi/sd.h
drivers/scsi/sd_dif.c
drivers/scsi/sg.c
drivers/scsi/sr.c
drivers/scsi/st.c
drivers/scsi/ufs/ufs-exynos.c
drivers/scsi/ufs/ufshcd-crypto.c
drivers/scsi/ufs/ufshcd-crypto.h
drivers/scsi/ufs/ufshcd.c
drivers/scsi/ufs/ufshcd.h
drivers/scsi/ufs/ufshpb.c
drivers/scsi/ufs/ufshpb.h
drivers/scsi/virtio_scsi.c
drivers/spi/spi-altera-dfl.c
drivers/spi/spi-altera-platform.c
drivers/spi/spi-pl022.c
drivers/target/target_core_file.c
drivers/target/target_core_iblock.c
drivers/target/target_core_pscsi.c
drivers/usb/gadget/function/f_fs.c
drivers/usb/gadget/legacy/inode.c
drivers/vdpa/vdpa_user/vduse_dev.c
drivers/virtio/virtio_ring.c
drivers/watchdog/iTCO_wdt.c
drivers/watchdog/ixp4xx_wdt.c
drivers/watchdog/omap_wdt.c
drivers/watchdog/sbsa_gwdt.c
fs/affs/super.c
fs/afs/write.c
fs/aio.c
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/lzo.c
fs/btrfs/volumes.c
fs/btrfs/zlib.c
fs/btrfs/zstd.c
fs/buffer.c
fs/cachefiles/io.c
fs/cachefiles/rdwr.c
fs/ceph/file.c
fs/ceph/locks.c
fs/cifs/file.c
fs/cramfs/inode.c
fs/crypto/bio.c
fs/crypto/fname.c
fs/crypto/fscrypt_private.h
fs/crypto/hkdf.c
fs/crypto/keysetup.c
fs/direct-io.c
fs/erofs/Kconfig
fs/erofs/Makefile
fs/erofs/compress.h
fs/erofs/data.c
fs/erofs/decompressor.c
fs/erofs/decompressor_lzma.c [new file with mode: 0644]
fs/erofs/erofs_fs.h
fs/erofs/inode.c
fs/erofs/internal.h
fs/erofs/pcpubuf.c
fs/erofs/super.c
fs/erofs/utils.c
fs/erofs/xattr.c
fs/erofs/zdata.c
fs/erofs/zdata.h
fs/erofs/zmap.c
fs/exfat/inode.c
fs/ext4/file.c
fs/ext4/super.c
fs/f2fs/compress.c
fs/f2fs/super.c
fs/fat/inode.c
fs/fs-writeback.c
fs/fuse/file.c
fs/gfs2/file.c
fs/hfs/mdb.c
fs/hfsplus/wrapper.c
fs/internal.h
fs/io-wq.c
fs/io-wq.h
fs/io_uring.c
fs/iomap/direct-io.c
fs/jfs/jfs_metapage.c
fs/jfs/resize.c
fs/jfs/super.c
fs/locks.c
fs/namei.c
fs/nfs/blocklayout/dev.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfsd/Kconfig
fs/nfsd/blocklayout.c
fs/nfsd/nfs4layouts.c
fs/nilfs2/ioctl.c
fs/nilfs2/super.c
fs/nilfs2/the_nilfs.c
fs/ntfs/file.c
fs/ntfs/super.c
fs/ntfs3/file.c
fs/ntfs3/inode.c
fs/ntfs3/super.c
fs/ocfs2/suballoc.c
fs/orangefs/inode.c
fs/orangefs/super.c
fs/overlayfs/file.c
fs/pstore/blk.c
fs/quota/quota.c
fs/ramfs/inode.c
fs/read_write.c
fs/reiserfs/super.c
fs/squashfs/super.c
fs/sync.c
fs/ubifs/crypto.c
fs/udf/lowlevel.c
fs/udf/super.c
fs/xfs/xfs_file.c
fs/zonefs/super.c
include/asm-generic/cacheflush.h
include/linux/ata.h
include/linux/backing-dev.h
include/linux/bio.h
include/linux/blk-crypto-profile.h [new file with mode: 0644]
include/linux/blk-integrity.h [new file with mode: 0644]
include/linux/blk-mq.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/blktrace_api.h
include/linux/bpf.h
include/linux/bpf_types.h
include/linux/bvec.h
include/linux/cdrom.h
include/linux/device-mapper.h
include/linux/filter.h
include/linux/flex_proportions.h
include/linux/fs.h
include/linux/fscrypt.h
include/linux/genhd.h
include/linux/gfp.h
include/linux/highmem-internal.h
include/linux/highmem.h
include/linux/huge_mm.h
include/linux/iomap.h
include/linux/keyslot-manager.h [deleted file]
include/linux/ksm.h
include/linux/libata.h
include/linux/memcontrol.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/mm_types.h
include/linux/mmc/host.h
include/linux/mmdebug.h
include/linux/netfs.h
include/linux/nvme-fc-driver.h
include/linux/nvme-rdma.h
include/linux/nvme.h
include/linux/page-flags.h
include/linux/page_idle.h
include/linux/page_owner.h
include/linux/page_ref.h
include/linux/pagemap.h
include/linux/part_stat.h
include/linux/percpu-refcount.h
include/linux/rmap.h
include/linux/sbitmap.h
include/linux/sched.h
include/linux/skmsg.h
include/linux/swap.h
include/linux/t10-pi.h
include/linux/tpm.h
include/linux/vmstat.h
include/linux/writeback.h
include/linux/xz.h
include/net/cfg80211.h
include/net/mptcp.h
include/net/sock.h
include/net/tls.h
include/net/udp.h
include/scsi/scsi_cmnd.h
include/scsi/scsi_device.h
include/trace/events/block.h
include/trace/events/erofs.h
include/trace/events/io_uring.h
include/trace/events/pagemap.h
include/trace/events/writeback.h
include/uapi/asm-generic/fcntl.h
include/uapi/linux/cdrom.h
include/uapi/linux/io_uring.h
init/main.c
kernel/acct.c
kernel/bpf/arraymap.c
kernel/bpf/core.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/cgroup/cgroup.c
kernel/events/uprobes.c
kernel/exit.c
kernel/fork.c
kernel/sched/core.c
kernel/sched/sched.h
kernel/trace/blktrace.c
kernel/trace/ftrace.c
kernel/trace/trace_eprobe.c
lib/decompress_unxz.c
lib/flex_proportions.c
lib/random32.c
lib/sbitmap.c
lib/xz/Kconfig
lib/xz/xz_dec_lzma2.c
lib/xz/xz_dec_stream.c
lib/xz/xz_dec_syms.c
lib/xz/xz_private.h
mm/Makefile
mm/backing-dev.c
mm/compaction.c
mm/damon/core-test.h
mm/filemap.c
mm/folio-compat.c [new file with mode: 0644]
mm/highmem.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/khugepaged.c
mm/ksm.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/mempolicy.c
mm/mempool.c
mm/memremap.c
mm/migrate.c
mm/mlock.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_io.c
mm/page_owner.c
mm/readahead.c
mm/rmap.c
mm/secretmem.c
mm/shmem.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/userfaultfd.c
mm/util.c
mm/vmalloc.c
mm/vmscan.c
mm/workingset.c
net/batman-adv/bridge_loop_avoidance.c
net/batman-adv/main.c
net/batman-adv/network-coding.c
net/batman-adv/translation-table.c
net/core/dev.c
net/core/net-sysfs.c
net/core/skbuff.c
net/core/skmsg.c
net/core/sock_destructor.h [new file with mode: 0644]
net/core/sysctl_net_core.c
net/ipv4/tcp.c
net/ipv4/tcp_bpf.c
net/ipv4/udp.c
net/ipv4/udp_bpf.c
net/mac80211/mesh.c
net/mptcp/options.c
net/sctp/sm_statefuns.c
net/smc/af_smc.c
net/smc/smc_llc.c
net/tipc/crypto.c
net/tls/tls_main.c
net/tls/tls_sw.c
net/unix/af_unix.c
net/unix/unix_bpf.c
net/wireless/core.c
net/wireless/core.h
net/wireless/mlme.c
net/wireless/scan.c
net/wireless/util.c
tools/perf/Makefile.perf
tools/perf/arch/powerpc/util/skip-callchain-idx.c
tools/perf/builtin-script.c
tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
tools/testing/selftests/net/fcnal-test.sh
tools/testing/selftests/vm/split_huge_page_test.c

index 7f9b40d..4d151fb 100644 (file)
@@ -1,5 +1,7 @@
 .. SPDX-License-Identifier: GPL-2.0
 
+.. _inline_encryption:
+
 =================
 Inline Encryption
 =================
@@ -7,230 +9,269 @@ Inline Encryption
 Background
 ==========
 
-Inline encryption hardware sits logically between memory and the disk, and can
-en/decrypt data as it goes in/out of the disk. Inline encryption hardware has a
-fixed number of "keyslots" - slots into which encryption contexts (i.e. the
-encryption key, encryption algorithm, data unit size) can be programmed by the
-kernel at any time. Each request sent to the disk can be tagged with the index
-of a keyslot (and also a data unit number to act as an encryption tweak), and
-the inline encryption hardware will en/decrypt the data in the request with the
-encryption context programmed into that keyslot. This is very different from
-full disk encryption solutions like self encrypting drives/TCG OPAL/ATA
-Security standards, since with inline encryption, any block on disk could be
-encrypted with any encryption context the kernel chooses.
-
+Inline encryption hardware sits logically between memory and disk, and can
+en/decrypt data as it goes in/out of the disk.  For each I/O request, software
+can control exactly how the inline encryption hardware will en/decrypt the data
+in terms of key, algorithm, data unit size (the granularity of en/decryption),
+and data unit number (a value that determines the initialization vector(s)).
+
+Some inline encryption hardware accepts all encryption parameters including raw
+keys directly in low-level I/O requests.  However, most inline encryption
+hardware instead has a fixed number of "keyslots" and requires that the key,
+algorithm, and data unit size first be programmed into a keyslot.  Each
+low-level I/O request then just contains a keyslot index and data unit number.
+
+Note that inline encryption hardware is very different from traditional crypto
+accelerators, which are supported through the kernel crypto API.  Traditional
+crypto accelerators operate on memory regions, whereas inline encryption
+hardware operates on I/O requests.  Thus, inline encryption hardware needs to be
+managed by the block layer, not the kernel crypto API.
+
+Inline encryption hardware is also very different from "self-encrypting drives",
+such as those based on the TCG Opal or ATA Security standards.  Self-encrypting
+drives don't provide fine-grained control of encryption and provide no way to
+verify the correctness of the resulting ciphertext.  Inline encryption hardware
+provides fine-grained control of encryption, including the choice of key and
+initialization vector for each sector, and can be tested for correctness.
 
 Objective
 =========
 
-We want to support inline encryption (IE) in the kernel.
-To allow for testing, we also want a crypto API fallback when actual
-IE hardware is absent. We also want IE to work with layered devices
-like dm and loopback (i.e. we want to be able to use the IE hardware
-of the underlying devices if present, or else fall back to crypto API
-en/decryption).
-
+We want to support inline encryption in the kernel.  To make testing easier, we
+also want support for falling back to the kernel crypto API when actual inline
+encryption hardware is absent.  We also want inline encryption to work with
+layered devices like device-mapper and loopback (i.e. we want to be able to use
+the inline encryption hardware of the underlying devices if present, or else
+fall back to crypto API en/decryption).
 
 Constraints and notes
 =====================
 
-- IE hardware has a limited number of "keyslots" that can be programmed
-  with an encryption context (key, algorithm, data unit size, etc.) at any time.
-  One can specify a keyslot in a data request made to the device, and the
-  device will en/decrypt the data using the encryption context programmed into
-  that specified keyslot. When possible, we want to make multiple requests with
-  the same encryption context share the same keyslot.
-
-- We need a way for upper layers like filesystems to specify an encryption
-  context to use for en/decrypting a struct bio, and a device driver (like UFS)
-  needs to be able to use that encryption context when it processes the bio.
-
-- We need a way for device drivers to expose their inline encryption
-  capabilities in a unified way to the upper layers.
-
-
-Design
-======
-
-We add a struct bio_crypt_ctx to struct bio that can
-represent an encryption context, because we need to be able to pass this
-encryption context from the upper layers (like the fs layer) to the
-device driver to act upon.
-
-While IE hardware works on the notion of keyslots, the FS layer has no
-knowledge of keyslots - it simply wants to specify an encryption context to
-use while en/decrypting a bio.
-
-We introduce a keyslot manager (KSM) that handles the translation from
-encryption contexts specified by the FS to keyslots on the IE hardware.
-This KSM also serves as the way IE hardware can expose its capabilities to
-upper layers. The generic mode of operation is: each device driver that wants
-to support IE will construct a KSM and set it up in its struct request_queue.
-Upper layers that want to use IE on this device can then use this KSM in
-the device's struct request_queue to translate an encryption context into
-a keyslot. The presence of the KSM in the request queue shall be used to mean
-that the device supports IE.
-
-The KSM uses refcounts to track which keyslots are idle (either they have no
-encryption context programmed, or there are no in-flight struct bios
-referencing that keyslot). When a new encryption context needs a keyslot, it
-tries to find a keyslot that has already been programmed with the same
-encryption context, and if there is no such keyslot, it evicts the least
-recently used idle keyslot and programs the new encryption context into that
-one. If no idle keyslots are available, then the caller will sleep until there
-is at least one.
-
-
-blk-mq changes, other block layer changes and blk-crypto-fallback
-=================================================================
-
-We add a pointer to a ``bi_crypt_context`` and ``keyslot`` to
-struct request. These will be referred to as the ``crypto fields``
-for the request. This ``keyslot`` is the keyslot into which the
-``bi_crypt_context`` has been programmed in the KSM of the ``request_queue``
-that this request is being sent to.
-
-We introduce ``block/blk-crypto-fallback.c``, which allows upper layers to remain
-blissfully unaware of whether or not real inline encryption hardware is present
-underneath. When a bio is submitted with a target ``request_queue`` that doesn't
-support the encryption context specified with the bio, the block layer will
-en/decrypt the bio with the blk-crypto-fallback.
-
-If the bio is a ``WRITE`` bio, a bounce bio is allocated, and the data in the bio
-is encrypted stored in the bounce bio - blk-mq will then proceed to process the
-bounce bio as if it were not encrypted at all (except when blk-integrity is
-concerned). ``blk-crypto-fallback`` sets the bounce bio's ``bi_end_io`` to an
-internal function that cleans up the bounce bio and ends the original bio.
-
-If the bio is a ``READ`` bio, the bio's ``bi_end_io`` (and also ``bi_private``)
-is saved and overwritten by ``blk-crypto-fallback`` to
-``bio_crypto_fallback_decrypt_bio``.  The bio's ``bi_crypt_context`` is also
-overwritten with ``NULL``, so that to the rest of the stack, the bio looks
-as if it was a regular bio that never had an encryption context specified.
-``bio_crypto_fallback_decrypt_bio`` will decrypt the bio, restore the original
-``bi_end_io`` (and also ``bi_private``) and end the bio again.
-
-Regardless of whether real inline encryption hardware is used or the
+- We need a way for upper layers (e.g. filesystems) to specify an encryption
+  context to use for en/decrypting a bio, and device drivers (e.g. UFSHCD) need
+  to be able to use that encryption context when they process the request.
+  Encryption contexts also introduce constraints on bio merging; the block layer
+  needs to be aware of these constraints.
+
+- Different inline encryption hardware has different supported algorithms,
+  supported data unit sizes, maximum data unit numbers, etc.  We call these
+  properties the "crypto capabilities".  We need a way for device drivers to
+  advertise crypto capabilities to upper layers in a generic way.
+
+- Inline encryption hardware usually (but not always) requires that keys be
+  programmed into keyslots before being used.  Since programming keyslots may be
+  slow and there may not be very many keyslots, we shouldn't just program the
+  key for every I/O request, but rather keep track of which keys are in the
+  keyslots and reuse an already-programmed keyslot when possible.
+
+- Upper layers typically define a specific end-of-life for crypto keys, e.g.
+  when an encrypted directory is locked or when a crypto mapping is torn down.
+  At these times, keys are wiped from memory.  We must provide a way for upper
+  layers to also evict keys from any keyslots they are present in.
+
+- When possible, device-mapper devices must be able to pass through the inline
+  encryption support of their underlying devices.  However, it doesn't make
+  sense for device-mapper devices to have keyslots themselves.
+
+Basic design
+============
+
+We introduce ``struct blk_crypto_key`` to represent an inline encryption key and
+how it will be used.  This includes the actual bytes of the key; the size of the
+key; the algorithm and data unit size the key will be used with; and the number
+of bytes needed to represent the maximum data unit number the key will be used
+with.
+
+We introduce ``struct bio_crypt_ctx`` to represent an encryption context.  It
+contains a data unit number and a pointer to a blk_crypto_key.  We add pointers
+to a bio_crypt_ctx to ``struct bio`` and ``struct request``; this allows users
+of the block layer (e.g. filesystems) to provide an encryption context when
+creating a bio and have it be passed down the stack for processing by the block
+layer and device drivers.  Note that the encryption context doesn't explicitly
+say whether to encrypt or decrypt, as that is implicit from the direction of the
+bio; WRITE means encrypt, and READ means decrypt.
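
As a rough illustration of the two structures just described, the sketch below
approximates their shape.  It is based on the 5.16-era ``<linux/blk-crypto.h>``;
the field names here are assumptions made for illustration, so consult that
header for the authoritative definitions::

    /* Approximate shape only -- see <linux/blk-crypto.h> for the real thing. */
    struct blk_crypto_config {
        enum blk_crypto_mode_num crypto_mode;   /* encryption algorithm */
        unsigned int data_unit_size;            /* en/decryption granularity */
        unsigned int dun_bytes;                 /* bytes needed for the max DUN */
    };

    struct blk_crypto_key {
        struct blk_crypto_config crypto_cfg;
        unsigned int data_unit_size_bits;
        unsigned int size;                      /* key size in bytes */
        u8 raw[BLK_CRYPTO_MAX_KEY_SIZE];        /* the actual key bytes */
    };

    struct bio_crypt_ctx {
        const struct blk_crypto_key *bc_key;
        u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];  /* data unit number */
    };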
+
+We also introduce ``struct blk_crypto_profile`` to contain all generic inline
+encryption-related state for a particular inline encryption device.  The
+blk_crypto_profile serves as the way that drivers for inline encryption hardware
+advertise their crypto capabilities and provide certain functions (e.g.,
+functions to program and evict keys) to upper layers.  Each device driver that
+wants to support inline encryption will construct a blk_crypto_profile, then
+associate it with the disk's request_queue.
+
+The blk_crypto_profile also manages the hardware's keyslots, when applicable.
+This happens in the block layer, so that users of the block layer can just
+specify encryption contexts and don't need to know about keyslots at all, nor do
+device drivers need to care about most details of keyslot management.
+
+Specifically, for each keyslot, the block layer (via the blk_crypto_profile)
+keeps track of which blk_crypto_key that keyslot contains (if any), and how many
+in-flight I/O requests are using it.  When the block layer creates a
+``struct request`` for a bio that has an encryption context, it grabs a keyslot
+that already contains the key if possible.  Otherwise it waits for an idle
+keyslot (a keyslot that isn't in-use by any I/O), then programs the key into the
+least-recently-used idle keyslot using the function the device driver provided.
+In both cases, the resulting keyslot is stored in the ``crypt_keyslot`` field of
+the request, where it is then accessible to device drivers and is released after
+the request completes.
+
+``struct request`` also contains a pointer to the original bio_crypt_ctx.
+Requests can be built from multiple bios, and the block layer must take the
+encryption context into account when trying to merge bios and requests.  For two
+bios/requests to be merged, they must have compatible encryption contexts: both
+unencrypted, or both encrypted with the same key and contiguous data unit
+numbers.  Only the encryption context for the first bio in a request is
+retained, since the remaining bios have been verified to be merge-compatible
+with the first bio.
+
+To make it possible for inline encryption to work with request_queue based
+layered devices, when a request is cloned, its encryption context is cloned as
+well.  When the cloned request is submitted, it is then processed as usual; this
+includes getting a keyslot from the clone's target device if needed.
+
+blk-crypto-fallback
+===================
+
+It is desirable for the inline encryption support of upper layers (e.g.
+filesystems) to be testable without real inline encryption hardware, and
+likewise for the block layer's keyslot management logic.  It is also desirable
+to allow upper layers to just always use inline encryption rather than have to
+implement encryption in multiple ways.
+
+Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
+of inline encryption using the kernel crypto API.  blk-crypto-fallback is built
+into the block layer, so it works on any block device without any special setup.
+Essentially, when a bio with an encryption context is submitted to a
+request_queue that doesn't support that encryption context, the block layer will
+handle en/decryption of the bio using blk-crypto-fallback.
+
+For encryption, the data cannot be encrypted in-place, as callers usually rely
+on it being unmodified.  Instead, blk-crypto-fallback allocates bounce pages,
+fills a new bio with those bounce pages, encrypts the data into those bounce
+pages, and submits that "bounce" bio.  When the bounce bio completes,
+blk-crypto-fallback completes the original bio.  If the original bio is too
+large, multiple bounce bios may be required; see the code for details.
+
+For decryption, blk-crypto-fallback "wraps" the bio's completion callback
+(``bi_end_io``) and private data (``bi_private``) with its own, unsets the
+bio's encryption context, then submits the bio.  If the read completes
+successfully, blk-crypto-fallback restores the bio's original completion
+callback and private data, then decrypts the bio's data in-place using the
+kernel crypto API.  Decryption happens from a workqueue, as it may sleep.
+Afterwards, blk-crypto-fallback completes the bio.
+
+In both cases, the bios that blk-crypto-fallback submits no longer have an
+encryption context.  Therefore, lower layers only see standard unencrypted I/O.
+
+blk-crypto-fallback also defines its own blk_crypto_profile and has its own
+"keyslots"; its keyslots contain ``struct crypto_skcipher`` objects.  The reason
+for this is twofold.  First, it allows the keyslot management logic to be tested
+without actual inline encryption hardware.  Second, similar to actual inline
+encryption hardware, the crypto API doesn't accept keys directly in requests but
+rather requires that keys be set ahead of time, and setting keys can be
+expensive; moreover, allocating a crypto_skcipher can't happen on the I/O path
+at all due to the locks it takes.  Therefore, the concept of keyslots still
+makes sense for blk-crypto-fallback.
+
+Note that regardless of whether real inline encryption hardware or
 blk-crypto-fallback is used, the ciphertext written to disk (and hence the
-on-disk format of data) will be the same (assuming the hardware's implementation
-of the algorithm being used adheres to spec and functions correctly).
-
-If a ``request queue``'s inline encryption hardware claimed to support the
-encryption context specified with a bio, then it will not be handled by the
-``blk-crypto-fallback``. We will eventually reach a point in blk-mq when a
-struct request needs to be allocated for that bio. At that point,
-blk-mq tries to program the encryption context into the ``request_queue``'s
-keyslot_manager, and obtain a keyslot, which it stores in its newly added
-``keyslot`` field. This keyslot is released when the request is completed.
-
-When the first bio is added to a request, ``blk_crypto_rq_bio_prep`` is called,
-which sets the request's ``crypt_ctx`` to a copy of the bio's
-``bi_crypt_context``. bio_crypt_do_front_merge is called whenever a subsequent
-bio is merged to the front of the request, which updates the ``crypt_ctx`` of
-the request so that it matches the newly merged bio's ``bi_crypt_context``. In particular, the request keeps a copy of the ``bi_crypt_context`` of the first
-bio in its bio-list (blk-mq needs to be careful to maintain this invariant
-during bio and request merges).
-
-To make it possible for inline encryption to work with request queue based
-layered devices, when a request is cloned, its ``crypto fields`` are cloned as
-well. When the cloned request is submitted, blk-mq programs the
-``bi_crypt_context`` of the request into the clone's request_queue's keyslot
-manager, and stores the returned keyslot in the clone's ``keyslot``.
+on-disk format of data) will be the same (assuming that both the inline
+encryption hardware's implementation and the kernel crypto API's implementation
+of the algorithm being used adhere to spec and function correctly).
 
+blk-crypto-fallback is optional and is controlled by the
+``CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK`` kernel configuration option.
 
 API presented to users of the block layer
 =========================================
 
-``struct blk_crypto_key`` represents a crypto key (the raw key, size of the
-key, the crypto algorithm to use, the data unit size to use, and the number of
-bytes required to represent data unit numbers that will be specified with the
-``bi_crypt_context``).
-
-``blk_crypto_init_key`` allows upper layers to initialize such a
-``blk_crypto_key``.
-
-``bio_crypt_set_ctx`` should be called on any bio that a user of
-the block layer wants en/decrypted via inline encryption (or the
-blk-crypto-fallback, if hardware support isn't available for the desired
-crypto configuration). This function takes the ``blk_crypto_key`` and the
-data unit number (DUN) to use when en/decrypting the bio.
-
-``blk_crypto_config_supported`` allows upper layers to query whether or not the
-an encryption context passed to request queue can be handled by blk-crypto
-(either by real inline encryption hardware, or by the blk-crypto-fallback).
-This is useful e.g. when blk-crypto-fallback is disabled, and the upper layer
-wants to use an algorithm that may not supported by hardware - this function
-lets the upper layer know ahead of time that the algorithm isn't supported,
-and the upper layer can fallback to something else if appropriate.
-
-``blk_crypto_start_using_key`` - Upper layers must call this function on
-``blk_crypto_key`` and a ``request_queue`` before using the key with any bio
-headed for that ``request_queue``. This function ensures that either the
-hardware supports the key's crypto settings, or the crypto API fallback has
-transforms for the needed mode allocated and ready to go. Note that this
-function may allocate an ``skcipher``, and must not be called from the data
-path, since allocating ``skciphers`` from the data path can deadlock.
-
-``blk_crypto_evict_key`` *must* be called by upper layers before a
-``blk_crypto_key`` is freed. Further, it *must* only be called only once
-there are no more in-flight requests that use that ``blk_crypto_key``.
-``blk_crypto_evict_key`` will ensure that a key is removed from any keyslots in
-inline encryption hardware that the key might have been programmed into (or the blk-crypto-fallback).
+``blk_crypto_config_supported()`` allows users to check ahead of time whether
+inline encryption with particular crypto settings will work on a particular
+request_queue -- either via hardware or via blk-crypto-fallback.  This function
+takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
+the actual bytes of the key and instead just contains the algorithm, data unit
+size, etc.  This function can be useful if blk-crypto-fallback is disabled.
+
+``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
+
+Users must call ``blk_crypto_start_using_key()`` before actually starting to use
+a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()``
+was called earlier).  This is needed to initialize blk-crypto-fallback if it
+will be needed.  This must not be called from the data path, as this may have to
+allocate resources, which may deadlock in that case.
+
+Next, to attach an encryption context to a bio, users should call
+``bio_crypt_set_ctx()``.  This function allocates a bio_crypt_ctx and attaches
+it to a bio, given the blk_crypto_key and the data unit number that will be used
+for en/decryption.  Users don't need to worry about freeing the bio_crypt_ctx
+later, as that happens automatically when the bio is freed or reset.
+
+Finally, when done using inline encryption with a blk_crypto_key on a
+request_queue, users must call ``blk_crypto_evict_key()``.  This ensures that
+the key is evicted from all keyslots it may be programmed into and unlinked from
+any kernel data structures it may be linked into.
+
+In summary, for users of the block layer, the lifecycle of a blk_crypto_key is
+as follows:
+
+1. ``blk_crypto_config_supported()`` (optional)
+2. ``blk_crypto_init_key()``
+3. ``blk_crypto_start_using_key()``
+4. ``bio_crypt_set_ctx()`` (potentially many times)
+5. ``blk_crypto_evict_key()`` (after all I/O has completed)
+6. Zeroize the blk_crypto_key (this has no dedicated function)
+
+If a blk_crypto_key is being used on multiple request_queues, then
+``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
+and ``blk_crypto_evict_key()`` must be called on each request_queue.
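
To make that lifecycle concrete, here is a minimal sketch of a hypothetical
block-layer user driving a blk_crypto_key through the steps above.  It assumes
the 5.16-era C API declared in ``<linux/blk-crypto.h>``; the exact signatures
are assumptions, ``example_write_encrypted()`` and its parameters are invented
for illustration, and error/cleanup handling is abbreviated::

    #include <linux/bio.h>
    #include <linux/blk-crypto.h>
    #include <linux/blkdev.h>
    #include <linux/string.h>

    /* Sketch only, not upstream code: encrypt one bio with AES-256-XTS.
     * raw_key is 64 bytes, as required for AES-256-XTS. */
    static int example_write_encrypted(struct request_queue *q, struct bio *bio,
                                       const u8 raw_key[64], u64 dun)
    {
        const struct blk_crypto_config cfg = {
            .crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
            .data_unit_size = 4096,
            .dun_bytes = 4,
        };
        struct blk_crypto_key key;
        u64 dun_array[BLK_CRYPTO_DUN_ARRAY_SIZE] = { dun };
        int err;

        /* 1. (Optional) check ahead of time that the settings can be handled. */
        if (!blk_crypto_config_supported(q, &cfg))
            return -EOPNOTSUPP;

        /* 2. Initialize the key: AES-256-XTS, 4096-byte data units. */
        err = blk_crypto_init_key(&key, raw_key, BLK_ENCRYPTION_MODE_AES_256_XTS,
                                  4 /* dun_bytes */, 4096);
        if (err)
            return err;

        /* 3. Prepare the hardware keyslots or blk-crypto-fallback for this key. */
        err = blk_crypto_start_using_key(&key, q);
        if (err)
            return err;

        /* 4. Attach the encryption context and submit; WRITE implies encrypt. */
        bio_crypt_set_ctx(bio, &key, dun_array, GFP_NOIO);
        err = submit_bio_wait(bio);

        /* 5. Once no I/O is using the key any more, evict it everywhere... */
        blk_crypto_evict_key(q, &key);
        /* 6. ...and zeroize the key material (no dedicated helper for this). */
        memzero_explicit(&key, sizeof(key));
        return err;
    }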
 
 API presented to device drivers
 ===============================
 
-A :c:type:``struct blk_keyslot_manager`` should be set up by device drivers in
-the ``request_queue`` of the device. The device driver needs to call
-``blk_ksm_init`` (or its resource-managed variant ``devm_blk_ksm_init``) on the
-``blk_keyslot_manager``, while specifying the number of keyslots supported by
-the hardware.
-
-The device driver also needs to tell the KSM how to actually manipulate the
-IE hardware in the device to do things like programming the crypto key into
-the IE hardware into a particular keyslot. All this is achieved through the
-struct blk_ksm_ll_ops field in the KSM that the device driver
-must fill up after initing the ``blk_keyslot_manager``.
-
-The KSM also handles runtime power management for the device when applicable
-(e.g. when it wants to program a crypto key into the IE hardware, the device
-must be runtime powered on) - so the device driver must also set the ``dev``
-field in the ksm to point to the `struct device` for the KSM to use for runtime
-power management.
-
-``blk_ksm_reprogram_all_keys`` can be called by device drivers if the device
-needs each and every of its keyslots to be reprogrammed with the key it
-"should have" at the point in time when the function is called. This is useful
-e.g. if a device loses all its keys on runtime power down/up.
-
-If the driver used ``blk_ksm_init`` instead of ``devm_blk_ksm_init``, then
-``blk_ksm_destroy`` should be called to free up all resources used by a
-``blk_keyslot_manager`` once it is no longer needed.
+A device driver that wants to support inline encryption must set up a
+blk_crypto_profile in the request_queue of its device.  To do this, it first
+must call ``blk_crypto_profile_init()`` (or its resource-managed variant
+``devm_blk_crypto_profile_init()``), providing the number of keyslots.
+
+Next, it must advertise its crypto capabilities by setting fields in the
+blk_crypto_profile, e.g. ``modes_supported`` and ``max_dun_bytes_supported``.
+
+It then must set function pointers in the ``ll_ops`` field of the
+blk_crypto_profile to tell upper layers how to control the inline encryption
+hardware, e.g. how to program and evict keyslots.  Most drivers will need to
+implement ``keyslot_program`` and ``keyslot_evict``.  For details, see the
+comments for ``struct blk_crypto_ll_ops``.
+
+Once the driver registers a blk_crypto_profile with a request_queue, I/O
+requests the driver receives via that queue may have an encryption context.  All
+encryption contexts will be compatible with the crypto capabilities declared in
+the blk_crypto_profile, so drivers don't need to worry about handling
+unsupported requests.  Also, if a nonzero number of keyslots was declared in the
+blk_crypto_profile, then all I/O requests that have an encryption context will
+also have a keyslot which was already programmed with the appropriate key.
+
+If the driver implements runtime suspend and its blk_crypto_ll_ops don't work
+while the device is runtime-suspended, then the driver must also set the ``dev``
+field of the blk_crypto_profile to point to the ``struct device`` that will be
+resumed before any of the low-level operations are called.
+
+If there are situations where the inline encryption hardware loses the contents
+of its keyslots, e.g. device resets, the driver must handle reprogramming the
+keyslots.  To do this, the driver may call ``blk_crypto_reprogram_all_keys()``.
+
+Finally, if the driver used ``blk_crypto_profile_init()`` instead of
+``devm_blk_crypto_profile_init()``, then it is responsible for calling
+``blk_crypto_profile_destroy()`` when the crypto profile is no longer needed.
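
As a rough sketch of the driver-side setup just described (assuming the
5.16-era API: the ``my_*`` names, the 32-keyslot count and the capability
values are invented, ``blk_crypto_register()`` is assumed to be the helper
that attaches the profile to the request_queue, and a real driver would do
this from its probe path with full error handling)::

    #include <linux/blk-crypto-profile.h>
    #include <linux/blkdev.h>

    struct my_host {
        struct device *dev;
        struct request_queue *q;
        struct blk_crypto_profile profile;
    };

    static int my_keyslot_program(struct blk_crypto_profile *profile,
                                  const struct blk_crypto_key *key,
                                  unsigned int slot)
    {
        /* Program key->raw (key->size bytes) into hardware keyslot 'slot'. */
        return 0;       /* hardware-specific work elided */
    }

    static int my_keyslot_evict(struct blk_crypto_profile *profile,
                                const struct blk_crypto_key *key,
                                unsigned int slot)
    {
        /* Clear hardware keyslot 'slot'. */
        return 0;       /* hardware-specific work elided */
    }

    static const struct blk_crypto_ll_ops my_crypto_ll_ops = {
        .keyslot_program = my_keyslot_program,
        .keyslot_evict   = my_keyslot_evict,
    };

    static int my_init_crypto(struct my_host *host)
    {
        int err;

        /* Resource-managed init; 32 keyslots is an arbitrary example. */
        err = devm_blk_crypto_profile_init(host->dev, &host->profile, 32);
        if (err)
            return err;

        host->profile.ll_ops = my_crypto_ll_ops;
        host->profile.dev = host->dev;  /* resumed before ll_ops calls */

        /* Advertise capabilities: AES-256-XTS with 4096-byte data units,
         * data unit numbers up to 8 bytes wide. */
        host->profile.max_dun_bytes_supported = 8;
        host->profile.modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] |= 4096;

        /* Expose the profile via the disk's request_queue. */
        if (!blk_crypto_register(&host->profile, host->q))
            return -EINVAL;
        return 0;
    }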
 
 Layered Devices
 ===============
 
-Request queue based layered devices like dm-rq that wish to support IE need to
-create their own keyslot manager for their request queue, and expose whatever
-functionality they choose. When a layered device wants to pass a clone of that
-request to another ``request_queue``, blk-crypto will initialize and prepare the
-clone as necessary - see ``blk_crypto_insert_cloned_request`` in
-``blk-crypto.c``.
-
-
-Future Optimizations for layered devices
-========================================
-
-Creating a keyslot manager for a layered device uses up memory for each
-keyslot, and in general, a layered device merely passes the request on to a
-"child" device, so the keyslots in the layered device itself are completely
-unused, and don't need any refcounting or keyslot programming. We can instead
-define a new type of KSM; the "passthrough KSM", that layered devices can use
-to advertise an unlimited number of keyslots, and support for any encryption
-algorithms they choose, while not actually using any memory for each keyslot.
-Another use case for the "passthrough KSM" is for IE devices that do not have a
-limited number of keyslots.
-
+Request queue based layered devices like dm-rq that wish to support inline
+encryption need to create their own blk_crypto_profile for their request_queue,
+and expose whatever functionality they choose. When a layered device wants to
+pass a clone of that request to another request_queue, blk-crypto will
+initialize and prepare the clone as necessary; see
+``blk_crypto_insert_cloned_request()``.
 
 Interaction between inline encryption and blk integrity
 =======================================================
@@ -257,7 +298,7 @@ Because there isn't any real hardware yet, it seems prudent to assume that
 hardware implementations might not implement both features together correctly,
 and disallow the combination for now. Whenever a device supports integrity, the
 kernel will pretend that the device does not support hardware inline encryption
-(by essentially setting the keyslot manager in the request_queue of the device
-to NULL). When the crypto API fallback is enabled, this means that all bios with
-and encryption context will use the fallback, and IO will complete as usual.
-When the fallback is disabled, a bio with an encryption context will be failed.
+(by setting the blk_crypto_profile in the request_queue of the device to NULL).
+When the crypto API fallback is enabled, this means that all bios with an
+encryption context will use the fallback, and IO will complete as usual.  When
+the fallback is disabled, a bio with an encryption context will be failed.
index 4dc7f0d..e8c7430 100644 (file)
@@ -4,7 +4,7 @@ Queue sysfs files
 
 This text file will detail the queue files that are located in the sysfs tree
 for each block device. Note that stacked devices typically do not export
-any settings, since their queue merely functions are a remapping target.
+any settings, since their queue merely functions as a remapping target.
 These files are the ones found in the /sys/block/xxx/queue/ directory.
 
 Files denoted with a RO postfix are readonly and the RW postfix means
@@ -286,4 +286,35 @@ sequential zones of zoned block devices (devices with a zoned attributed
 that reports "host-managed" or "host-aware"). This value is always 0 for
 regular block devices.
 
+independent_access_ranges (RO)
+------------------------------
+
+The presence of this sub-directory of the /sys/block/xxx/queue/ directory
+indicates that the device is capable of executing requests targeting
+different sector ranges in parallel. For instance, single LUN multi-actuator
+hard-disks will have an independent_access_ranges directory if the device
+correctly advertises the sector ranges of its actuators.
+
+The independent_access_ranges directory contains one directory per access
+range, with each range described using the sector (RO) attribute file to
+indicate the first sector of the range and the nr_sectors (RO) attribute file
+to indicate the total number of sectors in the range starting from the first
+sector of the range.  For example, a dual-actuator hard-disk will have the
+following independent_access_ranges entries::
+
+        $ tree /sys/block/<device>/queue/independent_access_ranges/
+        /sys/block/<device>/queue/independent_access_ranges/
+        |-- 0
+        |   |-- nr_sectors
+        |   `-- sector
+        `-- 1
+            |-- nr_sectors
+            `-- sector
+
+The sector and nr_sectors attributes use 512B sector units, regardless of
+the actual block size of the device. Independent access ranges do not
+overlap and include all sectors within the device capacity. The access
+ranges are numbered in increasing order of the range start sector,
+that is, the sector attribute of range 0 always has the value 0.
+
 Jens Axboe <jens.axboe@oracle.com>, February 2009
index 5845960..52ea7b6 100644 (file)
@@ -907,6 +907,17 @@ commands can be identified by the underscores in their names.
        specifies the slot for which the information is given. The special
        value *CDSL_CURRENT* requests that information about the currently
        selected slot be returned.
+`CDROM_TIMED_MEDIA_CHANGE`
+       Checks whether the disc has been changed since a user supplied time
+       and returns the time of the last disc change.
+
+       *arg* is a pointer to a *cdrom_timed_media_change_info* struct.
+       *arg->last_media_change* may be set by calling code to signal
+       the timestamp of the last known media change (by the caller).
+       Upon successful return, this ioctl call will set
+       *arg->last_media_change* to the latest media change timestamp (in ms)
+       known by the kernel/driver and set *arg->has_changed* to 1 if
+       that timestamp is more recent than the timestamp set by the caller.
 `CDROM_DRIVE_STATUS`
        Returns the status of the drive by a call to
        *drive_status()*. Return values are defined in cdrom_drive_status_.
index 8aed910..5c0552e 100644 (file)
@@ -326,6 +326,12 @@ maps this page at its virtual address.
                        dirty.  Again, see sparc64 for examples of how
                        to deal with this.
 
+  ``void flush_dcache_folio(struct folio *folio)``
+       This function is called under the same circumstances as
+       flush_dcache_page().  It allows the architecture to
+       optimise for flushing the entire folio of pages instead
+       of flushing one page at a time.
+
   ``void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
   unsigned long user_vaddr, void *dst, void *src, int len)``
   ``void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
index a42f9ba..395835f 100644 (file)
@@ -95,6 +95,11 @@ More Memory Management Functions
 .. kernel-doc:: mm/mempolicy.c
 .. kernel-doc:: include/linux/mm_types.h
    :internal:
+.. kernel-doc:: include/linux/mm_inline.h
+.. kernel-doc:: include/linux/page-flags.h
 .. kernel-doc:: include/linux/mm.h
    :internal:
+.. kernel-doc:: include/linux/page_ref.h
 .. kernel-doc:: include/linux/mmzone.h
+.. kernel-doc:: mm/util.c
+   :functions: folio_mapping
index b97579b..01df283 100644 (file)
@@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios:
    immutable and bit-for-bit identical to the official golden image for
    their releases due to security and other considerations and
 
- - hope to save some extra storage space with guaranteed end-to-end performance
-   by using reduced metadata and transparent file compression, especially
-   for those embedded devices with limited memory (ex, smartphone);
+ - hope to minimize extra storage space with guaranteed end-to-end performance
+   by using compact layout, transparent file compression and direct access,
+   especially for those embedded devices with limited memory and high-density
+   hosts with numerous containers;
 
 Here is the main features of EROFS:
 
@@ -51,7 +52,9 @@ Here is the main features of EROFS:
  - Support POSIX.1e ACLs by using xattrs;
 
  - Support transparent data compression as an option:
-   LZ4 algorithm with the fixed-sized output compression for high performance.
+   LZ4 algorithm with the fixed-sized output compression for high performance;
+
+ - Multiple device support for multi-layer container images.
 
 The following git tree provides the file system user-space tools under
 development (ex, formatting tool mkfs.erofs):
@@ -87,6 +90,7 @@ cache_strategy=%s      Select a strategy for cached decompression from now on:
 dax={always,never}     Use direct access (no page cache).  See
                        Documentation/filesystems/dax.rst.
 dax                    A legacy option which is an alias for ``dax=always``.
+device=%s              Specify a path to an extra device to be used together with the primary device.
 ===================    =========================================================
 
 On-disk details
index 0eb799d..4d5d50d 100644 (file)
@@ -77,11 +77,11 @@ Side-channel attacks
 
 fscrypt is only resistant to side-channel attacks, such as timing or
 electromagnetic attacks, to the extent that the underlying Linux
-Cryptographic API algorithms are.  If a vulnerable algorithm is used,
-such as a table-based implementation of AES, it may be possible for an
-attacker to mount a side channel attack against the online system.
-Side channel attacks may also be mounted against applications
-consuming decrypted data.
+Cryptographic API algorithms or inline encryption hardware are.  If a
+vulnerable algorithm is used, such as a table-based implementation of
+AES, it may be possible for an attacker to mount a side channel attack
+against the online system.  Side channel attacks may also be mounted
+against applications consuming decrypted data.
 
 Unauthorized file access
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -176,11 +176,11 @@ Master Keys
 
 Each encrypted directory tree is protected by a *master key*.  Master
 keys can be up to 64 bytes long, and must be at least as long as the
-greater of the key length needed by the contents and filenames
-encryption modes being used.  For example, if AES-256-XTS is used for
-contents encryption, the master key must be 64 bytes (512 bits).  Note
-that the XTS mode is defined to require a key twice as long as that
-required by the underlying block cipher.
+greater of the security strength of the contents and filenames
+encryption modes being used.  For example, if any AES-256 mode is
+used, the master key must be at least 256 bits, i.e. 32 bytes.  A
+stricter requirement applies if the key is used by a v1 encryption
+policy and AES-256-XTS is used; such keys must be 64 bytes.
 
 To "unlock" an encrypted directory tree, userspace must provide the
 appropriate master key.  There can be any number of master keys, each
@@ -1135,6 +1135,50 @@ where applications may later write sensitive data.  It is recommended
 that systems implementing a form of "verified boot" take advantage of
 this by validating all top-level encryption policies prior to access.
 
+Inline encryption support
+=========================
+
+By default, fscrypt uses the kernel crypto API for all cryptographic
+operations (other than HKDF, which fscrypt partially implements
+itself).  The kernel crypto API supports hardware crypto accelerators,
+but only ones that work in the traditional way where all inputs and
+outputs (e.g. plaintexts and ciphertexts) are in memory.  fscrypt can
+take advantage of such hardware, but the traditional acceleration
+model isn't particularly efficient and fscrypt hasn't been optimized
+for it.
+
+Instead, many newer systems (especially mobile SoCs) have *inline
+encryption hardware* that can encrypt/decrypt data while it is on its
+way to/from the storage device.  Linux supports inline encryption
+through a set of extensions to the block layer called *blk-crypto*.
+blk-crypto allows filesystems to attach encryption contexts to bios
+(I/O requests) to specify how the data will be encrypted or decrypted
+in-line.  For more information about blk-crypto, see
+:ref:`Documentation/block/inline-encryption.rst <inline_encryption>`.
+
+On supported filesystems (currently ext4 and f2fs), fscrypt can use
+blk-crypto instead of the kernel crypto API to encrypt/decrypt file
+contents.  To enable this, set CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y in
+the kernel configuration, and specify the "inlinecrypt" mount option
+when mounting the filesystem.
+
+Note that the "inlinecrypt" mount option just specifies to use inline
+encryption when possible; it doesn't force its use.  fscrypt will
+still fall back to using the kernel crypto API on files where the
+inline encryption hardware doesn't have the needed crypto capabilities
+(e.g. support for the needed encryption algorithm and data unit size)
+and where blk-crypto-fallback is unusable.  (For blk-crypto-fallback
+to be usable, it must be enabled in the kernel configuration with
+CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y.)
+
+Currently fscrypt always uses the filesystem block size (which is
+usually 4096 bytes) as the data unit size.  Therefore, it can only use
+inline encryption hardware that supports that data unit size.
+
+Inline encryption doesn't affect the ciphertext or other aspects of
+the on-disk format, so users may freely switch back and forth between
+using "inlinecrypt" and not using "inlinecrypt".
+
 Implementation details
 ======================
 
@@ -1184,6 +1228,13 @@ keys`_ and `DIRECT_KEY policies`_.
 Data path changes
 -----------------
 
+When inline encryption is used, filesystems just need to associate
+encryption contexts with bios to specify how the block layer or the
+inline encryption hardware will encrypt/decrypt the file contents.
+
+When inline encryption isn't used, filesystems must encrypt/decrypt
+the file contents themselves, as described below:
+
 For the read path (->readpage()) of regular files, filesystems can
 read the ciphertext into the page cache and decrypt it in-place.  The
 page lock must be held until decryption has finished, to prevent the
@@ -1197,18 +1248,6 @@ buffer.  Some filesystems, such as UBIFS, already use temporary
 buffers regardless of encryption.  Other filesystems, such as ext4 and
 F2FS, have to allocate bounce pages specially for encryption.
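
Roughly, those per-page steps reduce to the fs/crypto helpers; a simplified
sketch, in which the wrapper names are illustrative and real filesystems add
locking and error handling::

  #include <linux/fscrypt.h>
  #include <linux/pagemap.h>

  /* Read path: ciphertext has been read into @page; decrypt it in place. */
  static int example_post_read(struct page *page)
  {
          return fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);
  }

  /*
   * Write path: encrypt into a bounce page, which is what actually gets
   * written out; the return value may be an ERR_PTR on failure.
   */
  static struct page *example_prepare_write(struct page *page, u64 lblk_num)
  {
          return fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0,
                                                  lblk_num, GFP_NOFS);
  }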
 
-Fscrypt is also able to use inline encryption hardware instead of the
-kernel crypto API for en/decryption of file contents.  When possible,
-and if directed to do so (by specifying the 'inlinecrypt' mount option
-for an ext4/F2FS filesystem), it adds encryption contexts to bios and
-uses blk-crypto to perform the en/decryption instead of making use of
-the above read/write path changes.  Of course, even if directed to
-make use of inline encryption, fscrypt will only be able to do so if
-either hardware inline encryption support is available for the
-selected encryption algorithm or CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK
-is selected.  If neither is the case, fscrypt will fall back to using
-the above mentioned read/write path changes for en/decryption.
-
 Filename hashing and encoding
 -----------------------------
 
index c0ad233..bee63d4 100644 (file)
@@ -29,7 +29,6 @@ algorithms work.
    fiemap
    files
    locks
-   mandatory-locking
    mount_api
    quota
    seq_file
index c5ae858..2642931 100644 (file)
@@ -57,16 +57,9 @@ fcntl(), with all the problems that implies.
 1.3 Mandatory Locking As A Mount Option
 ---------------------------------------
 
-Mandatory locking, as described in
-'Documentation/filesystems/mandatory-locking.rst' was prior to this release a
-general configuration option that was valid for all mounted filesystems.  This
-had a number of inherent dangers, not the least of which was the ability to
-freeze an NFS server by asking it to read a file for which a mandatory lock
-existed.
-
-From this release of the kernel, mandatory locking can be turned on and off
-on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
-The default is to disallow mandatory locking. The intention is that
-mandatory locking only be enabled on a local filesystem as the specific need
-arises.
+Mandatory locking used to be a general configuration option that applied to
+all mounted filesystems.  This had a number of inherent dangers, not the
+least of which was the ability to freeze an NFS server by asking it to read
+a file for which a mandatory lock existed.
 
+Support for mandatory locking was dropped in kernel v5.14.
index 57a6418..bb68d39 100644 (file)
@@ -524,3 +524,5 @@ Note that these methods are passed a pointer to the cache resource structure,
 not the read request structure as they could be used in other situations where
 there isn't a read request structure as well, such as writing dirty data to the
 cache.
+
+.. kernel-doc:: include/linux/netfs.h
index 3b4c050..682948f 100644 (file)
@@ -13,61 +13,64 @@ in drivers/cdrom/cdrom.c and drivers/block/scsi_ioctl.c
 ioctl values are listed in <linux/cdrom.h>.  As of this writing, they
 are as follows:
 
-       ======================  ===============================================
-       CDROMPAUSE              Pause Audio Operation
-       CDROMRESUME             Resume paused Audio Operation
-       CDROMPLAYMSF            Play Audio MSF (struct cdrom_msf)
-       CDROMPLAYTRKIND         Play Audio Track/index (struct cdrom_ti)
-       CDROMREADTOCHDR         Read TOC header (struct cdrom_tochdr)
-       CDROMREADTOCENTRY       Read TOC entry (struct cdrom_tocentry)
-       CDROMSTOP               Stop the cdrom drive
-       CDROMSTART              Start the cdrom drive
-       CDROMEJECT              Ejects the cdrom media
-       CDROMVOLCTRL            Control output volume (struct cdrom_volctrl)
-       CDROMSUBCHNL            Read subchannel data (struct cdrom_subchnl)
-       CDROMREADMODE2          Read CDROM mode 2 data (2336 Bytes)
-                               (struct cdrom_read)
-       CDROMREADMODE1          Read CDROM mode 1 data (2048 Bytes)
-                               (struct cdrom_read)
-       CDROMREADAUDIO          (struct cdrom_read_audio)
-       CDROMEJECT_SW           enable(1)/disable(0) auto-ejecting
-       CDROMMULTISESSION       Obtain the start-of-last-session
-                               address of multi session disks
-                               (struct cdrom_multisession)
-       CDROM_GET_MCN           Obtain the "Universal Product Code"
-                               if available (struct cdrom_mcn)
-       CDROM_GET_UPC           Deprecated, use CDROM_GET_MCN instead.
-       CDROMRESET              hard-reset the drive
-       CDROMVOLREAD            Get the drive's volume setting
-                               (struct cdrom_volctrl)
-       CDROMREADRAW            read data in raw mode (2352 Bytes)
-                               (struct cdrom_read)
-       CDROMREADCOOKED         read data in cooked mode
-       CDROMSEEK               seek msf address
-       CDROMPLAYBLK            scsi-cd only, (struct cdrom_blk)
-       CDROMREADALL            read all 2646 bytes
-       CDROMGETSPINDOWN        return 4-bit spindown value
-       CDROMSETSPINDOWN        set 4-bit spindown value
-       CDROMCLOSETRAY          pendant of CDROMEJECT
-       CDROM_SET_OPTIONS       Set behavior options
-       CDROM_CLEAR_OPTIONS     Clear behavior options
-       CDROM_SELECT_SPEED      Set the CD-ROM speed
-       CDROM_SELECT_DISC       Select disc (for juke-boxes)
-       CDROM_MEDIA_CHANGED     Check is media changed
-       CDROM_DRIVE_STATUS      Get tray position, etc.
-       CDROM_DISC_STATUS       Get disc type, etc.
-       CDROM_CHANGER_NSLOTS    Get number of slots
-       CDROM_LOCKDOOR          lock or unlock door
-       CDROM_DEBUG             Turn debug messages on/off
-       CDROM_GET_CAPABILITY    get capabilities
-       CDROMAUDIOBUFSIZ        set the audio buffer size
-       DVD_READ_STRUCT         Read structure
-       DVD_WRITE_STRUCT        Write structure
-       DVD_AUTH                Authentication
-       CDROM_SEND_PACKET       send a packet to the drive
-       CDROM_NEXT_WRITABLE     get next writable block
-       CDROM_LAST_WRITTEN      get last block written on disc
-       ======================  ===============================================
+       ========================  ===============================================
+       CDROMPAUSE                Pause Audio Operation
+       CDROMRESUME               Resume paused Audio Operation
+       CDROMPLAYMSF              Play Audio MSF (struct cdrom_msf)
+       CDROMPLAYTRKIND           Play Audio Track/index (struct cdrom_ti)
+       CDROMREADTOCHDR           Read TOC header (struct cdrom_tochdr)
+       CDROMREADTOCENTRY         Read TOC entry (struct cdrom_tocentry)
+       CDROMSTOP                 Stop the cdrom drive
+       CDROMSTART                Start the cdrom drive
+       CDROMEJECT                Ejects the cdrom media
+       CDROMVOLCTRL              Control output volume (struct cdrom_volctrl)
+       CDROMSUBCHNL              Read subchannel data (struct cdrom_subchnl)
+       CDROMREADMODE2            Read CDROM mode 2 data (2336 Bytes)
+                                 (struct cdrom_read)
+       CDROMREADMODE1            Read CDROM mode 1 data (2048 Bytes)
+                                 (struct cdrom_read)
+       CDROMREADAUDIO            (struct cdrom_read_audio)
+       CDROMEJECT_SW             enable(1)/disable(0) auto-ejecting
+       CDROMMULTISESSION         Obtain the start-of-last-session
+                                 address of multi session disks
+                                 (struct cdrom_multisession)
+       CDROM_GET_MCN             Obtain the "Universal Product Code"
+                                 if available (struct cdrom_mcn)
+       CDROM_GET_UPC             Deprecated, use CDROM_GET_MCN instead.
+       CDROMRESET                hard-reset the drive
+       CDROMVOLREAD              Get the drive's volume setting
+                                 (struct cdrom_volctrl)
+       CDROMREADRAW              read data in raw mode (2352 Bytes)
+                                 (struct cdrom_read)
+       CDROMREADCOOKED           read data in cooked mode
+       CDROMSEEK                 seek msf address
+       CDROMPLAYBLK              scsi-cd only, (struct cdrom_blk)
+       CDROMREADALL              read all 2646 bytes
+       CDROMGETSPINDOWN          return 4-bit spindown value
+       CDROMSETSPINDOWN          set 4-bit spindown value
+       CDROMCLOSETRAY            counterpart of CDROMEJECT (close the tray)
+       CDROM_SET_OPTIONS         Set behavior options
+       CDROM_CLEAR_OPTIONS       Clear behavior options
+       CDROM_SELECT_SPEED        Set the CD-ROM speed
+       CDROM_SELECT_DISC         Select disc (for juke-boxes)
+       CDROM_MEDIA_CHANGED       Check if media changed
+       CDROM_TIMED_MEDIA_CHANGE  Check if media changed
+                                 since given time
+                                 (struct cdrom_timed_media_change_info)
+       CDROM_DRIVE_STATUS        Get tray position, etc.
+       CDROM_DISC_STATUS         Get disc type, etc.
+       CDROM_CHANGER_NSLOTS      Get number of slots
+       CDROM_LOCKDOOR            lock or unlock door
+       CDROM_DEBUG               Turn debug messages on/off
+       CDROM_GET_CAPABILITY      get capabilities
+       CDROMAUDIOBUFSIZ          set the audio buffer size
+       DVD_READ_STRUCT           Read structure
+       DVD_WRITE_STRUCT          Write structure
+       DVD_AUTH                  Authentication
+       CDROM_SEND_PACKET         send a packet to the drive
+       CDROM_NEXT_WRITABLE       get next writable block
+       CDROM_LAST_WRITTEN        get last block written on disc
+       ========================  ===============================================
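
For orientation, a minimal userspace example of issuing two of the ioctls
listed above; the device path is a placeholder and error handling is kept
to a minimum::

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <unistd.h>
  #include <linux/cdrom.h>

  int main(void)
  {
          /* O_NONBLOCK allows opening the device even without media present. */
          int fd = open("/dev/cdrom", O_RDONLY | O_NONBLOCK);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          /* Tray/disc state: returns one of the CDS_* constants. */
          printf("drive status: %d\n",
                 ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT));

          /* Returns 1 if the media changed since the last check, 0 otherwise. */
          printf("media changed: %d\n",
                 ioctl(fd, CDROM_MEDIA_CHANGED, CDSL_CURRENT));

          close(fd);
          return 0;
  }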
 
 
 The information that follows was determined from reading kernel source
index 2e81340..6655d92 100644 (file)
@@ -104,6 +104,7 @@ Code  Seq#    Include File                                           Comments
 '8'   all                                                            SNP8023 advanced NIC card
                                                                      <mailto:mcr@solidum.com>
 ';'   64-7F  linux/vfio.h
+'='   00-3f  uapi/linux/ptp_clock.h                                  <mailto:richardcochran@gmail.com>
 '@'   00-0F  linux/radeonfb.h                                        conflict!
 '@'   00-0F  drivers/video/aty/aty128fb.c                            conflict!
 'A'   00-1F  linux/apm_bios.h                                        conflict!
index f26920f..3b79fd4 100644 (file)
@@ -5458,6 +5458,19 @@ F:       include/net/devlink.h
 F:     include/uapi/linux/devlink.h
 F:     net/core/devlink.c
 
+DH ELECTRONICS IMX6 DHCOM BOARD SUPPORT
+M:     Christoph Niedermaier <cniedermaier@dh-electronics.com>
+L:     kernel@dh-electronics.com
+S:     Maintained
+F:     arch/arm/boot/dts/imx6*-dhcom-*
+
+DH ELECTRONICS STM32MP1 DHCOM/DHCOR BOARD SUPPORT
+M:     Marek Vasut <marex@denx.de>
+L:     kernel@dh-electronics.com
+S:     Maintained
+F:     arch/arm/boot/dts/stm32mp1*-dhcom-*
+F:     arch/arm/boot/dts/stm32mp1*-dhcor-*
+
 DIALOG SEMICONDUCTOR DRIVERS
 M:     Support Opensource <support.opensource@diasemi.com>
 S:     Supported
@@ -6147,8 +6160,7 @@ T:        git git://anongit.freedesktop.org/drm/drm
 F:     Documentation/devicetree/bindings/display/
 F:     Documentation/devicetree/bindings/gpu/
 F:     Documentation/gpu/
-F:     drivers/gpu/drm/
-F:     drivers/gpu/vga/
+F:     drivers/gpu/
 F:     include/drm/
 F:     include/linux/vga*
 F:     include/uapi/drm/
@@ -11278,7 +11290,6 @@ F:      Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst
 F:     drivers/net/ethernet/marvell/octeontx2/af/
 
 MARVELL PRESTERA ETHERNET SWITCH DRIVER
-M:     Vadym Kochan <vkochan@marvell.com>
 M:     Taras Chornyi <tchornyi@marvell.com>
 S:     Supported
 W:     https://github.com/Marvell-switching/switchdev-prestera
index 30c7c81..a523163 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@
 VERSION = 5
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
-NAME = Opossums on Parade
+EXTRAVERSION =
+NAME = Trick or Treat
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
@@ -1115,7 +1115,8 @@ export MODORDER := $(extmod_prefix)modules.order
 export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y         += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y                 += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
+core-$(CONFIG_BLOCK)   += block/
 
 vmlinux-dirs   := $(patsubst %/,%,$(filter %/, \
                     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
index e201b4b..e8c2c74 100644 (file)
@@ -36,6 +36,7 @@ void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr);
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 
 void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 
 void dma_cache_wback_inv(phys_addr_t start, unsigned long sz);
 void dma_cache_inv(phys_addr_t start, unsigned long sz);
index 8077f17..ecb91fb 100644 (file)
        pinctrl-names = "default";
        pinctrl-0 = <&gmac_rgmii_pins>;
        phy-handle = <&phy1>;
-       phy-mode = "rgmii";
+       phy-mode = "rgmii-id";
        status = "okay";
 };
 
index 5e56288..e68fb87 100644 (file)
@@ -290,6 +290,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr
  */
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *);
+void flush_dcache_folio(struct folio *folio);
 
 #define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
 static inline void flush_kernel_vmap_range(void *addr, int size)
index 02f8e72..05486cc 100644 (file)
@@ -75,7 +75,7 @@
        pinctrl-0 = <&emac_rgmii_pins>;
        phy-supply = <&reg_gmac_3v3>;
        phy-handle = <&ext_rgmii_phy>;
-       phy-mode = "rgmii";
+       phy-mode = "rgmii-id";
        status = "okay";
 };
 
index d17abb5..e99e764 100644 (file)
@@ -70,7 +70,9 @@
                regulator-name = "rst-usb-eth2";
                pinctrl-names = "default";
                pinctrl-0 = <&pinctrl_usb_eth2>;
-               gpio = <&gpio3 2 GPIO_ACTIVE_LOW>;
+               gpio = <&gpio3 2 GPIO_ACTIVE_HIGH>;
+               enable-active-high;
+               regulator-always-on;
        };
 
        reg_vdd_5v: regulator-5v {
@@ -95,7 +97,7 @@
                clocks = <&osc_can>;
                interrupt-parent = <&gpio4>;
                interrupts = <28 IRQ_TYPE_EDGE_FALLING>;
-               spi-max-frequency = <100000>;
+               spi-max-frequency = <10000000>;
                vdd-supply = <&reg_vdd_3v3>;
                xceiver-supply = <&reg_vdd_5v>;
        };
 &fec1 {
        pinctrl-names = "default";
        pinctrl-0 = <&pinctrl_enet>;
-       phy-connection-type = "rgmii";
+       phy-connection-type = "rgmii-rxid";
        phy-handle = <&ethphy>;
        status = "okay";
 
index 9db9b90..42bbbb3 100644 (file)
                        reg_vdd_soc: BUCK1 {
                                regulator-name = "buck1";
                                regulator-min-microvolt = <800000>;
-                               regulator-max-microvolt = <900000>;
+                               regulator-max-microvolt = <850000>;
                                regulator-boot-on;
                                regulator-always-on;
                                regulator-ramp-delay = <3125>;
+                               nxp,dvs-run-voltage = <850000>;
+                               nxp,dvs-standby-voltage = <800000>;
                        };
 
                        reg_vdd_arm: BUCK2 {
                        reg_vdd_dram: BUCK3 {
                                regulator-name = "buck3";
                                regulator-min-microvolt = <850000>;
-                               regulator-max-microvolt = <900000>;
+                               regulator-max-microvolt = <950000>;
                                regulator-boot-on;
                                regulator-always-on;
                        };
 
                        reg_vdd_snvs: LDO2 {
                                regulator-name = "ldo2";
-                               regulator-min-microvolt = <850000>;
+                               regulator-min-microvolt = <800000>;
                                regulator-max-microvolt = <900000>;
                                regulator-boot-on;
                                regulator-always-on;
index 8c15d9f..d12e4cb 100644 (file)
                        power-domains = <&dispcc MDSS_GDSC>;
 
                        clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+                                <&gcc GCC_DISP_HF_AXI_CLK>,
                                 <&gcc GCC_DISP_SF_AXI_CLK>,
                                 <&dispcc DISP_CC_MDSS_MDP_CLK>;
-                       clock-names = "iface", "nrt_bus", "core";
+                       clock-names = "iface", "bus", "nrt_bus", "core";
 
                        assigned-clocks = <&dispcc DISP_CC_MDSS_MDP_CLK>;
                        assigned-clock-rates = <460000000>;
index 41c23f4..803e777 100644 (file)
@@ -1136,6 +1136,11 @@ out:
        return prog;
 }
 
+u64 bpf_jit_alloc_exec_limit(void)
+{
+       return BPF_JIT_REGION_SIZE;
+}
+
 void *bpf_jit_alloc_exec(unsigned long size)
 {
        return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
index 9a8394e..9c57b24 100644 (file)
@@ -58,7 +58,7 @@ struct nfhd_device {
        struct gendisk *disk;
 };
 
-static blk_qc_t nfhd_submit_bio(struct bio *bio)
+static void nfhd_submit_bio(struct bio *bio)
 {
        struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data;
        struct bio_vec bvec;
@@ -76,7 +76,6 @@ static blk_qc_t nfhd_submit_bio(struct bio *bio)
                sec += len;
        }
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -100,6 +99,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
 {
        struct nfhd_device *dev;
        int dev_id = id - NFHD_DEV_OFFSET;
+       int err = -ENOMEM;
 
        pr_info("nfhd%u: found device with %u blocks (%u bytes)\n", dev_id,
                blocks, bsize);
@@ -130,16 +130,20 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
        sprintf(dev->disk->disk_name, "nfhd%u", dev_id);
        set_capacity(dev->disk, (sector_t)blocks * (bsize / 512));
        blk_queue_logical_block_size(dev->disk->queue, bsize);
-       add_disk(dev->disk);
+       err = add_disk(dev->disk);
+       if (err)
+               goto out_cleanup_disk;
 
        list_add_tail(&dev->list, &nfhd_list);
 
        return 0;
 
+out_cleanup_disk:
+       blk_cleanup_disk(dev->disk);
 free_dev:
        kfree(dev);
 out:
-       return -ENOMEM;
+       return err;
 }
 
 static int __init nfhd_init(void)
index 1ac55e7..8ab4662 100644 (file)
@@ -250,6 +250,7 @@ static inline void __flush_page_to_ram(void *vaddr)
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)                __flush_page_to_ram(page_address(page))
+void flush_dcache_folio(struct folio *folio);
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
 #define flush_icache_page(vma, page)   __flush_page_to_ram(page_address(page))
index b3dc9c5..f207388 100644 (file)
@@ -61,6 +61,8 @@ static inline void flush_dcache_page(struct page *page)
                SetPageDcacheDirty(page);
 }
 
+void flush_dcache_folio(struct folio *folio);
+
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
 
index 23ad8dd..b116937 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/console.h>
 #include <linux/memblock.h>
 #include <linux/ioport.h>
-#include <linux/blkdev.h>
 
 #include <asm/bootinfo.h>
 #include <asm/mach-rc32434/ddr.h>
index a3323f8..1a50429 100644 (file)
@@ -7,7 +7,6 @@
 #include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/mm.h>
-#include <linux/blkdev.h>
 #include <linux/memblock.h>
 #include <linux/pm.h>
 #include <linux/smp.h>
index 538a279..f07b15d 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/memblock.h>
-#include <linux/blkdev.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/screen_info.h>
index c2a222e..3fc0bb7 100644 (file)
@@ -27,6 +27,7 @@ void flush_cache_vunmap(unsigned long start, unsigned long end);
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
                       unsigned long vaddr, void *dst, void *src, int len);
 void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
index 0e23e3a..d55b73b 100644 (file)
@@ -6,7 +6,7 @@
 
 #ifndef CONFIG_DYNAMIC_FTRACE
 extern void (*ftrace_trace_function)(unsigned long, unsigned long,
-                                    struct ftrace_ops*, struct pt_regs*);
+                                    struct ftrace_ops*, struct ftrace_regs*);
 extern void ftrace_graph_caller(void);
 
 noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip,
index 18eb9f6..1999561 100644 (file)
@@ -28,7 +28,8 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
 extern void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr,
        unsigned long pfn);
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page *page);
+void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 
 extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
index 9e32fb7..e849daf 100644 (file)
@@ -37,6 +37,7 @@ config NIOS2_DTB_PHYS_ADDR
 
 config NIOS2_DTB_SOURCE_BOOL
        bool "Compile and link device tree into kernel image"
+       depends on !COMPILE_TEST
        help
          This allows you to specify a dts (device tree source) file
          which will be compiled and linked into the kernel image.
index cfef61a..97305bd 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/memblock.h>
 #include <linux/init.h>
 #include <linux/delay.h>
-#include <linux/blkdev.h>      /* for initrd_* */
 #include <linux/pagemap.h>
 
 #include <asm/pgalloc.h>
index eef0096..da0cd4b 100644 (file)
@@ -49,7 +49,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size);
 #define flush_cache_vunmap(start, end)         flush_cache_all()
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page *page);
+void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 
 #define flush_dcache_mmap_lock(mapping)                xa_lock_irq(&mapping->i_pages)
 #define flush_dcache_mmap_unlock(mapping)      xa_unlock_irq(&mapping->i_pages)
index bed05b6..cb25acc 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/poll.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 
 #include <asm/prom.h>
index dab5c56..a52af8f 100644 (file)
@@ -1302,6 +1302,12 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
                struct property *default_win;
                int reset_win_ext;
 
+               /* DDW + IOMMU on single window may fail if there is any allocation */
+               if (iommu_table_in_use(tbl)) {
+                       dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
+                       goto out_failed;
+               }
+
                default_win = of_find_property(pdn, "ibm,dma-window", NULL);
                if (!default_win)
                        goto out_failed;
@@ -1356,12 +1362,6 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
                        query.largest_available_block,
                        1ULL << page_shift);
 
-               /* DDW + IOMMU on single window may fail if there is any allocation */
-               if (default_win_removed && iommu_table_in_use(tbl)) {
-                       dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
-                       goto out_failed;
-               }
-
                len = order_base_2(query.largest_available_block << page_shift);
                win_name = DMA64_PROPNAME;
        } else {
@@ -1411,18 +1411,19 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
        } else {
                struct iommu_table *newtbl;
                int i;
+               unsigned long start = 0, end = 0;
 
                for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
                        const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;
 
                        /* Look for MMIO32 */
-                       if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM)
+                       if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
+                               start = pci->phb->mem_resources[i].start;
+                               end = pci->phb->mem_resources[i].end;
                                break;
+                       }
                }
 
-               if (i == ARRAY_SIZE(pci->phb->mem_resources))
-                       goto out_del_list;
-
                /* New table for using DDW instead of the default DMA window */
                newtbl = iommu_pseries_alloc_table(pci->phb->node);
                if (!newtbl) {
@@ -1432,15 +1433,15 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
                iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
                                            1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
-               iommu_init_table(newtbl, pci->phb->node, pci->phb->mem_resources[i].start,
-                                pci->phb->mem_resources[i].end);
+               iommu_init_table(newtbl, pci->phb->node, start, end);
 
                pci->table_group->tables[1] = newtbl;
 
                /* Keep default DMA window stuct if removed */
                if (default_win_removed) {
                        tbl->it_size = 0;
-                       kfree(tbl->it_map);
+                       vfree(tbl->it_map);
+                       tbl->it_map = NULL;
                }
 
                set_iommu_table_base(&dev->dev, newtbl);
index 6a6fa9e..f076cee 100644 (file)
@@ -163,6 +163,12 @@ config PAGE_OFFSET
        default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
        default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB
 
+config KASAN_SHADOW_OFFSET
+       hex
+       depends on KASAN_GENERIC
+       default 0xdfffffc800000000 if 64BIT
+       default 0xffffffff if 32BIT
+
 config ARCH_FLATMEM_ENABLE
        def_bool !NUMA
 
index a2b3d9c..b00f503 100644 (file)
@@ -30,8 +30,7 @@
 #define KASAN_SHADOW_SIZE      (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
 #define KASAN_SHADOW_START     KERN_VIRT_START
 #define KASAN_SHADOW_END       (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
-#define KASAN_SHADOW_OFFSET    (KASAN_SHADOW_END - (1ULL << \
-                                       (64 - KASAN_SHADOW_SCALE_SHIFT)))
+#define KASAN_SHADOW_OFFSET    _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 
 void kasan_init(void);
 asmlinkage void kasan_early_init(void);
index fce5184..52c5ff9 100644 (file)
@@ -193,6 +193,7 @@ setup_trap_vector:
        csrw CSR_SCRATCH, zero
        ret
 
+.align 2
 .Lsecondary_park:
        /* We lack SMP support or have too many harts, so park this hart */
        wfi
index d7189c8..54294f8 100644 (file)
@@ -17,6 +17,9 @@ asmlinkage void __init kasan_early_init(void)
        uintptr_t i;
        pgd_t *pgd = early_pg_dir + pgd_index(KASAN_SHADOW_START);
 
+       BUILD_BUG_ON(KASAN_SHADOW_OFFSET !=
+               KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT)));
+
        for (i = 0; i < PTRS_PER_PTE; ++i)
                set_pte(kasan_early_shadow_pte + i,
                        mk_pte(virt_to_page(kasan_early_shadow_page),
@@ -172,21 +175,10 @@ void __init kasan_init(void)
        phys_addr_t p_start, p_end;
        u64 i;
 
-       /*
-        * Populate all kernel virtual address space with kasan_early_shadow_page
-        * except for the linear mapping and the modules/kernel/BPF mapping.
-        */
-       kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
-                                   (void *)kasan_mem_to_shadow((void *)
-                                                               VMEMMAP_END));
        if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
                kasan_shallow_populate(
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_START),
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_END));
-       else
-               kasan_populate_early_shadow(
-                       (void *)kasan_mem_to_shadow((void *)VMALLOC_START),
-                       (void *)kasan_mem_to_shadow((void *)VMALLOC_END));
 
        /* Populate the linear mapping */
        for_each_mem_range(i, &p_start, &p_end) {
index fed86f4..753d85b 100644 (file)
@@ -125,7 +125,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 
        if (i == NR_JIT_ITERATIONS) {
                pr_err("bpf-jit: image did not converge in <%d passes!\n", i);
-               bpf_jit_binary_free(jit_data->header);
+               if (jit_data->header)
+                       bpf_jit_binary_free(jit_data->header);
                prog = orig_prog;
                goto out_offset;
        }
@@ -166,6 +167,11 @@ out:
        return prog;
 }
 
+u64 bpf_jit_alloc_exec_limit(void)
+{
+       return BPF_JIT_REGION_SIZE;
+}
+
 void *bpf_jit_alloc_exec(unsigned long size)
 {
        return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
index 1072245..2245f4b 100644 (file)
@@ -3053,13 +3053,14 @@ static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
        int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
        struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
        struct kvm_vcpu *vcpu;
+       u8 vcpu_isc_mask;
 
        for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
                vcpu = kvm_get_vcpu(kvm, vcpu_idx);
                if (psw_ioint_disabled(vcpu))
                        continue;
-               deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
-               if (deliverable_mask) {
+               vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
+               if (deliverable_mask & vcpu_isc_mask) {
                        /* lately kicked but not yet running */
                        if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
                                return;
index 6a6dd5e..1c97493 100644 (file)
@@ -3363,6 +3363,7 @@ out_free_sie_block:
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
+       clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
        return kvm_s390_vcpu_has_irq(vcpu, 0);
 }
 
index 372afa8..c7a97f3 100644 (file)
@@ -42,7 +42,8 @@ extern void flush_cache_page(struct vm_area_struct *vma,
 extern void flush_cache_range(struct vm_area_struct *vma,
                                 unsigned long start, unsigned long end);
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page *page);
+void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 extern void flush_icache_range(unsigned long start, unsigned long end);
 #define flush_icache_user_range flush_icache_range
 extern void flush_icache_page(struct vm_area_struct *vma,
index cd9dc05..69d2d00 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/blk-mq.h>
 #include <linux/ata.h>
 #include <linux/hdreg.h>
+#include <linux/major.h>
 #include <linux/cdrom.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -854,8 +855,8 @@ static const struct attribute_group *ubd_attr_groups[] = {
        NULL,
 };
 
-static void ubd_disk_register(int major, u64 size, int unit,
-                             struct gendisk *disk)
+static int ubd_disk_register(int major, u64 size, int unit,
+                            struct gendisk *disk)
 {
        disk->major = major;
        disk->first_minor = unit << UBD_SHIFT;
@@ -872,7 +873,7 @@ static void ubd_disk_register(int major, u64 size, int unit,
 
        disk->private_data = &ubd_devs[unit];
        disk->queue = ubd_devs[unit].queue;
-       device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
+       return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
 }
 
 #define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
@@ -919,10 +920,15 @@ static int ubd_add(int n, char **error_out)
        blk_queue_write_cache(ubd_dev->queue, true, false);
        blk_queue_max_segments(ubd_dev->queue, MAX_SG);
        blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1);
-       ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk);
+       err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk);
+       if (err)
+               goto out_cleanup_disk;
+
        ubd_gendisk[n] = disk;
        return 0;
 
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
 out_cleanup_tags:
        blk_mq_free_tag_set(&ubd_dev->tag_set);
 out:
index 18d2f51..1cc72b4 100644 (file)
@@ -78,7 +78,7 @@
        vpxor tmp0, x, x;
 
 
-.section       .rodata.cst164, "aM", @progbits, 164
+.section       .rodata.cst16, "aM", @progbits, 16
 .align 16
 
 /*
 .L0f0f0f0f:
        .long 0x0f0f0f0f
 
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+       .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
 
 .text
 .align 16
index d2ffd7f..9c5d3f3 100644 (file)
@@ -93,7 +93,7 @@
        vpxor tmp0, x, x;
 
 
-.section       .rodata.cst164, "aM", @progbits, 164
+.section       .rodata.cst16, "aM", @progbits, 16
 .align 16
 
 /*
 .L0f0f0f0f:
        .long 0x0f0f0f0f
 
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+       .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
 .text
 .align 16
 
index 5a0298a..13f6465 100644 (file)
@@ -1098,7 +1098,7 @@ struct kvm_arch {
        u64 cur_tsc_generation;
        int nr_vcpus_matched_tsc;
 
-       spinlock_t pvclock_gtod_sync_lock;
+       raw_spinlock_t pvclock_gtod_sync_lock;
        bool use_master_clock;
        u64 master_kernel_ns;
        u64 master_cycle_now;
index 2e4916b..7e34d71 100644 (file)
@@ -2591,11 +2591,20 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 
 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
 {
-       if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2))
+       int count;
+       int bytes;
+
+       if (svm->vmcb->control.exit_info_2 > INT_MAX)
+               return -EINVAL;
+
+       count = svm->vmcb->control.exit_info_2;
+       if (unlikely(check_mul_overflow(count, size, &bytes)))
+               return -EINVAL;
+
+       if (!setup_vmgexit_scratch(svm, in, bytes))
                return -EINVAL;
 
-       return kvm_sev_es_string_io(&svm->vcpu, size, port,
-                                   svm->ghcb_sa, svm->ghcb_sa_len / size, in);
+       return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->ghcb_sa, count, in);
 }
 
 void sev_es_init_vmcb(struct vcpu_svm *svm)
index b26647a..bfe0de3 100644 (file)
@@ -2542,7 +2542,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-       spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
        if (!matched) {
                kvm->arch.nr_vcpus_matched_tsc = 0;
        } else if (!already_matched) {
@@ -2550,7 +2550,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        }
 
        kvm_track_tsc_matching(vcpu);
-       spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
 }
 
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2780,9 +2780,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
        kvm_make_mclock_inprogress_request(kvm);
 
        /* no guest entries from this point */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        pvclock_update_vm_gtod_copy(kvm);
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2800,15 +2800,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
        unsigned long flags;
        u64 ret;
 
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
                return get_kvmclock_base_ns() + ka->kvmclock_offset;
        }
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* both __this_cpu_read() and rdtsc() should be on the same cpu */
        get_cpu();
@@ -2902,13 +2902,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         * If the host uses TSC clock, then passthrough TSC as stable
         * to the guest.
         */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        use_master_clock = ka->use_master_clock;
        if (use_master_clock) {
                host_tsc = ka->master_cycle_now;
                kernel_ns = ka->master_kernel_ns;
        }
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
@@ -6100,13 +6100,13 @@ set_pit2_out:
                 * is slightly ahead) here we risk going negative on unsigned
                 * 'system_time' when 'user_ns.clock' is very small.
                 */
-               spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock);
                if (kvm->arch.use_master_clock)
                        now_ns = ka->master_kernel_ns;
                else
                        now_ns = get_kvmclock_base_ns();
                ka->kvmclock_offset = user_ns.clock - now_ns;
-               spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
 
                kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
                break;
@@ -8156,9 +8156,9 @@ static void kvm_hyperv_tsc_notifier(void)
        list_for_each_entry(kvm, &vm_list, vm_list) {
                struct kvm_arch *ka = &kvm->arch;
 
-               spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
                pvclock_update_vm_gtod_copy(kvm);
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
                kvm_for_each_vcpu(cpu, vcpu, kvm)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -8800,9 +8800,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 
        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
+       /*
+        * The call to kvm_ready_for_interrupt_injection() may end up in
+        * kvm_xen_has_interrupt() which may require the srcu lock to be
+        * held, to protect against changes in the vcpu_info address.
+        */
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        kvm_run->ready_for_interrupt_injection =
                pic_in_kernel(vcpu->kvm) ||
                kvm_vcpu_ready_for_interrupt_injection(vcpu);
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
        if (is_smm(vcpu))
                kvm_run->flags |= KVM_RUN_X86_SMM;
@@ -11199,7 +11207,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
        mutex_init(&kvm->arch.apic_map_lock);
-       spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+       raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
        kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
        pvclock_update_vm_gtod_copy(kvm);
index 9ea9c3d..8f62bae 100644 (file)
@@ -190,6 +190,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
+       int err;
        u8 rc = 0;
 
        /*
@@ -216,13 +217,29 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
        if (likely(slots->generation == ghc->generation &&
                   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
                /* Fast path */
-               __get_user(rc, (u8 __user *)ghc->hva + offset);
-       } else {
-               /* Slow path */
-               kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
-                                            sizeof(rc));
+               pagefault_disable();
+               err = __get_user(rc, (u8 __user *)ghc->hva + offset);
+               pagefault_enable();
+               if (!err)
+                       return rc;
        }
 
+       /* Slow path */
+
+       /*
+        * This function gets called from kvm_vcpu_block() after setting the
+        * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+        * from a HLT. So we really mustn't sleep. If the page ended up absent
+        * at that point, just return 1 in order to trigger an immediate wake,
+        * and we'll end up getting called again from a context where we *can*
+        * fault in the page and wait for it.
+        */
+       if (in_atomic() || !task_is_running(current))
+               return 1;
+
+       kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
+                                    sizeof(rc));
+
        return rc;
 }
 
index cf907e5..a8a0416 100644 (file)
@@ -120,7 +120,8 @@ void flush_cache_page(struct vm_area_struct*,
 #define flush_cache_vunmap(start,end)  flush_cache_all()
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page*);
+void flush_dcache_page(struct page *);
+void flush_dcache_folio(struct folio *);
 
 void local_flush_cache_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end);
@@ -137,7 +138,9 @@ void local_flush_cache_page(struct vm_area_struct *vma,
 #define flush_cache_vunmap(start,end)                  do { } while (0)
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
 #define flush_dcache_page(page)                                do { } while (0)
+static inline void flush_dcache_folio(struct folio *folio) { }
 
 #define flush_icache_range local_flush_icache_range
 #define flush_cache_page(vma, addr, pfn)               do { } while (0)
index 3cdfa00..07b642c 100644 (file)
@@ -100,7 +100,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
        spin_unlock(&dev->lock);
 }
 
-static blk_qc_t simdisk_submit_bio(struct bio *bio)
+static void simdisk_submit_bio(struct bio *bio)
 {
        struct simdisk *dev = bio->bi_bdev->bd_disk->private_data;
        struct bio_vec bvec;
@@ -118,7 +118,6 @@ static blk_qc_t simdisk_submit_bio(struct bio *bio)
        }
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int simdisk_open(struct block_device *bdev, fmode_t mode)
@@ -259,6 +258,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
                struct proc_dir_entry *procdir)
 {
        char tmp[2] = { '0' + which, 0 };
+       int err = -ENOMEM;
 
        dev->fd = -1;
        dev->filename = NULL;
@@ -267,7 +267,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
 
        dev->gd = blk_alloc_disk(NUMA_NO_NODE);
        if (!dev->gd)
-               return -ENOMEM;
+               goto out;
        dev->gd->major = simdisk_major;
        dev->gd->first_minor = which;
        dev->gd->minors = SIMDISK_MINORS;
@@ -275,10 +275,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
        dev->gd->private_data = dev;
        snprintf(dev->gd->disk_name, 32, "simdisk%d", which);
        set_capacity(dev->gd, 0);
-       add_disk(dev->gd);
+       err = add_disk(dev->gd);
+       if (err)
+               goto out_cleanup_disk;
 
        dev->procfile = proc_create_data(tmp, 0644, procdir, &simdisk_proc_ops, dev);
+
        return 0;
+
+out_cleanup_disk:
+       blk_cleanup_disk(dev->gd);
+out:
+       return err;
 }
 
 static int __init simdisk_init(void)
index 8e28ae7..c6ce41a 100644 (file)
@@ -73,7 +73,7 @@ config BLK_DEV_ZONED
 
 config BLK_DEV_THROTTLING
        bool "Block layer bio throttling support"
-       depends on BLK_CGROUP=y
+       depends on BLK_CGROUP
        select BLK_CGROUP_RWSTAT
        help
        Block layer bio throttling support. It can be used to limit
@@ -112,7 +112,7 @@ config BLK_WBT_MQ
 
 config BLK_CGROUP_IOLATENCY
        bool "Enable support for latency based cgroup IO protection"
-       depends on BLK_CGROUP=y
+       depends on BLK_CGROUP
        help
        Enabling this option enables the .latency interface for IO throttling.
        The IO controller will attempt to maintain average IO latencies below
@@ -132,7 +132,7 @@ config BLK_CGROUP_FC_APPID
 
 config BLK_CGROUP_IOCOST
        bool "Enable support for cost model based cgroup IO controller"
-       depends on BLK_CGROUP=y
+       depends on BLK_CGROUP
        select BLK_RQ_IO_DATA_LEN
        select BLK_RQ_ALLOC_TIME
        help
@@ -190,39 +190,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
          by falling back to the kernel crypto API when inline
          encryption hardware is not present.
 
-menu "Partition Types"
-
 source "block/partitions/Kconfig"
 
-endmenu
-
-endif # BLOCK
-
 config BLOCK_COMPAT
-       bool
-       depends on BLOCK && COMPAT
-       default y
+       def_bool COMPAT
 
 config BLK_MQ_PCI
-       bool
-       depends on BLOCK && PCI
-       default y
+       def_bool PCI
 
 config BLK_MQ_VIRTIO
        bool
-       depends on BLOCK && VIRTIO
+       depends on VIRTIO
        default y
 
 config BLK_MQ_RDMA
        bool
-       depends on BLOCK && INFINIBAND
+       depends on INFINIBAND
        default y
 
 config BLK_PM
-       def_bool BLOCK && PM
+       def_bool PM
 
 # do not use in new code
 config BLOCK_HOLDER_DEPRECATED
        bool
 
 source "block/Kconfig.iosched"
+
+endif # BLOCK
index 2f2158e..885fee8 100644 (file)
@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-if BLOCK
-
 menu "IO Schedulers"
 
 config MQ_IOSCHED_DEADLINE
@@ -45,5 +43,3 @@ config BFQ_CGROUP_DEBUG
        files in a cgroup which can be useful for debugging.
 
 endmenu
-
-endif
index 41aa1ba..44df57e 100644 (file)
@@ -3,13 +3,13 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
+obj-y          := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-timeout.o \
                        blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
                        blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
                        genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
-                       disk-events.o
+                       disk-events.o blk-ia-ranges.o
 
 obj-$(CONFIG_BOUNCE)           += bounce.o
 obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
@@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS)    += blk-mq-debugfs.o
 obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)     += sed-opal.o
 obj-$(CONFIG_BLK_PM)           += blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION)    += keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION)    += blk-crypto.o blk-crypto-profile.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)   += blk-crypto-fallback.o
 obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)  += holder.o
index 485a258..b4dab2f 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/major.h>
 #include <linux/device_cgroup.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/blkpg.h>
@@ -184,14 +185,13 @@ int sb_min_blocksize(struct super_block *sb, int size)
 
 EXPORT_SYMBOL(sb_min_blocksize);
 
-int __sync_blockdev(struct block_device *bdev, int wait)
+int sync_blockdev_nowait(struct block_device *bdev)
 {
        if (!bdev)
                return 0;
-       if (!wait)
-               return filemap_flush(bdev->bd_inode->i_mapping);
-       return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+       return filemap_flush(bdev->bd_inode->i_mapping);
 }
+EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
 
 /*
  * Write out and wait upon all the dirty data associated with a block
@@ -199,7 +199,9 @@ int __sync_blockdev(struct block_device *bdev, int wait)
  */
 int sync_blockdev(struct block_device *bdev)
 {
-       return __sync_blockdev(bdev, 1);
+       if (!bdev)
+               return 0;
+       return filemap_write_and_wait(bdev->bd_inode->i_mapping);
 }
 EXPORT_SYMBOL(sync_blockdev);
 
@@ -326,12 +328,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return result;
 
-       result = blk_queue_enter(bdev->bd_disk->queue, 0);
+       result = blk_queue_enter(bdev_get_queue(bdev), 0);
        if (result)
                return result;
        result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
                              REQ_OP_READ);
-       blk_queue_exit(bdev->bd_disk->queue);
+       blk_queue_exit(bdev_get_queue(bdev));
        return result;
 }
 
@@ -362,7 +364,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return -EOPNOTSUPP;
-       result = blk_queue_enter(bdev->bd_disk->queue, 0);
+       result = blk_queue_enter(bdev_get_queue(bdev), 0);
        if (result)
                return result;
 
@@ -375,7 +377,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
                clean_page_buffers(page);
                unlock_page(page);
        }
-       blk_queue_exit(bdev->bd_disk->queue);
+       blk_queue_exit(bdev_get_queue(bdev));
        return result;
 }
 
@@ -492,6 +494,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
        spin_lock_init(&bdev->bd_size_lock);
        bdev->bd_partno = partno;
        bdev->bd_inode = inode;
+       bdev->bd_queue = disk->queue;
        bdev->bd_stats = alloc_percpu(struct disk_stats);
        if (!bdev->bd_stats) {
                iput(inode);
@@ -962,9 +965,11 @@ EXPORT_SYMBOL(blkdev_put);
  * @pathname:  special file representing the block device
  * @dev:       return value of the block device's dev_t
  *
- * Get a reference to the blockdevice at @pathname in the current
- * namespace if possible and return it.  Return ERR_PTR(error)
- * otherwise.
+ * Lookup the block device's dev_t at @pathname in the current
+ * namespace if possible and return it by @dev.
+ *
+ * RETURNS:
+ * 0 if succeeded, errno otherwise.
  */
 int lookup_bdev(const char *pathname, dev_t *dev)
 {
@@ -1016,7 +1021,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
 }
 EXPORT_SYMBOL(__invalidate_device);
 
-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+void sync_bdevs(bool wait)
 {
        struct inode *inode, *old_inode = NULL;
 
@@ -1047,8 +1052,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
                bdev = I_BDEV(inode);
 
                mutex_lock(&bdev->bd_disk->open_mutex);
-               if (bdev->bd_openers)
-                       func(bdev, arg);
+               if (!bdev->bd_openers) {
+                       ; /* skip */
+               } else if (wait) {
+                       /*
+                        * We keep the error status of individual mapping so
+                        * that applications can catch the writeback error using
+                        * fsync(2). See filemap_fdatawait_keep_errors() for
+                        * details.
+                        */
+                       filemap_fdatawait_keep_errors(inode->i_mapping);
+               } else {
+                       filemap_fdatawrite(inode->i_mapping);
+               }
                mutex_unlock(&bdev->bd_disk->open_mutex);
 
                spin_lock(&blockdev_superblock->s_inode_list_lock);
index 85b8e1c..24a5c53 100644 (file)
@@ -6,13 +6,13 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/cgroup.h>
-#include <linux/elevator.h>
 #include <linux/ktime.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/sbitmap.h>
 #include <linux/delay.h>
 
+#include "elevator.h"
 #include "bfq-iosched.h"
 
 #ifdef CONFIG_BFQ_CGROUP_DEBUG
@@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 {
        if (blkg_rwstat_init(&stats->bytes, gfp) ||
            blkg_rwstat_init(&stats->ios, gfp))
-               return -ENOMEM;
+               goto error;
 
 #ifdef CONFIG_BFQ_CGROUP_DEBUG
        if (blkg_rwstat_init(&stats->merged, gfp) ||
@@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
            bfq_stat_init(&stats->dequeue, gfp) ||
            bfq_stat_init(&stats->group_wait_time, gfp) ||
            bfq_stat_init(&stats->idle_time, gfp) ||
-           bfq_stat_init(&stats->empty_time, gfp)) {
-               bfqg_stats_exit(stats);
-               return -ENOMEM;
-       }
+           bfq_stat_init(&stats->empty_time, gfp))
+               goto error;
 #endif
 
        return 0;
+
+error:
+       bfqg_stats_exit(stats);
+       return -ENOMEM;
 }
 
 static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
index 480e1a1..fec1811 100644 (file)
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/cgroup.h>
-#include <linux/elevator.h>
 #include <linux/ktime.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 
 #include <trace/events/block.h>
 
+#include "elevator.h"
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
@@ -6884,8 +6884,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
        struct blk_mq_tags *tags = hctx->sched_tags;
        unsigned int min_shallow;
 
-       min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
-       sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
+       min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
+       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
 }
 
 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
index 6b47cdd..d251147 100644 (file)
@@ -6,7 +6,7 @@
  * Written by: Martin K. Petersen <martin.petersen@oracle.com>
  */
 
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/mempool.h>
 #include <linux/export.h>
 #include <linux/bio.h>
@@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
        iv = bip->bip_vec + bip->bip_vcnt;
 
        if (bip->bip_vcnt &&
-           bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue,
+           bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
                             &bip->bip_vec[bip->bip_vcnt - 1], offset))
                return 0;
 
index a6fb6a0..15ab0d6 100644 (file)
@@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size)
 
        snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
        bslab->slab = kmem_cache_create(bslab->name, size,
-                       ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
+                       ARCH_KMALLOC_MINALIGN,
+                       SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
        if (!bslab->slab)
                goto fail_alloc_slab;
 
@@ -156,7 +157,7 @@ out:
 
 void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
 {
-       BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);
+       BUG_ON(nr_vecs > BIO_MAX_VECS);
 
        if (nr_vecs == BIO_MAX_VECS)
                mempool_free(bv, pool);
@@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 
        atomic_set(&bio->__bi_remaining, 1);
        atomic_set(&bio->__bi_cnt, 1);
+       bio->bi_cookie = BLK_QC_T_NONE;
 
        bio->bi_max_vecs = max_vecs;
        bio->bi_io_vec = table;
@@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio);
  *   REQ_OP_READ, zero the truncated part. This function should only
  *   be used for handling corner cases, such as bio eod.
  */
-void bio_truncate(struct bio *bio, unsigned new_size)
+static void bio_truncate(struct bio *bio, unsigned new_size)
 {
        struct bio_vec bv;
        struct bvec_iter iter;
@@ -677,7 +679,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
 void bio_put(struct bio *bio)
 {
        if (unlikely(bio_flagged(bio, BIO_REFFED))) {
-               BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+               BUG_ON(!atomic_read(&bio->__bi_cnt));
                if (!atomic_dec_and_test(&bio->__bi_cnt))
                        return;
        }
@@ -772,6 +774,23 @@ const char *bio_devname(struct bio *bio, char *buf)
 }
 EXPORT_SYMBOL(bio_devname);
 
+/**
+ * bio_full - check if the bio is full
+ * @bio:       bio to check
+ * @len:       length of one segment to be added
+ *
+ * Return true if @bio is full and one segment with @len bytes can't be
+ * added to the bio, otherwise return false
+ */
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+       if (bio->bi_vcnt >= bio->bi_max_vecs)
+               return true;
+       if (bio->bi_iter.bi_size > UINT_MAX - len)
+               return true;
+       return false;
+}
+
 static inline bool page_is_mergeable(const struct bio_vec *bv,
                struct page *page, unsigned int len, unsigned int off,
                bool *same_page)
@@ -791,6 +810,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
        return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
 }
 
+/**
+ * __bio_try_merge_page - try appending data to an existing bvec.
+ * @bio: destination bio
+ * @page: start page to add
+ * @len: length of the data to add
+ * @off: offset of the data relative to @page
+ * @same_page: return if the segment has been merged inside the same page
+ *
+ * Try to add the data at @page + @off to the last bvec of @bio.  This is a
+ * useful optimisation for file systems with a block size smaller than the
+ * page size.
+ *
+ * Warn if (@len, @off) crosses pages in case that @same_page is true.
+ *
+ * Return %true on success or %false on failure.
+ */
+static bool __bio_try_merge_page(struct bio *bio, struct page *page,
+               unsigned int len, unsigned int off, bool *same_page)
+{
+       if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+               return false;
+
+       if (bio->bi_vcnt > 0) {
+               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+               if (page_is_mergeable(bv, page, len, off, same_page)) {
+                       if (bio->bi_iter.bi_size > UINT_MAX - len) {
+                               *same_page = false;
+                               return false;
+                       }
+                       bv->bv_len += len;
+                       bio->bi_iter.bi_size += len;
+                       return true;
+               }
+       }
+       return false;
+}
+
 /*
  * Try to merge a page into a segment, while obeying the hardware segment
  * size limit.  This is not for normal read/write bios, but for passthrough
@@ -908,7 +965,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
 int bio_add_zone_append_page(struct bio *bio, struct page *page,
                             unsigned int len, unsigned int offset)
 {
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        bool same_page = false;
 
        if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
@@ -923,45 +980,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
 EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
 
 /**
- * __bio_try_merge_page - try appending data to an existing bvec.
- * @bio: destination bio
- * @page: start page to add
- * @len: length of the data to add
- * @off: offset of the data relative to @page
- * @same_page: return if the segment has been merged inside the same page
- *
- * Try to add the data at @page + @off to the last bvec of @bio.  This is a
- * useful optimisation for file systems with a block size smaller than the
- * page size.
- *
- * Warn if (@len, @off) crosses pages in case that @same_page is true.
- *
- * Return %true on success or %false on failure.
- */
-bool __bio_try_merge_page(struct bio *bio, struct page *page,
-               unsigned int len, unsigned int off, bool *same_page)
-{
-       if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
-               return false;
-
-       if (bio->bi_vcnt > 0) {
-               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
-               if (page_is_mergeable(bv, page, len, off, same_page)) {
-                       if (bio->bi_iter.bi_size > UINT_MAX - len) {
-                               *same_page = false;
-                               return false;
-                       }
-                       bv->bv_len += len;
-                       bio->bi_iter.bi_size += len;
-                       return true;
-               }
-       }
-       return false;
-}
-EXPORT_SYMBOL_GPL(__bio_try_merge_page);
-
-/**
  * __bio_add_page - add page(s) to a bio in a new segment
  * @bio: destination bio
  * @page: start page to add
@@ -1015,52 +1033,40 @@ int bio_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_add_page);
 
-void bio_release_pages(struct bio *bio, bool mark_dirty)
+void __bio_release_pages(struct bio *bio, bool mark_dirty)
 {
        struct bvec_iter_all iter_all;
        struct bio_vec *bvec;
 
-       if (bio_flagged(bio, BIO_NO_PAGE_REF))
-               return;
-
        bio_for_each_segment_all(bvec, bio, iter_all) {
                if (mark_dirty && !PageCompound(bvec->bv_page))
                        set_page_dirty_lock(bvec->bv_page);
                put_page(bvec->bv_page);
        }
 }
-EXPORT_SYMBOL_GPL(bio_release_pages);
+EXPORT_SYMBOL_GPL(__bio_release_pages);
 
-static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
+       size_t size = iov_iter_count(iter);
+
        WARN_ON_ONCE(bio->bi_max_vecs);
 
+       if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+               struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+               size_t max_sectors = queue_max_zone_append_sectors(q);
+
+               size = min(size, max_sectors << SECTOR_SHIFT);
+       }
+
        bio->bi_vcnt = iter->nr_segs;
        bio->bi_io_vec = (struct bio_vec *)iter->bvec;
        bio->bi_iter.bi_bvec_done = iter->iov_offset;
-       bio->bi_iter.bi_size = iter->count;
+       bio->bi_iter.bi_size = size;
        bio_set_flag(bio, BIO_NO_PAGE_REF);
        bio_set_flag(bio, BIO_CLONED);
 }
 
-static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
-{
-       __bio_iov_bvec_set(bio, iter);
-       iov_iter_advance(iter, iter->count);
-       return 0;
-}
-
-static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
-{
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
-       struct iov_iter i = *iter;
-
-       iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
-       __bio_iov_bvec_set(bio, &i);
-       iov_iter_advance(iter, i.count);
-       return 0;
-}
-
 static void bio_put_pages(struct page **pages, size_t size, size_t off)
 {
        size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
@@ -1130,7 +1136,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
 {
        unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        struct page **pages = (struct page **)bv;
@@ -1202,9 +1208,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
        int ret = 0;
 
        if (iov_iter_is_bvec(iter)) {
-               if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-                       return bio_iov_bvec_set_append(bio, iter);
-               return bio_iov_bvec_set(bio, iter);
+               bio_iov_bvec_set(bio, iter);
+               iov_iter_advance(iter, bio->bi_iter.bi_size);
+               return 0;
        }
 
        do {
@@ -1260,18 +1266,7 @@ int submit_bio_wait(struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio_wait);
 
-/**
- * bio_advance - increment/complete a bio by some number of bytes
- * @bio:       bio to advance
- * @bytes:     number of bytes to complete
- *
- * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
- * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
- * be updated on the last bvec as well.
- *
- * @bio will then represent the remaining, uncompleted portion of the io.
- */
-void bio_advance(struct bio *bio, unsigned bytes)
+void __bio_advance(struct bio *bio, unsigned bytes)
 {
        if (bio_integrity(bio))
                bio_integrity_advance(bio, bytes);
@@ -1279,7 +1274,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
        bio_crypt_advance(bio, bytes);
        bio_advance_iter(bio, &bio->bi_iter, bytes);
 }
-EXPORT_SYMBOL(bio_advance);
+EXPORT_SYMBOL(__bio_advance);
 
 void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                        struct bio *src, struct bvec_iter *src_iter)
@@ -1467,10 +1462,10 @@ again:
                return;
 
        if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
-               rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
+               rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
 
        if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
-               trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
+               trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
        }
 
index 9a1c583..88b1fce 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/psi.h>
 #include "blk.h"
 #include "blk-ioprio.h"
+#include "blk-throttle.h"
 
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -620,7 +621,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp)
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   char *input, struct blkg_conf_ctx *ctx)
-       __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
+       __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
 {
        struct block_device *bdev;
        struct request_queue *q;
@@ -631,7 +632,15 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
 
-       q = bdev->bd_disk->queue;
+       q = bdev_get_queue(bdev);
+
+       /*
+        * blkcg_deactivate_policy() requires the queue to be frozen; grab
+        * q_usage_counter to prevent racing with blkcg_deactivate_policy().
+        */
+       ret = blk_queue_enter(q, 0);
+       if (ret)
+               return ret;
 
        rcu_read_lock();
        spin_lock_irq(&q->queue_lock);
@@ -702,6 +711,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                        goto success;
        }
 success:
+       blk_queue_exit(q);
        ctx->bdev = bdev;
        ctx->blkg = blkg;
        ctx->body = input;
@@ -714,6 +724,7 @@ fail_unlock:
        rcu_read_unlock();
 fail:
        blkdev_put_no_open(bdev);
+       blk_queue_exit(q);
        /*
         * If queue was bypassing, we should retry.  Do so after a
         * short msleep().  It isn't strictly necessary but queue
@@ -736,9 +747,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
  * with blkg_conf_prep().
  */
 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
-       __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
+       __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
 {
-       spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
+       spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
        rcu_read_unlock();
        blkdev_put_no_open(ctx->bdev);
 }
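
For reference, a minimal sketch of the prep/finish pairing as a blkcg policy
handler would typically use it (error handling trimmed; blkcg, pol and input
come from the policy's cftype write handler):

	struct blkg_conf_ctx ctx;
	int ret;

	ret = blkg_conf_prep(blkcg, pol, input, &ctx);
	if (ret)
		return ret;
	/* ... parse ctx.body and update ctx.blkg under the queue lock ... */
	blkg_conf_finish(&ctx);
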
@@ -841,7 +852,7 @@ static void blkcg_fill_root_iostats(void)
        while ((dev = class_dev_iter_next(&iter))) {
                struct block_device *bdev = dev_to_bdev(dev);
                struct blkcg_gq *blkg =
-                       blk_queue_root_blkg(bdev->bd_disk->queue);
+                       blk_queue_root_blkg(bdev_get_queue(bdev));
                struct blkg_iostat tmp;
                int cpu;
 
@@ -1800,7 +1811,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
 
        rcu_read_lock();
        blkg = blkg_lookup_create(css_to_blkcg(css),
-                                 bio->bi_bdev->bd_disk->queue);
+                                 bdev_get_queue(bio->bi_bdev));
        while (blkg) {
                if (blkg_tryget(blkg)) {
                        ret_blkg = blkg;
@@ -1836,8 +1847,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
        if (css && css->parent) {
                bio->bi_blkg = blkg_tryget_closest(bio, css);
        } else {
-               blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
-               bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
+               blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
+               bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
        }
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
index 4d8f5fe..ac1de7d 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-pm.h>
+#include <linux/blk-integrity.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -49,6 +50,7 @@
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
 #include "blk-pm.h"
+#include "blk-throttle.h"
 
 struct dentry *blk_debugfs_root;
 
@@ -214,8 +216,7 @@ int blk_status_to_errno(blk_status_t status)
 }
 EXPORT_SYMBOL_GPL(blk_status_to_errno);
 
-static void print_req_error(struct request *req, blk_status_t status,
-               const char *caller)
+void blk_print_req_error(struct request *req, blk_status_t status)
 {
        int idx = (__force int)status;
 
@@ -223,9 +224,9 @@ static void print_req_error(struct request *req, blk_status_t status,
                return;
 
        printk_ratelimited(KERN_ERR
-               "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+               "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
                "phys_seg %u prio class %u\n",
-               caller, blk_errors[idx].name,
+               blk_errors[idx].name,
                req->rq_disk ? req->rq_disk->disk_name : "?",
                blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
                req->cmd_flags & ~REQ_OP_MASK,
@@ -233,33 +234,6 @@ static void print_req_error(struct request *req, blk_status_t status,
                IOPRIO_PRIO_CLASS(req->ioprio));
 }
 
-static void req_bio_endio(struct request *rq, struct bio *bio,
-                         unsigned int nbytes, blk_status_t error)
-{
-       if (error)
-               bio->bi_status = error;
-
-       if (unlikely(rq->rq_flags & RQF_QUIET))
-               bio_set_flag(bio, BIO_QUIET);
-
-       bio_advance(bio, nbytes);
-
-       if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
-               /*
-                * Partial zone append completions cannot be supported as the
-                * BIO fragments may end up not being written sequentially.
-                */
-               if (bio->bi_iter.bi_size)
-                       bio->bi_status = BLK_STS_IOERR;
-               else
-                       bio->bi_iter.bi_sector = rq->__sector;
-       }
-
-       /* don't actually finish bio if it's part of flush sequence */
-       if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
-               bio_endio(bio);
-}
-
 void blk_dump_rq_flags(struct request *rq, char *msg)
 {
        printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
@@ -402,7 +376,7 @@ void blk_cleanup_queue(struct request_queue *q)
         */
        mutex_lock(&q->sysfs_lock);
        if (q->elevator)
-               blk_mq_sched_free_requests(q);
+               blk_mq_sched_free_rqs(q);
        mutex_unlock(&q->sysfs_lock);
 
        percpu_ref_exit(&q->q_usage_counter);
@@ -415,7 +389,7 @@ EXPORT_SYMBOL(blk_cleanup_queue);
 static bool blk_try_enter_queue(struct request_queue *q, bool pm)
 {
        rcu_read_lock();
-       if (!percpu_ref_tryget_live(&q->q_usage_counter))
+       if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
                goto fail;
 
        /*
@@ -430,7 +404,7 @@ static bool blk_try_enter_queue(struct request_queue *q, bool pm)
        return true;
 
 fail_put:
-       percpu_ref_put(&q->q_usage_counter);
+       blk_queue_exit(q);
 fail:
        rcu_read_unlock();
        return false;
@@ -470,10 +444,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 
 static inline int bio_queue_enter(struct bio *bio)
 {
-       struct gendisk *disk = bio->bi_bdev->bd_disk;
-       struct request_queue *q = disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
        while (!blk_try_enter_queue(q, false)) {
+               struct gendisk *disk = bio->bi_bdev->bd_disk;
+
                if (bio->bi_opf & REQ_NOWAIT) {
                        if (test_bit(GD_DEAD, &disk->state))
                                goto dead;
@@ -553,7 +528,7 @@ struct request_queue *blk_alloc_queue(int node_id)
 
        q->node = node_id;
 
-       atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
+       atomic_set(&q->nr_active_requests_shared_tags, 0);
 
        timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
        INIT_WORK(&q->timeout_work, blk_timeout_work);
@@ -586,7 +561,7 @@ struct request_queue *blk_alloc_queue(int node_id)
 
        blk_queue_dma_alignment(q, 511);
        blk_set_default_limits(&q->limits);
-       q->nr_requests = BLKDEV_MAX_RQ;
+       q->nr_requests = BLKDEV_DEFAULT_RQ;
 
        return q;
 
@@ -622,40 +597,13 @@ bool blk_get_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_get_queue);
 
-/**
- * blk_get_request - allocate a request
- * @q: request queue to allocate a request for
- * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
- * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
- */
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
-                               blk_mq_req_flags_t flags)
-{
-       struct request *req;
-
-       WARN_ON_ONCE(op & REQ_NOWAIT);
-       WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM));
-
-       req = blk_mq_alloc_request(q, op, flags);
-       if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
-               q->mq_ops->initialize_rq_fn(req);
-
-       return req;
-}
-EXPORT_SYMBOL(blk_get_request);
-
-void blk_put_request(struct request *req)
-{
-       blk_mq_free_request(req);
-}
-EXPORT_SYMBOL(blk_put_request);
-
 static void handle_bad_sector(struct bio *bio, sector_t maxsector)
 {
        char b[BDEVNAME_SIZE];
 
-       pr_info_ratelimited("attempt to access beyond end of device\n"
+       pr_info_ratelimited("%s: attempt to access beyond end of device\n"
                            "%s: rw=%d, want=%llu, limit=%llu\n",
+                           current->comm,
                            bio_devname(bio, b), bio->bi_opf,
                            bio_end_sector(bio), maxsector);
 }
@@ -797,7 +745,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 {
        struct block_device *bdev = bio->bi_bdev;
-       struct request_queue *q = bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bdev);
        blk_status_t status = BLK_STS_IOERR;
        struct blk_plug *plug;
 
@@ -839,7 +787,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
        }
 
        if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-               bio_clear_hipri(bio);
+               bio_clear_polled(bio);
 
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
@@ -912,25 +860,22 @@ end_io:
        return false;
 }
 
-static blk_qc_t __submit_bio(struct bio *bio)
+static void __submit_bio(struct bio *bio)
 {
        struct gendisk *disk = bio->bi_bdev->bd_disk;
-       blk_qc_t ret = BLK_QC_T_NONE;
 
        if (unlikely(bio_queue_enter(bio) != 0))
-               return BLK_QC_T_NONE;
+               return;
 
        if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio))
                goto queue_exit;
-       if (disk->fops->submit_bio) {
-               ret = disk->fops->submit_bio(bio);
-               goto queue_exit;
+       if (!disk->fops->submit_bio) {
+               blk_mq_submit_bio(bio);
+               return;
        }
-       return blk_mq_submit_bio(bio);
-
+       disk->fops->submit_bio(bio);
 queue_exit:
        blk_queue_exit(disk->queue);
-       return ret;
 }
 
 /*
@@ -952,10 +897,9 @@ queue_exit:
  * bio_list_on_stack[1] contains bios that were submitted before the current
 *     ->submit_bio, but that haven't been processed yet.
  */
-static blk_qc_t __submit_bio_noacct(struct bio *bio)
+static void __submit_bio_noacct(struct bio *bio)
 {
        struct bio_list bio_list_on_stack[2];
-       blk_qc_t ret = BLK_QC_T_NONE;
 
        BUG_ON(bio->bi_next);
 
@@ -963,7 +907,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
        current->bio_list = bio_list_on_stack;
 
        do {
-               struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+               struct request_queue *q = bdev_get_queue(bio->bi_bdev);
                struct bio_list lower, same;
 
                /*
@@ -972,7 +916,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
                bio_list_on_stack[1] = bio_list_on_stack[0];
                bio_list_init(&bio_list_on_stack[0]);
 
-               ret = __submit_bio(bio);
+               __submit_bio(bio);
 
                /*
                 * Sort new bios into those for a lower level and those for the
@@ -981,7 +925,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
                bio_list_init(&lower);
                bio_list_init(&same);
                while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
-                       if (q == bio->bi_bdev->bd_disk->queue)
+                       if (q == bdev_get_queue(bio->bi_bdev))
                                bio_list_add(&same, bio);
                        else
                                bio_list_add(&lower, bio);
@@ -995,22 +939,19 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
        } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
 
        current->bio_list = NULL;
-       return ret;
 }
 
-static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
+static void __submit_bio_noacct_mq(struct bio *bio)
 {
        struct bio_list bio_list[2] = { };
-       blk_qc_t ret;
 
        current->bio_list = bio_list;
 
        do {
-               ret = __submit_bio(bio);
+               __submit_bio(bio);
        } while ((bio = bio_list_pop(&bio_list[0])));
 
        current->bio_list = NULL;
-       return ret;
 }
 
 /**
@@ -1022,7 +963,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
  * systems and other upper level users of the block layer should use
  * submit_bio() instead.
  */
-blk_qc_t submit_bio_noacct(struct bio *bio)
+void submit_bio_noacct(struct bio *bio)
 {
        /*
         * We only want one ->submit_bio to be active at a time, else stack
@@ -1030,14 +971,12 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
         * to collect a list of requests submitted by a ->submit_bio method while
         * it is active, and then process them after it returned.
         */
-       if (current->bio_list) {
+       if (current->bio_list)
                bio_list_add(&current->bio_list[0], bio);
-               return BLK_QC_T_NONE;
-       }
-
-       if (!bio->bi_bdev->bd_disk->fops->submit_bio)
-               return __submit_bio_noacct_mq(bio);
-       return __submit_bio_noacct(bio);
+       else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
+               __submit_bio_noacct_mq(bio);
+       else
+               __submit_bio_noacct(bio);
 }
 EXPORT_SYMBOL(submit_bio_noacct);
 
@@ -1054,10 +993,10 @@ EXPORT_SYMBOL(submit_bio_noacct);
 * in @bio.  The bio must NOT be touched by the caller until ->bi_end_io() has
  * been called.
  */
-blk_qc_t submit_bio(struct bio *bio)
+void submit_bio(struct bio *bio)
 {
        if (blkcg_punt_bio_submit(bio))
-               return BLK_QC_T_NONE;
+               return;
 
        /*
         * If it's a regular read/write or a barrier with data attached,
@@ -1068,7 +1007,7 @@ blk_qc_t submit_bio(struct bio *bio)
 
                if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
                        count = queue_logical_block_size(
-                                       bio->bi_bdev->bd_disk->queue) >> 9;
+                                       bdev_get_queue(bio->bi_bdev)) >> 9;
                else
                        count = bio_sectors(bio);
 
@@ -1089,20 +1028,93 @@ blk_qc_t submit_bio(struct bio *bio)
        if (unlikely(bio_op(bio) == REQ_OP_READ &&
            bio_flagged(bio, BIO_WORKINGSET))) {
                unsigned long pflags;
-               blk_qc_t ret;
 
                psi_memstall_enter(&pflags);
-               ret = submit_bio_noacct(bio);
+               submit_bio_noacct(bio);
                psi_memstall_leave(&pflags);
-
-               return ret;
+               return;
        }
 
-       return submit_bio_noacct(bio);
+       submit_bio_noacct(bio);
 }
 EXPORT_SYMBOL(submit_bio);
 
 /**
+ * bio_poll - poll for BIO completions
+ * @bio: bio to poll for
+ * @flags: BLK_POLL_* flags that control the behavior
+ *
+ * Poll for completions on the queue associated with the bio. Returns the
+ * number of completed entries found.
+ *
+ * Note: the caller must either be the context that submitted @bio, or
+ * be in an RCU critical section to prevent freeing of @bio.
+ */
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
+{
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+       blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
+       int ret;
+
+       if (cookie == BLK_QC_T_NONE ||
+           !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+               return 0;
+
+       if (current->plug)
+               blk_flush_plug(current->plug, false);
+
+       if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT))
+               return 0;
+       if (WARN_ON_ONCE(!queue_is_mq(q)))
+               ret = 0;        /* not yet implemented, should not happen */
+       else
+               ret = blk_mq_poll(q, cookie, iob, flags);
+       blk_queue_exit(q);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(bio_poll);
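
A hedged sketch of a synchronous polling loop over the new interface; "done"
is a hypothetical flag set by the caller's bi_end_io handler, and the bio is
assumed to have been submitted with REQ_POLLED set:

	bio->bi_opf |= REQ_POLLED;
	submit_bio(bio);
	while (!READ_ONCE(done))
		bio_poll(bio, NULL, 0);	/* no completion batch, no flags */
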
+
+/*
+ * Helper to implement file_operations.iopoll.  Requires the bio to be stored
+ * in iocb->private, and cleared before freeing the bio.
+ */
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+                   unsigned int flags)
+{
+       struct bio *bio;
+       int ret = 0;
+
+       /*
+        * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
+        * point to a freshly allocated bio at this point.  If that happens
+        * we have a few cases to consider:
+        *
+        *  1) the bio is being initialized and bi_bdev is NULL.  We can
+        *     simply do nothing in this case
+        *  2) the bio points to a device that is not poll enabled.  bio_poll
+        *     will catch this and return 0
+        *  3) the bio points to a poll capable device, including but not
+        *     limited to the one that the original bio pointed to.  In this
+        *     case we will call into the actual poll method and poll for I/O,
+        *     even if we don't need to, but it won't cause harm either.
+        *
+        * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
+        * is still allocated. Because partitions hold a reference to the whole
+        * device bdev and thus disk, the disk is also still valid.  Grabbing
+        * a reference to the queue in bio_poll() ensures the hctxs and requests
+        * are still valid as well.
+        */
+       rcu_read_lock();
+       bio = READ_ONCE(kiocb->private);
+       if (bio && bio->bi_bdev)
+               ret = bio_poll(bio, iob, flags);
+       rcu_read_unlock();
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
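
A minimal sketch of the wiring this helper enables; example_fops is
hypothetical, and its read/write paths are assumed to store their bio in
kiocb->private before submission:

static const struct file_operations example_fops = {
	/* ... read_iter/write_iter that stash the bio in iocb->private ... */
	.iopoll		= iocb_bio_iopoll,
};
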
+
+/**
  * blk_cloned_rq_check_limits - Helper function to check a cloned request
  *                              for the new queue limits
  * @q:  the queue
@@ -1177,8 +1189,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
        if (blk_crypto_insert_cloned_request(rq))
                return BLK_STS_IOERR;
 
-       if (blk_queue_io_stat(q))
-               blk_account_io_start(rq);
+       blk_account_io_start(rq);
 
        /*
         * Since we have a scheduler attached on the top device,
@@ -1246,41 +1257,19 @@ again:
        }
 }
 
-static void blk_account_io_completion(struct request *req, unsigned int bytes)
+void __blk_account_io_done(struct request *req, u64 now)
 {
-       if (req->part && blk_do_io_stat(req)) {
-               const int sgrp = op_stat_group(req_op(req));
+       const int sgrp = op_stat_group(req_op(req));
 
-               part_stat_lock();
-               part_stat_add(req->part, sectors[sgrp], bytes >> 9);
-               part_stat_unlock();
-       }
-}
-
-void blk_account_io_done(struct request *req, u64 now)
-{
-       /*
-        * Account IO completion.  flush_rq isn't accounted as a
-        * normal IO on queueing nor completion.  Accounting the
-        * containing request is enough.
-        */
-       if (req->part && blk_do_io_stat(req) &&
-           !(req->rq_flags & RQF_FLUSH_SEQ)) {
-               const int sgrp = op_stat_group(req_op(req));
-
-               part_stat_lock();
-               update_io_ticks(req->part, jiffies, true);
-               part_stat_inc(req->part, ios[sgrp]);
-               part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
-               part_stat_unlock();
-       }
+       part_stat_lock();
+       update_io_ticks(req->part, jiffies, true);
+       part_stat_inc(req->part, ios[sgrp]);
+       part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+       part_stat_unlock();
 }
 
-void blk_account_io_start(struct request *rq)
+void __blk_account_io_start(struct request *rq)
 {
-       if (!blk_do_io_stat(rq))
-               return;
-
        /* passthrough requests can hold bios that do not have ->bi_bdev set */
        if (rq->bio && rq->bio->bi_bdev)
                rq->part = rq->bio->bi_bdev;
@@ -1376,112 +1365,6 @@ void blk_steal_bios(struct bio_list *list, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_steal_bios);
 
-/**
- * blk_update_request - Complete multiple bytes without completing the request
- * @req:      the request being processed
- * @error:    block status code
- * @nr_bytes: number of bytes to complete for @req
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @req, but doesn't complete
- *     the request structure even if @req doesn't have leftover.
- *     If @req has leftover, sets it up for the next range of segments.
- *
- *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
- *     %false return from this function.
- *
- * Note:
- *     The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
- *      except in the consistency check at the end of this function.
- *
- * Return:
- *     %false - this request doesn't have any more data
- *     %true  - this request has more data
- **/
-bool blk_update_request(struct request *req, blk_status_t error,
-               unsigned int nr_bytes)
-{
-       int total_bytes;
-
-       trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
-
-       if (!req->bio)
-               return false;
-
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-       if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
-           error == BLK_STS_OK)
-               req->q->integrity.profile->complete_fn(req, nr_bytes);
-#endif
-
-       if (unlikely(error && !blk_rq_is_passthrough(req) &&
-                    !(req->rq_flags & RQF_QUIET)))
-               print_req_error(req, error, __func__);
-
-       blk_account_io_completion(req, nr_bytes);
-
-       total_bytes = 0;
-       while (req->bio) {
-               struct bio *bio = req->bio;
-               unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
-
-               if (bio_bytes == bio->bi_iter.bi_size)
-                       req->bio = bio->bi_next;
-
-               /* Completion has already been traced */
-               bio_clear_flag(bio, BIO_TRACE_COMPLETION);
-               req_bio_endio(req, bio, bio_bytes, error);
-
-               total_bytes += bio_bytes;
-               nr_bytes -= bio_bytes;
-
-               if (!nr_bytes)
-                       break;
-       }
-
-       /*
-        * completely done
-        */
-       if (!req->bio) {
-               /*
-                * Reset counters so that the request stacking driver
-                * can find how many bytes remain in the request
-                * later.
-                */
-               req->__data_len = 0;
-               return false;
-       }
-
-       req->__data_len -= total_bytes;
-
-       /* update sector only for requests with clear definition of sector */
-       if (!blk_rq_is_passthrough(req))
-               req->__sector += total_bytes >> 9;
-
-       /* mixed attributes always follow the first bio */
-       if (req->rq_flags & RQF_MIXED_MERGE) {
-               req->cmd_flags &= ~REQ_FAILFAST_MASK;
-               req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
-       }
-
-       if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
-               /*
-                * If total number of sectors is less than the first segment
-                * size, something has gone terribly wrong.
-                */
-               if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
-                       blk_dump_rq_flags(req, "request botched");
-                       req->__data_len = blk_rq_cur_bytes(req);
-               }
-
-               /* recalculate the number of segments */
-               req->nr_phys_segments = blk_recalc_rq_segments(req);
-       }
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(blk_update_request);
-
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 /**
  * rq_flush_dcache_pages - Helper function to flush all pages in a request
@@ -1629,6 +1512,32 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
 
+void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
+{
+       struct task_struct *tsk = current;
+
+       /*
+        * If this is a nested plug, don't actually assign it.
+        */
+       if (tsk->plug)
+               return;
+
+       plug->mq_list = NULL;
+       plug->cached_rq = NULL;
+       plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
+       plug->rq_count = 0;
+       plug->multiple_queues = false;
+       plug->has_elevator = false;
+       plug->nowait = false;
+       INIT_LIST_HEAD(&plug->cb_list);
+
+       /*
+        * Store ordering should not be needed here, since a potential
+        * preempt will imply a full memory barrier
+        */
+       tsk->plug = plug;
+}
+
 /**
  * blk_start_plug - initialize blk_plug and track it inside the task_struct
  * @plug:      The &struct blk_plug that needs to be initialized
@@ -1654,25 +1563,7 @@ EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
  */
 void blk_start_plug(struct blk_plug *plug)
 {
-       struct task_struct *tsk = current;
-
-       /*
-        * If this is a nested plug, don't actually assign it.
-        */
-       if (tsk->plug)
-               return;
-
-       INIT_LIST_HEAD(&plug->mq_list);
-       INIT_LIST_HEAD(&plug->cb_list);
-       plug->rq_count = 0;
-       plug->multiple_queues = false;
-       plug->nowait = false;
-
-       /*
-        * Store ordering should not be needed here, since a potential
-        * preempt will imply a full memory barrier
-        */
-       tsk->plug = plug;
+       blk_start_plug_nr_ios(plug, 1);
 }
 EXPORT_SYMBOL(blk_start_plug);
 
@@ -1718,12 +1609,14 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
 }
 EXPORT_SYMBOL(blk_check_plugged);
 
-void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
 {
-       flush_plug_callbacks(plug, from_schedule);
-
-       if (!list_empty(&plug->mq_list))
+       if (!list_empty(&plug->cb_list))
+               flush_plug_callbacks(plug, from_schedule);
+       if (!rq_list_empty(plug->mq_list))
                blk_mq_flush_plug_list(plug, from_schedule);
+       if (unlikely(!from_schedule && plug->cached_rq))
+               blk_mq_free_plug_rqs(plug);
 }
 
 /**
@@ -1738,11 +1631,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
  */
 void blk_finish_plug(struct blk_plug *plug)
 {
-       if (plug != current->plug)
-               return;
-       blk_flush_plug_list(plug, false);
-
-       current->plug = NULL;
+       if (plug == current->plug) {
+               blk_flush_plug(plug, false);
+               current->plug = NULL;
+       }
 }
 EXPORT_SYMBOL(blk_finish_plug);
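
A hedged sketch of the plugging pattern the new nr_ios variant serves; nr and
bios[] are hypothetical caller state:

	struct blk_plug plug;
	int i;

	blk_start_plug_nr_ios(&plug, nr);	/* or blk_start_plug(&plug) */
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	blk_finish_plug(&plug);			/* flush the batched requests */
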
 
index c322176..c87aba8 100644 (file)
 #include <crypto/skcipher.h>
 #include <linux/blk-cgroup.h>
 #include <linux/blk-crypto.h>
+#include <linux/blk-crypto-profile.h>
 #include <linux/blkdev.h>
 #include <linux/crypto.h>
-#include <linux/keyslot-manager.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/scatterlist.h>
 
 #include "blk-crypto-internal.h"
 
@@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool;
 static DEFINE_MUTEX(tfms_init_lock);
 static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
 
-static struct blk_crypto_keyslot {
+static struct blk_crypto_fallback_keyslot {
        enum blk_crypto_mode_num crypto_mode;
        struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
 } *blk_crypto_keyslots;
 
-static struct blk_keyslot_manager blk_crypto_ksm;
+static struct blk_crypto_profile blk_crypto_fallback_profile;
 static struct workqueue_struct *blk_crypto_wq;
 static mempool_t *blk_crypto_bounce_page_pool;
 static struct bio_set crypto_bio_split;
@@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split;
  */
 static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
 
-static void blk_crypto_evict_keyslot(unsigned int slot)
+static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
 {
-       struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+       struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
        enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
        int err;
 
@@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot)
        slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
 }
 
-static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
-                                     const struct blk_crypto_key *key,
-                                     unsigned int slot)
+static int
+blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
+                                   const struct blk_crypto_key *key,
+                                   unsigned int slot)
 {
-       struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+       struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
        const enum blk_crypto_mode_num crypto_mode =
                                                key->crypto_cfg.crypto_mode;
        int err;
 
        if (crypto_mode != slotp->crypto_mode &&
            slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
-               blk_crypto_evict_keyslot(slot);
+               blk_crypto_fallback_evict_keyslot(slot);
 
        slotp->crypto_mode = crypto_mode;
        err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
                                     key->size);
        if (err) {
-               blk_crypto_evict_keyslot(slot);
+               blk_crypto_fallback_evict_keyslot(slot);
                return err;
        }
        return 0;
 }
 
-static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
-                                   const struct blk_crypto_key *key,
-                                   unsigned int slot)
+static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile,
+                                            const struct blk_crypto_key *key,
+                                            unsigned int slot)
 {
-       blk_crypto_evict_keyslot(slot);
+       blk_crypto_fallback_evict_keyslot(slot);
        return 0;
 }
 
-/*
- * The crypto API fallback KSM ops - only used for a bio when it specifies a
- * blk_crypto_key that was not supported by the device's inline encryption
- * hardware.
- */
-static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
-       .keyslot_program        = blk_crypto_keyslot_program,
-       .keyslot_evict          = blk_crypto_keyslot_evict,
+static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
+       .keyslot_program        = blk_crypto_fallback_keyslot_program,
+       .keyslot_evict          = blk_crypto_fallback_keyslot_evict,
 };
 
 static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
@@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
        bio_endio(src_bio);
 }
 
-static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
+static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
 {
        struct bvec_iter iter;
        struct bio_vec bv;
@@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
        return bio;
 }
 
-static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
-                                       struct skcipher_request **ciph_req_ret,
-                                       struct crypto_wait *wait)
+static bool
+blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
+                                    struct skcipher_request **ciph_req_ret,
+                                    struct crypto_wait *wait)
 {
        struct skcipher_request *ciph_req;
-       const struct blk_crypto_keyslot *slotp;
-       int keyslot_idx = blk_ksm_get_slot_idx(slot);
+       const struct blk_crypto_fallback_keyslot *slotp;
+       int keyslot_idx = blk_crypto_keyslot_index(slot);
 
        slotp = &blk_crypto_keyslots[keyslot_idx];
        ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
@@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
        return true;
 }
 
-static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
+static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
 {
        struct bio *bio = *bio_ptr;
        unsigned int i = 0;
@@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
 {
        struct bio *src_bio, *enc_bio;
        struct bio_crypt_ctx *bc;
-       struct blk_ksm_keyslot *slot;
+       struct blk_crypto_keyslot *slot;
        int data_unit_size;
        struct skcipher_request *ciph_req = NULL;
        DECLARE_CRYPTO_WAIT(wait);
@@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
        blk_status_t blk_st;
 
        /* Split the bio if it's too big for single page bvec */
-       if (!blk_crypto_split_bio_if_needed(bio_ptr))
+       if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
                return false;
 
        src_bio = *bio_ptr;
@@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
        data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
 
        /* Allocate bounce bio for encryption */
-       enc_bio = blk_crypto_clone_bio(src_bio);
+       enc_bio = blk_crypto_fallback_clone_bio(src_bio);
        if (!enc_bio) {
                src_bio->bi_status = BLK_STS_RESOURCE;
                return false;
        }
 
        /*
-        * Use the crypto API fallback keyslot manager to get a crypto_skcipher
-        * for the algorithm and key specified for this bio.
+        * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
+        * this bio's algorithm and key.
         */
-       blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+       blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
+                                       bc->bc_key, &slot);
        if (blk_st != BLK_STS_OK) {
                src_bio->bi_status = blk_st;
                goto out_put_enc_bio;
        }
 
        /* and then allocate an skcipher_request for it */
-       if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+       if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
                src_bio->bi_status = BLK_STS_RESOURCE;
                goto out_release_keyslot;
        }
@@ -362,7 +361,7 @@ out_free_bounce_pages:
 out_free_ciph_req:
        skcipher_request_free(ciph_req);
 out_release_keyslot:
-       blk_ksm_put_slot(slot);
+       blk_crypto_put_keyslot(slot);
 out_put_enc_bio:
        if (enc_bio)
                bio_put(enc_bio);
@@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
                container_of(work, struct bio_fallback_crypt_ctx, work);
        struct bio *bio = f_ctx->bio;
        struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
-       struct blk_ksm_keyslot *slot;
+       struct blk_crypto_keyslot *slot;
        struct skcipher_request *ciph_req = NULL;
        DECLARE_CRYPTO_WAIT(wait);
        u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
@@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
        blk_status_t blk_st;
 
        /*
-        * Use the crypto API fallback keyslot manager to get a crypto_skcipher
-        * for the algorithm and key specified for this bio.
+        * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
+        * this bio's algorithm and key.
         */
-       blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+       blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
+                                       bc->bc_key, &slot);
        if (blk_st != BLK_STS_OK) {
                bio->bi_status = blk_st;
                goto out_no_keyslot;
        }
 
        /* and then allocate an skcipher_request for it */
-       if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+       if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
                bio->bi_status = BLK_STS_RESOURCE;
                goto out;
        }
@@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
 
 out:
        skcipher_request_free(ciph_req);
-       blk_ksm_put_slot(slot);
+       blk_crypto_put_keyslot(slot);
 out_no_keyslot:
        mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
        bio_endio(bio);
@@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
  * @bio_ptr: pointer to the bio to prepare
  *
  * If bio is doing a WRITE operation, this splits the bio into two parts if it's
- * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio
- * for the first part, encrypts it, and update bio_ptr to point to the bounce
- * bio.
+ * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
+ * bounce bio for the first part, encrypts it, and updates bio_ptr to point to
+ * the bounce bio.
  *
  * For a READ operation, we mark the bio for decryption by using bi_private and
  * bi_end_io.
@@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
                return false;
        }
 
-       if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm,
-                                         &bc->bc_key->crypto_cfg)) {
+       if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile,
+                                       &bc->bc_key->crypto_cfg)) {
                bio->bi_status = BLK_STS_NOTSUPP;
                return false;
        }
@@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
 
 int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
 {
-       return blk_ksm_evict_key(&blk_crypto_ksm, key);
+       return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key);
 }
 
 static bool blk_crypto_fallback_inited;
@@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void)
 {
        int i;
        int err;
+       struct blk_crypto_profile *profile = &blk_crypto_fallback_profile;
 
        if (blk_crypto_fallback_inited)
                return 0;
@@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void)
        if (err)
                goto out;
 
-       err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
+       err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots);
        if (err)
                goto fail_free_bioset;
        err = -ENOMEM;
 
-       blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
-       blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+       profile->ll_ops = blk_crypto_fallback_ll_ops;
+       profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
 
        /* All blk-crypto modes have a crypto API fallback. */
        for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
-               blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF;
-       blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
+               profile->modes_supported[i] = 0xFFFFFFFF;
+       profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
 
        blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
                                        WQ_UNBOUND | WQ_HIGHPRI |
                                        WQ_MEM_RECLAIM, num_online_cpus());
        if (!blk_crypto_wq)
-               goto fail_free_ksm;
+               goto fail_destroy_profile;
 
        blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
                                      sizeof(blk_crypto_keyslots[0]),
@@ -595,8 +596,8 @@ fail_free_keyslots:
        kfree(blk_crypto_keyslots);
 fail_free_wq:
        destroy_workqueue(blk_crypto_wq);
-fail_free_ksm:
-       blk_ksm_destroy(&blk_crypto_ksm);
+fail_destroy_profile:
+       blk_crypto_profile_destroy(profile);
 fail_free_bioset:
        bioset_exit(&crypto_bio_split);
 out:
@@ -610,7 +611,7 @@ out:
 int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
 {
        const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
-       struct blk_crypto_keyslot *slotp;
+       struct blk_crypto_fallback_keyslot *slotp;
        unsigned int i;
        int err = 0;
 
index 0d36aae..2fb0d65 100644 (file)
@@ -7,7 +7,7 @@
 #define __LINUX_BLK_CRYPTO_INTERNAL_H
 
 #include <linux/bio.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 /* Represents a crypto mode supported by blk-crypto  */
 struct blk_crypto_mode {
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
new file mode 100644 (file)
index 0000000..605ba06
--- /dev/null
@@ -0,0 +1,565 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/**
+ * DOC: blk-crypto profiles
+ *
+ * 'struct blk_crypto_profile' contains all generic inline encryption-related
+ * state for a particular inline encryption device.  blk_crypto_profile serves
+ * as the way that drivers for inline encryption hardware expose their crypto
+ * capabilities and certain functions (e.g., functions to program and evict
+ * keys) to upper layers.  Device drivers that want to support inline encryption
+ * construct a crypto profile, then associate it with the disk's request_queue.
+ *
+ * If the device has keyslots, then its blk_crypto_profile also handles managing
+ * these keyslots in a device-independent way, using the driver-provided
+ * functions to program and evict keys as needed.  This includes keeping track
+ * of which key and how many I/O requests are using each keyslot, getting
+ * keyslots for I/O requests, and handling key eviction requests.
+ *
+ * For more information, see Documentation/block/inline-encryption.rst.
+ */
+
+#define pr_fmt(fmt) "blk-crypto: " fmt
+
+#include <linux/blk-crypto-profile.h>
+#include <linux/device.h>
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/pm_runtime.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
+
+struct blk_crypto_keyslot {
+       atomic_t slot_refs;
+       struct list_head idle_slot_node;
+       struct hlist_node hash_node;
+       const struct blk_crypto_key *key;
+       struct blk_crypto_profile *profile;
+};
+
+static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
+{
+       /*
+        * Calling into the driver requires profile->lock held and the device
+        * resumed.  But we must resume the device first, since that can acquire
+        * and release profile->lock via blk_crypto_reprogram_all_keys().
+        */
+       if (profile->dev)
+               pm_runtime_get_sync(profile->dev);
+       down_write(&profile->lock);
+}
+
+static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile)
+{
+       up_write(&profile->lock);
+       if (profile->dev)
+               pm_runtime_put_sync(profile->dev);
+}
+
+/**
+ * blk_crypto_profile_init() - Initialize a blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Storage drivers must call this when starting to set up a blk_crypto_profile,
+ * before filling in additional fields.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int blk_crypto_profile_init(struct blk_crypto_profile *profile,
+                           unsigned int num_slots)
+{
+       unsigned int slot;
+       unsigned int i;
+       unsigned int slot_hashtable_size;
+
+       memset(profile, 0, sizeof(*profile));
+       init_rwsem(&profile->lock);
+
+       if (num_slots == 0)
+               return 0;
+
+       /* Initialize keyslot management data. */
+
+       profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]),
+                                 GFP_KERNEL);
+       if (!profile->slots)
+               return -ENOMEM;
+
+       profile->num_slots = num_slots;
+
+       init_waitqueue_head(&profile->idle_slots_wait_queue);
+       INIT_LIST_HEAD(&profile->idle_slots);
+
+       for (slot = 0; slot < num_slots; slot++) {
+               profile->slots[slot].profile = profile;
+               list_add_tail(&profile->slots[slot].idle_slot_node,
+                             &profile->idle_slots);
+       }
+
+       spin_lock_init(&profile->idle_slots_lock);
+
+       slot_hashtable_size = roundup_pow_of_two(num_slots);
+       /*
+        * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2
+        * buckets.  This only makes a difference when there is only 1 keyslot.
+        */
+       if (slot_hashtable_size < 2)
+               slot_hashtable_size = 2;
+
+       profile->log_slot_ht_size = ilog2(slot_hashtable_size);
+       profile->slot_hashtable =
+               kvmalloc_array(slot_hashtable_size,
+                              sizeof(profile->slot_hashtable[0]), GFP_KERNEL);
+       if (!profile->slot_hashtable)
+               goto err_destroy;
+       for (i = 0; i < slot_hashtable_size; i++)
+               INIT_HLIST_HEAD(&profile->slot_hashtable[i]);
+
+       return 0;
+
+err_destroy:
+       blk_crypto_profile_destroy(profile);
+       return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_profile_init);
+
+static void blk_crypto_profile_destroy_callback(void *profile)
+{
+       blk_crypto_profile_destroy(profile);
+}
+
+/**
+ * devm_blk_crypto_profile_init() - Resource-managed blk_crypto_profile_init()
+ * @dev: the device which owns the blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Like blk_crypto_profile_init(), but causes blk_crypto_profile_destroy() to be
+ * called automatically on driver detach.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int devm_blk_crypto_profile_init(struct device *dev,
+                                struct blk_crypto_profile *profile,
+                                unsigned int num_slots)
+{
+       int err = blk_crypto_profile_init(profile, num_slots);
+
+       if (err)
+               return err;
+
+       return devm_add_action_or_reset(dev,
+                                       blk_crypto_profile_destroy_callback,
+                                       profile);
+}
+EXPORT_SYMBOL_GPL(devm_blk_crypto_profile_init);
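For orientation, here is a minimal driver-side sketch of the setup flow the DOC comment and the two init helpers above describe. All of the "foo_*" names, the keyslot count and the capability values are illustrative assumptions, not taken from any in-tree driver.

/*
 * Sketch only: a hypothetical driver constructs a crypto profile, fills in
 * its low-level ops and capabilities, and registers it with the queue.
 */
#include <linux/blk-crypto-profile.h>

static int foo_keyslot_program(struct blk_crypto_profile *profile,
                               const struct blk_crypto_key *key,
                               unsigned int slot)
{
        /* Program @key into hardware keyslot @slot (hypothetical hardware). */
        return 0;
}

static int foo_keyslot_evict(struct blk_crypto_profile *profile,
                             const struct blk_crypto_key *key,
                             unsigned int slot)
{
        /* Clear hardware keyslot @slot (hypothetical hardware). */
        return 0;
}

static const struct blk_crypto_ll_ops foo_crypto_ll_ops = {
        .keyslot_program        = foo_keyslot_program,
        .keyslot_evict          = foo_keyslot_evict,
};

static int foo_init_crypto(struct device *dev,
                           struct blk_crypto_profile *profile,
                           struct request_queue *q)
{
        /* 32 keyslots is an arbitrary example value. */
        int err = devm_blk_crypto_profile_init(dev, profile, 32);

        if (err)
                return err;

        profile->ll_ops = foo_crypto_ll_ops;
        profile->dev = dev;
        profile->max_dun_bytes_supported = 8;
        /* modes_supported[] holds a bitmask of supported data unit sizes. */
        profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] |= 4096;

        /* Hand the profile to the block layer once the queue exists. */
        if (!blk_crypto_register(profile, q))
                return -EINVAL;
        return 0;
}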
+
+static inline struct hlist_head *
+blk_crypto_hash_bucket_for_key(struct blk_crypto_profile *profile,
+                              const struct blk_crypto_key *key)
+{
+       return &profile->slot_hashtable[
+                       hash_ptr(key, profile->log_slot_ht_size)];
+}
+
+static void
+blk_crypto_remove_slot_from_lru_list(struct blk_crypto_keyslot *slot)
+{
+       struct blk_crypto_profile *profile = slot->profile;
+       unsigned long flags;
+
+       spin_lock_irqsave(&profile->idle_slots_lock, flags);
+       list_del(&slot->idle_slot_node);
+       spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
+}
+
+static struct blk_crypto_keyslot *
+blk_crypto_find_keyslot(struct blk_crypto_profile *profile,
+                       const struct blk_crypto_key *key)
+{
+       const struct hlist_head *head =
+               blk_crypto_hash_bucket_for_key(profile, key);
+       struct blk_crypto_keyslot *slotp;
+
+       hlist_for_each_entry(slotp, head, hash_node) {
+               if (slotp->key == key)
+                       return slotp;
+       }
+       return NULL;
+}
+
+static struct blk_crypto_keyslot *
+blk_crypto_find_and_grab_keyslot(struct blk_crypto_profile *profile,
+                                const struct blk_crypto_key *key)
+{
+       struct blk_crypto_keyslot *slot;
+
+       slot = blk_crypto_find_keyslot(profile, key);
+       if (!slot)
+               return NULL;
+       if (atomic_inc_return(&slot->slot_refs) == 1) {
+               /* Took first reference to this slot; remove it from LRU list */
+               blk_crypto_remove_slot_from_lru_list(slot);
+       }
+       return slot;
+}
+
+/**
+ * blk_crypto_keyslot_index() - Get the index of a keyslot
+ * @slot: a keyslot that blk_crypto_get_keyslot() returned
+ *
+ * Return: the 0-based index of the keyslot within the device's keyslots.
+ */
+unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot)
+{
+       return slot - slot->profile->slots;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index);
+
+/**
+ * blk_crypto_get_keyslot() - Get a keyslot for a key, if needed.
+ * @profile: the crypto profile of the device the key will be used on
+ * @key: the key that will be used
+ * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct
+ *           will be stored here; otherwise NULL will be stored here.
+ *
+ * If the device has keyslots, this gets a keyslot that's been programmed with
+ * the specified key.  If the key is already in a slot, this reuses it;
+ * otherwise this waits for a slot to become idle and programs the key into it.
+ *
+ * This must be paired with a call to blk_crypto_put_keyslot().
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or
+ *        one wasn't needed; or a blk_status_t error on failure.
+ */
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+                                   const struct blk_crypto_key *key,
+                                   struct blk_crypto_keyslot **slot_ptr)
+{
+       struct blk_crypto_keyslot *slot;
+       int slot_idx;
+       int err;
+
+       *slot_ptr = NULL;
+
+       /*
+        * If the device has no concept of "keyslots", then there is no need to
+        * get one.
+        */
+       if (profile->num_slots == 0)
+               return BLK_STS_OK;
+
+       down_read(&profile->lock);
+       slot = blk_crypto_find_and_grab_keyslot(profile, key);
+       up_read(&profile->lock);
+       if (slot)
+               goto success;
+
+       for (;;) {
+               blk_crypto_hw_enter(profile);
+               slot = blk_crypto_find_and_grab_keyslot(profile, key);
+               if (slot) {
+                       blk_crypto_hw_exit(profile);
+                       goto success;
+               }
+
+               /*
+                * If we're here, that means there wasn't a slot that was
+                * already programmed with the key. So try to program it.
+                */
+               if (!list_empty(&profile->idle_slots))
+                       break;
+
+               blk_crypto_hw_exit(profile);
+               wait_event(profile->idle_slots_wait_queue,
+                          !list_empty(&profile->idle_slots));
+       }
+
+       slot = list_first_entry(&profile->idle_slots, struct blk_crypto_keyslot,
+                               idle_slot_node);
+       slot_idx = blk_crypto_keyslot_index(slot);
+
+       err = profile->ll_ops.keyslot_program(profile, key, slot_idx);
+       if (err) {
+               wake_up(&profile->idle_slots_wait_queue);
+               blk_crypto_hw_exit(profile);
+               return errno_to_blk_status(err);
+       }
+
+       /* Move this slot to the hash list for the new key. */
+       if (slot->key)
+               hlist_del(&slot->hash_node);
+       slot->key = key;
+       hlist_add_head(&slot->hash_node,
+                      blk_crypto_hash_bucket_for_key(profile, key));
+
+       atomic_set(&slot->slot_refs, 1);
+
+       blk_crypto_remove_slot_from_lru_list(slot);
+
+       blk_crypto_hw_exit(profile);
+success:
+       *slot_ptr = slot;
+       return BLK_STS_OK;
+}
+
+/**
+ * blk_crypto_put_keyslot() - Release a reference to a keyslot
+ * @slot: The keyslot to release the reference of (may be NULL).
+ *
+ * Context: Any context.
+ */
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot)
+{
+       struct blk_crypto_profile *profile;
+       unsigned long flags;
+
+       if (!slot)
+               return;
+
+       profile = slot->profile;
+
+       if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
+                                       &profile->idle_slots_lock, flags)) {
+               list_add_tail(&slot->idle_slot_node, &profile->idle_slots);
+               spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
+               wake_up(&profile->idle_slots_wait_queue);
+       }
+}
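A minimal sketch of the get/put pairing required above; the blk-crypto.c hunk later in this diff (__blk_crypto_init_request() and __blk_crypto_free_request()) is the real in-tree call site, so the caller below is purely hypothetical.

static blk_status_t foo_prep_crypto(struct blk_crypto_profile *profile,
                                    const struct blk_crypto_key *key)
{
        struct blk_crypto_keyslot *slot;
        blk_status_t status;

        status = blk_crypto_get_keyslot(profile, key, &slot);
        if (status != BLK_STS_OK)
                return status;

        /* slot is NULL when the device has no keyslots. */
        if (slot)
                pr_debug("using keyslot %u\n", blk_crypto_keyslot_index(slot));

        /* ... issue the I/O that uses @key ... */

        blk_crypto_put_keyslot(slot);   /* NULL-safe */
        return BLK_STS_OK;
}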
+
+/**
+ * __blk_crypto_cfg_supported() - Check whether the given crypto profile
+ *                               supports the given crypto configuration.
+ * @profile: the crypto profile to check
+ * @cfg: the crypto configuration to check for
+ *
+ * Return: %true if @profile supports the given @cfg.
+ */
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+                               const struct blk_crypto_config *cfg)
+{
+       if (!profile)
+               return false;
+       if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size))
+               return false;
+       if (profile->max_dun_bytes_supported < cfg->dun_bytes)
+               return false;
+       return true;
+}
+
+/**
+ * __blk_crypto_evict_key() - Evict a key from a device.
+ * @profile: the crypto profile of the device
+ * @key: the key to evict.  It must not still be used in any I/O.
+ *
+ * If the device has keyslots, this finds the keyslot (if any) that contains the
+ * specified key and calls the driver's keyslot_evict function to evict it.
+ *
+ * Otherwise, this just calls the driver's keyslot_evict function if it is
+ * implemented, passing just the key (without any particular keyslot).  This
+ * allows layered devices to evict the key from their underlying devices.
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY
+ *        if the keyslot is still in use, or another -errno value on other
+ *        error.
+ */
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+                          const struct blk_crypto_key *key)
+{
+       struct blk_crypto_keyslot *slot;
+       int err = 0;
+
+       if (profile->num_slots == 0) {
+               if (profile->ll_ops.keyslot_evict) {
+                       blk_crypto_hw_enter(profile);
+                       err = profile->ll_ops.keyslot_evict(profile, key, -1);
+                       blk_crypto_hw_exit(profile);
+                       return err;
+               }
+               return 0;
+       }
+
+       blk_crypto_hw_enter(profile);
+       slot = blk_crypto_find_keyslot(profile, key);
+       if (!slot)
+               goto out_unlock;
+
+       if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
+               err = -EBUSY;
+               goto out_unlock;
+       }
+       err = profile->ll_ops.keyslot_evict(profile, key,
+                                           blk_crypto_keyslot_index(slot));
+       if (err)
+               goto out_unlock;
+
+       hlist_del(&slot->hash_node);
+       slot->key = NULL;
+       err = 0;
+out_unlock:
+       blk_crypto_hw_exit(profile);
+       return err;
+}
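As a sketch of the keyslot-less path above: for a profile created with zero keyslots, the driver's keyslot_evict hook is invoked with a slot index of -1 and only the key, which lets a stacked driver forward the eviction to the device(s) below it. The container struct and single underlying queue here are assumptions for illustration.

struct foo_stacked_dev {
        struct blk_crypto_profile profile;      /* initialized with 0 keyslots */
        struct request_queue *lower_q;
};

static int foo_stacked_keyslot_evict(struct blk_crypto_profile *profile,
                                     const struct blk_crypto_key *key,
                                     unsigned int slot)
{
        struct foo_stacked_dev *d =
                container_of(profile, struct foo_stacked_dev, profile);

        /* slot is -1 here because this profile has no keyslots of its own. */
        return blk_crypto_evict_key(d->lower_q, key);
}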
+
+/**
+ * blk_crypto_reprogram_all_keys() - Re-program all keyslots.
+ * @profile: The crypto profile
+ *
+ * Re-program all keyslots that are supposed to have a key programmed.  This is
+ * intended only for use by drivers for hardware that loses its keys on reset.
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ */
+void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile)
+{
+       unsigned int slot;
+
+       if (profile->num_slots == 0)
+               return;
+
+       /* This is for device initialization, so don't resume the device */
+       down_write(&profile->lock);
+       for (slot = 0; slot < profile->num_slots; slot++) {
+               const struct blk_crypto_key *key = profile->slots[slot].key;
+               int err;
+
+               if (!key)
+                       continue;
+
+               err = profile->ll_ops.keyslot_program(profile, key, slot);
+               WARN_ON(err);
+       }
+       up_write(&profile->lock);
+}
+EXPORT_SYMBOL_GPL(blk_crypto_reprogram_all_keys);
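The kernel-doc above limits this to hardware that loses its keys on reset, so a plausible call site is a driver's reset-recovery or resume path, sketched below with hypothetical helpers.

static int foo_host_reset_done(struct foo_host *host)
{
        int err = foo_reinit_hardware(host);    /* hypothetical */

        if (err)
                return err;

        /* The controller just lost its keyslots; restore every programmed key. */
        blk_crypto_reprogram_all_keys(&host->crypto_profile);
        return 0;
}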
+
+void blk_crypto_profile_destroy(struct blk_crypto_profile *profile)
+{
+       if (!profile)
+               return;
+       kvfree(profile->slot_hashtable);
+       kvfree_sensitive(profile->slots,
+                        sizeof(profile->slots[0]) * profile->num_slots);
+       memzero_explicit(profile, sizeof(*profile));
+}
+EXPORT_SYMBOL_GPL(blk_crypto_profile_destroy);
+
+bool blk_crypto_register(struct blk_crypto_profile *profile,
+                        struct request_queue *q)
+{
+       if (blk_integrity_queue_supports_integrity(q)) {
+               pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
+               return false;
+       }
+       q->crypto_profile = profile;
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_register);
+
+void blk_crypto_unregister(struct request_queue *q)
+{
+       q->crypto_profile = NULL;
+}
+
+/**
+ * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
+ *                                      by child device
+ * @parent: the crypto profile for the parent device
+ * @child: the crypto profile for the child device, or NULL
+ *
+ * This clears all crypto capabilities in @parent that aren't set in @child.  If
+ * @child is NULL, then this clears all parent capabilities.
+ *
+ * Only use this when setting up the crypto profile for a layered device,
+ * before it has been exposed.
+ */
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+                                      const struct blk_crypto_profile *child)
+{
+       if (child) {
+               unsigned int i;
+
+               parent->max_dun_bytes_supported =
+                       min(parent->max_dun_bytes_supported,
+                           child->max_dun_bytes_supported);
+               for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++)
+                       parent->modes_supported[i] &= child->modes_supported[i];
+       } else {
+               parent->max_dun_bytes_supported = 0;
+               memset(parent->modes_supported, 0,
+                      sizeof(parent->modes_supported));
+       }
+}
+EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities);
+
+/**
+ * blk_crypto_has_capabilities() - Check whether @target supports at least all
+ *                                the crypto capabilities that @reference does.
+ * @target: the target profile
+ * @reference: the reference profile
+ *
+ * Return: %true if @target supports all the crypto capabilities of @reference.
+ */
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+                                const struct blk_crypto_profile *reference)
+{
+       int i;
+
+       if (!reference)
+               return true;
+
+       if (!target)
+               return false;
+
+       for (i = 0; i < ARRAY_SIZE(target->modes_supported); i++) {
+               if (reference->modes_supported[i] & ~target->modes_supported[i])
+                       return false;
+       }
+
+       if (reference->max_dun_bytes_supported >
+           target->max_dun_bytes_supported)
+               return false;
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities);
+
+/**
+ * blk_crypto_update_capabilities() - Update the capabilities of a crypto
+ *                                   profile to match those of another crypto
+ *                                   profile.
+ * @dst: The crypto profile whose capabilities to update.
+ * @src: The crypto profile whose capabilities this function will update @dst's
+ *      capabilities to.
+ *
+ * Blk-crypto requires that crypto capabilities that were
+ * advertised when a bio was created continue to be supported by the
+ * device until that bio is ended. This in turn means that a device cannot
+ * shrink its advertised crypto capabilities without any explicit
+ * synchronization with upper layers. So if there's no such explicit
+ * synchronization, @src must support all the crypto capabilities that
+ * @dst does (i.e. we need blk_crypto_has_capabilities(@src, @dst)).
+ *
+ * Note also that as long as the crypto capabilities are being expanded, the
+ * order of updates becoming visible is not important because it's alright
+ * for blk-crypto to see stale values - they only cause blk-crypto to
+ * believe that a crypto capability isn't supported when it actually is (which
+ * might result in blk-crypto-fallback being used if available, or the bio being
+ * failed).
+ */
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+                                   const struct blk_crypto_profile *src)
+{
+       memcpy(dst->modes_supported, src->modes_supported,
+              sizeof(dst->modes_supported));
+
+       dst->max_dun_bytes_supported = src->max_dun_bytes_supported;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities);
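Taken together, blk_crypto_intersect_capabilities(), blk_crypto_has_capabilities() and blk_crypto_update_capabilities() give stacked drivers a pattern along these lines. Device-mapper is the obvious consumer, but the helper below is only a sketch under that assumption, not the dm code.

/* Sketch: recompute a stacked device's capabilities from its members. */
static void foo_rebuild_stacked_caps(struct blk_crypto_profile *parent,
                                     struct request_queue *member_qs[],
                                     int nr_members)
{
        struct blk_crypto_profile tmp;
        int i;

        /* Start from "everything supported", then intersect with each member. */
        blk_crypto_profile_init(&tmp, 0);
        memset(tmp.modes_supported, 0xFF, sizeof(tmp.modes_supported));
        tmp.max_dun_bytes_supported = UINT_MAX;

        for (i = 0; i < nr_members; i++)
                blk_crypto_intersect_capabilities(&tmp,
                                                  member_qs[i]->crypto_profile);

        /*
         * Only publish the result if it does not shrink what upper layers may
         * already rely on (see blk_crypto_update_capabilities() above).
         */
        if (blk_crypto_has_capabilities(&tmp, parent))
                blk_crypto_update_capabilities(parent, &tmp);
}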
index 103c2e2..ec9efee 100644 (file)
@@ -11,7 +11,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 
@@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio)
 
 blk_status_t __blk_crypto_init_request(struct request *rq)
 {
-       return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
-                                       &rq->crypt_keyslot);
+       return blk_crypto_get_keyslot(rq->q->crypto_profile,
+                                     rq->crypt_ctx->bc_key,
+                                     &rq->crypt_keyslot);
 }
 
 /**
@@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq)
  */
 void __blk_crypto_free_request(struct request *rq)
 {
-       blk_ksm_put_slot(rq->crypt_keyslot);
+       blk_crypto_put_keyslot(rq->crypt_keyslot);
        mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
        blk_crypto_rq_set_defaults(rq);
 }
@@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
 {
        struct bio *bio = *bio_ptr;
        const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
+       struct blk_crypto_profile *profile;
 
        /* Error if bio has no data. */
        if (WARN_ON_ONCE(!bio_has_data(bio))) {
@@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
         * Success if device supports the encryption context, or if we succeeded
         * in falling back to the crypto API.
         */
-       if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm,
-                                        &bc_key->crypto_cfg))
+       profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
+       if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
                return true;
 
        if (blk_crypto_fallback_bio_prep(bio_ptr))
@@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
                                 const struct blk_crypto_config *cfg)
 {
        return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
-              blk_ksm_crypto_cfg_supported(q->ksm, cfg);
+              __blk_crypto_cfg_supported(q->crypto_profile, cfg);
 }
 
 /**
@@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
 int blk_crypto_start_using_key(const struct blk_crypto_key *key,
                               struct request_queue *q)
 {
-       if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+       if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
                return 0;
        return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
 }
@@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
  * evicted from any hardware that it might have been programmed into.  The key
  * must not be in use by any in-flight IO when this function is called.
  *
- * Return: 0 on success or if key is not present in the q's ksm, -err on error.
+ * Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
  */
 int blk_crypto_evict_key(struct request_queue *q,
                         const struct blk_crypto_key *key)
 {
-       if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
-               return blk_ksm_evict_key(q->ksm, key);
+       if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
+               return __blk_crypto_evict_key(q->crypto_profile, key);
 
        /*
-        * If the request queue's associated inline encryption hardware didn't
-        * have support for the key, then the key might have been programmed
-        * into the fallback keyslot manager, so try to evict from there.
+        * If the request_queue didn't support the key, then blk-crypto-fallback
+        * may have been used, so try to evict the key from blk-crypto-fallback.
         */
        return blk_crypto_fallback_evict_key(key);
 }
index d6cd501..1b8b47f 100644 (file)
@@ -65,13 +65,19 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
 
 static bool blk_rq_is_poll(struct request *rq)
 {
-       return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL;
+       if (!rq->mq_hctx)
+               return false;
+       if (rq->mq_hctx->type != HCTX_TYPE_POLL)
+               return false;
+       if (WARN_ON_ONCE(!rq->bio))
+               return false;
+       return true;
 }
 
 static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
 {
        do {
-               blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), true);
+               bio_poll(rq->bio, NULL, 0);
                cond_resched();
        } while (!completion_done(wait));
 }
index 4201728..8e364bd 100644 (file)
@@ -379,7 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
  * @rq is being submitted.  Analyze what needs to be done and put it on the
  * right queue.
  */
-void blk_insert_flush(struct request *rq)
+bool blk_insert_flush(struct request *rq)
 {
        struct request_queue *q = rq->q;
        unsigned long fflags = q->queue_flags;  /* may change, cache */
@@ -409,7 +409,7 @@ void blk_insert_flush(struct request *rq)
         */
        if (!policy) {
                blk_mq_end_request(rq, 0);
-               return;
+               return true;
        }
 
        BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
@@ -420,10 +420,8 @@ void blk_insert_flush(struct request *rq)
         * for normal execution.
         */
        if ((policy & REQ_FSEQ_DATA) &&
-           !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-               blk_mq_request_bypass_insert(rq, false, false);
-               return;
-       }
+           !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))
+               return false;
 
        /*
         * @rq should go through flush machinery.  Mark it part of flush
@@ -439,6 +437,8 @@ void blk_insert_flush(struct request *rq)
        spin_lock_irq(&fq->mq_flush_lock);
        blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
        spin_unlock_irq(&fq->mq_flush_lock);
+
+       return true;
 }
 
 /**
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
new file mode 100644 (file)
index 0000000..c246c42
--- /dev/null
@@ -0,0 +1,348 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Block device concurrent positioning ranges.
+ *
+ *  Copyright (C) 2021 Western Digital Corporation or its Affiliates.
+ */
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#include "blk.h"
+
+static ssize_t
+blk_ia_range_sector_show(struct blk_independent_access_range *iar,
+                        char *buf)
+{
+       return sprintf(buf, "%llu\n", iar->sector);
+}
+
+static ssize_t
+blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar,
+                            char *buf)
+{
+       return sprintf(buf, "%llu\n", iar->nr_sectors);
+}
+
+struct blk_ia_range_sysfs_entry {
+       struct attribute attr;
+       ssize_t (*show)(struct blk_independent_access_range *iar, char *buf);
+};
+
+static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
+       .attr = { .name = "sector", .mode = 0444 },
+       .show = blk_ia_range_sector_show,
+};
+
+static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
+       .attr = { .name = "nr_sectors", .mode = 0444 },
+       .show = blk_ia_range_nr_sectors_show,
+};
+
+static struct attribute *blk_ia_range_attrs[] = {
+       &blk_ia_range_sector_entry.attr,
+       &blk_ia_range_nr_sectors_entry.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(blk_ia_range);
+
+static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj,
+                                     struct attribute *attr, char *buf)
+{
+       struct blk_ia_range_sysfs_entry *entry =
+               container_of(attr, struct blk_ia_range_sysfs_entry, attr);
+       struct blk_independent_access_range *iar =
+               container_of(kobj, struct blk_independent_access_range, kobj);
+       ssize_t ret;
+
+       mutex_lock(&iar->queue->sysfs_lock);
+       ret = entry->show(iar, buf);
+       mutex_unlock(&iar->queue->sysfs_lock);
+
+       return ret;
+}
+
+static const struct sysfs_ops blk_ia_range_sysfs_ops = {
+       .show   = blk_ia_range_sysfs_show,
+};
+
+/*
+ * Independent access range entries are not freed individually, but altogether
+ * with struct blk_independent_access_ranges and its array of ranges. Since
+ * kobject_add() takes a reference on the parent kobject contained in
+ * struct blk_independent_access_ranges, the array of independent access range
+ * entries cannot be freed until kobject_del() is called for all entries.
+ * So we do not need to do anything here, but still need this no-op release
+ * operation to avoid complaints from the kobject code.
+ */
+static void blk_ia_range_sysfs_nop_release(struct kobject *kobj)
+{
+}
+
+static struct kobj_type blk_ia_range_ktype = {
+       .sysfs_ops      = &blk_ia_range_sysfs_ops,
+       .default_groups = blk_ia_range_groups,
+       .release        = blk_ia_range_sysfs_nop_release,
+};
+
+/*
+ * This will be executed only after all independent access range entries are
+ * removed with kobject_del(), at which point, it is safe to free everything,
+ * including the array of ranges.
+ */
+static void blk_ia_ranges_sysfs_release(struct kobject *kobj)
+{
+       struct blk_independent_access_ranges *iars =
+               container_of(kobj, struct blk_independent_access_ranges, kobj);
+
+       kfree(iars);
+}
+
+static struct kobj_type blk_ia_ranges_ktype = {
+       .release        = blk_ia_ranges_sysfs_release,
+};
+
+/**
+ * disk_register_independent_access_ranges - register with sysfs a set of
+ *                                            independent access ranges
+ * @disk:      Target disk
+ * @new_iars:  New set of independent access ranges
+ *
+ * Register with sysfs a set of independent access ranges for @disk.
+ * If @new_iars is not NULL, this set of ranges is registered and the old set
+ * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is
+ * registered if it is not already.
+ */
+int disk_register_independent_access_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *new_iars)
+{
+       struct request_queue *q = disk->queue;
+       struct blk_independent_access_ranges *iars;
+       int i, ret;
+
+       lockdep_assert_held(&q->sysfs_dir_lock);
+       lockdep_assert_held(&q->sysfs_lock);
+
+       /* If a new range set is specified, unregister the old one */
+       if (new_iars) {
+               if (q->ia_ranges)
+                       disk_unregister_independent_access_ranges(disk);
+               q->ia_ranges = new_iars;
+       }
+
+       iars = q->ia_ranges;
+       if (!iars)
+               return 0;
+
+       /*
+        * At this point, iars is the new set of sector access ranges that needs
+        * to be registered with sysfs.
+        */
+       WARN_ON(iars->sysfs_registered);
+       ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
+                                  &q->kobj, "%s", "independent_access_ranges");
+       if (ret) {
+               q->ia_ranges = NULL;
+               kfree(iars);
+               return ret;
+       }
+
+       for (i = 0; i < iars->nr_ia_ranges; i++) {
+               iars->ia_range[i].queue = q;
+               ret = kobject_init_and_add(&iars->ia_range[i].kobj,
+                                          &blk_ia_range_ktype, &iars->kobj,
+                                          "%d", i);
+               if (ret) {
+                       while (--i >= 0)
+                               kobject_del(&iars->ia_range[i].kobj);
+                       kobject_del(&iars->kobj);
+                       kobject_put(&iars->kobj);
+                       return ret;
+               }
+       }
+
+       iars->sysfs_registered = true;
+
+       return 0;
+}
+
+void disk_unregister_independent_access_ranges(struct gendisk *disk)
+{
+       struct request_queue *q = disk->queue;
+       struct blk_independent_access_ranges *iars = q->ia_ranges;
+       int i;
+
+       lockdep_assert_held(&q->sysfs_dir_lock);
+       lockdep_assert_held(&q->sysfs_lock);
+
+       if (!iars)
+               return;
+
+       if (iars->sysfs_registered) {
+               for (i = 0; i < iars->nr_ia_ranges; i++)
+                       kobject_del(&iars->ia_range[i].kobj);
+               kobject_del(&iars->kobj);
+               kobject_put(&iars->kobj);
+       } else {
+               kfree(iars);
+       }
+
+       q->ia_ranges = NULL;
+}
+
+static struct blk_independent_access_range *
+disk_find_ia_range(struct blk_independent_access_ranges *iars,
+                 sector_t sector)
+{
+       struct blk_independent_access_range *iar;
+       int i;
+
+       for (i = 0; i < iars->nr_ia_ranges; i++) {
+               iar = &iars->ia_range[i];
+               if (sector >= iar->sector &&
+                   sector < iar->sector + iar->nr_sectors)
+                       return iar;
+       }
+
+       return NULL;
+}
+
+static bool disk_check_ia_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *iars)
+{
+       struct blk_independent_access_range *iar, *tmp;
+       sector_t capacity = get_capacity(disk);
+       sector_t sector = 0;
+       int i;
+
+       /*
+        * While sorting the ranges in increasing LBA order, check that the
+        * ranges do not overlap, that there are no sector holes and that all
+        * sectors belong to one range.
+        */
+       for (i = 0; i < iars->nr_ia_ranges; i++) {
+               tmp = disk_find_ia_range(iars, sector);
+               if (!tmp || tmp->sector != sector) {
+                       pr_warn("Invalid non-contiguous independent access ranges\n");
+                       return false;
+               }
+
+               iar = &iars->ia_range[i];
+               if (tmp != iar) {
+                       swap(iar->sector, tmp->sector);
+                       swap(iar->nr_sectors, tmp->nr_sectors);
+               }
+
+               sector += iar->nr_sectors;
+       }
+
+       if (sector != capacity) {
+               pr_warn("Independent access ranges do not match disk capacity\n");
+               return false;
+       }
+
+       return true;
+}
+
+static bool disk_ia_ranges_changed(struct gendisk *disk,
+                                  struct blk_independent_access_ranges *new)
+{
+       struct blk_independent_access_ranges *old = disk->queue->ia_ranges;
+       int i;
+
+       if (!old)
+               return true;
+
+       if (old->nr_ia_ranges != new->nr_ia_ranges)
+               return true;
+
+       for (i = 0; i < old->nr_ia_ranges; i++) {
+               if (new->ia_range[i].sector != old->ia_range[i].sector ||
+                   new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors)
+                       return true;
+       }
+
+       return false;
+}
+
+/**
+ * disk_alloc_independent_access_ranges - Allocate an independent access ranges
+ *                                        data structure
+ * @disk:              target disk
+ * @nr_ia_ranges:      Number of independent access ranges
+ *
+ * Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges
+ * access range descriptors.
+ */
+struct blk_independent_access_ranges *
+disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges)
+{
+       struct blk_independent_access_ranges *iars;
+
+       iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges),
+                           GFP_KERNEL, disk->queue->node);
+       if (iars)
+               iars->nr_ia_ranges = nr_ia_ranges;
+       return iars;
+}
+EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges);
+
+/**
+ * disk_set_independent_access_ranges - Set a disk independent access ranges
+ * @disk:      target disk
+ * @iars:      independent access ranges structure
+ *
+ * Set the independent access ranges information of the request queue
+ * of @disk to @iars. If @iars is NULL, the independent access ranges
+ * structure already set is cleared. If there are no differences between
+ * @iars and the independent access ranges structure already set, @iars
+ * is freed.
+ */
+void disk_set_independent_access_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *iars)
+{
+       struct request_queue *q = disk->queue;
+
+       if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) {
+               kfree(iars);
+               iars = NULL;
+       }
+
+       mutex_lock(&q->sysfs_dir_lock);
+       mutex_lock(&q->sysfs_lock);
+
+       if (iars) {
+               if (!disk_check_ia_ranges(disk, iars)) {
+                       kfree(iars);
+                       iars = NULL;
+                       goto reg;
+               }
+
+               if (!disk_ia_ranges_changed(disk, iars)) {
+                       kfree(iars);
+                       goto unlock;
+               }
+       }
+
+       /*
+        * This may be called for a registered queue. E.g. during a device
+        * revalidation. If that is the case, we need to unregister the old
+        * set of independent access ranges and register the new set. If the
+        * queue is not registered, registration of the device request queue
+        * will register the independent access ranges, so only swap in the
+        * new set and free the old one.
+        */
+reg:
+       if (blk_queue_registered(q)) {
+               disk_register_independent_access_ranges(disk, iars);
+       } else {
+               swap(q->ia_ranges, iars);
+               kfree(iars);
+       }
+
+unlock:
+       mutex_unlock(&q->sysfs_lock);
+       mutex_unlock(&q->sysfs_dir_lock);
+}
+EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges);
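For reference, a driver-side sketch of the allocate-then-set flow described by the kernel-doc above. The even two-range split is an arbitrary example; a real driver would derive the layout from what the device reports, and the ranges must cover the whole capacity without holes to pass disk_check_ia_ranges().

/* Sketch: advertise two independent access ranges covering the whole disk. */
static void foo_set_ia_ranges(struct gendisk *disk)
{
        sector_t half = get_capacity(disk) / 2;
        struct blk_independent_access_ranges *iars;

        iars = disk_alloc_independent_access_ranges(disk, 2);
        if (!iars)
                return;

        iars->ia_range[0].sector = 0;
        iars->ia_range[0].nr_sectors = half;
        iars->ia_range[1].sector = half;
        iars->ia_range[1].nr_sectors = get_capacity(disk) - half;

        /* Ownership of iars passes to the block layer from here on. */
        disk_set_independent_access_ranges(disk, iars);
}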
index 16d5d53..d670d54 100644 (file)
@@ -6,7 +6,7 @@
  * Written by: Martin K. Petersen <martin.petersen@oracle.com>
  */
 
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/backing-dev.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
@@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
        blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-       if (disk->queue->ksm) {
+       if (disk->queue->crypto_profile) {
                pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
-               blk_ksm_unregister(disk->queue);
+               blk_crypto_unregister(disk->queue);
        }
 #endif
 }
index b3880e4..a5b37cc 100644 (file)
@@ -3165,12 +3165,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
 
-       ioc = q_to_ioc(bdev->bd_disk->queue);
+       ioc = q_to_ioc(bdev_get_queue(bdev));
        if (!ioc) {
-               ret = blk_iocost_init(bdev->bd_disk->queue);
+               ret = blk_iocost_init(bdev_get_queue(bdev));
                if (ret)
                        goto err;
-               ioc = q_to_ioc(bdev->bd_disk->queue);
+               ioc = q_to_ioc(bdev_get_queue(bdev));
        }
 
        spin_lock_irq(&ioc->lock);
@@ -3332,12 +3332,12 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
 
-       ioc = q_to_ioc(bdev->bd_disk->queue);
+       ioc = q_to_ioc(bdev_get_queue(bdev));
        if (!ioc) {
-               ret = blk_iocost_init(bdev->bd_disk->queue);
+               ret = blk_iocost_init(bdev_get_queue(bdev));
                if (ret)
                        goto err;
-               ioc = q_to_ioc(bdev->bd_disk->queue);
+               ioc = q_to_ioc(bdev_get_queue(bdev));
        }
 
        spin_lock_irq(&ioc->lock);
index c0545f9..6593c71 100644 (file)
@@ -74,6 +74,7 @@
 #include <linux/sched/signal.h>
 #include <trace/events/block.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
 #include "blk-rq-qos.h"
 #include "blk-stat.h"
 #include "blk.h"
index 7a5c81c..df69f4b 100644 (file)
@@ -6,12 +6,45 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/scatterlist.h>
 
 #include <trace/events/block.h>
 
 #include "blk.h"
 #include "blk-rq-qos.h"
+#include "blk-throttle.h"
+
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+       *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+       struct bvec_iter iter = bio->bi_iter;
+       int idx;
+
+       bio_get_first_bvec(bio, bv);
+       if (bv->bv_len == bio->bi_iter.bi_size)
+               return;         /* this bio only has a single bvec */
+
+       bio_advance_iter(bio, &iter, iter.bi_size);
+
+       if (!iter.bi_bvec_done)
+               idx = iter.bi_idx - 1;
+       else    /* in the middle of bvec */
+               idx = iter.bi_idx;
+
+       *bv = bio->bi_io_vec[idx];
+
+       /*
+        * iter.bi_bvec_done records actual length of the last bvec
+        * if this bio ends in the middle of one io vector
+        */
+       if (iter.bi_bvec_done)
+               bv->bv_len = iter.bi_bvec_done;
+}
 
 static inline bool bio_will_gap(struct request_queue *q,
                struct request *prev_rq, struct bio *prev, struct bio *next)
@@ -285,13 +318,13 @@ split:
         * iopoll in direct IO routine. Given performance gain of iopoll for
 * big IO can be trivial, disable iopoll when split needed.
         */
-       bio_clear_hipri(bio);
-
+       bio_clear_polled(bio);
        return bio_split(bio, sectors, GFP_NOIO, bs);
 }
 
 /**
  * __blk_queue_split - split a bio and submit the second half
+ * @q:       [in] request_queue new bio is being queued at
  * @bio:     [in, out] bio to be split
  * @nr_segs: [out] number of segments in the first bio
  *
@@ -302,9 +335,9 @@ split:
  * of the caller to ensure that q->bio_split is only released after processing
  * of the split bio has finished.
  */
-void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+                      unsigned int *nr_segs)
 {
-       struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
        struct bio *split = NULL;
 
        switch (bio_op(*bio)) {
@@ -321,21 +354,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
                                nr_segs);
                break;
        default:
-               /*
-                * All drivers must accept single-segments bios that are <=
-                * PAGE_SIZE.  This is a quick and dirty check that relies on
-                * the fact that bi_io_vec[0] is always valid if a bio has data.
-                * The check might lead to occasional false negatives when bios
-                * are cloned, but compared to the performance impact of cloned
-                * bios themselves the loop below doesn't matter anyway.
-                */
-               if (!q->limits.chunk_sectors &&
-                   (*bio)->bi_vcnt == 1 &&
-                   ((*bio)->bi_io_vec[0].bv_len +
-                    (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
-                       *nr_segs = 1;
-                       break;
-               }
                split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
                break;
        }
@@ -365,9 +383,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
  */
 void blk_queue_split(struct bio **bio)
 {
+       struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
        unsigned int nr_segs;
 
-       __blk_queue_split(bio, &nr_segs);
+       if (blk_may_split(q, *bio))
+               __blk_queue_split(q, bio, &nr_segs);
 }
 EXPORT_SYMBOL(blk_queue_split);
 
@@ -558,6 +578,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
        return queue_max_segments(rq->q);
 }
 
+static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
+                                                 sector_t offset)
+{
+       struct request_queue *q = rq->q;
+
+       if (blk_rq_is_passthrough(rq))
+               return q->limits.max_hw_sectors;
+
+       if (!q->limits.chunk_sectors ||
+           req_op(rq) == REQ_OP_DISCARD ||
+           req_op(rq) == REQ_OP_SECURE_ERASE)
+               return blk_queue_get_max_sectors(q, req_op(rq));
+
+       return min(blk_max_size_offset(q, offset, 0),
+                       blk_queue_get_max_sectors(q, req_op(rq)));
+}
+
 static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
                unsigned int nr_phys_segs)
 {
@@ -718,6 +755,13 @@ static enum elv_merge blk_try_req_merge(struct request *req,
        return ELEVATOR_NO_MERGE;
 }
 
+static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
+{
+       if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
+               return true;
+       return false;
+}
+
 /*
  * For non-mq, this has to be called with the request spinlock acquired.
  * For mq with scheduling, the appropriate queue wide lock should be held.
@@ -1023,12 +1067,11 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
  * @q: request_queue new bio is being queued at
  * @bio: new bio being queued
  * @nr_segs: number of segments in @bio
- * @same_queue_rq: pointer to &struct request that gets filled in when
- * another request associated with @q is found on the plug list
- * (optional, may be %NULL)
+ * @same_queue_rq: output value, will be true if there's an existing request
+ * from the passed in @q already in the plug list
  *
- * Determine whether @bio being queued on @q can be merged with a request
- * on %current's plugged list.  Returns %true if merge was successful,
+ * Determine whether @bio being queued on @q can be merged with the previous
+ * request on %current's plugged list.  Returns %true if merge was successful,
  * otherwise %false.
  *
  * Plugging coalesces IOs from the same issuer for the same purpose without
@@ -1041,36 +1084,26 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
  * Caller must ensure !blk_queue_nomerges(q) beforehand.
  */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-               unsigned int nr_segs, struct request **same_queue_rq)
+               unsigned int nr_segs, bool *same_queue_rq)
 {
        struct blk_plug *plug;
        struct request *rq;
-       struct list_head *plug_list;
 
        plug = blk_mq_plug(q, bio);
-       if (!plug)
+       if (!plug || rq_list_empty(plug->mq_list))
                return false;
 
-       plug_list = &plug->mq_list;
-
-       list_for_each_entry_reverse(rq, plug_list, queuelist) {
-               if (rq->q == q && same_queue_rq) {
-                       /*
-                        * Only blk-mq multiple hardware queues case checks the
-                        * rq in the same queue, there should be only one such
-                        * rq in a queue
-                        **/
-                       *same_queue_rq = rq;
-               }
-
-               if (rq->q != q)
-                       continue;
-
-               if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
-                   BIO_MERGE_OK)
-                       return true;
+       /* check the previously added entry for a quick merge attempt */
+       rq = rq_list_peek(&plug->mq_list);
+       if (rq->q == q) {
+               /*
+                * Only the blk-mq multiple hardware queues case checks the
+                * rq in the same queue; there should be only one such rq in a queue.
+                */
+               *same_queue_rq = true;
        }
-
+       if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK)
+               return true;
        return false;
 }
 
index 3b38d15..f5076c1 100644 (file)
@@ -124,7 +124,6 @@ static const char *const blk_queue_flag_name[] = {
        QUEUE_FLAG_NAME(STATS),
        QUEUE_FLAG_NAME(POLL_STATS),
        QUEUE_FLAG_NAME(REGISTERED),
-       QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
        QUEUE_FLAG_NAME(QUIESCED),
        QUEUE_FLAG_NAME(PCI_P2PDMA),
        QUEUE_FLAG_NAME(ZONE_RESETALL),
@@ -287,7 +286,7 @@ static const char *const cmd_flag_name[] = {
        CMD_FLAG_NAME(BACKGROUND),
        CMD_FLAG_NAME(NOWAIT),
        CMD_FLAG_NAME(NOUNMAP),
-       CMD_FLAG_NAME(HIPRI),
+       CMD_FLAG_NAME(POLLED),
 };
 #undef CMD_FLAG_NAME
 
@@ -453,11 +452,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
                   atomic_read(&tags->active_queues));
 
        seq_puts(m, "\nbitmap_tags:\n");
-       sbitmap_queue_show(tags->bitmap_tags, m);
+       sbitmap_queue_show(&tags->bitmap_tags, m);
 
        if (tags->nr_reserved_tags) {
                seq_puts(m, "\nbreserved_tags:\n");
-               sbitmap_queue_show(tags->breserved_tags, m);
+               sbitmap_queue_show(&tags->breserved_tags, m);
        }
 }
 
@@ -488,7 +487,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
        if (res)
                goto out;
        if (hctx->tags)
-               sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
+               sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
        mutex_unlock(&q->sysfs_lock);
 
 out:
@@ -522,77 +521,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
        if (res)
                goto out;
        if (hctx->sched_tags)
-               sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
+               sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
        mutex_unlock(&q->sysfs_lock);
 
 out:
        return res;
 }
 
-static int hctx_io_poll_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-
-       seq_printf(m, "considered=%lu\n", hctx->poll_considered);
-       seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
-       seq_printf(m, "success=%lu\n", hctx->poll_success);
-       return 0;
-}
-
-static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-
-       hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
-       return count;
-}
-
-static int hctx_dispatched_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-       int i;
-
-       seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
-
-       for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
-               unsigned int d = 1U << (i - 1);
-
-               seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
-       }
-
-       seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
-       return 0;
-}
-
-static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
-                                    size_t count, loff_t *ppos)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-       int i;
-
-       for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
-               hctx->dispatched[i] = 0;
-       return count;
-}
-
-static int hctx_queued_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-
-       seq_printf(m, "%lu\n", hctx->queued);
-       return 0;
-}
-
-static ssize_t hctx_queued_write(void *data, const char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-
-       hctx->queued = 0;
-       return count;
-}
-
 static int hctx_run_show(void *data, struct seq_file *m)
 {
        struct blk_mq_hw_ctx *hctx = data;
@@ -614,7 +549,7 @@ static int hctx_active_show(void *data, struct seq_file *m)
 {
        struct blk_mq_hw_ctx *hctx = data;
 
-       seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
+       seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
        return 0;
 }
 
@@ -663,57 +598,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
 CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
 CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
 
-static int ctx_dispatched_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
-       return 0;
-}
-
-static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
-                                   size_t count, loff_t *ppos)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
-       return count;
-}
-
-static int ctx_merged_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       seq_printf(m, "%lu\n", ctx->rq_merged);
-       return 0;
-}
-
-static ssize_t ctx_merged_write(void *data, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       ctx->rq_merged = 0;
-       return count;
-}
-
-static int ctx_completed_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
-       return 0;
-}
-
-static ssize_t ctx_completed_write(void *data, const char __user *buf,
-                                  size_t count, loff_t *ppos)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
-       return count;
-}
-
 static int blk_mq_debugfs_show(struct seq_file *m, void *v)
 {
        const struct blk_mq_debugfs_attr *attr = m->private;
@@ -789,9 +673,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
        {"tags_bitmap", 0400, hctx_tags_bitmap_show},
        {"sched_tags", 0400, hctx_sched_tags_show},
        {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
-       {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
-       {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
-       {"queued", 0600, hctx_queued_show, hctx_queued_write},
        {"run", 0600, hctx_run_show, hctx_run_write},
        {"active", 0400, hctx_active_show},
        {"dispatch_busy", 0400, hctx_dispatch_busy_show},
@@ -803,9 +684,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
        {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
        {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
        {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
-       {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
-       {"merged", 0600, ctx_merged_show, ctx_merged_write},
-       {"completed", 0600, ctx_completed_show, ctx_completed_write},
        {},
 };
 
index 0f006ca..c62b966 100644 (file)
@@ -57,10 +57,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
 
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
-       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-               return;
        clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 
        /*
@@ -363,7 +361,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
        }
 }
 
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
 {
        struct elevator_queue *e = q->elevator;
@@ -389,13 +387,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
         * potentially merge with. Currently includes a hand-wavy stop
         * count of 8, to not spend too much time checking for merges.
         */
-       if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
-               ctx->rq_merged++;
+       if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
                ret = true;
-       }
 
        spin_unlock(&ctx->lock);
-
        return ret;
 }
 
@@ -515,83 +510,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
        percpu_ref_put(&q->q_usage_counter);
 }
 
-static int blk_mq_sched_alloc_tags(struct request_queue *q,
-                                  struct blk_mq_hw_ctx *hctx,
-                                  unsigned int hctx_idx)
+static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
+                                         struct blk_mq_hw_ctx *hctx,
+                                         unsigned int hctx_idx)
 {
-       struct blk_mq_tag_set *set = q->tag_set;
-       int ret;
+       if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+               hctx->sched_tags = q->sched_shared_tags;
+               return 0;
+       }
+
+       hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
+                                                   q->nr_requests);
 
-       hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
-                                              set->reserved_tags, set->flags);
        if (!hctx->sched_tags)
                return -ENOMEM;
+       return 0;
+}
 
-       ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
-       if (ret) {
-               blk_mq_free_rq_map(hctx->sched_tags, set->flags);
-               hctx->sched_tags = NULL;
-       }
-
-       return ret;
+static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
+{
+       blk_mq_free_rq_map(queue->sched_shared_tags);
+       queue->sched_shared_tags = NULL;
 }
 
 /* called in queue's release handler, tagset has gone away */
-static void blk_mq_sched_tags_teardown(struct request_queue *q)
+static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
 {
        struct blk_mq_hw_ctx *hctx;
        int i;
 
        queue_for_each_hw_ctx(q, hctx, i) {
                if (hctx->sched_tags) {
-                       blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
+                       if (!blk_mq_is_shared_tags(flags))
+                               blk_mq_free_rq_map(hctx->sched_tags);
                        hctx->sched_tags = NULL;
                }
        }
+
+       if (blk_mq_is_shared_tags(flags))
+               blk_mq_exit_sched_shared_tags(q);
 }
 
-static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
+static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
 {
        struct blk_mq_tag_set *set = queue->tag_set;
-       int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
-       struct blk_mq_hw_ctx *hctx;
-       int ret, i;
 
        /*
         * Set initial depth at max so that we don't need to reallocate for
         * updating nr_requests.
         */
-       ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
-                                 &queue->sched_breserved_tags,
-                                 MAX_SCHED_RQ, set->reserved_tags,
-                                 set->numa_node, alloc_policy);
-       if (ret)
-               return ret;
-
-       queue_for_each_hw_ctx(queue, hctx, i) {
-               hctx->sched_tags->bitmap_tags =
-                                       &queue->sched_bitmap_tags;
-               hctx->sched_tags->breserved_tags =
-                                       &queue->sched_breserved_tags;
-       }
+       queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
+                                               BLK_MQ_NO_HCTX_IDX,
+                                               MAX_SCHED_RQ);
+       if (!queue->sched_shared_tags)
+               return -ENOMEM;
 
-       sbitmap_queue_resize(&queue->sched_bitmap_tags,
-                            queue->nr_requests - set->reserved_tags);
+       blk_mq_tag_update_sched_shared_tags(queue);
 
        return 0;
 }
 
-static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
-{
-       sbitmap_queue_free(&queue->sched_bitmap_tags);
-       sbitmap_queue_free(&queue->sched_breserved_tags);
-}
-
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 {
+       unsigned int i, flags = q->tag_set->flags;
        struct blk_mq_hw_ctx *hctx;
        struct elevator_queue *eq;
-       unsigned int i;
        int ret;
 
        if (!e) {
@@ -606,23 +589,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
         * Additionally, this is a per-hw queue depth.
         */
        q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
-                                  BLKDEV_MAX_RQ);
+                                  BLKDEV_DEFAULT_RQ);
 
-       queue_for_each_hw_ctx(q, hctx, i) {
-               ret = blk_mq_sched_alloc_tags(q, hctx, i);
+       if (blk_mq_is_shared_tags(flags)) {
+               ret = blk_mq_init_sched_shared_tags(q);
                if (ret)
-                       goto err_free_tags;
+                       return ret;
        }
 
-       if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
-               ret = blk_mq_init_sched_shared_sbitmap(q);
+       queue_for_each_hw_ctx(q, hctx, i) {
+               ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
                if (ret)
-                       goto err_free_tags;
+                       goto err_free_map_and_rqs;
        }
 
        ret = e->ops.init_sched(q, e);
        if (ret)
-               goto err_free_sbitmap;
+               goto err_free_map_and_rqs;
 
        blk_mq_debugfs_register_sched(q);
 
@@ -631,7 +614,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
                        ret = e->ops.init_hctx(hctx, i);
                        if (ret) {
                                eq = q->elevator;
-                               blk_mq_sched_free_requests(q);
+                               blk_mq_sched_free_rqs(q);
                                blk_mq_exit_sched(q, eq);
                                kobject_put(&eq->kobj);
                                return ret;
@@ -642,12 +625,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 
        return 0;
 
-err_free_sbitmap:
-       if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
-               blk_mq_exit_sched_shared_sbitmap(q);
-err_free_tags:
-       blk_mq_sched_free_requests(q);
-       blk_mq_sched_tags_teardown(q);
+err_free_map_and_rqs:
+       blk_mq_sched_free_rqs(q);
+       blk_mq_sched_tags_teardown(q, flags);
+
        q->elevator = NULL;
        return ret;
 }
@@ -656,14 +637,20 @@ err_free_tags:
  * called in either blk_queue_cleanup or elevator_switch, tagset
  * is required for freeing requests
  */
-void blk_mq_sched_free_requests(struct request_queue *q)
+void blk_mq_sched_free_rqs(struct request_queue *q)
 {
        struct blk_mq_hw_ctx *hctx;
        int i;
 
-       queue_for_each_hw_ctx(q, hctx, i) {
-               if (hctx->sched_tags)
-                       blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+       if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+               blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
+                               BLK_MQ_NO_HCTX_IDX);
+       } else {
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (hctx->sched_tags)
+                               blk_mq_free_rqs(q->tag_set,
+                                               hctx->sched_tags, i);
+               }
        }
 }
 
@@ -684,8 +671,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
        blk_mq_debugfs_unregister_sched(q);
        if (e->type->ops.exit_sched)
                e->type->ops.exit_sched(e);
-       blk_mq_sched_tags_teardown(q);
-       if (blk_mq_is_sbitmap_shared(flags))
-               blk_mq_exit_sched_shared_sbitmap(q);
+       blk_mq_sched_tags_teardown(q, flags);
        q->elevator = NULL;
 }
index 5246ae0..25d1034 100644 (file)
@@ -2,21 +2,22 @@
 #ifndef BLK_MQ_SCHED_H
 #define BLK_MQ_SCHED_H
 
+#include "elevator.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
+#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
 
 void blk_mq_sched_assign_ioc(struct request *rq);
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs, struct request **merged_request);
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
                                   struct list_head *free);
 void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async);
@@ -28,45 +29,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
-void blk_mq_sched_free_requests(struct request_queue *q);
+void blk_mq_sched_free_rqs(struct request_queue *q);
 
-static inline bool
-blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
-               unsigned int nr_segs)
+static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_queue_nomerges(q) || !bio_mergeable(bio))
-               return false;
+       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+               __blk_mq_sched_restart(hctx);
+}
 
-       return __blk_mq_sched_bio_merge(q, bio, nr_segs);
+static inline bool bio_mergeable(struct bio *bio)
+{
+       return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
 }
 
 static inline bool
 blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
                         struct bio *bio)
 {
-       struct elevator_queue *e = q->elevator;
-
-       if (e && e->type->ops.allow_merge)
-               return e->type->ops.allow_merge(q, rq, bio);
+       if (rq->rq_flags & RQF_ELV) {
+               struct elevator_queue *e = q->elevator;
 
+               if (e->type->ops.allow_merge)
+                       return e->type->ops.allow_merge(q, rq, bio);
+       }
        return true;
 }
 
 static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
 {
-       struct elevator_queue *e = rq->q->elevator;
+       if (rq->rq_flags & RQF_ELV) {
+               struct elevator_queue *e = rq->q->elevator;
 
-       if (e && e->type->ops.completed_request)
-               e->type->ops.completed_request(rq, now);
+               if (e->type->ops.completed_request)
+                       e->type->ops.completed_request(rq, now);
+       }
 }
 
 static inline void blk_mq_sched_requeue_request(struct request *rq)
 {
-       struct request_queue *q = rq->q;
-       struct elevator_queue *e = q->elevator;
+       if (rq->rq_flags & RQF_ELV) {
+               struct request_queue *q = rq->q;
+               struct elevator_queue *e = q->elevator;
 
-       if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request)
-               e->type->ops.requeue_request(rq);
+               if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
+                       e->type->ops.requeue_request(rq);
+       }
 }
 
 static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
index ff5caeb..995336a 100644 (file)
  */
 bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+       if (blk_mq_is_shared_tags(hctx->flags)) {
                struct request_queue *q = hctx->queue;
-               struct blk_mq_tag_set *set = q->tag_set;
 
                if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
                    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
-                       atomic_inc(&set->active_queues_shared_sbitmap);
+                       atomic_inc(&hctx->tags->active_queues);
        } else {
                if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
                    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
@@ -45,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
  */
 void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 {
-       sbitmap_queue_wake_all(tags->bitmap_tags);
+       sbitmap_queue_wake_all(&tags->bitmap_tags);
        if (include_reserve)
-               sbitmap_queue_wake_all(tags->breserved_tags);
+               sbitmap_queue_wake_all(&tags->breserved_tags);
 }
 
 /*
@@ -57,20 +56,20 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
        struct blk_mq_tags *tags = hctx->tags;
-       struct request_queue *q = hctx->queue;
-       struct blk_mq_tag_set *set = q->tag_set;
 
-       if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+       if (blk_mq_is_shared_tags(hctx->flags)) {
+               struct request_queue *q = hctx->queue;
+
                if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
                                        &q->queue_flags))
                        return;
-               atomic_dec(&set->active_queues_shared_sbitmap);
        } else {
                if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        return;
-               atomic_dec(&tags->active_queues);
        }
 
+       atomic_dec(&tags->active_queues);
+
        blk_mq_tag_wakeup_all(tags, false);
 }
 
@@ -87,6 +86,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                return __sbitmap_queue_get(bt);
 }
 
+unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
+                             unsigned int *offset)
+{
+       struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+       struct sbitmap_queue *bt = &tags->bitmap_tags;
+       unsigned long ret;
+
+       if (data->shallow_depth || data->flags & BLK_MQ_REQ_RESERVED ||
+           data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+               return 0;
+       ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
+       *offset += tags->nr_reserved_tags;
+       return ret;
+}
+
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -101,10 +115,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                        WARN_ON_ONCE(1);
                        return BLK_MQ_NO_TAG;
                }
-               bt = tags->breserved_tags;
+               bt = &tags->breserved_tags;
                tag_offset = 0;
        } else {
-               bt = tags->bitmap_tags;
+               bt = &tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
        }
 
@@ -150,9 +164,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                                                data->ctx);
                tags = blk_mq_tags_from_data(data);
                if (data->flags & BLK_MQ_REQ_RESERVED)
-                       bt = tags->breserved_tags;
+                       bt = &tags->breserved_tags;
                else
-                       bt = tags->bitmap_tags;
+                       bt = &tags->bitmap_tags;
 
                /*
                 * If destination hw queue is changed, fake wake up on
@@ -186,13 +200,19 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                const int real_tag = tag - tags->nr_reserved_tags;
 
                BUG_ON(real_tag >= tags->nr_tags);
-               sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
+               sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
        } else {
                BUG_ON(tag >= tags->nr_reserved_tags);
-               sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
+               sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
        }
 }
 
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
+{
+       sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
+                                       tag_array, nr_tags);
+}
+
 struct bt_iter_data {
        struct blk_mq_hw_ctx *hctx;
        busy_iter_fn *fn;
@@ -340,9 +360,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
        WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
 
        if (tags->nr_reserved_tags)
-               bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
+               bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
                                 flags | BT_TAG_ITER_RESERVED);
-       bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
+       bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
 }
 
 /**
@@ -379,9 +399,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv)
 {
-       int i;
+       unsigned int flags = tagset->flags;
+       int i, nr_tags;
+
+       nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
 
-       for (i = 0; i < tagset->nr_hw_queues; i++) {
+       for (i = 0; i < nr_tags; i++) {
                if (tagset->tags && tagset->tags[i])
                        __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
                                              BT_TAG_ITER_STARTED);
@@ -459,8 +482,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                        continue;
 
                if (tags->nr_reserved_tags)
-                       bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
-               bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
+                       bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
+               bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
        }
        blk_queue_exit(q);
 }
@@ -492,56 +515,10 @@ free_bitmap_tags:
        return -ENOMEM;
 }
 
-static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
-                                  int node, int alloc_policy)
-{
-       int ret;
-
-       ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
-                                 &tags->__breserved_tags,
-                                 tags->nr_tags, tags->nr_reserved_tags,
-                                 node, alloc_policy);
-       if (ret)
-               return ret;
-
-       tags->bitmap_tags = &tags->__bitmap_tags;
-       tags->breserved_tags = &tags->__breserved_tags;
-
-       return 0;
-}
-
-int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
-{
-       int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
-       int i, ret;
-
-       ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
-                                 set->queue_depth, set->reserved_tags,
-                                 set->numa_node, alloc_policy);
-       if (ret)
-               return ret;
-
-       for (i = 0; i < set->nr_hw_queues; i++) {
-               struct blk_mq_tags *tags = set->tags[i];
-
-               tags->bitmap_tags = &set->__bitmap_tags;
-               tags->breserved_tags = &set->__breserved_tags;
-       }
-
-       return 0;
-}
-
-void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
-{
-       sbitmap_queue_free(&set->__bitmap_tags);
-       sbitmap_queue_free(&set->__breserved_tags);
-}
-
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags,
-                                    int node, unsigned int flags)
+                                    int node, int alloc_policy)
 {
-       int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
        struct blk_mq_tags *tags;
 
        if (total_tags > BLK_MQ_TAG_MAX) {
@@ -557,22 +534,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
        tags->nr_reserved_tags = reserved_tags;
        spin_lock_init(&tags->lock);
 
-       if (blk_mq_is_sbitmap_shared(flags))
-               return tags;
-
-       if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
+       if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
+                               total_tags, reserved_tags, node,
+                               alloc_policy) < 0) {
                kfree(tags);
                return NULL;
        }
        return tags;
 }
 
-void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
-       if (!blk_mq_is_sbitmap_shared(flags)) {
-               sbitmap_queue_free(tags->bitmap_tags);
-               sbitmap_queue_free(tags->breserved_tags);
-       }
+       sbitmap_queue_free(&tags->bitmap_tags);
+       sbitmap_queue_free(&tags->breserved_tags);
        kfree(tags);
 }
 
@@ -592,7 +566,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
        if (tdepth > tags->nr_tags) {
                struct blk_mq_tag_set *set = hctx->queue->tag_set;
                struct blk_mq_tags *new;
-               bool ret;
 
                if (!can_grow)
                        return -EINVAL;
@@ -604,34 +577,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                if (tdepth > MAX_SCHED_RQ)
                        return -EINVAL;
 
-               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
-                               tags->nr_reserved_tags, set->flags);
+               /*
+                * Only the sbitmap needs resizing since we allocated the max
+                * initially.
+                */
+               if (blk_mq_is_shared_tags(set->flags))
+                       return 0;
+
+               new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
                if (!new)
                        return -ENOMEM;
-               ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
-               if (ret) {
-                       blk_mq_free_rq_map(new, set->flags);
-                       return -ENOMEM;
-               }
 
-               blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
-               blk_mq_free_rq_map(*tagsptr, set->flags);
+               blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
                *tagsptr = new;
        } else {
                /*
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
-               sbitmap_queue_resize(tags->bitmap_tags,
+               sbitmap_queue_resize(&tags->bitmap_tags,
                                tdepth - tags->nr_reserved_tags);
        }
 
        return 0;
 }
 
-void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
+void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
+{
+       struct blk_mq_tags *tags = set->shared_tags;
+
+       sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
+}
+
+void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
 {
-       sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
+       sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
+                            q->nr_requests - q->tag_set->reserved_tags);
 }
 
 /**
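For illustration: blk_mq_get_tags() above hands back a bitmask of allocated tags relative to an offset rather than a single tag. A minimal standalone sketch (plain userspace C, mask and offset values made up) of how a caller turns that mask into tag numbers:

#include <stdio.h>

int main(void)
{
        unsigned long tag_mask = 0x29;  /* bits 0, 3 and 5 set: three tags */
        unsigned int tag_offset = 4;    /* first non-reserved tag */
        int i;

        /* Walk the set bits, clearing each one as it is consumed. */
        for (i = 0; tag_mask; i++) {
                if (!(tag_mask & (1UL << i)))
                        continue;
                tag_mask &= ~(1UL << i);
                printf("allocated tag %u\n", tag_offset + i);
        }
        return 0;
}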
index 8ed55af..df787b5 100644 (file)
@@ -2,52 +2,30 @@
 #ifndef INT_BLK_MQ_TAG_H
 #define INT_BLK_MQ_TAG_H
 
-/*
- * Tag address space map.
- */
-struct blk_mq_tags {
-       unsigned int nr_tags;
-       unsigned int nr_reserved_tags;
-
-       atomic_t active_queues;
-
-       struct sbitmap_queue *bitmap_tags;
-       struct sbitmap_queue *breserved_tags;
-
-       struct sbitmap_queue __bitmap_tags;
-       struct sbitmap_queue __breserved_tags;
-
-       struct request **rqs;
-       struct request **static_rqs;
-       struct list_head page_list;
-
-       /*
-        * used to clear request reference in rqs[] before freeing one
-        * request pool
-        */
-       spinlock_t lock;
-};
+struct blk_mq_alloc_data;
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
                                        unsigned int reserved_tags,
-                                       int node, unsigned int flags);
-extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
+                                       int node, int alloc_policy);
+extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
                               struct sbitmap_queue *breserved_tags,
                               unsigned int queue_depth,
                               unsigned int reserved,
                               int node, int alloc_policy);
 
-extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
-extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
+unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
+                             unsigned int *offset);
 extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                           unsigned int tag);
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
 extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_tags **tags,
                                        unsigned int depth, bool can_grow);
-extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
+extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
                                             unsigned int size);
+extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
 
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
index bc02637..07eb141 100644 (file)
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/kmemleak.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/smp.h>
+#include <linux/interrupt.h>
 #include <linux/llist.h>
-#include <linux/list_sort.h>
 #include <linux/cpu.h>
 #include <linux/cache.h>
 #include <linux/sched/sysctl.h>
@@ -63,6 +64,32 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
        return bucket;
 }
 
+#define BLK_QC_T_SHIFT         16
+#define BLK_QC_T_INTERNAL      (1U << 31)
+
+static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
+               blk_qc_t qc)
+{
+       return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
+}
+
+static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
+               blk_qc_t qc)
+{
+       unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
+
+       if (qc & BLK_QC_T_INTERNAL)
+               return blk_mq_tag_to_rq(hctx->sched_tags, tag);
+       return blk_mq_tag_to_rq(hctx->tags, tag);
+}
+
+static inline blk_qc_t blk_rq_to_qc(struct request *rq)
+{
+       return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
+               (rq->tag != -1 ?
+                rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+}
+
 /*
  * Check if any of the ctx, dispatch list or elevator
  * have pending work in this hardware queue.
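The helpers above pack the polling cookie from the hardware queue number and the tag, with bit 31 marking a scheduler (internal) tag. A standalone sketch of the encode/decode round trip (userspace C, values made up):

#include <stdio.h>

#define BLK_QC_T_SHIFT          16
#define BLK_QC_T_INTERNAL       (1U << 31)

int main(void)
{
        unsigned int queue_num = 3, tag = 42;           /* made-up values */
        unsigned int qc = (queue_num << BLK_QC_T_SHIFT) |
                          tag | BLK_QC_T_INTERNAL;      /* scheduler tag */

        printf("cookie    = 0x%08x\n", qc);
        printf("hw queue  = %u\n", (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
        printf("tag       = %u\n", qc & ((1U << BLK_QC_T_SHIFT) - 1));
        printf("sched tag = %s\n", (qc & BLK_QC_T_INTERNAL) ? "yes" : "no");
        return 0;
}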
@@ -214,7 +241,12 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
  */
 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 {
-       blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->queue_lock, flags);
+       if (!q->quiesce_depth++)
+               blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+       spin_unlock_irqrestore(&q->queue_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
@@ -255,10 +287,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
  */
 void blk_mq_unquiesce_queue(struct request_queue *q)
 {
-       blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+       unsigned long flags;
+       bool run_queue = false;
+
+       spin_lock_irqsave(&q->queue_lock, flags);
+       if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
+               ;
+       } else if (!--q->quiesce_depth) {
+               blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+               run_queue = true;
+       }
+       spin_unlock_irqrestore(&q->queue_lock, flags);
 
        /* dispatch requests which are inserted during quiescing */
-       blk_mq_run_hw_queues(q, true);
+       if (run_queue)
+               blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
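With the quiesce_depth counter above, quiesce and unquiesce calls now nest: the queue is only marked unquiesced when the last caller drops the depth back to zero. A standalone sketch of that counting scheme (userspace C, locking omitted):

#include <assert.h>
#include <stdio.h>

static int quiesce_depth;
static int quiesced;

static void quiesce(void)
{
        if (!quiesce_depth++)
                quiesced = 1;           /* first caller quiesces */
}

static void unquiesce(void)
{
        assert(quiesce_depth > 0);
        if (!--quiesce_depth)
                quiesced = 0;           /* last caller resumes the queue */
}

int main(void)
{
        quiesce();
        quiesce();                      /* nested caller */
        unquiesce();
        printf("after first unquiesce:  quiesced=%d\n", quiesced);
        unquiesce();
        printf("after second unquiesce: quiesced=%d\n", quiesced);
        return 0;
}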
 
@@ -272,74 +315,67 @@ void blk_mq_wake_waiters(struct request_queue *q)
                        blk_mq_tag_wakeup_all(hctx->tags, true);
 }
 
-/*
- * Only need start/end time stamping if we have iostat or
- * blk stats enabled, or using an IO scheduler.
- */
-static inline bool blk_mq_need_time_stamp(struct request *rq)
-{
-       return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
-}
-
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-               unsigned int tag, u64 alloc_time_ns)
+               struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
 {
-       struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+       struct blk_mq_ctx *ctx = data->ctx;
+       struct blk_mq_hw_ctx *hctx = data->hctx;
+       struct request_queue *q = data->q;
        struct request *rq = tags->static_rqs[tag];
 
-       if (data->q->elevator) {
-               rq->tag = BLK_MQ_NO_TAG;
-               rq->internal_tag = tag;
-       } else {
+       rq->q = q;
+       rq->mq_ctx = ctx;
+       rq->mq_hctx = hctx;
+       rq->cmd_flags = data->cmd_flags;
+
+       if (data->flags & BLK_MQ_REQ_PM)
+               data->rq_flags |= RQF_PM;
+       if (blk_queue_io_stat(q))
+               data->rq_flags |= RQF_IO_STAT;
+       rq->rq_flags = data->rq_flags;
+
+       if (!(data->rq_flags & RQF_ELV)) {
                rq->tag = tag;
                rq->internal_tag = BLK_MQ_NO_TAG;
+       } else {
+               rq->tag = BLK_MQ_NO_TAG;
+               rq->internal_tag = tag;
        }
+       rq->timeout = 0;
 
-       /* csd/requeue_work/fifo_time is initialized before use */
-       rq->q = data->q;
-       rq->mq_ctx = data->ctx;
-       rq->mq_hctx = data->hctx;
-       rq->rq_flags = 0;
-       rq->cmd_flags = data->cmd_flags;
-       if (data->flags & BLK_MQ_REQ_PM)
-               rq->rq_flags |= RQF_PM;
-       if (blk_queue_io_stat(data->q))
-               rq->rq_flags |= RQF_IO_STAT;
-       INIT_LIST_HEAD(&rq->queuelist);
-       INIT_HLIST_NODE(&rq->hash);
-       RB_CLEAR_NODE(&rq->rb_node);
+       if (blk_mq_need_time_stamp(rq))
+               rq->start_time_ns = ktime_get_ns();
+       else
+               rq->start_time_ns = 0;
        rq->rq_disk = NULL;
        rq->part = NULL;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
        rq->alloc_time_ns = alloc_time_ns;
 #endif
-       if (blk_mq_need_time_stamp(rq))
-               rq->start_time_ns = ktime_get_ns();
-       else
-               rq->start_time_ns = 0;
        rq->io_start_time_ns = 0;
        rq->stats_sectors = 0;
        rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
 #endif
-       blk_crypto_rq_set_defaults(rq);
-       /* tag was already set */
-       WRITE_ONCE(rq->deadline, 0);
-
-       rq->timeout = 0;
-
        rq->end_io = NULL;
        rq->end_io_data = NULL;
 
-       data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
+       blk_crypto_rq_set_defaults(rq);
+       INIT_LIST_HEAD(&rq->queuelist);
+       /* tag was already set */
+       WRITE_ONCE(rq->deadline, 0);
        refcount_set(&rq->ref, 1);
 
-       if (!op_is_flush(data->cmd_flags)) {
+       if (rq->rq_flags & RQF_ELV) {
                struct elevator_queue *e = data->q->elevator;
 
                rq->elv.icq = NULL;
-               if (e && e->type->ops.prepare_request) {
+               INIT_HLIST_NODE(&rq->hash);
+               RB_CLEAR_NODE(&rq->rb_node);
+
+               if (!op_is_flush(data->cmd_flags) &&
+                   e->type->ops.prepare_request) {
                        if (e->type->icq_cache)
                                blk_mq_sched_assign_ioc(rq);
 
@@ -348,15 +384,44 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                }
        }
 
-       data->hctx->queued++;
        return rq;
 }
 
-static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
+static inline struct request *
+__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
+               u64 alloc_time_ns)
+{
+       unsigned int tag, tag_offset;
+       struct blk_mq_tags *tags;
+       struct request *rq;
+       unsigned long tag_mask;
+       int i, nr = 0;
+
+       tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
+       if (unlikely(!tag_mask))
+               return NULL;
+
+       tags = blk_mq_tags_from_data(data);
+       for (i = 0; tag_mask; i++) {
+               if (!(tag_mask & (1UL << i)))
+                       continue;
+               tag = tag_offset + i;
+               prefetch(tags->static_rqs[tag]);
+               tag_mask &= ~(1UL << i);
+               rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
+               rq_list_add(data->cached_rq, rq);
+               nr++;
+       }
+       data->nr_tags -= nr;
+
+       return rq_list_pop(data->cached_rq);
+}
+
+static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 {
        struct request_queue *q = data->q;
        struct elevator_queue *e = q->elevator;
        u64 alloc_time_ns = 0;
+       struct request *rq;
        unsigned int tag;
 
        /* alloc_time includes depth and tag waits */
@@ -386,6 +451,16 @@ retry:
                blk_mq_tag_busy(data->hctx);
 
        /*
+        * Try batched alloc if we want more than 1 tag.
+        */
+       if (data->nr_tags > 1) {
+               rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
+               if (rq)
+                       return rq;
+               data->nr_tags = 1;
+       }
+
+       /*
         * Waiting allocations only fail because of an inactive hctx.  In that
         * case just retry the hctx assignment and tag allocation as CPU hotplug
         * should have migrated us to an online CPU by now.
@@ -394,16 +469,18 @@ retry:
        if (tag == BLK_MQ_NO_TAG) {
                if (data->flags & BLK_MQ_REQ_NOWAIT)
                        return NULL;
-
                /*
-                * Give up the CPU and sleep for a random short time to ensure
-                * that thread using a realtime scheduling class are migrated
-                * off the CPU, and thus off the hctx that is going away.
+                * Give up the CPU and sleep for a random short time to
+                * ensure that threads using a realtime scheduling class
+                * are migrated off the CPU, and thus off the hctx that
+                * is going away.
                 */
                msleep(3);
                goto retry;
        }
-       return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
+
+       return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
+                                       alloc_time_ns);
 }
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
@@ -413,6 +490,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                .q              = q,
                .flags          = flags,
                .cmd_flags      = op,
+               .rq_flags       = q->elevator ? RQF_ELV : 0,
+               .nr_tags        = 1,
        };
        struct request *rq;
        int ret;
@@ -421,7 +500,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
        if (ret)
                return ERR_PTR(ret);
 
-       rq = __blk_mq_alloc_request(&data);
+       rq = __blk_mq_alloc_requests(&data);
        if (!rq)
                goto out_queue_exit;
        rq->__data_len = 0;
@@ -441,6 +520,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                .q              = q,
                .flags          = flags,
                .cmd_flags      = op,
+               .rq_flags       = q->elevator ? RQF_ELV : 0,
+               .nr_tags        = 1,
        };
        u64 alloc_time_ns = 0;
        unsigned int cpu;
@@ -485,7 +566,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        tag = blk_mq_get_tag(&data);
        if (tag == BLK_MQ_NO_TAG)
                goto out_queue_exit;
-       return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
+       return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
+                                       alloc_time_ns);
 
 out_queue_exit:
        blk_queue_exit(q);
@@ -514,12 +596,12 @@ static void __blk_mq_free_request(struct request *rq)
 void blk_mq_free_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
-       struct elevator_queue *e = q->elevator;
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
        if (rq->rq_flags & RQF_ELVPRIV) {
-               if (e && e->type->ops.finish_request)
+               struct elevator_queue *e = q->elevator;
+
+               if (e->type->ops.finish_request)
                        e->type->ops.finish_request(rq);
                if (rq->elv.icq) {
                        put_io_context(rq->elv.icq->ioc);
@@ -527,7 +609,6 @@ void blk_mq_free_request(struct request *rq)
                }
        }
 
-       ctx->rq_completed[rq_is_sync(rq)]++;
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
                __blk_mq_dec_active_requests(hctx);
 
@@ -542,21 +623,173 @@ void blk_mq_free_request(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
-inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+void blk_mq_free_plug_rqs(struct blk_plug *plug)
 {
-       u64 now = 0;
+       struct request *rq;
 
-       if (blk_mq_need_time_stamp(rq))
-               now = ktime_get_ns();
+       while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) {
+               percpu_ref_get(&rq->q->q_usage_counter);
+               blk_mq_free_request(rq);
+       }
+}
 
+static void req_bio_endio(struct request *rq, struct bio *bio,
+                         unsigned int nbytes, blk_status_t error)
+{
+       if (unlikely(error)) {
+               bio->bi_status = error;
+       } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
+               /*
+                * Partial zone append completions cannot be supported as the
+                * BIO fragments may end up not being written sequentially.
+                */
+               if (bio->bi_iter.bi_size != nbytes)
+                       bio->bi_status = BLK_STS_IOERR;
+               else
+                       bio->bi_iter.bi_sector = rq->__sector;
+       }
+
+       bio_advance(bio, nbytes);
+
+       if (unlikely(rq->rq_flags & RQF_QUIET))
+               bio_set_flag(bio, BIO_QUIET);
+       /* don't actually finish bio if it's part of flush sequence */
+       if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
+               bio_endio(bio);
+}
+
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
+{
+       if (req->part && blk_do_io_stat(req)) {
+               const int sgrp = op_stat_group(req_op(req));
+
+               part_stat_lock();
+               part_stat_add(req->part, sectors[sgrp], bytes >> 9);
+               part_stat_unlock();
+       }
+}
+
+/**
+ * blk_update_request - Complete multiple bytes without completing the request
+ * @req:      the request being processed
+ * @error:    block status code
+ * @nr_bytes: number of bytes to complete for @req
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @req, but doesn't complete
+ *     the request structure even if @req doesn't have leftover.
+ *     If @req has leftover, sets it up for the next range of segments.
+ *
+ *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
+ *     %false return from this function.
+ *
+ * Note:
+ *     The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
+ *      except in the consistency check at the end of this function.
+ *
+ * Return:
+ *     %false - this request doesn't have any more data
+ *     %true  - this request has more data
+ **/
+bool blk_update_request(struct request *req, blk_status_t error,
+               unsigned int nr_bytes)
+{
+       int total_bytes;
+
+       trace_block_rq_complete(req, error, nr_bytes);
+
+       if (!req->bio)
+               return false;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+       if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+           error == BLK_STS_OK)
+               req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
+       if (unlikely(error && !blk_rq_is_passthrough(req) &&
+                    !(req->rq_flags & RQF_QUIET)))
+               blk_print_req_error(req, error);
+
+       blk_account_io_completion(req, nr_bytes);
+
+       total_bytes = 0;
+       while (req->bio) {
+               struct bio *bio = req->bio;
+               unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
+
+               if (bio_bytes == bio->bi_iter.bi_size)
+                       req->bio = bio->bi_next;
+
+               /* Completion has already been traced */
+               bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+               req_bio_endio(req, bio, bio_bytes, error);
+
+               total_bytes += bio_bytes;
+               nr_bytes -= bio_bytes;
+
+               if (!nr_bytes)
+                       break;
+       }
+
+       /*
+        * completely done
+        */
+       if (!req->bio) {
+               /*
+                * Reset counters so that the request stacking driver
+                * can find how many bytes remain in the request
+                * later.
+                */
+               req->__data_len = 0;
+               return false;
+       }
+
+       req->__data_len -= total_bytes;
+
+       /* update sector only for requests with clear definition of sector */
+       if (!blk_rq_is_passthrough(req))
+               req->__sector += total_bytes >> 9;
+
+       /* mixed attributes always follow the first bio */
+       if (req->rq_flags & RQF_MIXED_MERGE) {
+               req->cmd_flags &= ~REQ_FAILFAST_MASK;
+               req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
+       }
+
+       if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
+               /*
+                * If total number of sectors is less than the first segment
+                * size, something has gone terribly wrong.
+                */
+               if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
+                       blk_dump_rq_flags(req, "request botched");
+                       req->__data_len = blk_rq_cur_bytes(req);
+               }
+
+               /* recalculate the number of segments */
+               req->nr_phys_segments = blk_recalc_rq_segments(req);
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_update_request);
+
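As the kernel-doc above states, blk_update_request() returns %false once no bytes remain. A hedged sketch of the usual caller pattern, using only helpers that appear elsewhere in this diff; mydrv_complete_bytes() is a hypothetical driver helper, not part of the patch:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static void mydrv_complete_bytes(struct request *rq, blk_status_t error,
                                 unsigned int bytes)
{
        if (!blk_update_request(rq, error, bytes))
                __blk_mq_end_request(rq, error);        /* nothing left */
        else
                blk_mq_requeue_request(rq, true);       /* resubmit leftover */
}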
+static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
+{
        if (rq->rq_flags & RQF_STATS) {
                blk_mq_poll_stats_start(rq->q);
                blk_stat_add(rq, now);
        }
 
        blk_mq_sched_completed_request(rq, now);
-
        blk_account_io_done(rq, now);
+}
+
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+{
+       if (blk_mq_need_time_stamp(rq))
+               __blk_mq_end_request_acct(rq, ktime_get_ns());
 
        if (rq->end_io) {
                rq_qos_done(rq->q, rq);
@@ -575,6 +808,57 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
 }
 EXPORT_SYMBOL(blk_mq_end_request);
 
+#define TAG_COMP_BATCH         32
+
+static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
+                                         int *tag_array, int nr_tags)
+{
+       struct request_queue *q = hctx->queue;
+
+       blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
+       percpu_ref_put_many(&q->q_usage_counter, nr_tags);
+}
+
+void blk_mq_end_request_batch(struct io_comp_batch *iob)
+{
+       int tags[TAG_COMP_BATCH], nr_tags = 0;
+       struct blk_mq_hw_ctx *cur_hctx = NULL;
+       struct request *rq;
+       u64 now = 0;
+
+       if (iob->need_ts)
+               now = ktime_get_ns();
+
+       while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
+               prefetch(rq->bio);
+               prefetch(rq->rq_next);
+
+               blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq));
+               if (iob->need_ts)
+                       __blk_mq_end_request_acct(rq, now);
+
+               WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+               if (!refcount_dec_and_test(&rq->ref))
+                       continue;
+
+               blk_crypto_free_request(rq);
+               blk_pm_mark_last_busy(rq);
+               rq_qos_done(rq->q, rq);
+
+               if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
+                       if (cur_hctx)
+                               blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
+                       nr_tags = 0;
+                       cur_hctx = rq->mq_hctx;
+               }
+               tags[nr_tags++] = rq->tag;
+       }
+
+       if (nr_tags)
+               blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
+
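A hedged sketch of how a driver might feed blk_mq_end_request_batch(): completed requests are collected on an on-stack io_comp_batch and ended in one call. DEFINE_IO_COMP_BATCH() and the io_comp_batch/rq_list helpers come from the header side of this series and are assumed here; the mydrv_* names are made up:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

struct mydrv_queue;                                     /* hypothetical */
struct request *mydrv_pop_completed(struct mydrv_queue *mq);

static void mydrv_complete_all(struct mydrv_queue *mq)
{
        DEFINE_IO_COMP_BATCH(iob);      /* assumed: on-stack, zeroed batch */
        struct request *rq;

        while ((rq = mydrv_pop_completed(mq)) != NULL)
                rq_list_add(&iob.req_list, rq);

        if (!rq_list_empty(iob.req_list))
                blk_mq_end_request_batch(&iob); /* success path, BLK_STS_OK */
}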
 static void blk_complete_reqs(struct llist_head *list)
 {
        struct llist_node *entry = llist_reverse_order(llist_del_all(list));
@@ -658,7 +942,7 @@ bool blk_mq_complete_request_remote(struct request *rq)
         * For a polled request, always complete locally, it's pointless
         * to redirect the completion.
         */
-       if (rq->cmd_flags & REQ_HIPRI)
+       if (rq->cmd_flags & REQ_POLLED)
                return false;
 
        if (blk_mq_complete_need_ipi(rq)) {
@@ -723,7 +1007,14 @@ void blk_mq_start_request(struct request *rq)
        trace_block_rq_issue(rq);
 
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-               rq->io_start_time_ns = ktime_get_ns();
+               u64 start_time;
+#ifdef CONFIG_BLK_CGROUP
+               if (rq->bio)
+                       start_time = bio_issue_time(&rq->bio->bi_issue);
+               else
+#endif
+                       start_time = ktime_get_ns();
+               rq->io_start_time_ns = start_time;
                rq->stats_sectors = blk_rq_sectors(rq);
                rq->rq_flags |= RQF_STATS;
                rq_qos_issue(q, rq);
@@ -738,6 +1029,8 @@ void blk_mq_start_request(struct request *rq)
        if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
                q->integrity.profile->prepare_fn(rq);
 #endif
+       if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
+               WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq));
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
@@ -763,7 +1056,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
        /* this request will be re-inserted to io scheduler queue */
        blk_mq_sched_requeue_request(rq);
 
-       BUG_ON(!list_empty(&rq->queuelist));
        blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -844,17 +1136,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
-{
-       if (tag < tags->nr_tags) {
-               prefetch(tags->rqs[tag]);
-               return tags->rqs[tag];
-       }
-
-       return NULL;
-}
-EXPORT_SYMBOL(blk_mq_tag_to_rq);
-
 static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
                               void *priv, bool reserved)
 {
@@ -1059,24 +1340,16 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
        return data.rq;
 }
 
-static inline unsigned int queued_to_index(unsigned int queued)
-{
-       if (!queued)
-               return 0;
-
-       return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
-}
-
-static bool __blk_mq_get_driver_tag(struct request *rq)
+static bool __blk_mq_alloc_driver_tag(struct request *rq)
 {
-       struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
+       struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
        unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
        int tag;
 
        blk_mq_tag_busy(rq->mq_hctx);
 
        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
-               bt = rq->mq_hctx->tags->breserved_tags;
+               bt = &rq->mq_hctx->tags->breserved_tags;
                tag_offset = 0;
        } else {
                if (!hctx_may_queue(rq->mq_hctx, bt))
@@ -1091,11 +1364,9 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
        return true;
 }
 
-bool blk_mq_get_driver_tag(struct request *rq)
+bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
-       if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+       if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
                return false;
 
        if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
@@ -1119,7 +1390,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                struct sbitmap_queue *sbq;
 
                list_del_init(&wait->entry);
-               sbq = hctx->tags->bitmap_tags;
+               sbq = &hctx->tags->bitmap_tags;
                atomic_dec(&sbq->ws_active);
        }
        spin_unlock(&hctx->dispatch_wait_lock);
@@ -1137,7 +1408,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                 struct request *rq)
 {
-       struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
+       struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
        struct wait_queue_head *wq;
        wait_queue_entry_t *wait;
        bool ret;
@@ -1325,6 +1596,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
        int errors, queued;
        blk_status_t ret = BLK_STS_OK;
        LIST_HEAD(zone_list);
+       bool needs_resource = false;
 
        if (list_empty(list))
                return false;
@@ -1370,6 +1642,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                        queued++;
                        break;
                case BLK_STS_RESOURCE:
+                       needs_resource = true;
+                       fallthrough;
                case BLK_STS_DEV_RESOURCE:
                        blk_mq_handle_dev_resource(rq, list);
                        goto out;
@@ -1380,6 +1654,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                         * accept.
                         */
                        blk_mq_handle_zone_resource(rq, &zone_list);
+                       needs_resource = true;
                        break;
                default:
                        errors++;
@@ -1390,8 +1665,6 @@ out:
        if (!list_empty(&zone_list))
                list_splice_tail_init(&zone_list, list);
 
-       hctx->dispatched[queued_to_index(queued)]++;
-
        /* If we didn't flush the entire list, we could have told the driver
         * there was more coming, but that turned out to be a lie.
         */
@@ -1406,7 +1679,6 @@ out:
                /* For non-shared tags, the RESTART check will suffice */
                bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
                        (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
-               bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
 
                if (nr_budgets)
                        blk_mq_release_budgets(q, list);
@@ -1447,14 +1719,16 @@ out:
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
                 * that could otherwise occur if the queue is idle.  We'll do
-                * similar if we couldn't get budget and SCHED_RESTART is set.
+                * similar if we couldn't get budget or couldn't lock a zone
+                * and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
+               if (prep == PREP_DISPATCH_NO_BUDGET)
+                       needs_resource = true;
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
-               else if (needs_restart && (ret == BLK_STS_RESOURCE ||
-                                          no_budget_avail))
+               else if (needs_restart && needs_resource)
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 
                blk_mq_update_dispatch_busy(hctx, true);
@@ -1894,54 +2168,106 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
        spin_unlock(&ctx->lock);
 }
 
-static int plug_rq_cmp(void *priv, const struct list_head *a,
-                      const struct list_head *b)
+static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued,
+                             bool from_schedule)
 {
-       struct request *rqa = container_of(a, struct request, queuelist);
-       struct request *rqb = container_of(b, struct request, queuelist);
+       if (hctx->queue->mq_ops->commit_rqs) {
+               trace_block_unplug(hctx->queue, *queued, !from_schedule);
+               hctx->queue->mq_ops->commit_rqs(hctx);
+       }
+       *queued = 0;
+}
 
-       if (rqa->mq_ctx != rqb->mq_ctx)
-               return rqa->mq_ctx > rqb->mq_ctx;
-       if (rqa->mq_hctx != rqb->mq_hctx)
-               return rqa->mq_hctx > rqb->mq_hctx;
+static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
+{
+       struct blk_mq_hw_ctx *hctx = NULL;
+       struct request *rq;
+       int queued = 0;
+       int errors = 0;
 
-       return blk_rq_pos(rqa) > blk_rq_pos(rqb);
+       while ((rq = rq_list_pop(&plug->mq_list))) {
+               bool last = rq_list_empty(plug->mq_list);
+               blk_status_t ret;
+
+               if (hctx != rq->mq_hctx) {
+                       if (hctx)
+                               blk_mq_commit_rqs(hctx, &queued, from_schedule);
+                       hctx = rq->mq_hctx;
+               }
+
+               ret = blk_mq_request_issue_directly(rq, last);
+               switch (ret) {
+               case BLK_STS_OK:
+                       queued++;
+                       break;
+               case BLK_STS_RESOURCE:
+               case BLK_STS_DEV_RESOURCE:
+                       blk_mq_request_bypass_insert(rq, false, last);
+                       blk_mq_commit_rqs(hctx, &queued, from_schedule);
+                       return;
+               default:
+                       blk_mq_end_request(rq, ret);
+                       errors++;
+                       break;
+               }
+       }
+
+       /*
+        * If we didn't flush the entire list, we could have told the driver
+        * there was more coming, but that turned out to be a lie.
+        */
+       if (errors)
+               blk_mq_commit_rqs(hctx, &queued, from_schedule);
 }
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
+       struct blk_mq_hw_ctx *this_hctx;
+       struct blk_mq_ctx *this_ctx;
+       unsigned int depth;
        LIST_HEAD(list);
 
-       if (list_empty(&plug->mq_list))
+       if (rq_list_empty(plug->mq_list))
                return;
-       list_splice_init(&plug->mq_list, &list);
-
-       if (plug->rq_count > 2 && plug->multiple_queues)
-               list_sort(NULL, &list, plug_rq_cmp);
-
        plug->rq_count = 0;
 
+       if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
+               blk_mq_plug_issue_direct(plug, from_schedule);
+               if (rq_list_empty(plug->mq_list))
+                       return;
+       }
+
+       this_hctx = NULL;
+       this_ctx = NULL;
+       depth = 0;
        do {
-               struct list_head rq_list;
-               struct request *rq, *head_rq = list_entry_rq(list.next);
-               struct list_head *pos = &head_rq->queuelist; /* skip first */
-               struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
-               struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
-               unsigned int depth = 1;
-
-               list_for_each_continue(pos, &list) {
-                       rq = list_entry_rq(pos);
-                       BUG_ON(!rq->q);
-                       if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
-                               break;
-                       depth++;
+               struct request *rq;
+
+               rq = rq_list_pop(&plug->mq_list);
+
+               if (!this_hctx) {
+                       this_hctx = rq->mq_hctx;
+                       this_ctx = rq->mq_ctx;
+               } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+                       trace_block_unplug(this_hctx->queue, depth,
+                                               !from_schedule);
+                       blk_mq_sched_insert_requests(this_hctx, this_ctx,
+                                               &list, from_schedule);
+                       depth = 0;
+                       this_hctx = rq->mq_hctx;
+                       this_ctx = rq->mq_ctx;
+
                }
 
-               list_cut_before(&rq_list, &list, pos);
-               trace_block_unplug(head_rq->q, depth, !from_schedule);
-               blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
+               list_add(&rq->queuelist, &list);
+               depth++;
+       } while (!rq_list_empty(plug->mq_list));
+
+       if (!list_empty(&list)) {
+               trace_block_unplug(this_hctx->queue, depth, !from_schedule);
+               blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
                                                from_schedule);
-       } while(!list_empty(&list));
+       }
 }
 
 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
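The plug now carries requests on an intrusive singly linked list threaded through rq->rq_next rather than a list_head. A standalone sketch mirroring the rq_list_add()/rq_list_pop() semantics used above (userspace C, not the kernel macros themselves):

#include <stdio.h>

struct request {
        int tag;
        struct request *rq_next;
};

static void rq_list_add(struct request **list, struct request *rq)
{
        rq->rq_next = *list;            /* push onto the head */
        *list = rq;
}

static struct request *rq_list_pop(struct request **list)
{
        struct request *rq = *list;

        if (rq)
                *list = rq->rq_next;
        return rq;
}

int main(void)
{
        struct request a = { .tag = 1 }, b = { .tag = 2 };
        struct request *mq_list = NULL; /* rq_list_empty(): head == NULL */
        struct request *rq;

        rq_list_add(&mq_list, &a);
        rq_list_add(&mq_list, &b);
        while ((rq = rq_list_pop(&mq_list)))
                printf("popped tag %d\n", rq->tag);
        return 0;
}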
@@ -1964,19 +2290,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 }
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
-                                           struct request *rq,
-                                           blk_qc_t *cookie, bool last)
+                                           struct request *rq, bool last)
 {
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .last = last,
        };
-       blk_qc_t new_cookie;
        blk_status_t ret;
 
-       new_cookie = request_to_qc_t(hctx, rq);
-
        /*
         * For OK queue, we are done. For error, caller may kill it.
         * Any other error (busy), just add it to our list as we
@@ -1986,7 +2308,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
        switch (ret) {
        case BLK_STS_OK:
                blk_mq_update_dispatch_busy(hctx, false);
-               *cookie = new_cookie;
                break;
        case BLK_STS_RESOURCE:
        case BLK_STS_DEV_RESOURCE:
@@ -1995,7 +2316,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                break;
        default:
                blk_mq_update_dispatch_busy(hctx, false);
-               *cookie = BLK_QC_T_NONE;
                break;
        }
 
@@ -2004,7 +2324,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                                                struct request *rq,
-                                               blk_qc_t *cookie,
                                                bool bypass_insert, bool last)
 {
        struct request_queue *q = rq->q;
@@ -2024,7 +2343,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                goto insert;
        }
 
-       if (q->elevator && !bypass_insert)
+       if ((rq->rq_flags & RQF_ELV) && !bypass_insert)
                goto insert;
 
        budget_token = blk_mq_get_dispatch_budget(q);
@@ -2038,7 +2357,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                goto insert;
        }
 
-       return __blk_mq_issue_directly(hctx, rq, cookie, last);
+       return __blk_mq_issue_directly(hctx, rq, last);
 insert:
        if (bypass_insert)
                return BLK_STS_RESOURCE;
@@ -2052,7 +2371,6 @@ insert:
  * blk_mq_try_issue_directly - Try to send a request directly to device driver.
  * @hctx: Pointer of the associated hardware queue.
  * @rq: Pointer to request to be sent.
- * @cookie: Request queue cookie.
  *
  * If the device has enough resources to accept a new request now, send the
  * request directly to device driver. Else, insert at hctx->dispatch queue, so
@@ -2060,7 +2378,7 @@ insert:
  * queue have higher priority.
  */
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-               struct request *rq, blk_qc_t *cookie)
+               struct request *rq)
 {
        blk_status_t ret;
        int srcu_idx;
@@ -2069,7 +2387,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 
        hctx_lock(hctx, &srcu_idx);
 
-       ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
+       ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
        if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
                blk_mq_request_bypass_insert(rq, false, true);
        else if (ret != BLK_STS_OK)
@@ -2082,11 +2400,10 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 {
        blk_status_t ret;
        int srcu_idx;
-       blk_qc_t unused_cookie;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
        hctx_lock(hctx, &srcu_idx);
-       ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
+       ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
        hctx_unlock(hctx, srcu_idx);
 
        return ret;
@@ -2130,27 +2447,28 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 
 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
 {
-       list_add_tail(&rq->queuelist, &plug->mq_list);
-       plug->rq_count++;
-       if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
-               struct request *tmp;
+       if (!plug->multiple_queues) {
+               struct request *nxt = rq_list_peek(&plug->mq_list);
 
-               tmp = list_first_entry(&plug->mq_list, struct request,
-                                               queuelist);
-               if (tmp->q != rq->q)
+               if (nxt && nxt->q != rq->q)
                        plug->multiple_queues = true;
        }
+       if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
+               plug->has_elevator = true;
+       rq->rq_next = NULL;
+       rq_list_add(&plug->mq_list, rq);
+       plug->rq_count++;
 }
 
 /*
- * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
  * queues. This is important for md arrays to benefit from merging
  * requests.
  */
 static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
 {
        if (plug->multiple_queues)
-               return BLK_MAX_REQUEST_COUNT * 4;
+               return BLK_MAX_REQUEST_COUNT * 2;
        return BLK_MAX_REQUEST_COUNT;
 }
 
@@ -2166,57 +2484,63 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
  *
  * It will not queue the request if there is an error with the bio, or at the
  * request creation.
- *
- * Returns: Request queue cookie.
  */
-blk_qc_t blk_mq_submit_bio(struct bio *bio)
+void blk_mq_submit_bio(struct bio *bio)
 {
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        const int is_sync = op_is_sync(bio->bi_opf);
-       const int is_flush_fua = op_is_flush(bio->bi_opf);
-       struct blk_mq_alloc_data data = {
-               .q              = q,
-       };
        struct request *rq;
        struct blk_plug *plug;
-       struct request *same_queue_rq = NULL;
-       unsigned int nr_segs;
-       blk_qc_t cookie;
+       bool same_queue_rq = false;
+       unsigned int nr_segs = 1;
        blk_status_t ret;
-       bool hipri;
 
        blk_queue_bounce(q, &bio);
-       __blk_queue_split(&bio, &nr_segs);
+       if (blk_may_split(q, bio))
+               __blk_queue_split(q, &bio, &nr_segs);
 
        if (!bio_integrity_prep(bio))
                goto queue_exit;
 
-       if (!is_flush_fua && !blk_queue_nomerges(q) &&
-           blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
-               goto queue_exit;
-
-       if (blk_mq_sched_bio_merge(q, bio, nr_segs))
-               goto queue_exit;
+       if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
+               if (blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
+                       goto queue_exit;
+               if (blk_mq_sched_bio_merge(q, bio, nr_segs))
+                       goto queue_exit;
+       }
 
        rq_qos_throttle(q, bio);
 
-       hipri = bio->bi_opf & REQ_HIPRI;
-
-       data.cmd_flags = bio->bi_opf;
-       rq = __blk_mq_alloc_request(&data);
-       if (unlikely(!rq)) {
-               rq_qos_cleanup(q, bio);
-               if (bio->bi_opf & REQ_NOWAIT)
-                       bio_wouldblock_error(bio);
-               goto queue_exit;
+       plug = blk_mq_plug(q, bio);
+       if (plug && plug->cached_rq) {
+               rq = rq_list_pop(&plug->cached_rq);
+               INIT_LIST_HEAD(&rq->queuelist);
+       } else {
+               struct blk_mq_alloc_data data = {
+                       .q              = q,
+                       .nr_tags        = 1,
+                       .cmd_flags      = bio->bi_opf,
+                       .rq_flags       = q->elevator ? RQF_ELV : 0,
+               };
+
+               if (plug) {
+                       data.nr_tags = plug->nr_ios;
+                       plug->nr_ios = 1;
+                       data.cached_rq = &plug->cached_rq;
+               }
+               rq = __blk_mq_alloc_requests(&data);
+               if (unlikely(!rq)) {
+                       rq_qos_cleanup(q, bio);
+                       if (bio->bi_opf & REQ_NOWAIT)
+                               bio_wouldblock_error(bio);
+                       goto queue_exit;
+               }
        }
 
        trace_block_getrq(bio);
 
        rq_qos_track(q, rq, bio);
 
-       cookie = request_to_qc_t(data.hctx, rq);
-
        blk_mq_bio_to_request(rq, bio, nr_segs);
 
        ret = blk_crypto_init_request(rq);
@@ -2224,17 +2548,15 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                bio->bi_status = ret;
                bio_endio(bio);
                blk_mq_free_request(rq);
-               return BLK_QC_T_NONE;
+               return;
        }
 
-       plug = blk_mq_plug(q, bio);
-       if (unlikely(is_flush_fua)) {
-               /* Bypass scheduler for flush requests */
-               blk_insert_flush(rq);
-               blk_mq_run_hw_queue(data.hctx, true);
-       } else if (plug && (q->nr_hw_queues == 1 ||
-                  blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
-                  q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
+       if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
+               return;
+
+       if (plug && (q->nr_hw_queues == 1 ||
+           blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
+           q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
                /*
                 * Use plugging if we have a ->commit_rqs() hook as well, as
                 * we know the driver uses bd->last in a smart fashion.
@@ -2245,22 +2567,26 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                unsigned int request_count = plug->rq_count;
                struct request *last = NULL;
 
-               if (!request_count)
+               if (!request_count) {
                        trace_block_plug(q);
-               else
-                       last = list_entry_rq(plug->mq_list.prev);
+               } else if (!blk_queue_nomerges(q)) {
+                       last = rq_list_peek(&plug->mq_list);
+                       if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE)
+                               last = NULL;
+               }
 
-               if (request_count >= blk_plug_max_rq_count(plug) || (last &&
-                   blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
-                       blk_flush_plug_list(plug, false);
+               if (request_count >= blk_plug_max_rq_count(plug) || last) {
+                       blk_mq_flush_plug_list(plug, false);
                        trace_block_plug(q);
                }
 
                blk_add_rq_to_plug(plug, rq);
-       } else if (q->elevator) {
+       } else if (rq->rq_flags & RQF_ELV) {
                /* Insert the request at the IO scheduler queue */
                blk_mq_sched_insert_request(rq, false, true, true);
        } else if (plug && !blk_queue_nomerges(q)) {
+               struct request *next_rq = NULL;
+
                /*
                 * We do limited plugging. If the bio can be merged, do that.
                 * Otherwise the existing request in the plug list will be
@@ -2268,39 +2594,32 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                 * The plug list might get flushed before this. If that happens,
                 * the plug list is empty, and same_queue_rq is invalid.
                 */
-               if (list_empty(&plug->mq_list))
-                       same_queue_rq = NULL;
                if (same_queue_rq) {
-                       list_del_init(&same_queue_rq->queuelist);
+                       next_rq = rq_list_pop(&plug->mq_list);
                        plug->rq_count--;
                }
                blk_add_rq_to_plug(plug, rq);
                trace_block_plug(q);
 
-               if (same_queue_rq) {
-                       data.hctx = same_queue_rq->mq_hctx;
+               if (next_rq) {
                        trace_block_unplug(q, 1, true);
-                       blk_mq_try_issue_directly(data.hctx, same_queue_rq,
-                                       &cookie);
+                       blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq);
                }
        } else if ((q->nr_hw_queues > 1 && is_sync) ||
-                       !data.hctx->dispatch_busy) {
+                  !rq->mq_hctx->dispatch_busy) {
                /*
                 * There is no scheduler and we can try to send directly
                 * to the hardware.
                 */
-               blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+               blk_mq_try_issue_directly(rq->mq_hctx, rq);
        } else {
                /* Default case. */
                blk_mq_sched_insert_request(rq, false, true, true);
        }
 
-       if (!hipri)
-               return BLK_QC_T_NONE;
-       return cookie;
+       return;
 queue_exit:
        blk_queue_exit(q);
-       return BLK_QC_T_NONE;
 }
 
 static size_t order_to_size(unsigned int order)
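blk_mq_submit_bio() above first tries to take a request that was pre-allocated into plug->cached_rq; only when the cache is empty does it fall into the allocation path, and with an active plug it then asks for plug->nr_ios tags at once so the spares refill the cache. A rough standalone sketch of that one-from-cache-or-batch-allocate pattern follows; the req type, get_request() and alloc_batch() names are made up for illustration.

#include <stdio.h>
#include <stdlib.h>

struct req {
	struct req *next;
	int id;
};

static struct req *cache;       /* stand-in for plug->cached_rq */
static int next_id;

/* allocate nr requests in one go: hand one back, park the rest in the cache */
static struct req *alloc_batch(unsigned int nr)
{
	struct req *first = NULL;

	for (unsigned int i = 0; i < nr; i++) {
		struct req *r = malloc(sizeof(*r));

		if (!r)
			break;
		r->id = next_id++;
		r->next = NULL;
		if (!first) {
			first = r;
		} else {
			r->next = cache;
			cache = r;
		}
	}
	return first;
}

static struct req *get_request(unsigned int batch)
{
	if (cache) {                    /* fast path: reuse a cached request */
		struct req *r = cache;

		cache = r->next;
		return r;
	}
	return alloc_batch(batch);      /* slow path: batched allocation */
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		struct req *r = get_request(4);

		if (!r)
			return 1;
		printf("request %d came %s\n", r->id,
		       i % 4 ? "from the cache" : "from a fresh batch");
		free(r);
	}
	while (cache) {                 /* drop whatever is still cached */
		struct req *r = cache;

		cache = r->next;
		free(r);
	}
	return 0;
}
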
@@ -2309,19 +2628,22 @@ static size_t order_to_size(unsigned int order)
 }
 
 /* called before freeing request pool in @tags */
-static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
-               struct blk_mq_tags *tags, unsigned int hctx_idx)
+static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
+                                   struct blk_mq_tags *tags)
 {
-       struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
        struct page *page;
        unsigned long flags;
 
+       /* There is no need to clear a driver tags' own mapping */
+       if (drv_tags == tags)
+               return;
+
        list_for_each_entry(page, &tags->page_list, lru) {
                unsigned long start = (unsigned long)page_address(page);
                unsigned long end = start + order_to_size(page->private);
                int i;
 
-               for (i = 0; i < set->queue_depth; i++) {
+               for (i = 0; i < drv_tags->nr_tags; i++) {
                        struct request *rq = drv_tags->rqs[i];
                        unsigned long rq_addr = (unsigned long)rq;
 
@@ -2345,9 +2667,15 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
 {
+       struct blk_mq_tags *drv_tags;
        struct page *page;
 
-       if (tags->rqs && set->ops->exit_request) {
+       if (blk_mq_is_shared_tags(set->flags))
+               drv_tags = set->shared_tags;
+       else
+               drv_tags = set->tags[hctx_idx];
+
+       if (tags->static_rqs && set->ops->exit_request) {
                int i;
 
                for (i = 0; i < tags->nr_tags; i++) {
@@ -2360,7 +2688,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                }
        }
 
-       blk_mq_clear_rq_mapping(set, tags, hctx_idx);
+       blk_mq_clear_rq_mapping(drv_tags, tags);
 
        while (!list_empty(&tags->page_list)) {
                page = list_first_entry(&tags->page_list, struct page, lru);
@@ -2374,21 +2702,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
        }
 }
 
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 {
        kfree(tags->rqs);
        tags->rqs = NULL;
        kfree(tags->static_rqs);
        tags->static_rqs = NULL;
 
-       blk_mq_free_tags(tags, flags);
+       blk_mq_free_tags(tags);
 }
 
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-                                       unsigned int hctx_idx,
-                                       unsigned int nr_tags,
-                                       unsigned int reserved_tags,
-                                       unsigned int flags)
+static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+                                              unsigned int hctx_idx,
+                                              unsigned int nr_tags,
+                                              unsigned int reserved_tags)
 {
        struct blk_mq_tags *tags;
        int node;
@@ -2397,7 +2724,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
 
-       tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
+       tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
+                               BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
        if (!tags)
                return NULL;
 
@@ -2405,7 +2733,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 node);
        if (!tags->rqs) {
-               blk_mq_free_tags(tags, flags);
+               blk_mq_free_tags(tags);
                return NULL;
        }
 
@@ -2414,7 +2742,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                        node);
        if (!tags->static_rqs) {
                kfree(tags->rqs);
-               blk_mq_free_tags(tags, flags);
+               blk_mq_free_tags(tags);
                return NULL;
        }
 
@@ -2436,8 +2764,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
        return 0;
 }
 
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-                    unsigned int hctx_idx, unsigned int depth)
+static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
+                           struct blk_mq_tags *tags,
+                           unsigned int hctx_idx, unsigned int depth)
 {
        unsigned int i, j, entries_per_page, max_order = 4;
        size_t rq_size, left;
@@ -2848,37 +3177,58 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
        }
 }
 
-static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
-                                       int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+                                            unsigned int hctx_idx,
+                                            unsigned int depth)
 {
-       unsigned int flags = set->flags;
-       int ret = 0;
+       struct blk_mq_tags *tags;
+       int ret;
 
-       set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
-                                       set->queue_depth, set->reserved_tags, flags);
-       if (!set->tags[hctx_idx])
-               return false;
+       tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
+       if (!tags)
+               return NULL;
 
-       ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
-                               set->queue_depth);
-       if (!ret)
-               return true;
+       ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
+       if (ret) {
+               blk_mq_free_rq_map(tags);
+               return NULL;
+       }
 
-       blk_mq_free_rq_map(set->tags[hctx_idx], flags);
-       set->tags[hctx_idx] = NULL;
-       return false;
+       return tags;
 }
 
-static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
-                                        unsigned int hctx_idx)
+static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+                                      int hctx_idx)
 {
-       unsigned int flags = set->flags;
+       if (blk_mq_is_shared_tags(set->flags)) {
+               set->tags[hctx_idx] = set->shared_tags;
 
-       if (set->tags && set->tags[hctx_idx]) {
-               blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-               blk_mq_free_rq_map(set->tags[hctx_idx], flags);
-               set->tags[hctx_idx] = NULL;
+               return true;
        }
+
+       set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
+                                                      set->queue_depth);
+
+       return set->tags[hctx_idx];
+}
+
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+                            struct blk_mq_tags *tags,
+                            unsigned int hctx_idx)
+{
+       if (tags) {
+               blk_mq_free_rqs(set, tags, hctx_idx);
+               blk_mq_free_rq_map(tags);
+       }
+}
+
+static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+                                     unsigned int hctx_idx)
+{
+       if (!blk_mq_is_shared_tags(set->flags))
+               blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
+
+       set->tags[hctx_idx] = NULL;
 }
 
 static void blk_mq_map_swqueue(struct request_queue *q)
@@ -2911,7 +3261,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                        hctx_idx = set->map[j].mq_map[i];
                        /* unmapped hw queue can be remapped after CPU topo changed */
                        if (!set->tags[hctx_idx] &&
-                           !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+                           !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
                                /*
                                 * If tags initialization fail for some hctx,
                                 * that hctx won't be brought online.  In this
@@ -2958,8 +3308,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                         * fallback in case of a new remap fails
                         * allocation
                         */
-                       if (i && set->tags[i])
-                               blk_mq_free_map_and_requests(set, i);
+                       if (i)
+                               __blk_mq_free_map_and_rqs(set, i);
 
                        hctx->tags = NULL;
                        continue;
@@ -3255,8 +3605,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx = hctxs[j];
 
                if (hctx) {
-                       if (hctx->tags)
-                               blk_mq_free_map_and_requests(set, j);
+                       __blk_mq_free_map_and_rqs(set, j);
                        blk_mq_exit_hctx(q, set, hctx, j);
                        hctxs[j] = NULL;
                }
@@ -3343,8 +3692,16 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
        int i;
 
+       if (blk_mq_is_shared_tags(set->flags)) {
+               set->shared_tags = blk_mq_alloc_map_and_rqs(set,
+                                               BLK_MQ_NO_HCTX_IDX,
+                                               set->queue_depth);
+               if (!set->shared_tags)
+                       return -ENOMEM;
+       }
+
        for (i = 0; i < set->nr_hw_queues; i++) {
-               if (!__blk_mq_alloc_map_and_request(set, i))
+               if (!__blk_mq_alloc_map_and_rqs(set, i))
                        goto out_unwind;
                cond_resched();
        }
@@ -3353,7 +3710,12 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 
 out_unwind:
        while (--i >= 0)
-               blk_mq_free_map_and_requests(set, i);
+               __blk_mq_free_map_and_rqs(set, i);
+
+       if (blk_mq_is_shared_tags(set->flags)) {
+               blk_mq_free_map_and_rqs(set, set->shared_tags,
+                                       BLK_MQ_NO_HCTX_IDX);
+       }
 
        return -ENOMEM;
 }
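With BLK_MQ_F_TAG_HCTX_SHARED, the code above allocates the tag map and its requests once (set->shared_tags) and later points every hardware queue's set->tags[] slot at that single map, instead of allocating one map per queue. A minimal sketch of that sharing decision, with tag_map and alloc_map() as invented placeholders:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct tag_map {
	unsigned int depth;
};

static struct tag_map *alloc_map(unsigned int depth)
{
	struct tag_map *m = malloc(sizeof(*m));

	if (m)
		m->depth = depth;
	return m;
}

int main(void)
{
	const bool shared = true;               /* BLK_MQ_F_TAG_HCTX_SHARED analogue */
	enum { NR_HW_QUEUES = 4 };
	struct tag_map *shared_map = NULL;
	struct tag_map *per_hctx[NR_HW_QUEUES] = { NULL };

	if (shared)
		shared_map = alloc_map(64);     /* allocated exactly once */

	for (int i = 0; i < NR_HW_QUEUES; i++)
		per_hctx[i] = shared ? shared_map : alloc_map(64);

	printf("hctx 0 and hctx 1 share a tag map: %s\n",
	       per_hctx[0] == per_hctx[1] ? "yes" : "no");

	if (shared) {
		free(shared_map);               /* freed exactly once, too */
	} else {
		for (int i = 0; i < NR_HW_QUEUES; i++)
			free(per_hctx[i]);
	}
	return 0;
}
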
@@ -3363,7 +3725,7 @@ out_unwind:
  * may reduce the depth asked for, if memory is tight. set->queue_depth
  * will be updated to reflect the allocated depth.
  */
-static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
 {
        unsigned int depth;
        int err;
@@ -3529,27 +3891,15 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        if (ret)
                goto out_free_mq_map;
 
-       ret = blk_mq_alloc_map_and_requests(set);
+       ret = blk_mq_alloc_set_map_and_rqs(set);
        if (ret)
                goto out_free_mq_map;
 
-       if (blk_mq_is_sbitmap_shared(set->flags)) {
-               atomic_set(&set->active_queues_shared_sbitmap, 0);
-
-               if (blk_mq_init_shared_sbitmap(set)) {
-                       ret = -ENOMEM;
-                       goto out_free_mq_rq_maps;
-               }
-       }
-
        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);
 
        return 0;
 
-out_free_mq_rq_maps:
-       for (i = 0; i < set->nr_hw_queues; i++)
-               blk_mq_free_map_and_requests(set, i);
 out_free_mq_map:
        for (i = 0; i < set->nr_maps; i++) {
                kfree(set->map[i].mq_map);
@@ -3582,10 +3932,12 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
        int i, j;
 
        for (i = 0; i < set->nr_hw_queues; i++)
-               blk_mq_free_map_and_requests(set, i);
+               __blk_mq_free_map_and_rqs(set, i);
 
-       if (blk_mq_is_sbitmap_shared(set->flags))
-               blk_mq_exit_shared_sbitmap(set);
+       if (blk_mq_is_shared_tags(set->flags)) {
+               blk_mq_free_map_and_rqs(set, set->shared_tags,
+                                       BLK_MQ_NO_HCTX_IDX);
+       }
 
        for (j = 0; j < set->nr_maps; j++) {
                kfree(set->map[j].mq_map);
@@ -3620,20 +3972,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
                 * If we're using an MQ scheduler, just update the scheduler
                 * queue depth. This is similar to what the old code would do.
                 */
-               if (!hctx->sched_tags) {
-                       ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
-                                                       false);
-                       if (!ret && blk_mq_is_sbitmap_shared(set->flags))
-                               blk_mq_tag_resize_shared_sbitmap(set, nr);
-               } else {
+               if (hctx->sched_tags) {
                        ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
-                                                       nr, true);
-                       if (blk_mq_is_sbitmap_shared(set->flags)) {
-                               hctx->sched_tags->bitmap_tags =
-                                       &q->sched_bitmap_tags;
-                               hctx->sched_tags->breserved_tags =
-                                       &q->sched_breserved_tags;
-                       }
+                                                     nr, true);
+               } else {
+                       ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
+                                                     false);
                }
                if (ret)
                        break;
@@ -3642,9 +3986,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        }
        if (!ret) {
                q->nr_requests = nr;
-               if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
-                       sbitmap_queue_resize(&q->sched_bitmap_tags,
-                                            nr - set->reserved_tags);
+               if (blk_mq_is_shared_tags(set->flags)) {
+                       if (q->elevator)
+                               blk_mq_tag_update_sched_shared_tags(q);
+                       else
+                               blk_mq_tag_resize_shared_tags(set, nr);
+               }
        }
 
        blk_mq_unquiesce_queue(q);
@@ -3863,15 +4210,20 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
        return ret;
 }
 
-static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
-                                    struct request *rq)
+static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
 {
+       struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
+       struct request *rq = blk_qc_to_rq(hctx, qc);
        struct hrtimer_sleeper hs;
        enum hrtimer_mode mode;
        unsigned int nsecs;
        ktime_t kt;
 
-       if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
+       /*
+        * If a request has completed on a queue that uses an I/O scheduler, we
+        * won't get back a request from blk_qc_to_rq.
+        */
+       if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
                return false;
 
        /*
@@ -3913,92 +4265,37 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 
        __set_current_state(TASK_RUNNING);
        destroy_hrtimer_on_stack(&hs.timer);
-       return true;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q,
-                              struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
-{
-       struct request *rq;
-
-       if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
-               return false;
-
-       if (!blk_qc_t_is_internal(cookie))
-               rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
-       else {
-               rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
-               /*
-                * With scheduling, if the request has completed, we'll
-                * get a NULL return here, as we clear the sched tag when
-                * that happens. The request still remains valid, like always,
-                * so we should be safe with just the NULL check.
-                */
-               if (!rq)
-                       return false;
-       }
-
-       return blk_mq_poll_hybrid_sleep(q, rq);
-}
-
-/**
- * blk_poll - poll for IO completions
- * @q:  the queue
- * @cookie: cookie passed back at IO submission time
- * @spin: whether to spin for completions
- *
- * Description:
- *    Poll for completions on the passed in queue. Returns number of
- *    completed entries found. If @spin is true, then blk_poll will continue
- *    looping until at least one completion is found, unless the task is
- *    otherwise marked running (or we need to reschedule).
- */
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
-{
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int state;
-
-       if (!blk_qc_t_valid(cookie) ||
-           !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-               return 0;
-
-       if (current->plug)
-               blk_flush_plug_list(current->plug, false);
-
-       hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
 
        /*
-        * If we sleep, have the caller restart the poll loop to reset
-        * the state. Like for the other success return cases, the
-        * caller is responsible for checking if the IO completed. If
-        * the IO isn't complete, we'll get called again and will go
-        * straight to the busy poll loop. If specified not to spin,
-        * we also should not sleep.
+        * If we sleep, have the caller restart the poll loop to reset the
+        * state.  Like for the other success return cases, the caller is
+        * responsible for checking if the IO completed.  If the IO isn't
+        * complete, we'll get called again and will go straight to the busy
+        * poll loop.
         */
-       if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
-               return 1;
+       return true;
+}
 
-       hctx->poll_considered++;
+static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
+                              struct io_comp_batch *iob, unsigned int flags)
+{
+       struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
+       long state = get_current_state();
+       int ret;
 
-       state = get_current_state();
        do {
-               int ret;
-
-               hctx->poll_invoked++;
-
-               ret = q->mq_ops->poll(hctx);
+               ret = q->mq_ops->poll(hctx, iob);
                if (ret > 0) {
-                       hctx->poll_success++;
                        __set_current_state(TASK_RUNNING);
                        return ret;
                }
 
                if (signal_pending_state(state, current))
                        __set_current_state(TASK_RUNNING);
-
                if (task_is_running(current))
                        return 1;
-               if (ret < 0 || !spin)
+
+               if (ret < 0 || (flags & BLK_POLL_ONESHOT))
                        break;
                cpu_relax();
        } while (!need_resched());
@@ -4006,7 +4303,17 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
        __set_current_state(TASK_RUNNING);
        return 0;
 }
-EXPORT_SYMBOL_GPL(blk_poll);
+
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+               unsigned int flags)
+{
+       if (!(flags & BLK_POLL_NOSLEEP) &&
+           q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
+               if (blk_mq_poll_hybrid(q, cookie))
+                       return 1;
+       }
+       return blk_mq_poll_classic(q, cookie, iob, flags);
+}
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
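blk_mq_poll() above now decodes the cookie itself and picks between hybrid polling (sleep for the estimated completion time first) and the classic busy loop in blk_mq_poll_classic(), which keeps calling the driver's ->poll() hook until it reaps completions, the task needs to run, or the caller asked for a single pass. A standalone sketch of that classic loop; POLL_ONESHOT and fake_driver_poll() are illustrative stand-ins, not the real flags or driver interface.

#include <stdio.h>

#define POLL_ONESHOT 0x1        /* stand-in for BLK_POLL_ONESHOT */

/* pretend the completion shows up after a few empty polls */
static int fake_driver_poll(int *budget)
{
	return --(*budget) <= 0 ? 1 : 0;
}

static int poll_classic(unsigned int flags)
{
	int budget = 3;

	for (;;) {
		int found = fake_driver_poll(&budget);

		if (found > 0)
			return found;           /* completions were reaped */
		if (flags & POLL_ONESHOT)
			break;                  /* caller wanted one pass only */
		/*
		 * The real loop also bails out on pending signals and on
		 * need_resched() before spinning again.
		 */
	}
	return 0;
}

int main(void)
{
	printf("spinning poll found %d completion(s)\n", poll_classic(0));
	printf("oneshot poll found %d completion(s)\n", poll_classic(POLL_ONESHOT));
	return 0;
}
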
index d08779f..28859fc 100644
@@ -25,18 +25,14 @@ struct blk_mq_ctx {
        unsigned short          index_hw[HCTX_MAX_TYPES];
        struct blk_mq_hw_ctx    *hctxs[HCTX_MAX_TYPES];
 
-       /* incremented at dispatch time */
-       unsigned long           rq_dispatched[2];
-       unsigned long           rq_merged;
-
-       /* incremented at completion time */
-       unsigned long           ____cacheline_aligned_in_smp rq_completed[2];
-
        struct request_queue    *queue;
        struct blk_mq_ctxs      *ctxs;
        struct kobject          kobj;
 } ____cacheline_aligned_in_smp;
 
+void blk_mq_submit_bio(struct bio *bio);
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+               unsigned int flags);
 void blk_mq_exit_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
@@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq);
  */
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx);
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-                                       unsigned int hctx_idx,
-                                       unsigned int nr_tags,
-                                       unsigned int reserved_tags,
-                                       unsigned int flags);
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-                    unsigned int hctx_idx, unsigned int depth);
-
+void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+                               unsigned int hctx_idx, unsigned int depth);
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+                            struct blk_mq_tags *tags,
+                            unsigned int hctx_idx);
 /*
  * Internal helpers for request insertion into sw queues
  */
@@ -109,9 +102,9 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
        enum hctx_type type = HCTX_TYPE_DEFAULT;
 
        /*
-        * The caller ensure that if REQ_HIPRI, poll must be enabled.
+        * The caller ensures that if REQ_POLLED is set, poll must be enabled.
         */
-       if (flags & REQ_HIPRI)
+       if (flags & REQ_POLLED)
                type = HCTX_TYPE_POLL;
        else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
                type = HCTX_TYPE_READ;
@@ -128,6 +121,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
+void blk_mq_free_plug_rqs(struct blk_plug *plug);
+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_release(struct request_queue *q);
 
@@ -154,23 +149,27 @@ struct blk_mq_alloc_data {
        blk_mq_req_flags_t flags;
        unsigned int shallow_depth;
        unsigned int cmd_flags;
+       unsigned int rq_flags;
+
+       /* allocate multiple requests/tags in one go */
+       unsigned int nr_tags;
+       struct request **cached_rq;
 
        /* input & output parameter */
        struct blk_mq_ctx *ctx;
        struct blk_mq_hw_ctx *hctx;
 };
 
-static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
+static inline bool blk_mq_is_shared_tags(unsigned int flags)
 {
        return flags & BLK_MQ_F_TAG_HCTX_SHARED;
 }
 
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
 {
-       if (data->q->elevator)
-               return data->hctx->sched_tags;
-
-       return data->hctx->tags;
+       if (!(data->rq_flags & RQF_ELV))
+               return data->hctx->tags;
+       return data->hctx->sched_tags;
 }
 
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
@@ -220,24 +219,24 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
 
 static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_mq_is_sbitmap_shared(hctx->flags))
-               atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
+       if (blk_mq_is_shared_tags(hctx->flags))
+               atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
        else
                atomic_inc(&hctx->nr_active);
 }
 
 static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_mq_is_sbitmap_shared(hctx->flags))
-               atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
+       if (blk_mq_is_shared_tags(hctx->flags))
+               atomic_dec(&hctx->queue->nr_active_requests_shared_tags);
        else
                atomic_dec(&hctx->nr_active);
 }
 
 static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_mq_is_sbitmap_shared(hctx->flags))
-               return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
+       if (blk_mq_is_shared_tags(hctx->flags))
+               return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
        return atomic_read(&hctx->nr_active);
 }
 static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@@ -260,7 +259,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
        __blk_mq_put_driver_tag(rq->mq_hctx, rq);
 }
 
-bool blk_mq_get_driver_tag(struct request *rq);
+bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
+
+static inline bool blk_mq_get_driver_tag(struct request *rq)
+{
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+       if (rq->tag != BLK_MQ_NO_TAG &&
+           !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+               hctx->tags->rqs[rq->tag] = rq;
+               return true;
+       }
+
+       return __blk_mq_get_driver_tag(hctx, rq);
+}
 
 static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
 {
@@ -331,19 +343,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
        if (bt->sb.depth == 1)
                return true;
 
-       if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+       if (blk_mq_is_shared_tags(hctx->flags)) {
                struct request_queue *q = hctx->queue;
-               struct blk_mq_tag_set *set = q->tag_set;
 
                if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
                        return true;
-               users = atomic_read(&set->active_queues_shared_sbitmap);
        } else {
                if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        return true;
-               users = atomic_read(&hctx->tags->active_queues);
        }
 
+       users = atomic_read(&hctx->tags->active_queues);
+
        if (!users)
                return true;
 
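After this change hctx_may_queue() reads the number of active users from hctx->tags->active_queues in both the shared and the per-hctx case; the part of the function not shown here then limits each active queue to roughly its fair share of the tag depth. A standalone sketch of that kind of fairness check follows; the ceiling division and the minimum share of 4 are assumptions made for the example, not values taken from this diff.

#include <stdbool.h>
#include <stdio.h>

static bool may_queue(unsigned int depth, unsigned int users,
		      unsigned int active_requests)
{
	unsigned int share;

	if (!users)
		return true;                    /* nobody else is using the tags */

	share = (depth + users - 1) / users;    /* assumed: round the share up */
	if (share < 4)
		share = 4;                      /* assumed minimum share */

	return active_requests < share;
}

int main(void)
{
	/* 256 tags shared by 8 active queues: each may hold roughly 32 */
	printf("%d\n", may_queue(256, 8, 10));  /* 1: below its share */
	printf("%d\n", may_queue(256, 8, 40));  /* 0: over its share */
	return 0;
}
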
index f000f83..3cfbc86 100644
@@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
         * BIO_TRACKED lets controllers know that a bio went through the
         * normal rq_qos path.
         */
-       bio_set_flag(bio, BIO_TRACKED);
-       if (q->rq_qos)
+       if (q->rq_qos) {
+               bio_set_flag(bio, BIO_TRACKED);
                __rq_qos_throttle(q->rq_qos, bio);
+       }
 }
 
 static inline void rq_qos_track(struct request_queue *q, struct request *rq,
index a7c857a..b880c70 100644
@@ -842,6 +842,24 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
 
+static bool disk_has_partitions(struct gendisk *disk)
+{
+       unsigned long idx;
+       struct block_device *part;
+       bool ret = false;
+
+       rcu_read_lock();
+       xa_for_each(&disk->part_tbl, idx, part) {
+               if (bdev_is_partition(part)) {
+                       ret = true;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       return ret;
+}
+
 /**
  * blk_queue_set_zoned - configure a disk queue zoned model.
  * @disk:      the gendisk of the queue to configure
@@ -876,7 +894,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
                 * we do nothing special as far as the block layer is concerned.
                 */
                if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
-                   !xa_empty(&disk->part_tbl))
+                   disk_has_partitions(disk))
                        model = BLK_ZONED_NONE;
                break;
        case BLK_ZONED_NONE:
index 614d9d4..cef1f71 100644
@@ -17,6 +17,7 @@
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-wbt.h"
+#include "blk-throttle.h"
 
 struct queue_sysfs_entry {
        struct attribute attr;
@@ -432,26 +433,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
 static ssize_t queue_poll_store(struct request_queue *q, const char *page,
                                size_t count)
 {
-       unsigned long poll_on;
-       ssize_t ret;
-
-       if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
-           !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
+       if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                return -EINVAL;
-
-       ret = queue_var_store(&poll_on, page, count);
-       if (ret < 0)
-               return ret;
-
-       if (poll_on) {
-               blk_queue_flag_set(QUEUE_FLAG_POLL, q);
-       } else {
-               blk_mq_freeze_queue(q);
-               blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
-               blk_mq_unfreeze_queue(q);
-       }
-
-       return ret;
+       pr_info_ratelimited("writes to the poll attribute are ignored.\n");
+       pr_info_ratelimited("please use driver specific parameters instead.\n");
+       return count;
 }
 
 static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
@@ -887,16 +873,15 @@ int blk_register_queue(struct gendisk *disk)
        }
 
        mutex_lock(&q->sysfs_lock);
+
+       ret = disk_register_independent_access_ranges(disk, NULL);
+       if (ret)
+               goto put_dev;
+
        if (q->elevator) {
                ret = elv_register_queue(q, false);
-               if (ret) {
-                       mutex_unlock(&q->sysfs_lock);
-                       mutex_unlock(&q->sysfs_dir_lock);
-                       kobject_del(&q->kobj);
-                       blk_trace_remove_sysfs(dev);
-                       kobject_put(&dev->kobj);
-                       return ret;
-               }
+               if (ret)
+                       goto put_dev;
        }
 
        blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@@ -928,6 +913,16 @@ unlock:
        }
 
        return ret;
+
+put_dev:
+       disk_unregister_independent_access_ranges(disk);
+       mutex_unlock(&q->sysfs_lock);
+       mutex_unlock(&q->sysfs_dir_lock);
+       kobject_del(&q->kobj);
+       blk_trace_remove_sysfs(dev);
+       kobject_put(&dev->kobj);
+
+       return ret;
 }
 
 /**
@@ -972,6 +967,7 @@ void blk_unregister_queue(struct gendisk *disk)
        mutex_lock(&q->sysfs_lock);
        if (q->elevator)
                elv_unregister_queue(q);
+       disk_unregister_independent_access_ranges(disk);
        mutex_unlock(&q->sysfs_lock);
        mutex_unlock(&q->sysfs_dir_lock);
 
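The blk_register_queue() change above replaces the open-coded error handling after elv_register_queue() with a single put_dev label that unwinds the independent-access-ranges registration, the sysfs locks and the kobject in one place. A tiny sketch of that single-exit unwind idiom, with all names invented:

#include <stdbool.h>
#include <stdio.h>

static bool register_a(void) { return true; }
static bool register_b(void) { return false; }  /* pretend this step fails */
static void unregister_a(void) { printf("rolled back step a\n"); }

static int do_register(void)
{
	if (!register_a())
		return -1;
	if (!register_b())
		goto undo_a;            /* one label undoes the earlier steps */
	return 0;

undo_a:
	unregister_a();
	return -1;
}

int main(void)
{
	printf("register returned %d\n", do_register());
	return 0;
}
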
index 7c4e799..39bb6e6 100644
@@ -13,6 +13,7 @@
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 #include "blk-cgroup-rwstat.h"
+#include "blk-throttle.h"
 
 /* Max dispatch from a group in 1 round */
 #define THROTL_GRP_QUANTUM 8
  */
 #define LATENCY_FILTERED_HD (1000L) /* 1ms */
 
-static struct blkcg_policy blkcg_policy_throtl;
-
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
-/*
- * To implement hierarchical throttling, throtl_grps form a tree and bios
- * are dispatched upwards level by level until they reach the top and get
- * issued.  When dispatching bios from the children and local group at each
- * level, if the bios are dispatched into a single bio_list, there's a risk
- * of a local or child group which can queue many bios at once filling up
- * the list starving others.
- *
- * To avoid such starvation, dispatched bios are queued separately
- * according to where they came from.  When they are again dispatched to
- * the parent, they're popped in round-robin order so that no single source
- * hogs the dispatch window.
- *
- * throtl_qnode is used to keep the queued bios separated by their sources.
- * Bios are queued to throtl_qnode which in turn is queued to
- * throtl_service_queue and then dispatched in round-robin order.
- *
- * It's also used to track the reference counts on blkg's.  A qnode always
- * belongs to a throtl_grp and gets queued on itself or the parent, so
- * incrementing the reference of the associated throtl_grp when a qnode is
- * queued and decrementing when dequeued is enough to keep the whole blkg
- * tree pinned while bios are in flight.
- */
-struct throtl_qnode {
-       struct list_head        node;           /* service_queue->queued[] */
-       struct bio_list         bios;           /* queued bios */
-       struct throtl_grp       *tg;            /* tg this qnode belongs to */
-};
-
-struct throtl_service_queue {
-       struct throtl_service_queue *parent_sq; /* the parent service_queue */
-
-       /*
-        * Bios queued directly to this service_queue or dispatched from
-        * children throtl_grp's.
-        */
-       struct list_head        queued[2];      /* throtl_qnode [READ/WRITE] */
-       unsigned int            nr_queued[2];   /* number of queued bios */
-
-       /*
-        * RB tree of active children throtl_grp's, which are sorted by
-        * their ->disptime.
-        */
-       struct rb_root_cached   pending_tree;   /* RB tree of active tgs */
-       unsigned int            nr_pending;     /* # queued in the tree */
-       unsigned long           first_pending_disptime; /* disptime of the first tg */
-       struct timer_list       pending_timer;  /* fires on first_pending_disptime */
-};
-
 enum tg_state_flags {
        THROTL_TG_PENDING       = 1 << 0,       /* on parent's pending tree */
        THROTL_TG_WAS_EMPTY     = 1 << 1,       /* bio_lists[] became non-empty */
@@ -98,93 +48,6 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
 
-enum {
-       LIMIT_LOW,
-       LIMIT_MAX,
-       LIMIT_CNT,
-};
-
-struct throtl_grp {
-       /* must be the first member */
-       struct blkg_policy_data pd;
-
-       /* active throtl group service_queue member */
-       struct rb_node rb_node;
-
-       /* throtl_data this group belongs to */
-       struct throtl_data *td;
-
-       /* this group's service queue */
-       struct throtl_service_queue service_queue;
-
-       /*
-        * qnode_on_self is used when bios are directly queued to this
-        * throtl_grp so that local bios compete fairly with bios
-        * dispatched from children.  qnode_on_parent is used when bios are
-        * dispatched from this throtl_grp into its parent and will compete
-        * with the sibling qnode_on_parents and the parent's
-        * qnode_on_self.
-        */
-       struct throtl_qnode qnode_on_self[2];
-       struct throtl_qnode qnode_on_parent[2];
-
-       /*
-        * Dispatch time in jiffies. This is the estimated time when group
-        * will unthrottle and is ready to dispatch more bio. It is used as
-        * key to sort active groups in service tree.
-        */
-       unsigned long disptime;
-
-       unsigned int flags;
-
-       /* are there any throtl rules between this group and td? */
-       bool has_rules[2];
-
-       /* internally used bytes per second rate limits */
-       uint64_t bps[2][LIMIT_CNT];
-       /* user configured bps limits */
-       uint64_t bps_conf[2][LIMIT_CNT];
-
-       /* internally used IOPS limits */
-       unsigned int iops[2][LIMIT_CNT];
-       /* user configured IOPS limits */
-       unsigned int iops_conf[2][LIMIT_CNT];
-
-       /* Number of bytes dispatched in current slice */
-       uint64_t bytes_disp[2];
-       /* Number of bio's dispatched in current slice */
-       unsigned int io_disp[2];
-
-       unsigned long last_low_overflow_time[2];
-
-       uint64_t last_bytes_disp[2];
-       unsigned int last_io_disp[2];
-
-       unsigned long last_check_time;
-
-       unsigned long latency_target; /* us */
-       unsigned long latency_target_conf; /* us */
-       /* When did we start a new slice */
-       unsigned long slice_start[2];
-       unsigned long slice_end[2];
-
-       unsigned long last_finish_time; /* ns / 1024 */
-       unsigned long checked_last_finish_time; /* ns / 1024 */
-       unsigned long avg_idletime; /* ns / 1024 */
-       unsigned long idletime_threshold; /* us */
-       unsigned long idletime_threshold_conf; /* us */
-
-       unsigned int bio_cnt; /* total bios */
-       unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
-       unsigned long bio_cnt_reset_time;
-
-       atomic_t io_split_cnt[2];
-       atomic_t last_io_split_cnt[2];
-
-       struct blkg_rwstat stat_bytes;
-       struct blkg_rwstat stat_ios;
-};
-
 /* We measure latency for request size from <= 4k to >= 1M */
 #define LATENCY_BUCKET_SIZE 9
 
@@ -231,16 +94,6 @@ struct throtl_data
 
 static void throtl_pending_timer_fn(struct timer_list *t);
 
-static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
-{
-       return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
-}
-
-static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
-{
-       return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
-}
-
 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
 {
        return pd_to_blkg(&tg->pd);
@@ -1794,7 +1647,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
        cancel_work_sync(&td->dispatch_work);
 }
 
-static struct blkcg_policy blkcg_policy_throtl = {
+struct blkcg_policy blkcg_policy_throtl = {
        .dfl_cftypes            = throtl_files,
        .legacy_cftypes         = throtl_legacy_files,
 
@@ -2208,9 +2061,9 @@ void blk_throtl_charge_bio_split(struct bio *bio)
        } while (parent);
 }
 
-bool blk_throtl_bio(struct bio *bio)
+bool __blk_throtl_bio(struct bio *bio)
 {
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        struct blkcg_gq *blkg = bio->bi_blkg;
        struct throtl_qnode *qn = NULL;
        struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -2221,19 +2074,12 @@ bool blk_throtl_bio(struct bio *bio)
 
        rcu_read_lock();
 
-       /* see throtl_charge_bio() */
-       if (bio_flagged(bio, BIO_THROTTLED))
-               goto out;
-
        if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
                blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
                                bio->bi_iter.bi_size);
                blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
        }
 
-       if (!tg->has_rules[rw])
-               goto out;
-
        spin_lock_irq(&q->queue_lock);
 
        throtl_update_latency_buckets(td);
@@ -2317,7 +2163,6 @@ again:
 
 out_unlock:
        spin_unlock_irq(&q->queue_lock);
-out:
        bio_set_flag(bio, BIO_THROTTLED);
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
new file mode 100644
index 0000000..175f03a
--- /dev/null
@@ -0,0 +1,182 @@
+#ifndef BLK_THROTTLE_H
+#define BLK_THROTTLE_H
+
+#include "blk-cgroup-rwstat.h"
+
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued.  When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from.  When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's.  A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+       struct list_head        node;           /* service_queue->queued[] */
+       struct bio_list         bios;           /* queued bios */
+       struct throtl_grp       *tg;            /* tg this qnode belongs to */
+};
+
+struct throtl_service_queue {
+       struct throtl_service_queue *parent_sq; /* the parent service_queue */
+
+       /*
+        * Bios queued directly to this service_queue or dispatched from
+        * children throtl_grp's.
+        */
+       struct list_head        queued[2];      /* throtl_qnode [READ/WRITE] */
+       unsigned int            nr_queued[2];   /* number of queued bios */
+
+       /*
+        * RB tree of active children throtl_grp's, which are sorted by
+        * their ->disptime.
+        */
+       struct rb_root_cached   pending_tree;   /* RB tree of active tgs */
+       unsigned int            nr_pending;     /* # queued in the tree */
+       unsigned long           first_pending_disptime; /* disptime of the first tg */
+       struct timer_list       pending_timer;  /* fires on first_pending_disptime */
+};
+
+enum {
+       LIMIT_LOW,
+       LIMIT_MAX,
+       LIMIT_CNT,
+};
+
+struct throtl_grp {
+       /* must be the first member */
+       struct blkg_policy_data pd;
+
+       /* active throtl group service_queue member */
+       struct rb_node rb_node;
+
+       /* throtl_data this group belongs to */
+       struct throtl_data *td;
+
+       /* this group's service queue */
+       struct throtl_service_queue service_queue;
+
+       /*
+        * qnode_on_self is used when bios are directly queued to this
+        * throtl_grp so that local bios compete fairly with bios
+        * dispatched from children.  qnode_on_parent is used when bios are
+        * dispatched from this throtl_grp into its parent and will compete
+        * with the sibling qnode_on_parents and the parent's
+        * qnode_on_self.
+        */
+       struct throtl_qnode qnode_on_self[2];
+       struct throtl_qnode qnode_on_parent[2];
+
+       /*
+        * Dispatch time in jiffies. This is the estimated time when group
+        * will unthrottle and is ready to dispatch more bio. It is used as
+        * key to sort active groups in service tree.
+        */
+       unsigned long disptime;
+
+       unsigned int flags;
+
+       /* are there any throtl rules between this group and td? */
+       bool has_rules[2];
+
+       /* internally used bytes per second rate limits */
+       uint64_t bps[2][LIMIT_CNT];
+       /* user configured bps limits */
+       uint64_t bps_conf[2][LIMIT_CNT];
+
+       /* internally used IOPS limits */
+       unsigned int iops[2][LIMIT_CNT];
+       /* user configured IOPS limits */
+       unsigned int iops_conf[2][LIMIT_CNT];
+
+       /* Number of bytes dispatched in current slice */
+       uint64_t bytes_disp[2];
+       /* Number of bio's dispatched in current slice */
+       unsigned int io_disp[2];
+
+       unsigned long last_low_overflow_time[2];
+
+       uint64_t last_bytes_disp[2];
+       unsigned int last_io_disp[2];
+
+       unsigned long last_check_time;
+
+       unsigned long latency_target; /* us */
+       unsigned long latency_target_conf; /* us */
+       /* When did we start a new slice */
+       unsigned long slice_start[2];
+       unsigned long slice_end[2];
+
+       unsigned long last_finish_time; /* ns / 1024 */
+       unsigned long checked_last_finish_time; /* ns / 1024 */
+       unsigned long avg_idletime; /* ns / 1024 */
+       unsigned long idletime_threshold; /* us */
+       unsigned long idletime_threshold_conf; /* us */
+
+       unsigned int bio_cnt; /* total bios */
+       unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+       unsigned long bio_cnt_reset_time;
+
+       atomic_t io_split_cnt[2];
+       atomic_t last_io_split_cnt[2];
+
+       struct blkg_rwstat stat_bytes;
+       struct blkg_rwstat stat_ios;
+};
+
+extern struct blkcg_policy blkcg_policy_throtl;
+
+static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
+{
+       return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
+}
+
+static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
+{
+       return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
+}
+
+/*
+ * Internal throttling interface
+ */
+#ifndef CONFIG_BLK_DEV_THROTTLING
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
+static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+#else /* CONFIG_BLK_DEV_THROTTLING */
+int blk_throtl_init(struct request_queue *q);
+void blk_throtl_exit(struct request_queue *q);
+void blk_throtl_register_queue(struct request_queue *q);
+void blk_throtl_charge_bio_split(struct bio *bio);
+bool __blk_throtl_bio(struct bio *bio);
+static inline bool blk_throtl_bio(struct bio *bio)
+{
+       struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
+
+       if (bio_flagged(bio, BIO_THROTTLED))
+               return false;
+       if (!tg->has_rules[bio_data_dir(bio)])
+               return false;
+
+       return __blk_throtl_bio(bio);
+}
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
+#endif
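The pd_to_tg()/blkg_to_tg() helpers above are plain container_of() lookups: given a pointer to the embedded blkg_policy_data, they recover the enclosing throtl_grp. A minimal user-space sketch of the same pattern, with illustrative stand-in types (not kernel API):

#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-ins for blkg_policy_data / throtl_grp. */
struct policy_data { int id; };

struct group {
        struct policy_data pd;   /* must be the first member, as in throtl_grp */
        unsigned long disptime;
};

/* Same idea as the kernel's container_of(): subtract the member offset. */
#define container_of_demo(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static struct group *pd_to_group(struct policy_data *pd)
{
        return pd ? container_of_demo(pd, struct group, pd) : NULL;
}

int main(void)
{
        struct group g = { .pd = { .id = 42 }, .disptime = 100 };
        struct policy_data *pd = &g.pd;

        /* Recover the enclosing group from the embedded member. */
        printf("disptime=%lu id=%d\n",
               pd_to_group(pd)->disptime, pd_to_group(pd)->id);
        return 0;
}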
index 874c1c3..0c119be 100644 (file)
@@ -357,6 +357,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
        unsigned int inflight = wbt_inflight(rwb);
        int status;
 
+       if (!rwb->rqos.q->disk)
+               return;
+
        status = latency_exceeded(rwb, cb->stat);
 
        trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step,
index 6c3c00a..7afffd5 100644 (file)
@@ -12,6 +12,8 @@
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
 
+struct elevator_type;
+
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT                (5 * HZ)
 
@@ -94,6 +96,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
        return __bvec_gap_to_prev(q, bprv, offset);
 }
 
+static inline bool rq_mergeable(struct request *rq)
+{
+       if (blk_rq_is_passthrough(rq))
+               return false;
+
+       if (req_op(rq) == REQ_OP_FLUSH)
+               return false;
+
+       if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+               return false;
+
+       if (req_op(rq) == REQ_OP_ZONE_APPEND)
+               return false;
+
+       if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
+               return false;
+       if (rq->rq_flags & RQF_NOMERGE_FLAGS)
+               return false;
+
+       return true;
+}
+
+/*
+ * There are two different ways to handle DISCARD merges:
+ *  1) If max_discard_segments > 1, the driver treats every bio as a range and
+ *     sends the bios to the controller together. The ranges don't need to be
+ *     contiguous.
+ *  2) Otherwise, the request is handled as a normal read/write request, so the
+ *     ranges need to be contiguous.
+ */
+static inline bool blk_discard_mergable(struct request *req)
+{
+       if (req_op(req) == REQ_OP_DISCARD &&
+           queue_max_discard_segments(req->q) > 1)
+               return true;
+       return false;
+}
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 void blk_flush_integrity(void);
 bool __bio_integrity_endio(struct bio *);
@@ -175,21 +215,28 @@ static inline void blk_integrity_del(struct gendisk *disk)
 
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
+void blk_print_req_error(struct request *req, blk_status_t status);
 
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-               unsigned int nr_segs, struct request **same_queue_rq);
+               unsigned int nr_segs, bool *same_queue_rq);
 bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
                        struct bio *bio, unsigned int nr_segs);
 
-void blk_account_io_start(struct request *req);
-void blk_account_io_done(struct request *req, u64 now);
+void __blk_account_io_start(struct request *req);
+void __blk_account_io_done(struct request *req, u64 now);
+
+/*
+ * Plug flush limits
+ */
+#define BLK_MAX_REQUEST_COUNT  32
+#define BLK_PLUG_FLUSH_SIZE    (128 * 1024)
 
 /*
  * Internal elevator interface
  */
 #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
 
-void blk_insert_flush(struct request *rq);
+bool blk_insert_flush(struct request *rq);
 
 int elevator_switch_mq(struct request_queue *q,
                              struct elevator_type *new_e);
@@ -202,7 +249,7 @@ static inline void elevator_exit(struct request_queue *q,
 {
        lockdep_assert_held(&q->sysfs_lock);
 
-       blk_mq_sched_free_requests(q);
+       blk_mq_sched_free_rqs(q);
        __elevator_exit(q, e);
 }
 
@@ -220,7 +267,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
 ssize_t part_timeout_store(struct device *, struct device_attribute *,
                                const char *, size_t);
 
-void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
+static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+{
+       switch (bio_op(bio)) {
+       case REQ_OP_DISCARD:
+       case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_ZEROES:
+       case REQ_OP_WRITE_SAME:
+               return true; /* non-trivial splitting decisions */
+       default:
+               break;
+       }
+
+       /*
+        * All drivers must accept single-segment bios that are <= PAGE_SIZE.
+        * This is a quick and dirty check that relies on the fact that
+        * bi_io_vec[0] is always valid if a bio has data.  The check might
+        * lead to occasional false negatives when bios are cloned, but compared
+        * to the performance impact of cloned bios themselves this extra check
+        * doesn't matter anyway.
+        */
+       return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
+               bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
+}
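The fast path of blk_may_split() boils down to one arithmetic test: on a queue without chunk_sectors, a bio with a single vector whose data does not cross the PAGE_SIZE boundary never needs splitting. A small user-space sketch of that check, with PAGE_SIZE and the vector fields stood in by plain parameters (illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096u   /* stand-in for PAGE_SIZE */

/*
 * Mirror of the cheap check: only one segment, and offset + length
 * still fits inside a single page, so no split is needed.
 */
static bool needs_split(unsigned int nr_vecs, unsigned int bv_len,
                        unsigned int bv_offset, bool has_chunk_sectors)
{
        return has_chunk_sectors || nr_vecs != 1 ||
               bv_len + bv_offset > DEMO_PAGE_SIZE;
}

int main(void)
{
        printf("%d\n", needs_split(1, 4096, 0, false));   /* 0: fits in a page */
        printf("%d\n", needs_split(1, 4096, 512, false)); /* 1: crosses a page */
        printf("%d\n", needs_split(2, 512, 0, false));    /* 1: multiple segments */
        return 0;
}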
+
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+                       unsigned int *nr_segs);
 int ll_back_merge_fn(struct request *req, struct bio *bio,
                unsigned int nr_segs);
 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -240,7 +312,25 @@ int blk_dev_init(void);
  */
 static inline bool blk_do_io_stat(struct request *rq)
 {
-       return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT);
+       return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk;
+}
+
+static inline void blk_account_io_done(struct request *req, u64 now)
+{
+       /*
+        * Account IO completion.  flush_rq isn't accounted as a
+        * normal IO on either queueing or completion.  Accounting the
+        * containing request is enough.
+        */
+       if (blk_do_io_stat(req) && req->part &&
+           !(req->rq_flags & RQF_FLUSH_SEQ))
+               __blk_account_io_done(req, now);
+}
+
+static inline void blk_account_io_start(struct request *req)
+{
+       if (blk_do_io_stat(req))
+               __blk_account_io_start(req);
 }
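blk_account_io_start()/blk_account_io_done() become inline wrappers that test the cheap blk_do_io_stat() condition before calling the out-of-line __blk_account_io_*() slow paths. A user-space sketch of the same "inline gate, out-of-line body" split; the names and flag value here are illustrative, not kernel API:

#include <stdio.h>

#define RQF_IO_STAT_DEMO 0x1u   /* stand-in for the RQF_IO_STAT flag */

struct demo_request {
        unsigned int rq_flags;
        unsigned long bytes;
};

/* Out-of-line slow path: only reached when accounting is enabled. */
static void __account_io_done(struct demo_request *rq)
{
        printf("accounted %lu bytes\n", rq->bytes);
}

/* Inline gate: the common "stats disabled" case costs one flag test. */
static inline void account_io_done(struct demo_request *rq)
{
        if (rq->rq_flags & RQF_IO_STAT_DEMO)
                __account_io_done(rq);
}

int main(void)
{
        struct demo_request on  = { .rq_flags = RQF_IO_STAT_DEMO, .bytes = 4096 };
        struct demo_request off = { .rq_flags = 0, .bytes = 4096 };

        account_io_done(&on);   /* prints */
        account_io_done(&off);  /* no-op */
        return 0;
}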
 
 static inline void req_set_nomerge(struct request_queue *q, struct request *req)
@@ -285,22 +375,6 @@ void ioc_clear_queue(struct request_queue *q);
 
 int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
 
-/*
- * Internal throttling interface
- */
-#ifdef CONFIG_BLK_DEV_THROTTLING
-extern int blk_throtl_init(struct request_queue *q);
-extern void blk_throtl_exit(struct request_queue *q);
-extern void blk_throtl_register_queue(struct request_queue *q);
-extern void blk_throtl_charge_bio_split(struct bio *bio);
-bool blk_throtl_bio(struct bio *bio);
-#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline int blk_throtl_init(struct request_queue *q) { return 0; }
-static inline void blk_throtl_exit(struct request_queue *q) { }
-static inline void blk_throtl_register_queue(struct request_queue *q) { }
-static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
-static inline bool blk_throtl_bio(struct bio *bio) { return false; }
-#endif /* CONFIG_BLK_DEV_THROTTLING */
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
 extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@@ -368,13 +442,20 @@ extern struct device_attribute dev_attr_events;
 extern struct device_attribute dev_attr_events_async;
 extern struct device_attribute dev_attr_events_poll_msecs;
 
-static inline void bio_clear_hipri(struct bio *bio)
+static inline void bio_clear_polled(struct bio *bio)
 {
        /* can't support alloc cache if we turn off polling */
        bio_clear_flag(bio, BIO_PERCPU_CACHE);
-       bio->bi_opf &= ~REQ_HIPRI;
+       bio->bi_opf &= ~REQ_POLLED;
 }
 
+long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+
 extern const struct address_space_operations def_blk_aops;
 
+int disk_register_independent_access_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *new_iars);
+void disk_unregister_independent_access_ranges(struct gendisk *disk);
+
 #endif /* BLK_INTERNAL_H */
index 05fc714..7af1a72 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
 #include <linux/blkdev.h>
+#include <linux/blk-cgroup.h>
 #include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/hash.h>
index ccb9827..10aa378 100644 (file)
@@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
        struct bsg_job *job;
        struct request *rq;
        struct bio *bio;
+       void *reply;
        int ret;
 
        if (hdr->protocol != BSG_PROTOCOL_SCSI  ||
@@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;
 
-       rq = blk_get_request(q, hdr->dout_xfer_len ?
+       rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ?
                             REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        rq->timeout = timeout;
 
        job = blk_mq_rq_to_pdu(rq);
+       reply = job->reply;
+       memset(job, 0, sizeof(*job));
+       job->reply = reply;
+       job->reply_len = SCSI_SENSE_BUFFERSIZE;
+       job->dd_data = job + 1;
+
        job->request_len = hdr->request_len;
        job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
        if (IS_ERR(job->request)) {
                ret = PTR_ERR(job->request);
-               goto out_put_request;
+               goto out_free_rq;
        }
 
        if (hdr->dout_xfer_len && hdr->din_xfer_len) {
-               job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0);
+               job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0);
                if (IS_ERR(job->bidi_rq)) {
                        ret = PTR_ERR(job->bidi_rq);
                        goto out_free_job_request;
@@ -134,11 +141,11 @@ out_unmap_bidi_rq:
                blk_rq_unmap_user(job->bidi_bio);
 out_free_bidi_rq:
        if (job->bidi_rq)
-               blk_put_request(job->bidi_rq);
+               blk_mq_free_request(job->bidi_rq);
 out_free_job_request:
        kfree(job->request);
-out_put_request:
-       blk_put_request(rq);
+out_free_rq:
+       blk_mq_free_request(rq);
        return ret;
 }
 
@@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
        return 0;
 }
 
-/* called right before the request is given to the request_queue user */
-static void bsg_initialize_rq(struct request *req)
-{
-       struct bsg_job *job = blk_mq_rq_to_pdu(req);
-       void *reply = job->reply;
-
-       memset(job, 0, sizeof(*job));
-       job->reply = reply;
-       job->reply_len = SCSI_SENSE_BUFFERSIZE;
-       job->dd_data = job + 1;
-}
-
 static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
                       unsigned int hctx_idx)
 {
@@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = {
        .queue_rq               = bsg_queue_rq,
        .init_request           = bsg_init_rq,
        .exit_request           = bsg_exit_rq,
-       .initialize_rq_fn       = bsg_initialize_rq,
        .complete               = bsg_complete,
        .timeout                = bsg_timeout,
 };
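With .initialize_rq_fn gone, bsg_transport_sg_io_fn() now clears the per-request pdu itself while keeping the preallocated reply buffer: save the pointer, memset() the structure, restore it. A tiny user-space sketch of that save/clear/restore pattern; the types, field names and sizes here are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_job {
        void *reply;            /* preallocated elsewhere; must survive reset */
        unsigned int reply_len;
        unsigned int request_len;
};

/* Reset the job for reuse without losing the preallocated reply buffer. */
static void reset_job(struct demo_job *job)
{
        void *reply = job->reply;       /* save */

        memset(job, 0, sizeof(*job));   /* clear everything */
        job->reply = reply;             /* restore */
        job->reply_len = 96;            /* arbitrary demo size */
}

int main(void)
{
        struct demo_job job = { .reply = malloc(96), .request_len = 123 };

        reset_job(&job);
        printf("request_len=%u reply=%p\n", job.request_len, job.reply);
        free(job.reply);
        return 0;
}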
index ff45d83..1f39f6e 100644 (file)
@@ -26,7 +26,6 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
-#include <linux/elevator.h>
 #include <linux/bio.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -40,6 +39,7 @@
 
 #include <trace/events/block.h>
 
+#include "elevator.h"
 #include "blk.h"
 #include "blk-mq-sched.h"
 #include "blk-pm.h"
@@ -637,7 +637,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
                return NULL;
 
        if (q->nr_hw_queues != 1 &&
-                       !blk_mq_is_sbitmap_shared(q->tag_set->flags))
+           !blk_mq_is_shared_tags(q->tag_set->flags))
                return NULL;
 
        return elevator_get(q, "mq-deadline", false);
similarity index 92%
rename from include/linux/elevator.h
rename to block/elevator.h
index ef9ceea..16cd8bd 100644 (file)
@@ -1,17 +1,13 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_ELEVATOR_H
-#define _LINUX_ELEVATOR_H
+#ifndef _ELEVATOR_H
+#define _ELEVATOR_H
 
 #include <linux/percpu.h>
 #include <linux/hashtable.h>
 
-#ifdef CONFIG_BLOCK
-
 struct io_cq;
 struct elevator_type;
-#ifdef CONFIG_BLK_DEBUG_FS
 struct blk_mq_debugfs_attr;
-#endif
 
 /*
  * Return values from elevator merger
@@ -162,20 +158,9 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_FLUSH  5
 #define ELEVATOR_INSERT_SORT_MERGE     6
 
-#define rq_end_sector(rq)      (blk_rq_pos(rq) + blk_rq_sectors(rq))
 #define rb_entry_rq(node)      rb_entry((node), struct request, rb_node)
 
 #define rq_entry_fifo(ptr)     list_entry((ptr), struct request, queuelist)
 #define rq_fifo_clear(rq)      list_del_init(&(rq)->queuelist)
 
-/*
- * Elevator features.
- */
-
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE       (1U << 0)
-/* Supports scheduling on multiple hardware queues */
-#define ELEVATOR_F_MQ_AWARE            (1U << 1)
-
-#endif /* CONFIG_BLOCK */
-#endif
+#endif /* _ELEVATOR_H */
index 1e970c2..4e22b07 100644 (file)
@@ -17,7 +17,7 @@
 #include <linux/fs.h>
 #include "blk.h"
 
-static struct inode *bdev_file_inode(struct file *file)
+static inline struct inode *bdev_file_inode(struct file *file)
 {
        return file->f_mapping->host;
 }
@@ -54,14 +54,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
                struct iov_iter *iter, unsigned int nr_pages)
 {
-       struct file *file = iocb->ki_filp;
-       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+       struct block_device *bdev = iocb->ki_filp->private_data;
        struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
        loff_t pos = iocb->ki_pos;
        bool should_dirty = false;
        struct bio bio;
        ssize_t ret;
-       blk_qc_t qc;
 
        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
@@ -78,7 +76,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 
        bio_init(&bio, vecs, nr_pages);
        bio_set_dev(&bio, bdev);
-       bio.bi_iter.bi_sector = pos >> 9;
+       bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
        bio.bi_write_hint = iocb->ki_hint;
        bio.bi_private = current;
        bio.bi_end_io = blkdev_bio_end_io_simple;
@@ -102,13 +100,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
        if (iocb->ki_flags & IOCB_HIPRI)
                bio_set_polled(&bio, iocb);
 
-       qc = submit_bio(&bio);
+       submit_bio(&bio);
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio.bi_private))
                        break;
-               if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_poll(bdev_get_queue(bdev), qc, true))
+               if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
@@ -126,6 +123,11 @@ out:
        return ret;
 }
 
+enum {
+       DIO_SHOULD_DIRTY        = 1,
+       DIO_IS_SYNC             = 2,
+};
+
 struct blkdev_dio {
        union {
                struct kiocb            *iocb;
@@ -133,35 +135,27 @@ struct blkdev_dio {
        };
        size_t                  size;
        atomic_t                ref;
-       bool                    multi_bio : 1;
-       bool                    should_dirty : 1;
-       bool                    is_sync : 1;
-       struct bio              bio;
+       unsigned int            flags;
+       struct bio              bio ____cacheline_aligned_in_smp;
 };
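struct blkdev_dio drops its three bool bitfields in favour of a single flags word tested against DIO_IS_SYNC and DIO_SHOULD_DIRTY. A user-space sketch of the flag manipulation this enables; the constants mirror the enum above, everything else is illustrative:

#include <stddef.h>
#include <stdio.h>

enum {
        DIO_SHOULD_DIRTY = 1,
        DIO_IS_SYNC      = 2,
};

struct demo_dio {
        unsigned int flags;
        size_t size;
};

int main(void)
{
        struct demo_dio dio = { .flags = 0 };

        /* Equivalent of the old dio->is_sync = true / should_dirty = true. */
        dio.flags |= DIO_IS_SYNC;
        dio.flags |= DIO_SHOULD_DIRTY;

        if (dio.flags & DIO_IS_SYNC)
                printf("synchronous dio\n");
        if (dio.flags & DIO_SHOULD_DIRTY)
                printf("pages will be dirtied on completion\n");
        return 0;
}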
 
 static struct bio_set blkdev_dio_pool;
 
-static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
-{
-       struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
-       struct request_queue *q = bdev_get_queue(bdev);
-
-       return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
-}
-
 static void blkdev_bio_end_io(struct bio *bio)
 {
        struct blkdev_dio *dio = bio->bi_private;
-       bool should_dirty = dio->should_dirty;
+       bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
 
        if (bio->bi_status && !dio->bio.bi_status)
                dio->bio.bi_status = bio->bi_status;
 
-       if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
-               if (!dio->is_sync) {
+       if (atomic_dec_and_test(&dio->ref)) {
+               if (!(dio->flags & DIO_IS_SYNC)) {
                        struct kiocb *iocb = dio->iocb;
                        ssize_t ret;
 
+                       WRITE_ONCE(iocb->private, NULL);
+
                        if (likely(!dio->bio.bi_status)) {
                                ret = dio->size;
                                iocb->ki_pos += ret;
@@ -169,9 +163,8 @@ static void blkdev_bio_end_io(struct bio *bio)
                                ret = blk_status_to_errno(dio->bio.bi_status);
                        }
 
-                       dio->iocb->ki_complete(iocb, ret, 0);
-                       if (dio->multi_bio)
-                               bio_put(&dio->bio);
+                       dio->iocb->ki_complete(iocb, ret);
+                       bio_put(&dio->bio);
                } else {
                        struct task_struct *waiter = dio->waiter;
 
@@ -191,16 +184,12 @@ static void blkdev_bio_end_io(struct bio *bio)
 static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                unsigned int nr_pages)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = bdev_file_inode(file);
-       struct block_device *bdev = I_BDEV(inode);
+       struct block_device *bdev = iocb->ki_filp->private_data;
        struct blk_plug plug;
        struct blkdev_dio *dio;
        struct bio *bio;
-       bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
        bool is_read = (iov_iter_rw(iter) == READ), is_sync;
        loff_t pos = iocb->ki_pos;
-       blk_qc_t qc = BLK_QC_T_NONE;
        int ret = 0;
 
        if ((pos | iov_iter_alignment(iter)) &
@@ -210,28 +199,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
        bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
 
        dio = container_of(bio, struct blkdev_dio, bio);
-       dio->is_sync = is_sync = is_sync_kiocb(iocb);
-       if (dio->is_sync) {
+       atomic_set(&dio->ref, 1);
+       /*
+        * Grab an extra reference to ensure the dio structure, which is embedded
+        * into the first bio, stays around.
+        */
+       bio_get(bio);
+
+       is_sync = is_sync_kiocb(iocb);
+       if (is_sync) {
+               dio->flags = DIO_IS_SYNC;
                dio->waiter = current;
-               bio_get(bio);
        } else {
+               dio->flags = 0;
                dio->iocb = iocb;
        }
 
        dio->size = 0;
-       dio->multi_bio = false;
-       dio->should_dirty = is_read && iter_is_iovec(iter);
+       if (is_read && iter_is_iovec(iter))
+               dio->flags |= DIO_SHOULD_DIRTY;
 
-       /*
-        * Don't plug for HIPRI/polled IO, as those should go straight
-        * to issue
-        */
-       if (!is_poll)
-               blk_start_plug(&plug);
+       blk_start_plug(&plug);
 
        for (;;) {
                bio_set_dev(bio, bdev);
-               bio->bi_iter.bi_sector = pos >> 9;
+               bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
                bio->bi_write_hint = iocb->ki_hint;
                bio->bi_private = dio;
                bio->bi_end_io = blkdev_bio_end_io;
@@ -246,7 +238,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
                if (is_read) {
                        bio->bi_opf = REQ_OP_READ;
-                       if (dio->should_dirty)
+                       if (dio->flags & DIO_SHOULD_DIRTY)
                                bio_set_pages_dirty(bio);
                } else {
                        bio->bi_opf = dio_bio_write_op(iocb);
@@ -260,40 +252,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
                nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
                if (!nr_pages) {
-                       bool polled = false;
-
-                       if (iocb->ki_flags & IOCB_HIPRI) {
-                               bio_set_polled(bio, iocb);
-                               polled = true;
-                       }
-
-                       qc = submit_bio(bio);
-
-                       if (polled)
-                               WRITE_ONCE(iocb->ki_cookie, qc);
+                       submit_bio(bio);
                        break;
                }
-
-               if (!dio->multi_bio) {
-                       /*
-                        * AIO needs an extra reference to ensure the dio
-                        * structure which is embedded into the first bio
-                        * stays around.
-                        */
-                       if (!is_sync)
-                               bio_get(bio);
-                       dio->multi_bio = true;
-                       atomic_set(&dio->ref, 2);
-               } else {
-                       atomic_inc(&dio->ref);
-               }
-
+               atomic_inc(&dio->ref);
                submit_bio(bio);
                bio = bio_alloc(GFP_KERNEL, nr_pages);
        }
 
-       if (!is_poll)
-               blk_finish_plug(&plug);
+       blk_finish_plug(&plug);
 
        if (!is_sync)
                return -EIOCBQUEUED;
@@ -302,10 +269,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(dio->waiter))
                        break;
-
-               if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_poll(bdev_get_queue(bdev), qc, true))
-                       blk_io_schedule();
+               blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
 
@@ -318,6 +282,94 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
        return ret;
 }
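__blkdev_direct_IO() now always starts dio->ref at 1, takes one extra bio_get() so the dio embedded in the first bio outlives the submitter, and bumps the count once per additional bio; the end_io handler finishes the I/O only when atomic_dec_and_test() reaches zero. A user-space sketch of that reference-counting shape using C11 atomics, purely illustrative:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ref;

/* Completion side: the last dropped reference performs the final work. */
static void complete_one(void)
{
        if (atomic_fetch_sub(&ref, 1) == 1)
                printf("last reference dropped, run completion\n");
}

int main(void)
{
        int nbios = 3;

        atomic_store(&ref, 1);              /* reference held by the first bio */
        for (int i = 1; i < nbios; i++)
                atomic_fetch_add(&ref, 1);  /* one more per extra bio */

        for (int i = 0; i < nbios; i++)
                complete_one();             /* only the third call finishes */
        return 0;
}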
 
+static void blkdev_bio_end_io_async(struct bio *bio)
+{
+       struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
+       struct kiocb *iocb = dio->iocb;
+       ssize_t ret;
+
+       if (likely(!bio->bi_status)) {
+               ret = dio->size;
+               iocb->ki_pos += ret;
+       } else {
+               ret = blk_status_to_errno(bio->bi_status);
+       }
+
+       iocb->ki_complete(iocb, ret);
+
+       if (dio->flags & DIO_SHOULD_DIRTY) {
+               bio_check_pages_dirty(bio);
+       } else {
+               bio_release_pages(bio, false);
+               bio_put(bio);
+       }
+}
+
+static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
+                                       struct iov_iter *iter,
+                                       unsigned int nr_pages)
+{
+       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct blkdev_dio *dio;
+       struct bio *bio;
+       loff_t pos = iocb->ki_pos;
+       int ret = 0;
+
+       if ((pos | iov_iter_alignment(iter)) &
+           (bdev_logical_block_size(bdev) - 1))
+               return -EINVAL;
+
+       bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+       dio = container_of(bio, struct blkdev_dio, bio);
+       dio->flags = 0;
+       dio->iocb = iocb;
+       bio_set_dev(bio, bdev);
+       bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+       bio->bi_write_hint = iocb->ki_hint;
+       bio->bi_end_io = blkdev_bio_end_io_async;
+       bio->bi_ioprio = iocb->ki_ioprio;
+
+       if (iov_iter_is_bvec(iter)) {
+               /*
+                * Users don't rely on the iterator being in any particular
+                * state for async I/O returning -EIOCBQUEUED, hence we can
+                * avoid expensive iov_iter_advance(). Bypass
+                * bio_iov_iter_get_pages() and set the bvec directly.
+                */
+               bio_iov_bvec_set(bio, iter);
+       } else {
+               ret = bio_iov_iter_get_pages(bio, iter);
+               if (unlikely(ret)) {
+                       bio->bi_status = BLK_STS_IOERR;
+                       bio_endio(bio);
+                       return ret;
+               }
+       }
+       dio->size = bio->bi_iter.bi_size;
+
+       if (iov_iter_rw(iter) == READ) {
+               bio->bi_opf = REQ_OP_READ;
+               if (iter_is_iovec(iter)) {
+                       dio->flags |= DIO_SHOULD_DIRTY;
+                       bio_set_pages_dirty(bio);
+               }
+       } else {
+               bio->bi_opf = dio_bio_write_op(iocb);
+               task_io_account_write(bio->bi_iter.bi_size);
+       }
+
+       if (iocb->ki_flags & IOCB_HIPRI) {
+               bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
+               submit_bio(bio);
+               WRITE_ONCE(iocb->private, bio);
+       } else {
+               if (iocb->ki_flags & IOCB_NOWAIT)
+                       bio->bi_opf |= REQ_NOWAIT;
+               submit_bio(bio);
+       }
+       return -EIOCBQUEUED;
+}
+
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
        unsigned int nr_pages;
@@ -326,9 +378,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                return 0;
 
        nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
-       if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
-               return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-
+       if (likely(nr_pages <= BIO_MAX_VECS)) {
+               if (is_sync_kiocb(iocb))
+                       return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+               return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+       }
        return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
 }
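All three direct-IO paths above reject a request whose position or buffer alignment is not a multiple of the device's logical block size. A user-space reminder of what that means for callers opening a block device with O_DIRECT; the device path and the 4096-byte alignment are illustrative choices:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        const char *dev = "/dev/sdX";   /* illustrative device node */
        int fd = open(dev, O_RDONLY | O_DIRECT);
        void *buf;

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Buffer, offset and length all aligned to the logical block size
         * (4096 covers most devices). */
        if (posix_memalign(&buf, 4096, 4096)) {
                close(fd);
                return 1;
        }
        if (pread(fd, buf, 4096, 0) < 0)
                perror("pread");

        free(buf);
        close(fd);
        return 0;
}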
 
@@ -405,8 +459,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
 static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                int datasync)
 {
-       struct inode *bd_inode = bdev_file_inode(filp);
-       struct block_device *bdev = I_BDEV(bd_inode);
+       struct block_device *bdev = filp->private_data;
        int error;
 
        error = file_write_and_wait_range(filp, start, end);
@@ -448,6 +501,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
        bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
+
+       filp->private_data = bdev;
        filp->f_mapping = bdev->bd_inode->i_mapping;
        filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
        return 0;
@@ -455,29 +510,12 @@ static int blkdev_open(struct inode *inode, struct file *filp)
 
 static int blkdev_close(struct inode *inode, struct file *filp)
 {
-       struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
+       struct block_device *bdev = filp->private_data;
 
        blkdev_put(bdev, filp->f_mode);
        return 0;
 }
 
-static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
-       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-       fmode_t mode = file->f_mode;
-
-       /*
-        * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
-        * to updated it before every ioctl.
-        */
-       if (file->f_flags & O_NDELAY)
-               mode |= FMODE_NDELAY;
-       else
-               mode &= ~FMODE_NDELAY;
-
-       return blkdev_ioctl(bdev, mode, cmd, arg);
-}
-
 /*
  * Write data to the block device.  Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -487,14 +525,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
  */
 static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *bd_inode = bdev_file_inode(file);
+       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct inode *bd_inode = bdev->bd_inode;
        loff_t size = i_size_read(bd_inode);
        struct blk_plug plug;
        size_t shorted = 0;
        ssize_t ret;
 
-       if (bdev_read_only(I_BDEV(bd_inode)))
+       if (bdev_read_only(bdev))
                return -EPERM;
 
        if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
@@ -526,24 +564,26 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *bd_inode = bdev_file_inode(file);
-       loff_t size = i_size_read(bd_inode);
+       struct block_device *bdev = iocb->ki_filp->private_data;
+       loff_t size = i_size_read(bdev->bd_inode);
        loff_t pos = iocb->ki_pos;
        size_t shorted = 0;
        ssize_t ret;
 
-       if (pos >= size)
-               return 0;
-
-       size -= pos;
-       if (iov_iter_count(to) > size) {
-               shorted = iov_iter_count(to) - size;
-               iov_iter_truncate(to, size);
+       if (unlikely(pos + iov_iter_count(to) > size)) {
+               if (pos >= size)
+                       return 0;
+               size -= pos;
+               if (iov_iter_count(to) > size) {
+                       shorted = iov_iter_count(to) - size;
+                       iov_iter_truncate(to, size);
+               }
        }
 
        ret = generic_file_read_iter(iocb, to);
-       iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+
+       if (unlikely(shorted))
+               iov_iter_reexpand(to, iov_iter_count(to) + shorted);
        return ret;
 }
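The reworked blkdev_read_iter() clamps a read that would run past the end of the device: if the start is already at or beyond the device size it returns 0, otherwise it trims the count and remembers how much was cut so the iterator can be re-expanded afterwards. The arithmetic in isolation, as a user-space sketch rather than kernel API:

#include <stdio.h>

/* Returns the number of bytes that may actually be read; *shorted gets the
 * amount trimmed off the caller's request, mirroring blkdev_read_iter(). */
static unsigned long long clamp_read(unsigned long long pos,
                                     unsigned long long count,
                                     unsigned long long dev_size,
                                     unsigned long long *shorted)
{
        *shorted = 0;
        if (pos + count > dev_size) {
                if (pos >= dev_size)
                        return 0;
                if (count > dev_size - pos) {
                        *shorted = count - (dev_size - pos);
                        count = dev_size - pos;
                }
        }
        return count;
}

int main(void)
{
        unsigned long long shorted;

        printf("%llu\n", clamp_read(4096, 8192, 8192, &shorted)); /* 4096 */
        printf("trimmed %llu\n", shorted);                        /* 4096 */
        printf("%llu\n", clamp_read(8192, 512, 8192, &shorted));  /* 0 */
        return 0;
}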
 
@@ -565,7 +605,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
                return -EOPNOTSUPP;
 
        /* Don't go off the end of the device. */
-       isize = i_size_read(bdev->bd_inode);
+       isize = bdev_nr_bytes(bdev);
        if (start >= isize)
                return -EINVAL;
        if (end >= isize) {
@@ -592,16 +632,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
        switch (mode) {
        case FALLOC_FL_ZERO_RANGE:
        case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
-               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-                                           GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
+               error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+                                            len >> SECTOR_SHIFT, GFP_KERNEL,
+                                            BLKDEV_ZERO_NOUNMAP);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
-               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-                                            GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
+               error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+                                            len >> SECTOR_SHIFT, GFP_KERNEL,
+                                            BLKDEV_ZERO_NOFALLBACK);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
-               error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
-                                            GFP_KERNEL, 0);
+               error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
+                                            len >> SECTOR_SHIFT, GFP_KERNEL, 0);
                break;
        default:
                error = -EOPNOTSUPP;
@@ -618,10 +660,10 @@ const struct file_operations def_blk_fops = {
        .llseek         = blkdev_llseek,
        .read_iter      = blkdev_read_iter,
        .write_iter     = blkdev_write_iter,
-       .iopoll         = blkdev_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .mmap           = generic_file_mmap,
        .fsync          = blkdev_fsync,
-       .unlocked_ioctl = block_ioctl,
+       .unlocked_ioctl = blkdev_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_blkdev_ioctl,
 #endif
index b498585..febaaa5 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/log2.h>
@@ -57,6 +58,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors)
 
        spin_lock(&bdev->bd_size_lock);
        i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+       bdev->bd_nr_sectors = sectors;
        spin_unlock(&bdev->bd_size_lock);
 }
 EXPORT_SYMBOL(set_capacity);
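set_capacity() now caches the size in bd_nr_sectors alongside the inode size, and later hunks in this series convert i_size_read(bdev->bd_inode) users to bdev_nr_bytes()/bdev_nr_sectors(). The two views are related by the fixed 512-byte sector, i.e. bytes = sectors << SECTOR_SHIFT with SECTOR_SHIFT = 9. A quick user-space check of that arithmetic (the sector count is an illustrative ~2 TB disk):

#include <stdio.h>

#define SECTOR_SHIFT 9   /* 512-byte sectors, as in the block layer */

int main(void)
{
        unsigned long long sectors = 3907029168ULL;  /* illustrative disk size */
        unsigned long long bytes = sectors << SECTOR_SHIFT;

        printf("%llu sectors = %llu bytes\n", sectors, bytes);
        printf("round trip: %llu sectors\n", bytes >> SECTOR_SHIFT);
        return 0;
}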
@@ -588,16 +590,6 @@ void del_gendisk(struct gendisk *disk)
         * Prevent new I/O from crossing bio_queue_enter().
         */
        blk_queue_start_drain(q);
-       blk_mq_freeze_queue_wait(q);
-
-       rq_qos_exit(q);
-       blk_sync_queue(q);
-       blk_flush_integrity();
-       /*
-        * Allow using passthrough request again after the queue is torn down.
-        */
-       blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
-       __blk_mq_unfreeze_queue(q, true);
 
        if (!(disk->flags & GENHD_FL_HIDDEN)) {
                sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
@@ -620,9 +612,41 @@ void del_gendisk(struct gendisk *disk)
                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
        pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
        device_del(disk_to_dev(disk));
+
+       blk_mq_freeze_queue_wait(q);
+
+       rq_qos_exit(q);
+       blk_sync_queue(q);
+       blk_flush_integrity();
+       /*
+        * Allow using passthrough request again after the queue is torn down.
+        */
+       blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
+       __blk_mq_unfreeze_queue(q, true);
+
 }
 EXPORT_SYMBOL(del_gendisk);
 
+/**
+ * invalidate_disk - invalidate the disk
+ * @disk: the struct gendisk to invalidate
+ *
+ * A helper to invalidate the disk. It will clean the disk's associated
+ * buffer/page caches and reset its internal state so that the disk
+ * can be reused by the drivers.
+ *
+ * Context: can sleep
+ */
+void invalidate_disk(struct gendisk *disk)
+{
+       struct block_device *bdev = disk->part0;
+
+       invalidate_bdev(bdev);
+       bdev->bd_inode->i_mapping->wb_err = 0;
+       set_capacity(disk, 0);
+}
+EXPORT_SYMBOL(invalidate_disk);
+
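A minimal sketch of how a hypothetical driver might use the new helper from a media-change path; the surrounding function is invented for illustration and only builds inside a kernel tree, while invalidate_disk() and set_capacity() are the APIs shown above:

/* Hypothetical driver fragment: drop cached data and shrink the disk to
 * zero while the backing media is swapped, then restore the new size. */
static void demo_swap_media(struct gendisk *disk, sector_t new_sectors)
{
        invalidate_disk(disk);          /* flush caches, capacity -> 0 */
        /* ... reprogram the hardware for the new media (driver specific) ... */
        set_capacity(disk, new_sectors);
}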
 /* sysfs access to bad-blocks list. */
 static ssize_t disk_badblocks_show(struct device *dev,
                                        struct device_attribute *attr,
@@ -882,7 +906,7 @@ ssize_t part_stat_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
 {
        struct block_device *bdev = dev_to_bdev(dev);
-       struct request_queue *q = bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bdev);
        struct disk_stats stat;
        unsigned int inflight;
 
@@ -926,7 +950,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
 {
        struct block_device *bdev = dev_to_bdev(dev);
-       struct request_queue *q = bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bdev);
        unsigned int inflight[2];
 
        if (queue_is_mq(q))
@@ -1266,6 +1290,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
        if (!disk->bdi)
                goto out_free_disk;
 
+       /* bdev_alloc() might need the queue, set before the first call */
+       disk->queue = q;
+
        disk->part0 = bdev_alloc(disk, 0);
        if (!disk->part0)
                goto out_free_bdi;
@@ -1281,7 +1308,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
        disk_to_dev(disk)->type = &disk_type;
        device_initialize(disk_to_dev(disk));
        inc_diskseq(disk);
-       disk->queue = q;
        q->disk = disk;
        lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
 #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
@@ -1386,12 +1412,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only)
 }
 EXPORT_SYMBOL(set_disk_ro);
 
-int bdev_read_only(struct block_device *bdev)
-{
-       return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
-}
-EXPORT_SYMBOL(bdev_read_only);
-
 void inc_diskseq(struct gendisk *disk)
 {
        disk->diskseq = atomic64_inc_return(&diskseq);
index 9dc0841..27cddce 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/genhd.h>
+#include <linux/slab.h>
 
 struct bd_holder_disk {
        struct list_head        list;
index eb0491e..d6af0ac 100644 (file)
@@ -132,7 +132,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
        if (len & 511)
                return -EINVAL;
 
-       if (start + len > i_size_read(bdev->bd_inode))
+       if (start + len > bdev_nr_bytes(bdev))
                return -EINVAL;
 
        err = truncate_bdev_range(bdev, mode, start, start + len - 1);
@@ -164,7 +164,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
                return -EINVAL;
        if (len & 511)
                return -EINVAL;
-       if (end >= (uint64_t)i_size_read(bdev->bd_inode))
+       if (end >= (uint64_t)bdev_nr_bytes(bdev))
                return -EINVAL;
        if (end < start)
                return -EINVAL;
@@ -538,12 +538,21 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
  *
  * New commands must be compatible and go into blkdev_common_ioctl
  */
-int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
-                       unsigned long arg)
+long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-       int ret;
-       loff_t size;
+       struct block_device *bdev = I_BDEV(file->f_mapping->host);
        void __user *argp = (void __user *)arg;
+       fmode_t mode = file->f_mode;
+       int ret;
+
+       /*
+        * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+        * to update it before every ioctl.
+        */
+       if (file->f_flags & O_NDELAY)
+               mode |= FMODE_NDELAY;
+       else
+               mode &= ~FMODE_NDELAY;
 
        switch (cmd) {
        /* These need separate implementations for the data structure */
@@ -560,10 +569,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
                return put_long(argp,
                        (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
        case BLKGETSIZE:
-               size = i_size_read(bdev->bd_inode);
-               if ((size >> 9) > ~0UL)
+               if (bdev_nr_sectors(bdev) > ~0UL)
                        return -EFBIG;
-               return put_ulong(argp, size >> 9);
+               return put_ulong(argp, bdev_nr_sectors(bdev));
 
        /* The data is compatible, but the command number is different */
        case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@@ -571,7 +579,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
        case BLKBSZSET:
                return blkdev_bszset(bdev, mode, argp);
        case BLKGETSIZE64:
-               return put_u64(argp, i_size_read(bdev->bd_inode));
+               return put_u64(argp, bdev_nr_bytes(bdev));
 
        /* Incompatible alignment on i386 */
        case BLKTRACESETUP:
@@ -588,7 +596,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
                return -ENOTTY;
        return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
 }
-EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */
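From user space, the behaviours referenced above look like this: O_NDELAY is toggled with fcntl(F_SETFL) and is re-derived by the kernel on the next ioctl, and BLKGETSIZE64 returns the device size in bytes as a u64. A small example; the device path is illustrative:

#include <fcntl.h>
#include <linux/fs.h>    /* BLKGETSIZE64 */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/sdX", O_RDONLY);    /* illustrative device node */
        uint64_t bytes;
        int flags;

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Toggle O_NDELAY; the kernel recomputes FMODE_NDELAY per ioctl. */
        flags = fcntl(fd, F_GETFL);
        fcntl(fd, F_SETFL, flags | O_NDELAY);

        if (ioctl(fd, BLKGETSIZE64, &bytes) == 0)
                printf("device size: %llu bytes\n",
                       (unsigned long long)bytes);

        close(fd);
        return 0;
}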
 
 #ifdef CONFIG_COMPAT
 
@@ -606,7 +613,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        struct block_device *bdev = I_BDEV(file->f_mapping->host);
        struct gendisk *disk = bdev->bd_disk;
        fmode_t mode = file->f_mode;
-       loff_t size;
 
        /*
         * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@@ -632,10 +638,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                return compat_put_long(argp,
                        (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
        case BLKGETSIZE:
-               size = i_size_read(bdev->bd_inode);
-               if ((size >> 9) > ~0UL)
+               if (bdev_nr_sectors(bdev) > ~0UL)
                        return -EFBIG;
-               return compat_put_ulong(argp, size >> 9);
+               return compat_put_ulong(argp, bdev_nr_sectors(bdev));
 
        /* The data is compatible, but the command number is different */
        case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@@ -643,7 +648,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case BLKBSZSET_32:
                return blkdev_bszset(bdev, mode, argp);
        case BLKGETSIZE64_32:
-               return put_u64(argp, i_size_read(bdev->bd_inode));
+               return put_u64(argp, bdev_nr_bytes(bdev));
 
        /* Incompatible alignment on i386 */
        case BLKTRACESETUP32:
diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
deleted file mode 100644 (file)
index 2c4a55b..0000000
+++ /dev/null
@@ -1,578 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2019 Google LLC
- */
-
-/**
- * DOC: The Keyslot Manager
- *
- * Many devices with inline encryption support have a limited number of "slots"
- * into which encryption contexts may be programmed, and requests can be tagged
- * with a slot number to specify the key to use for en/decryption.
- *
- * As the number of slots is limited, and programming keys is expensive on
- * many inline encryption hardware, we don't want to program the same key into
- * multiple slots - if multiple requests are using the same key, we want to
- * program just one slot with that key and use that slot for all requests.
- *
- * The keyslot manager manages these keyslots appropriately, and also acts as
- * an abstraction between the inline encryption hardware and the upper layers.
- *
- * Lower layer devices will set up a keyslot manager in their request queue
- * and tell it how to perform device specific operations like programming/
- * evicting keys from keyslots.
- *
- * Upper layers will call blk_ksm_get_slot_for_key() to program a
- * key into some slot in the inline encryption hardware.
- */
-
-#define pr_fmt(fmt) "blk-crypto: " fmt
-
-#include <linux/keyslot-manager.h>
-#include <linux/device.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
-#include <linux/pm_runtime.h>
-#include <linux/wait.h>
-#include <linux/blkdev.h>
-
-struct blk_ksm_keyslot {
-       atomic_t slot_refs;
-       struct list_head idle_slot_node;
-       struct hlist_node hash_node;
-       const struct blk_crypto_key *key;
-       struct blk_keyslot_manager *ksm;
-};
-
-static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm)
-{
-       /*
-        * Calling into the driver requires ksm->lock held and the device
-        * resumed.  But we must resume the device first, since that can acquire
-        * and release ksm->lock via blk_ksm_reprogram_all_keys().
-        */
-       if (ksm->dev)
-               pm_runtime_get_sync(ksm->dev);
-       down_write(&ksm->lock);
-}
-
-static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm)
-{
-       up_write(&ksm->lock);
-       if (ksm->dev)
-               pm_runtime_put_sync(ksm->dev);
-}
-
-static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm)
-{
-       return ksm->num_slots == 0;
-}
-
-/**
- * blk_ksm_init() - Initialize a keyslot manager
- * @ksm: The keyslot_manager to initialize.
- * @num_slots: The number of key slots to manage.
- *
- * Allocate memory for keyslots and initialize a keyslot manager. Called by
- * e.g. storage drivers to set up a keyslot manager in their request_queue.
- *
- * Return: 0 on success, or else a negative error code.
- */
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots)
-{
-       unsigned int slot;
-       unsigned int i;
-       unsigned int slot_hashtable_size;
-
-       memset(ksm, 0, sizeof(*ksm));
-
-       if (num_slots == 0)
-               return -EINVAL;
-
-       ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL);
-       if (!ksm->slots)
-               return -ENOMEM;
-
-       ksm->num_slots = num_slots;
-
-       init_rwsem(&ksm->lock);
-
-       init_waitqueue_head(&ksm->idle_slots_wait_queue);
-       INIT_LIST_HEAD(&ksm->idle_slots);
-
-       for (slot = 0; slot < num_slots; slot++) {
-               ksm->slots[slot].ksm = ksm;
-               list_add_tail(&ksm->slots[slot].idle_slot_node,
-                             &ksm->idle_slots);
-       }
-
-       spin_lock_init(&ksm->idle_slots_lock);
-
-       slot_hashtable_size = roundup_pow_of_two(num_slots);
-       /*
-        * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2
-        * buckets.  This only makes a difference when there is only 1 keyslot.
-        */
-       if (slot_hashtable_size < 2)
-               slot_hashtable_size = 2;
-
-       ksm->log_slot_ht_size = ilog2(slot_hashtable_size);
-       ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size,
-                                            sizeof(ksm->slot_hashtable[0]),
-                                            GFP_KERNEL);
-       if (!ksm->slot_hashtable)
-               goto err_destroy_ksm;
-       for (i = 0; i < slot_hashtable_size; i++)
-               INIT_HLIST_HEAD(&ksm->slot_hashtable[i]);
-
-       return 0;
-
-err_destroy_ksm:
-       blk_ksm_destroy(ksm);
-       return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_init);
-
-static void blk_ksm_destroy_callback(void *ksm)
-{
-       blk_ksm_destroy(ksm);
-}
-
-/**
- * devm_blk_ksm_init() - Resource-managed blk_ksm_init()
- * @dev: The device which owns the blk_keyslot_manager.
- * @ksm: The blk_keyslot_manager to initialize.
- * @num_slots: The number of key slots to manage.
- *
- * Like blk_ksm_init(), but causes blk_ksm_destroy() to be called automatically
- * on driver detach.
- *
- * Return: 0 on success, or else a negative error code.
- */
-int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
-                     unsigned int num_slots)
-{
-       int err = blk_ksm_init(ksm, num_slots);
-
-       if (err)
-               return err;
-
-       return devm_add_action_or_reset(dev, blk_ksm_destroy_callback, ksm);
-}
-EXPORT_SYMBOL_GPL(devm_blk_ksm_init);
-
-static inline struct hlist_head *
-blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm,
-                           const struct blk_crypto_key *key)
-{
-       return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)];
-}
-
-static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot)
-{
-       struct blk_keyslot_manager *ksm = slot->ksm;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ksm->idle_slots_lock, flags);
-       list_del(&slot->idle_slot_node);
-       spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
-}
-
-static struct blk_ksm_keyslot *blk_ksm_find_keyslot(
-                                       struct blk_keyslot_manager *ksm,
-                                       const struct blk_crypto_key *key)
-{
-       const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key);
-       struct blk_ksm_keyslot *slotp;
-
-       hlist_for_each_entry(slotp, head, hash_node) {
-               if (slotp->key == key)
-                       return slotp;
-       }
-       return NULL;
-}
-
-static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot(
-                                       struct blk_keyslot_manager *ksm,
-                                       const struct blk_crypto_key *key)
-{
-       struct blk_ksm_keyslot *slot;
-
-       slot = blk_ksm_find_keyslot(ksm, key);
-       if (!slot)
-               return NULL;
-       if (atomic_inc_return(&slot->slot_refs) == 1) {
-               /* Took first reference to this slot; remove it from LRU list */
-               blk_ksm_remove_slot_from_lru_list(slot);
-       }
-       return slot;
-}
-
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot)
-{
-       return slot - slot->ksm->slots;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx);
-
-/**
- * blk_ksm_get_slot_for_key() - Program a key into a keyslot.
- * @ksm: The keyslot manager to program the key into.
- * @key: Pointer to the key object to program, including the raw key, crypto
- *      mode, and data unit size.
- * @slot_ptr: A pointer to return the pointer of the allocated keyslot.
- *
- * Get a keyslot that's been programmed with the specified key.  If one already
- * exists, return it with incremented refcount.  Otherwise, wait for a keyslot
- * to become idle and program it.
- *
- * Context: Process context. Takes and releases ksm->lock.
- * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the
- *        allocated keyslot), or some other blk_status_t otherwise (and
- *        keyslot is set to NULL).
- */
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
-                                     const struct blk_crypto_key *key,
-                                     struct blk_ksm_keyslot **slot_ptr)
-{
-       struct blk_ksm_keyslot *slot;
-       int slot_idx;
-       int err;
-
-       *slot_ptr = NULL;
-
-       if (blk_ksm_is_passthrough(ksm))
-               return BLK_STS_OK;
-
-       down_read(&ksm->lock);
-       slot = blk_ksm_find_and_grab_keyslot(ksm, key);
-       up_read(&ksm->lock);
-       if (slot)
-               goto success;
-
-       for (;;) {
-               blk_ksm_hw_enter(ksm);
-               slot = blk_ksm_find_and_grab_keyslot(ksm, key);
-               if (slot) {
-                       blk_ksm_hw_exit(ksm);
-                       goto success;
-               }
-
-               /*
-                * If we're here, that means there wasn't a slot that was
-                * already programmed with the key. So try to program it.
-                */
-               if (!list_empty(&ksm->idle_slots))
-                       break;
-
-               blk_ksm_hw_exit(ksm);
-               wait_event(ksm->idle_slots_wait_queue,
-                          !list_empty(&ksm->idle_slots));
-       }
-
-       slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot,
-                               idle_slot_node);
-       slot_idx = blk_ksm_get_slot_idx(slot);
-
-       err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx);
-       if (err) {
-               wake_up(&ksm->idle_slots_wait_queue);
-               blk_ksm_hw_exit(ksm);
-               return errno_to_blk_status(err);
-       }
-
-       /* Move this slot to the hash list for the new key. */
-       if (slot->key)
-               hlist_del(&slot->hash_node);
-       slot->key = key;
-       hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key));
-
-       atomic_set(&slot->slot_refs, 1);
-
-       blk_ksm_remove_slot_from_lru_list(slot);
-
-       blk_ksm_hw_exit(ksm);
-success:
-       *slot_ptr = slot;
-       return BLK_STS_OK;
-}
-
-/**
- * blk_ksm_put_slot() - Release a reference to a slot
- * @slot: The keyslot to release the reference of.
- *
- * Context: Any context.
- */
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot)
-{
-       struct blk_keyslot_manager *ksm;
-       unsigned long flags;
-
-       if (!slot)
-               return;
-
-       ksm = slot->ksm;
-
-       if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
-                                       &ksm->idle_slots_lock, flags)) {
-               list_add_tail(&slot->idle_slot_node, &ksm->idle_slots);
-               spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
-               wake_up(&ksm->idle_slots_wait_queue);
-       }
-}
-
-/**
- * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is
- *                                 supported by a ksm.
- * @ksm: The keyslot manager to check
- * @cfg: The crypto configuration to check for.
- *
- * Checks for crypto_mode/data unit size/dun bytes support.
- *
- * Return: Whether or not this ksm supports the specified crypto config.
- */
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
-                                 const struct blk_crypto_config *cfg)
-{
-       if (!ksm)
-               return false;
-       if (!(ksm->crypto_modes_supported[cfg->crypto_mode] &
-             cfg->data_unit_size))
-               return false;
-       if (ksm->max_dun_bytes_supported < cfg->dun_bytes)
-               return false;
-       return true;
-}
-
-/**
- * blk_ksm_evict_key() - Evict a key from the lower layer device.
- * @ksm: The keyslot manager to evict from
- * @key: The key to evict
- *
- * Find the keyslot that the specified key was programmed into, and evict that
- * slot from the lower layer device. The slot must not be in use by any
- * in-flight IO when this function is called.
- *
- * Context: Process context. Takes and releases ksm->lock.
- * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY
- *        if the keyslot is still in use, or another -errno value on other
- *        error.
- */
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
-                     const struct blk_crypto_key *key)
-{
-       struct blk_ksm_keyslot *slot;
-       int err = 0;
-
-       if (blk_ksm_is_passthrough(ksm)) {
-               if (ksm->ksm_ll_ops.keyslot_evict) {
-                       blk_ksm_hw_enter(ksm);
-                       err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1);
-                       blk_ksm_hw_exit(ksm);
-                       return err;
-               }
-               return 0;
-       }
-
-       blk_ksm_hw_enter(ksm);
-       slot = blk_ksm_find_keyslot(ksm, key);
-       if (!slot)
-               goto out_unlock;
-
-       if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
-               err = -EBUSY;
-               goto out_unlock;
-       }
-       err = ksm->ksm_ll_ops.keyslot_evict(ksm, key,
-                                           blk_ksm_get_slot_idx(slot));
-       if (err)
-               goto out_unlock;
-
-       hlist_del(&slot->hash_node);
-       slot->key = NULL;
-       err = 0;
-out_unlock:
-       blk_ksm_hw_exit(ksm);
-       return err;
-}
-
-/**
- * blk_ksm_reprogram_all_keys() - Re-program all keyslots.
- * @ksm: The keyslot manager
- *
- * Re-program all keyslots that are supposed to have a key programmed.  This is
- * intended only for use by drivers for hardware that loses its keys on reset.
- *
- * Context: Process context. Takes and releases ksm->lock.
- */
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm)
-{
-       unsigned int slot;
-
-       if (blk_ksm_is_passthrough(ksm))
-               return;
-
-       /* This is for device initialization, so don't resume the device */
-       down_write(&ksm->lock);
-       for (slot = 0; slot < ksm->num_slots; slot++) {
-               const struct blk_crypto_key *key = ksm->slots[slot].key;
-               int err;
-
-               if (!key)
-                       continue;
-
-               err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot);
-               WARN_ON(err);
-       }
-       up_write(&ksm->lock);
-}
-EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys);
-
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm)
-{
-       if (!ksm)
-               return;
-       kvfree(ksm->slot_hashtable);
-       kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots);
-       memzero_explicit(ksm, sizeof(*ksm));
-}
-EXPORT_SYMBOL_GPL(blk_ksm_destroy);
-
-bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q)
-{
-       if (blk_integrity_queue_supports_integrity(q)) {
-               pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
-               return false;
-       }
-       q->ksm = ksm;
-       return true;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_register);
-
-void blk_ksm_unregister(struct request_queue *q)
-{
-       q->ksm = NULL;
-}
-
-/**
- * blk_ksm_intersect_modes() - restrict supported modes by child device
- * @parent: The keyslot manager for parent device
- * @child: The keyslot manager for child device, or NULL
- *
- * Clear any crypto mode support bits in @parent that aren't set in @child.
- * If @child is NULL, then all parent bits are cleared.
- *
- * Only use this when setting up the keyslot manager for a layered device,
- * before it's been exposed yet.
- */
-void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
-                            const struct blk_keyslot_manager *child)
-{
-       if (child) {
-               unsigned int i;
-
-               parent->max_dun_bytes_supported =
-                       min(parent->max_dun_bytes_supported,
-                           child->max_dun_bytes_supported);
-               for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported);
-                    i++) {
-                       parent->crypto_modes_supported[i] &=
-                               child->crypto_modes_supported[i];
-               }
-       } else {
-               parent->max_dun_bytes_supported = 0;
-               memset(parent->crypto_modes_supported, 0,
-                      sizeof(parent->crypto_modes_supported));
-       }
-}
-EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes);
-
-/**
- * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes
- *                        and DUN bytes that another KSM supports. Here,
- *                        "superset" refers to the mathematical meaning of the
- *                        word - i.e. if two KSMs have the *same* capabilities,
- *                        they *are* considered supersets of each other.
- * @ksm_superset: The KSM that we want to verify is a superset
- * @ksm_subset: The KSM that we want to verify is a subset
- *
- * Return: True if @ksm_superset supports a superset of the crypto modes and DUN
- *        bytes that @ksm_subset supports.
- */
-bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
-                        struct blk_keyslot_manager *ksm_subset)
-{
-       int i;
-
-       if (!ksm_subset)
-               return true;
-
-       if (!ksm_superset)
-               return false;
-
-       for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) {
-               if (ksm_subset->crypto_modes_supported[i] &
-                   (~ksm_superset->crypto_modes_supported[i])) {
-                       return false;
-               }
-       }
-
-       if (ksm_subset->max_dun_bytes_supported >
-           ksm_superset->max_dun_bytes_supported) {
-               return false;
-       }
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_is_superset);
-
-/**
- * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of
- *                                another KSM
- * @target_ksm: The KSM whose restrictions to update.
- * @reference_ksm: The KSM to whose restrictions this function will update
- *                @target_ksm's restrictions to.
- *
- * Blk-crypto requires that crypto capabilities that were
- * advertised when a bio was created continue to be supported by the
- * device until that bio is ended. This is turn means that a device cannot
- * shrink its advertised crypto capabilities without any explicit
- * synchronization with upper layers. So if there's no such explicit
- * synchronization, @reference_ksm must support all the crypto capabilities that
- * @target_ksm does
- * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true).
- *
- * Note also that as long as the crypto capabilities are being expanded, the
- * order of updates becoming visible is not important because it's alright
- * for blk-crypto to see stale values - they only cause blk-crypto to
- * believe that a crypto capability isn't supported when it actually is (which
- * might result in blk-crypto-fallback being used if available, or the bio being
- * failed).
- */
-void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
-                                struct blk_keyslot_manager *reference_ksm)
-{
-       memcpy(target_ksm->crypto_modes_supported,
-              reference_ksm->crypto_modes_supported,
-              sizeof(target_ksm->crypto_modes_supported));
-
-       target_ksm->max_dun_bytes_supported =
-                               reference_ksm->max_dun_bytes_supported;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities);
-
-/**
- * blk_ksm_init_passthrough() - Init a passthrough keyslot manager
- * @ksm: The keyslot manager to init
- *
- * Initialize a passthrough keyslot manager.
- * Called by e.g. storage drivers to set up a keyslot manager in their
- * request_queue, when the storage driver wants to manage its keys by itself.
- * This is useful for inline encryption hardware that doesn't have the concept
- * of keyslots, and for layered devices.
- */
-void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm)
-{
-       memset(ksm, 0, sizeof(*ksm));
-       init_rwsem(&ksm->lock);
-}
-EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough);
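
The capability checks removed above (blk_ksm_crypto_cfg_supported(), blk_ksm_is_superset()) reduce to plain bitmask arithmetic over the supported-modes table plus a DUN-byte comparison. A minimal user-space sketch of the same superset test, with illustrative names rather than the kernel API:

    #include <stdbool.h>
    #include <stdint.h>

    #define NUM_MODES 4   /* stand-in for the number of crypto modes */

    /* True if every (mode, data-unit-size) bit set in sub is also set in sup
     * and sup accepts at least as many DUN bytes. Equal capabilities count
     * as a superset, matching the blk_ksm_is_superset() comment above. */
    static bool caps_superset(const uint32_t sup_modes[NUM_MODES],
                              unsigned int sup_dun_bytes,
                              const uint32_t sub_modes[NUM_MODES],
                              unsigned int sub_dun_bytes)
    {
            for (int i = 0; i < NUM_MODES; i++)
                    if (sub_modes[i] & ~sup_modes[i])
                            return false;
            return sub_dun_bytes <= sup_dun_bytes;
    }
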
index a0ffbab..fdd74a4 100644 (file)
@@ -9,12 +9,12 @@
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
-#include <linux/elevator.h>
 #include <linux/module.h>
 #include <linux/sbitmap.h>
 
 #include <trace/events/block.h>
 
+#include "elevator.h"
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
@@ -453,11 +453,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
 {
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct blk_mq_tags *tags = hctx->sched_tags;
-       unsigned int shift = tags->bitmap_tags->sb.shift;
+       unsigned int shift = tags->bitmap_tags.sb.shift;
 
        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
-       sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth);
+       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
 }
 
 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
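
For reference, the async depth recomputed in kyber_depth_updated() above is just a percentage of the full tag depth implied by the sbitmap shift; a stand-alone sketch of the arithmetic (the percentage constant lives elsewhere in kyber-iosched.c):

    /* 2^shift tags exist in total; only a fraction is left for async I/O so
     * synchronous requests cannot be starved of tags. */
    static unsigned int async_depth(unsigned int sb_shift, unsigned int percent)
    {
            return (1U << sb_shift) * percent / 100U;
    }
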
index 7f3c393..85d919b 100644 (file)
@@ -9,7 +9,6 @@
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
-#include <linux/elevator.h>
 #include <linux/bio.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -20,6 +19,7 @@
 
 #include <trace/events/block.h>
 
+#include "elevator.h"
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
  */
 static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
 static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+/*
+ * Time after which to dispatch lower priority requests even if higher
+ * priority requests are pending.
+ */
+static const int prio_aging_expire = 10 * HZ;
 static const int writes_starved = 2;    /* max times reads can starve a write */
 static const int fifo_batch = 16;       /* # of sequential requests treated as one
                                     by the above parameters. For throughput. */
@@ -51,17 +56,16 @@ enum dd_prio {
 
 enum { DD_PRIO_COUNT = 3 };
 
-/* I/O statistics per I/O priority. */
+/*
+ * I/O statistics per I/O priority. It is fine if these counters overflow.
+ * What matters is that these counters are at least as wide as
+ * log2(max_outstanding_requests).
+ */
 struct io_stats_per_prio {
-       local_t inserted;
-       local_t merged;
-       local_t dispatched;
-       local_t completed;
-};
-
-/* I/O statistics for all I/O priorities (enum dd_prio). */
-struct io_stats {
-       struct io_stats_per_prio stats[DD_PRIO_COUNT];
+       uint32_t inserted;
+       uint32_t merged;
+       uint32_t dispatched;
+       atomic_t completed;
 };
 
 /*
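
The new comment relies on modular arithmetic: even after either counter wraps, the unsigned difference still equals the number of requests in flight, as long as that number fits in 32 bits. A tiny stand-alone illustration:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t inserted  = 2;            /* has wrapped past UINT32_MAX */
            uint32_t completed = 0xfffffffdu;  /* has not wrapped yet */

            /* 5 requests were inserted but not completed; the subtraction is
             * exact despite the wrap because it is done modulo 2^32. */
            assert(inserted - completed == 5);
            return 0;
    }
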
@@ -74,6 +78,7 @@ struct dd_per_prio {
        struct list_head fifo_list[DD_DIR_COUNT];
        /* Next request in FIFO order. Read, write or both are NULL. */
        struct request *next_rq[DD_DIR_COUNT];
+       struct io_stats_per_prio stats;
 };
 
 struct deadline_data {
@@ -88,8 +93,6 @@ struct deadline_data {
        unsigned int batching;          /* number of sequential requests made */
        unsigned int starved;           /* times reads have starved writes */
 
-       struct io_stats __percpu *stats;
-
        /*
         * settings that change how the i/o scheduler behaves
         */
@@ -98,38 +101,12 @@ struct deadline_data {
        int writes_starved;
        int front_merges;
        u32 async_depth;
+       int prio_aging_expire;
 
        spinlock_t lock;
        spinlock_t zone_lock;
 };
 
-/* Count one event of type 'event_type' and with I/O priority 'prio' */
-#define dd_count(dd, event_type, prio) do {                            \
-       struct io_stats *io_stats = get_cpu_ptr((dd)->stats);           \
-                                                                       \
-       BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));       \
-       BUILD_BUG_ON(!__same_type((prio), enum dd_prio));               \
-       local_inc(&io_stats->stats[(prio)].event_type);                 \
-       put_cpu_ptr(io_stats);                                          \
-} while (0)
-
-/*
- * Returns the total number of dd_count(dd, event_type, prio) calls across all
- * CPUs. No locking or barriers since it is fine if the returned sum is slightly
- * outdated.
- */
-#define dd_sum(dd, event_type, prio) ({                                        \
-       unsigned int cpu;                                               \
-       u32 sum = 0;                                                    \
-                                                                       \
-       BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));       \
-       BUILD_BUG_ON(!__same_type((prio), enum dd_prio));               \
-       for_each_present_cpu(cpu)                                       \
-               sum += local_read(&per_cpu_ptr((dd)->stats, cpu)->      \
-                                 stats[(prio)].event_type);            \
-       sum;                                                            \
-})
-
 /* Maps an I/O priority class to a deadline scheduler priority. */
 static const enum dd_prio ioprio_class_to_prio[] = {
        [IOPRIO_CLASS_NONE]     = DD_BE_PRIO,
@@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
        const u8 ioprio_class = dd_rq_ioclass(next);
        const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 
-       dd_count(dd, merged, prio);
+       lockdep_assert_held(&dd->lock);
+
+       dd->per_prio[prio].stats.merged++;
 
        /*
         * if next expires before rq, assign its expire time to rq
@@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
        deadline_remove_request(rq->q, per_prio, rq);
 }
 
+/* Number of requests queued for a given priority level. */
+static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+{
+       const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+       lockdep_assert_held(&dd->lock);
+
+       return stats->inserted - atomic_read(&stats->completed);
+}
+
 /*
  * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
  * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
@@ -356,11 +345,26 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 }
 
 /*
+ * Returns true if and only if @rq started after @latest_start where
+ * @latest_start is in jiffies.
+ */
+static bool started_after(struct deadline_data *dd, struct request *rq,
+                         unsigned long latest_start)
+{
+       unsigned long start_time = (unsigned long)rq->fifo_time;
+
+       start_time -= dd->fifo_expire[rq_data_dir(rq)];
+
+       return time_after(start_time, latest_start);
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc
+ * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
  */
 static struct request *__dd_dispatch_request(struct deadline_data *dd,
-                                            struct dd_per_prio *per_prio)
+                                            struct dd_per_prio *per_prio,
+                                            unsigned long latest_start)
 {
        struct request *rq, *next_rq;
        enum dd_data_dir data_dir;
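
started_after() recovers the insertion time by subtracting the per-direction FIFO expiry from the stored deadline, then compares it with time_after(), which stays correct across jiffies wrap-around. Stripped of the kernel's type checks, that comparison boils down to a signed difference:

    /* Simplified form of the kernel's time_after(a, b): true when a is later
     * than b, even if the unsigned tick counter wrapped in between. */
    static int ticks_after(unsigned long a, unsigned long b)
    {
            return (long)(b - a) < 0;
    }
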
@@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
        if (!list_empty(&per_prio->dispatch)) {
                rq = list_first_entry(&per_prio->dispatch, struct request,
                                      queuelist);
+               if (started_after(dd, rq, latest_start))
+                       return NULL;
                list_del_init(&rq->queuelist);
                goto done;
        }
@@ -449,6 +455,9 @@ dispatch_find_request:
        dd->batching = 0;
 
 dispatch_request:
+       if (started_after(dd, rq, latest_start))
+               return NULL;
+
        /*
         * rq is the selected appropriate request.
         */
@@ -457,7 +466,7 @@ dispatch_request:
 done:
        ioprio_class = dd_rq_ioclass(rq);
        prio = ioprio_class_to_prio[ioprio_class];
-       dd_count(dd, dispatched, prio);
+       dd->per_prio[prio].stats.dispatched++;
        /*
         * If the request needs its target zone locked, do it.
         */
@@ -467,6 +476,34 @@ done:
 }
 
 /*
+ * Check whether there are any requests with priority other than DD_RT_PRIO
+ * that were inserted more than prio_aging_expire jiffies ago.
+ */
+static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
+                                                     unsigned long now)
+{
+       struct request *rq;
+       enum dd_prio prio;
+       int prio_cnt;
+
+       lockdep_assert_held(&dd->lock);
+
+       prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
+                  !!dd_queued(dd, DD_IDLE_PRIO);
+       if (prio_cnt < 2)
+               return NULL;
+
+       for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
+               rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
+                                          now - dd->prio_aging_expire);
+               if (rq)
+                       return rq;
+       }
+
+       return NULL;
+}
+
+/*
  * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
  *
  * One confusing aspect here is that we get called for a specific
@@ -477,15 +514,26 @@ done:
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+       const unsigned long now = jiffies;
        struct request *rq;
        enum dd_prio prio;
 
        spin_lock(&dd->lock);
+       rq = dd_dispatch_prio_aged_requests(dd, now);
+       if (rq)
+               goto unlock;
+
+       /*
+        * Next, dispatch requests in priority order. Ignore lower priority
+        * requests if any higher priority requests are pending.
+        */
        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
-               rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
-               if (rq)
+               rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
+               if (rq || dd_queued(dd, prio))
                        break;
        }
+
+unlock:
        spin_unlock(&dd->lock);
 
        return rq;
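
Taken together, the hunks above give dispatch two phases: first serve any non-realtime request that has aged past prio_aging_expire, otherwise walk the priority levels from high to low and stop at the first level that either yields a request or still has requests queued, so lower levels are ignored while a higher one is busy. A compilable toy model of that policy, with made-up names throughout:

    #include <stdio.h>

    enum { RT, BE, IDLE, NPRIO };

    struct toy {
            int queued[NPRIO];             /* requests pending per priority */
            unsigned long oldest_non_rt;   /* insertion time of oldest BE/IDLE request */
            unsigned long aging_expire;    /* plays the role of prio_aging_expire */
    };

    static int pick_priority(const struct toy *t, unsigned long now)
    {
            int busy = 0, prio;

            for (prio = RT; prio < NPRIO; prio++)
                    busy += !!t->queued[prio];

            /* Aging only kicks in when more than one priority level is active. */
            if (busy >= 2 && (long)(now - t->oldest_non_rt) > (long)t->aging_expire)
                    return BE;

            for (prio = RT; prio < NPRIO; prio++)
                    if (t->queued[prio])
                            return prio;   /* higher levels always win otherwise */
            return -1;
    }

    int main(void)
    {
            struct toy t = { .queued = { 3, 1, 0 }, .oldest_non_rt = 100,
                             .aging_expire = 50 };

            printf("t=120 -> prio %d\n", pick_priority(&t, 120));  /* 0: RT wins */
            printf("t=200 -> prio %d\n", pick_priority(&t, 200));  /* 1: BE aged */
            return 0;
    }
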
@@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
 
        dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
 
-       sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth);
+       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
 }
 
 /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e)
 
        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                struct dd_per_prio *per_prio = &dd->per_prio[prio];
+               const struct io_stats_per_prio *stats = &per_prio->stats;
+               uint32_t queued;
 
                WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
                WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
-       }
 
-       free_percpu(dd->stats);
+               spin_lock(&dd->lock);
+               queued = dd_queued(dd, prio);
+               spin_unlock(&dd->lock);
+
+               WARN_ONCE(queued != 0,
+                         "statistics for priority %d: i %u m %u d %u c %u\n",
+                         prio, stats->inserted, stats->merged,
+                         stats->dispatched, atomic_read(&stats->completed));
+       }
 
        kfree(dd);
 }
@@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 
        eq->elevator_data = dd;
 
-       dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
-                                    GFP_KERNEL | __GFP_ZERO);
-       if (!dd->stats)
-               goto free_dd;
-
        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
@@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
        dd->front_merges = 1;
        dd->last_dir = DD_WRITE;
        dd->fifo_batch = fifo_batch;
+       dd->prio_aging_expire = prio_aging_expire;
        spin_lock_init(&dd->lock);
        spin_lock_init(&dd->zone_lock);
 
        q->elevator = eq;
        return 0;
 
-free_dd:
-       kfree(dd);
-
 put_eq:
        kobject_put(&eq->kobj);
        return ret;
@@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
        blk_req_zone_write_unlock(rq);
 
        prio = ioprio_class_to_prio[ioprio_class];
-       dd_count(dd, inserted, prio);
-       rq->elv.priv[0] = (void *)(uintptr_t)1;
+       per_prio = &dd->per_prio[prio];
+       if (!rq->elv.priv[0]) {
+               per_prio->stats.inserted++;
+               rq->elv.priv[0] = (void *)(uintptr_t)1;
+       }
 
        if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
                blk_mq_free_requests(&free);
@@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
        trace_block_rq_insert(rq);
 
-       per_prio = &dd->per_prio[prio];
        if (at_head) {
                list_add(&rq->queuelist, &per_prio->dispatch);
        } else {
@@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq)
 
        /*
         * The block layer core may call dd_finish_request() without having
-        * called dd_insert_requests(). Hence only update statistics for
-        * requests for which dd_insert_requests() has been called. See also
-        * blk_mq_request_bypass_insert().
+        * called dd_insert_requests(). Skip requests that bypassed I/O
+        * scheduling. See also blk_mq_request_bypass_insert().
         */
-       if (rq->elv.priv[0])
-               dd_count(dd, completed, prio);
+       if (!rq->elv.priv[0])
+               return;
+
+       atomic_inc(&per_prio->stats.completed);
 
        if (blk_queue_is_zoned(q)) {
                unsigned long flags;
@@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page)         \
 #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
 SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
 SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
+SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
 SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
 SHOW_INT(deadline_front_merges_show, dd->front_merges);
 SHOW_INT(deadline_async_depth_show, dd->front_merges);
@@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
        STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
 STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
 STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
+STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
 STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
 STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
 STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX);
@@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = {
        DD_ATTR(front_merges),
        DD_ATTR(async_depth),
        DD_ATTR(fifo_batch),
+       DD_ATTR(prio_aging_expire),
        __ATTR_NULL
 };
 
@@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m)
        return 0;
 }
 
-/* Number of requests queued for a given priority level. */
-static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
-{
-       return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
-}
-
 static int dd_queued_show(void *data, struct seq_file *m)
 {
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
+       u32 rt, be, idle;
+
+       spin_lock(&dd->lock);
+       rt = dd_queued(dd, DD_RT_PRIO);
+       be = dd_queued(dd, DD_BE_PRIO);
+       idle = dd_queued(dd, DD_IDLE_PRIO);
+       spin_unlock(&dd->lock);
+
+       seq_printf(m, "%u %u %u\n", rt, be, idle);
 
-       seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
-                  dd_queued(dd, DD_BE_PRIO),
-                  dd_queued(dd, DD_IDLE_PRIO));
        return 0;
 }
 
 /* Number of requests owned by the block driver for a given priority. */
 static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
 {
-       return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
-               - dd_sum(dd, completed, prio);
+       const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+       lockdep_assert_held(&dd->lock);
+
+       return stats->dispatched + stats->merged -
+               atomic_read(&stats->completed);
 }
 
 static int dd_owned_by_driver_show(void *data, struct seq_file *m)
 {
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
+       u32 rt, be, idle;
+
+       spin_lock(&dd->lock);
+       rt = dd_owned_by_driver(dd, DD_RT_PRIO);
+       be = dd_owned_by_driver(dd, DD_BE_PRIO);
+       idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
+       spin_unlock(&dd->lock);
+
+       seq_printf(m, "%u %u %u\n", rt, be, idle);
 
-       seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
-                  dd_owned_by_driver(dd, DD_BE_PRIO),
-                  dd_owned_by_driver(dd, DD_IDLE_PRIO));
        return 0;
 }
 
index 278593b..7aff4eb 100644 (file)
@@ -2,6 +2,8 @@
 #
 # Partition configuration
 #
+menu "Partition Types"
+
 config PARTITION_ADVANCED
        bool "Advanced partition selection"
        help
@@ -267,3 +269,5 @@ config CMDLINE_PARTITION
        help
          Say Y here if you want to read the partition table from bootargs.
          The format for the command line is just like mtdparts.
+
+endmenu
index 7bea19d..334b72e 100644 (file)
@@ -5,6 +5,7 @@
  * Copyright (C) 2020 Christoph Hellwig
  */
 #include <linux/fs.h>
+#include <linux/major.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
@@ -90,6 +91,7 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
 {
        spin_lock(&bdev->bd_size_lock);
        i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+       bdev->bd_nr_sectors = sectors;
        spin_unlock(&bdev->bd_size_lock);
 }
 
@@ -203,7 +205,7 @@ static ssize_t part_alignment_offset_show(struct device *dev,
        struct block_device *bdev = dev_to_bdev(dev);
 
        return sprintf(buf, "%u\n",
-               queue_limit_alignment_offset(&bdev->bd_disk->queue->limits,
+               queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits,
                                bdev->bd_start_sect));
 }
 
@@ -213,7 +215,7 @@ static ssize_t part_discard_alignment_show(struct device *dev,
        struct block_device *bdev = dev_to_bdev(dev);
 
        return sprintf(buf, "%u\n",
-               queue_limit_discard_alignment(&bdev->bd_disk->queue->limits,
+               queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits,
                                bdev->bd_start_sect));
 }
 
index 7ca5c4c..5e9be13 100644 (file)
@@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len)
  */
 static u64 last_lba(struct gendisk *disk)
 {
-       return div_u64(disk->part0->bd_inode->i_size,
+       return div_u64(bdev_nr_bytes(disk->part0),
                       queue_logical_block_size(disk->queue)) - 1ULL;
 }
 
index 9bca396..403756d 100644 (file)
@@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
                                char name[],
                                union label_t *label,
                                sector_t labelsect,
-                               loff_t i_size,
+                               sector_t nr_sectors,
                                dasd_information2_t *info)
 {
        loff_t offset, geo_size, size;
@@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
        } else {
                /*
                 * Formated w/o large volume support. If the sanity check
-                * 'size based on geo == size based on i_size' is true, then
+                * 'size based on geo == size based on nr_sectors' is true, then
                 * we can safely assume that we know the formatted size of
                 * the disk, otherwise we need additional information
                 * that we can only get from a real DASD device.
                 */
                geo_size = geo->cylinders * geo->heads
                        * geo->sectors * secperblk;
-               size = i_size >> 9;
+               size = nr_sectors;
                if (size != geo_size) {
                        if (!info) {
                                strlcat(state->pp_buf, "\n", PAGE_SIZE);
@@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
                        if (!strcmp(info->type, "ECKD"))
                                if (geo_size < size)
                                        size = geo_size;
-                       /* else keep size based on i_size */
+                       /* else keep size based on nr_sectors */
                }
        }
        /* first and only partition starts in the first block after the label */
@@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state)
        struct gendisk *disk = state->disk;
        struct block_device *bdev = disk->part0;
        int blocksize, res;
-       loff_t i_size, offset, size;
+       loff_t offset, size;
+       sector_t nr_sectors;
        dasd_information2_t *info;
        struct hd_geometry *geo;
        char type[5] = {0,};
@@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state)
        blocksize = bdev_logical_block_size(bdev);
        if (blocksize <= 0)
                goto out_symbol;
-       i_size = i_size_read(bdev->bd_inode);
-       if (i_size == 0)
+       nr_sectors = bdev_nr_sectors(bdev);
+       if (nr_sectors == 0)
                goto out_symbol;
        info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
        if (info == NULL)
@@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state)
                                                   label);
                } else if (!strncmp(type, "LNX1", 4)) {
                        res = find_lnx1_partitions(state, geo, blocksize, name,
-                                                  label, labelsect, i_size,
+                                                  label, labelsect, nr_sectors,
                                                   info);
                } else if (!strncmp(type, "CMS1", 4)) {
                        res = find_cms1_partitions(state, geo, blocksize, name,
@@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state)
                res = 1;
                if (info->format == DASD_FORMAT_LDL) {
                        strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
-                       size = i_size >> 9;
+                       size = nr_sectors;
                        offset = (info->label_block + 1) * (blocksize >> 9);
                        put_partition(state, 1, offset, size-offset);
                        strlcat(state->pp_buf, "\n", PAGE_SIZE);
index 00c203b..25a52a2 100644 (file)
@@ -5,7 +5,7 @@
  */
 
 #include <linux/t10-pi.h>
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/crc-t10dif.h>
 #include <linux/module.h>
 #include <net/checksum.h>
index 8bd288d..3dd5a77 100644 (file)
@@ -1076,7 +1076,7 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err)
        af_alg_free_resources(areq);
        sock_put(sk);
 
-       iocb->ki_complete(iocb, err ? err : (int)resultlen, 0);
+       iocb->ki_complete(iocb, err ? err : (int)resultlen);
 }
 EXPORT_SYMBOL_GPL(af_alg_async_cb);
 
index eed6531..75f1a6c 100644 (file)
@@ -2459,18 +2459,70 @@ static void ata_dev_config_devslp(struct ata_device *dev)
        }
 }
 
+static void ata_dev_config_cpr(struct ata_device *dev)
+{
+       unsigned int err_mask;
+       size_t buf_len;
+       int i, nr_cpr = 0;
+       struct ata_cpr_log *cpr_log = NULL;
+       u8 *desc, *buf = NULL;
+
+       if (!ata_identify_page_supported(dev,
+                                ATA_LOG_CONCURRENT_POSITIONING_RANGES))
+               goto out;
+
+       /*
+        * Read IDENTIFY DEVICE data log, page 0x47
+        * (concurrent positioning ranges). We can have at most 255 32B range
+        * descriptors plus a 64B header.
+        */
+       buf_len = (64 + 255 * 32 + 511) & ~511;
+       buf = kzalloc(buf_len, GFP_KERNEL);
+       if (!buf)
+               goto out;
+
+       err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE,
+                                    ATA_LOG_CONCURRENT_POSITIONING_RANGES,
+                                    buf, buf_len >> 9);
+       if (err_mask)
+               goto out;
+
+       nr_cpr = buf[0];
+       if (!nr_cpr)
+               goto out;
+
+       cpr_log = kzalloc(struct_size(cpr_log, cpr, nr_cpr), GFP_KERNEL);
+       if (!cpr_log)
+               goto out;
+
+       cpr_log->nr_cpr = nr_cpr;
+       desc = &buf[64];
+       for (i = 0; i < nr_cpr; i++, desc += 32) {
+               cpr_log->cpr[i].num = desc[0];
+               cpr_log->cpr[i].num_storage_elements = desc[1];
+               cpr_log->cpr[i].start_lba = get_unaligned_le64(&desc[8]);
+               cpr_log->cpr[i].num_lbas = get_unaligned_le64(&desc[16]);
+       }
+
+out:
+       swap(dev->cpr_log, cpr_log);
+       kfree(cpr_log);
+       kfree(buf);
+}
+
 static void ata_dev_print_features(struct ata_device *dev)
 {
        if (!(dev->flags & ATA_DFLAG_FEATURES_MASK))
                return;
 
        ata_dev_info(dev,
-                    "Features:%s%s%s%s%s\n",
+                    "Features:%s%s%s%s%s%s\n",
                     dev->flags & ATA_DFLAG_TRUSTED ? " Trust" : "",
                     dev->flags & ATA_DFLAG_DA ? " Dev-Attention" : "",
                     dev->flags & ATA_DFLAG_DEVSLP ? " Dev-Sleep" : "",
                     dev->flags & ATA_DFLAG_NCQ_SEND_RECV ? " NCQ-sndrcv" : "",
-                    dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : "");
+                    dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : "",
+                    dev->cpr_log ? " CPR" : "");
 }
 
 /**
@@ -2634,6 +2686,7 @@ int ata_dev_configure(struct ata_device *dev)
                ata_dev_config_sense_reporting(dev);
                ata_dev_config_zac(dev);
                ata_dev_config_trusted(dev);
+               ata_dev_config_cpr(dev);
                dev->cdb_len = 32;
 
                if (ata_msg_drv(ap) && print_info)
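
Each concurrent positioning range descriptor read by ata_dev_config_cpr() above is 32 bytes, following a 64-byte log header, with little-endian 64-bit LBA fields at offsets 8 and 16. A stand-alone parser for one descriptor (the struct and helper names here are illustrative, not the kernel's):

    #include <stdint.h>

    struct cpr_range {
            uint8_t  num;
            uint8_t  num_storage_elements;
            uint64_t start_lba;
            uint64_t num_lbas;
    };

    /* Portable little-endian decode, standing in for get_unaligned_le64(). */
    static uint64_t rd_le64(const uint8_t *p)
    {
            uint64_t v = 0;
            for (int i = 7; i >= 0; i--)
                    v = (v << 8) | p[i];
            return v;
    }

    static void parse_cpr_desc(const uint8_t desc[32], struct cpr_range *r)
    {
            r->num = desc[0];
            r->num_storage_elements = desc[1];
            r->start_lba = rd_le64(&desc[8]);
            r->num_lbas = rd_le64(&desc[16]);
    }
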
index 1fb4611..15a279f 100644 (file)
@@ -1895,7 +1895,7 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
  */
 static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
 {
-       int num_pages;
+       int i, num_pages = 0;
        static const u8 pages[] = {
                0x00,   /* page 0x00, this page */
                0x80,   /* page 0x80, unit serial no page */
@@ -1905,13 +1905,17 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
                0xb1,   /* page 0xb1, block device characteristics page */
                0xb2,   /* page 0xb2, thin provisioning page */
                0xb6,   /* page 0xb6, zoned block device characteristics */
+               0xb9,   /* page 0xb9, concurrent positioning ranges */
        };
 
-       num_pages = sizeof(pages);
-       if (!(args->dev->flags & ATA_DFLAG_ZAC))
-               num_pages--;
+       for (i = 0; i < sizeof(pages); i++) {
+               if (pages[i] == 0xb6 &&
+                   !(args->dev->flags & ATA_DFLAG_ZAC))
+                       continue;
+               rbuf[num_pages + 4] = pages[i];
+               num_pages++;
+       }
        rbuf[3] = num_pages;    /* number of supported VPD pages */
-       memcpy(rbuf + 4, pages, num_pages);
        return 0;
 }
 
@@ -2121,6 +2125,26 @@ static unsigned int ata_scsiop_inq_b6(struct ata_scsi_args *args, u8 *rbuf)
        return 0;
 }
 
+static unsigned int ata_scsiop_inq_b9(struct ata_scsi_args *args, u8 *rbuf)
+{
+       struct ata_cpr_log *cpr_log = args->dev->cpr_log;
+       u8 *desc = &rbuf[64];
+       int i;
+
+       /* SCSI Concurrent Positioning Ranges VPD page: SBC-5 rev 1 or later */
+       rbuf[1] = 0xb9;
+       put_unaligned_be16(64 + (int)cpr_log->nr_cpr * 32 - 4, &rbuf[3]);
+
+       for (i = 0; i < cpr_log->nr_cpr; i++, desc += 32) {
+               desc[0] = cpr_log->cpr[i].num;
+               desc[1] = cpr_log->cpr[i].num_storage_elements;
+               put_unaligned_be64(cpr_log->cpr[i].start_lba, &desc[8]);
+               put_unaligned_be64(cpr_log->cpr[i].num_lbas, &desc[16]);
+       }
+
+       return 0;
+}
+
 /**
  *     modecpy - Prepare response for MODE SENSE
  *     @dest: output buffer
@@ -4120,11 +4144,17 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2);
                        break;
                case 0xb6:
-                       if (dev->flags & ATA_DFLAG_ZAC) {
+                       if (dev->flags & ATA_DFLAG_ZAC)
                                ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b6);
-                               break;
-                       }
-                       fallthrough;
+                       else
+                               ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
+                       break;
+               case 0xb9:
+                       if (dev->cpr_log)
+                               ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b9);
+                       else
+                               ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
+                       break;
                default:
                        ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
                        break;
index cfa29dc..fabf870 100644 (file)
@@ -281,14 +281,14 @@ static int regcache_rbtree_insert_to_block(struct regmap *map,
        if (!blk)
                return -ENOMEM;
 
+       rbnode->block = blk;
+
        if (BITS_TO_LONGS(blklen) > BITS_TO_LONGS(rbnode->blklen)) {
                present = krealloc(rbnode->cache_present,
                                   BITS_TO_LONGS(blklen) * sizeof(*present),
                                   GFP_KERNEL);
-               if (!present) {
-                       kfree(blk);
+               if (!present)
                        return -ENOMEM;
-               }
 
                memset(present + BITS_TO_LONGS(rbnode->blklen), 0,
                       (BITS_TO_LONGS(blklen) - BITS_TO_LONGS(rbnode->blklen))
@@ -305,7 +305,6 @@ static int regcache_rbtree_insert_to_block(struct regmap *map,
        }
 
        /* update the rbnode block, its size and the base register */
-       rbnode->block = blk;
        rbnode->blklen = blklen;
        rbnode->base_reg = base_reg;
        rbnode->cache_present = present;
index ab3e37a..d97eaf6 100644 (file)
@@ -180,14 +180,6 @@ config BLK_DEV_LOOP
          bits of, say, a sound file). This is also safe if the file resides
          on a remote file server.
 
-         There are several ways of encrypting disks. Some of these require
-         kernel patches. The vanilla kernel offers the cryptoloop option
-         and a Device Mapper target (which is superior, as it supports all
-         file systems). If you want to use the cryptoloop, say Y to both
-         LOOP and CRYPTOLOOP, and make sure you have a recent (version 2.12
-         or later) version of util-linux. Additionally, be aware that
-         the cryptoloop is not safe for storing journaled filesystems.
-
          Note that this loop device has nothing to do with the loopback
          device used for network connections from the machine to itself.
 
@@ -211,21 +203,6 @@ config BLK_DEV_LOOP_MIN_COUNT
          is used, it can be set to 0, since needed loop devices can be
          dynamically allocated with the /dev/loop-control interface.
 
-config BLK_DEV_CRYPTOLOOP
-       tristate "Cryptoloop Support (DEPRECATED)"
-       select CRYPTO
-       select CRYPTO_CBC
-       depends on BLK_DEV_LOOP
-       help
-         Say Y here if you want to be able to use the ciphers that are 
-         provided by the CryptoAPI as loop transformation. This might be
-         used as hard disk encryption.
-
-         WARNING: This device is not safe for journaled file systems like
-         ext3 or Reiserfs. Please use the Device Mapper crypto module
-         instead, which can be configured to be on-disk compatible with the
-         cryptoloop device.  cryptoloop support will be removed in Linux 5.16.
-
 source "drivers/block/drbd/Kconfig"
 
 config BLK_DEV_NBD
@@ -304,8 +281,8 @@ config BLK_DEV_RAM_SIZE
 config CDROM_PKTCDVD
        tristate "Packet writing on CD/DVD media (DEPRECATED)"
        depends on !UML
+       depends on SCSI
        select CDROM
-       select SCSI_COMMON
        help
          Note: This driver is deprecated and will be removed from the
          kernel in the near future!
index bc68817..11a74f1 100644 (file)
@@ -24,7 +24,6 @@ obj-$(CONFIG_CDROM_PKTCDVD)   += pktcdvd.o
 obj-$(CONFIG_SUNVDC)           += sunvdc.o
 
 obj-$(CONFIG_BLK_DEV_NBD)      += nbd.o
-obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
 obj-$(CONFIG_VIRTIO_BLK)       += virtio_blk.o
 
 obj-$(CONFIG_BLK_DEV_SX8)      += sx8.o
index 8b17140..bf5c124 100644 (file)
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/blk-mq.h>
-#include <linux/elevator.h>
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
 
@@ -1780,6 +1780,7 @@ static const struct blk_mq_ops amiflop_mq_ops = {
 static int fd_alloc_disk(int drive, int system)
 {
        struct gendisk *disk;
+       int err;
 
        disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL);
        if (IS_ERR(disk))
@@ -1798,8 +1799,10 @@ static int fd_alloc_disk(int drive, int system)
        set_capacity(disk, 880 * 2);
 
        unit[drive].gendisk[system] = disk;
-       add_disk(disk);
-       return 0;
+       err = add_disk(disk);
+       if (err)
+               blk_cleanup_disk(disk);
+       return err;
 }
 
 static int fd_alloc_drive(int drive)
index 06b360f..52484bc 100644 (file)
@@ -37,8 +37,7 @@ static ssize_t aoedisk_show_state(struct device *dev,
        struct gendisk *disk = dev_to_disk(dev);
        struct aoedev *d = disk->private_data;
 
-       return snprintf(page, PAGE_SIZE,
-                       "%s%s\n",
+       return sysfs_emit(page, "%s%s\n",
                        (d->flags & DEVFL_UP) ? "up" : "down",
                        (d->flags & DEVFL_KICKME) ? ",kickme" :
                        (d->nopen && !(d->flags & DEVFL_UP)) ? ",closewait" : "");
@@ -52,8 +51,8 @@ static ssize_t aoedisk_show_mac(struct device *dev,
        struct aoetgt *t = d->targets[0];
 
        if (t == NULL)
-               return snprintf(page, PAGE_SIZE, "none\n");
-       return snprintf(page, PAGE_SIZE, "%pm\n", t->addr);
+               return sysfs_emit(page, "none\n");
+       return sysfs_emit(page, "%pm\n", t->addr);
 }
 static ssize_t aoedisk_show_netif(struct device *dev,
                                  struct device_attribute *attr, char *page)
@@ -85,7 +84,7 @@ static ssize_t aoedisk_show_netif(struct device *dev,
        ne = nd;
        nd = nds;
        if (*nd == NULL)
-               return snprintf(page, PAGE_SIZE, "none\n");
+               return sysfs_emit(page, "none\n");
        for (p = page; nd < ne; nd++)
                p += scnprintf(p, PAGE_SIZE - (p-page), "%s%s",
                        p == page ? "" : ",", (*nd)->name);
@@ -99,7 +98,7 @@ static ssize_t aoedisk_show_fwver(struct device *dev,
        struct gendisk *disk = dev_to_disk(dev);
        struct aoedev *d = disk->private_data;
 
-       return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver);
+       return sysfs_emit(page, "0x%04x\n", (unsigned int) d->fw_ver);
 }
 static ssize_t aoedisk_show_payload(struct device *dev,
                                    struct device_attribute *attr, char *page)
@@ -107,7 +106,7 @@ static ssize_t aoedisk_show_payload(struct device *dev,
        struct gendisk *disk = dev_to_disk(dev);
        struct aoedev *d = disk->private_data;
 
-       return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt);
+       return sysfs_emit(page, "%lu\n", d->maxbcnt);
 }
 
 static int aoedisk_debugfs_show(struct seq_file *s, void *ignored)
@@ -417,7 +416,9 @@ aoeblk_gdalloc(void *vp)
 
        spin_unlock_irqrestore(&d->lock, flags);
 
-       device_add_disk(NULL, gd, aoe_attr_groups);
+       err = device_add_disk(NULL, gd, aoe_attr_groups);
+       if (err)
+               goto out_disk_cleanup;
        aoedisk_add_debugfs(d);
 
        spin_lock_irqsave(&d->lock, flags);
@@ -426,6 +427,8 @@ aoeblk_gdalloc(void *vp)
        spin_unlock_irqrestore(&d->lock, flags);
        return;
 
+out_disk_cleanup:
+       blk_cleanup_disk(gd);
 err_tagset:
        blk_mq_free_tag_set(set);
 err_mempool:
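
The snprintf()-to-sysfs_emit() conversions above follow the general sysfs rule that show() callbacks receive a full PAGE_SIZE buffer; sysfs_emit() encodes that assumption itself, so callers no longer carry PAGE_SIZE bookkeeping. A minimal sketch of the resulting callback shape, assuming a made-up driver structure:

    #include <linux/device.h>
    #include <linux/sysfs.h>

    struct mydev { const char *state; };   /* illustrative driver state only */

    static ssize_t state_show(struct device *dev, struct device_attribute *attr,
                              char *page)
    {
            struct mydev *d = dev_get_drvdata(dev);

            return sysfs_emit(page, "%s\n", d->state);
    }
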
index a093644..d14bdc3 100644 (file)
@@ -68,6 +68,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/blk-mq.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/completion.h>
 #include <linux/wait.h>
@@ -298,6 +299,7 @@ static struct atari_floppy_struct {
                                   disk change detection) */
        int flags;              /* flags */
        struct gendisk *disk[NUM_DISK_MINORS];
+       bool registered[NUM_DISK_MINORS];
        int ref;
        int type;
        struct blk_mq_tag_set tag_set;
@@ -456,10 +458,20 @@ static DEFINE_TIMER(fd_timer, check_change);
        
 static void fd_end_request_cur(blk_status_t err)
 {
+       DPRINT(("fd_end_request_cur(), bytes %d of %d\n",
+               blk_rq_cur_bytes(fd_request),
+               blk_rq_bytes(fd_request)));
+
        if (!blk_update_request(fd_request, err,
                                blk_rq_cur_bytes(fd_request))) {
+               DPRINT(("calling __blk_mq_end_request()\n"));
                __blk_mq_end_request(fd_request, err);
                fd_request = NULL;
+       } else {
+               /* requeue rest of request */
+               DPRINT(("calling blk_mq_requeue_request()\n"));
+               blk_mq_requeue_request(fd_request, true);
+               fd_request = NULL;
        }
 }
 
@@ -653,9 +665,6 @@ static inline void copy_buffer(void *from, void *to)
                *p2++ = *p1++;
 }
 
-  
-  
-
 /* General Interrupt Handling */
 
 static void (*FloppyIRQHandler)( int status ) = NULL;
@@ -700,12 +709,21 @@ static void fd_error( void )
        if (fd_request->error_count >= MAX_ERRORS) {
                printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
                fd_end_request_cur(BLK_STS_IOERR);
+               finish_fdc();
+               return;
        }
        else if (fd_request->error_count == RECALIBRATE_ERRORS) {
                printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
                if (SelectedDrive != -1)
                        SUD.track = -1;
        }
+       /* need to re-run request to recalibrate */
+       atari_disable_irq( IRQ_MFP_FDC );
+
+       setup_req_params( SelectedDrive );
+       do_fd_action( SelectedDrive );
+
+       atari_enable_irq( IRQ_MFP_FDC );
 }
 
 
@@ -732,8 +750,10 @@ static int do_format(int drive, int type, struct atari_format_descr *desc)
        if (type) {
                type--;
                if (type >= NUM_DISK_MINORS ||
-                   minor2disktype[type].drive_types > DriveType)
+                   minor2disktype[type].drive_types > DriveType) {
+                       finish_fdc();
                        return -EINVAL;
+               }
        }
 
        q = unit[drive].disk[type]->queue;
@@ -751,6 +771,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc)
        }
 
        if (!UDT || desc->track >= UDT->blocks/UDT->spt/2 || desc->head >= 2) {
+               finish_fdc();
                ret = -EINVAL;
                goto out;
        }
@@ -791,6 +812,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc)
 
        wait_for_completion(&format_wait);
 
+       finish_fdc();
        ret = FormatError ? -EIO : 0;
 out:
        blk_mq_unquiesce_queue(q);
@@ -825,6 +847,7 @@ static void do_fd_action( int drive )
                    else {
                        /* all sectors finished */
                        fd_end_request_cur(BLK_STS_OK);
+                       finish_fdc();
                        return;
                    }
                }
@@ -1229,6 +1252,7 @@ static void fd_rwsec_done1(int status)
        else {
                /* all sectors finished */
                fd_end_request_cur(BLK_STS_OK);
+               finish_fdc();
        }
        return;
   
@@ -1350,7 +1374,7 @@ static void fd_times_out(struct timer_list *unused)
 
 static void finish_fdc( void )
 {
-       if (!NeedSeek) {
+       if (!NeedSeek || !stdma_is_locked_by(floppy_irq)) {
                finish_fdc_done( 0 );
        }
        else {
@@ -1385,7 +1409,8 @@ static void finish_fdc_done( int dummy )
        start_motor_off_timer();
 
        local_irq_save(flags);
-       stdma_release();
+       if (stdma_is_locked_by(floppy_irq))
+               stdma_release();
        local_irq_restore(flags);
 
        DPRINT(("finish_fdc() finished\n"));
@@ -1435,8 +1460,7 @@ static int floppy_revalidate(struct gendisk *disk)
        unsigned int drive = p - unit;
 
        if (test_bit(drive, &changed_floppies) ||
-           test_bit(drive, &fake_change) ||
-           p->disktype == 0) {
+           test_bit(drive, &fake_change) || !p->disktype) {
                if (UD.flags & FTD_MSG)
                        printk(KERN_ERR "floppy: clear format %p!\n", UDT);
                BufferDrive = -1;
@@ -1475,15 +1499,6 @@ static void setup_req_params( int drive )
                        ReqTrack, ReqSector, (unsigned long)ReqData ));
 }
 
-static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx)
-{
-       spin_lock_irq(&ataflop_lock);
-       atari_disable_irq(IRQ_MFP_FDC);
-       finish_fdc();
-       atari_enable_irq(IRQ_MFP_FDC);
-       spin_unlock_irq(&ataflop_lock);
-}
-
 static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
                                     const struct blk_mq_queue_data *bd)
 {
@@ -1491,6 +1506,10 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
        int drive = floppy - unit;
        int type = floppy->type;
 
+       DPRINT(("Queue request: drive %d type %d sectors %d of %d last %d\n",
+               drive, type, blk_rq_cur_sectors(bd->rq),
+               blk_rq_sectors(bd->rq), bd->last));
+
        spin_lock_irq(&ataflop_lock);
        if (fd_request) {
                spin_unlock_irq(&ataflop_lock);
@@ -1511,6 +1530,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
                /* drive not connected */
                printk(KERN_ERR "Unknown Device: fd%d\n", drive );
                fd_end_request_cur(BLK_STS_IOERR);
+               stdma_release();
                goto out;
        }
                
@@ -1527,11 +1547,13 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
                if (--type >= NUM_DISK_MINORS) {
                        printk(KERN_WARNING "fd%d: invalid disk format", drive );
                        fd_end_request_cur(BLK_STS_IOERR);
+                       stdma_release();
                        goto out;
                }
                if (minor2disktype[type].drive_types > DriveType)  {
                        printk(KERN_WARNING "fd%d: unsupported disk format", drive );
                        fd_end_request_cur(BLK_STS_IOERR);
+                       stdma_release();
                        goto out;
                }
                type = minor2disktype[type].index;
@@ -1550,8 +1572,6 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
        setup_req_params( drive );
        do_fd_action( drive );
 
-       if (bd->last)
-               finish_fdc();
        atari_enable_irq( IRQ_MFP_FDC );
 
 out:
@@ -1634,6 +1654,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
                /* what if type > 0 here? Overwrite specified entry ? */
                if (type) {
                        /* refuse to re-set a predefined type for now */
+                       finish_fdc();
                        return -EINVAL;
                }
 
@@ -1701,8 +1722,10 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 
                /* sanity check */
                if (setprm.track != dtp->blocks/dtp->spt/2 ||
-                   setprm.head != 2)
+                   setprm.head != 2) {
+                       finish_fdc();
                        return -EINVAL;
+               }
 
                UDT = dtp;
                set_capacity(disk, UDT->blocks);
@@ -1962,7 +1985,6 @@ static const struct block_device_operations floppy_fops = {
 
 static const struct blk_mq_ops ataflop_mq_ops = {
        .queue_rq = ataflop_queue_rq,
-       .commit_rqs = ataflop_commit_rqs,
 };
 
 static int ataflop_alloc_disk(unsigned int drive, unsigned int type)
@@ -2000,12 +2022,28 @@ static void ataflop_probe(dev_t dev)
                return;
        mutex_lock(&ataflop_probe_lock);
        if (!unit[drive].disk[type]) {
-               if (ataflop_alloc_disk(drive, type) == 0)
+               if (ataflop_alloc_disk(drive, type) == 0) {
                        add_disk(unit[drive].disk[type]);
+                       unit[drive].registered[type] = true;
+               }
        }
        mutex_unlock(&ataflop_probe_lock);
 }
 
+static void atari_cleanup_floppy_disk(struct atari_floppy_struct *fs)
+{
+       int type;
+
+       for (type = 0; type < NUM_DISK_MINORS; type++) {
+               if (!fs->disk[type])
+                       continue;
+               if (fs->registered[type])
+                       del_gendisk(fs->disk[type]);
+               blk_cleanup_disk(fs->disk[type]);
+       }
+       blk_mq_free_tag_set(&fs->tag_set);
+}
+
 static int __init atari_floppy_init (void)
 {
        int i;
@@ -2064,7 +2102,10 @@ static int __init atari_floppy_init (void)
        for (i = 0; i < FD_MAX_UNITS; i++) {
                unit[i].track = -1;
                unit[i].flags = 0;
-               add_disk(unit[i].disk[0]);
+               ret = add_disk(unit[i].disk[0]);
+               if (ret)
+                       goto err_out_dma;
+               unit[i].registered[0] = true;
        }
 
        printk(KERN_INFO "Atari floppy driver: max. %cD, %strack buffering\n",
@@ -2074,12 +2115,11 @@ static int __init atari_floppy_init (void)
 
        return 0;
 
+err_out_dma:
+       atari_stram_free(DMABuffer);
 err:
-       while (--i >= 0) {
-               blk_cleanup_queue(unit[i].disk[0]->queue);
-               put_disk(unit[i].disk[0]);
-               blk_mq_free_tag_set(&unit[i].tag_set);
-       }
+       while (--i >= 0)
+               atari_cleanup_floppy_disk(&unit[i]);
 
        unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_unlock:
@@ -2128,18 +2168,10 @@ __setup("floppy=", atari_floppy_setup);
 
 static void __exit atari_floppy_exit(void)
 {
-       int i, type;
+       int i;
 
-       for (i = 0; i < FD_MAX_UNITS; i++) {
-               for (type = 0; type < NUM_DISK_MINORS; type++) {
-                       if (!unit[i].disk[type])
-                               continue;
-                       del_gendisk(unit[i].disk[type]);
-                       blk_cleanup_queue(unit[i].disk[type]->queue);
-                       put_disk(unit[i].disk[type]);
-               }
-               blk_mq_free_tag_set(&unit[i].tag_set);
-       }
+       for (i = 0; i < FD_MAX_UNITS; i++)
+               atari_cleanup_floppy_disk(&unit[i]);
        unregister_blkdev(FLOPPY_MAJOR, "fd");
 
        del_timer_sync(&fd_timer);
index 530b312..aa04727 100644 (file)
@@ -282,7 +282,7 @@ out:
        return err;
 }
 
-static blk_qc_t brd_submit_bio(struct bio *bio)
+static void brd_submit_bio(struct bio *bio)
 {
        struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
        sector_t sector = bio->bi_iter.bi_sector;
@@ -299,16 +299,14 @@ static blk_qc_t brd_submit_bio(struct bio *bio)
 
                err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
                                  bio_op(bio), sector);
-               if (err)
-                       goto io_error;
+               if (err) {
+                       bio_io_error(bio);
+                       return;
+               }
                sector += len >> SECTOR_SHIFT;
        }
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
-io_error:
-       bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int brd_rw_page(struct block_device *bdev, sector_t sector,
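
The brd change above reflects a block-layer interface change in this cycle: the ->submit_bio() method of struct block_device_operations now returns void, and success or failure is reported solely through bio_endio() or bio_io_error(). A sketch of the resulting callback shape for a simple bio-based driver; the mydev names and the mydev_handle_bvec() helper are placeholders invented here:

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    static void mydev_submit_bio(struct bio *bio)
    {
            struct mydev *d = bio->bi_bdev->bd_disk->private_data;
            struct bio_vec bvec;
            struct bvec_iter iter;

            bio_for_each_segment(bvec, bio, iter) {
                    /* mydev_handle_bvec() stands in for the per-segment work. */
                    if (mydev_handle_bvec(d, &bvec, bio_op(bio), iter.bi_sector)) {
                            bio_io_error(bio);   /* no status value to return anymore */
                            return;
                    }
            }
            bio_endio(bio);                      /* success is signalled the same way */
    }
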
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
deleted file mode 100644 (file)
index f0a91fa..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
-   Linux loop encryption enabling module
-
-   Copyright (C)  2002 Herbert Valerio Riedel <hvr@gnu.org>
-   Copyright (C)  2003 Fruhwirth Clemens <clemens@endorphin.org>
-
- */
-
-#include <linux/module.h>
-
-#include <crypto/skcipher.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/blkdev.h>
-#include <linux/scatterlist.h>
-#include <linux/uaccess.h>
-#include "loop.h"
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("loop blockdevice transferfunction adaptor / CryptoAPI");
-MODULE_AUTHOR("Herbert Valerio Riedel <hvr@gnu.org>");
-
-#define LOOP_IV_SECTOR_BITS 9
-#define LOOP_IV_SECTOR_SIZE (1 << LOOP_IV_SECTOR_BITS)
-
-static int
-cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info)
-{
-       int err = -EINVAL;
-       int cipher_len;
-       int mode_len;
-       char cms[LO_NAME_SIZE];                 /* cipher-mode string */
-       char *mode;
-       char *cmsp = cms;                       /* c-m string pointer */
-       struct crypto_sync_skcipher *tfm;
-
-       /* encryption breaks for non sector aligned offsets */
-
-       if (info->lo_offset % LOOP_IV_SECTOR_SIZE)
-               goto out;
-
-       strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE);
-       cms[LO_NAME_SIZE - 1] = 0;
-
-       cipher_len = strcspn(cmsp, "-");
-
-       mode = cmsp + cipher_len;
-       mode_len = 0;
-       if (*mode) {
-               mode++;
-               mode_len = strcspn(mode, "-");
-       }
-
-       if (!mode_len) {
-               mode = "cbc";
-               mode_len = 3;
-       }
-
-       if (cipher_len + mode_len + 3 > LO_NAME_SIZE)
-               return -EINVAL;
-
-       memmove(cms, mode, mode_len);
-       cmsp = cms + mode_len;
-       *cmsp++ = '(';
-       memcpy(cmsp, info->lo_crypt_name, cipher_len);
-       cmsp += cipher_len;
-       *cmsp++ = ')';
-       *cmsp = 0;
-
-       tfm = crypto_alloc_sync_skcipher(cms, 0, 0);
-       if (IS_ERR(tfm))
-               return PTR_ERR(tfm);
-
-       err = crypto_sync_skcipher_setkey(tfm, info->lo_encrypt_key,
-                                         info->lo_encrypt_key_size);
-
-       if (err != 0)
-               goto out_free_tfm;
-
-       lo->key_data = tfm;
-       return 0;
-
- out_free_tfm:
-       crypto_free_sync_skcipher(tfm);
-
- out:
-       return err;
-}
-
-
-typedef int (*encdec_cbc_t)(struct skcipher_request *req);
-
-static int
-cryptoloop_transfer(struct loop_device *lo, int cmd,
-                   struct page *raw_page, unsigned raw_off,
-                   struct page *loop_page, unsigned loop_off,
-                   int size, sector_t IV)
-{
-       struct crypto_sync_skcipher *tfm = lo->key_data;
-       SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-       struct scatterlist sg_out;
-       struct scatterlist sg_in;
-
-       encdec_cbc_t encdecfunc;
-       struct page *in_page, *out_page;
-       unsigned in_offs, out_offs;
-       int err;
-
-       skcipher_request_set_sync_tfm(req, tfm);
-       skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
-                                     NULL, NULL);
-
-       sg_init_table(&sg_out, 1);
-       sg_init_table(&sg_in, 1);
-
-       if (cmd == READ) {
-               in_page = raw_page;
-               in_offs = raw_off;
-               out_page = loop_page;
-               out_offs = loop_off;
-               encdecfunc = crypto_skcipher_decrypt;
-       } else {
-               in_page = loop_page;
-               in_offs = loop_off;
-               out_page = raw_page;
-               out_offs = raw_off;
-               encdecfunc = crypto_skcipher_encrypt;
-       }
-
-       while (size > 0) {
-               const int sz = min(size, LOOP_IV_SECTOR_SIZE);
-               u32 iv[4] = { 0, };
-               iv[0] = cpu_to_le32(IV & 0xffffffff);
-
-               sg_set_page(&sg_in, in_page, sz, in_offs);
-               sg_set_page(&sg_out, out_page, sz, out_offs);
-
-               skcipher_request_set_crypt(req, &sg_in, &sg_out, sz, iv);
-               err = encdecfunc(req);
-               if (err)
-                       goto out;
-
-               IV++;
-               size -= sz;
-               in_offs += sz;
-               out_offs += sz;
-       }
-
-       err = 0;
-
-out:
-       skcipher_request_zero(req);
-       return err;
-}
-
-static int
-cryptoloop_ioctl(struct loop_device *lo, int cmd, unsigned long arg)
-{
-       return -EINVAL;
-}
-
-static int
-cryptoloop_release(struct loop_device *lo)
-{
-       struct crypto_sync_skcipher *tfm = lo->key_data;
-       if (tfm != NULL) {
-               crypto_free_sync_skcipher(tfm);
-               lo->key_data = NULL;
-               return 0;
-       }
-       printk(KERN_ERR "cryptoloop_release(): tfm == NULL?\n");
-       return -EINVAL;
-}
-
-static struct loop_func_table cryptoloop_funcs = {
-       .number = LO_CRYPT_CRYPTOAPI,
-       .init = cryptoloop_init,
-       .ioctl = cryptoloop_ioctl,
-       .transfer = cryptoloop_transfer,
-       .release = cryptoloop_release,
-       .owner = THIS_MODULE
-};
-
-static int __init
-init_cryptoloop(void)
-{
-       int rc = loop_register_transfer(&cryptoloop_funcs);
-
-       if (rc)
-               printk(KERN_ERR "cryptoloop: loop_register_transfer failed\n");
-       else
-               pr_warn("the cryptoloop driver has been deprecated and will be removed in in Linux 5.16\n");
-       return rc;
-}
-
-static void __exit
-cleanup_cryptoloop(void)
-{
-       if (loop_unregister_transfer(LO_CRYPT_CRYPTOAPI))
-               printk(KERN_ERR
-                       "cryptoloop: loop_unregister_transfer failed\n");
-}
-
-module_init(init_cryptoloop);
-module_exit(cleanup_cryptoloop);
index 5d91813..f27d5b0 100644
@@ -1448,7 +1448,7 @@ extern void conn_free_crypto(struct drbd_connection *connection);
 /* drbd_req */
 extern void do_submit(struct work_struct *ws);
 extern void __drbd_make_request(struct drbd_device *, struct bio *);
-extern blk_qc_t drbd_submit_bio(struct bio *bio);
+void drbd_submit_bio(struct bio *bio);
 extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
 extern int is_valid_ar_handle(struct drbd_request *, sector_t);
 
@@ -1826,8 +1826,7 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
 /* Returns the number of 512 byte sectors of the device */
 static inline sector_t drbd_get_capacity(struct block_device *bdev)
 {
-       /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
-       return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;
+       return bdev ? bdev_nr_sectors(bdev) : 0;
 }
 
 /**
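
drbd_get_capacity() now calls bdev_nr_sectors(); as the deleted line indicates, that helper is expected to yield the same value as reading the backing inode size and shifting down to 512-byte sectors. A small sketch of the identity being relied on (my_capacity() is illustrative):

#include <linux/blkdev.h>
#include <linux/fs.h>

static inline sector_t my_capacity(struct block_device *bdev)
{
        /* same value the deleted open-coded expression produced */
        return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;     /* == bdev_nr_sectors(bdev) */
}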
index 55234a5..19db80a 100644
@@ -2794,7 +2794,9 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
                goto out_idr_remove_vol;
        }
 
-       add_disk(disk);
+       err = add_disk(disk);
+       if (err)
+               goto out_cleanup_disk;
 
        /* inherit the connection state */
        device->state.conn = first_connection(resource)->cstate;
@@ -2808,6 +2810,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
        drbd_debugfs_device_add(device);
        return NO_ERROR;
 
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
 out_idr_remove_vol:
        idr_remove(&connection->peer_devices, vnr);
 out_idr_remove_from_resource:
index 5ca2336..3235532 100644
@@ -1596,7 +1596,7 @@ void do_submit(struct work_struct *ws)
        }
 }
 
-blk_qc_t drbd_submit_bio(struct bio *bio)
+void drbd_submit_bio(struct bio *bio)
 {
        struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
 
@@ -1609,7 +1609,6 @@ blk_qc_t drbd_submit_bio(struct bio *bio)
 
        inc_ap_bio(device);
        __drbd_make_request(device, bio);
-       return BLK_QC_T_NONE;
 }
 
 static bool net_timeout_reached(struct drbd_request *net_req,
index fef79ea..3873e78 100644
@@ -184,6 +184,7 @@ static int print_unex = 1;
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/major.h>
 #include <linux/platform_device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/mutex.h>
@@ -4478,6 +4479,7 @@ static const struct blk_mq_ops floppy_mq_ops = {
 };
 
 static struct platform_device floppy_device[N_DRIVE];
+static bool registered[N_DRIVE];
 
 static bool floppy_available(int drive)
 {
@@ -4693,8 +4695,12 @@ static int __init do_floppy_init(void)
                if (err)
                        goto out_remove_drives;
 
-               device_add_disk(&floppy_device[drive].dev, disks[drive][0],
-                               NULL);
+               registered[drive] = true;
+
+               err = device_add_disk(&floppy_device[drive].dev,
+                                     disks[drive][0], NULL);
+               if (err)
+                       goto out_remove_drives;
        }
 
        return 0;
@@ -4703,7 +4709,8 @@ out_remove_drives:
        while (drive--) {
                if (floppy_available(drive)) {
                        del_gendisk(disks[drive][0]);
-                       platform_device_unregister(&floppy_device[drive]);
+                       if (registered[drive])
+                               platform_device_unregister(&floppy_device[drive]);
                }
        }
 out_release_dma:
@@ -4946,30 +4953,14 @@ static void __exit floppy_module_exit(void)
                                if (disks[drive][i])
                                        del_gendisk(disks[drive][i]);
                        }
-                       platform_device_unregister(&floppy_device[drive]);
+                       if (registered[drive])
+                               platform_device_unregister(&floppy_device[drive]);
                }
                for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
                        if (disks[drive][i])
-                               blk_cleanup_queue(disks[drive][i]->queue);
+                               blk_cleanup_disk(disks[drive][i]);
                }
                blk_mq_free_tag_set(&tag_sets[drive]);
-
-               /*
-                * These disks have not called add_disk().  Don't put down
-                * queue reference in put_disk().
-                */
-               if (!(allowed_drive_mask & (1 << drive)) ||
-                   fdc_state[FDC(drive)].version == FDC_NONE) {
-                       for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
-                               if (disks[drive][i])
-                                       disks[drive][i]->queue = NULL;
-                       }
-               }
-
-               for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
-                       if (disks[drive][i])
-                               put_disk(disks[drive][i]);
-               }
        }
 
        cancel_delayed_work_sync(&fd_timeout);
index 7bf4686..3c09a33 100644
@@ -133,58 +133,6 @@ static void loop_global_unlock(struct loop_device *lo, bool global)
 static int max_part;
 static int part_shift;
 
-static int transfer_xor(struct loop_device *lo, int cmd,
-                       struct page *raw_page, unsigned raw_off,
-                       struct page *loop_page, unsigned loop_off,
-                       int size, sector_t real_block)
-{
-       char *raw_buf = kmap_atomic(raw_page) + raw_off;
-       char *loop_buf = kmap_atomic(loop_page) + loop_off;
-       char *in, *out, *key;
-       int i, keysize;
-
-       if (cmd == READ) {
-               in = raw_buf;
-               out = loop_buf;
-       } else {
-               in = loop_buf;
-               out = raw_buf;
-       }
-
-       key = lo->lo_encrypt_key;
-       keysize = lo->lo_encrypt_key_size;
-       for (i = 0; i < size; i++)
-               *out++ = *in++ ^ key[(i & 511) % keysize];
-
-       kunmap_atomic(loop_buf);
-       kunmap_atomic(raw_buf);
-       cond_resched();
-       return 0;
-}
-
-static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
-{
-       if (unlikely(info->lo_encrypt_key_size <= 0))
-               return -EINVAL;
-       return 0;
-}
-
-static struct loop_func_table none_funcs = {
-       .number = LO_CRYPT_NONE,
-}; 
-
-static struct loop_func_table xor_funcs = {
-       .number = LO_CRYPT_XOR,
-       .transfer = transfer_xor,
-       .init = xor_init
-}; 
-
-/* xfer_funcs[0] is special - its release function is never called */
-static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
-       &none_funcs,
-       &xor_funcs
-};
-
 static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
 {
        loff_t loopsize;
@@ -228,8 +176,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
        /*
         * We support direct I/O only if lo_offset is aligned with the
         * logical I/O size of backing device, and the logical block
-        * size of loop is bigger than the backing device's and the loop
-        * needn't transform transfer.
+        * size of loop is bigger than the backing device's.
         *
         * TODO: the above condition may be loosed in the future, and
         * direct I/O may be switched runtime at that time because most
@@ -238,8 +185,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
        if (dio) {
                if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
                                !(lo->lo_offset & dio_align) &&
-                               mapping->a_ops->direct_IO &&
-                               !lo->transfer)
+                               mapping->a_ops->direct_IO)
                        use_dio = true;
                else
                        use_dio = false;
@@ -273,19 +219,6 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
 }
 
 /**
- * loop_validate_block_size() - validates the passed in block size
- * @bsize: size to validate
- */
-static int
-loop_validate_block_size(unsigned short bsize)
-{
-       if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
-               return -EINVAL;
-
-       return 0;
-}
-
-/**
  * loop_set_size() - sets device size and notifies userspace
  * @lo: struct loop_device to set the size for
  * @size: new size of the loop device
@@ -299,24 +232,6 @@ static void loop_set_size(struct loop_device *lo, loff_t size)
                kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
 }
 
-static inline int
-lo_do_transfer(struct loop_device *lo, int cmd,
-              struct page *rpage, unsigned roffs,
-              struct page *lpage, unsigned loffs,
-              int size, sector_t rblock)
-{
-       int ret;
-
-       ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
-       if (likely(!ret))
-               return 0;
-
-       printk_ratelimited(KERN_ERR
-               "loop: Transfer error at byte offset %llu, length %i.\n",
-               (unsigned long long)rblock << 9, size);
-       return ret;
-}
-
 static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
 {
        struct iov_iter i;
@@ -356,41 +271,6 @@ static int lo_write_simple(struct loop_device *lo, struct request *rq,
        return ret;
 }
 
-/*
- * This is the slow, transforming version that needs to double buffer the
- * data as it cannot do the transformations in place without having direct
- * access to the destination pages of the backing file.
- */
-static int lo_write_transfer(struct loop_device *lo, struct request *rq,
-               loff_t pos)
-{
-       struct bio_vec bvec, b;
-       struct req_iterator iter;
-       struct page *page;
-       int ret = 0;
-
-       page = alloc_page(GFP_NOIO);
-       if (unlikely(!page))
-               return -ENOMEM;
-
-       rq_for_each_segment(bvec, rq, iter) {
-               ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page,
-                       bvec.bv_offset, bvec.bv_len, pos >> 9);
-               if (unlikely(ret))
-                       break;
-
-               b.bv_page = page;
-               b.bv_offset = 0;
-               b.bv_len = bvec.bv_len;
-               ret = lo_write_bvec(lo->lo_backing_file, &b, &pos);
-               if (ret < 0)
-                       break;
-       }
-
-       __free_page(page);
-       return ret;
-}
-
 static int lo_read_simple(struct loop_device *lo, struct request *rq,
                loff_t pos)
 {
@@ -420,64 +300,12 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq,
        return 0;
 }
 
-static int lo_read_transfer(struct loop_device *lo, struct request *rq,
-               loff_t pos)
-{
-       struct bio_vec bvec, b;
-       struct req_iterator iter;
-       struct iov_iter i;
-       struct page *page;
-       ssize_t len;
-       int ret = 0;
-
-       page = alloc_page(GFP_NOIO);
-       if (unlikely(!page))
-               return -ENOMEM;
-
-       rq_for_each_segment(bvec, rq, iter) {
-               loff_t offset = pos;
-
-               b.bv_page = page;
-               b.bv_offset = 0;
-               b.bv_len = bvec.bv_len;
-
-               iov_iter_bvec(&i, READ, &b, 1, b.bv_len);
-               len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
-               if (len < 0) {
-                       ret = len;
-                       goto out_free_page;
-               }
-
-               ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page,
-                       bvec.bv_offset, len, offset >> 9);
-               if (ret)
-                       goto out_free_page;
-
-               flush_dcache_page(bvec.bv_page);
-
-               if (len != bvec.bv_len) {
-                       struct bio *bio;
-
-                       __rq_for_each_bio(bio, rq)
-                               zero_fill_bio(bio);
-                       break;
-               }
-       }
-
-       ret = 0;
-out_free_page:
-       __free_page(page);
-       return ret;
-}
-
 static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
                        int mode)
 {
        /*
         * We use fallocate to manipulate the space mappings used by the image
-        * a.k.a. discard/zerorange. However we do not support this if
-        * encryption is enabled, because it may give an attacker useful
-        * information.
+        * a.k.a. discard/zerorange.
         */
        struct file *file = lo->lo_backing_file;
        struct request_queue *q = lo->lo_queue;
@@ -554,7 +382,7 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
                blk_mq_complete_request(rq);
 }
 
-static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
 {
        struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
@@ -627,7 +455,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
        lo_rw_aio_do_completion(cmd);
 
        if (ret != -EIOCBQUEUED)
-               cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+               lo_rw_aio_complete(&cmd->iocb, ret);
        return 0;
 }
 
@@ -660,16 +488,12 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
        case REQ_OP_DISCARD:
                return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
        case REQ_OP_WRITE:
-               if (lo->transfer)
-                       return lo_write_transfer(lo, rq, pos);
-               else if (cmd->use_aio)
+               if (cmd->use_aio)
                        return lo_rw_aio(lo, cmd, pos, WRITE);
                else
                        return lo_write_simple(lo, rq, pos);
        case REQ_OP_READ:
-               if (lo->transfer)
-                       return lo_read_transfer(lo, rq, pos);
-               else if (cmd->use_aio)
+               if (cmd->use_aio)
                        return lo_rw_aio(lo, cmd, pos, READ);
                else
                        return lo_read_simple(lo, rq, pos);
@@ -934,7 +758,7 @@ static void loop_config_discard(struct loop_device *lo)
         * not blkdev_issue_discard(). This maintains consistent behavior with
         * file-backed loop devices: discarded regions read back as zero.
         */
-       if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) {
+       if (S_ISBLK(inode->i_mode)) {
                struct request_queue *backingq = bdev_get_queue(I_BDEV(inode));
 
                max_discard_sectors = backingq->limits.max_write_zeroes_sectors;
@@ -943,11 +767,9 @@ static void loop_config_discard(struct loop_device *lo)
 
        /*
         * We use punch hole to reclaim the free space used by the
-        * image a.k.a. discard. However we do not support discard if
-        * encryption is enabled, because it may give an attacker
-        * useful information.
+        * image a.k.a. discard.
         */
-       } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) {
+       } else if (!file->f_op->fallocate) {
                max_discard_sectors = 0;
                granularity = 0;
 
@@ -1084,43 +906,6 @@ static void loop_update_rotational(struct loop_device *lo)
                blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
 }
 
-static int
-loop_release_xfer(struct loop_device *lo)
-{
-       int err = 0;
-       struct loop_func_table *xfer = lo->lo_encryption;
-
-       if (xfer) {
-               if (xfer->release)
-                       err = xfer->release(lo);
-               lo->transfer = NULL;
-               lo->lo_encryption = NULL;
-               module_put(xfer->owner);
-       }
-       return err;
-}
-
-static int
-loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
-              const struct loop_info64 *i)
-{
-       int err = 0;
-
-       if (xfer) {
-               struct module *owner = xfer->owner;
-
-               if (!try_module_get(owner))
-                       return -EINVAL;
-               if (xfer->init)
-                       err = xfer->init(lo, i);
-               if (err)
-                       module_put(owner);
-               else
-                       lo->lo_encryption = xfer;
-       }
-       return err;
-}
-
 /**
  * loop_set_status_from_info - configure device from loop_info
  * @lo: struct loop_device to configure
@@ -1133,55 +918,27 @@ static int
 loop_set_status_from_info(struct loop_device *lo,
                          const struct loop_info64 *info)
 {
-       int err;
-       struct loop_func_table *xfer;
-       kuid_t uid = current_uid();
-
        if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
                return -EINVAL;
 
-       err = loop_release_xfer(lo);
-       if (err)
-               return err;
-
-       if (info->lo_encrypt_type) {
-               unsigned int type = info->lo_encrypt_type;
-
-               if (type >= MAX_LO_CRYPT)
-                       return -EINVAL;
-               xfer = xfer_funcs[type];
-               if (xfer == NULL)
-                       return -EINVAL;
-       } else
-               xfer = NULL;
-
-       err = loop_init_xfer(lo, xfer, info);
-       if (err)
-               return err;
+       switch (info->lo_encrypt_type) {
+       case LO_CRYPT_NONE:
+               break;
+       case LO_CRYPT_XOR:
+               pr_warn("support for the xor transformation has been removed.\n");
+               return -EINVAL;
+       case LO_CRYPT_CRYPTOAPI:
+               pr_warn("support for cryptoloop has been removed.  Use dm-crypt instead.\n");
+               return -EINVAL;
+       default:
+               return -EINVAL;
+       }
 
        lo->lo_offset = info->lo_offset;
        lo->lo_sizelimit = info->lo_sizelimit;
        memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
-       memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
        lo->lo_file_name[LO_NAME_SIZE-1] = 0;
-       lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;
-
-       if (!xfer)
-               xfer = &none_funcs;
-       lo->transfer = xfer->transfer;
-       lo->ioctl = xfer->ioctl;
-
        lo->lo_flags = info->lo_flags;
-
-       lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
-       lo->lo_init[0] = info->lo_init[0];
-       lo->lo_init[1] = info->lo_init[1];
-       if (info->lo_encrypt_key_size) {
-               memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
-                      info->lo_encrypt_key_size);
-               lo->lo_key_owner = uid;
-       }
-
        return 0;
 }
 
@@ -1236,7 +993,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
        }
 
        if (config->block_size) {
-               error = loop_validate_block_size(config->block_size);
+               error = blk_validate_block_size(config->block_size);
                if (error)
                        goto out_unlock;
        }
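
loop_validate_block_size() is replaced by the generic blk_validate_block_size(); judging by the helper deleted earlier in this file, the accepted values are powers of two between 512 bytes and PAGE_SIZE. A sketch of the check the driver now delegates to the block core:

#include <linux/log2.h>
#include <linux/mm.h>

static int my_validate_block_size(unsigned int bsize)
{
        /* reject anything that is not a power of two in [512, PAGE_SIZE] */
        if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
                return -EINVAL;
        return 0;
}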
@@ -1329,7 +1086,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 {
        struct file *filp = NULL;
        gfp_t gfp = lo->old_gfp_mask;
-       struct block_device *bdev = lo->lo_device;
        int err = 0;
        bool partscan = false;
        int lo_number;
@@ -1381,36 +1137,23 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
        lo->lo_backing_file = NULL;
        spin_unlock_irq(&lo->lo_lock);
 
-       loop_release_xfer(lo);
-       lo->transfer = NULL;
-       lo->ioctl = NULL;
        lo->lo_device = NULL;
-       lo->lo_encryption = NULL;
        lo->lo_offset = 0;
        lo->lo_sizelimit = 0;
-       lo->lo_encrypt_key_size = 0;
-       memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
-       memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
        memset(lo->lo_file_name, 0, LO_NAME_SIZE);
        blk_queue_logical_block_size(lo->lo_queue, 512);
        blk_queue_physical_block_size(lo->lo_queue, 512);
        blk_queue_io_min(lo->lo_queue, 512);
-       if (bdev) {
-               invalidate_bdev(bdev);
-               bdev->bd_inode->i_mapping->wb_err = 0;
-       }
-       set_capacity(lo->lo_disk, 0);
+       invalidate_disk(lo->lo_disk);
        loop_sysfs_exit(lo);
-       if (bdev) {
-               /* let user-space know about this change */
-               kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
-       }
+       /* let user-space know about this change */
+       kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
        mapping_set_gfp_mask(filp->f_mapping, gfp);
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        blk_mq_unfreeze_queue(lo->lo_queue);
 
-       partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
+       partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
        lo_number = lo->lo_number;
        disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
 out_unlock:
@@ -1498,7 +1241,6 @@ static int
 loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 {
        int err;
-       kuid_t uid = current_uid();
        int prev_lo_flags;
        bool partscan = false;
        bool size_changed = false;
@@ -1506,12 +1248,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
        err = mutex_lock_killable(&lo->lo_mutex);
        if (err)
                return err;
-       if (lo->lo_encrypt_key_size &&
-           !uid_eq(lo->lo_key_owner, uid) &&
-           !capable(CAP_SYS_ADMIN)) {
-               err = -EPERM;
-               goto out_unlock;
-       }
        if (lo->lo_state != Lo_bound) {
                err = -ENXIO;
                goto out_unlock;
@@ -1597,14 +1333,6 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
        info->lo_sizelimit = lo->lo_sizelimit;
        info->lo_flags = lo->lo_flags;
        memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
-       memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
-       info->lo_encrypt_type =
-               lo->lo_encryption ? lo->lo_encryption->number : 0;
-       if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
-               info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
-               memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
-                      lo->lo_encrypt_key_size);
-       }
 
        /* Drop lo_mutex while we call into the filesystem. */
        path = lo->lo_backing_file->f_path;
@@ -1630,16 +1358,8 @@ loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
        info64->lo_rdevice = info->lo_rdevice;
        info64->lo_offset = info->lo_offset;
        info64->lo_sizelimit = 0;
-       info64->lo_encrypt_type = info->lo_encrypt_type;
-       info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
        info64->lo_flags = info->lo_flags;
-       info64->lo_init[0] = info->lo_init[0];
-       info64->lo_init[1] = info->lo_init[1];
-       if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-               memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
-       else
-               memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
-       memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
+       memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
 }
 
 static int
@@ -1651,16 +1371,8 @@ loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
        info->lo_inode = info64->lo_inode;
        info->lo_rdevice = info64->lo_rdevice;
        info->lo_offset = info64->lo_offset;
-       info->lo_encrypt_type = info64->lo_encrypt_type;
-       info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
        info->lo_flags = info64->lo_flags;
-       info->lo_init[0] = info64->lo_init[0];
-       info->lo_init[1] = info64->lo_init[1];
-       if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-               memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
-       else
-               memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
-       memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
+       memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
 
        /* error in case values were truncated */
        if (info->lo_device != info64->lo_device ||
@@ -1759,7 +1471,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
        if (lo->lo_state != Lo_bound)
                return -ENXIO;
 
-       err = loop_validate_block_size(arg);
+       err = blk_validate_block_size(arg);
        if (err)
                return err;
 
@@ -1809,7 +1521,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
                err = loop_set_block_size(lo, arg);
                break;
        default:
-               err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
+               err = -EINVAL;
        }
        mutex_unlock(&lo->lo_mutex);
        return err;
@@ -1885,7 +1597,6 @@ struct compat_loop_info {
        compat_ulong_t  lo_inode;       /* ioctl r/o */
        compat_dev_t    lo_rdevice;     /* ioctl r/o */
        compat_int_t    lo_offset;
-       compat_int_t    lo_encrypt_type;
        compat_int_t    lo_encrypt_key_size;    /* ioctl w/o */
        compat_int_t    lo_flags;       /* ioctl r/o */
        char            lo_name[LO_NAME_SIZE];
@@ -1914,16 +1625,8 @@ loop_info64_from_compat(const struct compat_loop_info __user *arg,
        info64->lo_rdevice = info.lo_rdevice;
        info64->lo_offset = info.lo_offset;
        info64->lo_sizelimit = 0;
-       info64->lo_encrypt_type = info.lo_encrypt_type;
-       info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
        info64->lo_flags = info.lo_flags;
-       info64->lo_init[0] = info.lo_init[0];
-       info64->lo_init[1] = info.lo_init[1];
-       if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-               memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
-       else
-               memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
-       memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
+       memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
        return 0;
 }
 
@@ -1943,24 +1646,14 @@ loop_info64_to_compat(const struct loop_info64 *info64,
        info.lo_inode = info64->lo_inode;
        info.lo_rdevice = info64->lo_rdevice;
        info.lo_offset = info64->lo_offset;
-       info.lo_encrypt_type = info64->lo_encrypt_type;
-       info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
        info.lo_flags = info64->lo_flags;
-       info.lo_init[0] = info64->lo_init[0];
-       info.lo_init[1] = info64->lo_init[1];
-       if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-               memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
-       else
-               memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
-       memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
+       memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
 
        /* error in case values were truncated */
        if (info.lo_device != info64->lo_device ||
            info.lo_rdevice != info64->lo_rdevice ||
            info.lo_inode != info64->lo_inode ||
-           info.lo_offset != info64->lo_offset ||
-           info.lo_init[0] != info64->lo_init[0] ||
-           info.lo_init[1] != info64->lo_init[1])
+           info.lo_offset != info64->lo_offset)
                return -EOVERFLOW;
 
        if (copy_to_user(arg, &info, sizeof(info)))
@@ -2101,43 +1794,6 @@ MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
 
-int loop_register_transfer(struct loop_func_table *funcs)
-{
-       unsigned int n = funcs->number;
-
-       if (n >= MAX_LO_CRYPT || xfer_funcs[n])
-               return -EINVAL;
-       xfer_funcs[n] = funcs;
-       return 0;
-}
-
-int loop_unregister_transfer(int number)
-{
-       unsigned int n = number;
-       struct loop_func_table *xfer;
-
-       if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
-               return -EINVAL;
-       /*
-        * This function is called from only cleanup_cryptoloop().
-        * Given that each loop device that has a transfer enabled holds a
-        * reference to the module implementing it we should never get here
-        * with a transfer that is set (unless forced module unloading is
-        * requested). Thus, check module's refcount and warn if this is
-        * not a clean unloading.
-        */
-#ifdef CONFIG_MODULE_UNLOAD
-       if (xfer->owner && module_refcount(xfer->owner) != -1)
-               pr_err("Danger! Unregistering an in use transfer function.\n");
-#endif
-
-       xfer_funcs[n] = NULL;
-       return 0;
-}
-
-EXPORT_SYMBOL(loop_register_transfer);
-EXPORT_SYMBOL(loop_unregister_transfer);
-
 static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
                const struct blk_mq_queue_data *bd)
 {
@@ -2394,13 +2050,19 @@ static int loop_add(int i)
        disk->event_flags       = DISK_EVENT_FLAG_UEVENT;
        sprintf(disk->disk_name, "loop%d", i);
        /* Make this loop device reachable from pathname. */
-       add_disk(disk);
+       err = add_disk(disk);
+       if (err)
+               goto out_cleanup_disk;
+
        /* Show this loop device. */
        mutex_lock(&loop_ctl_mutex);
        lo->idr_visible = true;
        mutex_unlock(&loop_ctl_mutex);
+
        return i;
 
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
 out_cleanup_tags:
        blk_mq_free_tag_set(&lo->tag_set);
 out_free_idr:
index 04c88dd..082d4b6 100644
@@ -32,23 +32,10 @@ struct loop_device {
        loff_t          lo_offset;
        loff_t          lo_sizelimit;
        int             lo_flags;
-       int             (*transfer)(struct loop_device *, int cmd,
-                                   struct page *raw_page, unsigned raw_off,
-                                   struct page *loop_page, unsigned loop_off,
-                                   int size, sector_t real_block);
        char            lo_file_name[LO_NAME_SIZE];
-       char            lo_crypt_name[LO_NAME_SIZE];
-       char            lo_encrypt_key[LO_KEY_SIZE];
-       int             lo_encrypt_key_size;
-       struct loop_func_table *lo_encryption;
-       __u32           lo_init[2];
-       kuid_t          lo_key_owner;   /* Who set the key */
-       int             (*ioctl)(struct loop_device *, int cmd, 
-                                unsigned long arg); 
 
        struct file *   lo_backing_file;
        struct block_device *lo_device;
-       void            *key_data; 
 
        gfp_t           old_gfp_mask;
 
@@ -82,21 +69,4 @@ struct loop_cmd {
        struct cgroup_subsys_state *memcg_css;
 };
 
-/* Support for loadable transfer modules */
-struct loop_func_table {
-       int number;     /* filter type */ 
-       int (*transfer)(struct loop_device *lo, int cmd,
-                       struct page *raw_page, unsigned raw_off,
-                       struct page *loop_page, unsigned loop_off,
-                       int size, sector_t real_block);
-       int (*init)(struct loop_device *, const struct loop_info64 *); 
-       /* release is called from loop_unregister_transfer or clr_fd */
-       int (*release)(struct loop_device *); 
-       int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
-       struct module *owner;
-}; 
-
-int loop_register_transfer(struct loop_func_table *funcs);
-int loop_unregister_transfer(int number); 
-
 #endif
index 9018557..c91b901 100644
@@ -3633,7 +3633,9 @@ skip_create_disk:
        set_capacity(dd->disk, capacity);
 
        /* Enable the block device and add it to /dev */
-       device_add_disk(&dd->pdev->dev, dd->disk, mtip_disk_attr_groups);
+       rv = device_add_disk(&dd->pdev->dev, dd->disk, mtip_disk_attr_groups);
+       if (rv)
+               goto read_capacity_error;
 
        if (dd->mtip_svc_handler) {
                set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
@@ -4061,7 +4063,6 @@ block_initialize_err:
 
 msi_initialize_err:
        if (dd->isr_workq) {
-               flush_workqueue(dd->isr_workq);
                destroy_workqueue(dd->isr_workq);
                drop_cpu(dd->work[0].cpu_binding);
                drop_cpu(dd->work[1].cpu_binding);
@@ -4119,7 +4120,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
        mtip_block_remove(dd);
 
        if (dd->isr_workq) {
-               flush_workqueue(dd->isr_workq);
                destroy_workqueue(dd->isr_workq);
                drop_cpu(dd->work[0].cpu_binding);
                drop_cpu(dd->work[1].cpu_binding);
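
Both mtip32xx hunks drop the flush_workqueue() that preceded destroy_workqueue(): destroy_workqueue() already drains all pending work before tearing the workqueue down, so the teardown reduces to a sketch like:

        if (dd->isr_workq)
                destroy_workqueue(dd->isr_workq);       /* implicitly drains queued work */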
index 26798da..78282f0 100644
@@ -84,7 +84,7 @@ static bool n64cart_do_bvec(struct device *dev, struct bio_vec *bv, u32 pos)
        return true;
 }
 
-static blk_qc_t n64cart_submit_bio(struct bio *bio)
+static void n64cart_submit_bio(struct bio *bio)
 {
        struct bio_vec bvec;
        struct bvec_iter iter;
@@ -92,16 +92,14 @@ static blk_qc_t n64cart_submit_bio(struct bio *bio)
        u32 pos = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 
        bio_for_each_segment(bvec, bio, iter) {
-               if (!n64cart_do_bvec(dev, &bvec, pos))
-                       goto io_error;
+               if (!n64cart_do_bvec(dev, &bvec, pos)) {
+                       bio_io_error(bio);
+                       return;
+               }
                pos += bvec.bv_len;
        }
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
-io_error:
-       bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static const struct block_device_operations n64cart_fops = {
@@ -117,6 +115,7 @@ static const struct block_device_operations n64cart_fops = {
 static int __init n64cart_probe(struct platform_device *pdev)
 {
        struct gendisk *disk;
+       int err = -ENOMEM;
 
        if (!start || !size) {
                pr_err("start or size not specified\n");
@@ -134,7 +133,7 @@ static int __init n64cart_probe(struct platform_device *pdev)
 
        disk = blk_alloc_disk(NUMA_NO_NODE);
        if (!disk)
-               return -ENOMEM;
+               goto out;
 
        disk->first_minor = 0;
        disk->flags = GENHD_FL_NO_PART_SCAN;
@@ -149,11 +148,18 @@ static int __init n64cart_probe(struct platform_device *pdev)
        blk_queue_physical_block_size(disk->queue, 4096);
        blk_queue_logical_block_size(disk->queue, 4096);
 
-       add_disk(disk);
+       err = add_disk(disk);
+       if (err)
+               goto out_cleanup_disk;
 
        pr_info("n64cart: %u kb disk\n", size / 1024);
 
        return 0;
+
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
+out:
+       return err;
 }
 
 static struct platform_driver n64cart_driver = {
index 1183f78..b47b2a8 100644
@@ -122,15 +122,21 @@ struct nbd_device {
        struct work_struct remove_work;
 
        struct list_head list;
-       struct task_struct *task_recv;
        struct task_struct *task_setup;
 
        unsigned long flags;
+       pid_t pid; /* pid of nbd-client, if attached */
 
        char *backend;
 };
 
 #define NBD_CMD_REQUEUED       1
+/*
+ * This flag will be set if nbd_queue_rq() succeeds, and will be checked and
+ * cleared on completion. Both setting and clearing of the flag are protected
+ * by cmd->lock.
+ */
+#define NBD_CMD_INFLIGHT       2
 
 struct nbd_cmd {
        struct nbd_device *nbd;
@@ -217,7 +223,7 @@ static ssize_t pid_show(struct device *dev,
        struct gendisk *disk = dev_to_disk(dev);
        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 
-       return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
+       return sprintf(buf, "%d\n", nbd->pid);
 }
 
 static const struct device_attribute pid_attr = {
@@ -310,26 +316,19 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
        nsock->sent = 0;
 }
 
-static void nbd_size_clear(struct nbd_device *nbd)
-{
-       if (nbd->config->bytesize) {
-               set_capacity(nbd->disk, 0);
-               kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
-       }
-}
-
 static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
                loff_t blksize)
 {
        if (!blksize)
                blksize = 1u << NBD_DEF_BLKSIZE_BITS;
-       if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize))
+
+       if (blk_validate_block_size(blksize))
                return -EINVAL;
 
        nbd->config->bytesize = bytesize;
        nbd->config->blksize_bits = __ffs(blksize);
 
-       if (!nbd->task_recv)
+       if (!nbd->pid)
                return 0;
 
        if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
@@ -405,6 +404,11 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
        if (!mutex_trylock(&cmd->lock))
                return BLK_EH_RESET_TIMER;
 
+       if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+               mutex_unlock(&cmd->lock);
+               return BLK_EH_DONE;
+       }
+
        if (!refcount_inc_not_zero(&nbd->config_refs)) {
                cmd->status = BLK_STS_TIMEOUT;
                mutex_unlock(&cmd->lock);
@@ -484,7 +488,8 @@ done:
 }
 
 /*
- *  Send or receive packet.
+ *  Send or receive packet. Return a positive value on success and
+ *  a negative value on failure, and never return 0.
  */
 static int sock_xmit(struct nbd_device *nbd, int index, int send,
                     struct iov_iter *iter, int msg_flags, int *sent)
@@ -610,7 +615,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
        result = sock_xmit(nbd, index, 1, &from,
                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
        trace_nbd_header_sent(req, handle);
-       if (result <= 0) {
+       if (result < 0) {
                if (was_interrupted(result)) {
                        /* If we haven't sent anything we can just return BUSY,
                         * however if we have sent something we need to make
@@ -654,7 +659,7 @@ send_pages:
                                skip = 0;
                        }
                        result = sock_xmit(nbd, index, 1, &from, flags, &sent);
-                       if (result <= 0) {
+                       if (result < 0) {
                                if (was_interrupted(result)) {
                                        /* We've already sent the header, we
                                         * have no choice but to set pending and
@@ -688,38 +693,45 @@ out:
        return 0;
 }
 
-/* NULL returned = something went wrong, inform userspace */
-static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
+static int nbd_read_reply(struct nbd_device *nbd, int index,
+                         struct nbd_reply *reply)
 {
-       struct nbd_config *config = nbd->config;
-       int result;
-       struct nbd_reply reply;
-       struct nbd_cmd *cmd;
-       struct request *req = NULL;
-       u64 handle;
-       u16 hwq;
-       u32 tag;
-       struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
+       struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)};
        struct iov_iter to;
-       int ret = 0;
+       int result;
 
-       reply.magic = 0;
-       iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
+       reply->magic = 0;
+       iov_iter_kvec(&to, READ, &iov, 1, sizeof(*reply));
        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
-       if (result <= 0) {
-               if (!nbd_disconnected(config))
+       if (result < 0) {
+               if (!nbd_disconnected(nbd->config))
                        dev_err(disk_to_dev(nbd->disk),
                                "Receive control failed (result %d)\n", result);
-               return ERR_PTR(result);
+               return result;
        }
 
-       if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
+       if (ntohl(reply->magic) != NBD_REPLY_MAGIC) {
                dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
-                               (unsigned long)ntohl(reply.magic));
-               return ERR_PTR(-EPROTO);
+                               (unsigned long)ntohl(reply->magic));
+               return -EPROTO;
        }
 
-       memcpy(&handle, reply.handle, sizeof(handle));
+       return 0;
+}
+
+/* NULL returned = something went wrong, inform userspace */
+static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
+                                       struct nbd_reply *reply)
+{
+       int result;
+       struct nbd_cmd *cmd;
+       struct request *req = NULL;
+       u64 handle;
+       u16 hwq;
+       u32 tag;
+       int ret = 0;
+
+       memcpy(&handle, reply->handle, sizeof(handle));
        tag = nbd_handle_to_tag(handle);
        hwq = blk_mq_unique_tag_to_hwq(tag);
        if (hwq < nbd->tag_set.nr_hw_queues)
@@ -734,6 +746,16 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
        cmd = blk_mq_rq_to_pdu(req);
 
        mutex_lock(&cmd->lock);
+       if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+               dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
+                       tag, cmd->status, cmd->flags);
+               ret = -ENOENT;
+               goto out;
+       }
+       if (cmd->index != index) {
+               dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)",
+                       tag, index, cmd->index);
+       }
        if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
                dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
                        req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
@@ -752,9 +774,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
                ret = -ENOENT;
                goto out;
        }
-       if (ntohl(reply.error)) {
+       if (ntohl(reply->error)) {
                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
-                       ntohl(reply.error));
+                       ntohl(reply->error));
                cmd->status = BLK_STS_IOERR;
                goto out;
        }
@@ -763,11 +785,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
        if (rq_data_dir(req) != WRITE) {
                struct req_iterator iter;
                struct bio_vec bvec;
+               struct iov_iter to;
 
                rq_for_each_segment(bvec, req, iter) {
                        iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
                        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
-                       if (result <= 0) {
+                       if (result < 0) {
                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
                                        result);
                                /*
@@ -776,7 +799,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
                                 * and let the timeout stuff handle resubmitting
                                 * this request onto another connection.
                                 */
-                               if (nbd_disconnected(config)) {
+                               if (nbd_disconnected(nbd->config)) {
                                        cmd->status = BLK_STS_IOERR;
                                        goto out;
                                }
@@ -800,24 +823,46 @@ static void recv_work(struct work_struct *work)
                                                     work);
        struct nbd_device *nbd = args->nbd;
        struct nbd_config *config = nbd->config;
+       struct request_queue *q = nbd->disk->queue;
+       struct nbd_sock *nsock;
        struct nbd_cmd *cmd;
        struct request *rq;
 
        while (1) {
-               cmd = nbd_read_stat(nbd, args->index);
-               if (IS_ERR(cmd)) {
-                       struct nbd_sock *nsock = config->socks[args->index];
+               struct nbd_reply reply;
 
-                       mutex_lock(&nsock->tx_lock);
-                       nbd_mark_nsock_dead(nbd, nsock, 1);
-                       mutex_unlock(&nsock->tx_lock);
+               if (nbd_read_reply(nbd, args->index, &reply))
+                       break;
+
+               /*
+                * Grab .q_usage_counter so request pool won't go away, then no
+                * request use-after-free is possible during nbd_handle_reply().
+                * If the queue is frozen, there won't be any inflight requests, so
+                * we need not handle the incoming garbage message.
+                */
+               if (!percpu_ref_tryget(&q->q_usage_counter)) {
+                       dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n",
+                               __func__);
+                       break;
+               }
+
+               cmd = nbd_handle_reply(nbd, args->index, &reply);
+               if (IS_ERR(cmd)) {
+                       percpu_ref_put(&q->q_usage_counter);
                        break;
                }
 
                rq = blk_mq_rq_from_pdu(cmd);
                if (likely(!blk_should_fake_timeout(rq->q)))
                        blk_mq_complete_request(rq);
+               percpu_ref_put(&q->q_usage_counter);
        }
+
+       nsock = config->socks[args->index];
+       mutex_lock(&nsock->tx_lock);
+       nbd_mark_nsock_dead(nbd, nsock, 1);
+       mutex_unlock(&nsock->tx_lock);
+
        nbd_config_put(nbd);
        atomic_dec(&config->recv_threads);
        wake_up(&config->recv_wq);
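
The recv_work() rework pins the queue's q_usage_counter around each reply so the request pool cannot go away while nbd_handle_reply() is dereferencing a request. A condensed sketch of that guard; my_handle_one_reply() stands in for the reply processing shown above:

static void my_recv_one(struct nbd_device *nbd, struct request_queue *q, int index)
{
        /* if the queue is frozen there are no inflight requests to match */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        my_handle_one_reply(nbd, index);        /* may look up and complete a request */

        percpu_ref_put(&q->q_usage_counter);
}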
@@ -833,6 +878,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved)
                return true;
 
        mutex_lock(&cmd->lock);
+       if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+               mutex_unlock(&cmd->lock);
+               return true;
+       }
        cmd->status = BLK_STS_IOERR;
        mutex_unlock(&cmd->lock);
 
@@ -914,7 +963,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
        if (!refcount_inc_not_zero(&nbd->config_refs)) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Socks array is empty\n");
-               blk_mq_start_request(req);
                return -EINVAL;
        }
        config = nbd->config;
@@ -923,7 +971,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Attempted send on invalid socket\n");
                nbd_config_put(nbd);
-               blk_mq_start_request(req);
                return -EINVAL;
        }
        cmd->status = BLK_STS_OK;
@@ -947,7 +994,6 @@ again:
                         */
                        sock_shutdown(nbd);
                        nbd_config_put(nbd);
-                       blk_mq_start_request(req);
                        return -EIO;
                }
                goto again;
@@ -969,7 +1015,13 @@ again:
         * returns EAGAIN can be retried on a different socket.
         */
        ret = nbd_send_cmd(nbd, cmd, index);
-       if (ret == -EAGAIN) {
+       /*
+        * Access to this flag is protected by cmd->lock, so it is safe to set
+        * the flag after nbd_send_cmd() has successfully sent the request.
+        */
+       if (!ret)
+               __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+       else if (ret == -EAGAIN) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Request send failed, requeueing\n");
                nbd_mark_nsock_dead(nbd, nsock, 1);
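
Taken together, the NBD_CMD_INFLIGHT changes form a small ownership hand-off: the submission path sets the bit under cmd->lock once the request has actually been sent, and each completion-like path (reply, timeout, queue clearing) may only finish the command if it can atomically clear the bit again. A condensed sketch against the driver's own struct nbd_cmd; my_mark_inflight() and my_try_complete() are illustrative:

/* submission side, called with cmd->lock held after nbd_send_cmd() succeeded */
static void my_mark_inflight(struct nbd_cmd *cmd)
{
        __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
}

/* any completion path: reply handling, timeout, nbd_clear_req() */
static bool my_try_complete(struct nbd_cmd *cmd, blk_status_t status)
{
        bool owned;

        mutex_lock(&cmd->lock);
        owned = __test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
        if (owned)
                cmd->status = status;   /* only the winner touches the command */
        mutex_unlock(&cmd->lock);
        return owned;
}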
@@ -1206,7 +1258,7 @@ static void send_disconnects(struct nbd_device *nbd)
                iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
                mutex_lock(&nsock->tx_lock);
                ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
-               if (ret <= 0)
+               if (ret < 0)
                        dev_err(disk_to_dev(nbd->disk),
                                "Send disconnect failed %d\n", ret);
                mutex_unlock(&nsock->tx_lock);
@@ -1237,11 +1289,13 @@ static void nbd_config_put(struct nbd_device *nbd)
                                        &nbd->config_lock)) {
                struct nbd_config *config = nbd->config;
                nbd_dev_dbg_close(nbd);
-               nbd_size_clear(nbd);
+               invalidate_disk(nbd->disk);
+               if (nbd->config->bytesize)
+                       kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
                if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
                                       &config->runtime_flags))
                        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-               nbd->task_recv = NULL;
+               nbd->pid = 0;
                if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE,
                                       &config->runtime_flags)) {
                        device_remove_file(disk_to_dev(nbd->disk), &backend_attr);
@@ -1282,7 +1336,7 @@ static int nbd_start_device(struct nbd_device *nbd)
        int num_connections = config->num_connections;
        int error = 0, i;
 
-       if (nbd->task_recv)
+       if (nbd->pid)
                return -EBUSY;
        if (!config->socks)
                return -EINVAL;
@@ -1301,7 +1355,7 @@ static int nbd_start_device(struct nbd_device *nbd)
        }
 
        blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
-       nbd->task_recv = current;
+       nbd->pid = task_pid_nr(current);
 
        nbd_parse_flags(nbd);
 
@@ -1557,8 +1611,8 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
 {
        struct nbd_device *nbd = s->private;
 
-       if (nbd->task_recv)
-               seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
+       if (nbd->pid)
+               seq_printf(s, "recv: %d\n", nbd->pid);
 
        return 0;
 }
@@ -1762,7 +1816,9 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
        disk->fops = &nbd_fops;
        disk->private_data = nbd;
        sprintf(disk->disk_name, "nbd%d", index);
-       add_disk(disk);
+       err = add_disk(disk);
+       if (err)
+               goto out_err_disk;
 
        /*
         * Now publish the device.
@@ -1771,6 +1827,8 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
        nbd_total_devices++;
        return nbd;
 
+out_err_disk:
+       blk_cleanup_disk(disk);
 out_free_idr:
        mutex_lock(&nbd_index_mutex);
        idr_remove(&nbd_index_idr, index);
@@ -2135,7 +2193,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
        mutex_lock(&nbd->config_lock);
        config = nbd->config;
        if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
-           !nbd->task_recv) {
+           !nbd->pid) {
                dev_err(nbd_to_dev(nbd),
                        "not configured, cannot reconfigure\n");
                ret = -EINVAL;
index 187d779..323af5c 100644
@@ -92,6 +92,10 @@ static int g_submit_queues = 1;
 module_param_named(submit_queues, g_submit_queues, int, 0444);
 MODULE_PARM_DESC(submit_queues, "Number of submission queues");
 
+static int g_poll_queues = 1;
+module_param_named(poll_queues, g_poll_queues, int, 0444);
+MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues");
+
 static int g_home_node = NUMA_NO_NODE;
 module_param_named(home_node, g_home_node, int, 0444);
 MODULE_PARM_DESC(home_node, "Home node for the device");
@@ -324,29 +328,69 @@ nullb_device_##NAME##_store(struct config_item *item, const char *page,   \
 }                                                                      \
 CONFIGFS_ATTR(nullb_device_, NAME);
 
-static int nullb_apply_submit_queues(struct nullb_device *dev,
-                                    unsigned int submit_queues)
+static int nullb_update_nr_hw_queues(struct nullb_device *dev,
+                                    unsigned int submit_queues,
+                                    unsigned int poll_queues)
+
 {
-       struct nullb *nullb = dev->nullb;
        struct blk_mq_tag_set *set;
+       int ret, nr_hw_queues;
 
-       if (!nullb)
+       if (!dev->nullb)
                return 0;
 
        /*
+        * Make sure at least one queue exists for each of submit and poll.
+        */
+       if (!submit_queues || !poll_queues)
+               return -EINVAL;
+
+       /*
         * Make sure that null_init_hctx() does not access nullb->queues[] past
         * the end of that array.
         */
-       if (submit_queues > nr_cpu_ids)
+       if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues)
                return -EINVAL;
-       set = nullb->tag_set;
-       blk_mq_update_nr_hw_queues(set, submit_queues);
-       return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM;
+
+       /*
+        * Keep previous and new queue numbers in nullb_device for reference in
+        * the call back function null_map_queues().
+        */
+       dev->prev_submit_queues = dev->submit_queues;
+       dev->prev_poll_queues = dev->poll_queues;
+       dev->submit_queues = submit_queues;
+       dev->poll_queues = poll_queues;
+
+       set = dev->nullb->tag_set;
+       nr_hw_queues = submit_queues + poll_queues;
+       blk_mq_update_nr_hw_queues(set, nr_hw_queues);
+       ret = set->nr_hw_queues == nr_hw_queues ? 0 : -ENOMEM;
+
+       if (ret) {
+               /* on error, revert the queue numbers */
+               dev->submit_queues = dev->prev_submit_queues;
+               dev->poll_queues = dev->prev_poll_queues;
+       }
+
+       return ret;
+}
+
+static int nullb_apply_submit_queues(struct nullb_device *dev,
+                                    unsigned int submit_queues)
+{
+       return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
+}
+
+static int nullb_apply_poll_queues(struct nullb_device *dev,
+                                  unsigned int poll_queues)
+{
+       return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
 }
 
 NULLB_DEVICE_ATTR(size, ulong, NULL);
 NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
 NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
+NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues);
 NULLB_DEVICE_ATTR(home_node, uint, NULL);
 NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
 NULLB_DEVICE_ATTR(blocksize, uint, NULL);
@@ -466,6 +510,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
        &nullb_device_attr_size,
        &nullb_device_attr_completion_nsec,
        &nullb_device_attr_submit_queues,
+       &nullb_device_attr_poll_queues,
        &nullb_device_attr_home_node,
        &nullb_device_attr_queue_mode,
        &nullb_device_attr_blocksize,
@@ -593,6 +638,9 @@ static struct nullb_device *null_alloc_dev(void)
        dev->size = g_gb * 1024;
        dev->completion_nsec = g_completion_nsec;
        dev->submit_queues = g_submit_queues;
+       dev->prev_submit_queues = g_submit_queues;
+       dev->poll_queues = g_poll_queues;
+       dev->prev_poll_queues = g_poll_queues;
        dev->home_node = g_home_node;
        dev->queue_mode = g_queue_mode;
        dev->blocksize = g_bs;
@@ -1422,7 +1470,7 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
        return &nullb->queues[index];
 }
 
-static blk_qc_t null_submit_bio(struct bio *bio)
+static void null_submit_bio(struct bio *bio)
 {
        sector_t sector = bio->bi_iter.bi_sector;
        sector_t nr_sectors = bio_sectors(bio);
@@ -1434,7 +1482,6 @@ static blk_qc_t null_submit_bio(struct bio *bio)
        cmd->bio = bio;
 
        null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio));
-       return BLK_QC_T_NONE;
 }
 
 static bool should_timeout_request(struct request *rq)
@@ -1455,12 +1502,100 @@ static bool should_requeue_request(struct request *rq)
        return false;
 }
 
+static int null_map_queues(struct blk_mq_tag_set *set)
+{
+       struct nullb *nullb = set->driver_data;
+       int i, qoff;
+       unsigned int submit_queues = g_submit_queues;
+       unsigned int poll_queues = g_poll_queues;
+
+       if (nullb) {
+               struct nullb_device *dev = nullb->dev;
+
+               /*
+                * Refer nr_hw_queues of the tag set to check if the expected
+                * number of hardware queues are prepared. If block layer failed
+                * to prepare them, use previous numbers of submit queues and
+                * poll queues to map queues.
+                */
+               if (set->nr_hw_queues ==
+                   dev->submit_queues + dev->poll_queues) {
+                       submit_queues = dev->submit_queues;
+                       poll_queues = dev->poll_queues;
+               } else if (set->nr_hw_queues ==
+                          dev->prev_submit_queues + dev->prev_poll_queues) {
+                       submit_queues = dev->prev_submit_queues;
+                       poll_queues = dev->prev_poll_queues;
+               } else {
+                       pr_warn("tag set has unexpected nr_hw_queues: %d\n",
+                               set->nr_hw_queues);
+                       return -EINVAL;
+               }
+       }
+
+       for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+               struct blk_mq_queue_map *map = &set->map[i];
+
+               switch (i) {
+               case HCTX_TYPE_DEFAULT:
+                       map->nr_queues = submit_queues;
+                       break;
+               case HCTX_TYPE_READ:
+                       map->nr_queues = 0;
+                       continue;
+               case HCTX_TYPE_POLL:
+                       map->nr_queues = poll_queues;
+                       break;
+               }
+               map->queue_offset = qoff;
+               qoff += map->nr_queues;
+               blk_mq_map_queues(map);
+       }
+
+       return 0;
+}
+
+static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+{
+       struct nullb_queue *nq = hctx->driver_data;
+       LIST_HEAD(list);
+       int nr = 0;
+
+       spin_lock(&nq->poll_lock);
+       list_splice_init(&nq->poll_list, &list);
+       spin_unlock(&nq->poll_lock);
+
+       while (!list_empty(&list)) {
+               struct nullb_cmd *cmd;
+               struct request *req;
+
+               req = list_first_entry(&list, struct request, queuelist);
+               list_del_init(&req->queuelist);
+               cmd = blk_mq_rq_to_pdu(req);
+               cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req),
+                                               blk_rq_sectors(req));
+               end_cmd(cmd);
+               nr++;
+       }
+
+       return nr;
+}
+
 static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
 {
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
        pr_info("rq %p timed out\n", rq);
 
+       if (hctx->type == HCTX_TYPE_POLL) {
+               struct nullb_queue *nq = hctx->driver_data;
+
+               spin_lock(&nq->poll_lock);
+               list_del_init(&rq->queuelist);
+               spin_unlock(&nq->poll_lock);
+       }
+
        /*
         * If the device is marked as blocking (i.e. memory backed or zoned
         * device), the submission path may be blocked waiting for resources
@@ -1481,10 +1616,11 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
        struct nullb_queue *nq = hctx->driver_data;
        sector_t nr_sectors = blk_rq_sectors(bd->rq);
        sector_t sector = blk_rq_pos(bd->rq);
+       const bool is_poll = hctx->type == HCTX_TYPE_POLL;
 
        might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-       if (nq->dev->irqmode == NULL_IRQ_TIMER) {
+       if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) {
                hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                cmd->timer.function = null_cmd_timer_expired;
        }
@@ -1508,6 +1644,13 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
                        return BLK_STS_OK;
                }
        }
+
+       if (is_poll) {
+               spin_lock(&nq->poll_lock);
+               list_add_tail(&bd->rq->queuelist, &nq->poll_list);
+               spin_unlock(&nq->poll_lock);
+               return BLK_STS_OK;
+       }
        if (cmd->fake_timeout)
                return BLK_STS_OK;
 
@@ -1543,6 +1686,8 @@ static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
        init_waitqueue_head(&nq->wait);
        nq->queue_depth = nullb->queue_depth;
        nq->dev = nullb->dev;
+       INIT_LIST_HEAD(&nq->poll_list);
+       spin_lock_init(&nq->poll_lock);
 }
 
 static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
@@ -1568,6 +1713,8 @@ static const struct blk_mq_ops null_mq_ops = {
        .queue_rq       = null_queue_rq,
        .complete       = null_complete_rq,
        .timeout        = null_timeout_rq,
+       .poll           = null_poll,
+       .map_queues     = null_map_queues,
        .init_hctx      = null_init_hctx,
        .exit_hctx      = null_exit_hctx,
 };
@@ -1664,13 +1811,17 @@ static int setup_commands(struct nullb_queue *nq)
 
 static int setup_queues(struct nullb *nullb)
 {
-       nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue),
+       int nqueues = nr_cpu_ids;
+
+       if (g_poll_queues)
+               nqueues += g_poll_queues;
+
+       nullb->queues = kcalloc(nqueues, sizeof(struct nullb_queue),
                                GFP_KERNEL);
        if (!nullb->queues)
                return -ENOMEM;
 
        nullb->queue_depth = nullb->dev->hw_queue_depth;
-
        return 0;
 }
 
@@ -1722,9 +1873,14 @@ static int null_gendisk_register(struct nullb *nullb)
 
 static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
 {
+       int poll_queues;
+
        set->ops = &null_mq_ops;
        set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
                                                g_submit_queues;
+       poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues;
+       if (poll_queues)
+               set->nr_hw_queues += poll_queues;
        set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
                                                g_hw_queue_depth;
        set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
@@ -1734,7 +1890,11 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
                set->flags |= BLK_MQ_F_NO_SCHED;
        if (g_shared_tag_bitmap)
                set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
-       set->driver_data = NULL;
+       set->driver_data = nullb;
+       if (g_poll_queues)
+               set->nr_maps = 3;
+       else
+               set->nr_maps = 1;
 
        if ((nullb && nullb->dev->blocking) || g_blocking)
                set->flags |= BLK_MQ_F_BLOCKING;
@@ -1754,6 +1914,13 @@ static int null_validate_conf(struct nullb_device *dev)
                dev->submit_queues = nr_cpu_ids;
        else if (dev->submit_queues == 0)
                dev->submit_queues = 1;
+       dev->prev_submit_queues = dev->submit_queues;
+
+       if (dev->poll_queues > g_poll_queues)
+               dev->poll_queues = g_poll_queues;
+       else if (dev->poll_queues == 0)
+               dev->poll_queues = 1;
+       dev->prev_poll_queues = dev->poll_queues;
 
        dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
        dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
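
The new null_map_queues() above lays poll queues out after the submit queues in the tag set's hardware-queue space: HCTX_TYPE_DEFAULT covers the first submit_queues entries, HCTX_TYPE_READ gets no dedicated queues, and HCTX_TYPE_POLL covers the remaining poll_queues entries, which is why null_init_tag_set() sizes nr_hw_queues to the sum and sets nr_maps to 3. A small standalone model of the offset arithmetic follows; the enum and struct are local stand-ins for the blk-mq types, and the example counts are arbitrary.

    #include <stdio.h>

    enum { TYPE_DEFAULT, TYPE_READ, TYPE_POLL, NR_TYPES };

    struct queue_map {
        unsigned int nr_queues;
        unsigned int queue_offset;
    };

    /* Mirrors the loop in null_map_queues(): poll queues follow submit queues. */
    static void map_queues(struct queue_map map[NR_TYPES],
                           unsigned int submit_queues, unsigned int poll_queues)
    {
        unsigned int qoff = 0;

        for (int i = 0; i < NR_TYPES; i++) {
            switch (i) {
            case TYPE_DEFAULT:
                map[i].nr_queues = submit_queues;
                break;
            case TYPE_READ:
                map[i].nr_queues = 0;
                continue;       /* no dedicated read queues */
            case TYPE_POLL:
                map[i].nr_queues = poll_queues;
                break;
            }
            map[i].queue_offset = qoff;
            qoff += map[i].nr_queues;
        }
    }

    int main(void)
    {
        struct queue_map map[NR_TYPES] = { { 0, 0 } };

        map_queues(map, 4, 2);  /* e.g. submit_queues=4, poll_queues=2 */
        for (int i = 0; i < NR_TYPES; i++)
            printf("map %d: nr_queues=%u queue_offset=%u\n",
                   i, map[i].nr_queues, map[i].queue_offset);
        return 0;
    }

With submit_queues=4 and poll_queues=2 the poll map starts at hardware queue 4, matching the six hardware queues the tag set would be sized for.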
index 64bef12..78eb56b 100644 (file)
@@ -32,6 +32,9 @@ struct nullb_queue {
        struct nullb_device *dev;
        unsigned int requeue_selection;
 
+       struct list_head poll_list;
+       spinlock_t poll_lock;
+
        struct nullb_cmd *cmds;
 };
 
@@ -83,6 +86,9 @@ struct nullb_device {
        unsigned int zone_max_open; /* max number of open zones */
        unsigned int zone_max_active; /* max number of active zones */
        unsigned int submit_queues; /* number of submission queues */
+       unsigned int prev_submit_queues; /* number of submission queues before change */
+       unsigned int poll_queues; /* number of IOPOLL submission queues */
+       unsigned int prev_poll_queues; /* number of IOPOLL submission queues before change */
        unsigned int home_node; /* home node for the device */
        unsigned int queue_mode; /* block interface */
        unsigned int blocksize; /* block size */
index f9cdd11..f6b1d63 100644 (file)
@@ -183,8 +183,6 @@ static int pcd_audio_ioctl(struct cdrom_device_info *cdi,
 static int pcd_packet(struct cdrom_device_info *cdi,
                      struct packet_command *cgc);
 
-static int pcd_detect(void);
-static void pcd_probe_capabilities(void);
 static void do_pcd_read_drq(void);
 static blk_status_t pcd_queue_rq(struct blk_mq_hw_ctx *hctx,
                                 const struct blk_mq_queue_data *bd);
@@ -302,53 +300,6 @@ static const struct blk_mq_ops pcd_mq_ops = {
        .queue_rq       = pcd_queue_rq,
 };
 
-static void pcd_init_units(void)
-{
-       struct pcd_unit *cd;
-       int unit;
-
-       pcd_drive_count = 0;
-       for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               struct gendisk *disk;
-
-               if (blk_mq_alloc_sq_tag_set(&cd->tag_set, &pcd_mq_ops, 1,
-                               BLK_MQ_F_SHOULD_MERGE))
-                       continue;
-
-               disk = blk_mq_alloc_disk(&cd->tag_set, cd);
-               if (IS_ERR(disk)) {
-                       blk_mq_free_tag_set(&cd->tag_set);
-                       continue;
-               }
-
-               INIT_LIST_HEAD(&cd->rq_list);
-               blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
-               cd->disk = disk;
-               cd->pi = &cd->pia;
-               cd->present = 0;
-               cd->last_sense = 0;
-               cd->changed = 1;
-               cd->drive = (*drives[unit])[D_SLV];
-               if ((*drives[unit])[D_PRT])
-                       pcd_drive_count++;
-
-               cd->name = &cd->info.name[0];
-               snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit);
-               cd->info.ops = &pcd_dops;
-               cd->info.handle = cd;
-               cd->info.speed = 0;
-               cd->info.capacity = 1;
-               cd->info.mask = 0;
-               disk->major = major;
-               disk->first_minor = unit;
-               disk->minors = 1;
-               strcpy(disk->disk_name, cd->name);      /* umm... */
-               disk->fops = &pcd_bdops;
-               disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
-               disk->events = DISK_EVENT_MEDIA_CHANGE;
-       }
-}
-
 static int pcd_open(struct cdrom_device_info *cdi, int purpose)
 {
        struct pcd_unit *cd = cdi->handle;
@@ -630,10 +581,11 @@ static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr)
        return CDS_DISC_OK;
 }
 
-static int pcd_identify(struct pcd_unit *cd, char *id)
+static int pcd_identify(struct pcd_unit *cd)
 {
-       int k, s;
        char id_cmd[12] = { 0x12, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+       char id[18];
+       int k, s;
 
        pcd_bufblk = -1;
 
@@ -661,108 +613,47 @@ static int pcd_identify(struct pcd_unit *cd, char *id)
 }
 
 /*
- * returns  0, with id set if drive is detected
- *         -1, if drive detection failed
+ * returns 0, with id set if drive is detected, otherwise an error code.
  */
-static int pcd_probe(struct pcd_unit *cd, int ms, char *id)
+static int pcd_probe(struct pcd_unit *cd, int ms)
 {
        if (ms == -1) {
                for (cd->drive = 0; cd->drive <= 1; cd->drive++)
-                       if (!pcd_reset(cd) && !pcd_identify(cd, id))
+                       if (!pcd_reset(cd) && !pcd_identify(cd))
                                return 0;
        } else {
                cd->drive = ms;
-               if (!pcd_reset(cd) && !pcd_identify(cd, id))
+               if (!pcd_reset(cd) && !pcd_identify(cd))
                        return 0;
        }
-       return -1;
+       return -ENODEV;
 }
 
-static void pcd_probe_capabilities(void)
+static int pcd_probe_capabilities(struct pcd_unit *cd)
 {
-       int unit, r;
-       char buffer[32];
        char cmd[12] = { 0x5a, 1 << 3, 0x2a, 0, 0, 0, 0, 18, 0, 0, 0, 0 };
-       struct pcd_unit *cd;
-
-       for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               if (!cd->present)
-                       continue;
-               r = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities");
-               if (r)
-                       continue;
-               /* we should now have the cap page */
-               if ((buffer[11] & 1) == 0)
-                       cd->info.mask |= CDC_CD_R;
-               if ((buffer[11] & 2) == 0)
-                       cd->info.mask |= CDC_CD_RW;
-               if ((buffer[12] & 1) == 0)
-                       cd->info.mask |= CDC_PLAY_AUDIO;
-               if ((buffer[14] & 1) == 0)
-                       cd->info.mask |= CDC_LOCK;
-               if ((buffer[14] & 8) == 0)
-                       cd->info.mask |= CDC_OPEN_TRAY;
-               if ((buffer[14] >> 6) == 0)
-                       cd->info.mask |= CDC_CLOSE_TRAY;
-       }
-}
-
-static int pcd_detect(void)
-{
-       char id[18];
-       int k, unit;
-       struct pcd_unit *cd;
+       char buffer[32];
+       int ret;
 
-       printk("%s: %s version %s, major %d, nice %d\n",
-              name, name, PCD_VERSION, major, nice);
+       ret = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities");
+       if (ret)
+               return ret;
+
+       /* we should now have the cap page */
+       if ((buffer[11] & 1) == 0)
+               cd->info.mask |= CDC_CD_R;
+       if ((buffer[11] & 2) == 0)
+               cd->info.mask |= CDC_CD_RW;
+       if ((buffer[12] & 1) == 0)
+               cd->info.mask |= CDC_PLAY_AUDIO;
+       if ((buffer[14] & 1) == 0)
+               cd->info.mask |= CDC_LOCK;
+       if ((buffer[14] & 8) == 0)
+               cd->info.mask |= CDC_OPEN_TRAY;
+       if ((buffer[14] >> 6) == 0)
+               cd->info.mask |= CDC_CLOSE_TRAY;
 
-       par_drv = pi_register_driver(name);
-       if (!par_drv) {
-               pr_err("failed to register %s driver\n", name);
-               return -1;
-       }
-
-       k = 0;
-       if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
-               cd = pcd;
-               if (cd->disk && pi_init(cd->pi, 1, -1, -1, -1, -1, -1,
-                           pcd_buffer, PI_PCD, verbose, cd->name)) {
-                       if (!pcd_probe(cd, -1, id)) {
-                               cd->present = 1;
-                               k++;
-                       } else
-                               pi_release(cd->pi);
-               }
-       } else {
-               for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-                       int *conf = *drives[unit];
-                       if (!conf[D_PRT])
-                               continue;
-                       if (!cd->disk)
-                               continue;
-                       if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD],
-                                    conf[D_UNI], conf[D_PRO], conf[D_DLY],
-                                    pcd_buffer, PI_PCD, verbose, cd->name)) 
-                               continue;
-                       if (!pcd_probe(cd, conf[D_SLV], id)) {
-                               cd->present = 1;
-                               k++;
-                       } else
-                               pi_release(cd->pi);
-               }
-       }
-       if (k)
-               return 0;
-
-       printk("%s: No CD-ROM drive found\n", name);
-       for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               if (!cd->disk)
-                       continue;
-               blk_cleanup_disk(cd->disk);
-               blk_mq_free_tag_set(&cd->tag_set);
-       }
-       pi_unregister_driver(par_drv);
-       return -1;
+       return 0;
 }
 
 /* I/O request processing */
@@ -999,43 +890,130 @@ static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn)
        return 0;
 }
 
+static int pcd_init_unit(struct pcd_unit *cd, bool autoprobe, int port,
+               int mode, int unit, int protocol, int delay, int ms)
+{
+       struct gendisk *disk;
+       int ret;
+
+       ret = blk_mq_alloc_sq_tag_set(&cd->tag_set, &pcd_mq_ops, 1,
+                                     BLK_MQ_F_SHOULD_MERGE);
+       if (ret)
+               return ret;
+
+       disk = blk_mq_alloc_disk(&cd->tag_set, cd);
+       if (IS_ERR(disk)) {
+               ret = PTR_ERR(disk);
+               goto out_free_tag_set;
+       }
+
+       INIT_LIST_HEAD(&cd->rq_list);
+       blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
+       cd->disk = disk;
+       cd->pi = &cd->pia;
+       cd->present = 0;
+       cd->last_sense = 0;
+       cd->changed = 1;
+       cd->drive = (*drives[cd - pcd])[D_SLV];
+
+       cd->name = &cd->info.name[0];
+       snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit);
+       cd->info.ops = &pcd_dops;
+       cd->info.handle = cd;
+       cd->info.speed = 0;
+       cd->info.capacity = 1;
+       cd->info.mask = 0;
+       disk->major = major;
+       disk->first_minor = unit;
+       disk->minors = 1;
+       strcpy(disk->disk_name, cd->name);      /* umm... */
+       disk->fops = &pcd_bdops;
+       disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+       disk->events = DISK_EVENT_MEDIA_CHANGE;
+
+       if (!pi_init(cd->pi, autoprobe, port, mode, unit, protocol, delay,
+                       pcd_buffer, PI_PCD, verbose, cd->name)) {
+               ret = -ENODEV;
+               goto out_free_disk;
+       }
+       ret = pcd_probe(cd, ms);
+       if (ret)
+               goto out_pi_release;
+
+       cd->present = 1;
+       pcd_probe_capabilities(cd);
+       ret = register_cdrom(cd->disk, &cd->info);
+       if (ret)
+               goto out_pi_release;
+       ret = add_disk(cd->disk);
+       if (ret)
+               goto out_unreg_cdrom;
+       return 0;
+
+out_unreg_cdrom:
+       unregister_cdrom(&cd->info);
+out_pi_release:
+       pi_release(cd->pi);
+out_free_disk:
+       blk_cleanup_disk(cd->disk);
+out_free_tag_set:
+       blk_mq_free_tag_set(&cd->tag_set);
+       return ret;
+}
+
 static int __init pcd_init(void)
 {
-       struct pcd_unit *cd;
-       int unit;
+       int found = 0, unit;
 
        if (disable)
                return -EINVAL;
 
-       pcd_init_units();
+       if (register_blkdev(major, name))
+               return -EBUSY;
 
-       if (pcd_detect())
-               return -ENODEV;
+       pr_info("%s: %s version %s, major %d, nice %d\n",
+               name, name, PCD_VERSION, major, nice);
 
-       /* get the atapi capabilities page */
-       pcd_probe_capabilities();
+       par_drv = pi_register_driver(name);
+       if (!par_drv) {
+               pr_err("failed to register %s driver\n", name);
+               goto out_unregister_blkdev;
+       }
 
-       if (register_blkdev(major, name)) {
-               for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-                       if (!cd->disk)
-                               continue;
+       for (unit = 0; unit < PCD_UNITS; unit++) {
+               if ((*drives[unit])[D_PRT])
+                       pcd_drive_count++;
+       }
+
+       if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
+               if (!pcd_init_unit(pcd, 1, -1, -1, -1, -1, -1, -1))
+                       found++;
+       } else {
+               for (unit = 0; unit < PCD_UNITS; unit++) {
+                       struct pcd_unit *cd = &pcd[unit];
+                       int *conf = *drives[unit];
 
-                       blk_cleanup_queue(cd->disk->queue);
-                       blk_mq_free_tag_set(&cd->tag_set);
-                       put_disk(cd->disk);
+                       if (!conf[D_PRT])
+                               continue;
+                       if (!pcd_init_unit(cd, 0, conf[D_PRT], conf[D_MOD],
+                                       conf[D_UNI], conf[D_PRO], conf[D_DLY],
+                                       conf[D_SLV]))
+                               found++;
                }
-               return -EBUSY;
        }
 
-       for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               if (cd->present) {
-                       register_cdrom(cd->disk, &cd->info);
-                       cd->disk->private_data = cd;
-                       add_disk(cd->disk);
-               }
+       if (!found) {
+               pr_info("%s: No CD-ROM drive found\n", name);
+               goto out_unregister_pi_driver;
        }
 
        return 0;
+
+out_unregister_pi_driver:
+       pi_unregister_driver(par_drv);
+out_unregister_blkdev:
+       unregister_blkdev(major, name);
+       return -ENODEV;
 }
 
 static void __exit pcd_exit(void)
@@ -1044,20 +1022,18 @@ static void __exit pcd_exit(void)
        int unit;
 
        for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               if (!cd->disk)
+               if (!cd->present)
                        continue;
 
-               if (cd->present) {
-                       del_gendisk(cd->disk);
-                       pi_release(cd->pi);
-                       unregister_cdrom(&cd->info);
-               }
-               blk_cleanup_queue(cd->disk->queue);
+               unregister_cdrom(&cd->info);
+               del_gendisk(cd->disk);
+               pi_release(cd->pi);
+               blk_cleanup_disk(cd->disk);
+
                blk_mq_free_tag_set(&cd->tag_set);
-               put_disk(cd->disk);
        }
-       unregister_blkdev(major, name);
        pi_unregister_driver(par_drv);
+       unregister_blkdev(major, name);
 }
 
 MODULE_LICENSE("GPL");
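
The reworked pcd_probe_capabilities() above reads the ATAPI mode-sense capabilities page into a buffer and then marks every feature the page reports as absent by setting the corresponding bit in cd->info.mask (byte 11 covers CD-R/CD-RW, byte 12 audio playback, byte 14 locking and tray control). A tiny standalone model of that bit-to-flag translation follows; the CAP_* values are illustrative local flags, not the kernel's CDC_* constants, and the sample buffer contents are made up.

    #include <stdio.h>

    /* Illustrative stand-ins for the cdrom capability flags. */
    #define CAP_CD_R        (1u << 0)
    #define CAP_CD_RW       (1u << 1)
    #define CAP_PLAY_AUDIO  (1u << 2)
    #define CAP_LOCK        (1u << 3)
    #define CAP_OPEN_TRAY   (1u << 4)
    #define CAP_CLOSE_TRAY  (1u << 5)

    /*
     * Build a "not supported" mask from a mode-sense capabilities page,
     * using the same byte and bit positions the driver checks.
     */
    static unsigned int unsupported_mask(const unsigned char *buf)
    {
        unsigned int mask = 0;

        if ((buf[11] & 1) == 0)
            mask |= CAP_CD_R;
        if ((buf[11] & 2) == 0)
            mask |= CAP_CD_RW;
        if ((buf[12] & 1) == 0)
            mask |= CAP_PLAY_AUDIO;
        if ((buf[14] & 1) == 0)
            mask |= CAP_LOCK;
        if ((buf[14] & 8) == 0)
            mask |= CAP_OPEN_TRAY;
        if ((buf[14] >> 6) == 0)
            mask |= CAP_CLOSE_TRAY;
        return mask;
    }

    int main(void)
    {
        /* Fake page: reads CD-R/CD-RW, plays audio, can lock, has no tray motor. */
        unsigned char page[18] = { 0 };

        page[11] = 0x03;
        page[12] = 0x01;
        page[14] = 0x01;
        printf("unsupported feature mask: 0x%x\n", unsupported_mask(page));
        return 0;
    }

The refactor keeps this logic per unit so pcd_init_unit() can run it right after a successful probe instead of looping over all units afterwards.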
index 675327d..fba8650 100644 (file)
@@ -775,14 +775,14 @@ static int pd_special_command(struct pd_unit *disk,
        struct request *rq;
        struct pd_req *req;
 
-       rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
+       rq = blk_mq_alloc_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        req = blk_mq_rq_to_pdu(rq);
 
        req->func = func;
        blk_execute_rq(disk->gd, rq, 0);
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return 0;
 }
 
@@ -875,9 +875,27 @@ static const struct blk_mq_ops pd_mq_ops = {
        .queue_rq       = pd_queue_rq,
 };
 
-static void pd_probe_drive(struct pd_unit *disk)
+static int pd_probe_drive(struct pd_unit *disk, int autoprobe, int port,
+               int mode, int unit, int protocol, int delay)
 {
+       int index = disk - pd;
+       int *parm = *drives[index];
        struct gendisk *p;
+       int ret;
+
+       disk->pi = &disk->pia;
+       disk->access = 0;
+       disk->changed = 1;
+       disk->capacity = 0;
+       disk->drive = parm[D_SLV];
+       snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a' + index);
+       disk->alt_geom = parm[D_GEO];
+       disk->standby = parm[D_SBY];
+       INIT_LIST_HEAD(&disk->rq_list);
+
+       if (!pi_init(disk->pi, autoprobe, port, mode, unit, protocol, delay,
+                       pd_scratch, PI_PD, verbose, disk->name))
+               return -ENXIO;
 
        memset(&disk->tag_set, 0, sizeof(disk->tag_set));
        disk->tag_set.ops = &pd_mq_ops;
@@ -887,14 +905,14 @@ static void pd_probe_drive(struct pd_unit *disk)
        disk->tag_set.queue_depth = 2;
        disk->tag_set.numa_node = NUMA_NO_NODE;
        disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
-
-       if (blk_mq_alloc_tag_set(&disk->tag_set))
-               return;
+       ret = blk_mq_alloc_tag_set(&disk->tag_set);
+       if (ret)
+               goto pi_release;
 
        p = blk_mq_alloc_disk(&disk->tag_set, disk);
        if (IS_ERR(p)) {
-               blk_mq_free_tag_set(&disk->tag_set);
-               return;
+               ret = PTR_ERR(p);
+               goto free_tag_set;
        }
        disk->gd = p;
 
@@ -905,102 +923,88 @@ static void pd_probe_drive(struct pd_unit *disk)
        p->minors = 1 << PD_BITS;
        p->events = DISK_EVENT_MEDIA_CHANGE;
        p->private_data = disk;
-
        blk_queue_max_hw_sectors(p->queue, cluster);
        blk_queue_bounce_limit(p->queue, BLK_BOUNCE_HIGH);
 
        if (disk->drive == -1) {
-               for (disk->drive = 0; disk->drive <= 1; disk->drive++)
-                       if (pd_special_command(disk, pd_identify) == 0)
-                               return;
-       } else if (pd_special_command(disk, pd_identify) == 0)
-               return;
-       disk->gd = NULL;
+               for (disk->drive = 0; disk->drive <= 1; disk->drive++) {
+                       ret = pd_special_command(disk, pd_identify);
+                       if (ret == 0)
+                               break;
+               }
+       } else {
+               ret = pd_special_command(disk, pd_identify);
+       }
+       if (ret)
+               goto put_disk;
+       set_capacity(disk->gd, disk->capacity);
+       ret = add_disk(disk->gd);
+       if (ret)
+               goto cleanup_disk;
+       return 0;
+cleanup_disk:
+       blk_cleanup_disk(disk->gd);
+put_disk:
        put_disk(p);
+       disk->gd = NULL;
+free_tag_set:
+       blk_mq_free_tag_set(&disk->tag_set);
+pi_release:
+       pi_release(disk->pi);
+       return ret;
 }
 
-static int pd_detect(void)
+static int __init pd_init(void)
 {
        int found = 0, unit, pd_drive_count = 0;
        struct pd_unit *disk;
 
-       for (unit = 0; unit < PD_UNITS; unit++) {
-               int *parm = *drives[unit];
-               struct pd_unit *disk = pd + unit;
-               disk->pi = &disk->pia;
-               disk->access = 0;
-               disk->changed = 1;
-               disk->capacity = 0;
-               disk->drive = parm[D_SLV];
-               snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a'+unit);
-               disk->alt_geom = parm[D_GEO];
-               disk->standby = parm[D_SBY];
-               if (parm[D_PRT])
-                       pd_drive_count++;
-               INIT_LIST_HEAD(&disk->rq_list);
-       }
+       if (disable)
+               return -ENODEV;
+
+       if (register_blkdev(major, name))
+               return -ENODEV;
+
+       printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
+              name, name, PD_VERSION, major, cluster, nice);
 
        par_drv = pi_register_driver(name);
        if (!par_drv) {
                pr_err("failed to register %s driver\n", name);
-               return -1;
+               goto out_unregister_blkdev;
        }
 
-       if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
-               disk = pd;
-               if (pi_init(disk->pi, 1, -1, -1, -1, -1, -1, pd_scratch,
-                           PI_PD, verbose, disk->name)) {
-                       pd_probe_drive(disk);
-                       if (!disk->gd)
-                               pi_release(disk->pi);
-               }
+       for (unit = 0; unit < PD_UNITS; unit++) {
+               int *parm = *drives[unit];
 
+               if (parm[D_PRT])
+                       pd_drive_count++;
+       }
+
+       if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
+               if (!pd_probe_drive(pd, 1, -1, -1, -1, -1, -1))
+                       found++;
        } else {
                for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
                        int *parm = *drives[unit];
                        if (!parm[D_PRT])
                                continue;
-                       if (pi_init(disk->pi, 0, parm[D_PRT], parm[D_MOD],
-                                    parm[D_UNI], parm[D_PRO], parm[D_DLY],
-                                    pd_scratch, PI_PD, verbose, disk->name)) {
-                               pd_probe_drive(disk);
-                               if (!disk->gd)
-                                       pi_release(disk->pi);
-                       }
-               }
-       }
-       for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
-               if (disk->gd) {
-                       set_capacity(disk->gd, disk->capacity);
-                       add_disk(disk->gd);
-                       found = 1;
+                       if (!pd_probe_drive(disk, 0, parm[D_PRT], parm[D_MOD],
+                                       parm[D_UNI], parm[D_PRO], parm[D_DLY]))
+                               found++;
                }
        }
        if (!found) {
                printk("%s: no valid drive found\n", name);
-               pi_unregister_driver(par_drv);
+               goto out_pi_unregister_driver;
        }
-       return found;
-}
-
-static int __init pd_init(void)
-{
-       if (disable)
-               goto out1;
-
-       if (register_blkdev(major, name))
-               goto out1;
-
-       printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
-              name, name, PD_VERSION, major, cluster, nice);
-       if (!pd_detect())
-               goto out2;
 
        return 0;
 
-out2:
+out_pi_unregister_driver:
+       pi_unregister_driver(par_drv);
+out_unregister_blkdev:
        unregister_blkdev(major, name);
-out1:
        return -ENODEV;
 }
 
index d5b9c88..bf8d0ef 100644 (file)
@@ -214,7 +214,6 @@ static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static void pf_release(struct gendisk *disk, fmode_t mode);
 
-static int pf_detect(void);
 static void do_pf_read(void);
 static void do_pf_read_start(void);
 static void do_pf_write(void);
@@ -285,45 +284,6 @@ static const struct blk_mq_ops pf_mq_ops = {
        .queue_rq       = pf_queue_rq,
 };
 
-static void __init pf_init_units(void)
-{
-       struct pf_unit *pf;
-       int unit;
-
-       pf_drive_count = 0;
-       for (unit = 0, pf = units; unit < PF_UNITS; unit++, pf++) {
-               struct gendisk *disk;
-
-               if (blk_mq_alloc_sq_tag_set(&pf->tag_set, &pf_mq_ops, 1,
-                               BLK_MQ_F_SHOULD_MERGE))
-                       continue;
-
-               disk = blk_mq_alloc_disk(&pf->tag_set, pf);
-               if (IS_ERR(disk)) {
-                       blk_mq_free_tag_set(&pf->tag_set);
-                       continue;
-               }
-
-               INIT_LIST_HEAD(&pf->rq_list);
-               blk_queue_max_segments(disk->queue, cluster);
-               blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
-               pf->disk = disk;
-               pf->pi = &pf->pia;
-               pf->media_status = PF_NM;
-               pf->drive = (*drives[unit])[D_SLV];
-               pf->lun = (*drives[unit])[D_LUN];
-               snprintf(pf->name, PF_NAMELEN, "%s%d", name, unit);
-               disk->major = major;
-               disk->first_minor = unit;
-               disk->minors = 1;
-               strcpy(disk->disk_name, pf->name);
-               disk->fops = &pf_fops;
-               disk->events = DISK_EVENT_MEDIA_CHANGE;
-               if (!(*drives[unit])[D_PRT])
-                       pf_drive_count++;
-       }
-}
-
 static int pf_open(struct block_device *bdev, fmode_t mode)
 {
        struct pf_unit *pf = bdev->bd_disk->private_data;
@@ -691,9 +651,9 @@ static int pf_identify(struct pf_unit *pf)
        return 0;
 }
 
-/*     returns  0, with id set if drive is detected
-               -1, if drive detection failed
-*/
+/*
+ * returns 0, with id set if drive is detected, otherwise an error code.
+ */
 static int pf_probe(struct pf_unit *pf)
 {
        if (pf->drive == -1) {
@@ -715,60 +675,7 @@ static int pf_probe(struct pf_unit *pf)
                        if (!pf_identify(pf))
                                return 0;
        }
-       return -1;
-}
-
-static int pf_detect(void)
-{
-       struct pf_unit *pf = units;
-       int k, unit;
-
-       printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
-              name, name, PF_VERSION, major, cluster, nice);
-
-       par_drv = pi_register_driver(name);
-       if (!par_drv) {
-               pr_err("failed to register %s driver\n", name);
-               return -1;
-       }
-       k = 0;
-       if (pf_drive_count == 0) {
-               if (pi_init(pf->pi, 1, -1, -1, -1, -1, -1, pf_scratch, PI_PF,
-                           verbose, pf->name)) {
-                       if (!pf_probe(pf) && pf->disk) {
-                               pf->present = 1;
-                               k++;
-                       } else
-                               pi_release(pf->pi);
-               }
-
-       } else
-               for (unit = 0; unit < PF_UNITS; unit++, pf++) {
-                       int *conf = *drives[unit];
-                       if (!conf[D_PRT])
-                               continue;
-                       if (pi_init(pf->pi, 0, conf[D_PRT], conf[D_MOD],
-                                   conf[D_UNI], conf[D_PRO], conf[D_DLY],
-                                   pf_scratch, PI_PF, verbose, pf->name)) {
-                               if (pf->disk && !pf_probe(pf)) {
-                                       pf->present = 1;
-                                       k++;
-                               } else
-                                       pi_release(pf->pi);
-                       }
-               }
-       if (k)
-               return 0;
-
-       printk("%s: No ATAPI disk detected\n", name);
-       for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-               if (!pf->disk)
-                       continue;
-               blk_cleanup_disk(pf->disk);
-               blk_mq_free_tag_set(&pf->tag_set);
-       }
-       pi_unregister_driver(par_drv);
-       return -1;
+       return -ENODEV;
 }
 
 /* The i/o request engine */
@@ -1014,61 +921,134 @@ static void do_pf_write_done(void)
        next_request(0);
 }
 
+static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port,
+               int mode, int unit, int protocol, int delay, int ms)
+{
+       struct gendisk *disk;
+       int ret;
+
+       ret = blk_mq_alloc_sq_tag_set(&pf->tag_set, &pf_mq_ops, 1,
+                                     BLK_MQ_F_SHOULD_MERGE);
+       if (ret)
+               return ret;
+
+       disk = blk_mq_alloc_disk(&pf->tag_set, pf);
+       if (IS_ERR(disk)) {
+               ret = PTR_ERR(disk);
+               goto out_free_tag_set;
+       }
+       disk->major = major;
+       disk->first_minor = pf - units;
+       disk->minors = 1;
+       strcpy(disk->disk_name, pf->name);
+       disk->fops = &pf_fops;
+       disk->events = DISK_EVENT_MEDIA_CHANGE;
+       disk->private_data = pf;
+
+       blk_queue_max_segments(disk->queue, cluster);
+       blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
+
+       INIT_LIST_HEAD(&pf->rq_list);
+       pf->disk = disk;
+       pf->pi = &pf->pia;
+       pf->media_status = PF_NM;
+       pf->drive = (*drives[disk->first_minor])[D_SLV];
+       pf->lun = (*drives[disk->first_minor])[D_LUN];
+       snprintf(pf->name, PF_NAMELEN, "%s%d", name, disk->first_minor);
+
+       if (!pi_init(pf->pi, autoprobe, port, mode, unit, protocol, delay,
+                       pf_scratch, PI_PF, verbose, pf->name)) {
+               ret = -ENODEV;
+               goto out_free_disk;
+       }
+       ret = pf_probe(pf);
+       if (ret)
+               goto out_pi_release;
+
+       ret = add_disk(disk);
+       if (ret)
+               goto out_pi_release;
+       pf->present = 1;
+       return 0;
+
+out_pi_release:
+       pi_release(pf->pi);
+out_free_disk:
+       blk_cleanup_disk(pf->disk);
+out_free_tag_set:
+       blk_mq_free_tag_set(&pf->tag_set);
+       return ret;
+}
+
 static int __init pf_init(void)
 {                              /* preliminary initialisation */
        struct pf_unit *pf;
-       int unit;
+       int found = 0, unit;
 
        if (disable)
                return -EINVAL;
 
-       pf_init_units();
+       if (register_blkdev(major, name))
+               return -EBUSY;
 
-       if (pf_detect())
-               return -ENODEV;
-       pf_busy = 0;
+       printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
+              name, name, PF_VERSION, major, cluster, nice);
 
-       if (register_blkdev(major, name)) {
-               for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-                       if (!pf->disk)
-                               continue;
-                       blk_cleanup_queue(pf->disk->queue);
-                       blk_mq_free_tag_set(&pf->tag_set);
-                       put_disk(pf->disk);
-               }
-               return -EBUSY;
+       par_drv = pi_register_driver(name);
+       if (!par_drv) {
+               pr_err("failed to register %s driver\n", name);
+               goto out_unregister_blkdev;
        }
 
-       for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-               struct gendisk *disk = pf->disk;
+       for (unit = 0; unit < PF_UNITS; unit++) {
+               if (!(*drives[unit])[D_PRT])
+                       pf_drive_count++;
+       }
 
-               if (!pf->present)
-                       continue;
-               disk->private_data = pf;
-               add_disk(disk);
+       pf = units;
+       if (pf_drive_count == 0) {
+               if (pf_init_unit(pf, 1, -1, -1, -1, -1, -1, verbose))
+                       found++;
+       } else {
+               for (unit = 0; unit < PF_UNITS; unit++, pf++) {
+                       int *conf = *drives[unit];
+                       if (!conf[D_PRT])
+                               continue;
+                       if (pf_init_unit(pf, 0, conf[D_PRT], conf[D_MOD],
+                                   conf[D_UNI], conf[D_PRO], conf[D_DLY],
+                                   verbose))
+                               found++;
+               }
+       }
+       if (!found) {
+               printk("%s: No ATAPI disk detected\n", name);
+               goto out_unregister_pi_driver;
        }
+       pf_busy = 0;
        return 0;
+
+out_unregister_pi_driver:
+       pi_unregister_driver(par_drv);
+out_unregister_blkdev:
+       unregister_blkdev(major, name);
+       return -ENODEV;
 }
 
 static void __exit pf_exit(void)
 {
        struct pf_unit *pf;
        int unit;
-       unregister_blkdev(major, name);
+
        for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-               if (!pf->disk)
+               if (!pf->present)
                        continue;
-
-               if (pf->present)
-                       del_gendisk(pf->disk);
-
-               blk_cleanup_queue(pf->disk->queue);
+               del_gendisk(pf->disk);
+               blk_cleanup_disk(pf->disk);
                blk_mq_free_tag_set(&pf->tag_set);
-               put_disk(pf->disk);
-
-               if (pf->present)
-                       pi_release(pf->pi);
+               pi_release(pf->pi);
        }
+
+       unregister_blkdev(major, name);
 }
 
 MODULE_LICENSE("GPL");
index 0f26b25..b53f648 100644 (file)
@@ -703,7 +703,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
        struct request *rq;
        int ret = 0;
 
-       rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
+       rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
                             REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
@@ -726,7 +726,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
        if (scsi_req(rq)->result)
                ret = -EIO;
 out:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return ret;
 }
 
@@ -2400,7 +2400,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
        }
 }
 
-static blk_qc_t pkt_submit_bio(struct bio *bio)
+static void pkt_submit_bio(struct bio *bio)
 {
        struct pktcdvd_device *pd;
        char b[BDEVNAME_SIZE];
@@ -2423,7 +2423,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
         */
        if (bio_data_dir(bio) == READ) {
                pkt_make_request_read(pd, bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
@@ -2455,10 +2455,9 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
                pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
        } while (split != bio);
 
-       return BLK_QC_T_NONE;
+       return;
 end_io:
        bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static void pkt_init_queue(struct pktcdvd_device *pd)
@@ -2537,6 +2536,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
        int i;
        char b[BDEVNAME_SIZE];
        struct block_device *bdev;
+       struct scsi_device *sdev;
 
        if (pd->pkt_dev == dev) {
                pkt_err(pd, "recursive setup not allowed\n");
@@ -2560,10 +2560,12 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
        bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
-       if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
+       sdev = scsi_device_from_queue(bdev->bd_disk->queue);
+       if (!sdev) {
                blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
                return -EINVAL;
        }
+       put_device(&sdev->sdev_gendev);
 
        /* This is safe, since we have a reference from open(). */
        __module_get(THIS_MODULE);
@@ -2729,7 +2731,9 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
        /* inherit events of the host device */
        disk->events = pd->bdev->bd_disk->events;
 
-       add_disk(disk);
+       ret = add_disk(disk);
+       if (ret)
+               goto out_mem2;
 
        pkt_sysfs_dev_new(pd);
        pkt_debugfs_dev_new(pd);
index c7b19e1..d1ebf19 100644 (file)
@@ -578,7 +578,7 @@ out:
        return next;
 }
 
-static blk_qc_t ps3vram_submit_bio(struct bio *bio)
+static void ps3vram_submit_bio(struct bio *bio)
 {
        struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data;
        struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
@@ -594,13 +594,11 @@ static blk_qc_t ps3vram_submit_bio(struct bio *bio)
        spin_unlock_irq(&priv->lock);
 
        if (busy)
-               return BLK_QC_T_NONE;
+               return;
 
        do {
                bio = ps3vram_do_bio(dev, bio);
        } while (bio);
-
-       return BLK_QC_T_NONE;
 }
 
 static const struct block_device_operations ps3vram_fops = {
index e65c9d7..953fa13 100644 (file)
@@ -836,7 +836,7 @@ struct rbd_options {
        u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
 };
 
-#define RBD_QUEUE_DEPTH_DEFAULT        BLKDEV_MAX_RQ
+#define RBD_QUEUE_DEPTH_DEFAULT        BLKDEV_DEFAULT_RQ
 #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024)
 #define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
 #define RBD_READ_ONLY_DEFAULT  false
@@ -7054,7 +7054,9 @@ static ssize_t do_rbd_add(struct bus_type *bus,
        if (rc)
                goto err_out_image_lock;
 
-       device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
+       rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
+       if (rc)
+               goto err_out_cleanup_disk;
 
        spin_lock(&rbd_dev_list_lock);
        list_add_tail(&rbd_dev->node, &rbd_dev_list);
@@ -7068,6 +7070,8 @@ out:
        module_put(THIS_MODULE);
        return rc;
 
+err_out_cleanup_disk:
+       rbd_free_disk(rbd_dev);
 err_out_image_lock:
        rbd_dev_image_unlock(rbd_dev);
        rbd_dev_device_release(rbd_dev);
index bd4a41a..2df0657 100644 (file)
@@ -1176,7 +1176,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        return ret;
 }
 
-static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct rnbd_queue *q = hctx->driver_data;
        struct rnbd_clt_dev *dev = q->dev;
@@ -1384,8 +1384,10 @@ static void setup_request_queue(struct rnbd_clt_dev *dev)
        blk_queue_write_cache(dev->queue, dev->wc, dev->fua);
 }
 
-static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
+static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
 {
+       int err;
+
        dev->gd->major          = rnbd_client_major;
        dev->gd->first_minor    = idx << RNBD_PART_BITS;
        dev->gd->minors         = 1 << RNBD_PART_BITS;
@@ -1410,7 +1412,11 @@ static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
 
        if (!dev->rotational)
                blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
-       add_disk(dev->gd);
+       err = add_disk(dev->gd);
+       if (err)
+               blk_cleanup_disk(dev->gd);
+
+       return err;
 }
 
 static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
@@ -1426,8 +1432,7 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
        rnbd_init_mq_hw_queues(dev);
 
        setup_request_queue(dev);
-       rnbd_clt_setup_gen_disk(dev, idx);
-       return 0;
+       return rnbd_clt_setup_gen_disk(dev, idx);
 }
 
 static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
index c1bc5c0..de5d5a8 100644 (file)
@@ -10,7 +10,7 @@
 #define RNBD_PROTO_H
 
 #include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/limits.h>
 #include <linux/inet.h>
 #include <linux/in.h>
index 8363671..8d9d69f 100644 (file)
@@ -935,7 +935,9 @@ static int rsxx_pci_probe(struct pci_dev *dev,
                        card->size8 = 0;
        }
 
-       rsxx_attach_dev(card);
+       st = rsxx_attach_dev(card);
+       if (st)
+               goto failed_create_dev;
 
        /************* Setup Debugfs *************/
        rsxx_debugfs_dev_new(card);
index 1cc40b0..dd33f1b 100644 (file)
@@ -50,7 +50,7 @@ struct rsxx_bio_meta {
 
 static struct kmem_cache *bio_meta_pool;
 
-static blk_qc_t rsxx_submit_bio(struct bio *bio);
+static void rsxx_submit_bio(struct bio *bio);
 
 /*----------------- Block Device Operations -----------------*/
 static int rsxx_blkdev_ioctl(struct block_device *bdev,
@@ -120,7 +120,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
        }
 }
 
-static blk_qc_t rsxx_submit_bio(struct bio *bio)
+static void rsxx_submit_bio(struct bio *bio)
 {
        struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data;
        struct rsxx_bio_meta *bio_meta;
@@ -169,7 +169,7 @@ static blk_qc_t rsxx_submit_bio(struct bio *bio)
        if (st)
                goto queue_err;
 
-       return BLK_QC_T_NONE;
+       return;
 
 queue_err:
        kmem_cache_free(bio_meta_pool, bio_meta);
@@ -177,7 +177,6 @@ req_err:
        if (st)
                bio->bi_status = st;
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 /*----------------- Device Setup -------------------*/
@@ -192,6 +191,8 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
 
 int rsxx_attach_dev(struct rsxx_cardinfo *card)
 {
+       int err = 0;
+
        mutex_lock(&card->dev_lock);
 
        /* The block device requires the stripe size from the config. */
@@ -200,13 +201,17 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card)
                        set_capacity(card->gendisk, card->size8 >> 9);
                else
                        set_capacity(card->gendisk, 0);
-               device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL);
-               card->bdev_attached = 1;
+               err = device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL);
+               if (err == 0)
+                       card->bdev_attached = 1;
        }
 
        mutex_unlock(&card->dev_lock);
 
-       return 0;
+       if (err)
+               blk_cleanup_disk(card->gendisk);
+
+       return err;
 }
 
 void rsxx_detach_dev(struct rsxx_cardinfo *card)
index 7ccc8d2..821594c 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/fd.h>
 #include <linux/slab.h>
 #include <linux/blk-mq.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/hdreg.h>
 #include <linux/kernel.h>
@@ -184,6 +185,7 @@ struct floppy_state {
 
        int             track;
        int             ref_count;
+       bool registered;
 
        struct gendisk *disk;
        struct blk_mq_tag_set tag_set;
@@ -771,6 +773,20 @@ static const struct blk_mq_ops swim_mq_ops = {
        .queue_rq = swim_queue_rq,
 };
 
+static void swim_cleanup_floppy_disk(struct floppy_state *fs)
+{
+       struct gendisk *disk = fs->disk;
+
+       if (!disk)
+               return;
+
+       if (fs->registered)
+               del_gendisk(fs->disk);
+
+       blk_cleanup_disk(disk);
+       blk_mq_free_tag_set(&fs->tag_set);
+}
+
 static int swim_floppy_init(struct swim_priv *swd)
 {
        int err;
@@ -827,7 +843,10 @@ static int swim_floppy_init(struct swim_priv *swd)
                swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE;
                swd->unit[drive].disk->private_data = &swd->unit[drive];
                set_capacity(swd->unit[drive].disk, 2880);
-               add_disk(swd->unit[drive].disk);
+               err = add_disk(swd->unit[drive].disk);
+               if (err)
+                       goto exit_put_disks;
+               swd->unit[drive].registered = true;
        }
 
        return 0;
@@ -835,12 +854,7 @@ static int swim_floppy_init(struct swim_priv *swd)
 exit_put_disks:
        unregister_blkdev(FLOPPY_MAJOR, "fd");
        do {
-               struct gendisk *disk = swd->unit[drive].disk;
-
-               if (!disk)
-                       continue;
-               blk_cleanup_disk(disk);
-               blk_mq_free_tag_set(&swd->unit[drive].tag_set);
+               swim_cleanup_floppy_disk(&swd->unit[drive]);
        } while (drive--);
        return err;
 }
@@ -909,12 +923,8 @@ static int swim_remove(struct platform_device *dev)
        int drive;
        struct resource *res;
 
-       for (drive = 0; drive < swd->floppy_count; drive++) {
-               del_gendisk(swd->unit[drive].disk);
-               blk_cleanup_queue(swd->unit[drive].disk->queue);
-               blk_mq_free_tag_set(&swd->unit[drive].tag_set);
-               put_disk(swd->unit[drive].disk);
-       }
+       for (drive = 0; drive < swd->floppy_count; drive++)
+               swim_cleanup_floppy_disk(&swd->unit[drive]);
 
        unregister_blkdev(FLOPPY_MAJOR, "fd");
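
The swim hunks above consolidate teardown into swim_cleanup_floppy_disk() and add a per-drive 'registered' flag so del_gendisk() is only called for drives whose add_disk() actually succeeded; the same helper then serves both the partial-failure path in swim_floppy_init() and swim_remove(). A small standalone model of that "only undo what was done" bookkeeping, with illustrative types and printouts in place of the real block-layer calls:

    #include <stdbool.h>
    #include <stdio.h>

    struct floppy {
        bool allocated;     /* disk and tag set were allocated */
        bool registered;    /* add_disk() succeeded */
    };

    static void cleanup_floppy(struct floppy *fs)
    {
        if (!fs->allocated)
            return;
        if (fs->registered)
            puts("del_gendisk");
        puts("free disk and tag set");
        fs->allocated = fs->registered = false;
    }

    int main(void)
    {
        /* Drive 0 fully registered; drive 1 failed right after allocation. */
        struct floppy drives[2] = { { true, true }, { true, false } };
        int drive = 1;      /* index where initialization stopped */

        do {
            cleanup_floppy(&drives[drive]);
        } while (drive--);
        return 0;
    }

The do/while over a decreasing index mirrors the exit_put_disks loop in the hunk above.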
 
index 965af0a..4b91c9a 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
+#include <linux/major.h>
 #include <asm/io.h>
 #include <asm/dbdma.h>
 #include <asm/prom.h>
@@ -1229,7 +1230,9 @@ static int swim3_attach(struct macio_dev *mdev,
        disk->flags |= GENHD_FL_REMOVABLE;
        sprintf(disk->disk_name, "fd%d", floppy_count);
        set_capacity(disk, 2880);
-       add_disk(disk);
+       rc = add_disk(disk);
+       if (rc)
+               goto out_cleanup_disk;
 
        disks[floppy_count++] = disk;
        return 0;
index 420cd95..d1676fe 100644 (file)
@@ -297,6 +297,7 @@ struct carm_host {
 
        struct work_struct              fsm_task;
 
+       int probe_err;
        struct completion               probe_comp;
 };
 
@@ -1181,8 +1182,11 @@ static void carm_fsm_task (struct work_struct *work)
                                struct gendisk *disk = port->disk;
 
                                set_capacity(disk, port->capacity);
-                               add_disk(disk);
-                               activated++;
+                               host->probe_err = add_disk(disk);
+                               if (!host->probe_err)
+                                       activated++;
+                               else
+                                       break;
                        }
 
                printk(KERN_INFO DRV_NAME "(%s): %d ports activated\n",
@@ -1192,11 +1196,9 @@ static void carm_fsm_task (struct work_struct *work)
                reschedule = 1;
                break;
        }
-
        case HST_PROBE_FINISHED:
                complete(&host->probe_comp);
                break;
-
        case HST_ERROR:
                /* FIXME: TODO */
                break;
@@ -1507,7 +1509,12 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
                goto err_out_free_irq;
 
        DPRINTK("waiting for probe_comp\n");
+       host->probe_err = -ENODEV;
        wait_for_completion(&host->probe_comp);
+       if (host->probe_err) {
+               rc = host->probe_err;
+               goto err_out_free_irq;
+       }
 
        printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major %d\n",
               host->name, pci_name(pdev), (int) CARM_MAX_PORTS,
index 303caf2..fc4fc95 100644 (file)
@@ -312,7 +312,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
        struct request *req;
        int err;
 
-       req = blk_get_request(q, REQ_OP_DRV_IN, 0);
+       req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
                return PTR_ERR(req);
 
@@ -323,7 +323,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
        blk_execute_rq(vblk->disk, req, false);
        err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
 out:
-       blk_put_request(req);
+       blk_mq_free_request(req);
        return err;
 }
 
@@ -815,9 +815,17 @@ static int virtblk_probe(struct virtio_device *vdev)
        err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
                                   struct virtio_blk_config, blk_size,
                                   &blk_size);
-       if (!err)
+       if (!err) {
+               err = blk_validate_block_size(blk_size);
+               if (err) {
+                       dev_err(&vdev->dev,
+                               "virtio_blk: invalid block size: 0x%x\n",
+                               blk_size);
+                       goto out_cleanup_disk;
+               }
+
                blk_queue_logical_block_size(q, blk_size);
-       else
+       } else
                blk_size = queue_logical_block_size(q);
 
        /* Use topology information if available */
index 33eba3d..914587a 100644 (file)
@@ -98,7 +98,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
                return;
        }
 
-       err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+       err = sync_blockdev(blkif->vbd.bdev);
        if (err) {
                xenbus_dev_error(blkif->be->dev, err, "block flush");
                return;
index 7290210..8e3983e 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/cdrom.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
 #include <linux/bitmap.h>
@@ -2385,7 +2386,13 @@ static void blkfront_connect(struct blkfront_info *info)
        for_each_rinfo(info, rinfo, i)
                kick_pending_request_queues(rinfo);
 
-       device_add_disk(&info->xbdev->dev, info->gd, NULL);
+       err = device_add_disk(&info->xbdev->dev, info->gd, NULL);
+       if (err) {
+               blk_cleanup_disk(info->gd);
+               blk_mq_free_tag_set(&info->tag_set);
+               info->rq = NULL;
+               goto fail;
+       }
 
        info->is_ready = 1;
        return;
index fcaf275..a68297f 100644 (file)
@@ -1598,22 +1598,18 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 /*
  * Handler function for all zram I/O requests.
  */
-static blk_qc_t zram_submit_bio(struct bio *bio)
+static void zram_submit_bio(struct bio *bio)
 {
        struct zram *zram = bio->bi_bdev->bd_disk->private_data;
 
        if (!valid_io_request(zram, bio->bi_iter.bi_sector,
                                        bio->bi_iter.bi_size)) {
                atomic64_inc(&zram->stats.invalid_io);
-               goto error;
+               bio_io_error(bio);
+               return;
        }
 
        __zram_make_request(zram, bio);
-       return BLK_QC_T_NONE;
-
-error:
-       bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static void zram_slot_free_notify(struct block_device *bdev,
index bd2e5b1..9877e41 100644 (file)
@@ -344,6 +344,12 @@ static void cdrom_sysctl_register(void);
 
 static LIST_HEAD(cdrom_list);
 
+static void signal_media_change(struct cdrom_device_info *cdi)
+{
+       cdi->mc_flags = 0x3; /* set media changed bits, on both queues */
+       cdi->last_media_change_ms = ktime_to_ms(ktime_get());
+}
+
 int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
                               struct packet_command *cgc)
 {
@@ -616,6 +622,7 @@ int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi)
        ENSURE(cdo, generic_packet, CDC_GENERIC_PACKET);
        cdi->mc_flags = 0;
        cdi->options = CDO_USE_FFLAGS;
+       cdi->last_media_change_ms = ktime_to_ms(ktime_get());
 
        if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY))
                cdi->options |= (int) CDO_AUTO_CLOSE;
@@ -864,7 +871,7 @@ static void cdrom_mmc3_profile(struct cdrom_device_info *cdi)
 {
        struct packet_command cgc;
        char buffer[32];
-       int ret, mmc3_profile;
+       int mmc3_profile;
 
        init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ);
 
@@ -874,7 +881,7 @@ static void cdrom_mmc3_profile(struct cdrom_device_info *cdi)
        cgc.cmd[8] = sizeof(buffer);            /* Allocation Length */
        cgc.quiet = 1;
 
-       if ((ret = cdi->ops->generic_packet(cdi, &cgc)))
+       if (cdi->ops->generic_packet(cdi, &cgc))
                mmc3_profile = 0xffff;
        else
                mmc3_profile = (buffer[6] << 8) | buffer[7];
@@ -1421,8 +1428,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
                cdi->ops->check_events(cdi, 0, slot);
 
        if (slot == CDSL_NONE) {
-               /* set media changed bits, on both queues */
-               cdi->mc_flags = 0x3;
+               signal_media_change(cdi);
                return cdrom_load_unload(cdi, -1);
        }
 
@@ -1455,7 +1461,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
                slot = curslot;
 
        /* set media changed bits on both queues */
-       cdi->mc_flags = 0x3;
+       signal_media_change(cdi);
        if ((ret = cdrom_load_unload(cdi, slot)))
                return ret;
 
@@ -1521,7 +1527,7 @@ int media_changed(struct cdrom_device_info *cdi, int queue)
        cdi->ioctl_events = 0;
 
        if (changed) {
-               cdi->mc_flags = 0x3;    /* set bit on both queues */
+               signal_media_change(cdi);
                ret |= 1;
                cdi->media_written = 0;
        }
@@ -2336,6 +2342,49 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi,
        return ret;
 }
 
+/*
+ * Media change detection with timing information.
+ *
+ * arg is a pointer to a cdrom_timed_media_change_info struct.
+ * arg->last_media_change may be set by calling code to signal
+ * the timestamp (in ms) of the last known media change (by the caller).
+ * Upon successful return, ioctl call will set arg->last_media_change
+ * to the latest media change timestamp known by the kernel/driver
+ * and set arg->has_changed to 1 if that timestamp is more recent
+ * than the timestamp set by the caller.
+ */
+static int cdrom_ioctl_timed_media_change(struct cdrom_device_info *cdi,
+               unsigned long arg)
+{
+       int ret;
+       struct cdrom_timed_media_change_info __user *info;
+       struct cdrom_timed_media_change_info tmp_info;
+
+       if (!CDROM_CAN(CDC_MEDIA_CHANGED))
+               return -ENOSYS;
+
+       info = (struct cdrom_timed_media_change_info __user *)arg;
+       cd_dbg(CD_DO_IOCTL, "entering CDROM_TIMED_MEDIA_CHANGE\n");
+
+       ret = cdrom_ioctl_media_changed(cdi, CDSL_CURRENT);
+       if (ret < 0)
+               return ret;
+
+       if (copy_from_user(&tmp_info, info, sizeof(tmp_info)) != 0)
+               return -EFAULT;
+
+       tmp_info.media_flags = 0;
+       if (tmp_info.last_media_change - cdi->last_media_change_ms < 0)
+               tmp_info.media_flags |= MEDIA_CHANGED_FLAG;
+
+       tmp_info.last_media_change = cdi->last_media_change_ms;
+
+       if (copy_to_user(info, &tmp_info, sizeof(*info)) != 0)
+               return -EFAULT;
+
+       return 0;
+}
+
 static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi,
                unsigned long arg)
 {
@@ -3313,6 +3362,8 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
                return cdrom_ioctl_eject_sw(cdi, arg);
        case CDROM_MEDIA_CHANGED:
                return cdrom_ioctl_media_changed(cdi, arg);
+       case CDROM_TIMED_MEDIA_CHANGE:
+               return cdrom_ioctl_timed_media_change(cdi, arg);
        case CDROM_SET_OPTIONS:
                return cdrom_ioctl_set_options(cdi, arg);
        case CDROM_CLEAR_OPTIONS:
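For context on the CDROM_TIMED_MEDIA_CHANGE ioctl added above, a minimal userspace sketch follows. It assumes the UAPI half of this series exports struct cdrom_timed_media_change_info, CDROM_TIMED_MEDIA_CHANGE and MEDIA_CHANGED_FLAG through <linux/cdrom.h>, and it uses /dev/sr0 as an example device node; both of those details are assumptions, not taken from the hunks shown here.

/* Hypothetical example: ask the kernel whether the medium changed since a
 * caller-supplied timestamp. Field and macro names follow the driver code
 * above; the header location and device path are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cdrom.h>

int main(void)
{
	struct cdrom_timed_media_change_info info = {
		.last_media_change = 0,	/* last change known to the caller, in ms */
	};
	int fd = open("/dev/sr0", O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, CDROM_TIMED_MEDIA_CHANGE, &info) < 0) {
		perror("CDROM_TIMED_MEDIA_CHANGE");
		close(fd);
		return 1;
	}
	/* The kernel writes back its latest known change timestamp and sets
	 * MEDIA_CHANGED_FLAG if that timestamp is newer than the one passed in. */
	printf("last media change: %lld ms, changed: %s\n",
	       (long long)info.last_media_change,
	       (info.media_flags & MEDIA_CHANGED_FLAG) ? "yes" : "no");
	close(fd);
	return 0;
}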
index 8e1fe75..d50cc1f 100644 (file)
@@ -805,9 +805,14 @@ static int probe_gdrom(struct platform_device *devptr)
                err = -ENOMEM;
                goto probe_fail_free_irqs;
        }
-       add_disk(gd.disk);
+       err = add_disk(gd.disk);
+       if (err)
+               goto probe_fail_add_disk;
+
        return 0;
 
+probe_fail_add_disk:
+       kfree(gd.toc);
 probe_fail_free_irqs:
        free_irq(HW_EVENT_GDROM_DMA, &gd);
        free_irq(HW_EVENT_GDROM_CMD, &gd);
index d6ba644..4a55164 100644 (file)
@@ -76,7 +76,7 @@ config TCG_TIS_SPI_CR50
 
 config TCG_TIS_SYNQUACER
        tristate "TPM Interface Specification 1.2 Interface / TPM 2.0 FIFO Interface (MMIO - SynQuacer)"
-       depends on ARCH_SYNQUACER
+       depends on ARCH_SYNQUACER || COMPILE_TEST
        select TCG_TIS_CORE
        help
          If you have a TPM security chip that is compliant with the
index 784b8b3..97e9168 100644 (file)
@@ -455,6 +455,9 @@ static int tpm2_map_response_body(struct tpm_chip *chip, u32 cc, u8 *rsp,
        if (be32_to_cpu(data->capability) != TPM2_CAP_HANDLES)
                return 0;
 
+       if (be32_to_cpu(data->count) > (UINT_MAX - TPM_HEADER_SIZE - 9) / 4)
+               return -EFAULT;
+
        if (len != TPM_HEADER_SIZE + 9 + 4 * be32_to_cpu(data->count))
                return -EFAULT;
 
index 69579ef..b2659a4 100644 (file)
@@ -48,6 +48,7 @@ static int wait_for_tpm_stat(struct tpm_chip *chip, u8 mask,
                unsigned long timeout, wait_queue_head_t *queue,
                bool check_cancel)
 {
+       struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
        unsigned long stop;
        long rc;
        u8 status;
@@ -80,8 +81,8 @@ again:
                }
        } else {
                do {
-                       usleep_range(TPM_TIMEOUT_USECS_MIN,
-                                    TPM_TIMEOUT_USECS_MAX);
+                       usleep_range(priv->timeout_min,
+                                    priv->timeout_max);
                        status = chip->ops->status(chip);
                        if ((status & mask) == mask)
                                return 0;
@@ -945,7 +946,22 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq,
        chip->timeout_b = msecs_to_jiffies(TIS_TIMEOUT_B_MAX);
        chip->timeout_c = msecs_to_jiffies(TIS_TIMEOUT_C_MAX);
        chip->timeout_d = msecs_to_jiffies(TIS_TIMEOUT_D_MAX);
+       priv->timeout_min = TPM_TIMEOUT_USECS_MIN;
+       priv->timeout_max = TPM_TIMEOUT_USECS_MAX;
        priv->phy_ops = phy_ops;
+
+       rc = tpm_tis_read32(priv, TPM_DID_VID(0), &vendor);
+       if (rc < 0)
+               goto out_err;
+
+       priv->manufacturer_id = vendor;
+
+       if (priv->manufacturer_id == TPM_VID_ATML &&
+               !(chip->flags & TPM_CHIP_FLAG_TPM2)) {
+               priv->timeout_min = TIS_TIMEOUT_MIN_ATML;
+               priv->timeout_max = TIS_TIMEOUT_MAX_ATML;
+       }
+
        dev_set_drvdata(&chip->dev, priv);
 
        if (is_bsw()) {
@@ -988,12 +1004,6 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq,
        if (rc)
                goto out_err;
 
-       rc = tpm_tis_read32(priv, TPM_DID_VID(0), &vendor);
-       if (rc < 0)
-               goto out_err;
-
-       priv->manufacturer_id = vendor;
-
        rc = tpm_tis_read8(priv, TPM_RID(0), &rid);
        if (rc < 0)
                goto out_err;
index b2a3c6c..3be24f2 100644 (file)
@@ -54,6 +54,8 @@ enum tis_defaults {
        TIS_MEM_LEN = 0x5000,
        TIS_SHORT_TIMEOUT = 750,        /* ms */
        TIS_LONG_TIMEOUT = 2000,        /* 2 sec */
+       TIS_TIMEOUT_MIN_ATML = 14700,   /* usecs */
+       TIS_TIMEOUT_MAX_ATML = 15000,   /* usecs */
 };
 
 /* Some timeout values are needed before it is known whether the chip is
@@ -98,6 +100,8 @@ struct tpm_tis_data {
        wait_queue_head_t read_queue;
        const struct tpm_tis_phy_ops *phy_ops;
        unsigned short rng_quality;
+       unsigned int timeout_min; /* usecs */
+       unsigned int timeout_max; /* usecs */
 };
 
 struct tpm_tis_phy_ops {
index 54584b4..aaa59a0 100644 (file)
@@ -267,6 +267,7 @@ static const struct spi_device_id tpm_tis_spi_id[] = {
        { "st33htpm-spi", (unsigned long)tpm_tis_spi_probe },
        { "slb9670", (unsigned long)tpm_tis_spi_probe },
        { "tpm_tis_spi", (unsigned long)tpm_tis_spi_probe },
+       { "tpm_tis-spi", (unsigned long)tpm_tis_spi_probe },
        { "cr50", (unsigned long)cr50_spi_probe },
        {}
 };
index 0506046..510a996 100644 (file)
@@ -58,11 +58,8 @@ static int clk_composite_determine_rate(struct clk_hw *hw,
        long rate;
        int i;
 
-       if (rate_hw && rate_ops && rate_ops->determine_rate) {
-               __clk_hw_set_clk(rate_hw, hw);
-               return rate_ops->determine_rate(rate_hw, req);
-       } else if (rate_hw && rate_ops && rate_ops->round_rate &&
-                  mux_hw && mux_ops && mux_ops->set_parent) {
+       if (rate_hw && rate_ops && rate_ops->round_rate &&
+           mux_hw && mux_ops && mux_ops->set_parent) {
                req->best_parent_hw = NULL;
 
                if (clk_hw_get_flags(hw) & CLK_SET_RATE_NO_REPARENT) {
@@ -107,6 +104,9 @@ static int clk_composite_determine_rate(struct clk_hw *hw,
 
                req->rate = best_rate;
                return 0;
+       } else if (rate_hw && rate_ops && rate_ops->determine_rate) {
+               __clk_hw_set_clk(rate_hw, hw);
+               return rate_ops->determine_rate(rate_hw, req);
        } else if (mux_hw && mux_ops && mux_ops->determine_rate) {
                __clk_hw_set_clk(mux_hw, hw);
                return mux_ops->determine_rate(mux_hw, req);
index 177d03e..40a052b 100644 (file)
@@ -256,6 +256,11 @@ mlxbf2_gpio_probe(struct platform_device *pdev)
                        NULL,
                        0);
 
+       if (ret) {
+               dev_err(dev, "bgpio_init failed\n");
+               return ret;
+       }
+
        gc->direction_input = mlxbf2_gpio_direction_input;
        gc->direction_output = mlxbf2_gpio_direction_output;
        gc->ngpio = npins;
index fa9b4d8..43ca52f 100644 (file)
@@ -224,7 +224,7 @@ static int iproc_gpio_probe(struct platform_device *pdev)
        }
 
        chip->gc.label = dev_name(dev);
-       if (of_property_read_u32(dn, "ngpios", &num_gpios))
+       if (!of_property_read_u32(dn, "ngpios", &num_gpios))
                chip->gc.ngpio = num_gpios;
 
        irq = platform_get_irq(pdev, 0);
index ff80786..01efda4 100644 (file)
@@ -1257,7 +1257,7 @@ static int nv_common_early_init(void *handle)
                        AMD_PG_SUPPORT_VCN_DPG |
                        AMD_PG_SUPPORT_JPEG;
                if (adev->pdev->device == 0x1681)
-                       adev->external_rev_id = adev->rev_id + 0x19;
+                       adev->external_rev_id = 0x20;
                else
                        adev->external_rev_id = adev->rev_id + 0x01;
                break;
index 87daa78..8080bba 100644 (file)
@@ -263,7 +263,7 @@ static ssize_t dp_link_settings_write(struct file *f, const char __user *buf,
        if (!wr_buf)
                return -ENOSPC;
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           (long *)param, buf,
                                           max_param_num,
                                           &param_nums)) {
@@ -487,7 +487,7 @@ static ssize_t dp_phy_settings_write(struct file *f, const char __user *buf,
        if (!wr_buf)
                return -ENOSPC;
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           (long *)param, buf,
                                           max_param_num,
                                           &param_nums)) {
@@ -639,7 +639,7 @@ static ssize_t dp_phy_test_pattern_debugfs_write(struct file *f, const char __us
        if (!wr_buf)
                return -ENOSPC;
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           (long *)param, buf,
                                           max_param_num,
                                           &param_nums)) {
@@ -914,7 +914,7 @@ static ssize_t dp_dsc_passthrough_set(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           &param, buf,
                                           max_param_num,
                                           &param_nums)) {
@@ -1211,7 +1211,7 @@ static ssize_t trigger_hotplug(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                                (long *)param, buf,
                                                max_param_num,
                                                &param_nums)) {
@@ -1396,7 +1396,7 @@ static ssize_t dp_dsc_clock_en_write(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                            (long *)param, buf,
                                            max_param_num,
                                            &param_nums)) {
@@ -1581,7 +1581,7 @@ static ssize_t dp_dsc_slice_width_write(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                            (long *)param, buf,
                                            max_param_num,
                                            &param_nums)) {
@@ -1766,7 +1766,7 @@ static ssize_t dp_dsc_slice_height_write(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                            (long *)param, buf,
                                            max_param_num,
                                            &param_nums)) {
@@ -1944,7 +1944,7 @@ static ssize_t dp_dsc_bits_per_pixel_write(struct file *f, const char __user *bu
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                            (long *)param, buf,
                                            max_param_num,
                                            &param_nums)) {
@@ -2382,7 +2382,7 @@ static ssize_t dp_max_bpc_write(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           (long *)param, buf,
                                           max_param_num,
                                           &param_nums)) {
index 4a4894e..377c4e5 100644 (file)
@@ -366,32 +366,32 @@ static struct wm_table lpddr5_wm_table = {
                        .wm_inst = WM_A,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.65333,
-                       .sr_exit_time_us = 5.32,
-                       .sr_enter_plus_exit_time_us = 6.38,
+                       .sr_exit_time_us = 11.5,
+                       .sr_enter_plus_exit_time_us = 14.5,
                        .valid = true,
                },
                {
                        .wm_inst = WM_B,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.65333,
-                       .sr_exit_time_us = 9.82,
-                       .sr_enter_plus_exit_time_us = 11.196,
+                       .sr_exit_time_us = 11.5,
+                       .sr_enter_plus_exit_time_us = 14.5,
                        .valid = true,
                },
                {
                        .wm_inst = WM_C,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.65333,
-                       .sr_exit_time_us = 9.89,
-                       .sr_enter_plus_exit_time_us = 11.24,
+                       .sr_exit_time_us = 11.5,
+                       .sr_enter_plus_exit_time_us = 14.5,
                        .valid = true,
                },
                {
                        .wm_inst = WM_D,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.65333,
-                       .sr_exit_time_us = 9.748,
-                       .sr_enter_plus_exit_time_us = 11.102,
+                       .sr_exit_time_us = 11.5,
+                       .sr_enter_plus_exit_time_us = 14.5,
                        .valid = true,
                },
        }
@@ -518,14 +518,21 @@ static unsigned int find_clk_for_voltage(
                unsigned int voltage)
 {
        int i;
+       int max_voltage = 0;
+       int clock = 0;
 
        for (i = 0; i < NUM_SOC_VOLTAGE_LEVELS; i++) {
-               if (clock_table->SocVoltage[i] == voltage)
+               if (clock_table->SocVoltage[i] == voltage) {
                        return clocks[i];
+               } else if (clock_table->SocVoltage[i] >= max_voltage &&
+                               clock_table->SocVoltage[i] < voltage) {
+                       max_voltage = clock_table->SocVoltage[i];
+                       clock = clocks[i];
+               }
        }
 
-       ASSERT(0);
-       return 0;
+       ASSERT(clock);
+       return clock;
 }
 
 void dcn31_clk_mgr_helper_populate_bw_params(
index 3f2333e..3afa115 100644 (file)
@@ -76,10 +76,6 @@ void dcn31_init_hw(struct dc *dc)
        if (dc->clk_mgr && dc->clk_mgr->funcs->init_clocks)
                dc->clk_mgr->funcs->init_clocks(dc->clk_mgr);
 
-       // Initialize the dccg
-       if (res_pool->dccg->funcs->dccg_init)
-               res_pool->dccg->funcs->dccg_init(res_pool->dccg);
-
        if (IS_FPGA_MAXIMUS_DC(dc->ctx->dce_environment)) {
 
                REG_WRITE(REFCLK_CNTL, 0);
@@ -106,6 +102,9 @@ void dcn31_init_hw(struct dc *dc)
                hws->funcs.bios_golden_init(dc);
                hws->funcs.disable_vga(dc->hwseq);
        }
+       // Initialize the dccg
+       if (res_pool->dccg->funcs->dccg_init)
+               res_pool->dccg->funcs->dccg_init(res_pool->dccg);
 
        if (dc->debug.enable_mem_low_power.bits.dmcu) {
                // Force ERAM to shutdown if DMCU is not enabled
index 0006bba..79e92ec 100644 (file)
@@ -217,8 +217,8 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_1_soc = {
        .num_states = 5,
        .sr_exit_time_us = 9.0,
        .sr_enter_plus_exit_time_us = 11.0,
-       .sr_exit_z8_time_us = 402.0,
-       .sr_enter_plus_exit_z8_time_us = 520.0,
+       .sr_exit_z8_time_us = 442.0,
+       .sr_enter_plus_exit_z8_time_us = 560.0,
        .writeback_latency_us = 12.0,
        .dram_channel_width_bytes = 4,
        .round_trip_ping_latency_dcfclk_cycles = 106,
@@ -928,7 +928,7 @@ static const struct dc_debug_options debug_defaults_drv = {
        .disable_dcc = DCC_ENABLE,
        .vsr_support = true,
        .performance_trace = false,
-       .max_downscale_src_width = 3840,/*upto 4K*/
+       .max_downscale_src_width = 4096,/*upto true 4K*/
        .disable_pplib_wm_range = false,
        .scl_reset_length10 = true,
        .sanity_checks = false,
@@ -1590,6 +1590,13 @@ static int dcn31_populate_dml_pipes_from_context(
                pipe = &res_ctx->pipe_ctx[i];
                timing = &pipe->stream->timing;
 
+               /*
+                * Immediate flip can be set dynamically after enabling the plane.
+                * We need to require support for immediate flip or underflow can be
+                * intermittently experienced depending on peak b/w requirements.
+                */
+               pipes[pipe_cnt].pipe.src.immediate_flip = true;
+
                pipes[pipe_cnt].pipe.src.unbounded_req_mode = false;
                pipes[pipe_cnt].pipe.src.gpuvm = true;
                pipes[pipe_cnt].pipe.src.dcc_fraction_of_zs_req_luma = 0;
index ce55c9c..d58925c 100644 (file)
@@ -5398,9 +5398,9 @@ void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l
 
                                        v->MaximumReadBandwidthWithPrefetch =
                                                        v->MaximumReadBandwidthWithPrefetch
-                                                                       + dml_max4(
-                                                                                       v->VActivePixelBandwidth[i][j][k],
-                                                                                       v->VActiveCursorBandwidth[i][j][k]
+                                                                       + dml_max3(
+                                                                                       v->VActivePixelBandwidth[i][j][k]
+                                                                                                       + v->VActiveCursorBandwidth[i][j][k]
                                                                                                        + v->NoOfDPP[i][j][k]
                                                                                                                        * (v->meta_row_bandwidth[i][j][k]
                                                                                                                                        + v->dpte_row_bandwidth[i][j][k]),
index 5adc471..3d2f081 100644 (file)
@@ -227,7 +227,7 @@ enum {
 #define FAMILY_YELLOW_CARP                     146
 
 #define YELLOW_CARP_A0 0x01
-#define YELLOW_CARP_B0 0x1A
+#define YELLOW_CARP_B0 0x20
 #define YELLOW_CARP_UNKNOWN 0xFF
 
 #ifndef ASICREV_IS_YELLOW_CARP
index e9bd84e..be61975 100644 (file)
@@ -105,6 +105,7 @@ static enum mod_hdcp_status remove_display_from_topology_v3(
        dtm_cmd->dtm_status = TA_DTM_STATUS__GENERIC_FAILURE;
 
        psp_dtm_invoke(psp, dtm_cmd->cmd_id);
+       mutex_unlock(&psp->dtm_context.mutex);
 
        if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS) {
                status = remove_display_from_topology_v2(hdcp, index);
@@ -115,8 +116,6 @@ static enum mod_hdcp_status remove_display_from_topology_v3(
                HDCP_TOP_REMOVE_DISPLAY_TRACE(hdcp, display->index);
        }
 
-       mutex_unlock(&psp->dtm_context.mutex);
-
        return status;
 }
 
@@ -205,6 +204,7 @@ static enum mod_hdcp_status add_display_to_topology_v3(
        dtm_cmd->dtm_in_message.topology_update_v3.link_hdcp_cap = link->hdcp_supported_informational;
 
        psp_dtm_invoke(psp, dtm_cmd->cmd_id);
+       mutex_unlock(&psp->dtm_context.mutex);
 
        if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS) {
                status = add_display_to_topology_v2(hdcp, display);
@@ -214,8 +214,6 @@ static enum mod_hdcp_status add_display_to_topology_v3(
                HDCP_TOP_ADD_DISPLAY_TRACE(hdcp, display->index);
        }
 
-       mutex_unlock(&psp->dtm_context.mutex);
-
        return status;
 }
 
index f6bdec7..e1b2ce4 100644 (file)
@@ -134,6 +134,12 @@ static const struct dmi_system_id orientation_data[] = {
                  DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T103HAF"),
                },
                .driver_data = (void *)&lcd800x1280_rightside_up,
+       }, {    /* AYA NEO 2021 */
+               .matches = {
+                 DMI_EXACT_MATCH(DMI_SYS_VENDOR, "AYADEVICE"),
+                 DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "AYA NEO 2021"),
+               },
+               .driver_data = (void *)&lcd800x1280_rightside_up,
        }, {    /* GPD MicroPC (generic strings, also match on bios date) */
                .matches = {
                  DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Default string"),
@@ -185,6 +191,12 @@ static const struct dmi_system_id orientation_data[] = {
                  DMI_EXACT_MATCH(DMI_BOARD_NAME, "Default string"),
                },
                .driver_data = (void *)&gpd_win2,
+       }, {    /* GPD Win 3 */
+               .matches = {
+                 DMI_EXACT_MATCH(DMI_SYS_VENDOR, "GPD"),
+                 DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "G1618-03")
+               },
+               .driver_data = (void *)&lcd720x1280_rightside_up,
        }, {    /* I.T.Works TW891 */
                .matches = {
                  DMI_EXACT_MATCH(DMI_SYS_VENDOR, "To be filled by O.E.M."),
index abe3d61..5cf152b 100644 (file)
@@ -1916,6 +1916,9 @@ void intel_dp_sync_state(struct intel_encoder *encoder,
 {
        struct intel_dp *intel_dp = enc_to_intel_dp(encoder);
 
+       if (!crtc_state)
+               return;
+
        /*
         * Don't clobber DPCD if it's been already read out during output
         * setup (eDP) or detect.
index 1257f4f..438bbc7 100644 (file)
@@ -64,7 +64,7 @@ intel_timeline_pin_map(struct intel_timeline *timeline)
 
        timeline->hwsp_map = vaddr;
        timeline->hwsp_seqno = memset(vaddr + ofs, 0, TIMELINE_SEQNO_BYTES);
-       clflush(vaddr + ofs);
+       drm_clflush_virt_range(vaddr + ofs, TIMELINE_SEQNO_BYTES);
 
        return 0;
 }
@@ -225,7 +225,7 @@ void intel_timeline_reset_seqno(const struct intel_timeline *tl)
 
        memset(hwsp_seqno + 1, 0, TIMELINE_SEQNO_BYTES - sizeof(*hwsp_seqno));
        WRITE_ONCE(*hwsp_seqno, tl->seqno);
-       clflush(hwsp_seqno);
+       drm_clflush_virt_range(hwsp_seqno, TIMELINE_SEQNO_BYTES);
 }
 
 void intel_timeline_enter(struct intel_timeline *tl)
index 4037030..9023d4e 100644 (file)
@@ -11048,12 +11048,6 @@ enum skl_power_gate {
 #define  DC_STATE_DEBUG_MASK_CORES     (1 << 0)
 #define  DC_STATE_DEBUG_MASK_MEMORY_UP (1 << 1)
 
-#define BXT_P_CR_MC_BIOS_REQ_0_0_0     _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x7114)
-#define  BXT_REQ_DATA_MASK                     0x3F
-#define  BXT_DRAM_CHANNEL_ACTIVE_SHIFT         12
-#define  BXT_DRAM_CHANNEL_ACTIVE_MASK          (0xF << 12)
-#define  BXT_MEMORY_FREQ_MULTIPLIER_HZ         133333333
-
 #define BXT_D_CR_DRP0_DUNIT8                   0x1000
 #define BXT_D_CR_DRP0_DUNIT9                   0x1200
 #define  BXT_D_CR_DRP0_DUNIT_START             8
@@ -11084,9 +11078,7 @@ enum skl_power_gate {
 #define  BXT_DRAM_TYPE_LPDDR4                  (0x2 << 22)
 #define  BXT_DRAM_TYPE_DDR4                    (0x4 << 22)
 
-#define SKL_MEMORY_FREQ_MULTIPLIER_HZ          266666666
 #define SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU      _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5E04)
-#define  SKL_REQ_DATA_MASK                     (0xF << 0)
 #define  DG1_GEAR_TYPE                         REG_BIT(16)
 
 #define SKL_MAD_INTER_CHANNEL_0_0_0_MCHBAR_MCMAIN _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5000)
index 806ad68..63fec1c 100644 (file)
@@ -794,7 +794,6 @@ DECLARE_EVENT_CLASS(i915_request,
            TP_STRUCT__entry(
                             __field(u32, dev)
                             __field(u64, ctx)
-                            __field(u32, guc_id)
                             __field(u16, class)
                             __field(u16, instance)
                             __field(u32, seqno)
@@ -805,16 +804,14 @@ DECLARE_EVENT_CLASS(i915_request,
                           __entry->dev = rq->engine->i915->drm.primary->index;
                           __entry->class = rq->engine->uabi_class;
                           __entry->instance = rq->engine->uabi_instance;
-                          __entry->guc_id = rq->context->guc_id;
                           __entry->ctx = rq->fence.context;
                           __entry->seqno = rq->fence.seqno;
                           __entry->tail = rq->tail;
                           ),
 
-           TP_printk("dev=%u, engine=%u:%u, guc_id=%u, ctx=%llu, seqno=%u, tail=%u",
+           TP_printk("dev=%u, engine=%u:%u, ctx=%llu, seqno=%u, tail=%u",
                      __entry->dev, __entry->class, __entry->instance,
-                     __entry->guc_id, __entry->ctx, __entry->seqno,
-                     __entry->tail)
+                     __entry->ctx, __entry->seqno, __entry->tail)
 );
 
 DEFINE_EVENT(i915_request, i915_request_add,
index 5259eda..066a911 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/sched/clock.h>
 
 struct drm_i915_private;
 struct timer_list;
index 9186652..7acce64 100644 (file)
@@ -244,7 +244,6 @@ static int
 skl_get_dram_info(struct drm_i915_private *i915)
 {
        struct dram_info *dram_info = &i915->dram_info;
-       u32 mem_freq_khz, val;
        int ret;
 
        dram_info->type = skl_get_dram_type(i915);
@@ -255,17 +254,6 @@ skl_get_dram_info(struct drm_i915_private *i915)
        if (ret)
                return ret;
 
-       val = intel_uncore_read(&i915->uncore,
-                               SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU);
-       mem_freq_khz = DIV_ROUND_UP((val & SKL_REQ_DATA_MASK) *
-                                   SKL_MEMORY_FREQ_MULTIPLIER_HZ, 1000);
-
-       if (dram_info->num_channels * mem_freq_khz == 0) {
-               drm_info(&i915->drm,
-                        "Couldn't get system memory bandwidth\n");
-               return -EINVAL;
-       }
-
        return 0;
 }
 
@@ -350,24 +338,10 @@ static void bxt_get_dimm_info(struct dram_dimm_info *dimm, u32 val)
 static int bxt_get_dram_info(struct drm_i915_private *i915)
 {
        struct dram_info *dram_info = &i915->dram_info;
-       u32 dram_channels;
-       u32 mem_freq_khz, val;
-       u8 num_active_channels, valid_ranks = 0;
+       u32 val;
+       u8 valid_ranks = 0;
        int i;
 
-       val = intel_uncore_read(&i915->uncore, BXT_P_CR_MC_BIOS_REQ_0_0_0);
-       mem_freq_khz = DIV_ROUND_UP((val & BXT_REQ_DATA_MASK) *
-                                   BXT_MEMORY_FREQ_MULTIPLIER_HZ, 1000);
-
-       dram_channels = val & BXT_DRAM_CHANNEL_ACTIVE_MASK;
-       num_active_channels = hweight32(dram_channels);
-
-       if (mem_freq_khz * num_active_channels == 0) {
-               drm_info(&i915->drm,
-                        "Couldn't get system memory bandwidth\n");
-               return -EINVAL;
-       }
-
        /*
         * Now read each DUNIT8/9/10/11 to check the rank of each dimms.
         */
index 1c19a5d..8d8d8e2 100644 (file)
@@ -30,6 +30,7 @@ static void mock_setup(struct drm_plane_state *state)
        mock_device.driver = &mock_driver;
        mock_device.mode_config.prop_fb_damage_clips = &mock_prop;
        mock_plane.dev = &mock_device;
+       mock_obj_props.count = 0;
        mock_plane.base.properties = &mock_obj_props;
        mock_prop.base.id = 1; /* 0 is an invalid id */
        mock_prop.dev = &mock_device;
index 1c5ffe2..abf2d7a 100644 (file)
@@ -190,6 +190,7 @@ static void ttm_transfered_destroy(struct ttm_buffer_object *bo)
        struct ttm_transfer_obj *fbo;
 
        fbo = container_of(bo, struct ttm_transfer_obj, base);
+       dma_resv_fini(&fbo->base.base._resv);
        ttm_bo_put(fbo->bo);
        kfree(fbo);
 }
index a20b810..c00f8e2 100644 (file)
@@ -706,8 +706,9 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
 
        /* Construct the family header first */
        header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
-       memcpy(header->device_name, dev_name(&query->port->agent->device->dev),
-              LS_DEVICE_NAME_MAX);
+       strscpy_pad(header->device_name,
+                   dev_name(&query->port->agent->device->dev),
+                   LS_DEVICE_NAME_MAX);
        header->port_num = query->port->port_num;
 
        if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
index 489b436..3d42bd2 100644 (file)
@@ -878,6 +878,7 @@ void sc_disable(struct send_context *sc)
 {
        u64 reg;
        struct pio_buf *pbuf;
+       LIST_HEAD(wake_list);
 
        if (!sc)
                return;
@@ -912,19 +913,21 @@ void sc_disable(struct send_context *sc)
        spin_unlock(&sc->release_lock);
 
        write_seqlock(&sc->waitlock);
-       while (!list_empty(&sc->piowait)) {
+       if (!list_empty(&sc->piowait))
+               list_move(&sc->piowait, &wake_list);
+       write_sequnlock(&sc->waitlock);
+       while (!list_empty(&wake_list)) {
                struct iowait *wait;
                struct rvt_qp *qp;
                struct hfi1_qp_priv *priv;
 
-               wait = list_first_entry(&sc->piowait, struct iowait, list);
+               wait = list_first_entry(&wake_list, struct iowait, list);
                qp = iowait_to_qp(wait);
                priv = qp->priv;
                list_del_init(&priv->s_iowait.list);
                priv->s_iowait.lock = NULL;
                hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
        }
-       write_sequnlock(&sc->waitlock);
 
        spin_unlock_irq(&sc->alloc_lock);
 }
index 5fb92de..9b544a3 100644 (file)
@@ -1092,12 +1092,12 @@ irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info)
                if (cq->avoid_mem_cflct) {
                        ext_cqe = (__le64 *)((u8 *)cqe + 32);
                        get_64bit_val(ext_cqe, 24, &qword7);
-                       polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
+                       polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword7);
                } else {
                        peek_head = (cq->cq_ring.head + 1) % cq->cq_ring.size;
                        ext_cqe = cq->cq_base[peek_head].buf;
                        get_64bit_val(ext_cqe, 24, &qword7);
-                       polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
+                       polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword7);
                        if (!peek_head)
                                polarity ^= 1;
                }
index 7110ebf..102dc93 100644 (file)
@@ -3399,9 +3399,13 @@ static void irdma_process_cqe(struct ib_wc *entry,
                }
 
                if (cq_poll_info->ud_vlan_valid) {
-                       entry->vlan_id = cq_poll_info->ud_vlan & VLAN_VID_MASK;
-                       entry->wc_flags |= IB_WC_WITH_VLAN;
+                       u16 vlan = cq_poll_info->ud_vlan & VLAN_VID_MASK;
+
                        entry->sl = cq_poll_info->ud_vlan >> VLAN_PRIO_SHIFT;
+                       if (vlan) {
+                               entry->vlan_id = vlan;
+                               entry->wc_flags |= IB_WC_WITH_VLAN;
+                       }
                } else {
                        entry->sl = 0;
                }
index b68c575..b0d6ee0 100644 (file)
@@ -330,8 +330,10 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri)
 
                tc_node->enable = true;
                ret = irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_MODIFY_NODE);
-               if (ret)
+               if (ret) {
+                       vsi->unregister_qset(vsi, tc_node);
                        goto reg_err;
+               }
        }
        ibdev_dbg(to_ibdev(vsi->dev),
                  "WS: Using node %d which represents VSI %d TC %d\n",
@@ -350,6 +352,10 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri)
        }
        goto exit;
 
+reg_err:
+       irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_DELETE_NODE);
+       list_del(&tc_node->siblings);
+       irdma_free_node(vsi, tc_node);
 leaf_add_err:
        if (list_empty(&vsi_node->child_list_head)) {
                if (irdma_ws_cqp_cmd(vsi, vsi_node, IRDMA_OP_WS_DELETE_NODE))
@@ -369,11 +375,6 @@ vsi_add_err:
 exit:
        mutex_unlock(&vsi->dev->ws_mutex);
        return ret;
-
-reg_err:
-       mutex_unlock(&vsi->dev->ws_mutex);
-       irdma_ws_remove(vsi, user_pri);
-       return ret;
 }
 
 /**
index 3be36eb..22e2f4d 100644 (file)
@@ -1339,7 +1339,6 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
                goto err_2;
        }
        mr->mmkey.type = MLX5_MKEY_MR;
-       mr->desc_size = sizeof(struct mlx5_mtt);
        mr->umem = umem;
        set_mr_fields(dev, mr, umem->length, access_flags);
        kvfree(in);
@@ -1533,6 +1532,7 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
                ib_umem_release(&odp->umem);
                return ERR_CAST(mr);
        }
+       xa_init(&mr->implicit_children);
 
        odp->private = mr;
        err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
index b2fca11..e5abbcf 100644 (file)
@@ -4458,6 +4458,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                MLX5_SET(dctc, dctc, mtu, attr->path_mtu);
                MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index);
                MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
+               if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
+                       MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7);
 
                err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in,
                                           MLX5_ST_SZ_BYTES(create_dct_in), out,
index 3cb4feb..8def88c 100644 (file)
@@ -455,6 +455,7 @@ struct qedr_qp {
        /* synchronization objects used with iwarp ep */
        struct kref refcnt;
        struct completion iwarp_cm_comp;
+       struct completion qp_rel_comp;
        unsigned long iwarp_cm_flags; /* enum iwarp_cm_flags */
 };
 
index 1715fbe..a51fc68 100644 (file)
@@ -83,7 +83,7 @@ static void qedr_iw_free_qp(struct kref *ref)
 {
        struct qedr_qp *qp = container_of(ref, struct qedr_qp, refcnt);
 
-       kfree(qp);
+       complete(&qp->qp_rel_comp);
 }
 
 static void
index 3fbf172..dcb3653 100644 (file)
@@ -1357,6 +1357,7 @@ static void qedr_set_common_qp_params(struct qedr_dev *dev,
        if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
                kref_init(&qp->refcnt);
                init_completion(&qp->iwarp_cm_comp);
+               init_completion(&qp->qp_rel_comp);
        }
 
        qp->pd = pd;
@@ -2857,8 +2858,10 @@ int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
        qedr_free_qp_resources(dev, qp, udata);
 
-       if (rdma_protocol_iwarp(&dev->ibdev, 1))
+       if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
                qedr_iw_qp_rem_ref(&qp->ibqp);
+               wait_for_completion(&qp->qp_rel_comp);
+       }
 
        return 0;
 }
index a67599b..ac11943 100644 (file)
@@ -602,7 +602,7 @@ done:
 /*
  * How many pages in this iovec element?
  */
-static int qib_user_sdma_num_pages(const struct iovec *iov)
+static size_t qib_user_sdma_num_pages(const struct iovec *iov)
 {
        const unsigned long addr  = (unsigned long) iov->iov_base;
        const unsigned long  len  = iov->iov_len;
@@ -658,7 +658,7 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev,
 static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
                                   struct qib_user_sdma_queue *pq,
                                   struct qib_user_sdma_pkt *pkt,
-                                  unsigned long addr, int tlen, int npages)
+                                  unsigned long addr, int tlen, size_t npages)
 {
        struct page *pages[8];
        int i, j;
@@ -722,7 +722,7 @@ static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd,
        unsigned long idx;
 
        for (idx = 0; idx < niov; idx++) {
-               const int npages = qib_user_sdma_num_pages(iov + idx);
+               const size_t npages = qib_user_sdma_num_pages(iov + idx);
                const unsigned long addr = (unsigned long) iov[idx].iov_base;
 
                ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr,
@@ -824,8 +824,8 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                unsigned pktnw;
                unsigned pktnwc;
                int nfrags = 0;
-               int npages = 0;
-               int bytes_togo = 0;
+               size_t npages = 0;
+               size_t bytes_togo = 0;
                int tiddma = 0;
                int cfur;
 
@@ -885,7 +885,11 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
 
                        npages += qib_user_sdma_num_pages(&iov[idx]);
 
-                       bytes_togo += slen;
+                       if (check_add_overflow(bytes_togo, slen, &bytes_togo) ||
+                           bytes_togo > type_max(typeof(pkt->bytes_togo))) {
+                               ret = -EINVAL;
+                               goto free_pbc;
+                       }
                        pktnwc += slen >> 2;
                        idx++;
                        nfrags++;
@@ -904,8 +908,7 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                }
 
                if (frag_size) {
-                       int tidsmsize, n;
-                       size_t pktsize;
+                       size_t tidsmsize, n, pktsize, sz, addrlimit;
 
                        n = npages*((2*PAGE_SIZE/frag_size)+1);
                        pktsize = struct_size(pkt, addr, n);
@@ -923,14 +926,24 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                        else
                                tidsmsize = 0;
 
-                       pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL);
+                       if (check_add_overflow(pktsize, tidsmsize, &sz)) {
+                               ret = -EINVAL;
+                               goto free_pbc;
+                       }
+                       pkt = kmalloc(sz, GFP_KERNEL);
                        if (!pkt) {
                                ret = -ENOMEM;
                                goto free_pbc;
                        }
                        pkt->largepkt = 1;
                        pkt->frag_size = frag_size;
-                       pkt->addrlimit = n + ARRAY_SIZE(pkt->addr);
+                       if (check_add_overflow(n, ARRAY_SIZE(pkt->addr),
+                                              &addrlimit) ||
+                           addrlimit > type_max(typeof(pkt->addrlimit))) {
+                               ret = -EINVAL;
+                               goto free_pbc;
+                       }
+                       pkt->addrlimit = addrlimit;
 
                        if (tiddma) {
                                char *tidsm = (char *)pkt + pktsize;
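The qib_user_sdma changes above widen the page and byte counters to size_t and reject requests whose accumulated sizes overflow (check_add_overflow()) or no longer fit the narrower packet fields (type_max()), returning -EINVAL instead of silently wrapping. Below is a standalone sketch of that guard pattern in plain C, using the __builtin_add_overflow primitive that the kernel helper typically wraps; the function and the 16-bit field width are illustrative, not taken from the driver.

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative only: accumulate per-fragment lengths the way bytes_togo is
 * accumulated above, failing if the sum overflows or exceeds the maximum of
 * the (assumed 16-bit) field it will eventually be stored in. */
static int accumulate_len(size_t *total, size_t slen, size_t field_max)
{
	size_t sum;

	if (__builtin_add_overflow(*total, slen, &sum) || sum > field_max)
		return -1;	/* the driver maps this case to -EINVAL */
	*total = sum;
	return 0;
}

int main(void)
{
	size_t total = 0;

	if (accumulate_len(&total, 40000, USHRT_MAX) ||
	    accumulate_len(&total, 40000, USHRT_MAX))
		puts("rejected: byte count would overflow the packet field");
	else
		printf("total = %zu\n", total);
	return 0;
}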
index 49bdd78..3305f27 100644 (file)
@@ -1223,7 +1223,7 @@ int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
        spin_lock(&rdi->n_qps_lock);
        if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
                spin_unlock(&rdi->n_qps_lock);
-               ret = ENOMEM;
+               ret = -ENOMEM;
                goto bail_ip;
        }
 
index 5fc989a..9ed9c95 100644 (file)
 
 #define pr_fmt(fmt) "bcache: %s() " fmt, __func__
 
-#include <linux/bcache.h>
 #include <linux/bio.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 
+#include "bcache_ondisk.h"
 #include "bset.h"
 #include "util.h"
 #include "closure.h"
@@ -395,8 +395,6 @@ struct cached_dev {
        atomic_t                io_errors;
        unsigned int            error_limit;
        unsigned int            offline_seconds;
-
-       char                    backing_dev_name[BDEVNAME_SIZE];
 };
 
 enum alloc_reserve {
@@ -470,8 +468,6 @@ struct cache {
        atomic_long_t           meta_sectors_written;
        atomic_long_t           btree_sectors_written;
        atomic_long_t           sectors_written;
-
-       char                    cache_dev_name[BDEVNAME_SIZE];
 };
 
 struct gc_stat {
similarity index 99%
rename from include/uapi/linux/bcache.h
rename to drivers/md/bcache/bcache_ondisk.h
index cf7399f..9741358 100644 (file)
@@ -43,9 +43,9 @@ static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v)        \
 #define KEY_MAX_U64S           8
 
 KEY_FIELD(KEY_PTRS,    high, 60, 3)
-KEY_FIELD(HEADER_SIZE, high, 58, 2)
+KEY_FIELD(__PAD0,      high, 58, 2)
 KEY_FIELD(KEY_CSUM,    high, 56, 2)
-KEY_FIELD(KEY_PINNED,  high, 55, 1)
+KEY_FIELD(__PAD1,      high, 55, 1)
 KEY_FIELD(KEY_DIRTY,   high, 36, 1)
 
 KEY_FIELD(KEY_SIZE,    high, 20, KEY_SIZE_BITS)
index a50dcfd..d795c84 100644 (file)
@@ -2,10 +2,10 @@
 #ifndef _BCACHE_BSET_H
 #define _BCACHE_BSET_H
 
-#include <linux/bcache.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+#include "bcache_ondisk.h"
 #include "util.h" /* for time_stats */
 
 /*
index 0595559..93b67b8 100644 (file)
@@ -141,7 +141,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
        uint64_t crc = b->key.ptr[0];
        void *data = (void *) i + 8, *end = bset_bkey_last(i);
 
-       crc = bch_crc64_update(crc, data, end - data);
+       crc = crc64_be(crc, data, end - data);
        return crc ^ 0xffffffffffffffffULL;
 }
 
index 116edda..6230dfd 100644 (file)
@@ -127,21 +127,20 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 
        citer.bi_size = UINT_MAX;
        bio_for_each_segment(bv, bio, iter) {
-               void *p1 = kmap_atomic(bv.bv_page);
+               void *p1 = bvec_kmap_local(&bv);
                void *p2;
 
                cbv = bio_iter_iovec(check, citer);
-               p2 = page_address(cbv.bv_page);
+               p2 = bvec_kmap_local(&cbv);
 
-               cache_set_err_on(memcmp(p1 + bv.bv_offset,
-                                       p2 + bv.bv_offset,
-                                       bv.bv_len),
+               cache_set_err_on(memcmp(p1, p2, bv.bv_len),
                                 dc->disk.c,
-                                "verify failed at dev %s sector %llu",
-                                dc->backing_dev_name,
+                                "verify failed at dev %pg sector %llu",
+                                dc->bdev,
                                 (uint64_t) bio->bi_iter.bi_sector);
 
-               kunmap_atomic(p1);
+               kunmap_local(p2);
+               kunmap_local(p1);
                bio_advance_iter(check, &citer, bv.bv_len);
        }
 
index 6d2b7b8..634922c 100644 (file)
@@ -6,7 +6,7 @@
  * Copyright 2020 Coly Li <colyli@suse.de>
  *
  */
-#include <linux/bcache.h>
+#include "bcache_ondisk.h"
 #include "bcache.h"
 #include "features.h"
 
index d1c8fd3..09161b8 100644 (file)
@@ -2,10 +2,11 @@
 #ifndef _BCACHE_FEATURES_H
 #define _BCACHE_FEATURES_H
 
-#include <linux/bcache.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+#include "bcache_ondisk.h"
+
 #define BCH_FEATURE_COMPAT             0
 #define BCH_FEATURE_RO_COMPAT          1
 #define BCH_FEATURE_INCOMPAT           2
index e4388fe..9c6f9ec 100644 (file)
@@ -65,15 +65,15 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
         * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors.
         */
        if (bio->bi_opf & REQ_RAHEAD) {
-               pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore\n",
-                                   dc->backing_dev_name);
+               pr_warn_ratelimited("%pg: Read-ahead I/O failed on backing device, ignore\n",
+                                   dc->bdev);
                return;
        }
 
        errors = atomic_add_return(1, &dc->io_errors);
        if (errors < dc->error_limit)
-               pr_err("%s: IO error on backing device, unrecoverable\n",
-                       dc->backing_dev_name);
+               pr_err("%pg: IO error on backing device, unrecoverable\n",
+                       dc->bdev);
        else
                bch_cached_dev_error(dc);
 }
@@ -123,13 +123,13 @@ void bch_count_io_errors(struct cache *ca,
                errors >>= IO_ERROR_SHIFT;
 
                if (errors < ca->set->error_limit)
-                       pr_err("%s: IO error on %s%s\n",
-                              ca->cache_dev_name, m,
+                       pr_err("%pg: IO error on %s%s\n",
+                              ca->bdev, m,
                               is_read ? ", recovering." : ".");
                else
                        bch_cache_set_error(ca->set,
-                                           "%s: too many IO errors %s\n",
-                                           ca->cache_dev_name, m);
+                                           "%pg: too many IO errors %s\n",
+                                           ca->bdev, m);
        }
 }
 
index 6d1de88..d15aae6 100644 (file)
@@ -46,7 +46,7 @@ static void bio_csum(struct bio *bio, struct bkey *k)
        bio_for_each_segment(bv, bio, iter) {
                void *d = kmap(bv.bv_page) + bv.bv_offset;
 
-               csum = bch_crc64_update(csum, d, bv.bv_len);
+               csum = crc64_be(csum, d, bv.bv_len);
                kunmap(bv.bv_page);
        }
 
@@ -651,8 +651,8 @@ static void backing_request_endio(struct bio *bio)
                 */
                if (unlikely(s->iop.writeback &&
                             bio->bi_opf & REQ_PREFLUSH)) {
-                       pr_err("Can't flush %s: returned bi_status %i\n",
-                               dc->backing_dev_name, bio->bi_status);
+                       pr_err("Can't flush %pg: returned bi_status %i\n",
+                               dc->bdev, bio->bi_status);
                } else {
                        /* set to orig_bio->bi_status in bio_complete() */
                        s->iop.status = bio->bi_status;
@@ -1163,7 +1163,7 @@ static void quit_max_writeback_rate(struct cache_set *c,
 
 /* Cached devices - read & write stuff */
 
-blk_qc_t cached_dev_submit_bio(struct bio *bio)
+void cached_dev_submit_bio(struct bio *bio)
 {
        struct search *s;
        struct block_device *orig_bdev = bio->bi_bdev;
@@ -1176,7 +1176,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
                     dc->io_disable)) {
                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        if (likely(d->c)) {
@@ -1222,8 +1222,6 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
        } else
                /* I/O request sent to backing device */
                detached_dev_do_request(d, bio, orig_bdev, start_time);
-
-       return BLK_QC_T_NONE;
 }
 
 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
@@ -1273,7 +1271,7 @@ static void flash_dev_nodata(struct closure *cl)
        continue_at(cl, search_free, NULL);
 }
 
-blk_qc_t flash_dev_submit_bio(struct bio *bio)
+void flash_dev_submit_bio(struct bio *bio)
 {
        struct search *s;
        struct closure *cl;
@@ -1282,7 +1280,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
        if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio));
@@ -1298,7 +1296,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
                continue_at_nobarrier(&s->cl,
                                      flash_dev_nodata,
                                      bcache_wq);
-               return BLK_QC_T_NONE;
+               return;
        } else if (bio_data_dir(bio)) {
                bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
                                        &KEY(d->id, bio->bi_iter.bi_sector, 0),
@@ -1314,7 +1312,6 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
        }
 
        continue_at(cl, search_free, NULL);
-       return BLK_QC_T_NONE;
 }
 
 static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
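
The hunks above drop the blk_qc_t return value from bcache's submit_bio hooks, following the 5.16 block-layer change that made bio-based ->submit_bio callbacks return void. A minimal sketch of the resulting shape, using hypothetical example_* names rather than bcache symbols:

#include <linux/blkdev.h>

/* hypothetical per-device state; real drivers keep this behind disk->private_data */
struct example_dev {
	bool io_disabled;
};

static void example_submit_bio(struct bio *bio)
{
	struct example_dev *dev = bio->bi_bdev->bd_disk->private_data;

	if (unlikely(dev->io_disabled)) {
		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;			/* previously: return BLK_QC_T_NONE; */
	}

	/* forward the bio; no polling cookie is returned to the caller anymore */
	submit_bio_noacct(bio);
}
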
index 82b3836..38ab485 100644 (file)
@@ -37,10 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c);
 void bch_data_insert(struct closure *cl);
 
 void bch_cached_dev_request_init(struct cached_dev *dc);
-blk_qc_t cached_dev_submit_bio(struct bio *bio);
+void cached_dev_submit_bio(struct bio *bio);
 
 void bch_flash_dev_request_init(struct bcache_device *d);
-blk_qc_t flash_dev_submit_bio(struct bio *bio);
+void flash_dev_submit_bio(struct bio *bio);
 
 extern struct kmem_cache *bch_search_cache;
 
index f2874c7..4a9a65d 100644 (file)
@@ -1002,7 +1002,7 @@ static void calc_cached_dev_sectors(struct cache_set *c)
        struct cached_dev *dc;
 
        list_for_each_entry(dc, &c->cached_devs, list)
-               sectors += bdev_sectors(dc->bdev);
+               sectors += bdev_nr_sectors(dc->bdev);
 
        c->cached_dev_sectors = sectors;
 }
@@ -1026,8 +1026,8 @@ static int cached_dev_status_update(void *arg)
                        dc->offline_seconds = 0;
 
                if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
-                       pr_err("%s: device offline for %d seconds\n",
-                              dc->backing_dev_name,
+                       pr_err("%pg: device offline for %d seconds\n",
+                              dc->bdev,
                               BACKING_DEV_OFFLINE_TIMEOUT);
                        pr_err("%s: disable I/O request due to backing device offline\n",
                               dc->disk.name);
@@ -1058,15 +1058,13 @@ int bch_cached_dev_run(struct cached_dev *dc)
        };
 
        if (dc->io_disable) {
-               pr_err("I/O disabled on cached dev %s\n",
-                      dc->backing_dev_name);
+               pr_err("I/O disabled on cached dev %pg\n", dc->bdev);
                ret = -EIO;
                goto out;
        }
 
        if (atomic_xchg(&dc->running, 1)) {
-               pr_info("cached dev %s is running already\n",
-                      dc->backing_dev_name);
+               pr_info("cached dev %pg is running already\n", dc->bdev);
                ret = -EBUSY;
                goto out;
        }
@@ -1082,7 +1080,9 @@ int bch_cached_dev_run(struct cached_dev *dc)
                closure_sync(&cl);
        }
 
-       add_disk(d->disk);
+       ret = add_disk(d->disk);
+       if (ret)
+               goto out;
        bd_link_disk_holder(dc->bdev, dc->disk.disk);
        /*
         * won't show up in the uevent file, use udevadm monitor -e instead
@@ -1154,16 +1154,16 @@ static void cached_dev_detach_finish(struct work_struct *w)
 
        mutex_lock(&bch_register_lock);
 
-       calc_cached_dev_sectors(dc->disk.c);
        bcache_device_detach(&dc->disk);
        list_move(&dc->list, &uncached_devices);
+       calc_cached_dev_sectors(dc->disk.c);
 
        clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
        clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
 
        mutex_unlock(&bch_register_lock);
 
-       pr_info("Caching disabled for %s\n", dc->backing_dev_name);
+       pr_info("Caching disabled for %pg\n", dc->bdev);
 
        /* Drop ref we took in cached_dev_detach() */
        closure_put(&dc->disk.cl);
@@ -1203,29 +1203,27 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
                return -ENOENT;
 
        if (dc->disk.c) {
-               pr_err("Can't attach %s: already attached\n",
-                      dc->backing_dev_name);
+               pr_err("Can't attach %pg: already attached\n", dc->bdev);
                return -EINVAL;
        }
 
        if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
-               pr_err("Can't attach %s: shutting down\n",
-                      dc->backing_dev_name);
+               pr_err("Can't attach %pg: shutting down\n", dc->bdev);
                return -EINVAL;
        }
 
        if (dc->sb.block_size < c->cache->sb.block_size) {
                /* Will die */
-               pr_err("Couldn't attach %s: block size less than set's block size\n",
-                      dc->backing_dev_name);
+               pr_err("Couldn't attach %pg: block size less than set's block size\n",
+                      dc->bdev);
                return -EINVAL;
        }
 
        /* Check whether already attached */
        list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
                if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
-                       pr_err("Tried to attach %s but duplicate UUID already attached\n",
-                               dc->backing_dev_name);
+                       pr_err("Tried to attach %pg but duplicate UUID already attached\n",
+                               dc->bdev);
 
                        return -EINVAL;
                }
@@ -1243,15 +1241,13 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 
        if (!u) {
                if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
-                       pr_err("Couldn't find uuid for %s in set\n",
-                              dc->backing_dev_name);
+                       pr_err("Couldn't find uuid for %pg in set\n", dc->bdev);
                        return -ENOENT;
                }
 
                u = uuid_find_empty(c);
                if (!u) {
-                       pr_err("Not caching %s, no room for UUID\n",
-                              dc->backing_dev_name);
+                       pr_err("Not caching %pg, no room for UUID\n", dc->bdev);
                        return -EINVAL;
                }
        }
@@ -1319,8 +1315,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
                 */
                kthread_stop(dc->writeback_thread);
                cancel_writeback_rate_update_dwork(dc);
-               pr_err("Couldn't run cached device %s\n",
-                      dc->backing_dev_name);
+               pr_err("Couldn't run cached device %pg\n", dc->bdev);
                return ret;
        }
 
@@ -1336,8 +1331,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
        /* Allow the writeback thread to proceed */
        up_write(&dc->writeback_lock);
 
-       pr_info("Caching %s as %s on set %pU\n",
-               dc->backing_dev_name,
+       pr_info("Caching %pg as %s on set %pU\n",
+               dc->bdev,
                dc->disk.disk->disk_name,
                dc->disk.c->set_uuid);
        return 0;
@@ -1461,7 +1456,6 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
        struct cache_set *c;
        int ret = -ENOMEM;
 
-       bdevname(bdev, dc->backing_dev_name);
        memcpy(&dc->sb, sb, sizeof(struct cache_sb));
        dc->bdev = bdev;
        dc->bdev->bd_holder = dc;
@@ -1476,7 +1470,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
        if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
                goto err;
 
-       pr_info("registered backing device %s\n", dc->backing_dev_name);
+       pr_info("registered backing device %pg\n", dc->bdev);
 
        list_add(&dc->list, &uncached_devices);
        /* attach to a matched cache set if it exists */
@@ -1493,7 +1487,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 
        return 0;
 err:
-       pr_notice("error %s: %s\n", dc->backing_dev_name, err);
+       pr_notice("error %pg: %s\n", dc->bdev, err);
        bcache_device_stop(&dc->disk);
        return ret;
 }
@@ -1534,10 +1528,11 @@ static void flash_dev_flush(struct closure *cl)
 
 static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 {
+       int err = -ENOMEM;
        struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
                                          GFP_KERNEL);
        if (!d)
-               return -ENOMEM;
+               goto err_ret;
 
        closure_init(&d->cl, NULL);
        set_closure_fn(&d->cl, flash_dev_flush, system_wq);
@@ -1551,9 +1546,12 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
        bcache_device_attach(d, c, u - c->uuids);
        bch_sectors_dirty_init(d);
        bch_flash_dev_request_init(d);
-       add_disk(d->disk);
+       err = add_disk(d->disk);
+       if (err)
+               goto err;
 
-       if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
+       err = kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache");
+       if (err)
                goto err;
 
        bcache_device_link(d, c, "volume");
@@ -1567,7 +1565,8 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
        return 0;
 err:
        kobject_put(&d->kobj);
-       return -ENOMEM;
+err_ret:
+       return err;
 }
 
 static int flash_devs_run(struct cache_set *c)
@@ -1621,8 +1620,8 @@ bool bch_cached_dev_error(struct cached_dev *dc)
        /* make others know io_disable is true earlier */
        smp_mb();
 
-       pr_err("stop %s: too many IO errors on backing device %s\n",
-              dc->disk.disk->disk_name, dc->backing_dev_name);
+       pr_err("stop %s: too many IO errors on backing device %pg\n",
+              dc->disk.disk->disk_name, dc->bdev);
 
        bcache_device_stop(&dc->disk);
        return true;
@@ -2338,7 +2337,7 @@ err_btree_alloc:
 err_free:
        module_put(THIS_MODULE);
        if (err)
-               pr_notice("error %s: %s\n", ca->cache_dev_name, err);
+               pr_notice("error %pg: %s\n", ca->bdev, err);
        return ret;
 }
 
@@ -2348,7 +2347,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
        const char *err = NULL; /* must be set for any error case */
        int ret = 0;
 
-       bdevname(bdev, ca->cache_dev_name);
        memcpy(&ca->sb, sb, sizeof(struct cache_sb));
        ca->bdev = bdev;
        ca->bdev->bd_holder = ca;
@@ -2390,14 +2388,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
                goto out;
        }
 
-       pr_info("registered cache device %s\n", ca->cache_dev_name);
+       pr_info("registered cache device %pg\n", ca->bdev);
 
 out:
        kobject_put(&ca->kobj);
 
 err:
        if (err)
-               pr_notice("error %s: %s\n", ca->cache_dev_name, err);
+               pr_notice("error %pg: %s\n", ca->bdev, err);
 
        return ret;
 }
@@ -2617,8 +2615,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
        if (SB_IS_BDEV(sb)) {
                struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
 
-               if (!dc)
+               if (!dc) {
+                       ret = -ENOMEM;
+                       err = "cannot allocate memory";
                        goto out_put_sb_page;
+               }
 
                mutex_lock(&bch_register_lock);
                ret = register_bdev(sb, sb_disk, bdev, dc);
@@ -2629,11 +2630,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
        } else {
                struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 
-               if (!ca)
+               if (!ca) {
+                       ret = -ENOMEM;
+                       err = "cannot allocate memory";
                        goto out_put_sb_page;
+               }
 
                /* blkdev_put() will be called in bch_cache_release() */
-               if (register_cache(sb, sb_disk, bdev, ca) != 0)
+               ret = register_cache(sb, sb_disk, bdev, ca);
+               if (ret)
                        goto out_free_sb;
        }
 
@@ -2750,7 +2755,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
                 * The reason bch_register_lock is not held to call
                 * bch_cache_set_stop() and bcache_device_stop() is to
                 * avoid potential deadlock during reboot, because cache
-                * set or bcache device stopping process will acqurie
+                * set or bcache device stopping process will acquire
                 * bch_register_lock too.
                 *
                 * We are safe here because bcache_is_reboot sets to
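
Several registration paths in this series (the bcache cached/flash devices above, and dm later in the diff) now check the return value of add_disk(), which reports registration failures instead of returning void. A minimal sketch of the pattern, with a hypothetical example_* name; the unwind details depend on the driver:

#include <linux/genhd.h>

static int example_register_disk(struct gendisk *disk)
{
	int ret;

	ret = add_disk(disk);	/* can fail, e.g. on sysfs/kobject registration */
	if (ret)
		return ret;	/* caller unwinds whatever it set up before add_disk() */

	/* post-registration setup (holders, sysfs links, ...) goes here */
	return 0;
}
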
index 05ac1d6..1f0dce3 100644 (file)
@@ -271,7 +271,7 @@ SHOW(__bch_cached_dev)
        }
 
        if (attr == &sysfs_backing_dev_name) {
-               snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name);
+               snprintf(buf, BDEVNAME_SIZE + 1, "%pg", dc->bdev);
                strcat(buf, "\n");
                return strlen(buf);
        }
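
The cached device-name strings removed above (backing_dev_name, cache_dev_name, the bdevname() copies) are replaced throughout this series by the %pg printk specifier, which formats a struct block_device pointer directly. A small illustrative sketch, not bcache code:

#include <linux/blkdev.h>
#include <linux/printk.h>

static void example_report_error(struct block_device *bdev, int error)
{
	/* %pg prints the device name (e.g. "sda1" or "nvme0n1p2") from the bdev itself */
	pr_err("I/O error on %pg: error %d\n", bdev, error);
}
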
index 215df32..c1752ba 100644 (file)
@@ -51,13 +51,27 @@ STORE(fn)                                                           \
 #define sysfs_printf(file, fmt, ...)                                   \
 do {                                                                   \
        if (attr == &sysfs_ ## file)                                    \
-               return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
+               return sysfs_emit(buf, fmt "\n", __VA_ARGS__);  \
 } while (0)
 
 #define sysfs_print(file, var)                                         \
 do {                                                                   \
        if (attr == &sysfs_ ## file)                                    \
-               return snprint(buf, PAGE_SIZE, var);                    \
+               return sysfs_emit(buf,                                          \
+                               __builtin_types_compatible_p(typeof(var), int)          \
+                                        ? "%i\n" :                             \
+                               __builtin_types_compatible_p(typeof(var), unsigned int) \
+                                        ? "%u\n" :                             \
+                               __builtin_types_compatible_p(typeof(var), long)         \
+                                        ? "%li\n" :                    \
+                               __builtin_types_compatible_p(typeof(var), unsigned long)\
+                                        ? "%lu\n" :                    \
+                               __builtin_types_compatible_p(typeof(var), int64_t)      \
+                                        ? "%lli\n" :                   \
+                               __builtin_types_compatible_p(typeof(var), uint64_t)     \
+                                        ? "%llu\n" :                   \
+                               __builtin_types_compatible_p(typeof(var), const char *) \
+                                        ? "%s\n" : "%i\n", var);       \
 } while (0)
 
 #define sysfs_hprint(file, val)                                                \
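
The sysfs show-path macros above switch from snprintf(buf, PAGE_SIZE, ...) to sysfs_emit(), which knows the sysfs buffer is PAGE_SIZE bytes and so drops the explicit size argument. An illustrative show() callback with placeholder names:

#include <linux/sysfs.h>

static ssize_t example_hits_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	unsigned long hits = 0;		/* placeholder value for the sketch */

	/* was: return snprintf(buf, PAGE_SIZE, "%lu\n", hits); */
	return sysfs_emit(buf, "%lu\n", hits);
}
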
index b64460a..6f3cb7c 100644 (file)
@@ -340,23 +340,6 @@ static inline int bch_strtoul_h(const char *cp, long *res)
        _r;                                                             \
 })
 
-#define snprint(buf, size, var)                                                \
-       snprintf(buf, size,                                             \
-               __builtin_types_compatible_p(typeof(var), int)          \
-                    ? "%i\n" :                                         \
-               __builtin_types_compatible_p(typeof(var), unsigned int) \
-                    ? "%u\n" :                                         \
-               __builtin_types_compatible_p(typeof(var), long)         \
-                    ? "%li\n" :                                        \
-               __builtin_types_compatible_p(typeof(var), unsigned long)\
-                    ? "%lu\n" :                                        \
-               __builtin_types_compatible_p(typeof(var), int64_t)      \
-                    ? "%lli\n" :                                       \
-               __builtin_types_compatible_p(typeof(var), uint64_t)     \
-                    ? "%llu\n" :                                       \
-               __builtin_types_compatible_p(typeof(var), const char *) \
-                    ? "%s\n" : "%i\n", var)
-
 ssize_t bch_hprint(char *buf, int64_t v);
 
 bool bch_is_zero(const char *p, size_t n);
@@ -548,14 +531,6 @@ static inline uint64_t bch_crc64(const void *p, size_t len)
        return crc ^ 0xffffffffffffffffULL;
 }
 
-static inline uint64_t bch_crc64_update(uint64_t crc,
-                                       const void *p,
-                                       size_t len)
-{
-       crc = crc64_be(crc, p, len);
-       return crc;
-}
-
 /*
  * A stepwise-linear pseudo-exponential.  This returns 1 << (x >>
  * frac_bits), with the less-significant bits filled in by linear
@@ -584,8 +559,4 @@ static inline unsigned int fract_exp_two(unsigned int x,
 void bch_bio_map(struct bio *bio, void *base);
 int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
 
-static inline sector_t bdev_sectors(struct block_device *bdev)
-{
-       return bdev->bd_inode->i_size >> 9;
-}
 #endif /* _BCACHE_UTIL_H */
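
With bcache's local bdev_sectors() helper gone, the open-coded i_size_read(bdev->bd_inode) arithmetic is replaced by the block layer helpers used in the rest of this diff. A short sketch of the equivalences, assuming the helper names exactly as they appear in these hunks:

#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/printk.h>

static void example_report_size(struct block_device *bdev)
{
	sector_t sectors = bdev_nr_sectors(bdev); /* was i_size_read(bdev->bd_inode) >> SECTOR_SHIFT */
	loff_t bytes = bdev_nr_bytes(bdev);       /* was i_size_read(bdev->bd_inode) */

	pr_info("%pg: %llu sectors, %lld bytes\n",
		bdev, (unsigned long long)sectors, (long long)bytes);
}
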
index 8120da2..c7560f6 100644 (file)
@@ -45,7 +45,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
         * backing volume uses about 2% of the cache for dirty data.
         */
        uint32_t bdev_share =
-               div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
+               div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
                                c->cached_dev_sectors);
 
        uint64_t cache_dirty_target =
index a3b7135..745e3ab 100644 (file)
@@ -8,6 +8,7 @@
 #define DM_BIO_RECORD_H
 
 #include <linux/bio.h>
+#include <linux/blk-integrity.h>
 
 /*
  * There are lots of mutable fields in the bio struct that get
index 50f3e67..104ebc1 100644 (file)
@@ -1525,7 +1525,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
 
 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
 {
-       sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t s = bdev_nr_sectors(c->bdev);
        if (s >= c->start)
                s -= c->start;
        else
index 89a7320..2874f22 100644 (file)
@@ -334,7 +334,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
        int r;
        struct dm_block *sblock;
        struct cache_disk_superblock *disk_super;
-       sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t bdev_size = bdev_nr_sectors(cmd->bdev);
 
        /* FIXME: see if we can lose the max sectors limit */
        if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
index bdd5004..447d030 100644 (file)
@@ -1940,7 +1940,7 @@ static void cache_dtr(struct dm_target *ti)
 
 static sector_t get_dev_size(struct dm_dev *dev)
 {
-       return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(dev->bdev);
 }
 
 /*----------------------------------------------------------------*/
index edd22e4..4599632 100644 (file)
@@ -1514,7 +1514,7 @@ error:
 
 static sector_t get_dev_size(struct dm_dev *dev)
 {
-       return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(dev->bdev);
 }
 
 /*---------------------------------------------------------------------------*/
index 55dccdf..b855fef 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/ktime.h>
 #include <linux/genhd.h>
 #include <linux/blk-mq.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 
 #include <trace/events/block.h>
 
@@ -200,7 +200,7 @@ struct dm_table {
        struct dm_md_mempools *mempools;
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-       struct blk_keyslot_manager *ksm;
+       struct blk_crypto_profile *crypto_profile;
 #endif
 };
 
index 916b7da..292f789 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/key.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/crypto.h>
index 3163e2b..0367220 100644 (file)
@@ -415,7 +415,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
                        char *result, unsigned int maxlen)
 {
        struct dust_device *dd = ti->private;
-       sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t size = bdev_nr_sectors(dd->dev->bdev);
        bool invalid_msg = false;
        int r = -EINVAL;
        unsigned long long tmp, block;
@@ -544,8 +544,7 @@ static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (dd->start ||
-           ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+       if (dd->start || ti->len != bdev_nr_sectors(dev->bdev))
                return 1;
 
        return 0;
index d259896..7ce5d50 100644 (file)
@@ -416,7 +416,7 @@ static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
         * Only pass ioctls through if the device sizes match exactly.
         */
        *bdev = dev->bdev;
-       return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT);
+       return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev));
 }
 
 static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
index 2a78f68..1f6bf15 100644 (file)
@@ -1681,7 +1681,7 @@ static int era_message(struct dm_target *ti, unsigned argc, char **argv,
 
 static sector_t get_dev_size(struct dm_dev *dev)
 {
-       return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(dev->bdev);
 }
 
 static int era_iterate_devices(struct dm_target *ti,
index 3f4139a..b5f20eb 100644 (file)
@@ -168,7 +168,7 @@ static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
  */
 static inline sector_t get_dev_size(struct block_device *bdev)
 {
-       return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(bdev);
 }
 
 static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
index 4b94ffe..345229d 100644 (file)
@@ -456,8 +456,7 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (fc->start ||
-           ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+       if (fc->start || ti->len != bdev_nr_sectors((*bdev)))
                return 1;
        return 0;
 }
index 2c5edfb..9579999 100644 (file)
@@ -12,6 +12,7 @@
 #include "dm-ima.h"
 
 #include <linux/ima.h>
+#include <linux/sched/mm.h>
 #include <crypto/hash.h>
 #include <linux/crypto.h>
 #include <crypto/hash_info.h>
index dc03b70..d0f788e 100644 (file)
@@ -4113,11 +4113,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                }
        }
 
-       ic->data_device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       ic->data_device_sectors = bdev_nr_sectors(ic->dev->bdev);
        if (!ic->meta_dev)
                ic->meta_device_sectors = ic->data_device_sectors;
        else
-               ic->meta_device_sectors = i_size_read(ic->meta_dev->bdev->bd_inode) >> SECTOR_SHIFT;
+               ic->meta_device_sectors = bdev_nr_sectors(ic->meta_dev->bdev);
 
        if (!journal_sectors) {
                journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
@@ -4367,7 +4367,7 @@ try_smaller_buffer:
        DEBUG_print("   journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
        DEBUG_print("   journal_entries %u\n", ic->journal_entries);
        DEBUG_print("   log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
-       DEBUG_print("   data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
+       DEBUG_print("   data_device_sectors 0x%llx\n", bdev_nr_sectors(ic->dev->bdev));
        DEBUG_print("   initial_sectors 0x%x\n", ic->initial_sectors);
        DEBUG_print("   metadata_run 0x%x\n", ic->metadata_run);
        DEBUG_print("   log2_metadata_run %d\n", ic->log2_metadata_run);
index 679b4c0..66ba167 100644 (file)
@@ -135,8 +135,7 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (lc->start ||
-           ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+       if (lc->start || ti->len != bdev_nr_sectors(dev->bdev))
                return 1;
        return 0;
 }
index d93a4db..46de085 100644 (file)
@@ -446,7 +446,7 @@ static int log_super(struct log_writes_c *lc)
 
 static inline sector_t logdev_last_sector(struct log_writes_c *lc)
 {
-       return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(lc->logdev->bdev);
 }
 
 static int log_writes_kthread(void *arg)
@@ -851,7 +851,7 @@ static int log_writes_prepare_ioctl(struct dm_target *ti,
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+       if (ti->len != bdev_nr_sectors(dev->bdev))
                return 1;
        return 0;
 }
index 1ecf75e..06f3289 100644 (file)
@@ -447,7 +447,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
                                bdev_logical_block_size(lc->header_location.
                                                            bdev));
 
-               if (buf_size > i_size_read(dev->bdev->bd_inode)) {
+               if (buf_size > bdev_nr_bytes(dev->bdev)) {
                        DMWARN("log device %s too small: need %llu bytes",
                                dev->name, (unsigned long long)buf_size);
                        kfree(lc);
index 694aaca..90dc9cc 100644 (file)
@@ -530,7 +530,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 
        bdev = pgpath->path.dev->bdev;
        q = bdev_get_queue(bdev);
-       clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
+       clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE,
                        BLK_MQ_REQ_NOWAIT);
        if (IS_ERR(clone)) {
                /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
@@ -579,7 +579,7 @@ static void multipath_release_clone(struct request *clone,
                                                    clone->io_start_time_ns);
        }
 
-       blk_put_request(clone);
+       blk_mq_free_request(clone);
 }
 
 /*
@@ -2061,7 +2061,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+       if (!r && ti->len != bdev_nr_sectors((*bdev)))
                return 1;
        return r;
 }
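
The dm-mpath hunk above allocates the clone request through the blk-mq API directly instead of the old blk_get_request()/blk_put_request() wrappers. A condensed sketch of that allocation pattern, with an illustrative function name:

#include <linux/blk-mq.h>
#include <linux/err.h>

static int example_issue_clone(struct request_queue *q, struct request *rq)
{
	struct request *clone;

	clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE,
				     BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(clone))
		return PTR_ERR(clone);	/* e.g. -EBUSY or -EWOULDBLOCK: requeue later */

	/* ... set up and dispatch the clone ... */

	blk_mq_free_request(clone);	/* was blk_put_request() */
	return 0;
}
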
index 1856a1b..875bca3 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/sched/clock.h>
 
 
 #define DM_MSG_PREFIX  "multipath historical-service-time"
index d9ef521..2b26435 100644 (file)
@@ -1261,7 +1261,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
                        md_rdev_init(jdev);
                        jdev->mddev = &rs->md;
                        jdev->bdev = rs->journal_dev.dev->bdev;
-                       jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
+                       jdev->sectors = bdev_nr_sectors(jdev->bdev);
                        if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
                                rs->ti->error = "No space for raid4/5/6 journal";
                                return -ENOSPC;
@@ -1607,7 +1607,7 @@ static int _check_data_dev_sectors(struct raid_set *rs)
 
        rdev_for_each(rdev, &rs->md)
                if (!test_bit(Journal, &rdev->flags) && rdev->bdev) {
-                       ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode)));
+                       ds = min(ds, bdev_nr_sectors(rdev->bdev));
                        if (ds < rs->md.dev_sectors) {
                                rs->ti->error = "Component device(s) too small";
                                return -EINVAL;
@@ -2662,7 +2662,7 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
         * Make sure we got a minimum amount of free sectors per device
         */
        if (rs->data_offset &&
-           to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) {
+           bdev_nr_sectors(rdev->bdev) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) {
                rs->ti->error = data_offset ? "No space for forward reshape" :
                                              "No space for backward reshape";
                return -ENOSPC;
index a896dea..579ab61 100644 (file)
@@ -7,7 +7,6 @@
 #include "dm-core.h"
 #include "dm-rq.h"
 
-#include <linux/elevator.h> /* for rq_end_sector() */
 #include <linux/blk-mq.h>
 
 #define DM_MSG_PREFIX "core-rq"
index 028a92f..534dc2c 100644 (file)
@@ -529,7 +529,7 @@ static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
         * Only pass ioctls through if the device sizes match exactly.
         */
        if (ti->len + sctx->path_list[path_nr].start !=
-           i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+           bdev_nr_sectors((*bdev)))
                return 1;
        return 0;
 }
index 2111daa..bcddc5e 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/namei.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
@@ -169,7 +170,7 @@ static void free_devices(struct list_head *devices, struct mapped_device *md)
        }
 }
 
-static void dm_table_destroy_keyslot_manager(struct dm_table *t);
+static void dm_table_destroy_crypto_profile(struct dm_table *t);
 
 void dm_table_destroy(struct dm_table *t)
 {
@@ -199,7 +200,7 @@ void dm_table_destroy(struct dm_table *t)
 
        dm_free_md_mempools(t->mempools);
 
-       dm_table_destroy_keyslot_manager(t);
+       dm_table_destroy_crypto_profile(t);
 
        kfree(t);
 }
@@ -226,8 +227,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 {
        struct queue_limits *limits = data;
        struct block_device *bdev = dev->bdev;
-       sector_t dev_size =
-               i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t dev_size = bdev_nr_sectors(bdev);
        unsigned short logical_block_size_sectors =
                limits->logical_block_size >> SECTOR_SHIFT;
        char b[BDEVNAME_SIZE];
@@ -1186,8 +1186,8 @@ static int dm_table_register_integrity(struct dm_table *t)
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 
-struct dm_keyslot_manager {
-       struct blk_keyslot_manager ksm;
+struct dm_crypto_profile {
+       struct blk_crypto_profile profile;
        struct mapped_device *md;
 };
 
@@ -1213,13 +1213,11 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
  * When an inline encryption key is evicted from a device-mapper device, evict
  * it from all the underlying devices.
  */
-static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int dm_keyslot_evict(struct blk_crypto_profile *profile,
                            const struct blk_crypto_key *key, unsigned int slot)
 {
-       struct dm_keyslot_manager *dksm = container_of(ksm,
-                                                      struct dm_keyslot_manager,
-                                                      ksm);
-       struct mapped_device *md = dksm->md;
+       struct mapped_device *md =
+               container_of(profile, struct dm_crypto_profile, profile)->md;
        struct dm_keyslot_evict_args args = { key };
        struct dm_table *t;
        int srcu_idx;
@@ -1239,150 +1237,148 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
        return args.err;
 }
 
-static const struct blk_ksm_ll_ops dm_ksm_ll_ops = {
-       .keyslot_evict = dm_keyslot_evict,
-};
-
-static int device_intersect_crypto_modes(struct dm_target *ti,
-                                        struct dm_dev *dev, sector_t start,
-                                        sector_t len, void *data)
+static int
+device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev,
+                                    sector_t start, sector_t len, void *data)
 {
-       struct blk_keyslot_manager *parent = data;
-       struct blk_keyslot_manager *child = bdev_get_queue(dev->bdev)->ksm;
+       struct blk_crypto_profile *parent = data;
+       struct blk_crypto_profile *child =
+               bdev_get_queue(dev->bdev)->crypto_profile;
 
-       blk_ksm_intersect_modes(parent, child);
+       blk_crypto_intersect_capabilities(parent, child);
        return 0;
 }
 
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm)
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile)
 {
-       struct dm_keyslot_manager *dksm = container_of(ksm,
-                                                      struct dm_keyslot_manager,
-                                                      ksm);
+       struct dm_crypto_profile *dmcp = container_of(profile,
+                                                     struct dm_crypto_profile,
+                                                     profile);
 
-       if (!ksm)
+       if (!profile)
                return;
 
-       blk_ksm_destroy(ksm);
-       kfree(dksm);
+       blk_crypto_profile_destroy(profile);
+       kfree(dmcp);
 }
 
-static void dm_table_destroy_keyslot_manager(struct dm_table *t)
+static void dm_table_destroy_crypto_profile(struct dm_table *t)
 {
-       dm_destroy_keyslot_manager(t->ksm);
-       t->ksm = NULL;
+       dm_destroy_crypto_profile(t->crypto_profile);
+       t->crypto_profile = NULL;
 }
 
 /*
- * Constructs and initializes t->ksm with a keyslot manager that
- * represents the common set of crypto capabilities of the devices
- * described by the dm_table. However, if the constructed keyslot
- * manager does not support a superset of the crypto capabilities
- * supported by the current keyslot manager of the mapped_device,
- * it returns an error instead, since we don't support restricting
- * crypto capabilities on table changes. Finally, if the constructed
- * keyslot manager doesn't actually support any crypto modes at all,
- * it just returns NULL.
+ * Constructs and initializes t->crypto_profile with a crypto profile that
+ * represents the common set of crypto capabilities of the devices described by
+ * the dm_table.  However, if the constructed crypto profile doesn't support all
+ * crypto capabilities that are supported by the current mapped_device, it
+ * returns an error instead, since we don't support removing crypto capabilities
+ * on table changes.  Finally, if the constructed crypto profile is "empty" (has
+ * no crypto capabilities at all), it just sets t->crypto_profile to NULL.
  */
-static int dm_table_construct_keyslot_manager(struct dm_table *t)
+static int dm_table_construct_crypto_profile(struct dm_table *t)
 {
-       struct dm_keyslot_manager *dksm;
-       struct blk_keyslot_manager *ksm;
+       struct dm_crypto_profile *dmcp;
+       struct blk_crypto_profile *profile;
        struct dm_target *ti;
        unsigned int i;
-       bool ksm_is_empty = true;
+       bool empty_profile = true;
 
-       dksm = kmalloc(sizeof(*dksm), GFP_KERNEL);
-       if (!dksm)
+       dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL);
+       if (!dmcp)
                return -ENOMEM;
-       dksm->md = t->md;
+       dmcp->md = t->md;
 
-       ksm = &dksm->ksm;
-       blk_ksm_init_passthrough(ksm);
-       ksm->ksm_ll_ops = dm_ksm_ll_ops;
-       ksm->max_dun_bytes_supported = UINT_MAX;
-       memset(ksm->crypto_modes_supported, 0xFF,
-              sizeof(ksm->crypto_modes_supported));
+       profile = &dmcp->profile;
+       blk_crypto_profile_init(profile, 0);
+       profile->ll_ops.keyslot_evict = dm_keyslot_evict;
+       profile->max_dun_bytes_supported = UINT_MAX;
+       memset(profile->modes_supported, 0xFF,
+              sizeof(profile->modes_supported));
 
        for (i = 0; i < dm_table_get_num_targets(t); i++) {
                ti = dm_table_get_target(t, i);
 
                if (!dm_target_passes_crypto(ti->type)) {
-                       blk_ksm_intersect_modes(ksm, NULL);
+                       blk_crypto_intersect_capabilities(profile, NULL);
                        break;
                }
                if (!ti->type->iterate_devices)
                        continue;
-               ti->type->iterate_devices(ti, device_intersect_crypto_modes,
-                                         ksm);
+               ti->type->iterate_devices(ti,
+                                         device_intersect_crypto_capabilities,
+                                         profile);
        }
 
-       if (t->md->queue && !blk_ksm_is_superset(ksm, t->md->queue->ksm)) {
+       if (t->md->queue &&
+           !blk_crypto_has_capabilities(profile,
+                                        t->md->queue->crypto_profile)) {
                DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!");
-               dm_destroy_keyslot_manager(ksm);
+               dm_destroy_crypto_profile(profile);
                return -EINVAL;
        }
 
        /*
-        * If the new KSM doesn't actually support any crypto modes, we may as
-        * well represent it with a NULL ksm.
+        * If the new profile doesn't actually support any crypto capabilities,
+        * we may as well represent it with a NULL profile.
         */
-       ksm_is_empty = true;
-       for (i = 0; i < ARRAY_SIZE(ksm->crypto_modes_supported); i++) {
-               if (ksm->crypto_modes_supported[i]) {
-                       ksm_is_empty = false;
+       for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) {
+               if (profile->modes_supported[i]) {
+                       empty_profile = false;
                        break;
                }
        }
 
-       if (ksm_is_empty) {
-               dm_destroy_keyslot_manager(ksm);
-               ksm = NULL;
+       if (empty_profile) {
+               dm_destroy_crypto_profile(profile);
+               profile = NULL;
        }
 
        /*
-        * t->ksm is only set temporarily while the table is being set
-        * up, and it gets set to NULL after the capabilities have
-        * been transferred to the request_queue.
+        * t->crypto_profile is only set temporarily while the table is being
+        * set up, and it gets set to NULL after the profile has been
+        * transferred to the request_queue.
         */
-       t->ksm = ksm;
+       t->crypto_profile = profile;
 
        return 0;
 }
 
-static void dm_update_keyslot_manager(struct request_queue *q,
-                                     struct dm_table *t)
+static void dm_update_crypto_profile(struct request_queue *q,
+                                    struct dm_table *t)
 {
-       if (!t->ksm)
+       if (!t->crypto_profile)
                return;
 
-       /* Make the ksm less restrictive */
-       if (!q->ksm) {
-               blk_ksm_register(t->ksm, q);
+       /* Make the crypto profile less restrictive. */
+       if (!q->crypto_profile) {
+               blk_crypto_register(t->crypto_profile, q);
        } else {
-               blk_ksm_update_capabilities(q->ksm, t->ksm);
-               dm_destroy_keyslot_manager(t->ksm);
+               blk_crypto_update_capabilities(q->crypto_profile,
+                                              t->crypto_profile);
+               dm_destroy_crypto_profile(t->crypto_profile);
        }
-       t->ksm = NULL;
+       t->crypto_profile = NULL;
 }
 
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
 
-static int dm_table_construct_keyslot_manager(struct dm_table *t)
+static int dm_table_construct_crypto_profile(struct dm_table *t)
 {
        return 0;
 }
 
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm)
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile)
 {
 }
 
-static void dm_table_destroy_keyslot_manager(struct dm_table *t)
+static void dm_table_destroy_crypto_profile(struct dm_table *t)
 {
 }
 
-static void dm_update_keyslot_manager(struct request_queue *q,
-                                     struct dm_table *t)
+static void dm_update_crypto_profile(struct request_queue *q,
+                                    struct dm_table *t)
 {
 }
 
@@ -1414,9 +1410,9 @@ int dm_table_complete(struct dm_table *t)
                return r;
        }
 
-       r = dm_table_construct_keyslot_manager(t);
+       r = dm_table_construct_crypto_profile(t);
        if (r) {
-               DMERR("could not construct keyslot manager.");
+               DMERR("could not construct crypto profile.");
                return r;
        }
 
@@ -2070,7 +2066,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
                        return r;
        }
 
-       dm_update_keyslot_manager(q, t);
+       dm_update_crypto_profile(q, t);
        disk_update_readahead(t->md->disk);
 
        return 0;
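
The dm-table.c rework above renames the keyslot-manager plumbing to the blk_crypto_profile API. A condensed sketch of the profile setup it performs, reconstructed only from the calls visible in these hunks (example_* names are placeholders and the real API may carry additional requirements):

#include <linux/blkdev.h>
#include <linux/blk-crypto-profile.h>
#include <linux/string.h>
#include <linux/kernel.h>

/* evict callback shape, mirrored from dm_keyslot_evict() above */
static int example_keyslot_evict(struct blk_crypto_profile *profile,
				 const struct blk_crypto_key *key,
				 unsigned int slot)
{
	return 0;	/* a real driver would evict the key from its lower devices */
}

static int example_build_profile(struct blk_crypto_profile *profile,
				 struct request_queue *lower_q)
{
	blk_crypto_profile_init(profile, 0);	/* 0 keyslots: passthrough profile */
	profile->ll_ops.keyslot_evict = example_keyslot_evict;
	profile->max_dun_bytes_supported = UINT_MAX;
	memset(profile->modes_supported, 0xFF, sizeof(profile->modes_supported));

	/* narrow the advertised modes to what the underlying queue supports */
	blk_crypto_intersect_capabilities(profile, lower_q->crypto_profile);
	return 0;
}
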
index c88ed14..1a96a07 100644 (file)
@@ -549,7 +549,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
        int r;
        struct dm_block *sblock;
        struct thin_disk_superblock *disk_super;
-       sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t bdev_size = bdev_nr_sectors(pmd->bdev);
 
        if (bdev_size > THIN_METADATA_MAX_SECTORS)
                bdev_size = THIN_METADATA_MAX_SECTORS;
index 4c67b77..ec119d2 100644 (file)
@@ -3212,7 +3212,7 @@ static int metadata_pre_commit_callback(void *context)
 
 static sector_t get_dev_size(struct block_device *bdev)
 {
-       return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(bdev);
 }
 
 static void warn_if_metadata_device_too_big(struct block_device *bdev)
index 88288c8..a7efe83 100644 (file)
@@ -18,6 +18,7 @@
 #include "dm-verity-verify-sig.h"
 #include <linux/module.h>
 #include <linux/reboot.h>
+#include <linux/scatterlist.h>
 
 #define DM_MSG_PREFIX                  "verity"
 
@@ -833,8 +834,7 @@ static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
 
        *bdev = v->data_dev->bdev;
 
-       if (v->data_start ||
-           ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
+       if (v->data_start || ti->len != bdev_nr_sectors(v->data_dev->bdev))
                return 1;
        return 0;
 }
index 1832044..0178060 100644 (file)
@@ -2341,7 +2341,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
                ti->error = "Cache data device lookup failed";
                goto bad;
        }
-       wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
+       wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);
 
        /*
         * Parse the cache block size
index ae1bc48..8dc21c0 100644 (file)
@@ -733,7 +733,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path,
        dev->dev_idx = idx;
        (void)bdevname(dev->bdev, dev->name);
 
-       dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+       dev->capacity = bdev_nr_sectors(bdev);
        if (ti->begin) {
                ti->error = "Partial mapping is not supported";
                goto err;
index 76d9da4..63aa522 100644 (file)
@@ -29,7 +29,7 @@
 #include <linux/refcount.h>
 #include <linux/part_stat.h>
 #include <linux/blk-crypto.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -1183,14 +1183,13 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
        mutex_unlock(&md->swap_bios_lock);
 }
 
-static blk_qc_t __map_bio(struct dm_target_io *tio)
+static void __map_bio(struct dm_target_io *tio)
 {
        int r;
        sector_t sector;
        struct bio *clone = &tio->clone;
        struct dm_io *io = tio->io;
        struct dm_target *ti = tio->ti;
-       blk_qc_t ret = BLK_QC_T_NONE;
 
        clone->bi_end_io = clone_endio;
 
@@ -1226,7 +1225,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
        case DM_MAPIO_REMAPPED:
                /* the bio has been remapped so dispatch it */
                trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
-               ret = submit_bio_noacct(clone);
+               submit_bio_noacct(clone);
                break;
        case DM_MAPIO_KILL:
                if (unlikely(swap_bios_limit(ti, clone))) {
@@ -1248,8 +1247,6 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
                DMWARN("unimplemented target map return value: %d", r);
                BUG();
        }
-
-       return ret;
 }
 
 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
@@ -1336,7 +1333,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
        }
 }
 
-static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
+static void __clone_and_map_simple_bio(struct clone_info *ci,
                                           struct dm_target_io *tio, unsigned *len)
 {
        struct bio *clone = &tio->clone;
@@ -1346,8 +1343,7 @@ static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
        __bio_clone_fast(clone, ci->bio);
        if (len)
                bio_setup_sector(clone, ci->sector, *len);
-
-       return __map_bio(tio);
+       __map_bio(tio);
 }
 
 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
@@ -1361,7 +1357,7 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
 
        while ((bio = bio_list_pop(&blist))) {
                tio = container_of(bio, struct dm_target_io, clone);
-               (void) __clone_and_map_simple_bio(ci, tio, len);
+               __clone_and_map_simple_bio(ci, tio, len);
        }
 }
 
@@ -1405,7 +1401,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
                free_tio(tio);
                return r;
        }
-       (void) __map_bio(tio);
+       __map_bio(tio);
 
        return 0;
 }
@@ -1520,11 +1516,10 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
 /*
  * Entry point to split a bio into clones and submit them to the targets.
  */
-static blk_qc_t __split_and_process_bio(struct mapped_device *md,
+static void __split_and_process_bio(struct mapped_device *md,
                                        struct dm_table *map, struct bio *bio)
 {
        struct clone_info ci;
-       blk_qc_t ret = BLK_QC_T_NONE;
        int error = 0;
 
        init_clone_info(&ci, md, map, bio);
@@ -1567,19 +1562,17 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 
                        bio_chain(b, bio);
                        trace_block_split(b, bio->bi_iter.bi_sector);
-                       ret = submit_bio_noacct(bio);
+                       submit_bio_noacct(bio);
                }
        }
 
        /* drop the extra reference count */
        dm_io_dec_pending(ci.io, errno_to_blk_status(error));
-       return ret;
 }
 
-static blk_qc_t dm_submit_bio(struct bio *bio)
+static void dm_submit_bio(struct bio *bio)
 {
        struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
-       blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;
        struct dm_table *map;
 
@@ -1609,10 +1602,9 @@ static blk_qc_t dm_submit_bio(struct bio *bio)
        if (is_abnormal_io(bio))
                blk_queue_split(&bio);
 
-       ret = __split_and_process_bio(md, map, bio);
+       __split_and_process_bio(md, map, bio);
 out:
        dm_put_live_table(md, srcu_idx);
-       return ret;
 }
 
 /*-----------------------------------------------------------------
@@ -1671,14 +1663,14 @@ static const struct dax_operations dm_dax_ops;
 static void dm_wq_work(struct work_struct *work);
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+static void dm_queue_destroy_crypto_profile(struct request_queue *q)
 {
-       dm_destroy_keyslot_manager(q->ksm);
+       dm_destroy_crypto_profile(q->crypto_profile);
 }
 
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
 
-static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
 {
 }
 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
@@ -1704,7 +1696,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
                        dm_sysfs_exit(md);
                        del_gendisk(md->disk);
                }
-               dm_queue_destroy_keyslot_manager(md->queue);
+               dm_queue_destroy_crypto_profile(md->queue);
                blk_cleanup_disk(md->disk);
        }
 
@@ -2086,7 +2078,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
        if (r)
                return r;
 
-       add_disk(md->disk);
+       r = add_disk(md->disk);
+       if (r)
+               return r;
 
        r = dm_sysfs_init(md);
        if (r) {
index 6c0c3d0..5111ed9 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/sched/signal.h>
 #include <linux/kthread.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/badblocks.h>
 #include <linux/sysctl.h>
 #include <linux/seq_file.h>
@@ -51,6 +52,7 @@
 #include <linux/hdreg.h>
 #include <linux/proc_fs.h>
 #include <linux/random.h>
+#include <linux/major.h>
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/file.h>
@@ -352,7 +354,7 @@ static bool create_on_open = true;
  */
 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
-void md_new_event(struct mddev *mddev)
+void md_new_event(void)
 {
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
@@ -441,19 +443,19 @@ check_suspended:
 }
 EXPORT_SYMBOL(md_handle_request);
 
-static blk_qc_t md_submit_bio(struct bio *bio)
+static void md_submit_bio(struct bio *bio)
 {
        const int rw = bio_data_dir(bio);
        struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
 
        if (mddev == NULL || mddev->pers == NULL) {
                bio_io_error(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
                bio_io_error(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        blk_queue_split(&bio);
@@ -462,15 +464,13 @@ static blk_qc_t md_submit_bio(struct bio *bio)
                if (bio_sectors(bio) != 0)
                        bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        /* bio could be mergeable after passing to underlayer */
        bio->bi_opf &= ~REQ_NOMERGE;
 
        md_handle_request(mddev, bio);
-
-       return BLK_QC_T_NONE;
 }
 
 /* mddev_suspend makes sure no new requests are submitted
@@ -888,8 +888,7 @@ static struct md_personality *find_pers(int level, char *clevel)
 /* return the offset of the super block in 512byte sectors */
 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
 {
-       sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
-       return MD_NEW_SIZE_SECTORS(num_sectors);
+       return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
 }
 
 static int alloc_disk_sb(struct md_rdev *rdev)
@@ -1631,8 +1630,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
         */
        switch(minor_version) {
        case 0:
-               sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
-               sb_start -= 8*2;
+               sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
                sb_start &= ~(sector_t)(4*2-1);
                break;
        case 1:
@@ -1787,10 +1785,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
                else
                        ret = 0;
        }
-       if (minor_version) {
-               sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
-               sectors -= rdev->data_offset;
-       } else
+       if (minor_version)
+               sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
+       else
                sectors = rdev->sb_start;
        if (sectors < le64_to_cpu(sb->data_size))
                return -EINVAL;
@@ -2168,8 +2165,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
                return 0; /* too confusing */
        if (rdev->sb_start < rdev->data_offset) {
                /* minor versions 1 and 2; superblock before data */
-               max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
-               max_sectors -= rdev->data_offset;
+               max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
                if (!num_sectors || num_sectors > max_sectors)
                        num_sectors = max_sectors;
        } else if (rdev->mddev->bitmap_info.offset) {
@@ -2178,7 +2174,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
        } else {
                /* minor version 0; superblock after data */
                sector_t sb_start, bm_space;
-               sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
+               sector_t dev_size = bdev_nr_sectors(rdev->bdev);
 
                /* 8K is for superblock */
                sb_start = dev_size - 8*2;
@@ -2886,7 +2882,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
        if (mddev->degraded)
                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-       md_new_event(mddev);
+       md_new_event();
        md_wakeup_thread(mddev->thread);
        return 0;
 }
@@ -2976,7 +2972,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
         *  -write_error - clears WriteErrorSeen
         *  {,-}failfast - set/clear FailFast
         */
+
+       struct mddev *mddev = rdev->mddev;
        int err = -EINVAL;
+       bool need_update_sb = false;
+
        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
                md_error(rdev->mddev, rdev);
                if (test_bit(Faulty, &rdev->flags))
@@ -2991,7 +2991,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
                if (rdev->raid_disk >= 0)
                        err = -EBUSY;
                else {
-                       struct mddev *mddev = rdev->mddev;
                        err = 0;
                        if (mddev_is_clustered(mddev))
                                err = md_cluster_ops->remove_disk(mddev, rdev);
@@ -3002,16 +3001,18 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
                                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                                        md_wakeup_thread(mddev->thread);
                                }
-                               md_new_event(mddev);
+                               md_new_event();
                        }
                }
        } else if (cmd_match(buf, "writemostly")) {
                set_bit(WriteMostly, &rdev->flags);
                mddev_create_serial_pool(rdev->mddev, rdev, false);
+               need_update_sb = true;
                err = 0;
        } else if (cmd_match(buf, "-writemostly")) {
                mddev_destroy_serial_pool(rdev->mddev, rdev, false);
                clear_bit(WriteMostly, &rdev->flags);
+               need_update_sb = true;
                err = 0;
        } else if (cmd_match(buf, "blocked")) {
                set_bit(Blocked, &rdev->flags);
@@ -3037,9 +3038,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
                err = 0;
        } else if (cmd_match(buf, "failfast")) {
                set_bit(FailFast, &rdev->flags);
+               need_update_sb = true;
                err = 0;
        } else if (cmd_match(buf, "-failfast")) {
                clear_bit(FailFast, &rdev->flags);
+               need_update_sb = true;
                err = 0;
        } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
                   !test_bit(Journal, &rdev->flags)) {
@@ -3118,6 +3121,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
                clear_bit(ExternalBbl, &rdev->flags);
                err = 0;
        }
+       if (need_update_sb)
+               md_update_sb(mddev, 1);
        if (!err)
                sysfs_notify_dirent_safe(rdev->sysfs_state);
        return err ? err : len;
@@ -3382,7 +3387,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
                        if (!sectors)
                                return -EBUSY;
                } else if (!sectors)
-                       sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
+                       sectors = bdev_nr_sectors(rdev->bdev) -
                                rdev->data_offset;
                if (!my_mddev->pers->resize)
                        /* Cannot change size for RAID0 or Linear etc */
@@ -3709,7 +3714,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
 
        kobject_init(&rdev->kobj, &rdev_ktype);
 
-       size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
+       size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
        if (!size) {
                pr_warn("md: %s has zero or unknown size, marking faulty!\n",
                        bdevname(rdev->bdev,b));
@@ -4099,7 +4104,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
        if (!mddev->thread)
                md_update_sb(mddev, 1);
        sysfs_notify_dirent_safe(mddev->sysfs_level);
-       md_new_event(mddev);
+       md_new_event();
        rv = len;
 out_unlock:
        mddev_unlock(mddev);
@@ -4620,7 +4625,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
                export_rdev(rdev);
        mddev_unlock(mddev);
        if (!err)
-               md_new_event(mddev);
+               md_new_event();
        return err ? err : len;
 }
 
@@ -5490,6 +5495,10 @@ static struct attribute *md_default_attrs[] = {
        NULL,
 };
 
+static const struct attribute_group md_default_group = {
+       .attrs = md_default_attrs,
+};
+
 static struct attribute *md_redundancy_attrs[] = {
        &md_scan_mode.attr,
        &md_last_scan_mode.attr,
@@ -5512,6 +5521,12 @@ static const struct attribute_group md_redundancy_group = {
        .attrs = md_redundancy_attrs,
 };
 
+static const struct attribute_group *md_attr_groups[] = {
+       &md_default_group,
+       &md_bitmap_group,
+       NULL,
+};
+
 static ssize_t
 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
@@ -5587,7 +5602,7 @@ static const struct sysfs_ops md_sysfs_ops = {
 static struct kobj_type md_ktype = {
        .release        = md_free,
        .sysfs_ops      = &md_sysfs_ops,
-       .default_attrs  = md_default_attrs,
+       .default_groups = md_attr_groups,
 };
 
 int mdp_major = 0;
@@ -5596,7 +5611,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
 {
        struct mddev *mddev = container_of(ws, struct mddev, del_work);
 
-       sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
        kobject_del(&mddev->kobj);
        kobject_put(&mddev->kobj);
 }
@@ -5663,7 +5677,7 @@ static int md_alloc(dev_t dev, char *name)
                            strcmp(mddev2->gendisk->disk_name, name) == 0) {
                                spin_unlock(&all_mddevs_lock);
                                error = -EEXIST;
-                               goto abort;
+                               goto out_unlock_disks_mutex;
                        }
                spin_unlock(&all_mddevs_lock);
        }
@@ -5676,7 +5690,7 @@ static int md_alloc(dev_t dev, char *name)
        error = -ENOMEM;
        disk = blk_alloc_disk(NUMA_NO_NODE);
        if (!disk)
-               goto abort;
+               goto out_unlock_disks_mutex;
 
        disk->major = MAJOR(mddev->unit);
        disk->first_minor = unit << shift;
@@ -5700,27 +5714,25 @@ static int md_alloc(dev_t dev, char *name)
        disk->flags |= GENHD_FL_EXT_DEVT;
        disk->events |= DISK_EVENT_MEDIA_CHANGE;
        mddev->gendisk = disk;
-       add_disk(disk);
+       error = add_disk(disk);
+       if (error)
+               goto out_cleanup_disk;
 
        error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
-       if (error) {
-               /* This isn't possible, but as kobject_init_and_add is marked
-                * __must_check, we must do something with the result
-                */
-               pr_debug("md: cannot register %s/md - name in use\n",
-                        disk->disk_name);
-               error = 0;
-       }
-       if (mddev->kobj.sd &&
-           sysfs_create_group(&mddev->kobj, &md_bitmap_group))
-               pr_debug("pointless warning\n");
- abort:
+       if (error)
+               goto out_del_gendisk;
+
+       kobject_uevent(&mddev->kobj, KOBJ_ADD);
+       mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
+       mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
+       goto out_unlock_disks_mutex;
+
+out_del_gendisk:
+       del_gendisk(disk);
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
+out_unlock_disks_mutex:
        mutex_unlock(&disks_mutex);
-       if (!error && mddev->kobj.sd) {
-               kobject_uevent(&mddev->kobj, KOBJ_ADD);
-               mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
-               mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
-       }
        mddev_put(mddev);
        return error;
 }
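
Distilled from the hunk above: once add_disk() can fail, the allocation path unwinds through labelled error exits instead of the old "pointless warning" handling; a rough sketch of the pattern (function name is illustrative):

static int example_add_md_disk(struct mddev *mddev, struct gendisk *disk)
{
        int error;

        error = add_disk(disk);         /* add_disk() now returns an error */
        if (error)
                goto out_cleanup_disk;

        error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
        if (error)
                goto out_del_gendisk;

        kobject_uevent(&mddev->kobj, KOBJ_ADD);
        return 0;

out_del_gendisk:
        del_gendisk(disk);
out_cleanup_disk:
        blk_cleanup_disk(disk);
        return error;
}
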
@@ -6034,7 +6046,7 @@ int md_run(struct mddev *mddev)
        if (mddev->sb_flags)
                md_update_sb(mddev, 0);
 
-       md_new_event(mddev);
+       md_new_event();
        return 0;
 
 bitmap_abort:
@@ -6424,7 +6436,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
                if (mddev->hold_active == UNTIL_STOP)
                        mddev->hold_active = 0;
        }
-       md_new_event(mddev);
+       md_new_event();
        sysfs_notify_dirent_safe(mddev->sysfs_state);
        return 0;
 }
@@ -6880,7 +6892,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
 
                if (!mddev->persistent) {
                        pr_debug("md: nonpersistent superblock ...\n");
-                       rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
+                       rdev->sb_start = bdev_nr_sectors(rdev->bdev);
                } else
                        rdev->sb_start = calc_dev_sboffset(rdev);
                rdev->sectors = rdev->sb_start;
@@ -6928,7 +6940,7 @@ kick_rdev:
                md_wakeup_thread(mddev->thread);
        else
                md_update_sb(mddev, 1);
-       md_new_event(mddev);
+       md_new_event();
 
        return 0;
 busy:
@@ -6967,7 +6979,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
        if (mddev->persistent)
                rdev->sb_start = calc_dev_sboffset(rdev);
        else
-               rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
+               rdev->sb_start = bdev_nr_sectors(rdev->bdev);
 
        rdev->sectors = rdev->sb_start;
 
@@ -7001,7 +7013,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
         */
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
-       md_new_event(mddev);
+       md_new_event();
        return 0;
 
 abort_export:
@@ -7975,7 +7987,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
        md_wakeup_thread(mddev->thread);
        if (mddev->event_work.func)
                queue_work(md_misc_wq, &mddev->event_work);
-       md_new_event(mddev);
+       md_new_event();
 }
 EXPORT_SYMBOL(md_error);
 
@@ -8859,7 +8871,7 @@ void md_do_sync(struct md_thread *thread)
                mddev->curr_resync = 3; /* no longer delayed */
        mddev->curr_resync_completed = j;
        sysfs_notify_dirent_safe(mddev->sysfs_completed);
-       md_new_event(mddev);
+       md_new_event();
        update_time = jiffies;
 
        blk_start_plug(&plug);
@@ -8930,7 +8942,7 @@ void md_do_sync(struct md_thread *thread)
                        /* this is the earliest that rebuild will be
                         * visible in /proc/mdstat
                         */
-                       md_new_event(mddev);
+                       md_new_event();
 
                if (last_check + window > io_sectors || j == max_sectors)
                        continue;
@@ -9154,7 +9166,7 @@ static int remove_and_add_spares(struct mddev *mddev,
                        sysfs_link_rdev(mddev, rdev);
                        if (!test_bit(Journal, &rdev->flags))
                                spares++;
-                       md_new_event(mddev);
+                       md_new_event();
                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                }
        }
@@ -9188,7 +9200,7 @@ static void md_start_sync(struct work_struct *ws)
        } else
                md_wakeup_thread(mddev->sync_thread);
        sysfs_notify_dirent_safe(mddev->sysfs_action);
-       md_new_event(mddev);
+       md_new_event();
 }
 
 /*
@@ -9447,7 +9459,7 @@ void md_reap_sync_thread(struct mddev *mddev)
        /* flag recovery needed just to double check */
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        sysfs_notify_dirent_safe(mddev->sysfs_action);
-       md_new_event(mddev);
+       md_new_event();
        if (mddev->event_work.func)
                queue_work(md_misc_wq, &mddev->event_work);
 }
index 4c96c36..53ea7a6 100644 (file)
@@ -731,7 +731,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
                        struct page *page, int op, int op_flags,
                        bool metadata_op);
 extern void md_do_sync(struct md_thread *thread);
-extern void md_new_event(struct mddev *mddev);
+extern void md_new_event(void);
 extern void md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
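
The prototype change above is what lets every call site in this series drop the mddev argument; a one-function illustration of a converted caller (name is illustrative):

/* Illustrative caller: the event is no longer tied to a specific array. */
static void example_notify(struct mddev *mddev)
{
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_new_event();                 /* was: md_new_event(mddev); */
        md_wakeup_thread(mddev->thread);
}
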
index 19598bd..7dc8026 100644 (file)
@@ -1496,7 +1496,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                if (!r1_bio->bios[i])
                        continue;
 
-               if (first_clone) {
+               if (first_clone && test_bit(WriteMostly, &rdev->flags)) {
                        /* do behind I/O ?
                         * Not if there are too many, or cannot
                         * allocate memory, or a reader on WriteMostly
@@ -1529,13 +1529,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
                r1_bio->bios[i] = mbio;
 
-               mbio->bi_iter.bi_sector = (r1_bio->sector +
-                                  conf->mirrors[i].rdev->data_offset);
-               bio_set_dev(mbio, conf->mirrors[i].rdev->bdev);
+               mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
+               bio_set_dev(mbio, rdev->bdev);
                mbio->bi_end_io = raid1_end_write_request;
                mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
-               if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
-                   !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
+               if (test_bit(FailFast, &rdev->flags) &&
+                   !test_bit(WriteMostly, &rdev->flags) &&
                    conf->raid_disks - mddev->degraded > 1)
                        mbio->bi_opf |= MD_FAILFAST;
                mbio->bi_private = r1_bio;
@@ -1546,7 +1545,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                        trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
                                              r1_bio->sector);
                /* flush_pending_writes() needs access to the rdev so...*/
-               mbio->bi_bdev = (void *)conf->mirrors[i].rdev;
+               mbio->bi_bdev = (void *)rdev;
 
                cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
                if (cb)
index aa26365..dde98f6 100644 (file)
@@ -4647,7 +4647,7 @@ out:
        }
        conf->reshape_checkpoint = jiffies;
        md_wakeup_thread(mddev->sync_thread);
-       md_new_event(mddev);
+       md_new_event();
        return 0;
 
 abort:
index 02ed53b..9c1a587 100644 (file)
@@ -7732,10 +7732,7 @@ static int raid5_run(struct mddev *mddev)
                 * discard data disk but write parity disk
                 */
                stripe = stripe * PAGE_SIZE;
-               /* Round up to power of 2, as discard handling
-                * currently assumes that */
-               while ((stripe-1) & stripe)
-                       stripe = (stripe | (stripe-1)) + 1;
+               stripe = roundup_pow_of_two(stripe);
                mddev->queue->limits.discard_alignment = stripe;
                mddev->queue->limits.discard_granularity = stripe;
 
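
roundup_pow_of_two() from <linux/log2.h> produces the same result as the removed open-coded loop; a small worked example, assuming a 3-data-disk layout with 4 KiB chunks:

#include <linux/log2.h>

/* Example: 3 data disks * 4096-byte chunk -> 12288 bytes per stripe. */
static unsigned long example_discard_granularity(void)
{
        unsigned long stripe = 3 * 4096;

        /* The removed loop rounded 12288 up to 16384 the hard way:
         *   while ((stripe - 1) & stripe)
         *           stripe = (stripe | (stripe - 1)) + 1;
         */
        return roundup_pow_of_two(stripe);      /* 16384 */
}
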
@@ -8282,7 +8279,7 @@ static int raid5_start_reshape(struct mddev *mddev)
        }
        conf->reshape_checkpoint = jiffies;
        md_wakeup_thread(mddev->sync_thread);
-       md_new_event(mddev);
+       md_new_event();
        return 0;
 }
 
index 431af5e..74882fa 100644 (file)
@@ -258,7 +258,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
        mq = &md->queue;
 
        /* Dispatch locking to the block layer */
-       req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, 0);
+       req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_OUT, 0);
        if (IS_ERR(req)) {
                count = PTR_ERR(req);
                goto out_put;
@@ -266,7 +266,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
        req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP;
        blk_execute_rq(NULL, req, 0);
        ret = req_to_mmc_queue_req(req)->drv_op_result;
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
        if (!ret) {
                pr_info("%s: Locking boot partition ro until next power on\n",
@@ -646,7 +646,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
         * Dispatch the ioctl() into the block request queue.
         */
        mq = &md->queue;
-       req = blk_get_request(mq->queue,
+       req = blk_mq_alloc_request(mq->queue,
                idata->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -660,7 +660,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
        blk_execute_rq(NULL, req, 0);
        ioc_err = req_to_mmc_queue_req(req)->drv_op_result;
        err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata);
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
 cmd_done:
        kfree(idata->buf);
@@ -716,7 +716,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
         * Dispatch the ioctl()s into the block request queue.
         */
        mq = &md->queue;
-       req = blk_get_request(mq->queue,
+       req = blk_mq_alloc_request(mq->queue,
                idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -733,7 +733,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
        for (i = 0; i < num_of_cmds && !err; i++)
                err = mmc_blk_ioctl_copy_to_user(&cmds[i], idata[i]);
 
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
 cmd_err:
        for (i = 0; i < num_of_cmds; i++) {
@@ -2730,7 +2730,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
        int ret;
 
        /* Ask the block layer about the card status */
-       req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0);
+       req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
                return PTR_ERR(req);
        req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
@@ -2740,7 +2740,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
                *val = ret;
                ret = 0;
        }
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
        return ret;
 }
@@ -2766,7 +2766,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
                return -ENOMEM;
 
        /* Ask the block layer for the EXT CSD */
-       req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0);
+       req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out_free;
@@ -2775,7 +2775,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
        req_to_mmc_queue_req(req)->drv_op_data = &ext_csd;
        blk_execute_rq(NULL, req, 0);
        err = req_to_mmc_queue_req(req)->drv_op_result;
-       blk_put_request(req);
+       blk_mq_free_request(req);
        if (err) {
                pr_err("FAILED %d\n", err);
                goto out_free;
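
All of the conversions in this file follow the same allocate/execute/free shape; a condensed sketch mirroring mmc_dbg_card_status_get() above, with the error handling kept minimal:

static int example_card_status(struct mmc_queue *mq, u64 *val)
{
        struct request *req;
        int ret;

        /* blk_get_request()/blk_put_request() became the blk-mq variants. */
        req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
                return PTR_ERR(req);

        req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
        blk_execute_rq(NULL, req, 0);
        ret = req_to_mmc_queue_req(req)->drv_op_result;
        if (ret >= 0) {
                *val = ret;
                ret = 0;
        }
        blk_mq_free_request(req);
        return ret;
}
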
index 6755780..fec4fbf 100644 (file)
@@ -16,13 +16,13 @@ void mmc_crypto_set_initial_state(struct mmc_host *host)
 {
        /* Reset might clear all keys, so reprogram all the keys. */
        if (host->caps2 & MMC_CAP2_CRYPTO)
-               blk_ksm_reprogram_all_keys(&host->ksm);
+               blk_crypto_reprogram_all_keys(&host->crypto_profile);
 }
 
 void mmc_crypto_setup_queue(struct request_queue *q, struct mmc_host *host)
 {
        if (host->caps2 & MMC_CAP2_CRYPTO)
-               blk_ksm_register(&host->ksm, q);
+               blk_crypto_register(&host->crypto_profile, q);
 }
 EXPORT_SYMBOL_GPL(mmc_crypto_setup_queue);
 
@@ -30,12 +30,15 @@ void mmc_crypto_prepare_req(struct mmc_queue_req *mqrq)
 {
        struct request *req = mmc_queue_req_to_req(mqrq);
        struct mmc_request *mrq = &mqrq->brq.mrq;
+       struct blk_crypto_keyslot *keyslot;
 
        if (!req->crypt_ctx)
                return;
 
        mrq->crypto_ctx = req->crypt_ctx;
-       if (req->crypt_keyslot)
-               mrq->crypto_key_slot = blk_ksm_get_slot_idx(req->crypt_keyslot);
+
+       keyslot = req->crypt_keyslot;
+       if (keyslot)
+               mrq->crypto_key_slot = blk_crypto_keyslot_index(keyslot);
 }
 EXPORT_SYMBOL_GPL(mmc_crypto_prepare_req);
index 4646b7a..c9db24e 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/pm_runtime.h>
+#include <linux/scatterlist.h>
 
 #include <linux/mmc/host.h>
 #include <linux/mmc/card.h>
index 95b3511..ccc148c 100644 (file)
@@ -506,7 +506,7 @@ config MMC_OMAP_HS
 
 config MMC_WBSD
        tristate "Winbond W83L51xD SD/MMC Card Interface support"
-       depends on ISA_DMA_API
+       depends on ISA_DMA_API && !M68K
        help
          This selects the Winbond(R) W83L51xD Secure digital and
          Multimedia card Interface.
index 38559a9..31f8412 100644 (file)
@@ -282,6 +282,9 @@ static void __cqhci_enable(struct cqhci_host *cq_host)
 
        cqhci_writel(cq_host, cqcfg, CQHCI_CFG);
 
+       if (cqhci_readl(cq_host, CQHCI_CTL) & CQHCI_HALT)
+               cqhci_writel(cq_host, 0, CQHCI_CTL);
+
        mmc->cqe_on = true;
 
        if (cq_host->ops->enable)
index 6419cfb..d5f4b69 100644 (file)
@@ -6,7 +6,7 @@
  */
 
 #include <linux/blk-crypto.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 #include <linux/mmc/host.h>
 
 #include "cqhci-crypto.h"
@@ -23,9 +23,10 @@ static const struct cqhci_crypto_alg_entry {
 };
 
 static inline struct cqhci_host *
-cqhci_host_from_ksm(struct blk_keyslot_manager *ksm)
+cqhci_host_from_crypto_profile(struct blk_crypto_profile *profile)
 {
-       struct mmc_host *mmc = container_of(ksm, struct mmc_host, ksm);
+       struct mmc_host *mmc =
+               container_of(profile, struct mmc_host, crypto_profile);
 
        return mmc->cqe_private;
 }
@@ -57,12 +58,12 @@ static int cqhci_crypto_program_key(struct cqhci_host *cq_host,
        return 0;
 }
 
-static int cqhci_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
+static int cqhci_crypto_keyslot_program(struct blk_crypto_profile *profile,
                                        const struct blk_crypto_key *key,
                                        unsigned int slot)
 
 {
-       struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm);
+       struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile);
        const union cqhci_crypto_cap_entry *ccap_array =
                cq_host->crypto_cap_array;
        const struct cqhci_crypto_alg_entry *alg =
@@ -115,11 +116,11 @@ static int cqhci_crypto_clear_keyslot(struct cqhci_host *cq_host, int slot)
        return cqhci_crypto_program_key(cq_host, &cfg, slot);
 }
 
-static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int cqhci_crypto_keyslot_evict(struct blk_crypto_profile *profile,
                                      const struct blk_crypto_key *key,
                                      unsigned int slot)
 {
-       struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm);
+       struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile);
 
        return cqhci_crypto_clear_keyslot(cq_host, slot);
 }
@@ -132,7 +133,7 @@ static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
  * "enabled" when these are called, i.e. CQHCI_ENABLE might not be set in the
  * CQHCI_CFG register.  But the hardware allows that.
  */
-static const struct blk_ksm_ll_ops cqhci_ksm_ops = {
+static const struct blk_crypto_ll_ops cqhci_crypto_ops = {
        .keyslot_program        = cqhci_crypto_keyslot_program,
        .keyslot_evict          = cqhci_crypto_keyslot_evict,
 };
@@ -157,8 +158,8 @@ cqhci_find_blk_crypto_mode(union cqhci_crypto_cap_entry cap)
  *
  * If the driver previously set MMC_CAP2_CRYPTO and the CQE declares
  * CQHCI_CAP_CS, initialize the crypto support.  This involves reading the
- * crypto capability registers, initializing the keyslot manager, clearing all
- * keyslots, and enabling 128-bit task descriptors.
+ * crypto capability registers, initializing the blk_crypto_profile, clearing
+ * all keyslots, and enabling 128-bit task descriptors.
  *
  * Return: 0 if crypto was initialized or isn't supported; whether
  *        MMC_CAP2_CRYPTO remains set indicates which one of those cases it is.
@@ -168,7 +169,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
 {
        struct mmc_host *mmc = cq_host->mmc;
        struct device *dev = mmc_dev(mmc);
-       struct blk_keyslot_manager *ksm = &mmc->ksm;
+       struct blk_crypto_profile *profile = &mmc->crypto_profile;
        unsigned int num_keyslots;
        unsigned int cap_idx;
        enum blk_crypto_mode_num blk_mode_num;
@@ -199,15 +200,15 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
         */
        num_keyslots = cq_host->crypto_capabilities.config_count + 1;
 
-       err = devm_blk_ksm_init(dev, ksm, num_keyslots);
+       err = devm_blk_crypto_profile_init(dev, profile, num_keyslots);
        if (err)
                goto out;
 
-       ksm->ksm_ll_ops = cqhci_ksm_ops;
-       ksm->dev = dev;
+       profile->ll_ops = cqhci_crypto_ops;
+       profile->dev = dev;
 
        /* Unfortunately, CQHCI crypto only supports 32 DUN bits. */
-       ksm->max_dun_bytes_supported = 4;
+       profile->max_dun_bytes_supported = 4;
 
        /*
         * Cache all the crypto capabilities and advertise the supported crypto
@@ -223,7 +224,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
                                        cq_host->crypto_cap_array[cap_idx]);
                if (blk_mode_num == BLK_ENCRYPTION_MODE_INVALID)
                        continue;
-               ksm->crypto_modes_supported[blk_mode_num] |=
+               profile->modes_supported[blk_mode_num] |=
                        cq_host->crypto_cap_array[cap_idx].sdus_mask * 512;
        }
 
index 0c75810..1f8a3c0 100644 (file)
@@ -464,6 +464,18 @@ static s8 dw_mci_exynos_get_best_clksmpl(u8 candiates)
                }
        }
 
+       /*
+        * If there is no candidate value, then it needs to return -EIO.
+        * If there are candidate values but the best clk sample value is not
+        * found, then use the first candidate clock sample value.
+        */
+       for (i = 0; i < iter; i++) {
+               __c = ror8(candiates, i);
+               if ((__c & 0x1) == 0x1) {
+                       loc = i;
+                       goto out;
+               }
+       }
 out:
        return loc;
 }
@@ -494,6 +506,8 @@ static int dw_mci_exynos_execute_tuning(struct dw_mci_slot *slot, u32 opcode)
                priv->tuned_sample = found;
        } else {
                ret = -EIO;
+               dev_warn(&mmc->class_dev,
+                       "There is no candiates value about clksmpl!\n");
        }
 
        return ret;
index 4dfc246..b06b4dc 100644 (file)
@@ -2577,6 +2577,25 @@ static int msdc_drv_probe(struct platform_device *pdev)
                host->dma_mask = DMA_BIT_MASK(32);
        mmc_dev(mmc)->dma_mask = &host->dma_mask;
 
+       host->timeout_clks = 3 * 1048576;
+       host->dma.gpd = dma_alloc_coherent(&pdev->dev,
+                               2 * sizeof(struct mt_gpdma_desc),
+                               &host->dma.gpd_addr, GFP_KERNEL);
+       host->dma.bd = dma_alloc_coherent(&pdev->dev,
+                               MAX_BD_NUM * sizeof(struct mt_bdma_desc),
+                               &host->dma.bd_addr, GFP_KERNEL);
+       if (!host->dma.gpd || !host->dma.bd) {
+               ret = -ENOMEM;
+               goto release_mem;
+       }
+       msdc_init_gpd_bd(host, &host->dma);
+       INIT_DELAYED_WORK(&host->req_timeout, msdc_request_timeout);
+       spin_lock_init(&host->lock);
+
+       platform_set_drvdata(pdev, mmc);
+       msdc_ungate_clock(host);
+       msdc_init_hw(host);
+
        if (mmc->caps2 & MMC_CAP2_CQE) {
                host->cq_host = devm_kzalloc(mmc->parent,
                                             sizeof(*host->cq_host),
@@ -2597,25 +2616,6 @@ static int msdc_drv_probe(struct platform_device *pdev)
                mmc->max_seg_size = 64 * 1024;
        }
 
-       host->timeout_clks = 3 * 1048576;
-       host->dma.gpd = dma_alloc_coherent(&pdev->dev,
-                               2 * sizeof(struct mt_gpdma_desc),
-                               &host->dma.gpd_addr, GFP_KERNEL);
-       host->dma.bd = dma_alloc_coherent(&pdev->dev,
-                               MAX_BD_NUM * sizeof(struct mt_bdma_desc),
-                               &host->dma.bd_addr, GFP_KERNEL);
-       if (!host->dma.gpd || !host->dma.bd) {
-               ret = -ENOMEM;
-               goto release_mem;
-       }
-       msdc_init_gpd_bd(host, &host->dma);
-       INIT_DELAYED_WORK(&host->req_timeout, msdc_request_timeout);
-       spin_lock_init(&host->lock);
-
-       platform_set_drvdata(pdev, mmc);
-       msdc_ungate_clock(host);
-       msdc_init_hw(host);
-
        ret = devm_request_irq(&pdev->dev, host->irq, msdc_irq,
                               IRQF_TRIGGER_NONE, pdev->name, host);
        if (ret)
index f18d169..e658f01 100644 (file)
@@ -1187,6 +1187,7 @@ static void esdhc_reset_tuning(struct sdhci_host *host)
        struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
        struct pltfm_imx_data *imx_data = sdhci_pltfm_priv(pltfm_host);
        u32 ctrl;
+       int ret;
 
        /* Reset the tuning circuit */
        if (esdhc_is_usdhc(imx_data)) {
@@ -1199,7 +1200,22 @@ static void esdhc_reset_tuning(struct sdhci_host *host)
                } else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) {
                        ctrl = readl(host->ioaddr + SDHCI_AUTO_CMD_STATUS);
                        ctrl &= ~ESDHC_MIX_CTRL_SMPCLK_SEL;
+                       ctrl &= ~ESDHC_MIX_CTRL_EXE_TUNE;
                        writel(ctrl, host->ioaddr + SDHCI_AUTO_CMD_STATUS);
+                       /* Make sure ESDHC_MIX_CTRL_EXE_TUNE is cleared */
+                       ret = readl_poll_timeout(host->ioaddr + SDHCI_AUTO_CMD_STATUS,
+                               ctrl, !(ctrl & ESDHC_MIX_CTRL_EXE_TUNE), 1, 50);
+                       if (ret == -ETIMEDOUT)
+                               dev_warn(mmc_dev(host->mmc),
+                                "Warning! clear execute tuning bit failed\n");
+                       /*
+                        * SDHCI_INT_DATA_AVAIL is a W1C bit; setting it clears the usdhc
+                        * IP's internal execute_tuning_with_clr_buf logic flag, which in
+                        * turn makes sure the normal data transfer logic is correct.
+                        */
+                       ctrl = readl(host->ioaddr + SDHCI_INT_STATUS);
+                       ctrl |= SDHCI_INT_DATA_AVAIL;
+                       writel(ctrl, host->ioaddr + SDHCI_INT_STATUS);
                }
        }
 }
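
readl_poll_timeout() from <linux/iopoll.h> is what bounds the wait for the execute-tuning bit to clear; a minimal sketch of the polling idiom, assuming a register and busy bit like the ones above:

#include <linux/iopoll.h>

/* Poll until the busy bit clears, checking every 1 us for up to 50 us. */
static int example_wait_tune_done(void __iomem *reg, u32 busy_bit)
{
        u32 val;

        /* returns 0 on success, -ETIMEDOUT if the bit is still set after 50 us */
        return readl_poll_timeout(reg, val, !(val & busy_bit), 1, 50);
}
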
index be19785..d0f2edf 100644 (file)
@@ -616,16 +616,12 @@ static int intel_select_drive_strength(struct mmc_card *card,
        return intel_host->drv_strength;
 }
 
-static int bxt_get_cd(struct mmc_host *mmc)
+static int sdhci_get_cd_nogpio(struct mmc_host *mmc)
 {
-       int gpio_cd = mmc_gpio_get_cd(mmc);
        struct sdhci_host *host = mmc_priv(mmc);
        unsigned long flags;
        int ret = 0;
 
-       if (!gpio_cd)
-               return 0;
-
        spin_lock_irqsave(&host->lock, flags);
 
        if (host->flags & SDHCI_DEVICE_DEAD)
@@ -638,6 +634,21 @@ out:
        return ret;
 }
 
+static int bxt_get_cd(struct mmc_host *mmc)
+{
+       int gpio_cd = mmc_gpio_get_cd(mmc);
+
+       if (!gpio_cd)
+               return 0;
+
+       return sdhci_get_cd_nogpio(mmc);
+}
+
+static int mrfld_get_cd(struct mmc_host *mmc)
+{
+       return sdhci_get_cd_nogpio(mmc);
+}
+
 #define SDHCI_INTEL_PWR_TIMEOUT_CNT    20
 #define SDHCI_INTEL_PWR_TIMEOUT_UDELAY 100
 
@@ -1341,6 +1352,14 @@ static int intel_mrfld_mmc_probe_slot(struct sdhci_pci_slot *slot)
                                         MMC_CAP_1_8V_DDR;
                break;
        case INTEL_MRFLD_SD:
+               slot->cd_idx = 0;
+               slot->cd_override_level = true;
+               /*
+                * There are two PCB designs of SD card slot with the opposite
+                * card detection sense. Quirk this out by ignoring GPIO state
+                * completely in the custom ->get_cd() callback.
+                */
+               slot->host->mmc_host_ops.get_cd = mrfld_get_cd;
                slot->host->quirks2 |= SDHCI_QUIRK2_NO_1_8_V;
                break;
        case INTEL_MRFLD_SDIO:
index 8eefa7d..2d80a04 100644 (file)
@@ -2042,6 +2042,12 @@ void sdhci_set_power_noreg(struct sdhci_host *host, unsigned char mode,
                        break;
                case MMC_VDD_32_33:
                case MMC_VDD_33_34:
+               /*
+                * 3.4 ~ 3.6V are valid only for those platforms where it's
+                * known that the voltage range is supported by hardware.
+                */
+               case MMC_VDD_34_35:
+               case MMC_VDD_35_36:
                        pwr = SDHCI_POWER_330;
                        break;
                default:
index 7dfc26f..e2affa5 100644 (file)
@@ -195,6 +195,10 @@ static void tmio_mmc_reset(struct tmio_mmc_host *host)
        sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, host->sdcard_irq_mask_all);
        host->sdcard_irq_mask = host->sdcard_irq_mask_all;
 
+       if (host->native_hotplug)
+               tmio_mmc_enable_mmc_irqs(host,
+                               TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT);
+
        tmio_mmc_set_bus_width(host, host->mmc->ios.bus_width);
 
        if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) {
@@ -956,8 +960,15 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        case MMC_POWER_OFF:
                tmio_mmc_power_off(host);
                /* For R-Car Gen2+, we need to reset SDHI specific SCC */
-               if (host->pdata->flags & TMIO_MMC_MIN_RCAR2)
+               if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) {
                        host->reset(host);
+
+                       if (host->native_hotplug)
+                               tmio_mmc_enable_mmc_irqs(host,
+                                               TMIO_STAT_CARD_REMOVE |
+                                               TMIO_STAT_CARD_INSERT);
+               }
+
                host->set_clock(host, 0);
                break;
        case MMC_POWER_UP:
@@ -1185,10 +1196,6 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host)
        _host->set_clock(_host, 0);
        tmio_mmc_reset(_host);
 
-       if (_host->native_hotplug)
-               tmio_mmc_enable_mmc_irqs(_host,
-                               TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT);
-
        spin_lock_init(&_host->lock);
        mutex_init(&_host->ios_lock);
 
index 4950d10..97beece 100644 (file)
@@ -576,7 +576,7 @@ static void check_vub300_port_status(struct vub300_mmc_host *vub300)
                                GET_SYSTEM_PORT_STATUS,
                                USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                0x0000, 0x0000, &vub300->system_port_status,
-                               sizeof(vub300->system_port_status), HZ);
+                               sizeof(vub300->system_port_status), 1000);
        if (sizeof(vub300->system_port_status) == retval)
                new_system_port_status(vub300);
 }
@@ -1241,7 +1241,7 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300,
                                                SET_INTERRUPT_PSEUDOCODE,
                                                USB_DIR_OUT | USB_TYPE_VENDOR |
                                                USB_RECIP_DEVICE, 0x0000, 0x0000,
-                                               xfer_buffer, xfer_length, HZ);
+                                               xfer_buffer, xfer_length, 1000);
                        kfree(xfer_buffer);
                        if (retval < 0)
                                goto copy_error_message;
@@ -1284,7 +1284,7 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300,
                                                SET_TRANSFER_PSEUDOCODE,
                                                USB_DIR_OUT | USB_TYPE_VENDOR |
                                                USB_RECIP_DEVICE, 0x0000, 0x0000,
-                                               xfer_buffer, xfer_length, HZ);
+                                               xfer_buffer, xfer_length, 1000);
                        kfree(xfer_buffer);
                        if (retval < 0)
                                goto copy_error_message;
@@ -1991,7 +1991,7 @@ static void __set_clock_speed(struct vub300_mmc_host *vub300, u8 buf[8],
                usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0),
                                SET_CLOCK_SPEED,
                                USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-                               0x00, 0x00, buf, buf_array_size, HZ);
+                               0x00, 0x00, buf, buf_array_size, 1000);
        if (retval != 8) {
                dev_err(&vub300->udev->dev, "SET_CLOCK_SPEED"
                        " %dkHz failed with retval=%d\n", kHzClock, retval);
@@ -2013,14 +2013,14 @@ static void vub300_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
                usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0),
                                SET_SD_POWER,
                                USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-                               0x0000, 0x0000, NULL, 0, HZ);
+                               0x0000, 0x0000, NULL, 0, 1000);
                /* must wait for the VUB300 u-proc to boot up */
                msleep(600);
        } else if ((ios->power_mode == MMC_POWER_UP) && !vub300->card_powered) {
                usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0),
                                SET_SD_POWER,
                                USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-                               0x0001, 0x0000, NULL, 0, HZ);
+                               0x0001, 0x0000, NULL, 0, 1000);
                msleep(600);
                vub300->card_powered = 1;
        } else if (ios->power_mode == MMC_POWER_ON) {
@@ -2275,14 +2275,14 @@ static int vub300_probe(struct usb_interface *interface,
                                GET_HC_INF0,
                                USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                0x0000, 0x0000, &vub300->hc_info,
-                               sizeof(vub300->hc_info), HZ);
+                               sizeof(vub300->hc_info), 1000);
        if (retval < 0)
                goto error5;
        retval =
                usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0),
                                SET_ROM_WAIT_STATES,
                                USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-                               firmware_rom_wait_states, 0x0000, NULL, 0, HZ);
+                               firmware_rom_wait_states, 0x0000, NULL, 0, 1000);
        if (retval < 0)
                goto error5;
        dev_info(&vub300->udev->dev,
@@ -2297,7 +2297,7 @@ static int vub300_probe(struct usb_interface *interface,
                                GET_SYSTEM_PORT_STATUS,
                                USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                0x0000, 0x0000, &vub300->system_port_status,
-                               sizeof(vub300->system_port_status), HZ);
+                               sizeof(vub300->system_port_status), 1000);
        if (retval < 0) {
                goto error4;
        } else if (sizeof(vub300->system_port_status) == retval) {
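
The HZ -> 1000 changes in this file exist because usb_control_msg()'s final argument is a timeout in milliseconds, while HZ is the jiffies-per-second rate (often 100 or 250), so passing HZ made the timeout depend on the kernel config. A minimal sketch of a fixed one-second control transfer, with illustrative request values:

#include <linux/usb.h>

/* One-second timeout, independent of CONFIG_HZ. */
static int example_vendor_read(struct usb_device *udev, u8 request,
                               void *buf, u16 len)
{
        return usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), request,
                               USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                               0x0000, 0x0000, buf, len, 1000 /* ms */);
}
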
index b8ae1ec..4eaba6f 100644 (file)
@@ -384,7 +384,9 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
        if (new->readonly)
                set_disk_ro(gd, 1);
 
-       device_add_disk(&new->mtd->dev, gd, NULL);
+       ret = device_add_disk(&new->mtd->dev, gd, NULL);
+       if (ret)
+               goto out_cleanup_disk;
 
        if (new->disk_attributes) {
                ret = sysfs_create_group(&disk_to_dev(gd)->kobj,
@@ -393,6 +395,8 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
        }
        return 0;
 
+out_cleanup_disk:
+       blk_cleanup_disk(new->disk);
 out_free_tag_set:
        blk_mq_free_tag_set(new->tag_set);
 out_kfree_tag_set:
index 38b6aa8..5ff0011 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/major.h>
 #include <linux/backing-dev.h>
+#include <linux/blkdev.h>
 #include <linux/fs_context.h>
 #include "mtdcore.h"
 
index 2b66c59..e54f962 100644 (file)
@@ -137,7 +137,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = {
                .name = "uc",
                .cmd = HNAE3_DBG_CMD_MAC_UC,
                .dentry = HNS3_DBG_DENTRY_MAC,
-               .buf_len = HNS3_DBG_READ_LEN,
+               .buf_len = HNS3_DBG_READ_LEN_128KB,
                .init = hns3_dbg_common_file_init,
        },
        {
@@ -256,7 +256,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = {
                .name = "tqp",
                .cmd = HNAE3_DBG_CMD_REG_TQP,
                .dentry = HNS3_DBG_DENTRY_REG,
-               .buf_len = HNS3_DBG_READ_LEN,
+               .buf_len = HNS3_DBG_READ_LEN_128KB,
                .init = hns3_dbg_common_file_init,
        },
        {
@@ -298,7 +298,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = {
                .name = "fd_tcam",
                .cmd = HNAE3_DBG_CMD_FD_TCAM,
                .dentry = HNS3_DBG_DENTRY_FD,
-               .buf_len = HNS3_DBG_READ_LEN,
+               .buf_len = HNS3_DBG_READ_LEN_1MB,
                .init = hns3_dbg_common_file_init,
        },
        {
@@ -462,7 +462,7 @@ static const struct hns3_dbg_item rx_queue_info_items[] = {
        { "TAIL", 2 },
        { "HEAD", 2 },
        { "FBDNUM", 2 },
-       { "PKTNUM", 2 },
+       { "PKTNUM", 5 },
        { "COPYBREAK", 2 },
        { "RING_EN", 2 },
        { "RX_RING_EN", 2 },
@@ -565,7 +565,7 @@ static const struct hns3_dbg_item tx_queue_info_items[] = {
        { "HEAD", 2 },
        { "FBDNUM", 2 },
        { "OFFSET", 2 },
-       { "PKTNUM", 2 },
+       { "PKTNUM", 5 },
        { "RING_EN", 2 },
        { "TX_RING_EN", 2 },
        { "BASE_ADDR", 10 },
@@ -790,13 +790,13 @@ static int hns3_dbg_rx_bd_info(struct hns3_dbg_data *d, char *buf, int len)
 }
 
 static const struct hns3_dbg_item tx_bd_info_items[] = {
-       { "BD_IDX", 5 },
-       { "ADDRESS", 2 },
+       { "BD_IDX", 2 },
+       { "ADDRESS", 13 },
        { "VLAN_TAG", 2 },
        { "SIZE", 2 },
        { "T_CS_VLAN_TSO", 2 },
        { "OT_VLAN_TAG", 3 },
-       { "TV", 2 },
+       { "TV", 5 },
        { "OLT_VLAN_LEN", 2 },
        { "PAYLEN_OL4CS", 2 },
        { "BD_FE_SC_VLD", 2 },
index 32f62cd..9cda8b3 100644 (file)
@@ -391,7 +391,7 @@ static int hclge_dbg_dump_mac(struct hclge_dev *hdev, char *buf, int len)
 static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len,
                                   int *pos)
 {
-       struct hclge_dbg_bitmap_cmd *bitmap;
+       struct hclge_dbg_bitmap_cmd req;
        struct hclge_desc desc;
        u16 qset_id, qset_num;
        int ret;
@@ -408,12 +408,12 @@ static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len,
                if (ret)
                        return ret;
 
-               bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1];
+               req.bitmap = (u8)le32_to_cpu(desc.data[1]);
 
                *pos += scnprintf(buf + *pos, len - *pos,
                                  "%04u           %#x            %#x             %#x               %#x\n",
-                                 qset_id, bitmap->bit0, bitmap->bit1,
-                                 bitmap->bit2, bitmap->bit3);
+                                 qset_id, req.bit0, req.bit1, req.bit2,
+                                 req.bit3);
        }
 
        return 0;
@@ -422,7 +422,7 @@ static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len,
 static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len,
                                  int *pos)
 {
-       struct hclge_dbg_bitmap_cmd *bitmap;
+       struct hclge_dbg_bitmap_cmd req;
        struct hclge_desc desc;
        u8 pri_id, pri_num;
        int ret;
@@ -439,12 +439,11 @@ static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len,
                if (ret)
                        return ret;
 
-               bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1];
+               req.bitmap = (u8)le32_to_cpu(desc.data[1]);
 
                *pos += scnprintf(buf + *pos, len - *pos,
                                  "%03u       %#x           %#x                %#x\n",
-                                 pri_id, bitmap->bit0, bitmap->bit1,
-                                 bitmap->bit2);
+                                 pri_id, req.bit0, req.bit1, req.bit2);
        }
 
        return 0;
@@ -453,7 +452,7 @@ static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len,
 static int hclge_dbg_dump_dcb_pg(struct hclge_dev *hdev, char *buf, int len,
                                 int *pos)
 {
-       struct hclge_dbg_bitmap_cmd *bitmap;
+       struct hclge_dbg_bitmap_cmd req;
        struct hclge_desc desc;
        u8 pg_id;
        int ret;
@@ -466,12 +465,11 @@ static int hclge_dbg_dump_dcb_pg(struct hclge_dev *hdev, char *buf, int len,
                if (ret)
                        return ret;
 
-               bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1];
+               req.bitmap = (u8)le32_to_cpu(desc.data[1]);
 
                *pos += scnprintf(buf + *pos, len - *pos,
                                  "%03u      %#x           %#x               %#x\n",
-                                 pg_id, bitmap->bit0, bitmap->bit1,
-                                 bitmap->bit2);
+                                 pg_id, req.bit0, req.bit1, req.bit2);
        }
 
        return 0;
@@ -511,7 +509,7 @@ static int hclge_dbg_dump_dcb_queue(struct hclge_dev *hdev, char *buf, int len,
 static int hclge_dbg_dump_dcb_port(struct hclge_dev *hdev, char *buf, int len,
                                   int *pos)
 {
-       struct hclge_dbg_bitmap_cmd *bitmap;
+       struct hclge_dbg_bitmap_cmd req;
        struct hclge_desc desc;
        u8 port_id = 0;
        int ret;
@@ -521,12 +519,12 @@ static int hclge_dbg_dump_dcb_port(struct hclge_dev *hdev, char *buf, int len,
        if (ret)
                return ret;
 
-       bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1];
+       req.bitmap = (u8)le32_to_cpu(desc.data[1]);
 
        *pos += scnprintf(buf + *pos, len - *pos, "port_mask: %#x\n",
-                        bitmap->bit0);
+                        req.bit0);
        *pos += scnprintf(buf + *pos, len - *pos, "port_shaping_pass: %#x\n",
-                        bitmap->bit1);
+                        req.bit1);
 
        return 0;
 }
index dcd40cc..d891390 100644 (file)
@@ -2847,33 +2847,29 @@ static void hclge_mbx_task_schedule(struct hclge_dev *hdev)
 {
        if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
            !test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, &hdev->state))
-               mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
-                                   hclge_wq, &hdev->service_task, 0);
+               mod_delayed_work(hclge_wq, &hdev->service_task, 0);
 }
 
 static void hclge_reset_task_schedule(struct hclge_dev *hdev)
 {
        if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
+           test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state) &&
            !test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state))
-               mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
-                                   hclge_wq, &hdev->service_task, 0);
+               mod_delayed_work(hclge_wq, &hdev->service_task, 0);
 }
 
 static void hclge_errhand_task_schedule(struct hclge_dev *hdev)
 {
        if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
            !test_and_set_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state))
-               mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
-                                   hclge_wq, &hdev->service_task, 0);
+               mod_delayed_work(hclge_wq, &hdev->service_task, 0);
 }
 
 void hclge_task_schedule(struct hclge_dev *hdev, unsigned long delay_time)
 {
        if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
            !test_bit(HCLGE_STATE_RST_FAIL, &hdev->state))
-               mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
-                                   hclge_wq, &hdev->service_task,
-                                   delay_time);
+               mod_delayed_work(hclge_wq, &hdev->service_task, delay_time);
 }
 
 static int hclge_get_mac_link_status(struct hclge_dev *hdev, int *link_status)
@@ -3491,33 +3487,14 @@ static void hclge_get_misc_vector(struct hclge_dev *hdev)
        hdev->num_msi_used += 1;
 }
 
-static void hclge_irq_affinity_notify(struct irq_affinity_notify *notify,
-                                     const cpumask_t *mask)
-{
-       struct hclge_dev *hdev = container_of(notify, struct hclge_dev,
-                                             affinity_notify);
-
-       cpumask_copy(&hdev->affinity_mask, mask);
-}
-
-static void hclge_irq_affinity_release(struct kref *ref)
-{
-}
-
 static void hclge_misc_affinity_setup(struct hclge_dev *hdev)
 {
        irq_set_affinity_hint(hdev->misc_vector.vector_irq,
                              &hdev->affinity_mask);
-
-       hdev->affinity_notify.notify = hclge_irq_affinity_notify;
-       hdev->affinity_notify.release = hclge_irq_affinity_release;
-       irq_set_affinity_notifier(hdev->misc_vector.vector_irq,
-                                 &hdev->affinity_notify);
 }
 
 static void hclge_misc_affinity_teardown(struct hclge_dev *hdev)
 {
-       irq_set_affinity_notifier(hdev->misc_vector.vector_irq, NULL);
        irq_set_affinity_hint(hdev->misc_vector.vector_irq, NULL);
 }
 
@@ -13052,7 +13029,7 @@ static int hclge_init(void)
 {
        pr_info("%s is initializing\n", HCLGE_NAME);
 
-       hclge_wq = alloc_workqueue("%s", 0, 0, HCLGE_NAME);
+       hclge_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, HCLGE_NAME);
        if (!hclge_wq) {
                pr_err("%s: failed to create workqueue\n", HCLGE_NAME);
                return -ENOMEM;
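
Together with the mod_delayed_work_on() -> mod_delayed_work() changes earlier in this file, the driver now lets an unbound workqueue pick the CPU instead of pinning service work to the first CPU of the affinity mask; a condensed sketch of the pair (names are illustrative):

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int example_init(void)
{
        /* WQ_UNBOUND: the scheduler, not the queue, decides where work runs. */
        example_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "example");
        return example_wq ? 0 : -ENOMEM;
}

static void example_schedule(struct delayed_work *task, unsigned long delay)
{
        /* was: mod_delayed_work_on(cpumask_first(&affinity_mask), wq, task, delay) */
        mod_delayed_work(example_wq, task, delay);
}
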
index de6afbc..69cd8f8 100644 (file)
@@ -944,7 +944,6 @@ struct hclge_dev {
 
        /* affinity mask and notify for misc interrupt */
        cpumask_t affinity_mask;
-       struct irq_affinity_notify affinity_notify;
        struct hclge_ptp *ptp;
        struct devlink *devlink;
 };
index bef6b98..cf00ad7 100644 (file)
@@ -2232,6 +2232,7 @@ static void hclgevf_get_misc_vector(struct hclgevf_dev *hdev)
 void hclgevf_reset_task_schedule(struct hclgevf_dev *hdev)
 {
        if (!test_bit(HCLGEVF_STATE_REMOVING, &hdev->state) &&
+           test_bit(HCLGEVF_STATE_SERVICE_INITED, &hdev->state) &&
            !test_and_set_bit(HCLGEVF_STATE_RST_SERVICE_SCHED,
                              &hdev->state))
                mod_delayed_work(hclgevf_wq, &hdev->service_task, 0);
@@ -3449,6 +3450,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
 
        hclgevf_init_rxd_adv_layout(hdev);
 
+       set_bit(HCLGEVF_STATE_SERVICE_INITED, &hdev->state);
+
        hdev->last_reset_time = jiffies;
        dev_info(&hdev->pdev->dev, "finished initializing %s driver\n",
                 HCLGEVF_DRIVER_NAME);
@@ -3899,7 +3902,7 @@ static int hclgevf_init(void)
 {
        pr_info("%s is initializing\n", HCLGEVF_NAME);
 
-       hclgevf_wq = alloc_workqueue("%s", 0, 0, HCLGEVF_NAME);
+       hclgevf_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, HCLGEVF_NAME);
        if (!hclgevf_wq) {
                pr_err("%s: failed to create workqueue\n", HCLGEVF_NAME);
                return -ENOMEM;
index 883130a..28288d7 100644 (file)
@@ -146,6 +146,7 @@ enum hclgevf_states {
        HCLGEVF_STATE_REMOVING,
        HCLGEVF_STATE_NIC_REGISTERED,
        HCLGEVF_STATE_ROCE_REGISTERED,
+       HCLGEVF_STATE_SERVICE_INITED,
        /* task states */
        HCLGEVF_STATE_RST_SERVICE_SCHED,
        HCLGEVF_STATE_RST_HANDLING,
index 37c18c6..e375ac8 100644 (file)
@@ -100,9 +100,9 @@ static void ice_display_lag_info(struct ice_lag *lag)
  */
 static void ice_lag_info_event(struct ice_lag *lag, void *ptr)
 {
-       struct net_device *event_netdev, *netdev_tmp;
        struct netdev_notifier_bonding_info *info;
        struct netdev_bonding_info *bonding_info;
+       struct net_device *event_netdev;
        const char *lag_netdev_name;
 
        event_netdev = netdev_notifier_info_to_dev(ptr);
@@ -123,19 +123,6 @@ static void ice_lag_info_event(struct ice_lag *lag, void *ptr)
                goto lag_out;
        }
 
-       rcu_read_lock();
-       for_each_netdev_in_bond_rcu(lag->upper_netdev, netdev_tmp) {
-               if (!netif_is_ice(netdev_tmp))
-                       continue;
-
-               if (netdev_tmp && netdev_tmp != lag->netdev &&
-                   lag->peer_netdev != netdev_tmp) {
-                       dev_hold(netdev_tmp);
-                       lag->peer_netdev = netdev_tmp;
-               }
-       }
-       rcu_read_unlock();
-
        if (bonding_info->slave.state)
                ice_lag_set_backup(lag);
        else
@@ -319,6 +306,9 @@ ice_lag_event_handler(struct notifier_block *notif_blk, unsigned long event,
        case NETDEV_BONDING_INFO:
                ice_lag_info_event(lag, ptr);
                break;
+       case NETDEV_UNREGISTER:
+               ice_lag_unlink(lag, ptr);
+               break;
        default:
                break;
        }
index 80380ae..d1ef3d4 100644 (file)
@@ -1571,6 +1571,9 @@ err_kworker:
  */
 void ice_ptp_release(struct ice_pf *pf)
 {
+       if (!test_bit(ICE_FLAG_PTP, pf->flags))
+               return;
+
        /* Disable timestamping for both Tx and Rx */
        ice_ptp_cfg_timestamp(pf, false);
 
index 9338765..49d822a 100644 (file)
@@ -226,18 +226,85 @@ static const struct file_operations rvu_dbg_##name##_fops = { \
 
 static void print_nix_qsize(struct seq_file *filp, struct rvu_pfvf *pfvf);
 
+static void get_lf_str_list(struct rvu_block block, int pcifunc,
+                           char *lfs)
+{
+       int lf = 0, seq = 0, len = 0, prev_lf = block.lf.max;
+
+       for_each_set_bit(lf, block.lf.bmap, block.lf.max) {
+               if (lf >= block.lf.max)
+                       break;
+
+               if (block.fn_map[lf] != pcifunc)
+                       continue;
+
+               if (lf == prev_lf + 1) {
+                       prev_lf = lf;
+                       seq = 1;
+                       continue;
+               }
+
+               if (seq)
+                       len += sprintf(lfs + len, "-%d,%d", prev_lf, lf);
+               else
+                       len += (len ? sprintf(lfs + len, ",%d", lf) :
+                                     sprintf(lfs + len, "%d", lf));
+
+               prev_lf = lf;
+               seq = 0;
+       }
+
+       if (seq)
+               len += sprintf(lfs + len, "-%d", prev_lf);
+
+       lfs[len] = '\0';
+}
+
+static int get_max_column_width(struct rvu *rvu)
+{
+       int index, pf, vf, lf_str_size = 12, buf_size = 256;
+       struct rvu_block block;
+       u16 pcifunc;
+       char *buf;
+
+       buf = kzalloc(buf_size, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
+               for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
+                       pcifunc = pf << 10 | vf;
+                       if (!pcifunc)
+                               continue;
+
+                       for (index = 0; index < BLK_COUNT; index++) {
+                               block = rvu->hw->block[index];
+                               if (!strlen(block.name))
+                                       continue;
+
+                               get_lf_str_list(block, pcifunc, buf);
+                               if (lf_str_size <= strlen(buf))
+                                       lf_str_size = strlen(buf) + 1;
+                       }
+               }
+       }
+
+       kfree(buf);
+       return lf_str_size;
+}
+
 /* Dumps current provisioning status of all RVU block LFs */
 static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
                                          char __user *buffer,
                                          size_t count, loff_t *ppos)
 {
-       int index, off = 0, flag = 0, go_back = 0, len = 0;
+       int index, off = 0, flag = 0, len = 0, i = 0;
        struct rvu *rvu = filp->private_data;
-       int lf, pf, vf, pcifunc;
+       int bytes_not_copied = 0;
        struct rvu_block block;
-       int bytes_not_copied;
-       int lf_str_size = 12;
+       int pf, vf, pcifunc;
        int buf_size = 2048;
+       int lf_str_size;
        char *lfs;
        char *buf;
 
@@ -249,6 +316,9 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
        if (!buf)
                return -ENOSPC;
 
+       /* Get the maximum width of a column */
+       lf_str_size = get_max_column_width(rvu);
+
        lfs = kzalloc(lf_str_size, GFP_KERNEL);
        if (!lfs) {
                kfree(buf);
@@ -262,65 +332,69 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
                                         "%-*s", lf_str_size,
                                         rvu->hw->block[index].name);
                }
+
        off += scnprintf(&buf[off], buf_size - 1 - off, "\n");
+       bytes_not_copied = copy_to_user(buffer + (i * off), buf, off);
+       if (bytes_not_copied)
+               goto out;
+
+       i++;
+       *ppos += off;
        for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
                for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
+                       off = 0;
+                       flag = 0;
                        pcifunc = pf << 10 | vf;
                        if (!pcifunc)
                                continue;
 
                        if (vf) {
                                sprintf(lfs, "PF%d:VF%d", pf, vf - 1);
-                               go_back = scnprintf(&buf[off],
-                                                   buf_size - 1 - off,
-                                                   "%-*s", lf_str_size, lfs);
+                               off = scnprintf(&buf[off],
+                                               buf_size - 1 - off,
+                                               "%-*s", lf_str_size, lfs);
                        } else {
                                sprintf(lfs, "PF%d", pf);
-                               go_back = scnprintf(&buf[off],
-                                                   buf_size - 1 - off,
-                                                   "%-*s", lf_str_size, lfs);
+                               off = scnprintf(&buf[off],
+                                               buf_size - 1 - off,
+                                               "%-*s", lf_str_size, lfs);
                        }
 
-                       off += go_back;
-                       for (index = 0; index < BLKTYPE_MAX; index++) {
+                       for (index = 0; index < BLK_COUNT; index++) {
                                block = rvu->hw->block[index];
                                if (!strlen(block.name))
                                        continue;
                                len = 0;
                                lfs[len] = '\0';
-                               for (lf = 0; lf < block.lf.max; lf++) {
-                                       if (block.fn_map[lf] != pcifunc)
-                                               continue;
+                               get_lf_str_list(block, pcifunc, lfs);
+                               if (strlen(lfs))
                                        flag = 1;
-                                       len += sprintf(&lfs[len], "%d,", lf);
-                               }
 
-                               if (flag)
-                                       len--;
-                               lfs[len] = '\0';
                                off += scnprintf(&buf[off], buf_size - 1 - off,
                                                 "%-*s", lf_str_size, lfs);
-                               if (!strlen(lfs))
-                                       go_back += lf_str_size;
                        }
-                       if (!flag)
-                               off -= go_back;
-                       else
-                               flag = 0;
-                       off--;
-                       off +=  scnprintf(&buf[off], buf_size - 1 - off, "\n");
+                       if (flag) {
+                               off +=  scnprintf(&buf[off],
+                                                 buf_size - 1 - off, "\n");
+                               bytes_not_copied = copy_to_user(buffer +
+                                                               (i * off),
+                                                               buf, off);
+                               if (bytes_not_copied)
+                                       goto out;
+
+                               i++;
+                               *ppos += off;
+                       }
                }
        }
 
-       bytes_not_copied = copy_to_user(buffer, buf, off);
+out:
        kfree(lfs);
        kfree(buf);
-
        if (bytes_not_copied)
                return -EFAULT;
 
-       *ppos = off;
-       return off;
+       return *ppos;
 }
 
 RVU_DEBUG_FOPS(rsrc_status, rsrc_attach_status, NULL);
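
The rewritten dump above leans on two things: get_max_column_width() pre-computes the widest LF list, and every cell is then padded with printf's dynamic field width ("%-*s"). A minimal user-space sketch of that padding idiom, with made-up row data rather than real driver output:

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Hypothetical LF lists; the driver builds these with get_lf_str_list(). */
        const char *owner[] = { "PF0", "PF1:VF0", "PF2" };
        const char *lfs[]   = { "0,1,2", "3", "4,5,6,7,8,9,10,11" };
        int width = 12, i;

        /* First pass: widest cell plus a trailing NUL, as in get_max_column_width(). */
        for (i = 0; i < 3; i++)
                if ((int)strlen(lfs[i]) + 1 > width)
                        width = (int)strlen(lfs[i]) + 1;

        /* Second pass: "%-*s" left-justifies every cell to that width. */
        printf("%-*s%-*s\n", width, "pcifunc", width, "NPA");
        for (i = 0; i < 3; i++)
                printf("%-*s%-*s\n", width, owner[i], width, lfs[i]);
        return 0;
}
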
@@ -504,7 +578,7 @@ static ssize_t rvu_dbg_qsize_write(struct file *filp,
        if (cmd_buf)
                ret = -EINVAL;
 
-       if (!strncmp(subtoken, "help", 4) || ret < 0) {
+       if (ret < 0 || !strncmp(subtoken, "help", 4)) {
                dev_info(rvu->dev, "Use echo <%s-lf > qsize\n", blk_string);
                goto qsize_write_done;
        }
@@ -1719,6 +1793,10 @@ static int rvu_dbg_nix_band_prof_ctx_display(struct seq_file *m, void *unused)
        u16 pcifunc;
        char *str;
 
+       /* Ingress policers do not exist on all platforms */
+       if (!nix_hw->ipolicer)
+               return 0;
+
        for (layer = 0; layer < BAND_PROF_NUM_LAYERS; layer++) {
                if (layer == BAND_PROF_INVAL_LAYER)
                        continue;
@@ -1768,6 +1846,10 @@ static int rvu_dbg_nix_band_prof_rsrc_display(struct seq_file *m, void *unused)
        int layer;
        char *str;
 
+       /* Ingress policers do not exist on all platforms */
+       if (!nix_hw->ipolicer)
+               return 0;
+
        seq_puts(m, "\nBandwidth profile resource free count\n");
        seq_puts(m, "=====================================\n");
        for (layer = 0; layer < BAND_PROF_NUM_LAYERS; layer++) {
index 9ef4e94..6970540 100644 (file)
@@ -2507,6 +2507,9 @@ static void nix_free_tx_vtag_entries(struct rvu *rvu, u16 pcifunc)
                return;
 
        nix_hw = get_nix_hw(rvu->hw, blkaddr);
+       if (!nix_hw)
+               return;
+
        vlan = &nix_hw->txvlan;
 
        mutex_lock(&vlan->rsrc_lock);
index 13b0259..fcace73 100644 (file)
@@ -353,13 +353,10 @@ static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci,
        struct sk_buff *skb;
        int err;
 
-       elem_info->u.rdq.skb = NULL;
        skb = netdev_alloc_skb_ip_align(NULL, buf_len);
        if (!skb)
                return -ENOMEM;
 
-       /* Assume that wqe was previously zeroed. */
-
        err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data,
                                     buf_len, DMA_FROM_DEVICE);
        if (err)
@@ -597,21 +594,26 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
        struct pci_dev *pdev = mlxsw_pci->pdev;
        struct mlxsw_pci_queue_elem_info *elem_info;
        struct mlxsw_rx_info rx_info = {};
-       char *wqe;
+       char wqe[MLXSW_PCI_WQE_SIZE];
        struct sk_buff *skb;
        u16 byte_count;
        int err;
 
        elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
-       skb = elem_info->u.sdq.skb;
-       if (!skb)
-               return;
-       wqe = elem_info->elem;
-       mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+       skb = elem_info->u.rdq.skb;
+       memcpy(wqe, elem_info->elem, MLXSW_PCI_WQE_SIZE);
 
        if (q->consumer_counter++ != consumer_counter_limit)
                dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n");
 
+       err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info);
+       if (err) {
+               dev_err_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
+               goto out;
+       }
+
+       mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+
        if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) {
                rx_info.is_lag = true;
                rx_info.u.lag_id = mlxsw_pci_cqe_lag_id_get(cqe_v, cqe);
@@ -647,10 +649,7 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
        skb_put(skb, byte_count);
        mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info);
 
-       memset(wqe, 0, q->elem_size);
-       err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info);
-       if (err)
-               dev_dbg_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
+out:
        /* Everything is set up, ring doorbell to pass elem to HW */
        q->producer_counter++;
        mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);
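
The reshuffled RDQ completion above follows a common receive-ring rule: arm the descriptor with a fresh buffer before the completed one is handed to the stack, and if the allocation fails keep the old buffer in place and drop the packet instead of leaving a hole in the ring. A rough, self-contained sketch of that ordering (plain user-space C with hypothetical names, not the mlxsw API):

#include <stdio.h>
#include <stdlib.h>

struct rx_slot { void *buf; };

/* Stand-in for handing a completed buffer up the stack. */
static void deliver(void *buf)
{
        printf("delivered buffer %p\n", buf);
        free(buf);
}

static void rx_complete_one(struct rx_slot *slot)
{
        void *done  = slot->buf;
        void *fresh = malloc(2048);

        if (!fresh)
                return;          /* drop: the slot keeps its old, still-valid buffer */

        slot->buf = fresh;       /* the ring never points at a buffer we gave away */
        deliver(done);
}

int main(void)
{
        struct rx_slot slot = { .buf = malloc(2048) };

        rx_complete_one(&slot);
        free(slot.buf);
        return 0;
}
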
index 9e8561c..4d5a5d6 100644 (file)
@@ -1743,6 +1743,16 @@ static int lan743x_tx_ring_init(struct lan743x_tx *tx)
                ret = -EINVAL;
                goto cleanup;
        }
+       if (dma_set_mask_and_coherent(&tx->adapter->pdev->dev,
+                                     DMA_BIT_MASK(64))) {
+               if (dma_set_mask_and_coherent(&tx->adapter->pdev->dev,
+                                             DMA_BIT_MASK(32))) {
+                       dev_warn(&tx->adapter->pdev->dev,
+                                "lan743x_: No suitable DMA available\n");
+                       ret = -ENOMEM;
+                       goto cleanup;
+               }
+       }
        ring_allocation_size = ALIGN(tx->ring_size *
                                     sizeof(struct lan743x_tx_descriptor),
                                     PAGE_SIZE);
@@ -1934,7 +1944,8 @@ static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index)
                                  index);
 }
 
-static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index)
+static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index,
+                                       gfp_t gfp)
 {
        struct net_device *netdev = rx->adapter->netdev;
        struct device *dev = &rx->adapter->pdev->dev;
@@ -1948,7 +1959,7 @@ static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index)
 
        descriptor = &rx->ring_cpu_ptr[index];
        buffer_info = &rx->buffer_info[index];
-       skb = __netdev_alloc_skb(netdev, buffer_length, GFP_ATOMIC | GFP_DMA);
+       skb = __netdev_alloc_skb(netdev, buffer_length, gfp);
        if (!skb)
                return -ENOMEM;
        dma_ptr = dma_map_single(dev, skb->data, buffer_length, DMA_FROM_DEVICE);
@@ -2110,7 +2121,8 @@ static int lan743x_rx_process_buffer(struct lan743x_rx *rx)
 
        /* save existing skb, allocate new skb and map to dma */
        skb = buffer_info->skb;
-       if (lan743x_rx_init_ring_element(rx, rx->last_head)) {
+       if (lan743x_rx_init_ring_element(rx, rx->last_head,
+                                        GFP_ATOMIC | GFP_DMA)) {
                /* failed to allocate next skb.
                 * Memory is very low.
                 * Drop this packet and reuse buffer.
@@ -2276,6 +2288,16 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx)
                ret = -EINVAL;
                goto cleanup;
        }
+       if (dma_set_mask_and_coherent(&rx->adapter->pdev->dev,
+                                     DMA_BIT_MASK(64))) {
+               if (dma_set_mask_and_coherent(&rx->adapter->pdev->dev,
+                                             DMA_BIT_MASK(32))) {
+                       dev_warn(&rx->adapter->pdev->dev,
+                                "lan743x_: No suitable DMA available\n");
+                       ret = -ENOMEM;
+                       goto cleanup;
+               }
+       }
        ring_allocation_size = ALIGN(rx->ring_size *
                                     sizeof(struct lan743x_rx_descriptor),
                                     PAGE_SIZE);
@@ -2315,13 +2337,16 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx)
 
        rx->last_head = 0;
        for (index = 0; index < rx->ring_size; index++) {
-               ret = lan743x_rx_init_ring_element(rx, index);
+               ret = lan743x_rx_init_ring_element(rx, index, GFP_KERNEL);
                if (ret)
                        goto cleanup;
        }
        return 0;
 
 cleanup:
+       netif_warn(rx->adapter, ifup, rx->adapter->netdev,
+                  "Error allocating memory for LAN743x\n");
+
        lan743x_rx_ring_cleanup(rx);
        return ret;
 }
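
The two ring-init hunks above use the standard 64-bit-then-32-bit DMA mask negotiation. The idiom reduced to a small probe-time helper, as a kernel-side fragment rather than a standalone program (the function name is a placeholder; dma_set_mask_and_coherent() is the real API and returns 0 on success):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static int example_set_dma_mask(struct pci_dev *pdev)
{
        /* Prefer full 64-bit addressing, fall back to 32-bit if the platform can't. */
        if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)))
                return 0;
        if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
                return 0;

        dev_warn(&pdev->dev, "no suitable DMA mask available\n");
        return -ENOMEM;
}

Many drivers do this once in probe; lan743x repeats it per ring here, presumably so each ring-init path fails early with its own warning.
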
@@ -3019,6 +3044,8 @@ static int lan743x_pm_resume(struct device *dev)
        if (ret) {
                netif_err(adapter, probe, adapter->netdev,
                          "lan743x_hardware_init returned %d\n", ret);
+               lan743x_pci_cleanup(adapter);
+               return ret;
        }
 
        /* open netdev when netdev is at running state while resume.
index 11c83a9..f469950 100644 (file)
@@ -182,15 +182,21 @@ static int
 nfp_bpf_check_mtu(struct nfp_app *app, struct net_device *netdev, int new_mtu)
 {
        struct nfp_net *nn = netdev_priv(netdev);
-       unsigned int max_mtu;
+       struct nfp_bpf_vnic *bv;
+       struct bpf_prog *prog;
 
        if (~nn->dp.ctrl & NFP_NET_CFG_CTRL_BPF)
                return 0;
 
-       max_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32;
-       if (new_mtu > max_mtu) {
-               nn_info(nn, "BPF offload active, MTU over %u not supported\n",
-                       max_mtu);
+       if (nn->xdp_hw.prog) {
+               prog = nn->xdp_hw.prog;
+       } else {
+               bv = nn->app_priv;
+               prog = bv->tc_prog;
+       }
+
+       if (nfp_bpf_offload_check_mtu(nn, prog, new_mtu)) {
+               nn_info(nn, "BPF offload active, potential packet access beyond hardware packet boundary");
                return -EBUSY;
        }
        return 0;
index d0e17ee..16841bb 100644 (file)
@@ -560,6 +560,8 @@ bool nfp_is_subprog_start(struct nfp_insn_meta *meta);
 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog);
 int nfp_bpf_jit(struct nfp_prog *prog);
 bool nfp_bpf_supported_opcode(u8 code);
+bool nfp_bpf_offload_check_mtu(struct nfp_net *nn, struct bpf_prog *prog,
+                              unsigned int mtu);
 
 int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx,
                    int prev_insn_idx);
index 5385185..9d97cd2 100644 (file)
@@ -481,19 +481,28 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data,
        return 0;
 }
 
+bool nfp_bpf_offload_check_mtu(struct nfp_net *nn, struct bpf_prog *prog,
+                              unsigned int mtu)
+{
+       unsigned int fw_mtu, pkt_off;
+
+       fw_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32;
+       pkt_off = min(prog->aux->max_pkt_offset, mtu);
+
+       return fw_mtu < pkt_off;
+}
+
 static int
 nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog,
                 struct netlink_ext_ack *extack)
 {
        struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
-       unsigned int fw_mtu, pkt_off, max_stack, max_prog_len;
+       unsigned int max_stack, max_prog_len;
        dma_addr_t dma_addr;
        void *img;
        int err;
 
-       fw_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32;
-       pkt_off = min(prog->aux->max_pkt_offset, nn->dp.netdev->mtu);
-       if (fw_mtu < pkt_off) {
+       if (nfp_bpf_offload_check_mtu(nn, prog, nn->dp.netdev->mtu)) {
                NL_SET_ERR_MSG_MOD(extack, "BPF offload not supported with potential packet access beyond HW packet split boundary");
                return -EOPNOTSUPP;
        }
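
To make the numbers in nfp_bpf_offload_check_mtu() concrete: the firmware limit is reg * 64 - 32 bytes, and the check fires when min(max_pkt_offset, mtu) exceeds it. A runnable toy calculation with made-up inputs (the register value 64 is purely illustrative, not a documented default):

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned int reg            = 64;              /* pretend NFP_NET_CFG_BPF_INL_MTU readout */
        unsigned int fw_mtu         = reg * 64 - 32;   /* 64 * 64 - 32 = 4064 */
        unsigned int max_pkt_offset = 4096;            /* from the BPF program (made up) */
        unsigned int mtu            = 9000;            /* requested MTU (made up) */
        unsigned int pkt_off        = min_u(max_pkt_offset, mtu);

        printf("fw_mtu=%u pkt_off=%u -> %s\n", fw_mtu, pkt_off,
               fw_mtu < pkt_off ? "rejected" : "allowed");
        return 0;
}
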
index d29fe56..c910fa2 100644 (file)
@@ -1015,9 +1015,6 @@ static int lpc_eth_close(struct net_device *ndev)
        napi_disable(&pldat->napi);
        netif_stop_queue(ndev);
 
-       if (ndev->phydev)
-               phy_stop(ndev->phydev);
-
        spin_lock_irqsave(&pldat->lock, flags);
        __lpc_eth_reset(pldat);
        netif_carrier_off(ndev);
@@ -1025,6 +1022,8 @@ static int lpc_eth_close(struct net_device *ndev)
        writel(0, LPC_ENET_MAC2(pldat->net_base));
        spin_unlock_irqrestore(&pldat->lock, flags);
 
+       if (ndev->phydev)
+               phy_stop(ndev->phydev);
        clk_disable_unprepare(pldat->clk);
 
        return 0;
index 46a6ff9..2918947 100644 (file)
@@ -157,6 +157,7 @@ static const struct pci_device_id rtl8169_pci_tbl[] = {
        { PCI_VDEVICE(REALTEK,  0x8129) },
        { PCI_VDEVICE(REALTEK,  0x8136), RTL_CFG_NO_GBIT },
        { PCI_VDEVICE(REALTEK,  0x8161) },
+       { PCI_VDEVICE(REALTEK,  0x8162) },
        { PCI_VDEVICE(REALTEK,  0x8167) },
        { PCI_VDEVICE(REALTEK,  0x8168) },
        { PCI_VDEVICE(NCUBE,    0x8168) },
index f124a8a..a3bfb15 100644 (file)
@@ -243,62 +243,10 @@ static void phy_sanitize_settings(struct phy_device *phydev)
        }
 }
 
-int phy_ethtool_ksettings_set(struct phy_device *phydev,
-                             const struct ethtool_link_ksettings *cmd)
-{
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
-       u8 autoneg = cmd->base.autoneg;
-       u8 duplex = cmd->base.duplex;
-       u32 speed = cmd->base.speed;
-
-       if (cmd->base.phy_address != phydev->mdio.addr)
-               return -EINVAL;
-
-       linkmode_copy(advertising, cmd->link_modes.advertising);
-
-       /* We make sure that we don't pass unsupported values in to the PHY */
-       linkmode_and(advertising, advertising, phydev->supported);
-
-       /* Verify the settings we care about. */
-       if (autoneg != AUTONEG_ENABLE && autoneg != AUTONEG_DISABLE)
-               return -EINVAL;
-
-       if (autoneg == AUTONEG_ENABLE && linkmode_empty(advertising))
-               return -EINVAL;
-
-       if (autoneg == AUTONEG_DISABLE &&
-           ((speed != SPEED_1000 &&
-             speed != SPEED_100 &&
-             speed != SPEED_10) ||
-            (duplex != DUPLEX_HALF &&
-             duplex != DUPLEX_FULL)))
-               return -EINVAL;
-
-       phydev->autoneg = autoneg;
-
-       if (autoneg == AUTONEG_DISABLE) {
-               phydev->speed = speed;
-               phydev->duplex = duplex;
-       }
-
-       linkmode_copy(phydev->advertising, advertising);
-
-       linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
-                        phydev->advertising, autoneg == AUTONEG_ENABLE);
-
-       phydev->master_slave_set = cmd->base.master_slave_cfg;
-       phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl;
-
-       /* Restart the PHY */
-       phy_start_aneg(phydev);
-
-       return 0;
-}
-EXPORT_SYMBOL(phy_ethtool_ksettings_set);
-
 void phy_ethtool_ksettings_get(struct phy_device *phydev,
                               struct ethtool_link_ksettings *cmd)
 {
+       mutex_lock(&phydev->lock);
        linkmode_copy(cmd->link_modes.supported, phydev->supported);
        linkmode_copy(cmd->link_modes.advertising, phydev->advertising);
        linkmode_copy(cmd->link_modes.lp_advertising, phydev->lp_advertising);
@@ -317,6 +265,7 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev,
        cmd->base.autoneg = phydev->autoneg;
        cmd->base.eth_tp_mdix_ctrl = phydev->mdix_ctrl;
        cmd->base.eth_tp_mdix = phydev->mdix;
+       mutex_unlock(&phydev->lock);
 }
 EXPORT_SYMBOL(phy_ethtool_ksettings_get);
 
@@ -751,7 +700,7 @@ static int phy_check_link_status(struct phy_device *phydev)
 }
 
 /**
- * phy_start_aneg - start auto-negotiation for this PHY device
+ * _phy_start_aneg - start auto-negotiation for this PHY device
  * @phydev: the phy_device struct
  *
  * Description: Sanitizes the settings (if we're not autonegotiating
@@ -759,25 +708,43 @@ static int phy_check_link_status(struct phy_device *phydev)
  *   If the PHYCONTROL Layer is operating, we change the state to
  *   reflect the beginning of Auto-negotiation or forcing.
  */
-int phy_start_aneg(struct phy_device *phydev)
+static int _phy_start_aneg(struct phy_device *phydev)
 {
        int err;
 
+       lockdep_assert_held(&phydev->lock);
+
        if (!phydev->drv)
                return -EIO;
 
-       mutex_lock(&phydev->lock);
-
        if (AUTONEG_DISABLE == phydev->autoneg)
                phy_sanitize_settings(phydev);
 
        err = phy_config_aneg(phydev);
        if (err < 0)
-               goto out_unlock;
+               return err;
 
        if (phy_is_started(phydev))
                err = phy_check_link_status(phydev);
-out_unlock:
+
+       return err;
+}
+
+/**
+ * phy_start_aneg - start auto-negotiation for this PHY device
+ * @phydev: the phy_device struct
+ *
+ * Description: Sanitizes the settings (if we're not autonegotiating
+ *   them), and then calls the driver's config_aneg function.
+ *   If the PHYCONTROL Layer is operating, we change the state to
+ *   reflect the beginning of Auto-negotiation or forcing.
+ */
+int phy_start_aneg(struct phy_device *phydev)
+{
+       int err;
+
+       mutex_lock(&phydev->lock);
+       err = _phy_start_aneg(phydev);
        mutex_unlock(&phydev->lock);
 
        return err;
@@ -800,6 +767,61 @@ static int phy_poll_aneg_done(struct phy_device *phydev)
        return ret < 0 ? ret : 0;
 }
 
+int phy_ethtool_ksettings_set(struct phy_device *phydev,
+                             const struct ethtool_link_ksettings *cmd)
+{
+       __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
+       u8 autoneg = cmd->base.autoneg;
+       u8 duplex = cmd->base.duplex;
+       u32 speed = cmd->base.speed;
+
+       if (cmd->base.phy_address != phydev->mdio.addr)
+               return -EINVAL;
+
+       linkmode_copy(advertising, cmd->link_modes.advertising);
+
+       /* We make sure that we don't pass unsupported values in to the PHY */
+       linkmode_and(advertising, advertising, phydev->supported);
+
+       /* Verify the settings we care about. */
+       if (autoneg != AUTONEG_ENABLE && autoneg != AUTONEG_DISABLE)
+               return -EINVAL;
+
+       if (autoneg == AUTONEG_ENABLE && linkmode_empty(advertising))
+               return -EINVAL;
+
+       if (autoneg == AUTONEG_DISABLE &&
+           ((speed != SPEED_1000 &&
+             speed != SPEED_100 &&
+             speed != SPEED_10) ||
+            (duplex != DUPLEX_HALF &&
+             duplex != DUPLEX_FULL)))
+               return -EINVAL;
+
+       mutex_lock(&phydev->lock);
+       phydev->autoneg = autoneg;
+
+       if (autoneg == AUTONEG_DISABLE) {
+               phydev->speed = speed;
+               phydev->duplex = duplex;
+       }
+
+       linkmode_copy(phydev->advertising, advertising);
+
+       linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+                        phydev->advertising, autoneg == AUTONEG_ENABLE);
+
+       phydev->master_slave_set = cmd->base.master_slave_cfg;
+       phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl;
+
+       /* Restart the PHY */
+       _phy_start_aneg(phydev);
+
+       mutex_unlock(&phydev->lock);
+       return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_ksettings_set);
+
 /**
  * phy_speed_down - set speed to lowest speed supported by both link partners
  * @phydev: the phy_device struct
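
The phy_start_aneg() split above is the usual locked-wrapper/unlocked-worker refactor: the leading-underscore worker asserts the lock and does the work, the public entry point only takes and releases the mutex, and paths that already hold the lock (here phy_ethtool_ksettings_set()) call the worker directly. The shape of the pattern as a kernel-side sketch with placeholder names:

#include <linux/lockdep.h>
#include <linux/mutex.h>

struct widget {
        struct mutex lock;
        int state;
};

/* Worker: callers must already hold widget->lock. */
static int _widget_update(struct widget *w, int state)
{
        lockdep_assert_held(&w->lock);
        w->state = state;
        return 0;
}

/* Public entry point: takes the lock, then delegates to the worker. */
static int widget_update(struct widget *w, int state)
{
        int err;

        mutex_lock(&w->lock);
        err = _widget_update(w, state);
        mutex_unlock(&w->lock);
        return err;
}

In the phy code this lets ksettings_set update speed, duplex and advertising and restart autonegotiation under a single mutex hold, without recursively taking the lock.
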
index 793f8fb..63cd72c 100644 (file)
@@ -4122,6 +4122,12 @@ static int lan78xx_probe(struct usb_interface *intf,
 
        dev->maxpacket = usb_maxpacket(dev->udev, dev->pipe_out, 1);
 
+       /* Reject broken descriptors. */
+       if (dev->maxpacket == 0) {
+               ret = -ENODEV;
+               goto out4;
+       }
+
        /* driver requires remote-wakeup capability during autosuspend. */
        intf->needs_remote_wakeup = 1;
 
index 80432ee..a33d7fb 100644 (file)
@@ -1790,6 +1790,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
        dev->maxpacket = usb_maxpacket (dev->udev, dev->out, 1);
        if (dev->maxpacket == 0) {
                /* that is a broken device */
+               status = -ENODEV;
                goto out4;
        }
 
index 142f706..8799854 100644 (file)
@@ -3833,7 +3833,6 @@ vmxnet3_suspend(struct device *device)
        vmxnet3_free_intr_resources(adapter);
 
        netif_device_detach(netdev);
-       netif_tx_stop_all_queues(netdev);
 
        /* Create wake-up filters. */
        pmConf = adapter->pm_conf;
index e31b984..fc41ba9 100644 (file)
@@ -1730,6 +1730,10 @@ static int netfront_resume(struct xenbus_device *dev)
 
        dev_dbg(&dev->dev, "%s\n", dev->nodename);
 
+       netif_tx_lock_bh(info->netdev);
+       netif_device_detach(info->netdev);
+       netif_tx_unlock_bh(info->netdev);
+
        xennet_disconnect_backend(info);
        return 0;
 }
@@ -2349,6 +2353,10 @@ static int xennet_connect(struct net_device *dev)
         * domain a kick because we've probably just requeued some
         * packets.
         */
+       netif_tx_lock_bh(np->netdev);
+       netif_device_attach(np->netdev);
+       netif_tx_unlock_bh(np->netdev);
+
        netif_carrier_on(np->netdev);
        for (j = 0; j < num_queues; ++j) {
                queue = &np->queues[j];
index 517376c..16ceb76 100644 (file)
@@ -1006,11 +1006,11 @@ static u64 port100_get_command_type_mask(struct port100 *dev)
 
        skb = port100_alloc_skb(dev, 0);
        if (!skb)
-               return -ENOMEM;
+               return 0;
 
        resp = port100_send_cmd_sync(dev, PORT100_CMD_GET_COMMAND_TYPE, skb);
        if (IS_ERR(resp))
-               return PTR_ERR(resp);
+               return 0;
 
        if (resp->len < 8)
                mask = 0;
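
The port100 change above matters because port100_get_command_type_mask() returns a u64 bitmask, not an errno; the old 'return -ENOMEM' and 'return PTR_ERR(resp)' paths handed callers a huge, mostly-all-ones "mask", whereas returning 0 lets the failure read as "no supported command types". A runnable illustration of what the implicit conversion did:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* What the old 'return -ENOMEM;' looked like once stored in a u64 mask. */
        uint64_t mask = (uint64_t)-ENOMEM;

        printf("-ENOMEM seen as a u64 mask: 0x%016llx\n",
               (unsigned long long)mask);          /* 0xfffffffffffffff4 on Linux */
        printf("0 (the fixed return) reports no supported command types\n");
        return 0;
}
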
index 088d3dd..b6c6866 100644 (file)
@@ -162,7 +162,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
        return err;
 }
 
-static blk_qc_t nd_blk_submit_bio(struct bio *bio)
+static void nd_blk_submit_bio(struct bio *bio)
 {
        struct bio_integrity_payload *bip;
        struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data;
@@ -173,7 +173,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio)
        bool do_acct;
 
        if (!bio_integrity_prep(bio))
-               return BLK_QC_T_NONE;
+               return;
 
        bip = bio_integrity(bio);
        rw = bio_data_dir(bio);
@@ -199,7 +199,6 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio)
                bio_end_io_acct(bio, start);
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
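
This hunk, and the matching btt and pmem hunks below, are part of the 5.16 block-layer rework in which ->submit_bio returns void instead of a blk_qc_t; the polling cookie now travels in the bio itself (the nvme multipath hunk further down clears bio->bi_cookie for the same reason). A minimal sketch of a bio-based driver method under the new signature, with placeholder names and a trivial pretend handler:

#include <linux/blkdev.h>
#include <linux/module.h>

struct example_dev {
        /* real driver state would live here */
        int dummy;
};

/* Pretend service routine; a real driver would queue or complete I/O here. */
static bool example_handle(struct example_dev *dev, struct bio *bio)
{
        return true;
}

/* 5.16+: no blk_qc_t return value; just end the bio when done. */
static void example_submit_bio(struct bio *bio)
{
        struct example_dev *dev = bio->bi_bdev->bd_disk->private_data;

        if (!example_handle(dev, bio)) {
                bio_io_error(bio);
                return;
        }
        bio_endio(bio);
}

static const struct block_device_operations example_fops = {
        .owner          = THIS_MODULE,
        .submit_bio     = example_submit_bio,
};
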
index 92dec49..4295fa8 100644 (file)
@@ -1440,7 +1440,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
        return ret;
 }
 
-static blk_qc_t btt_submit_bio(struct bio *bio)
+static void btt_submit_bio(struct bio *bio)
 {
        struct bio_integrity_payload *bip = bio_integrity(bio);
        struct btt *btt = bio->bi_bdev->bd_disk->private_data;
@@ -1451,7 +1451,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio)
        bool do_acct;
 
        if (!bio_integrity_prep(bio))
-               return BLK_QC_T_NONE;
+               return;
 
        do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
        if (do_acct)
@@ -1483,7 +1483,6 @@ static blk_qc_t btt_submit_bio(struct bio *bio)
                bio_end_io_acct(bio, start);
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int btt_rw_page(struct block_device *bdev, sector_t sector,
index 7de592d..6a45fa9 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/device.h>
 #include <linux/ctype.h>
 #include <linux/ndctl.h>
index ef4950f..c74d7bc 100644 (file)
@@ -190,7 +190,7 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem,
        return rc;
 }
 
-static blk_qc_t pmem_submit_bio(struct bio *bio)
+static void pmem_submit_bio(struct bio *bio)
 {
        int ret = 0;
        blk_status_t rc = 0;
@@ -229,7 +229,6 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
                bio->bi_status = errno_to_blk_status(ret);
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
@@ -333,26 +332,6 @@ static const struct attribute_group *pmem_attribute_groups[] = {
        NULL,
 };
 
-static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
-{
-       struct pmem_device *pmem = pgmap->owner;
-
-       blk_cleanup_disk(pmem->disk);
-}
-
-static void pmem_release_queue(void *pgmap)
-{
-       pmem_pagemap_cleanup(pgmap);
-}
-
-static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
-{
-       struct request_queue *q =
-               container_of(pgmap->ref, struct request_queue, q_usage_counter);
-
-       blk_freeze_queue_start(q);
-}
-
 static void pmem_release_disk(void *__pmem)
 {
        struct pmem_device *pmem = __pmem;
@@ -360,12 +339,9 @@ static void pmem_release_disk(void *__pmem)
        kill_dax(pmem->dax_dev);
        put_dax(pmem->dax_dev);
        del_gendisk(pmem->disk);
-}
 
-static const struct dev_pagemap_ops fsdax_pagemap_ops = {
-       .kill                   = pmem_pagemap_kill,
-       .cleanup                = pmem_pagemap_cleanup,
-};
+       blk_cleanup_disk(pmem->disk);
+}
 
 static int pmem_attach_disk(struct device *dev,
                struct nd_namespace_common *ndns)
@@ -427,10 +403,8 @@ static int pmem_attach_disk(struct device *dev,
        pmem->disk = disk;
        pmem->pgmap.owner = pmem;
        pmem->pfn_flags = PFN_DEV;
-       pmem->pgmap.ref = &q->q_usage_counter;
        if (is_nd_pfn(dev)) {
                pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
-               pmem->pgmap.ops = &fsdax_pagemap_ops;
                addr = devm_memremap_pages(dev, &pmem->pgmap);
                pfn_sb = nd_pfn->pfn_sb;
                pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -444,16 +418,12 @@ static int pmem_attach_disk(struct device *dev,
                pmem->pgmap.range.end = res->end;
                pmem->pgmap.nr_range = 1;
                pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
-               pmem->pgmap.ops = &fsdax_pagemap_ops;
                addr = devm_memremap_pages(dev, &pmem->pgmap);
                pmem->pfn_flags |= PFN_MAP;
                bb_range = pmem->pgmap.range;
        } else {
                addr = devm_memremap(dev, pmem->phys_addr,
                                pmem->size, ARCH_MEMREMAP_PMEM);
-               if (devm_add_action_or_reset(dev, pmem_release_queue,
-                                       &pmem->pgmap))
-                       return -ENOMEM;
                bb_range.start =  res->start;
                bb_range.end = res->end;
        }
index f8dd664..838b5e2 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-integrity.h>
 #include <linux/compat.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
@@ -118,25 +119,6 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
                                   struct nvme_command *cmd);
 
-/*
- * Prepare a queue for teardown.
- *
- * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
- * the capacity to 0 after that to avoid blocking dispatchers that may be
- * holding bd_mutex.  This will end buffered writers dirtying pages that can't
- * be synced.
- */
-static void nvme_set_queue_dying(struct nvme_ns *ns)
-{
-       if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
-               return;
-
-       blk_set_queue_dying(ns->queue);
-       blk_mq_unquiesce_queue(ns->queue);
-
-       set_capacity_and_notify(ns->disk, 0);
-}
-
 void nvme_queue_scan(struct nvme_ctrl *ctrl)
 {
        /*
@@ -221,7 +203,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 {
        dev_info(ctrl->device,
-                "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
+                "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
 
        flush_work(&ctrl->reset_work);
        nvme_stop_ctrl(ctrl);
@@ -345,15 +327,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
        return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-       blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
        if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
            req_op(req) == REQ_OP_ZONE_APPEND)
                req->__sector = nvme_lba_to_sect(req->q->queuedata,
                        le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+       blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+       nvme_end_req_zoned(req);
        nvme_trace_bio_complete(req);
        blk_mq_end_request(req, status);
 }
@@ -380,6 +366,13 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch_req(struct request *req)
+{
+       nvme_cleanup_cmd(req);
+       nvme_end_req_zoned(req);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
@@ -631,7 +624,7 @@ static inline void nvme_init_request(struct request *req,
 
        req->cmd_flags |= REQ_FAILFAST_DRIVER;
        if (req->mq_hctx->type == HCTX_TYPE_POLL)
-               req->cmd_flags |= REQ_HIPRI;
+               req->cmd_flags |= REQ_POLLED;
        nvme_clear_nvme_request(req);
        memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
 }
@@ -822,6 +815,7 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 static inline void nvme_setup_flush(struct nvme_ns *ns,
                struct nvme_command *cmnd)
 {
+       memset(cmnd, 0, sizeof(*cmnd));
        cmnd->common.opcode = nvme_cmd_flush;
        cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 }
@@ -873,6 +867,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
                return BLK_STS_IOERR;
        }
 
+       memset(cmnd, 0, sizeof(*cmnd));
        cmnd->dsm.opcode = nvme_cmd_dsm;
        cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
        cmnd->dsm.nr = cpu_to_le32(segments - 1);
@@ -889,6 +884,8 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
                struct request *req, struct nvme_command *cmnd)
 {
+       memset(cmnd, 0, sizeof(*cmnd));
+
        if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
                return nvme_setup_discard(ns, req, cmnd);
 
@@ -922,9 +919,15 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
                dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
        cmnd->rw.opcode = op;
+       cmnd->rw.flags = 0;
        cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
+       cmnd->rw.rsvd2 = 0;
+       cmnd->rw.metadata = 0;
        cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
        cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+       cmnd->rw.reftag = 0;
+       cmnd->rw.apptag = 0;
+       cmnd->rw.appmask = 0;
 
        if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
                nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
@@ -981,10 +984,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
        struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
        blk_status_t ret = BLK_STS_OK;
 
-       if (!(req->rq_flags & RQF_DONTPREP)) {
+       if (!(req->rq_flags & RQF_DONTPREP))
                nvme_clear_nvme_request(req);
-               memset(cmd, 0, sizeof(*cmd));
-       }
 
        switch (req_op(req)) {
        case REQ_OP_DRV_IN:
@@ -2600,6 +2601,24 @@ static ssize_t nvme_subsys_show_nqn(struct device *dev,
 }
 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
 
+static ssize_t nvme_subsys_show_type(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *buf)
+{
+       struct nvme_subsystem *subsys =
+               container_of(dev, struct nvme_subsystem, dev);
+
+       switch (subsys->subtype) {
+       case NVME_NQN_DISC:
+               return sysfs_emit(buf, "discovery\n");
+       case NVME_NQN_NVME:
+               return sysfs_emit(buf, "nvm\n");
+       default:
+               return sysfs_emit(buf, "reserved\n");
+       }
+}
+static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);
+
 #define nvme_subsys_show_str_function(field)                           \
 static ssize_t subsys_##field##_show(struct device *dev,               \
                            struct device_attribute *attr, char *buf)   \
@@ -2620,6 +2639,7 @@ static struct attribute *nvme_subsys_attrs[] = {
        &subsys_attr_serial.attr,
        &subsys_attr_firmware_rev.attr,
        &subsys_attr_subsysnqn.attr,
+       &subsys_attr_subsystype.attr,
 #ifdef CONFIG_NVME_MULTIPATH
        &subsys_attr_iopolicy.attr,
 #endif
@@ -2690,6 +2710,21 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
        memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
        subsys->vendor_id = le16_to_cpu(id->vid);
        subsys->cmic = id->cmic;
+
+       /* Versions prior to 1.4 don't necessarily report a valid type */
+       if (id->cntrltype == NVME_CTRL_DISC ||
+           !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
+               subsys->subtype = NVME_NQN_DISC;
+       else
+               subsys->subtype = NVME_NQN_NVME;
+
+       if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
+               dev_err(ctrl->device,
+                       "Subsystem %s is not a discovery controller",
+                       subsys->subnqn);
+               kfree(subsys);
+               return -EINVAL;
+       }
        subsys->awupf = le16_to_cpu(id->awupf);
 #ifdef CONFIG_NVME_MULTIPATH
        subsys->iopolicy = NVME_IOPOLICY_NUMA;
@@ -4473,6 +4508,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
+static void nvme_start_ns_queue(struct nvme_ns *ns)
+{
+       if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
+               blk_mq_unquiesce_queue(ns->queue);
+}
+
+static void nvme_stop_ns_queue(struct nvme_ns *ns)
+{
+       if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
+               blk_mq_quiesce_queue(ns->queue);
+}
+
+/*
+ * Prepare a queue for teardown.
+ *
+ * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
+ * the capacity to 0 after that to avoid blocking dispatchers that may be
+ * holding bd_mutex.  This will end buffered writers dirtying pages that can't
+ * be synced.
+ */
+static void nvme_set_queue_dying(struct nvme_ns *ns)
+{
+       if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+               return;
+
+       blk_set_queue_dying(ns->queue);
+       nvme_start_ns_queue(ns);
+
+       set_capacity_and_notify(ns->disk, 0);
+}
+
 /**
  * nvme_kill_queues(): Ends all namespace queues
  * @ctrl: the dead controller that needs to end
@@ -4488,7 +4554,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 
        /* Forcibly unquiesce queues to avoid blocking dispatch */
        if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
-               blk_mq_unquiesce_queue(ctrl->admin_q);
+               nvme_start_admin_queue(ctrl);
 
        list_for_each_entry(ns, &ctrl->namespaces, list)
                nvme_set_queue_dying(ns);
@@ -4551,7 +4617,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_quiesce_queue(ns->queue);
+               nvme_stop_ns_queue(ns);
        up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
@@ -4562,11 +4628,25 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_unquiesce_queue(ns->queue);
+               nvme_start_ns_queue(ns);
        up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
+{
+       if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+               blk_mq_quiesce_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
+
+void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
+{
+       if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+               blk_mq_unquiesce_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
+
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns;
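
The new NVME_NS_STOPPED and NVME_CTRL_ADMIN_Q_STOPPED bits make the stop/start helpers above idempotent, so however many times the teardown and recovery paths call them, each queue sees exactly one paired blk_mq_quiesce_queue()/blk_mq_unquiesce_queue(). The guard idiom on its own, with placeholder names (only the bitops and blk-mq calls are real kernel APIs):

#include <linux/bitops.h>
#include <linux/blk-mq.h>

#define EXAMPLE_Q_STOPPED       0       /* hypothetical flag bit */

struct example_dev {
        unsigned long flags;
        struct request_queue *queue;
};

/* Quiesce only on the first stop request; later calls are no-ops. */
static void example_stop_queue(struct example_dev *dev)
{
        if (!test_and_set_bit(EXAMPLE_Q_STOPPED, &dev->flags))
                blk_mq_quiesce_queue(dev->queue);
}

/* Unquiesce only if a stop is actually outstanding. */
static void example_start_queue(struct example_dev *dev)
{
        if (test_and_clear_bit(EXAMPLE_Q_STOPPED, &dev->flags))
                blk_mq_unquiesce_queue(dev->queue);
}
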
index 668c6bb..c5a2b71 100644 (file)
@@ -548,6 +548,7 @@ static const match_table_t opt_tokens = {
        { NVMF_OPT_NR_POLL_QUEUES,      "nr_poll_queues=%d"     },
        { NVMF_OPT_TOS,                 "tos=%d"                },
        { NVMF_OPT_FAIL_FAST_TMO,       "fast_io_fail_tmo=%d"   },
+       { NVMF_OPT_DISCOVERY,           "discovery"             },
        { NVMF_OPT_ERR,                 NULL                    }
 };
 
@@ -823,6 +824,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
                        }
                        opts->tos = token;
                        break;
+               case NVMF_OPT_DISCOVERY:
+                       opts->discovery_nqn = true;
+                       break;
                default:
                        pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
                                p);
@@ -949,7 +953,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
 #define NVMF_ALLOWED_OPTS      (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
                                 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
                                 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
-                                NVMF_OPT_DISABLE_SQFLOW |\
+                                NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
                                 NVMF_OPT_FAIL_FAST_TMO)
 
 static struct nvme_ctrl *
index a146cb9..c3203ff 100644 (file)
@@ -67,6 +67,7 @@ enum {
        NVMF_OPT_TOS            = 1 << 19,
        NVMF_OPT_FAIL_FAST_TMO  = 1 << 20,
        NVMF_OPT_HOST_IFACE     = 1 << 21,
+       NVMF_OPT_DISCOVERY      = 1 << 22,
 };
 
 /**
@@ -178,6 +179,13 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
        return true;
 }
 
+static inline char *nvmf_ctrl_subsysnqn(struct nvme_ctrl *ctrl)
+{
+       if (!ctrl->subsys)
+               return ctrl->opts->subsysnqn;
+       return ctrl->subsys->subnqn;
+}
+
 int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
 int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
 int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
index aa14ad9..71b3108 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/nvme-fc.h>
 #include "fc.h"
 #include <scsi/scsi_transport_fc.h>
+#include <linux/blk-mq-pci.h>
 
 /* *************************** Data Structures/Defines ****************** */
 
@@ -2382,7 +2383,7 @@ nvme_fc_ctrl_free(struct kref *ref)
        list_del(&ctrl->ctrl_list);
        spin_unlock_irqrestore(&ctrl->rport->lock, flags);
 
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
        blk_cleanup_queue(ctrl->ctrl.admin_q);
        blk_cleanup_queue(ctrl->ctrl.fabrics_q);
        blk_mq_free_tag_set(&ctrl->admin_tag_set);
@@ -2510,7 +2511,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
        /*
         * clean up the admin queue. Same thing as above.
         */
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        blk_sync_queue(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_fc_terminate_exchange, &ctrl->ctrl);
@@ -2841,6 +2842,28 @@ nvme_fc_complete_rq(struct request *rq)
        nvme_fc_ctrl_put(ctrl);
 }
 
+static int nvme_fc_map_queues(struct blk_mq_tag_set *set)
+{
+       struct nvme_fc_ctrl *ctrl = set->driver_data;
+       int i;
+
+       for (i = 0; i < set->nr_maps; i++) {
+               struct blk_mq_queue_map *map = &set->map[i];
+
+               if (!map->nr_queues) {
+                       WARN_ON(i == HCTX_TYPE_DEFAULT);
+                       continue;
+               }
+
+               /* Call LLDD map queue functionality if defined */
+               if (ctrl->lport->ops->map_queues)
+                       ctrl->lport->ops->map_queues(&ctrl->lport->localport,
+                                                    map);
+               else
+                       blk_mq_map_queues(map);
+       }
+       return 0;
+}
 
 static const struct blk_mq_ops nvme_fc_mq_ops = {
        .queue_rq       = nvme_fc_queue_rq,
@@ -2849,6 +2872,7 @@ static const struct blk_mq_ops nvme_fc_mq_ops = {
        .exit_request   = nvme_fc_exit_request,
        .init_hctx      = nvme_fc_init_hctx,
        .timeout        = nvme_fc_timeout,
+       .map_queues     = nvme_fc_map_queues,
 };
 
 static int
@@ -3095,7 +3119,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
        ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
                                                (ilog2(SZ_4K) - 9);
 
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        ret = nvme_init_ctrl_finish(&ctrl->ctrl);
        if (ret || test_bit(ASSOC_FAILED, &ctrl->flags))
@@ -3249,7 +3273,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
        nvme_fc_free_queue(&ctrl->queues[0]);
 
        /* re-enable the admin_q so anything new can fast fail */
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        /* resume the io queues so that things will fast fail */
        nvme_start_queues(&ctrl->ctrl);
@@ -3572,7 +3596,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
        dev_info(ctrl->ctrl.device,
                "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
-               ctrl->cnum, ctrl->ctrl.opts->subsysnqn);
+               ctrl->cnum, nvmf_ctrl_subsysnqn(&ctrl->ctrl));
 
        return &ctrl->ctrl;
 
index fba0661..7f2071f 100644 (file)
@@ -85,8 +85,13 @@ void nvme_failover_req(struct request *req)
        }
 
        spin_lock_irqsave(&ns->head->requeue_lock, flags);
-       for (bio = req->bio; bio; bio = bio->bi_next)
+       for (bio = req->bio; bio; bio = bio->bi_next) {
                bio_set_dev(bio, ns->head->disk->part0);
+               if (bio->bi_opf & REQ_POLLED) {
+                       bio->bi_opf &= ~REQ_POLLED;
+                       bio->bi_cookie = BLK_QC_T_NONE;
+               }
+       }
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 
@@ -100,8 +105,11 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
-               if (ns->head->disk)
-                       kblockd_schedule_work(&ns->head->requeue_work);
+               if (!ns->head->disk)
+                       continue;
+               kblockd_schedule_work(&ns->head->requeue_work);
+               if (ctrl->state == NVME_CTRL_LIVE)
+                       disk_uevent(ns->head->disk, KOBJ_CHANGE);
        }
        up_read(&ctrl->namespaces_rwsem);
 }
@@ -138,13 +146,12 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns;
 
-       mutex_lock(&ctrl->scan_lock);
        down_read(&ctrl->namespaces_rwsem);
-       list_for_each_entry(ns, &ctrl->namespaces, list)
-               if (nvme_mpath_clear_current_path(ns))
-                       kblockd_schedule_work(&ns->head->requeue_work);
+       list_for_each_entry(ns, &ctrl->namespaces, list) {
+               nvme_mpath_clear_current_path(ns);
+               kblockd_schedule_work(&ns->head->requeue_work);
+       }
        up_read(&ctrl->namespaces_rwsem);
-       mutex_unlock(&ctrl->scan_lock);
 }
 
 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -312,12 +319,11 @@ static bool nvme_available_path(struct nvme_ns_head *head)
        return false;
 }
 
-static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
+static void nvme_ns_head_submit_bio(struct bio *bio)
 {
        struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
        struct device *dev = disk_to_dev(head->disk);
        struct nvme_ns *ns;
-       blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;
 
        /*
@@ -334,7 +340,7 @@ static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
                bio->bi_opf |= REQ_NVME_MPATH;
                trace_block_bio_remap(bio, disk_devt(ns->head->disk),
                                      bio->bi_iter.bi_sector);
-               ret = submit_bio_noacct(bio);
+               submit_bio_noacct(bio);
        } else if (nvme_available_path(head)) {
                dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 
@@ -349,7 +355,6 @@ static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
        }
 
        srcu_read_unlock(&head->srcu, srcu_idx);
-       return ret;
 }
 
 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
@@ -479,6 +484,15 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 
        blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
        blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
+       /*
+        * This assumes all controllers that refer to a namespace either
+        * support poll queues or not.  That is not a strict guarantee,
+        * but if the assumption is wrong the effect is only suboptimal
+        * performance, not a correctness problem.
+        */
+       if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
+           ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
+               blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
 
        /* set to a default value of 512 until the disk is validated */
        blk_queue_logical_block_size(head->disk->queue, 512);
@@ -494,13 +508,23 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 static void nvme_mpath_set_live(struct nvme_ns *ns)
 {
        struct nvme_ns_head *head = ns->head;
+       int rc;
 
        if (!head->disk)
                return;
 
+       /*
+        * test_and_set_bit() is used because it is protecting against two nvme
+        * paths simultaneously calling device_add_disk() on the same namespace
+        * head.
+        */
        if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
-               device_add_disk(&head->subsys->dev, head->disk,
-                               nvme_ns_id_attr_groups);
+               rc = device_add_disk(&head->subsys->dev, head->disk,
+                                    nvme_ns_id_attr_groups);
+               if (rc) {
+                       clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
+                       return;
+               }
                nvme_add_ns_head_cdev(head);
        }
 
@@ -538,7 +562,7 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
                        return -EINVAL;
 
                nr_nsids = le32_to_cpu(desc->nnsids);
-               nsid_buf_size = nr_nsids * sizeof(__le32);
+               nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
 
                if (WARN_ON_ONCE(desc->grpid == 0))
                        return -EINVAL;
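
The nsid_buf_size line above replaces an open-coded nr * sizeof() with flex_array_size(), the helper from <linux/overflow.h> that sizes a flexible-array member and saturates instead of wrapping on multiplication overflow. The idiom in isolation (struct and function names are illustrative):

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/string.h>

struct example_desc {
        u32     nnsids;
        __le32  nsids[];        /* flexible array member */
};

static void *example_copy_nsids(const struct example_desc *desc)
{
        /* Evaluates to SIZE_MAX on overflow rather than a small wrapped value. */
        size_t bytes = flex_array_size(desc, nsids, desc->nnsids);

        return kmemdup(desc->nsids, bytes, GFP_KERNEL);
}
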
index ed79a6c..b334af8 100644 (file)
@@ -342,6 +342,7 @@ struct nvme_ctrl {
        int nr_reconnects;
        unsigned long flags;
 #define NVME_CTRL_FAILFAST_EXPIRED     0
+#define NVME_CTRL_ADMIN_Q_STOPPED      1
        struct nvmf_ctrl_options *opts;
 
        struct page *discard_page;
@@ -372,6 +373,7 @@ struct nvme_subsystem {
        char                    model[40];
        char                    firmware_rev[8];
        u8                      cmic;
+       enum nvme_subsys_type   subtype;
        u16                     vendor_id;
        u16                     awupf;  /* 0's based awupf value. */
        struct ida              ns_ida;
@@ -463,6 +465,7 @@ struct nvme_ns {
 #define NVME_NS_ANA_PENDING    2
 #define NVME_NS_FORCE_RO       3
 #define NVME_NS_READY          4
+#define NVME_NS_STOPPED                5
 
        struct cdev             cdev;
        struct device           cdev_device;
@@ -638,6 +641,20 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch_req(struct request *req);
+
+static __always_inline void nvme_complete_batch(struct io_comp_batch *iob,
+                                               void (*fn)(struct request *rq))
+{
+       struct request *req;
+
+       rq_list_for_each(&iob->req_list, req) {
+               fn(req);
+               nvme_complete_batch_req(req);
+       }
+       blk_mq_end_request_batch(iob);
+}
+
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
@@ -665,6 +682,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_stop_admin_queue(struct nvme_ctrl *ctrl);
+void nvme_start_admin_queue(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
 void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
index 149ecf7..ca2ee80 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-mq-pci.h>
+#include <linux/blk-integrity.h>
 #include <linux/dmi.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
@@ -244,8 +245,15 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
 {
        unsigned int mem_size = nvme_dbbuf_size(dev);
 
-       if (dev->dbbuf_dbs)
+       if (dev->dbbuf_dbs) {
+               /*
+                * Clear the dbbuf memory so the driver doesn't observe stale
+                * values from the previous instantiation.
+                */
+               memset(dev->dbbuf_dbs, 0, mem_size);
+               memset(dev->dbbuf_eis, 0, mem_size);
                return 0;
+       }
 
        dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
                                            &dev->dbbuf_dbs_dma_addr,
@@ -958,7 +966,7 @@ out_free_cmd:
        return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_dev *dev = iod->nvmeq->dev;
@@ -968,9 +976,19 @@ static void nvme_pci_complete_rq(struct request *req)
                               rq_integrity_vec(req)->bv_len, rq_data_dir(req));
        if (blk_rq_nr_phys_segments(req))
                nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+       nvme_pci_unmap_rq(req);
        nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_comp_batch *iob)
+{
+       nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -995,7 +1013,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
        return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+                                  struct io_comp_batch *iob, u16 idx)
 {
        struct nvme_completion *cqe = &nvmeq->cqes[idx];
        __u16 command_id = READ_ONCE(cqe->command_id);
@@ -1022,7 +1041,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
        }
 
        trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-       if (!nvme_try_complete_req(req, cqe->status, cqe->result))
+       if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
+           !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
+                                       nvme_pci_complete_batch))
                nvme_pci_complete_rq(req);
 }
 
@@ -1038,7 +1059,8 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
        }
 }
 
-static inline int nvme_process_cq(struct nvme_queue *nvmeq)
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
+                              struct io_comp_batch *iob)
 {
        int found = 0;
 
@@ -1049,7 +1071,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
                 * the cqe requires a full read memory barrier
                 */
                dma_rmb();
-               nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+               nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
                nvme_update_cq_head(nvmeq);
        }
 
@@ -1061,9 +1083,13 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 static irqreturn_t nvme_irq(int irq, void *data)
 {
        struct nvme_queue *nvmeq = data;
+       DEFINE_IO_COMP_BATCH(iob);
 
-       if (nvme_process_cq(nvmeq))
+       if (nvme_poll_cq(nvmeq, &iob)) {
+               if (!rq_list_empty(iob.req_list))
+                       nvme_pci_complete_batch(&iob);
                return IRQ_HANDLED;
+       }
        return IRQ_NONE;
 }
 
@@ -1087,11 +1113,11 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
        WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
 
        disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
-       nvme_process_cq(nvmeq);
+       nvme_poll_cq(nvmeq, NULL);
        enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct nvme_queue *nvmeq = hctx->driver_data;
        bool found;
@@ -1100,7 +1126,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx)
                return 0;
 
        spin_lock(&nvmeq->cq_poll_lock);
-       found = nvme_process_cq(nvmeq);
+       found = nvme_poll_cq(nvmeq, iob);
        spin_unlock(&nvmeq->cq_poll_lock);
 
        return found;
@@ -1273,7 +1299,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
         * Did we miss an interrupt?
         */
        if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
-               nvme_poll(req->mq_hctx);
+               nvme_poll(req->mq_hctx, NULL);
        else
                nvme_poll_irqdisable(nvmeq);
 
@@ -1395,7 +1421,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 
        nvmeq->dev->online_queues--;
        if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
-               blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
+               nvme_stop_admin_queue(&nvmeq->dev->ctrl);
        if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
                pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
        return 0;
@@ -1433,7 +1459,7 @@ static void nvme_reap_pending_cqes(struct nvme_dev *dev)
 
        for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
                spin_lock(&dev->queues[i].cq_poll_lock);
-               nvme_process_cq(&dev->queues[i]);
+               nvme_poll_cq(&dev->queues[i], NULL);
                spin_unlock(&dev->queues[i].cq_poll_lock);
        }
 }
@@ -1654,7 +1680,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
                 * user requests may be waiting on a stopped queue. Start the
                 * queue to flush these to completion.
                 */
-               blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+               nvme_start_admin_queue(&dev->ctrl);
                blk_cleanup_queue(dev->ctrl.admin_q);
                blk_mq_free_tag_set(&dev->admin_tagset);
        }
@@ -1688,7 +1714,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
                        return -ENODEV;
                }
        } else
-               blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+               nvme_start_admin_queue(&dev->ctrl);
 
        return 0;
 }
@@ -2623,7 +2649,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
        if (shutdown) {
                nvme_start_queues(&dev->ctrl);
                if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
-                       blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+                       nvme_start_admin_queue(&dev->ctrl);
        }
        mutex_unlock(&dev->shutdown_lock);
 }
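
The blk_mq_quiesce_queue()/blk_mq_unquiesce_queue() calls on the admin queue, here and in the fabrics drivers below, are replaced by new nvme core helpers. Judging by the NVME_CTRL_ADMIN_Q_STOPPED flag that the nvme-loop hunk further down clears for a fresh admin queue, the helpers are most likely flag-guarded so repeated stop/start calls do not nest; a plausible sketch (an assumption about the core implementation, not a copy of it):

    void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
    {
            /* quiesce only on the first stop; later calls are no-ops */
            if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
                    blk_mq_quiesce_queue(ctrl->admin_q);
    }

    void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
    {
            /* unquiesce only if a stop is actually pending */
            if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
                    blk_mq_unquiesce_queue(ctrl->admin_q);
    }

Either way, the call sites below are purely mechanical substitutions.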
index 042c594..850f84d 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/atomic.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-mq-rdma.h>
+#include <linux/blk-integrity.h>
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -918,7 +919,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
        else
                ctrl->ctrl.max_integrity_segments = 0;
 
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        error = nvme_init_ctrl_finish(&ctrl->ctrl);
        if (error)
@@ -927,7 +928,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
        return 0;
 
 out_quiesce_queue:
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        blk_sync_queue(ctrl->ctrl.admin_q);
 out_stop_queue:
        nvme_rdma_stop_queue(&ctrl->queues[0]);
@@ -1025,12 +1026,12 @@ out_free_io_queues:
 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
                bool remove)
 {
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        blk_sync_queue(ctrl->ctrl.admin_q);
        nvme_rdma_stop_queue(&ctrl->queues[0]);
        nvme_cancel_admin_tagset(&ctrl->ctrl);
        if (remove)
-               blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+               nvme_start_admin_queue(&ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl, remove);
 }
 
@@ -1095,11 +1096,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
                return ret;
 
        if (ctrl->ctrl.icdoff) {
+               ret = -EOPNOTSUPP;
                dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
                goto destroy_admin;
        }
 
        if (!(ctrl->ctrl.sgls & (1 << 2))) {
+               ret = -EOPNOTSUPP;
                dev_err(ctrl->ctrl.device,
                        "Mandatory keyed sgls are not supported!\n");
                goto destroy_admin;
@@ -1111,6 +1114,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
                        ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
        }
 
+       if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
+               dev_warn(ctrl->ctrl.device,
+                       "ctrl sqsize %u > max queue size %u, clamping down\n",
+                       ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
+               ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
+       }
+
        if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
                dev_warn(ctrl->ctrl.device,
                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
@@ -1153,7 +1163,7 @@ destroy_io:
                nvme_rdma_destroy_io_queues(ctrl, new);
        }
 destroy_admin:
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        blk_sync_queue(ctrl->ctrl.admin_q);
        nvme_rdma_stop_queue(&ctrl->queues[0]);
        nvme_cancel_admin_tagset(&ctrl->ctrl);
@@ -1193,7 +1203,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
        nvme_rdma_teardown_io_queues(ctrl, false);
        nvme_start_queues(&ctrl->ctrl);
        nvme_rdma_teardown_admin_queue(ctrl, false);
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
@@ -2105,7 +2115,7 @@ unmap_qe:
        return ret;
 }
 
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct nvme_rdma_queue *queue = hctx->driver_data;
 
@@ -2231,7 +2241,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
        cancel_delayed_work_sync(&ctrl->reconnect_work);
 
        nvme_rdma_teardown_io_queues(ctrl, shutdown);
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        if (shutdown)
                nvme_shutdown_ctrl(&ctrl->ctrl);
        else
@@ -2385,7 +2395,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                goto out_uninit_ctrl;
 
        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
-               ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+               nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
 
        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
index 3c1c29d..33bc83d 100644 (file)
@@ -926,12 +926,14 @@ static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 {
        struct nvme_tcp_queue *queue = req->queue;
+       int req_data_len = req->data_len;
 
        while (true) {
                struct page *page = nvme_tcp_req_cur_page(req);
                size_t offset = nvme_tcp_req_cur_offset(req);
                size_t len = nvme_tcp_req_cur_length(req);
                bool last = nvme_tcp_pdu_last_send(req, len);
+               int req_data_sent = req->data_sent;
                int ret, flags = MSG_DONTWAIT;
 
                if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
@@ -958,7 +960,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
                 * in the request where we don't want to modify it as we may
                 * compete with the RX path completing the request.
                 */
-               if (req->data_sent + ret < req->data_len)
+               if (req_data_sent + ret < req_data_len)
                        nvme_tcp_advance_req(req, ret);
 
                /* fully successful last send in current PDU */
@@ -1048,10 +1050,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
 {
        struct nvme_tcp_queue *queue = req->queue;
+       size_t offset = req->offset;
        int ret;
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
        struct kvec iov = {
-               .iov_base = &req->ddgst + req->offset,
+               .iov_base = (u8 *)&req->ddgst + req->offset,
                .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
        };
 
@@ -1064,7 +1067,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
        if (unlikely(ret <= 0))
                return ret;
 
-       if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
+       if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
                nvme_tcp_done_send_req(queue);
                return 1;
        }
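
Both kvec fixes in this file (and the matching one in the TCP target later on) are byte-offset bugs: req->ddgst is a __le32, so &req->ddgst + req->offset advances by offset * sizeof(__le32) rather than by offset bytes, pointing past the 4-byte digest whenever a partial sendmsg() forces a resume at a non-zero offset. Casting through u8 * makes the arithmetic byte-wise. A standalone illustration (hypothetical helper, not driver code):

    #include <linux/types.h>

    static void *digest_resume_ptr(__le32 *ddgst, size_t offset)
    {
            /* "ddgst + offset" would step in 4-byte units; this steps in bytes */
            return (u8 *)ddgst + offset;
    }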
@@ -1915,7 +1918,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
        if (error)
                goto out_stop_queue;
 
-       blk_mq_unquiesce_queue(ctrl->admin_q);
+       nvme_start_admin_queue(ctrl);
 
        error = nvme_init_ctrl_finish(ctrl);
        if (error)
@@ -1924,7 +1927,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
        return 0;
 
 out_quiesce_queue:
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        blk_sync_queue(ctrl->admin_q);
 out_stop_queue:
        nvme_tcp_stop_queue(ctrl, 0);
@@ -1946,12 +1949,12 @@ out_free_queue:
 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
                bool remove)
 {
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        blk_sync_queue(ctrl->admin_q);
        nvme_tcp_stop_queue(ctrl, 0);
        nvme_cancel_admin_tagset(ctrl);
        if (remove)
-               blk_mq_unquiesce_queue(ctrl->admin_q);
+               nvme_start_admin_queue(ctrl);
        nvme_tcp_destroy_admin_queue(ctrl, remove);
 }
 
@@ -1960,7 +1963,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
 {
        if (ctrl->queue_count <= 1)
                return;
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        nvme_start_freeze(ctrl);
        nvme_stop_queues(ctrl);
        nvme_sync_io_queues(ctrl);
@@ -2055,7 +2058,7 @@ destroy_io:
                nvme_tcp_destroy_io_queues(ctrl, new);
        }
 destroy_admin:
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        blk_sync_queue(ctrl->admin_q);
        nvme_tcp_stop_queue(ctrl, 0);
        nvme_cancel_admin_tagset(ctrl);
@@ -2098,7 +2101,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
        /* unquiesce to fail fast pending requests */
        nvme_start_queues(ctrl);
        nvme_tcp_teardown_admin_queue(ctrl, false);
-       blk_mq_unquiesce_queue(ctrl->admin_q);
+       nvme_start_admin_queue(ctrl);
 
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
@@ -2116,7 +2119,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
        cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
 
        nvme_tcp_teardown_io_queues(ctrl, shutdown);
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        if (shutdown)
                nvme_shutdown_ctrl(ctrl);
        else
@@ -2429,7 +2432,7 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
        return 0;
 }
 
-static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct nvme_tcp_queue *queue = hctx->driver_data;
        struct sock *sk = queue->sock->sk;
@@ -2582,7 +2585,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
                goto out_uninit_ctrl;
 
        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
-               ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+               nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
 
        mutex_lock(&nvme_tcp_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
index d950104..bfc259e 100644 (file)
@@ -233,6 +233,8 @@ out_free:
 blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
                struct nvme_command *c, enum nvme_zone_mgmt_action action)
 {
+       memset(c, 0, sizeof(*c));
+
        c->zms.opcode = nvme_cmd_zone_mgmt_send;
        c->zms.nsid = cpu_to_le32(ns->head->ns_id);
        c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
index aa6d84d..6fb2474 100644 (file)
@@ -264,7 +264,7 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
        desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
        desc->state = req->port->ana_state[grpid];
        memset(desc->rsvd17, 0, sizeof(desc->rsvd17));
-       return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32);
+       return struct_size(desc, nsids, count);
 }
 
 static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
@@ -278,8 +278,8 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
        u16 status;
 
        status = NVME_SC_INTERNAL;
-       desc = kmalloc(sizeof(struct nvme_ana_group_desc) +
-                       NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL);
+       desc = kmalloc(struct_size(desc, nsids, NVMET_MAX_NAMESPACES),
+                      GFP_KERNEL);
        if (!desc)
                goto out;
 
@@ -374,13 +374,19 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
        id->rab = 6;
 
+       if (nvmet_is_disc_subsys(ctrl->subsys))
+               id->cntrltype = NVME_CTRL_DISC;
+       else
+               id->cntrltype = NVME_CTRL_IO;
+
        /*
         * XXX: figure out how we can assign a IEEE OUI, but until then
         * the safest is to leave it as zeroes.
         */
 
        /* we support multiple ports, multiples hosts and ANA: */
-       id->cmic = (1 << 0) | (1 << 1) | (1 << 3);
+       id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL |
+               NVME_CTRL_CMIC_ANA;
 
        /* Limit MDTS according to transport capability */
        if (ctrl->ops->get_mdts)
@@ -536,7 +542,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
         * Our namespace might always be shared.  Not just with other
         * controllers, but also with any other user of the block device.
         */
-       id->nmic = (1 << 0);
+       id->nmic = NVME_NS_NMIC_SHARED;
        id->anagrpid = cpu_to_le32(req->ns->anagrpid);
 
        memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid));
@@ -1008,7 +1014,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 
        if (nvme_is_fabrics(cmd))
                return nvmet_parse_fabrics_cmd(req);
-       if (nvmet_req_subsys(req)->type == NVME_NQN_DISC)
+       if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
                return nvmet_parse_discovery_cmd(req);
 
        ret = nvmet_check_ctrl_status(req);
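
The ANA log-page changes earlier in this file replace the open-coded "sizeof(struct) + n * sizeof(element)" math with struct_size() from <linux/overflow.h>, which works because the group descriptor ends in a flexible __le32 nsids[] array. Besides being shorter, struct_size() saturates to SIZE_MAX on overflow, so a bogus count cannot yield a short allocation. A hedged, self-contained example with a stand-in struct:

    #include <linux/overflow.h>
    #include <linux/types.h>

    struct foo_desc {
            __le32 grpid;
            __le32 nnsids;
            __le32 nsids[];                 /* flexible array member */
    };

    static size_t foo_desc_bytes(struct foo_desc *desc, u32 count)
    {
            /*
             * Equivalent to sizeof(*desc) + count * sizeof(desc->nsids[0]),
             * but saturating instead of wrapping on overflow, so a later
             * kmalloc() fails rather than allocating too little.
             */
            return struct_size(desc, nsids, count);
    }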
index be5d824..091a0ca 100644 (file)
@@ -1233,6 +1233,44 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item,
 }
 CONFIGFS_ATTR(nvmet_subsys_, attr_model);
 
+static ssize_t nvmet_subsys_attr_discovery_nqn_show(struct config_item *item,
+                       char *page)
+{
+       return snprintf(page, PAGE_SIZE, "%s\n",
+                       nvmet_disc_subsys->subsysnqn);
+}
+
+static ssize_t nvmet_subsys_attr_discovery_nqn_store(struct config_item *item,
+                       const char *page, size_t count)
+{
+       struct nvmet_subsys *subsys = to_subsys(item);
+       char *subsysnqn;
+       int len;
+
+       len = strcspn(page, "\n");
+       if (!len)
+               return -EINVAL;
+
+       subsysnqn = kmemdup_nul(page, len, GFP_KERNEL);
+       if (!subsysnqn)
+               return -ENOMEM;
+
+       /*
+        * The discovery NQN must be different from the subsystem NQN.
+        */
+       if (!strcmp(subsysnqn, subsys->subsysnqn)) {
+               kfree(subsysnqn);
+               return -EBUSY;
+       }
+       down_write(&nvmet_config_sem);
+       kfree(nvmet_disc_subsys->subsysnqn);
+       nvmet_disc_subsys->subsysnqn = subsysnqn;
+       up_write(&nvmet_config_sem);
+
+       return count;
+}
+CONFIGFS_ATTR(nvmet_subsys_, attr_discovery_nqn);
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item,
                                                char *page)
@@ -1262,6 +1300,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
        &nvmet_subsys_attr_attr_cntlid_min,
        &nvmet_subsys_attr_attr_cntlid_max,
        &nvmet_subsys_attr_attr_model,
+       &nvmet_subsys_attr_attr_discovery_nqn,
 #ifdef CONFIG_BLK_DEV_INTEGRITY
        &nvmet_subsys_attr_attr_pi_enable,
 #endif
@@ -1553,6 +1592,8 @@ static void nvmet_port_release(struct config_item *item)
 {
        struct nvmet_port *port = to_nvmet_port(item);
 
+       /* Let in-flight controller teardown complete */
+       flush_scheduled_work();
        list_del(&port->global_entry);
 
        kfree(port->ana_state);
index b8425fa..5119c68 100644 (file)
@@ -1140,7 +1140,7 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
         * should verify iosqes,iocqes are zeroed, however that
         * would break backwards compatibility, so don't enforce it.
         */
-       if (ctrl->subsys->type != NVME_NQN_DISC &&
+       if (!nvmet_is_disc_subsys(ctrl->subsys) &&
            (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
             nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) {
                ctrl->csts = NVME_CSTS_CFS;
@@ -1205,7 +1205,10 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
        /* CC.EN timeout in 500msec units: */
        ctrl->cap |= (15ULL << 24);
        /* maximum queue entries supported: */
-       ctrl->cap |= NVMET_QUEUE_SIZE - 1;
+       if (ctrl->ops->get_max_queue_size)
+               ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1;
+       else
+               ctrl->cap |= NVMET_QUEUE_SIZE - 1;
 
        if (nvmet_is_passthru_subsys(ctrl->subsys))
                nvmet_passthrough_override_cap(ctrl);
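
The new get_max_queue_size() fabrics op feeds directly into the low bits of the controller's CAP property. CAP.MQES (bits 15:0) is zero's based per the NVMe specification, which is why one is subtracted before OR-ing it in; a host reading the property back does the inverse. Reader-side sketch (bit layout per the spec; the helper name is hypothetical, the host headers provide an equivalent NVME_CAP_MQES() accessor):

    #include <linux/types.h>

    static u32 foo_cap_max_queue_entries(u64 cap)
    {
            /* CAP.MQES, bits 15:0, zero's based: value N means N + 1 entries */
            return (u32)(cap & 0xffff) + 1;
    }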
@@ -1278,7 +1281,7 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
        if (subsys->allow_any_host)
                return true;
 
-       if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
+       if (nvmet_is_disc_subsys(subsys)) /* allow all access to disc subsys */
                return true;
 
        list_for_each_entry(p, &subsys->hosts, entry) {
@@ -1367,6 +1370,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
        mutex_init(&ctrl->lock);
 
        ctrl->port = req->port;
+       ctrl->ops = req->ops;
 
        INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
        INIT_LIST_HEAD(&ctrl->async_events);
@@ -1405,13 +1409,11 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
        }
        ctrl->cntlid = ret;
 
-       ctrl->ops = req->ops;
-
        /*
         * Discovery controllers may use some arbitrary high value
         * in order to cleanup stale discovery sessions
         */
-       if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
+       if (nvmet_is_disc_subsys(ctrl->subsys) && !kato)
                kato = NVMET_DISC_KATO_MS;
 
        /* keep-alive timeout in seconds */
@@ -1491,7 +1493,8 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
        if (!port)
                return NULL;
 
-       if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
+       if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn) ||
+           !strcmp(nvmet_disc_subsys->subsysnqn, subsysnqn)) {
                if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
                        return NULL;
                return nvmet_disc_subsys;
@@ -1538,6 +1541,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
                subsys->max_qid = NVMET_NR_QUEUES;
                break;
        case NVME_NQN_DISC:
+       case NVME_NQN_CURR:
                subsys->max_qid = 0;
                break;
        default:
index 7aa62bc..c2162ee 100644 (file)
@@ -146,7 +146,7 @@ static size_t discovery_log_entries(struct nvmet_req *req)
        struct nvmet_ctrl *ctrl = req->sq->ctrl;
        struct nvmet_subsys_link *p;
        struct nvmet_port *r;
-       size_t entries = 0;
+       size_t entries = 1;
 
        list_for_each_entry(p, &req->port->subsystems, entry) {
                if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
@@ -171,6 +171,7 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req)
        u32 numrec = 0;
        u16 status = 0;
        void *buffer;
+       char traddr[NVMF_TRADDR_SIZE];
 
        if (!nvmet_check_transfer_len(req, data_len))
                return;
@@ -203,15 +204,19 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req)
                status = NVME_SC_INTERNAL;
                goto out;
        }
-
        hdr = buffer;
-       list_for_each_entry(p, &req->port->subsystems, entry) {
-               char traddr[NVMF_TRADDR_SIZE];
 
+       nvmet_set_disc_traddr(req, req->port, traddr);
+
+       nvmet_format_discovery_entry(hdr, req->port,
+                                    nvmet_disc_subsys->subsysnqn,
+                                    traddr, NVME_NQN_CURR, numrec);
+       numrec++;
+
+       list_for_each_entry(p, &req->port->subsystems, entry) {
                if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
                        continue;
 
-               nvmet_set_disc_traddr(req, req->port, traddr);
                nvmet_format_discovery_entry(hdr, req->port,
                                p->subsys->subsysnqn, traddr,
                                NVME_NQN_NVME, numrec);
@@ -268,6 +273,8 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req)
        memcpy_and_pad(id->fr, sizeof(id->fr),
                       UTS_RELEASE, strlen(UTS_RELEASE), ' ');
 
+       id->cntrltype = NVME_CTRL_DISC;
+
        /* no limit on data transfer sizes for now */
        id->mdts = 0;
        id->cntlid = cpu_to_le16(ctrl->cntlid);
@@ -387,7 +394,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 int __init nvmet_init_discovery(void)
 {
        nvmet_disc_subsys =
-               nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC);
+               nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_CURR);
        return PTR_ERR_OR_ZERO(nvmet_disc_subsys);
 }
 
index 7d0454c..70fb587 100644 (file)
@@ -221,7 +221,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
                goto out;
        }
 
-       pr_info("creating controller %d for subsystem %s for NQN %s%s.\n",
+       pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n",
+               nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm",
                ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
                ctrl->pi_support ? " T10-PI is enabled" : "");
        req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
index 0fc2781..70ca9df 100644 (file)
@@ -5,6 +5,7 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/module.h>
 #include "nvmet.h"
 
@@ -86,7 +87,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
                ns->bdev = NULL;
                return ret;
        }
-       ns->size = i_size_read(ns->bdev->bd_inode);
+       ns->size = bdev_nr_bytes(ns->bdev);
        ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
 
        ns->pi_type = 0;
@@ -107,7 +108,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 
 void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns)
 {
-       ns->size = i_size_read(ns->bdev->bd_inode);
+       ns->size = bdev_nr_bytes(ns->bdev);
 }
 
 u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
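
Both hunks above swap i_size_read(bdev->bd_inode) for bdev_nr_bytes(), a helper added in this cycle's block updates so drivers no longer reach into bd_inode for the device size. Sketch of the intended usage, assuming the 5.16 helpers (bdev_nr_sectors() counts 512-byte units):

    #include <linux/blkdev.h>

    static loff_t foo_capacity_bytes(struct block_device *bdev)
    {
            /*
             * bdev_nr_bytes() reports the capacity in bytes;
             * bdev_nr_sectors() reports it in 512-byte sectors, so
             * bytes == (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT.
             */
            return bdev_nr_bytes(bdev);
    }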
index 1dd1a0f..6aa30f3 100644 (file)
@@ -125,7 +125,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
        return call_iter(iocb, &iter);
 }
 
-static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
+static void nvmet_file_io_done(struct kiocb *iocb, long ret)
 {
        struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb);
        u16 status = NVME_SC_SUCCESS;
@@ -222,7 +222,7 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags)
        }
 
 complete:
-       nvmet_file_io_done(&req->f.iocb, ret, 0);
+       nvmet_file_io_done(&req->f.iocb, ret);
        return true;
 }
 
index 0285ccc..eb10942 100644 (file)
@@ -384,6 +384,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
                error = PTR_ERR(ctrl->ctrl.admin_q);
                goto out_cleanup_fabrics_q;
        }
+       /* reset stopped state for the fresh admin queue */
+       clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags);
 
        error = nvmf_connect_admin_queue(&ctrl->ctrl);
        if (error)
@@ -398,7 +400,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
        ctrl->ctrl.max_hw_sectors =
                (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
 
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        error = nvme_init_ctrl_finish(&ctrl->ctrl);
        if (error)
@@ -428,7 +430,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
                nvme_loop_destroy_io_queues(ctrl);
        }
 
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        if (ctrl->ctrl.state == NVME_CTRL_LIVE)
                nvme_shutdown_ctrl(&ctrl->ctrl);
 
index 7143c7f..af19342 100644 (file)
@@ -309,6 +309,7 @@ struct nvmet_fabrics_ops {
        u16 (*install_queue)(struct nvmet_sq *nvme_sq);
        void (*discovery_chg)(struct nvmet_port *port);
        u8 (*get_mdts)(const struct nvmet_ctrl *ctrl);
+       u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl);
 };
 
 #define NVMET_MAX_INLINE_BIOVEC        8
@@ -576,6 +577,11 @@ static inline struct nvmet_subsys *nvmet_req_subsys(struct nvmet_req *req)
        return req->sq->ctrl->subsys;
 }
 
+static inline bool nvmet_is_disc_subsys(struct nvmet_subsys *subsys)
+{
+       return subsys->type != NVME_NQN_NVME;
+}
+
 #ifdef CONFIG_NVME_TARGET_PASSTHRU
 void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys);
 int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
index 891174c..1deb404 100644 (file)
@@ -5,6 +5,7 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/atomic.h>
+#include <linux/blk-integrity.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/err.h>
@@ -1818,12 +1819,36 @@ restart:
        mutex_unlock(&nvmet_rdma_queue_mutex);
 }
 
+static void nvmet_rdma_destroy_port_queues(struct nvmet_rdma_port *port)
+{
+       struct nvmet_rdma_queue *queue, *tmp;
+       struct nvmet_port *nport = port->nport;
+
+       mutex_lock(&nvmet_rdma_queue_mutex);
+       list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
+                                queue_list) {
+               if (queue->port != nport)
+                       continue;
+
+               list_del_init(&queue->queue_list);
+               __nvmet_rdma_queue_disconnect(queue);
+       }
+       mutex_unlock(&nvmet_rdma_queue_mutex);
+}
+
 static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port)
 {
        struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL);
 
        if (cm_id)
                rdma_destroy_id(cm_id);
+
+       /*
+        * Destroy the remaining queues, which do not belong to any
+        * controller yet. Doing this after the RDMA-CM ID has been
+        * destroyed guarantees that no new queue will be created.
+        */
+       nvmet_rdma_destroy_port_queues(port);
 }
 
 static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port)
@@ -1975,6 +2000,11 @@ static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
        return NVMET_RDMA_MAX_MDTS;
 }
 
+static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl)
+{
+       return NVME_RDMA_MAX_QUEUE_SIZE;
+}
+
 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .owner                  = THIS_MODULE,
        .type                   = NVMF_TRTYPE_RDMA,
@@ -1986,6 +2016,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .delete_ctrl            = nvmet_rdma_delete_ctrl,
        .disc_traddr            = nvmet_rdma_disc_port_addr,
        .get_mdts               = nvmet_rdma_get_mdts,
+       .get_max_queue_size     = nvmet_rdma_get_max_queue_size,
 };
 
 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
index 07ee347..84c387e 100644 (file)
@@ -702,7 +702,7 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
        struct nvmet_tcp_queue *queue = cmd->queue;
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
        struct kvec iov = {
-               .iov_base = &cmd->exp_ddgst + cmd->offset,
+               .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset,
                .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
        };
        int ret;
@@ -1096,7 +1096,7 @@ recv:
        }
 
        if (queue->hdr_digest &&
-           nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
+           nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
                nvmet_tcp_fatal_error(queue); /* fatal */
                return -EPROTO;
        }
@@ -1428,6 +1428,7 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
+       struct page *page;
        struct nvmet_tcp_queue *queue =
                container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1447,6 +1448,8 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
                nvmet_tcp_free_crypto(queue);
        ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
 
+       page = virt_to_head_page(queue->pf_cache.va);
+       __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
        kfree(queue);
 }
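
The release-work hunk above drains the queue's page_frag_cache: the cache keeps a bias of extra references (pagecnt_bias) on its current backing page so that individual fragment allocations stay cheap, and those references have to be returned when the owner goes away, otherwise the compound page is leaked. A hedged sketch of the generic alloc/teardown pairing (foo_* names are placeholders; the page_frag API itself is real):

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static void *foo_alloc_frag(struct page_frag_cache *cache, unsigned int len)
    {
            /* hands out sub-page fragments backed by the cache's page */
            return page_frag_alloc(cache, len, GFP_KERNEL);
    }

    static void foo_drain_frag_cache(struct page_frag_cache *cache)
    {
            struct page *page;

            if (!cache->va)                 /* nothing was ever allocated */
                    return;
            page = virt_to_head_page(cache->va);
            /* give back the references the cache still holds */
            __page_frag_cache_drain(page, cache->pagecnt_bias);
    }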
 
@@ -1737,6 +1740,17 @@ err_port:
        return ret;
 }
 
+static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port)
+{
+       struct nvmet_tcp_queue *queue;
+
+       mutex_lock(&nvmet_tcp_queue_mutex);
+       list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
+               if (queue->port == port)
+                       kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+       mutex_unlock(&nvmet_tcp_queue_mutex);
+}
+
 static void nvmet_tcp_remove_port(struct nvmet_port *nport)
 {
        struct nvmet_tcp_port *port = nport->priv;
@@ -1746,6 +1760,11 @@ static void nvmet_tcp_remove_port(struct nvmet_port *nport)
        port->sock->sk->sk_user_data = NULL;
        write_unlock_bh(&port->sock->sk->sk_callback_lock);
        cancel_work_sync(&port->accept_work);
+       /*
+        * Destroy the remaining queues, which do not belong to any
+        * controller yet.
+        */
+       nvmet_tcp_destroy_port_queues(port);
 
        sock_release(port->sock);
        kfree(port);
index be799a5..b0056ae 100644 (file)
@@ -147,8 +147,8 @@ config RESET_OXNAS
        bool
 
 config RESET_PISTACHIO
-       bool "Pistachio Reset Driver" if COMPILE_TEST
-       default MACH_PISTACHIO
+       bool "Pistachio Reset Driver"
+       depends on MIPS || COMPILE_TEST
        help
          This enables the reset driver for ImgTec Pistachio SoCs.
 
index b6f074d..433fa0c 100644 (file)
@@ -38,7 +38,7 @@ static int brcm_rescal_reset_set(struct reset_controller_dev *rcdev,
        }
 
        ret = readl_poll_timeout(base + BRCM_RESCAL_STATUS, reg,
-                                !(reg & BRCM_RESCAL_STATUS_BIT), 100, 1000);
+                                (reg & BRCM_RESCAL_STATUS_BIT), 100, 1000);
        if (ret) {
                dev_err(data->dev, "time out on SATA/PCIe rescal\n");
                return ret;
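
The brcm-rescal fix flips the polling condition rather than the timeout handling: readl_poll_timeout(addr, val, cond, sleep_us, timeout_us) from <linux/iopoll.h> keeps re-reading the register into val until cond evaluates true or the timeout elapses (returning -ETIMEDOUT), so dropping the negation makes the driver wait for BRCM_RESCAL_STATUS_BIT to become set, i.e. for the rescal sequence to report completion. Generic usage sketch with hypothetical register names:

    #include <linux/bits.h>
    #include <linux/iopoll.h>

    #define FOO_STATUS      0x0             /* hypothetical status register */
    #define FOO_DONE        BIT(0)          /* hypothetical completion bit  */

    static int foo_wait_done(void __iomem *base)
    {
            u32 reg;

            /* poll every 100us, give up after 1000us total */
            return readl_poll_timeout(base + FOO_STATUS, reg,
                                      reg & FOO_DONE, 100, 1000);
    }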
index 2a72f86..8c6492e 100644 (file)
@@ -92,3 +92,29 @@ void __init socfpga_reset_init(void)
        for_each_matching_node(np, socfpga_early_reset_dt_ids)
                a10_reset_init(np);
 }
+
+/*
+ * The early driver is problematic, because it doesn't register
+ * itself as a driver. This causes certain device links to prevent
+ * consumer devices from probing. The hacky solution is to register
+ * an empty driver, whose only job is to attach itself to the reset
+ * manager and call probe.
+ */
+static const struct of_device_id socfpga_reset_dt_ids[] = {
+       { .compatible = "altr,rst-mgr", },
+       { /* sentinel */ },
+};
+
+static int reset_simple_probe(struct platform_device *pdev)
+{
+       return 0;
+}
+
+static struct platform_driver reset_socfpga_driver = {
+       .probe  = reset_simple_probe,
+       .driver = {
+               .name           = "socfpga-reset",
+               .of_match_table = socfpga_reset_dt_ids,
+       },
+};
+builtin_platform_driver(reset_socfpga_driver);
index 24d3395..4c5bba5 100644 (file)
@@ -20,6 +20,7 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc,
        struct tegra_bpmp *bpmp = to_tegra_bpmp(rstc);
        struct mrq_reset_request request;
        struct tegra_bpmp_message msg;
+       int err;
 
        memset(&request, 0, sizeof(request));
        request.cmd = command;
@@ -30,7 +31,13 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc,
        msg.tx.data = &request;
        msg.tx.size = sizeof(request);
 
-       return tegra_bpmp_transfer(bpmp, &msg);
+       err = tegra_bpmp_transfer(bpmp, &msg);
+       if (err)
+               return err;
+       if (msg.rx.ret)
+               return -EINVAL;
+
+       return 0;
 }
 
 static int tegra_bpmp_reset_module(struct reset_controller_dev *rstc,
index e34c6cc..8e87a31 100644 (file)
@@ -2077,12 +2077,15 @@ static void __dasd_device_check_path_events(struct dasd_device *device)
 
        if (device->stopped & ~(DASD_STOPPED_DC_WAIT))
                return;
+
+       dasd_path_clear_all_verify(device);
+       dasd_path_clear_all_fcsec(device);
+
        rc = device->discipline->pe_handler(device, tbvpm, fcsecpm);
        if (rc) {
+               dasd_path_add_tbvpm(device, tbvpm);
+               dasd_path_add_fcsecpm(device, fcsecpm);
                dasd_device_set_timer(device, 50);
-       } else {
-               dasd_path_clear_all_verify(device);
-               dasd_path_clear_all_fcsec(device);
        }
 };
 
index 4691a3c..299001a 100644 (file)
@@ -201,7 +201,7 @@ dasd_3990_erp_DCTL(struct dasd_ccw_req * erp, char modifier)
        struct ccw1 *ccw;
        struct dasd_ccw_req *dctl_cqr;
 
-       dctl_cqr = dasd_alloc_erp_request((char *) &erp->magic, 1,
+       dctl_cqr = dasd_alloc_erp_request(erp->magic, 1,
                                          sizeof(struct DCTL_data),
                                          device);
        if (IS_ERR(dctl_cqr)) {
@@ -1652,7 +1652,7 @@ dasd_3990_erp_action_1B_32(struct dasd_ccw_req * default_erp, char *sense)
        }
 
        /* Build new ERP request including DE/LO */
-       erp = dasd_alloc_erp_request((char *) &cqr->magic,
+       erp = dasd_alloc_erp_request(cqr->magic,
                                     2 + 1,/* DE/LO + TIC */
                                     sizeof(struct DE_eckd_data) +
                                     sizeof(struct LO_eckd_data), device);
@@ -2388,7 +2388,7 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr)
        }
 
        /* allocate additional request block */
-       erp = dasd_alloc_erp_request((char *) &cqr->magic,
+       erp = dasd_alloc_erp_request(cqr->magic,
                                     cplength, datasize, device);
        if (IS_ERR(erp)) {
                 if (cqr->retries <= 0) {
index 460e0f1..8410a25 100644 (file)
@@ -560,8 +560,8 @@ static int prefix_LRE(struct ccw1 *ccw, struct PFX_eckd_data *pfxdata,
                return -EINVAL;
        }
        pfxdata->format = format;
-       pfxdata->base_address = basepriv->ned->unit_addr;
-       pfxdata->base_lss = basepriv->ned->ID;
+       pfxdata->base_address = basepriv->conf.ned->unit_addr;
+       pfxdata->base_lss = basepriv->conf.ned->ID;
        pfxdata->validity.define_extent = 1;
 
        /* private uid is kept up to date, conf_data may be outdated */
@@ -736,32 +736,30 @@ dasd_eckd_cdl_reclen(int recid)
        return LABEL_SIZE;
 }
 /* create unique id from private structure. */
-static void create_uid(struct dasd_eckd_private *private)
+static void create_uid(struct dasd_conf *conf, struct dasd_uid *uid)
 {
        int count;
-       struct dasd_uid *uid;
 
-       uid = &private->uid;
        memset(uid, 0, sizeof(struct dasd_uid));
-       memcpy(uid->vendor, private->ned->HDA_manufacturer,
+       memcpy(uid->vendor, conf->ned->HDA_manufacturer,
               sizeof(uid->vendor) - 1);
        EBCASC(uid->vendor, sizeof(uid->vendor) - 1);
-       memcpy(uid->serial, &private->ned->serial,
+       memcpy(uid->serial, &conf->ned->serial,
               sizeof(uid->serial) - 1);
        EBCASC(uid->serial, sizeof(uid->serial) - 1);
-       uid->ssid = private->gneq->subsystemID;
-       uid->real_unit_addr = private->ned->unit_addr;
-       if (private->sneq) {
-               uid->type = private->sneq->sua_flags;
+       uid->ssid = conf->gneq->subsystemID;
+       uid->real_unit_addr = conf->ned->unit_addr;
+       if (conf->sneq) {
+               uid->type = conf->sneq->sua_flags;
                if (uid->type == UA_BASE_PAV_ALIAS)
-                       uid->base_unit_addr = private->sneq->base_unit_addr;
+                       uid->base_unit_addr = conf->sneq->base_unit_addr;
        } else {
                uid->type = UA_BASE_DEVICE;
        }
-       if (private->vdsneq) {
+       if (conf->vdsneq) {
                for (count = 0; count < 16; count++) {
                        sprintf(uid->vduit+2*count, "%02x",
-                               private->vdsneq->uit[count]);
+                               conf->vdsneq->uit[count]);
                }
        }
 }
@@ -776,10 +774,10 @@ static int dasd_eckd_generate_uid(struct dasd_device *device)
 
        if (!private)
                return -ENODEV;
-       if (!private->ned || !private->gneq)
+       if (!private->conf.ned || !private->conf.gneq)
                return -ENODEV;
        spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
-       create_uid(private);
+       create_uid(&private->conf, &private->uid);
        spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
        return 0;
 }
@@ -803,14 +801,15 @@ static int dasd_eckd_get_uid(struct dasd_device *device, struct dasd_uid *uid)
  * return 0 for match
  */
 static int dasd_eckd_compare_path_uid(struct dasd_device *device,
-                                     struct dasd_eckd_private *private)
+                                     struct dasd_conf *path_conf)
 {
        struct dasd_uid device_uid;
+       struct dasd_uid path_uid;
 
-       create_uid(private);
+       create_uid(path_conf, &path_uid);
        dasd_eckd_get_uid(device, &device_uid);
 
-       return memcmp(&device_uid, &private->uid, sizeof(struct dasd_uid));
+       return memcmp(&device_uid, &path_uid, sizeof(struct dasd_uid));
 }
 
 static void dasd_eckd_fill_rcd_cqr(struct dasd_device *device,
@@ -946,34 +945,34 @@ out_error:
        return ret;
 }
 
-static int dasd_eckd_identify_conf_parts(struct dasd_eckd_private *private)
+static int dasd_eckd_identify_conf_parts(struct dasd_conf *conf)
 {
 
        struct dasd_sneq *sneq;
        int i, count;
 
-       private->ned = NULL;
-       private->sneq = NULL;
-       private->vdsneq = NULL;
-       private->gneq = NULL;
-       count = private->conf_len / sizeof(struct dasd_sneq);
-       sneq = (struct dasd_sneq *)private->conf_data;
+       conf->ned = NULL;
+       conf->sneq = NULL;
+       conf->vdsneq = NULL;
+       conf->gneq = NULL;
+       count = conf->len / sizeof(struct dasd_sneq);
+       sneq = (struct dasd_sneq *)conf->data;
        for (i = 0; i < count; ++i) {
                if (sneq->flags.identifier == 1 && sneq->format == 1)
-                       private->sneq = sneq;
+                       conf->sneq = sneq;
                else if (sneq->flags.identifier == 1 && sneq->format == 4)
-                       private->vdsneq = (struct vd_sneq *)sneq;
+                       conf->vdsneq = (struct vd_sneq *)sneq;
                else if (sneq->flags.identifier == 2)
-                       private->gneq = (struct dasd_gneq *)sneq;
+                       conf->gneq = (struct dasd_gneq *)sneq;
                else if (sneq->flags.identifier == 3 && sneq->res1 == 1)
-                       private->ned = (struct dasd_ned *)sneq;
+                       conf->ned = (struct dasd_ned *)sneq;
                sneq++;
        }
-       if (!private->ned || !private->gneq) {
-               private->ned = NULL;
-               private->sneq = NULL;
-               private->vdsneq = NULL;
-               private->gneq = NULL;
+       if (!conf->ned || !conf->gneq) {
+               conf->ned = NULL;
+               conf->sneq = NULL;
+               conf->vdsneq = NULL;
+               conf->gneq = NULL;
                return -EINVAL;
        }
        return 0;
@@ -1016,9 +1015,9 @@ static void dasd_eckd_store_conf_data(struct dasd_device *device,
         * with the new one if this points to the same data
         */
        cdp = device->path[chp].conf_data;
-       if (private->conf_data == cdp) {
-               private->conf_data = (void *)conf_data;
-               dasd_eckd_identify_conf_parts(private);
+       if (private->conf.data == cdp) {
+               private->conf.data = (void *)conf_data;
+               dasd_eckd_identify_conf_parts(&private->conf);
        }
        ccw_device_get_schid(device->cdev, &sch_id);
        device->path[chp].conf_data = conf_data;
@@ -1036,8 +1035,8 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device)
        struct dasd_eckd_private *private = device->private;
        int i;
 
-       private->conf_data = NULL;
-       private->conf_len = 0;
+       private->conf.data = NULL;
+       private->conf.len = 0;
        for (i = 0; i < 8; i++) {
                kfree(device->path[i].conf_data);
                device->path[i].conf_data = NULL;
@@ -1071,15 +1070,55 @@ static void dasd_eckd_read_fc_security(struct dasd_device *device)
        }
 }
 
+static void dasd_eckd_get_uid_string(struct dasd_conf *conf,
+                                    char *print_uid)
+{
+       struct dasd_uid uid;
+
+       create_uid(conf, &uid);
+       if (strlen(uid.vduit) > 0)
+               snprintf(print_uid, sizeof(*print_uid),
+                        "%s.%s.%04x.%02x.%s",
+                        uid.vendor, uid.serial, uid.ssid,
+                        uid.real_unit_addr, uid.vduit);
+       else
+               snprintf(print_uid, sizeof(*print_uid),
+                        "%s.%s.%04x.%02x",
+                        uid.vendor, uid.serial, uid.ssid,
+                        uid.real_unit_addr);
+}
+
+static int dasd_eckd_check_cabling(struct dasd_device *device,
+                                  void *conf_data, __u8 lpm)
+{
+       struct dasd_eckd_private *private = device->private;
+       char print_path_uid[60], print_device_uid[60];
+       struct dasd_conf path_conf;
+
+       path_conf.data = conf_data;
+       path_conf.len = DASD_ECKD_RCD_DATA_SIZE;
+       if (dasd_eckd_identify_conf_parts(&path_conf))
+               return 1;
+
+       if (dasd_eckd_compare_path_uid(device, &path_conf)) {
+               dasd_eckd_get_uid_string(&path_conf, print_path_uid);
+               dasd_eckd_get_uid_string(&private->conf, print_device_uid);
+               dev_err(&device->cdev->dev,
+                       "Not all channel paths lead to the same device, path %02X leads to device %s instead of %s\n",
+                       lpm, print_path_uid, print_device_uid);
+               return 1;
+       }
+
+       return 0;
+}
+
 static int dasd_eckd_read_conf(struct dasd_device *device)
 {
        void *conf_data;
        int conf_len, conf_data_saved;
        int rc, path_err, pos;
        __u8 lpm, opm;
-       struct dasd_eckd_private *private, path_private;
-       struct dasd_uid *uid;
-       char print_path_uid[60], print_device_uid[60];
+       struct dasd_eckd_private *private;
 
        private = device->private;
        opm = ccw_device_get_path_mask(device->cdev);
@@ -1109,11 +1148,11 @@ static int dasd_eckd_read_conf(struct dasd_device *device)
                if (!conf_data_saved) {
                        /* initially clear previously stored conf_data */
                        dasd_eckd_clear_conf_data(device);
-                       private->conf_data = conf_data;
-                       private->conf_len = conf_len;
-                       if (dasd_eckd_identify_conf_parts(private)) {
-                               private->conf_data = NULL;
-                               private->conf_len = 0;
+                       private->conf.data = conf_data;
+                       private->conf.len = conf_len;
+                       if (dasd_eckd_identify_conf_parts(&private->conf)) {
+                               private->conf.data = NULL;
+                               private->conf.len = 0;
                                kfree(conf_data);
                                continue;
                        }
@@ -1123,59 +1162,11 @@ static int dasd_eckd_read_conf(struct dasd_device *device)
                         */
                        dasd_eckd_generate_uid(device);
                        conf_data_saved++;
-               } else {
-                       path_private.conf_data = conf_data;
-                       path_private.conf_len = DASD_ECKD_RCD_DATA_SIZE;
-                       if (dasd_eckd_identify_conf_parts(
-                                   &path_private)) {
-                               path_private.conf_data = NULL;
-                               path_private.conf_len = 0;
-                               kfree(conf_data);
-                               continue;
-                       }
-                       if (dasd_eckd_compare_path_uid(
-                                   device, &path_private)) {
-                               uid = &path_private.uid;
-                               if (strlen(uid->vduit) > 0)
-                                       snprintf(print_path_uid,
-                                                sizeof(print_path_uid),
-                                                "%s.%s.%04x.%02x.%s",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid, uid->real_unit_addr,
-                                                uid->vduit);
-                               else
-                                       snprintf(print_path_uid,
-                                                sizeof(print_path_uid),
-                                                "%s.%s.%04x.%02x",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid,
-                                                uid->real_unit_addr);
-                               uid = &private->uid;
-                               if (strlen(uid->vduit) > 0)
-                                       snprintf(print_device_uid,
-                                                sizeof(print_device_uid),
-                                                "%s.%s.%04x.%02x.%s",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid, uid->real_unit_addr,
-                                                uid->vduit);
-                               else
-                                       snprintf(print_device_uid,
-                                                sizeof(print_device_uid),
-                                                "%s.%s.%04x.%02x",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid,
-                                                uid->real_unit_addr);
-                               dev_err(&device->cdev->dev,
-                                       "Not all channel paths lead to "
-                                       "the same device, path %02X leads to "
-                                       "device %s instead of %s\n", lpm,
-                                       print_path_uid, print_device_uid);
-                               path_err = -EINVAL;
-                               dasd_path_add_cablepm(device, lpm);
-                               continue;
-                       }
-                       path_private.conf_data = NULL;
-                       path_private.conf_len = 0;
+               } else if (dasd_eckd_check_cabling(device, conf_data, lpm)) {
+                       dasd_path_add_cablepm(device, lpm);
+                       path_err = -EINVAL;
+                       kfree(conf_data);
+                       continue;
                }
 
                pos = pathmask_to_pos(lpm);
@@ -1197,8 +1188,6 @@ static int dasd_eckd_read_conf(struct dasd_device *device)
                }
        }
 
-       dasd_eckd_read_fc_security(device);
-
        return path_err;
 }
 
@@ -1213,7 +1202,7 @@ static u32 get_fcx_max_data(struct dasd_device *device)
                return 0;
        /* is transport mode supported? */
        fcx_in_css = css_general_characteristics.fcx;
-       fcx_in_gneq = private->gneq->reserved2[7] & 0x04;
+       fcx_in_gneq = private->conf.gneq->reserved2[7] & 0x04;
        fcx_in_features = private->features.feature[40] & 0x80;
        tpm = fcx_in_css && fcx_in_gneq && fcx_in_features;
 
@@ -1282,9 +1271,9 @@ static int rebuild_device_uid(struct dasd_device *device,
                                        "returned error %d", rc);
                        break;
                }
-               memcpy(private->conf_data, data->rcd_buffer,
+               memcpy(private->conf.data, data->rcd_buffer,
                       DASD_ECKD_RCD_DATA_SIZE);
-               if (dasd_eckd_identify_conf_parts(private)) {
+               if (dasd_eckd_identify_conf_parts(&private->conf)) {
                        rc = -ENODEV;
                } else /* first valid path is enough */
                        break;
@@ -1299,11 +1288,10 @@ static int rebuild_device_uid(struct dasd_device *device,
 static void dasd_eckd_path_available_action(struct dasd_device *device,
                                            struct pe_handler_work_data *data)
 {
-       struct dasd_eckd_private path_private;
-       struct dasd_uid *uid;
        __u8 path_rcd_buf[DASD_ECKD_RCD_DATA_SIZE];
        __u8 lpm, opm, npm, ppm, epm, hpfpm, cablepm;
        struct dasd_conf_data *conf_data;
+       struct dasd_conf path_conf;
        unsigned long flags;
        char print_uid[60];
        int rc, pos;
@@ -1367,11 +1355,11 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                 */
                memcpy(&path_rcd_buf, data->rcd_buffer,
                       DASD_ECKD_RCD_DATA_SIZE);
-               path_private.conf_data = (void *) &path_rcd_buf;
-               path_private.conf_len = DASD_ECKD_RCD_DATA_SIZE;
-               if (dasd_eckd_identify_conf_parts(&path_private)) {
-                       path_private.conf_data = NULL;
-                       path_private.conf_len = 0;
+               path_conf.data = (void *)&path_rcd_buf;
+               path_conf.len = DASD_ECKD_RCD_DATA_SIZE;
+               if (dasd_eckd_identify_conf_parts(&path_conf)) {
+                       path_conf.data = NULL;
+                       path_conf.len = 0;
                        continue;
                }
 
@@ -1382,7 +1370,7 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                 * the first working path UID will be used as device UID
                 */
                if (dasd_path_get_opm(device) &&
-                   dasd_eckd_compare_path_uid(device, &path_private)) {
+                   dasd_eckd_compare_path_uid(device, &path_conf)) {
                        /*
                         * the comparison was not successful
                         * rebuild the device UID with at least one
@@ -1396,20 +1384,8 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                         */
                        if (rebuild_device_uid(device, data) ||
                            dasd_eckd_compare_path_uid(
-                                   device, &path_private)) {
-                               uid = &path_private.uid;
-                               if (strlen(uid->vduit) > 0)
-                                       snprintf(print_uid, sizeof(print_uid),
-                                                "%s.%s.%04x.%02x.%s",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid, uid->real_unit_addr,
-                                                uid->vduit);
-                               else
-                                       snprintf(print_uid, sizeof(print_uid),
-                                                "%s.%s.%04x.%02x",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid,
-                                                uid->real_unit_addr);
+                                   device, &path_conf)) {
+                               dasd_eckd_get_uid_string(&path_conf, print_uid);
                                dev_err(&device->cdev->dev,
                                        "The newly added channel path %02X "
                                        "will not be used because it leads "
@@ -1427,6 +1403,14 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                if (conf_data) {
                        memcpy(conf_data, data->rcd_buffer,
                               DASD_ECKD_RCD_DATA_SIZE);
+               } else {
+                       /*
+                        * The path is operational but the path config data
+                        * could not be stored due to a low memory condition.
+                        * Add it to the error path mask and schedule a path
+                        * verification later so that it can be added again.
+                        */
+                       epm |= lpm;
                }
                pos = pathmask_to_pos(lpm);
                dasd_eckd_store_conf_data(device, conf_data, pos);
@@ -1447,7 +1431,10 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                }
                dasd_path_add_nppm(device, npm);
                dasd_path_add_ppm(device, ppm);
-               dasd_path_add_tbvpm(device, epm);
+               if (epm) {
+                       dasd_path_add_tbvpm(device, epm);
+                       dasd_device_set_timer(device, 50);
+               }
                dasd_path_add_cablepm(device, cablepm);
                dasd_path_add_nohpfpm(device, hpfpm);
                spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
@@ -1625,8 +1612,8 @@ static int dasd_eckd_read_vol_info(struct dasd_device *device)
        prssdp = cqr->data;
        prssdp->order = PSF_ORDER_PRSSD;
        prssdp->suborder = PSF_SUBORDER_VSQ;    /* Volume Storage Query */
-       prssdp->lss = private->ned->ID;
-       prssdp->volume = private->ned->unit_addr;
+       prssdp->lss = private->conf.ned->ID;
+       prssdp->volume = private->conf.ned->unit_addr;
 
        ccw = cqr->cpaddr;
        ccw->cmd_code = DASD_ECKD_CCW_PSF;
@@ -2085,11 +2072,11 @@ dasd_eckd_check_characteristics(struct dasd_device *device)
        device->path_thrhld = DASD_ECKD_PATH_THRHLD;
        device->path_interval = DASD_ECKD_PATH_INTERVAL;
 
-       if (private->gneq) {
+       if (private->conf.gneq) {
                value = 1;
-               for (i = 0; i < private->gneq->timeout.value; i++)
+               for (i = 0; i < private->conf.gneq->timeout.value; i++)
                        value = 10 * value;
-               value = value * private->gneq->timeout.number;
+               value = value * private->conf.gneq->timeout.number;
                /* do not accept useless values */
                if (value != 0 && value <= DASD_EXPIRES_MAX)
                        device->default_expires = value;
@@ -2121,6 +2108,7 @@ dasd_eckd_check_characteristics(struct dasd_device *device)
        if (rc)
                goto out_err3;
 
+       dasd_eckd_read_fc_security(device);
        dasd_path_create_kobjects(device);
 
        /* Read Feature Codes */
@@ -2195,10 +2183,10 @@ static void dasd_eckd_uncheck_device(struct dasd_device *device)
                return;
 
        dasd_alias_disconnect_device_from_lcu(device);
-       private->ned = NULL;
-       private->sneq = NULL;
-       private->vdsneq = NULL;
-       private->gneq = NULL;
+       private->conf.ned = NULL;
+       private->conf.sneq = NULL;
+       private->conf.vdsneq = NULL;
+       private->conf.gneq = NULL;
        dasd_eckd_clear_conf_data(device);
        dasd_path_remove_kobjects(device);
 }
@@ -3750,8 +3738,8 @@ dasd_eckd_dso_ras(struct dasd_device *device, struct dasd_block *block,
         * subset.
         */
        ras_data->op_flags.guarantee_init = !!(features->feature[56] & 0x01);
-       ras_data->lss = private->ned->ID;
-       ras_data->dev_addr = private->ned->unit_addr;
+       ras_data->lss = private->conf.ned->ID;
+       ras_data->dev_addr = private->conf.ned->unit_addr;
        ras_data->nr_exts = nr_exts;
 
        if (by_extent) {
@@ -4293,8 +4281,8 @@ static int prepare_itcw(struct itcw *itcw,
 
        memset(&pfxdata, 0, sizeof(pfxdata));
        pfxdata.format = 1; /* PFX with LRE */
-       pfxdata.base_address = basepriv->ned->unit_addr;
-       pfxdata.base_lss = basepriv->ned->ID;
+       pfxdata.base_address = basepriv->conf.ned->unit_addr;
+       pfxdata.base_lss = basepriv->conf.ned->ID;
        pfxdata.validity.define_extent = 1;
 
        /* private uid is kept up to date, conf_data may be outdated */
@@ -4963,9 +4951,9 @@ dasd_eckd_fill_info(struct dasd_device * device,
        info->characteristics_size = sizeof(private->rdc_data);
        memcpy(info->characteristics, &private->rdc_data,
               sizeof(private->rdc_data));
-       info->confdata_size = min((unsigned long)private->conf_len,
-                                 sizeof(info->configuration_data));
-       memcpy(info->configuration_data, private->conf_data,
+       info->confdata_size = min_t(unsigned long, private->conf.len,
+                                   sizeof(info->configuration_data));
+       memcpy(info->configuration_data, private->conf.data,
               info->confdata_size);
        return 0;
 }
@@ -5808,6 +5796,8 @@ static int dasd_eckd_reload_device(struct dasd_device *device)
        if (rc)
                goto out_err;
 
+       dasd_eckd_read_fc_security(device);
+
        rc = dasd_eckd_generate_uid(device);
        if (rc)
                goto out_err;
@@ -5820,15 +5810,7 @@ static int dasd_eckd_reload_device(struct dasd_device *device)
        dasd_eckd_get_uid(device, &uid);
 
        if (old_base != uid.base_unit_addr) {
-               if (strlen(uid.vduit) > 0)
-                       snprintf(print_uid, sizeof(print_uid),
-                                "%s.%s.%04x.%02x.%s", uid.vendor, uid.serial,
-                                uid.ssid, uid.base_unit_addr, uid.vduit);
-               else
-                       snprintf(print_uid, sizeof(print_uid),
-                                "%s.%s.%04x.%02x", uid.vendor, uid.serial,
-                                uid.ssid, uid.base_unit_addr);
-
+               dasd_eckd_get_uid_string(&private->conf, print_uid);
                dev_info(&device->cdev->dev,
                         "An Alias device was reassigned to a new base device "
                         "with UID: %s\n", print_uid);
@@ -5966,8 +5948,8 @@ static int dasd_eckd_query_host_access(struct dasd_device *device,
        prssdp->order = PSF_ORDER_PRSSD;
        prssdp->suborder = PSF_SUBORDER_QHA;    /* query host access */
        /* LSS and Volume that will be queried */
-       prssdp->lss = private->ned->ID;
-       prssdp->volume = private->ned->unit_addr;
+       prssdp->lss = private->conf.ned->ID;
+       prssdp->volume = private->conf.ned->unit_addr;
        /* all other bytes of prssdp must be zero */
 
        ccw = cqr->cpaddr;
index 65e4630..a91b265 100644 (file)
@@ -658,16 +658,19 @@ struct dasd_conf_data {
        struct dasd_gneq gneq;
 } __packed;
 
-struct dasd_eckd_private {
-       struct dasd_eckd_characteristics rdc_data;
-       u8 *conf_data;
-       int conf_len;
-
+struct dasd_conf {
+       u8 *data;
+       int len;
        /* pointers to specific parts in the conf_data */
        struct dasd_ned *ned;
        struct dasd_sneq *sneq;
        struct vd_sneq *vdsneq;
        struct dasd_gneq *gneq;
+};
+
+struct dasd_eckd_private {
+       struct dasd_eckd_characteristics rdc_data;
+       struct dasd_conf conf;
 
        struct eckd_count count_area[5];
        int init_cqr_status;
index ba4fa37..c07e6e7 100644 (file)
@@ -24,7 +24,7 @@
 #include "dasd_int.h"
 
 struct dasd_ccw_req *
-dasd_alloc_erp_request(char *magic, int cplength, int datasize,
+dasd_alloc_erp_request(unsigned int magic, int cplength, int datasize,
                       struct dasd_device * device)
 {
        unsigned long flags;
@@ -33,8 +33,8 @@ dasd_alloc_erp_request(char *magic, int cplength, int datasize,
        int size;
 
        /* Sanity checks */
-       BUG_ON( magic == NULL || datasize > PAGE_SIZE ||
-            (cplength*sizeof(struct ccw1)) > PAGE_SIZE);
+       BUG_ON(datasize > PAGE_SIZE ||
+              (cplength*sizeof(struct ccw1)) > PAGE_SIZE);
 
        size = (sizeof(struct dasd_ccw_req) + 7L) & -8L;
        if (cplength > 0)
@@ -62,7 +62,7 @@ dasd_alloc_erp_request(char *magic, int cplength, int datasize,
                cqr->data = data;
                memset(cqr->data, 0, datasize);
        }
-       strncpy((char *) &cqr->magic, magic, 4);
+       cqr->magic = magic;
        ASCEBC((char *) &cqr->magic, 4);
        set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
        dasd_get_device(device);
index fa966e0..3a6f3af 100644 (file)
@@ -14,6 +14,7 @@
 #define KMSG_COMPONENT "dasd"
 
 #include <linux/interrupt.h>
+#include <linux/major.h>
 #include <linux/fs.h>
 #include <linux/blkpg.h>
 
index 155428b..8b45801 100644 (file)
@@ -887,7 +887,7 @@ void dasd_proc_exit(void);
 /* externals in dasd_erp.c */
 struct dasd_ccw_req *dasd_default_erp_action(struct dasd_ccw_req *);
 struct dasd_ccw_req *dasd_default_erp_postaction(struct dasd_ccw_req *);
-struct dasd_ccw_req *dasd_alloc_erp_request(char *, int, int,
+struct dasd_ccw_req *dasd_alloc_erp_request(unsigned int, int, int,
                                            struct dasd_device *);
 void dasd_free_erp_request(struct dasd_ccw_req *, struct dasd_device *);
 void dasd_log_sense(struct dasd_ccw_req *, struct irb *);
@@ -1305,6 +1305,15 @@ static inline void dasd_path_add_ppm(struct dasd_device *device, __u8 pm)
                        dasd_path_preferred(device, chp);
 }
 
+static inline void dasd_path_add_fcsecpm(struct dasd_device *device, __u8 pm)
+{
+       int chp;
+
+       for (chp = 0; chp < 8; chp++)
+               if (pm & (0x80 >> chp))
+                       dasd_path_fcsec(device, chp);
+}
+
 /*
  * set functions for path masks
  * the existing path mask will be replaced by the given path mask
index 468cbeb..95349f9 100644 (file)
@@ -650,8 +650,8 @@ int dasd_ioctl(struct block_device *bdev, fmode_t mode,
 
 /**
  * dasd_biodasdinfo() - fill out the dasd information structure
- * @disk [in]: pointer to gendisk structure that references a DASD
- * @info [out]: pointer to the dasd_information2_t structure
+ * @disk: [in] pointer to gendisk structure that references a DASD
+ * @info: [out] pointer to the dasd_information2_t structure
  *
  * Provide access to DASD specific information.
  * The gendisk structure is checked if it belongs to the DASD driver by
index 5be3d1c..59e513d 100644 (file)
@@ -30,7 +30,7 @@
 
 static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
-static blk_qc_t dcssblk_submit_bio(struct bio *bio);
+static void dcssblk_submit_bio(struct bio *bio);
 static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
                long nr_pages, void **kaddr, pfn_t *pfn);
 
@@ -854,7 +854,7 @@ dcssblk_release(struct gendisk *disk, fmode_t mode)
        up_write(&dcssblk_devices_sem);
 }
 
-static blk_qc_t
+static void
 dcssblk_submit_bio(struct bio *bio)
 {
        struct dcssblk_dev_info *dev_info;
@@ -907,10 +907,9 @@ dcssblk_submit_bio(struct bio *bio)
                bytes_done += bvec.bv_len;
        }
        bio_endio(bio);
-       return BLK_QC_T_NONE;
+       return;
 fail:
        bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static long
index 3ab669d..27884f3 100644 (file)
@@ -3,6 +3,7 @@
  * Copyright (c) 2017 Hisilicon Limited.
  */
 
+#include <linux/sched/clock.h>
 #include "hisi_sas.h"
 #define DRV_NAME "hisi_sas_v3_hw"
 
index 1f1586a..01f7999 100644 (file)
@@ -1696,6 +1696,7 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
 
        spin_lock_irqsave(&evt->queue->l_lock, flags);
        list_add_tail(&evt->queue_list, &evt->queue->sent);
+       atomic_set(&evt->active, 1);
 
        mb();
 
@@ -1710,6 +1711,7 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
                                     be64_to_cpu(crq_as_u64[1]));
 
        if (rc) {
+               atomic_set(&evt->active, 0);
                list_del(&evt->queue_list);
                spin_unlock_irqrestore(&evt->queue->l_lock, flags);
                del_timer(&evt->timer);
@@ -1737,7 +1739,6 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
 
                evt->done(evt);
        } else {
-               atomic_set(&evt->active, 1);
                spin_unlock_irqrestore(&evt->queue->l_lock, flags);
                ibmvfc_trc_start(evt);
        }
index befeb7c..337e6ed 100644 (file)
@@ -22,6 +22,7 @@
  *******************************************************************/
 
 #include <scsi/scsi_host.h>
+#include <linux/hashtable.h>
 #include <linux/ktime.h>
 #include <linux/workqueue.h>
 
index d383d4a..ad1b6c2 100644 (file)
@@ -5065,9 +5065,12 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
        if (scmd->prot_flags & SCSI_PROT_GUARD_CHECK)
                eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD;
 
-       if (scmd->prot_flags & SCSI_PROT_REF_CHECK) {
-               eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG |
-                       MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG;
+       if (scmd->prot_flags & SCSI_PROT_REF_CHECK)
+               eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG;
+
+       if (scmd->prot_flags & SCSI_PROT_REF_INCREMENT) {
+               eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG;
+
                mpi_request->CDB.EEDP32.PrimaryReferenceTag =
                        cpu_to_be32(scsi_prot_ref_tag(scmd));
        }
index 1c5da2d..253055c 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/delay.h>
 #include <linux/nvme.h>
 #include <linux/nvme-fc.h>
+#include <linux/blk-mq-pci.h>
+#include <linux/blk-mq.h>
 
 static struct nvme_fc_port_template qla_nvme_fc_transport;
 
@@ -642,6 +644,18 @@ static int qla_nvme_post_cmd(struct nvme_fc_local_port *lport,
        return rval;
 }
 
+static void qla_nvme_map_queues(struct nvme_fc_local_port *lport,
+               struct blk_mq_queue_map *map)
+{
+       struct scsi_qla_host *vha = lport->private;
+       int rc;
+
+       rc = blk_mq_pci_map_queues(map, vha->hw->pdev, vha->irq_offset);
+       if (rc)
+               ql_log(ql_log_warn, vha, 0x21de,
+                      "pci map queue failed 0x%x", rc);
+}
+
 static void qla_nvme_localport_delete(struct nvme_fc_local_port *lport)
 {
        struct scsi_qla_host *vha = lport->private;
@@ -676,6 +690,7 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = {
        .ls_abort       = qla_nvme_ls_abort,
        .fcp_io         = qla_nvme_post_cmd,
        .fcp_abort      = qla_nvme_fcp_abort,
+       .map_queues     = qla_nvme_map_queues,
        .max_hw_queues  = 8,
        .max_sgl_segments = 1024,
        .max_dif_sgl_segments = 64,
index 81c3853..081b84b 100644 (file)
@@ -25,8 +25,8 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
                return -EOPNOTSUPP;
        }
 
-       rq = blk_get_request(q, hdr->dout_xfer_len ?
-                            REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
+       rq = scsi_alloc_request(q, hdr->dout_xfer_len ?
+                               REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        rq->timeout = timeout;
@@ -95,7 +95,7 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
 out_free_cmd:
        scsi_req_free_cmd(scsi_req(rq));
 out_put_request:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return ret;
 }
 
index 66f5074..40b473e 100644 (file)
@@ -5384,7 +5384,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 {
        bool new_sd_dp;
        bool inject = false;
-       bool hipri = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_HIPRI;
+       bool polled = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_POLLED;
        int k, num_in_q, qdepth;
        unsigned long iflags;
        u64 ns_from_boot = 0;
@@ -5471,7 +5471,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
        if (sdebug_host_max_queue)
                sd_dp->hc_idx = get_tag(cmnd);
 
-       if (hipri)
+       if (polled)
                ns_from_boot = ktime_get_boottime_ns();
 
        /* one of the resp_*() response functions is called here */
@@ -5531,7 +5531,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
                                kt -= d;
                        }
                }
-               if (hipri) {
+               if (polled) {
                        sd_dp->cmpl_ts = ktime_add(ns_to_ktime(ns_from_boot), kt);
                        spin_lock_irqsave(&sqp->qc_lock, iflags);
                        if (!sd_dp->init_poll) {
@@ -5562,7 +5562,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
                if (unlikely((sdebug_opts & SDEBUG_OPT_CMD_ABORT) &&
                             atomic_read(&sdeb_inject_pending)))
                        sd_dp->aborted = true;
-               if (hipri) {
+               if (polled) {
                        sd_dp->cmpl_ts = ns_to_ktime(ns_from_boot);
                        spin_lock_irqsave(&sqp->qc_lock, iflags);
                        if (!sd_dp->init_poll) {
@@ -7331,7 +7331,7 @@ static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
                        if (kt_from_boot < sd_dp->cmpl_ts)
                                continue;
 
-               } else          /* ignoring non REQ_HIPRI requests */
+               } else          /* ignoring non REQ_POLLED requests */
                        continue;
                devip = (struct sdebug_dev_info *)scp->device->hostdata;
                if (likely(devip))
index b6c86cc..36870b4 100644 (file)
@@ -1979,7 +1979,7 @@ maybe_retry:
 
 static void eh_lock_door_done(struct request *req, blk_status_t status)
 {
-       blk_put_request(req);
+       blk_mq_free_request(req);
 }
 
 /**
@@ -1998,7 +1998,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
        struct request *req;
        struct scsi_request *rq;
 
-       req = blk_get_request(sdev->request_queue, REQ_OP_DRV_IN, 0);
+       req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
                return;
        rq = scsi_req(req);
index 6ff2207..34412ea 100644 (file)
@@ -438,7 +438,7 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk,
                at_head = 1;
 
        ret = -ENOMEM;
-       rq = blk_get_request(sdev->request_queue, writing ?
+       rq = scsi_alloc_request(sdev->request_queue, writing ?
                             REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
@@ -490,7 +490,7 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk,
 out_free_cdb:
        scsi_req_free_cmd(req);
 out_put_request:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return ret;
 }
 
@@ -561,7 +561,7 @@ static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk,
 
        }
 
-       rq = blk_get_request(q, in_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
+       rq = scsi_alloc_request(q, in_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto error_free_buffer;
@@ -634,7 +634,7 @@ static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk,
        }
 
 error:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
 
 error_free_buffer:
        kfree(buffer);
index 5726738..9c2b99e 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/hardirq.h>
 #include <linux/scatterlist.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-integrity.h>
 #include <linux/ratelimit.h>
 #include <asm/unaligned.h>
 
@@ -215,7 +216,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
        struct scsi_request *rq;
        int ret;
 
-       req = blk_get_request(sdev->request_queue,
+       req = scsi_alloc_request(sdev->request_queue,
                        data_direction == DMA_TO_DEVICE ?
                        REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
                        rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0);
@@ -259,7 +260,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
                scsi_normalize_sense(rq->sense, rq->sense_len, sshdr);
        ret = rq->result;
  out:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
        return ret;
 }
@@ -1078,9 +1079,6 @@ EXPORT_SYMBOL(scsi_alloc_sgtables);
  * This function initializes the members of struct scsi_cmnd that must be
  * initialized before request processing starts and that won't be
  * reinitialized if a SCSI command is requeued.
- *
- * Called from inside blk_get_request() for pass-through requests and from
- * inside scsi_init_command() for filesystem requests.
  */
 static void scsi_initialize_rq(struct request *rq)
 {
@@ -1097,6 +1095,18 @@ static void scsi_initialize_rq(struct request *rq)
        cmd->retries = 0;
 }
 
+struct request *scsi_alloc_request(struct request_queue *q,
+               unsigned int op, blk_mq_req_flags_t flags)
+{
+       struct request *rq;
+
+       rq = blk_mq_alloc_request(q, op, flags);
+       if (!IS_ERR(rq))
+               scsi_initialize_rq(rq);
+       return rq;
+}
+EXPORT_SYMBOL_GPL(scsi_alloc_request);
+
 /*
  * Only called when the request isn't completed by SCSI, and not freed by
  * SCSI
@@ -1783,7 +1793,7 @@ static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
 }
 
 
-static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx)
+static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct Scsi_Host *shost = hctx->driver_data;
 
@@ -1863,7 +1873,6 @@ static const struct blk_mq_ops scsi_mq_ops_no_commit = {
 #endif
        .init_request   = scsi_mq_init_request,
        .exit_request   = scsi_mq_exit_request,
-       .initialize_rq_fn = scsi_initialize_rq,
        .cleanup_rq     = scsi_cleanup_rq,
        .busy           = scsi_mq_lld_busy,
        .map_queues     = scsi_map_queues,
@@ -1893,7 +1902,6 @@ static const struct blk_mq_ops scsi_mq_ops = {
 #endif
        .init_request   = scsi_mq_init_request,
        .exit_request   = scsi_mq_exit_request,
-       .initialize_rq_fn = scsi_initialize_rq,
        .cleanup_rq     = scsi_cleanup_rq,
        .busy           = scsi_mq_lld_busy,
        .map_queues     = scsi_map_queues,
@@ -1959,6 +1967,14 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q)
 
        return sdev;
 }
+/*
+ * pktcdvd should have been integrated into the SCSI layers, but for historical
+ * reasons, like the old IDE driver, it isn't.  This export allows it to safely
+ * probe if a given device is a SCSI one and only attach to that.
+ */
+#ifdef CONFIG_CDROM_PKTCDVD_MODULE
+EXPORT_SYMBOL_GPL(scsi_device_from_queue);
+#endif
 
 /**
  * scsi_block_requests - Utility function used by low-level drivers to prevent
index fe22191..2808c0c 100644 (file)
@@ -280,7 +280,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
        sdev->request_queue = q;
        q->queuedata = sdev;
        __scsi_init_queue(sdev->host, q);
-       blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
        WARN_ON_ONCE(!blk_get_queue(q));
 
        depth = sdev->host->cmd_per_lun ?: 1;
index fce6333..252e43d 100644 (file)
@@ -48,6 +48,7 @@
 #include <linux/blkpg.h>
 #include <linux/blk-pm.h>
 #include <linux/delay.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/string_helpers.h>
 #include <linux/async.h>
@@ -1756,6 +1757,44 @@ static void sd_rescan(struct device *dev)
        sd_revalidate_disk(sdkp->disk);
 }
 
+static int sd_get_unique_id(struct gendisk *disk, u8 id[16],
+               enum blk_unique_id type)
+{
+       struct scsi_device *sdev = scsi_disk(disk)->device;
+       const struct scsi_vpd *vpd;
+       const unsigned char *d;
+       int ret = -ENXIO, len;
+
+       rcu_read_lock();
+       vpd = rcu_dereference(sdev->vpd_pg83);
+       if (!vpd)
+               goto out_unlock;
+
+       ret = -EINVAL;
+       for (d = vpd->data + 4; d < vpd->data + vpd->len; d += d[3] + 4) {
+               /* we only care about designators with LU association */
+               if (((d[1] >> 4) & 0x3) != 0x00)
+                       continue;
+               if ((d[1] & 0xf) != type)
+                       continue;
+
+               /*
+                * Only exit early if a 16-byte descriptor was found.  Otherwise
+                * keep looking as one with more entropy might still show up.
+                */
+               len = d[3];
+               if (len != 8 && len != 12 && len != 16)
+                       continue;
+               ret = len;
+               memcpy(id, d + 4, len);
+               if (len == 16)
+                       break;
+       }
+out_unlock:
+       rcu_read_unlock();
+       return ret;
+}
+
 static char sd_pr_type(enum pr_type type)
 {
        switch (type) {
@@ -1860,6 +1899,7 @@ static const struct block_device_operations sd_fops = {
        .check_events           = sd_check_events,
        .unlock_native_capacity = sd_unlock_native_capacity,
        .report_zones           = sd_zbc_report_zones,
+       .get_unique_id          = sd_get_unique_id,
        .pr_ops                 = &sd_pr_ops,
 };
 
@@ -3087,6 +3127,86 @@ static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer)
                sdkp->security = 1;
 }
 
+static inline sector_t sd64_to_sectors(struct scsi_disk *sdkp, u8 *buf)
+{
+       return logical_to_sectors(sdkp->device, get_unaligned_be64(buf));
+}
+
+/**
+ * sd_read_cpr - Query concurrent positioning ranges
+ * @sdkp:      disk to query
+ */
+static void sd_read_cpr(struct scsi_disk *sdkp)
+{
+       struct blk_independent_access_ranges *iars = NULL;
+       unsigned char *buffer = NULL;
+       unsigned int nr_cpr = 0;
+       int i, vpd_len, buf_len = SD_BUF_SIZE;
+       u8 *desc;
+
+       /*
+        * We need to have the capacity set first for the block layer to be
+        * able to check the ranges.
+        */
+       if (sdkp->first_scan)
+               return;
+
+       if (!sdkp->capacity)
+               goto out;
+
+       /*
+        * Concurrent Positioning Ranges VPD: there can be at most 256 ranges,
+        * leading to a maximum page size of 64 + 256*32 bytes.
+        */
+       buf_len = 64 + 256*32;
+       buffer = kmalloc(buf_len, GFP_KERNEL);
+       if (!buffer || scsi_get_vpd_page(sdkp->device, 0xb9, buffer, buf_len))
+               goto out;
+
+       /* We must have at least a 64B header and one 32B range descriptor */
+       vpd_len = get_unaligned_be16(&buffer[2]) + 3;
+       if (vpd_len > buf_len || vpd_len < 64 + 32 || (vpd_len & 31)) {
+               sd_printk(KERN_ERR, sdkp,
+                         "Invalid Concurrent Positioning Ranges VPD page\n");
+               goto out;
+       }
+
+       nr_cpr = (vpd_len - 64) / 32;
+       if (nr_cpr == 1) {
+               nr_cpr = 0;
+               goto out;
+       }
+
+       iars = disk_alloc_independent_access_ranges(sdkp->disk, nr_cpr);
+       if (!iars) {
+               nr_cpr = 0;
+               goto out;
+       }
+
+       desc = &buffer[64];
+       for (i = 0; i < nr_cpr; i++, desc += 32) {
+               if (desc[0] != i) {
+                       sd_printk(KERN_ERR, sdkp,
+                               "Invalid Concurrent Positioning Range number\n");
+                       nr_cpr = 0;
+                       break;
+               }
+
+               iars->ia_range[i].sector = sd64_to_sectors(sdkp, desc + 8);
+               iars->ia_range[i].nr_sectors = sd64_to_sectors(sdkp, desc + 16);
+       }
+
+out:
+       disk_set_independent_access_ranges(sdkp->disk, iars);
+       if (nr_cpr && sdkp->nr_actuators != nr_cpr) {
+               sd_printk(KERN_NOTICE, sdkp,
+                         "%u concurrent positioning ranges\n", nr_cpr);
+               sdkp->nr_actuators = nr_cpr;
+       }
+
+       kfree(buffer);
+}
+
 /*
  * Determine the device's preferred I/O size for reads and writes
  * unless the reported value is unreasonably small, large, not a
@@ -3202,6 +3322,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
                sd_read_app_tag_own(sdkp, buffer);
                sd_read_write_same(sdkp, buffer);
                sd_read_security(sdkp, buffer);
+               sd_read_cpr(sdkp);
        }
 
        /*
index b59136c..2e5932b 100644 (file)
@@ -106,6 +106,7 @@ struct scsi_disk {
        u8              protection_type;/* Data Integrity Field */
        u8              provisioning_mode;
        u8              zeroing_mode;
+       u8              nr_actuators;           /* Number of actuators */
        unsigned        ATO : 1;        /* state of disk ATO bit */
        unsigned        cache_override : 1; /* temp override of WCE,RCD */
        unsigned        WCE : 1;        /* state of disk WCE bit */
index 4cadb26..3499506 100644 (file)
@@ -6,7 +6,7 @@
  * Written by: Martin K. Petersen <martin.petersen@oracle.com>
  */
 
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/t10-pi.h>
 
 #include <scsi/scsi.h>
index 8f05248..141099a 100644 (file)
@@ -31,6 +31,7 @@ static int sg_version_num = 30536;    /* 2 digits for each component */
 #include <linux/errno.h>
 #include <linux/mtio.h>
 #include <linux/ioctl.h>
+#include <linux/major.h>
 #include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/init.h>
@@ -814,7 +815,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
        if (atomic_read(&sdp->detaching)) {
                if (srp->bio) {
                        scsi_req_free_cmd(scsi_req(srp->rq));
-                       blk_put_request(srp->rq);
+                       blk_mq_free_request(srp->rq);
                        srp->rq = NULL;
                }
 
@@ -1389,7 +1390,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status)
         */
        srp->rq = NULL;
        scsi_req_free_cmd(scsi_req(rq));
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
 
        write_lock_irqsave(&sfp->rq_list_lock, iflags);
        if (unlikely(srp->orphan)) {
@@ -1717,13 +1718,13 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
         *
         * With scsi-mq enabled, there are a fixed number of preallocated
         * requests equal in number to shost->can_queue.  If all of the
-        * preallocated requests are already in use, then blk_get_request()
+        * preallocated requests are already in use, then scsi_alloc_request()
         * will sleep until an active command completes, freeing up a request.
         * Although waiting in an asynchronous interface is less than ideal, we
         * do not want to use BLK_MQ_REQ_NOWAIT here because userspace might
         * not expect an EWOULDBLOCK from this condition.
         */
-       rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
+       rq = scsi_alloc_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
                        REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq)) {
                kfree(long_cmdp);
@@ -1829,7 +1830,7 @@ sg_finish_rem_req(Sg_request *srp)
 
        if (srp->rq) {
                scsi_req_free_cmd(scsi_req(srp->rq));
-               blk_put_request(srp->rq);
+               blk_mq_free_request(srp->rq);
        }
 
        if (srp->res_used)
index 8b17b35..3009b98 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/cdrom.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/major.h>
 #include <linux/blkdev.h>
 #include <linux/blk-pm.h>
 #include <linux/mutex.h>
@@ -966,7 +967,7 @@ static int sr_read_cdda_bpc(struct cdrom_device_info *cdi, void __user *ubuf,
        struct bio *bio;
        int ret;
 
-       rq = blk_get_request(disk->queue, REQ_OP_DRV_IN, 0);
+       rq = scsi_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        req = scsi_req(rq);
@@ -1002,7 +1003,7 @@ static int sr_read_cdda_bpc(struct cdrom_device_info *cdi, void __user *ubuf,
        if (blk_rq_unmap_user(bio))
                ret = -EFAULT;
 out_put_request:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return ret;
 }
 
index ae8636d..c2d5608 100644 (file)
@@ -32,6 +32,7 @@ static const char *verstr = "20160209";
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/mtio.h>
+#include <linux/major.h>
 #include <linux/cdrom.h>
 #include <linux/ioctl.h>
 #include <linux/fcntl.h>
@@ -529,7 +530,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status)
                complete(SRpnt->waiting);
 
        blk_rq_unmap_user(tmp);
-       blk_put_request(req);
+       blk_mq_free_request(req);
 }
 
 static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
@@ -542,7 +543,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
        int err = 0;
        struct scsi_tape *STp = SRpnt->stp;
 
-       req = blk_get_request(SRpnt->stp->device->request_queue,
+       req = scsi_alloc_request(SRpnt->stp->device->request_queue,
                        data_direction == DMA_TO_DEVICE ?
                        REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
@@ -556,7 +557,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
                err = blk_rq_map_user(req->q, req, mdata, NULL, bufflen,
                                      GFP_KERNEL);
                if (err) {
-                       blk_put_request(req);
+                       blk_mq_free_request(req);
                        return err;
                }
        }
index a14dd8c..bb2dd79 100644 (file)
@@ -642,9 +642,9 @@ static int exynos_ufs_pre_pwr_mode(struct ufs_hba *hba,
        }
 
        /* setting for three timeout values for traffic class #0 */
-       ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA0), 8064);
-       ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA1), 28224);
-       ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA2), 20160);
+       ufshcd_dme_set(hba, UIC_ARG_MIB(DL_FC0PROTTIMEOUTVAL), 8064);
+       ufshcd_dme_set(hba, UIC_ARG_MIB(DL_TC0REPLAYTIMEOUTVAL), 28224);
+       ufshcd_dme_set(hba, UIC_ARG_MIB(DL_AFC0REQTIMEOUTVAL), 20160);
 
        return 0;
 out:
index d70cdcd..67402ba 100644 (file)
@@ -48,11 +48,12 @@ out:
        return err;
 }
 
-static int ufshcd_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
+static int ufshcd_crypto_keyslot_program(struct blk_crypto_profile *profile,
                                         const struct blk_crypto_key *key,
                                         unsigned int slot)
 {
-       struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm);
+       struct ufs_hba *hba =
+               container_of(profile, struct ufs_hba, crypto_profile);
        const union ufs_crypto_cap_entry *ccap_array = hba->crypto_cap_array;
        const struct ufs_crypto_alg_entry *alg =
                        &ufs_crypto_algs[key->crypto_cfg.crypto_mode];
@@ -105,11 +106,12 @@ static int ufshcd_clear_keyslot(struct ufs_hba *hba, int slot)
        return ufshcd_program_key(hba, &cfg, slot);
 }
 
-static int ufshcd_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int ufshcd_crypto_keyslot_evict(struct blk_crypto_profile *profile,
                                       const struct blk_crypto_key *key,
                                       unsigned int slot)
 {
-       struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm);
+       struct ufs_hba *hba =
+               container_of(profile, struct ufs_hba, crypto_profile);
 
        return ufshcd_clear_keyslot(hba, slot);
 }
@@ -120,11 +122,11 @@ bool ufshcd_crypto_enable(struct ufs_hba *hba)
                return false;
 
        /* Reset might clear all keys, so reprogram all the keys. */
-       blk_ksm_reprogram_all_keys(&hba->ksm);
+       blk_crypto_reprogram_all_keys(&hba->crypto_profile);
        return true;
 }
 
-static const struct blk_ksm_ll_ops ufshcd_ksm_ops = {
+static const struct blk_crypto_ll_ops ufshcd_crypto_ops = {
        .keyslot_program        = ufshcd_crypto_keyslot_program,
        .keyslot_evict          = ufshcd_crypto_keyslot_evict,
 };
@@ -179,15 +181,16 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
        }
 
        /* The actual number of configurations supported is (CFGC+1) */
-       err = devm_blk_ksm_init(hba->dev, &hba->ksm,
-                               hba->crypto_capabilities.config_count + 1);
+       err = devm_blk_crypto_profile_init(
+                       hba->dev, &hba->crypto_profile,
+                       hba->crypto_capabilities.config_count + 1);
        if (err)
                goto out;
 
-       hba->ksm.ksm_ll_ops = ufshcd_ksm_ops;
+       hba->crypto_profile.ll_ops = ufshcd_crypto_ops;
        /* UFS only supports 8 bytes for any DUN */
-       hba->ksm.max_dun_bytes_supported = 8;
-       hba->ksm.dev = hba->dev;
+       hba->crypto_profile.max_dun_bytes_supported = 8;
+       hba->crypto_profile.dev = hba->dev;
 
        /*
         * Cache all the UFS crypto capabilities and advertise the supported
@@ -202,7 +205,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
                blk_mode_num = ufshcd_find_blk_crypto_mode(
                                                hba->crypto_cap_array[cap_idx]);
                if (blk_mode_num != BLK_ENCRYPTION_MODE_INVALID)
-                       hba->ksm.crypto_modes_supported[blk_mode_num] |=
+                       hba->crypto_profile.modes_supported[blk_mode_num] |=
                                hba->crypto_cap_array[cap_idx].sdus_mask * 512;
        }
 
@@ -230,9 +233,8 @@ void ufshcd_init_crypto(struct ufs_hba *hba)
                ufshcd_clear_keyslot(hba, slot);
 }
 
-void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
-                                           struct request_queue *q)
+void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q)
 {
        if (hba->caps & UFSHCD_CAP_CRYPTO)
-               blk_ksm_register(&hba->ksm, q);
+               blk_crypto_register(&hba->crypto_profile, q);
 }
index 78a58e7..e18c012 100644 (file)
@@ -18,7 +18,7 @@ static inline void ufshcd_prepare_lrbp_crypto(struct request *rq,
                return;
        }
 
-       lrbp->crypto_key_slot = blk_ksm_get_slot_idx(rq->crypt_keyslot);
+       lrbp->crypto_key_slot = blk_crypto_keyslot_index(rq->crypt_keyslot);
        lrbp->data_unit_num = rq->crypt_ctx->bc_dun[0];
 }
 
@@ -40,8 +40,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba);
 
 void ufshcd_init_crypto(struct ufs_hba *hba);
 
-void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
-                                           struct request_queue *q);
+void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q);
 
 #else /* CONFIG_SCSI_UFS_CRYPTO */
 
@@ -64,8 +63,8 @@ static inline int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
 
 static inline void ufshcd_init_crypto(struct ufs_hba *hba) { }
 
-static inline void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
-                                               struct request_queue *q) { }
+static inline void ufshcd_crypto_register(struct ufs_hba *hba,
+                                         struct request_queue *q) { }
 
 #endif /* CONFIG_SCSI_UFS_CRYPTO */
 
index 95be7ec..db1bc86 100644 (file)
@@ -2737,12 +2737,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 
        lrbp->req_abort_skip = false;
 
-       err = ufshpb_prep(hba, lrbp);
-       if (err == -EAGAIN) {
-               lrbp->cmd = NULL;
-               ufshcd_release(hba);
-               goto out;
-       }
+       ufshpb_prep(hba, lrbp);
 
        ufshcd_comp_scsi_upiu(hba, lrbp);
 
@@ -2925,7 +2920,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
         * Even though we use wait_event() which sleeps indefinitely,
         * the maximum wait time is bounded by SCSI request timeout.
         */
-       req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
+       req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out_unlock;
@@ -2952,7 +2947,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
                                    (struct utp_upiu_req *)lrbp->ucd_rsp_ptr);
 
 out:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 out_unlock:
        up_read(&hba->clk_scaling_lock);
        return err;
@@ -4986,7 +4981,7 @@ static int ufshcd_slave_configure(struct scsi_device *sdev)
        else if (ufshcd_is_rpm_autosuspend_allowed(hba))
                sdev->rpm_autosuspend = 1;
 
-       ufshcd_crypto_setup_rq_keyslot_manager(hba, q);
+       ufshcd_crypto_register(hba, q);
 
        return 0;
 }
@@ -6517,9 +6512,9 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba,
        int task_tag, err;
 
        /*
-        * blk_get_request() is used here only to get a free tag.
+        * blk_mq_alloc_request() is used here only to get a free tag.
         */
-       req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
+       req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
        if (IS_ERR(req))
                return PTR_ERR(req);
 
@@ -6575,7 +6570,7 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba,
        spin_unlock_irqrestore(hba->host->host_lock, flags);
 
        ufshcd_release(hba);
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
        return err;
 }
@@ -6660,7 +6655,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 
        down_read(&hba->clk_scaling_lock);
 
-       req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
+       req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out_unlock;
@@ -6741,7 +6736,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
                                    (struct utp_upiu_req *)lrbp->ucd_rsp_ptr);
 
 out:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 out_unlock:
        up_read(&hba->clk_scaling_lock);
        return err;
@@ -7912,7 +7907,7 @@ static void ufshcd_request_sense_done(struct request *rq, blk_status_t error)
        if (error != BLK_STS_OK)
                pr_err("%s: REQUEST SENSE failed (%d)\n", __func__, error);
        kfree(rq->end_io_data);
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
 }
 
 static int
@@ -7932,7 +7927,7 @@ ufshcd_request_sense_async(struct ufs_hba *hba, struct scsi_device *sdev)
        if (!buffer)
                return -ENOMEM;
 
-       req = blk_get_request(sdev->request_queue, REQ_OP_DRV_IN,
+       req = blk_mq_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
                              /*flags=*/BLK_MQ_REQ_PM);
        if (IS_ERR(req)) {
                ret = PTR_ERR(req);
@@ -7957,7 +7952,7 @@ ufshcd_request_sense_async(struct ufs_hba *hba, struct scsi_device *sdev)
        return 0;
 
 out_put:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 out_free:
        kfree(buffer);
        return ret;
index 41f6e06..62bdc41 100644 (file)
@@ -32,7 +32,7 @@
 #include <linux/regulator/consumer.h>
 #include <linux/bitfield.h>
 #include <linux/devfreq.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 #include "unipro.h"
 
 #include <asm/irq.h>
@@ -766,7 +766,7 @@ struct ufs_hba_monitor {
  * @crypto_capabilities: Content of crypto capabilities register (0x100)
  * @crypto_cap_array: Array of crypto capabilities
  * @crypto_cfg_register: Start of the crypto cfg array
- * @ksm: the keyslot manager tied to this hba
+ * @crypto_profile: the crypto profile of this hba (if applicable)
  */
 struct ufs_hba {
        void __iomem *mmio_base;
@@ -911,7 +911,7 @@ struct ufs_hba {
        union ufs_crypto_capabilities crypto_capabilities;
        union ufs_crypto_cap_entry *crypto_cap_array;
        u32 crypto_cfg_register;
-       struct blk_keyslot_manager ksm;
+       struct blk_crypto_profile crypto_profile;
 #endif
 #ifdef CONFIG_DEBUG_FS
        struct dentry *debugfs_root;
index 589af5f..182bcbf 100644 (file)
@@ -84,16 +84,6 @@ static bool ufshpb_is_supported_chunk(struct ufshpb_lu *hpb, int transfer_len)
        return transfer_len <= hpb->pre_req_max_tr_len;
 }
 
-/*
- * In this driver, WRITE_BUFFER CMD support 36KB (len=9) ~ 1MB (len=256) as
- * default. It is possible to change range of transfer_len through sysfs.
- */
-static inline bool ufshpb_is_required_wb(struct ufshpb_lu *hpb, int len)
-{
-       return len > hpb->pre_req_min_tr_len &&
-              len <= hpb->pre_req_max_tr_len;
-}
-
 static bool ufshpb_is_general_lun(int lun)
 {
        return lun < UFS_UPIU_MAX_UNIT_NUM_ID;
@@ -334,7 +324,7 @@ ufshpb_get_pos_from_lpn(struct ufshpb_lu *hpb, unsigned long lpn, int *rgn_idx,
 
 static void
 ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
-                           __be64 ppn, u8 transfer_len, int read_id)
+                           __be64 ppn, u8 transfer_len)
 {
        unsigned char *cdb = lrbp->cmd->cmnd;
        __be64 ppn_tmp = ppn;
@@ -346,256 +336,11 @@ ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
        /* ppn value is stored as big-endian in the host memory */
        memcpy(&cdb[6], &ppn_tmp, sizeof(__be64));
        cdb[14] = transfer_len;
-       cdb[15] = read_id;
+       cdb[15] = 0;
 
        lrbp->cmd->cmd_len = UFS_CDB_SIZE;
 }
 
-static inline void ufshpb_set_write_buf_cmd(unsigned char *cdb,
-                                           unsigned long lpn, unsigned int len,
-                                           int read_id)
-{
-       cdb[0] = UFSHPB_WRITE_BUFFER;
-       cdb[1] = UFSHPB_WRITE_BUFFER_PREFETCH_ID;
-
-       put_unaligned_be32(lpn, &cdb[2]);
-       cdb[6] = read_id;
-       put_unaligned_be16(len * HPB_ENTRY_SIZE, &cdb[7]);
-
-       cdb[9] = 0x00;  /* Control = 0x00 */
-}
-
-static struct ufshpb_req *ufshpb_get_pre_req(struct ufshpb_lu *hpb)
-{
-       struct ufshpb_req *pre_req;
-
-       if (hpb->num_inflight_pre_req >= hpb->throttle_pre_req) {
-               dev_info(&hpb->sdev_ufs_lu->sdev_dev,
-                        "pre_req throttle. inflight %d throttle %d",
-                        hpb->num_inflight_pre_req, hpb->throttle_pre_req);
-               return NULL;
-       }
-
-       pre_req = list_first_entry_or_null(&hpb->lh_pre_req_free,
-                                          struct ufshpb_req, list_req);
-       if (!pre_req) {
-               dev_info(&hpb->sdev_ufs_lu->sdev_dev, "There is no pre_req");
-               return NULL;
-       }
-
-       list_del_init(&pre_req->list_req);
-       hpb->num_inflight_pre_req++;
-
-       return pre_req;
-}
-
-static inline void ufshpb_put_pre_req(struct ufshpb_lu *hpb,
-                                     struct ufshpb_req *pre_req)
-{
-       pre_req->req = NULL;
-       bio_reset(pre_req->bio);
-       list_add_tail(&pre_req->list_req, &hpb->lh_pre_req_free);
-       hpb->num_inflight_pre_req--;
-}
-
-static void ufshpb_pre_req_compl_fn(struct request *req, blk_status_t error)
-{
-       struct ufshpb_req *pre_req = (struct ufshpb_req *)req->end_io_data;
-       struct ufshpb_lu *hpb = pre_req->hpb;
-       unsigned long flags;
-
-       if (error) {
-               struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
-               struct scsi_sense_hdr sshdr;
-
-               dev_err(&hpb->sdev_ufs_lu->sdev_dev, "block status %d", error);
-               scsi_command_normalize_sense(cmd, &sshdr);
-               dev_err(&hpb->sdev_ufs_lu->sdev_dev,
-                       "code %x sense_key %x asc %x ascq %x",
-                       sshdr.response_code,
-                       sshdr.sense_key, sshdr.asc, sshdr.ascq);
-               dev_err(&hpb->sdev_ufs_lu->sdev_dev,
-                       "byte4 %x byte5 %x byte6 %x additional_len %x",
-                       sshdr.byte4, sshdr.byte5,
-                       sshdr.byte6, sshdr.additional_length);
-       }
-
-       blk_mq_free_request(req);
-       spin_lock_irqsave(&hpb->rgn_state_lock, flags);
-       ufshpb_put_pre_req(pre_req->hpb, pre_req);
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-}
-
-static int ufshpb_prep_entry(struct ufshpb_req *pre_req, struct page *page)
-{
-       struct ufshpb_lu *hpb = pre_req->hpb;
-       struct ufshpb_region *rgn;
-       struct ufshpb_subregion *srgn;
-       __be64 *addr;
-       int offset = 0;
-       int copied;
-       unsigned long lpn = pre_req->wb.lpn;
-       int rgn_idx, srgn_idx, srgn_offset;
-       unsigned long flags;
-
-       addr = page_address(page);
-       ufshpb_get_pos_from_lpn(hpb, lpn, &rgn_idx, &srgn_idx, &srgn_offset);
-
-       spin_lock_irqsave(&hpb->rgn_state_lock, flags);
-
-next_offset:
-       rgn = hpb->rgn_tbl + rgn_idx;
-       srgn = rgn->srgn_tbl + srgn_idx;
-
-       if (!ufshpb_is_valid_srgn(rgn, srgn))
-               goto mctx_error;
-
-       if (!srgn->mctx)
-               goto mctx_error;
-
-       copied = ufshpb_fill_ppn_from_page(hpb, srgn->mctx, srgn_offset,
-                                          pre_req->wb.len - offset,
-                                          &addr[offset]);
-
-       if (copied < 0)
-               goto mctx_error;
-
-       offset += copied;
-       srgn_offset += copied;
-
-       if (srgn_offset == hpb->entries_per_srgn) {
-               srgn_offset = 0;
-
-               if (++srgn_idx == hpb->srgns_per_rgn) {
-                       srgn_idx = 0;
-                       rgn_idx++;
-               }
-       }
-
-       if (offset < pre_req->wb.len)
-               goto next_offset;
-
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-       return 0;
-mctx_error:
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-       return -ENOMEM;
-}
-
-static int ufshpb_pre_req_add_bio_page(struct ufshpb_lu *hpb,
-                                      struct request_queue *q,
-                                      struct ufshpb_req *pre_req)
-{
-       struct page *page = pre_req->wb.m_page;
-       struct bio *bio = pre_req->bio;
-       int entries_bytes, ret;
-
-       if (!page)
-               return -ENOMEM;
-
-       if (ufshpb_prep_entry(pre_req, page))
-               return -ENOMEM;
-
-       entries_bytes = pre_req->wb.len * sizeof(__be64);
-
-       ret = bio_add_pc_page(q, bio, page, entries_bytes, 0);
-       if (ret != entries_bytes) {
-               dev_err(&hpb->sdev_ufs_lu->sdev_dev,
-                       "bio_add_pc_page fail: %d", ret);
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-static inline int ufshpb_get_read_id(struct ufshpb_lu *hpb)
-{
-       if (++hpb->cur_read_id >= MAX_HPB_READ_ID)
-               hpb->cur_read_id = 1;
-       return hpb->cur_read_id;
-}
-
-static int ufshpb_execute_pre_req(struct ufshpb_lu *hpb, struct scsi_cmnd *cmd,
-                                 struct ufshpb_req *pre_req, int read_id)
-{
-       struct scsi_device *sdev = cmd->device;
-       struct request_queue *q = sdev->request_queue;
-       struct request *req;
-       struct scsi_request *rq;
-       struct bio *bio = pre_req->bio;
-
-       pre_req->hpb = hpb;
-       pre_req->wb.lpn = sectors_to_logical(cmd->device,
-                                            blk_rq_pos(scsi_cmd_to_rq(cmd)));
-       pre_req->wb.len = sectors_to_logical(cmd->device,
-                                            blk_rq_sectors(scsi_cmd_to_rq(cmd)));
-       if (ufshpb_pre_req_add_bio_page(hpb, q, pre_req))
-               return -ENOMEM;
-
-       req = pre_req->req;
-
-       /* 1. request setup */
-       blk_rq_append_bio(req, bio);
-       req->rq_disk = NULL;
-       req->end_io_data = (void *)pre_req;
-       req->end_io = ufshpb_pre_req_compl_fn;
-
-       /* 2. scsi_request setup */
-       rq = scsi_req(req);
-       rq->retries = 1;
-
-       ufshpb_set_write_buf_cmd(rq->cmd, pre_req->wb.lpn, pre_req->wb.len,
-                                read_id);
-       rq->cmd_len = scsi_command_size(rq->cmd);
-
-       if (blk_insert_cloned_request(q, req) != BLK_STS_OK)
-               return -EAGAIN;
-
-       hpb->stats.pre_req_cnt++;
-
-       return 0;
-}
-
-static int ufshpb_issue_pre_req(struct ufshpb_lu *hpb, struct scsi_cmnd *cmd,
-                               int *read_id)
-{
-       struct ufshpb_req *pre_req;
-       struct request *req = NULL;
-       unsigned long flags;
-       int _read_id;
-       int ret = 0;
-
-       req = blk_get_request(cmd->device->request_queue,
-                             REQ_OP_DRV_OUT | REQ_SYNC, BLK_MQ_REQ_NOWAIT);
-       if (IS_ERR(req))
-               return -EAGAIN;
-
-       spin_lock_irqsave(&hpb->rgn_state_lock, flags);
-       pre_req = ufshpb_get_pre_req(hpb);
-       if (!pre_req) {
-               ret = -EAGAIN;
-               goto unlock_out;
-       }
-       _read_id = ufshpb_get_read_id(hpb);
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-
-       pre_req->req = req;
-
-       ret = ufshpb_execute_pre_req(hpb, cmd, pre_req, _read_id);
-       if (ret)
-               goto free_pre_req;
-
-       *read_id = _read_id;
-
-       return ret;
-free_pre_req:
-       spin_lock_irqsave(&hpb->rgn_state_lock, flags);
-       ufshpb_put_pre_req(hpb, pre_req);
-unlock_out:
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-       blk_put_request(req);
-       return ret;
-}
-
 /*
  * This function will set up HPB read command using host-side L2P map data.
  */
@@ -609,7 +354,6 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
        __be64 ppn;
        unsigned long flags;
        int transfer_len, rgn_idx, srgn_idx, srgn_offset;
-       int read_id = 0;
        int err = 0;
 
        hpb = ufshpb_get_hpb_data(cmd->device);
@@ -685,24 +429,8 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
                dev_err(hba->dev, "get ppn failed. err %d\n", err);
                return err;
        }
-       if (!ufshpb_is_legacy(hba) &&
-           ufshpb_is_required_wb(hpb, transfer_len)) {
-               err = ufshpb_issue_pre_req(hpb, cmd, &read_id);
-               if (err) {
-                       unsigned long timeout;
-
-                       timeout = cmd->jiffies_at_alloc + msecs_to_jiffies(
-                                 hpb->params.requeue_timeout_ms);
-
-                       if (time_before(jiffies, timeout))
-                               return -EAGAIN;
-
-                       hpb->stats.miss_cnt++;
-                       return 0;
-               }
-       }
 
-       ufshpb_set_hpb_read_to_upiu(hba, lrbp, ppn, transfer_len, read_id);
+       ufshpb_set_hpb_read_to_upiu(hba, lrbp, ppn, transfer_len);
 
        hpb->stats.hit_cnt++;
        return 0;
@@ -721,7 +449,7 @@ static struct ufshpb_req *ufshpb_get_req(struct ufshpb_lu *hpb,
                return NULL;
 
 retry:
-       req = blk_get_request(hpb->sdev_ufs_lu->request_queue, dir,
+       req = blk_mq_alloc_request(hpb->sdev_ufs_lu->request_queue, dir,
                              BLK_MQ_REQ_NOWAIT);
 
        if (!atomic && (PTR_ERR(req) == -EWOULDBLOCK) && (--retries > 0)) {
@@ -745,7 +473,7 @@ free_rq:
 
 static void ufshpb_put_req(struct ufshpb_lu *hpb, struct ufshpb_req *rq)
 {
-       blk_put_request(rq->req);
+       blk_mq_free_request(rq->req);
        kmem_cache_free(hpb->map_req_cache, rq);
 }
 
@@ -1841,16 +1569,11 @@ static void ufshpb_lu_parameter_init(struct ufs_hba *hba,
        u32 entries_per_rgn;
        u64 rgn_mem_size, tmp;
 
-       /* for pre_req */
-       hpb->pre_req_min_tr_len = hpb_dev_info->max_hpb_single_cmd + 1;
-
        if (ufshpb_is_legacy(hba))
                hpb->pre_req_max_tr_len = HPB_LEGACY_CHUNK_HIGH;
        else
                hpb->pre_req_max_tr_len = HPB_MULTI_CHUNK_HIGH;
 
-       hpb->cur_read_id = 0;
-
        hpb->lu_pinned_start = hpb_lu_info->pinned_start;
        hpb->lu_pinned_end = hpb_lu_info->num_pinned ?
                (hpb_lu_info->pinned_start + hpb_lu_info->num_pinned - 1)
index a79e073..f15d8fd 100644 (file)
@@ -241,8 +241,6 @@ struct ufshpb_lu {
        spinlock_t param_lock;
 
        struct list_head lh_pre_req_free;
-       int cur_read_id;
-       int pre_req_min_tr_len;
        int pre_req_max_tr_len;
 
        /* cached L2P map management worker */
index 07d0250..b8455fc 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/virtio_scsi.h>
 #include <linux/cpu.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_cmnd.h>
index 44fc9ee..ca40923 100644 (file)
@@ -134,7 +134,7 @@ static int dfl_spi_altera_probe(struct dfl_device *dfl_dev)
        if (!master)
                return -ENOMEM;
 
-       master->bus_num = dfl_dev->id;
+       master->bus_num = -1;
 
        hw = spi_master_get_devdata(master);
 
index f7a7c14..65147aa 100644 (file)
@@ -48,7 +48,7 @@ static int altera_spi_probe(struct platform_device *pdev)
                return err;
 
        /* setup the master state. */
-       master->bus_num = pdev->id;
+       master->bus_num = -1;
 
        if (pdata) {
                if (pdata->num_chipselect > ALTERA_SPI_MAX_CS) {
index feebda6..e4484ac 100644 (file)
@@ -1716,12 +1716,13 @@ static int verify_controller_parameters(struct pl022 *pl022,
                                return -EINVAL;
                        }
                } else {
-                       if (chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX)
+                       if (chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX) {
                                dev_err(&pl022->adev->dev,
                                        "Microwire half duplex mode requested,"
                                        " but this is only available in the"
                                        " ST version of PL022\n");
-                       return -EINVAL;
+                               return -EINVAL;
+                       }
                }
        }
        return 0;
index ef4a8e1..8190b84 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/falloc.h>
 #include <linux/uio.h>
+#include <linux/scatterlist.h>
 #include <scsi/scsi_proto.h>
 #include <asm/unaligned.h>
 
@@ -244,7 +245,7 @@ struct target_core_file_cmd {
        struct bio_vec  bvecs[];
 };
 
-static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+static void cmd_rw_aio_complete(struct kiocb *iocb, long ret)
 {
        struct target_core_file_cmd *cmd;
 
@@ -302,7 +303,7 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
                ret = call_read_iter(file, &aio_cmd->iocb, &iter);
 
        if (ret != -EIOCBQUEUED)
-               cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0);
+               cmd_rw_aio_complete(&aio_cmd->iocb, ret);
 
        return 0;
 }
index 4069a1e..b1ef041 100644 (file)
 #include <linux/timer.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/bio.h>
 #include <linux/genhd.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/scatterlist.h>
 #include <scsi/scsi_proto.h>
 #include <asm/unaligned.h>
 
@@ -230,9 +232,9 @@ static unsigned long long iblock_emulate_read_cap_with_block_size(
        struct block_device *bd,
        struct request_queue *q)
 {
-       unsigned long long blocks_long = (div_u64(i_size_read(bd->bd_inode),
-                                       bdev_logical_block_size(bd)) - 1);
        u32 block_size = bdev_logical_block_size(bd);
+       unsigned long long blocks_long =
+               div_u64(bdev_nr_bytes(bd), block_size) - 1;
 
        if (block_size == dev->dev_attrib.block_size)
                return blocks_long;
index 75ef52f..7fa57fb 100644 (file)
@@ -980,11 +980,10 @@ pscsi_execute_cmd(struct se_cmd *cmd)
        memcpy(pt->pscsi_cdb, cmd->t_task_cdb,
                scsi_command_size(cmd->t_task_cdb));
 
-       req = blk_get_request(pdv->pdv_sd->request_queue,
+       req = scsi_alloc_request(pdv->pdv_sd->request_queue,
                        cmd->data_direction == DMA_TO_DEVICE ?
                        REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(req)) {
-               pr_err("PSCSI: blk_get_request() failed\n");
                ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
                goto fail;
        }
@@ -1012,7 +1011,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
        return 0;
 
 fail_put_request:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 fail:
        kfree(pt);
        return ret;
@@ -1067,7 +1066,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status)
                break;
        }
 
-       blk_put_request(req);
+       blk_mq_free_request(req);
        kfree(pt);
 }
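
(A minimal sketch of the request lifecycle after this conversion; "q" and the
surrounding code are placeholders and header includes are omitted — only
scsi_alloc_request(), IS_ERR() and blk_mq_free_request() are taken from the
hunk above.)

	struct request *req;

	req = scsi_alloc_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);
	/* ... fill in the CDB and issue the passthrough command ... */
	blk_mq_free_request(req);	/* replaces blk_put_request() */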
 
index 8260f38..e20c19a 100644 (file)
@@ -831,7 +831,7 @@ static void ffs_user_copy_worker(struct work_struct *work)
                kthread_unuse_mm(io_data->mm);
        }
 
-       io_data->kiocb->ki_complete(io_data->kiocb, ret, ret);
+       io_data->kiocb->ki_complete(io_data->kiocb, ret);
 
        if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd)
                eventfd_signal(io_data->ffs->ffs_eventfd, 1);
index 539220d..78be947 100644 (file)
@@ -469,7 +469,7 @@ static void ep_user_copy_worker(struct work_struct *work)
                ret = -EFAULT;
 
        /* completing the iocb can drop the ctx and mm, don't touch mm after */
-       iocb->ki_complete(iocb, ret, ret);
+       iocb->ki_complete(iocb, ret);
 
        kfree(priv->buf);
        kfree(priv->to_free);
@@ -496,11 +496,8 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
                kfree(priv->to_free);
                kfree(priv);
                iocb->private = NULL;
-               /* aio_complete() reports bytes-transferred _and_ faults */
-
                iocb->ki_complete(iocb,
-                               req->actual ? req->actual : (long)req->status,
-                               req->status);
+                               req->actual ? req->actual : (long)req->status);
        } else {
                /* ep_copy_to_user() won't report both; we hide some faults */
                if (unlikely(0 != req->status))
index 26e3d90..841667a 100644 (file)
@@ -80,6 +80,7 @@ struct vduse_dev {
        struct vdpa_callback config_cb;
        struct work_struct inject;
        spinlock_t irq_lock;
+       struct rw_semaphore rwsem;
        int minor;
        bool broken;
        bool connected;
@@ -410,6 +411,8 @@ static void vduse_dev_reset(struct vduse_dev *dev)
        if (domain->bounce_map)
                vduse_domain_reset_bounce_map(domain);
 
+       down_write(&dev->rwsem);
+
        dev->status = 0;
        dev->driver_features = 0;
        dev->generation++;
@@ -443,6 +446,8 @@ static void vduse_dev_reset(struct vduse_dev *dev)
                flush_work(&vq->inject);
                flush_work(&vq->kick);
        }
+
+       up_write(&dev->rwsem);
 }
 
 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
@@ -885,6 +890,23 @@ static void vduse_vq_irq_inject(struct work_struct *work)
        spin_unlock_irq(&vq->irq_lock);
 }
 
+static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
+                                   struct work_struct *irq_work)
+{
+       int ret = -EINVAL;
+
+       down_read(&dev->rwsem);
+       if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
+               goto unlock;
+
+       ret = 0;
+       queue_work(vduse_irq_wq, irq_work);
+unlock:
+       up_read(&dev->rwsem);
+
+       return ret;
+}
+
 static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg)
 {
@@ -966,8 +988,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
                break;
        }
        case VDUSE_DEV_INJECT_CONFIG_IRQ:
-               ret = 0;
-               queue_work(vduse_irq_wq, &dev->inject);
+               ret = vduse_dev_queue_irq_work(dev, &dev->inject);
                break;
        case VDUSE_VQ_SETUP: {
                struct vduse_vq_config config;
@@ -1053,9 +1074,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
                if (index >= dev->vq_num)
                        break;
 
-               ret = 0;
                index = array_index_nospec(index, dev->vq_num);
-               queue_work(vduse_irq_wq, &dev->vqs[index].inject);
+               ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
                break;
        }
        default:
@@ -1136,6 +1156,7 @@ static struct vduse_dev *vduse_dev_create(void)
        INIT_LIST_HEAD(&dev->send_list);
        INIT_LIST_HEAD(&dev->recv_list);
        spin_lock_init(&dev->irq_lock);
+       init_rwsem(&dev->rwsem);
 
        INIT_WORK(&dev->inject, vduse_dev_irq_inject);
        init_waitqueue_head(&dev->waitq);
index dd95dfd..3035bb6 100644 (file)
@@ -576,7 +576,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
        /* Last one doesn't continue. */
        desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
        if (!indirect && vq->use_dma_api)
-               vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags =
+               vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
                        ~VRING_DESC_F_NEXT;
 
        if (indirect) {
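
(Illustration only, not code from the patch: the snippet just shows the bit
arithmetic behind the one-character fix above.)

	u16 flags = VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT;

	flags &= ~VRING_DESC_F_NEXT;	/* clears NEXT only; INDIRECT survives */
	/* plain "flags = ~VRING_DESC_F_NEXT" would instead set every bit
	 * except NEXT, corrupting the saved descriptor flags. */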
index 643c6c2..ced2fc0 100644 (file)
@@ -71,8 +71,6 @@
 #define TCOBASE(p)     ((p)->tco_res->start)
 /* SMI Control and Enable Register */
 #define SMI_EN(p)      ((p)->smi_res->start)
-#define TCO_EN         (1 << 13)
-#define GBL_SMI_EN     (1 << 0)
 
 #define TCO_RLD(p)     (TCOBASE(p) + 0x00) /* TCO Timer Reload/Curr. Value */
 #define TCOv1_TMR(p)   (TCOBASE(p) + 0x01) /* TCOv1 Timer Initial Value*/
@@ -357,12 +355,8 @@ static int iTCO_wdt_set_timeout(struct watchdog_device *wd_dev, unsigned int t)
 
        tmrval = seconds_to_ticks(p, t);
 
-       /*
-        * If TCO SMIs are off, the timer counts down twice before rebooting.
-        * Otherwise, the BIOS generally reboots when the SMI triggers.
-        */
-       if (p->smi_res &&
-           (inl(SMI_EN(p)) & (TCO_EN | GBL_SMI_EN)) != (TCO_EN | GBL_SMI_EN))
+       /* For TCO v1 the timer counts down twice before rebooting */
+       if (p->iTCO_version == 1)
                tmrval /= 2;
 
        /* from the specs: */
@@ -527,7 +521,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev)
                 * Disables TCO logic generating an SMI#
                 */
                val32 = inl(SMI_EN(p));
-               val32 &= ~TCO_EN;       /* Turn off SMI clearing watchdog */
+               val32 &= 0xffffdfff;    /* Turn off SMI clearing watchdog */
                outl(val32, SMI_EN(p));
        }
 
index 2693ffb..31b03fa 100644 (file)
@@ -119,7 +119,7 @@ static int ixp4xx_wdt_probe(struct platform_device *pdev)
        iwdt = devm_kzalloc(dev, sizeof(*iwdt), GFP_KERNEL);
        if (!iwdt)
                return -ENOMEM;
-       iwdt->base = dev->platform_data;
+       iwdt->base = (void __iomem *)dev->platform_data;
 
        /*
         * Retrieve rate from a fixed clock from the device tree if
index 1616f93..74d785b 100644 (file)
@@ -268,8 +268,12 @@ static int omap_wdt_probe(struct platform_device *pdev)
                        wdev->wdog.bootstatus = WDIOF_CARDRESET;
        }
 
-       if (!early_enable)
+       if (early_enable) {
+               omap_wdt_start(&wdev->wdog);
+               set_bit(WDOG_HW_RUNNING, &wdev->wdog.status);
+       } else {
                omap_wdt_disable(wdev);
+       }
 
        ret = watchdog_register_device(&wdev->wdog);
        if (ret) {
index ee9ff38..9791c74 100644 (file)
@@ -130,7 +130,7 @@ static u64 sbsa_gwdt_reg_read(struct sbsa_gwdt *gwdt)
        if (gwdt->version == 0)
                return readl(gwdt->control_base + SBSA_GWDT_WOR);
        else
-               return readq(gwdt->control_base + SBSA_GWDT_WOR);
+               return lo_hi_readq(gwdt->control_base + SBSA_GWDT_WOR);
 }
 
 static void sbsa_gwdt_reg_write(u64 val, struct sbsa_gwdt *gwdt)
@@ -138,7 +138,7 @@ static void sbsa_gwdt_reg_write(u64 val, struct sbsa_gwdt *gwdt)
        if (gwdt->version == 0)
                writel((u32)val, gwdt->control_base + SBSA_GWDT_WOR);
        else
-               writeq(val, gwdt->control_base + SBSA_GWDT_WOR);
+               lo_hi_writeq(val, gwdt->control_base + SBSA_GWDT_WOR);
 }
 
 /*
@@ -411,4 +411,3 @@ MODULE_AUTHOR("Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>");
 MODULE_AUTHOR("Al Stone <al.stone@linaro.org>");
 MODULE_AUTHOR("Timur Tabi <timur@codeaurora.org>");
 MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:" DRV_NAME);
index c6c2a51..c609005 100644 (file)
@@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
         * blocks, we will have to change it.
         */
 
-       size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+       size = bdev_nr_sectors(sb->s_bdev);
        pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
 
        affs_set_blocksize(sb, PAGE_SIZE);
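
(The same open-coded bd_inode size reads are converted throughout this series;
a standalone sketch of the two helpers, with "bdev" as a placeholder.)

	loff_t   bytes   = bdev_nr_bytes(bdev);		/* capacity in bytes */
	sector_t sectors = bdev_nr_sectors(bdev);	/* capacity in 512-byte sectors */

	/* sectors == bytes >> SECTOR_SHIFT, i.e. the ">> 9" that used to be
	 * open-coded on i_size_read(bdev->bd_inode) as above. */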
index f24370f..8b1d9c2 100644 (file)
@@ -861,7 +861,8 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  */
 vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
 {
-       struct page *page = thp_head(vmf->page);
+       struct folio *folio = page_folio(vmf->page);
+       struct page *page = &folio->page;
        struct file *file = vmf->vma->vm_file;
        struct inode *inode = file_inode(file);
        struct afs_vnode *vnode = AFS_FS_I(inode);
@@ -884,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
                goto out;
 #endif
 
-       if (wait_on_page_writeback_killable(page))
+       if (folio_wait_writeback_killable(folio))
                goto out;
 
        if (lock_page_killable(page) < 0)
@@ -894,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
         * details the portion of the page we need to write back and we might
         * need to redirty the page if there's a problem.
         */
-       if (wait_on_page_writeback_killable(page) < 0) {
-               unlock_page(page);
+       if (folio_wait_writeback_killable(folio) < 0) {
+               folio_unlock(folio);
                goto out;
        }
 
index 51b08ab..836dc7e 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1417,7 +1417,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb)
        spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 }
 
-static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void aio_complete_rw(struct kiocb *kiocb, long res)
 {
        struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
 
@@ -1437,7 +1437,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
        }
 
        iocb->ki_res.res = res;
-       iocb->ki_res.res2 = res2;
+       iocb->ki_res.res2 = 0;
        iocb_put(iocb);
 }
 
@@ -1508,7 +1508,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
                ret = -EINTR;
                fallthrough;
        default:
-               req->ki_complete(req, ret, 0);
+               req->ki_complete(req, ret);
        }
 }
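
(The same two-argument to one-argument ->ki_complete() conversion repeats in
the target, USB gadget, cachefiles, ceph, cifs and direct-io hunks of this
series; a minimal sketch of a completion site after the change, with "iocb"
and "ret" as placeholders.)

	iocb->ki_complete(iocb, ret);	/* was: iocb->ki_complete(iocb, ret, 0) */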
 
index 1d071c8..32da97c 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/kthread.h>
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
@@ -173,9 +174,10 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
                /* Hash through the page sector by sector */
                for (pg_offset = 0; pg_offset < bytes_left;
                     pg_offset += sectorsize) {
-                       kaddr = page_address(page);
+                       kaddr = kmap_atomic(page);
                        crypto_shash_digest(shash, kaddr + pg_offset,
                                            sectorsize, csum);
+                       kunmap_atomic(kaddr);
 
                        if (memcmp(&csum, cb_sum, csum_size) != 0) {
                                btrfs_print_data_csum_error(inode, disk_start,
index 74c8e18..c3983bd 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/mm.h>
+#include <linux/error-injection.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
index 59ef388..c85a7d4 100644 (file)
@@ -281,8 +281,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
        }
 
 
-       if (i_size_read(bdev->bd_inode) <
-           btrfs_device_get_total_bytes(srcdev)) {
+       if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
                btrfs_err(fs_info,
                          "target device is smaller than source device!");
                ret = -EINVAL;
index c725433..59c3be8 100644 (file)
@@ -3748,7 +3748,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
        else if (ret)
                return ERR_PTR(ret);
 
-       if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+       if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
                return ERR_PTR(-EINVAL);
 
        page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
index 5fec009..b8c911a 100644 (file)
@@ -6,6 +6,7 @@
 #include <crypto/hash.h>
 #include <linux/kernel.h>
 #include <linux/bio.h>
+#include <linux/blk-cgroup.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -287,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);
 
-                       kaddr = page_address(cpage);
+                       kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
+                       kunmap_atomic(kaddr);
 
                        i++;
                        ptr += cur_size;
@@ -8261,7 +8263,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
        return dip;
 }
 
-static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+static void btrfs_submit_direct(const struct iomap_iter *iter,
                struct bio *dio_bio, loff_t file_offset)
 {
        struct inode *inode = iter->inode;
@@ -8291,7 +8293,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
                }
                dio_bio->bi_status = BLK_STS_RESOURCE;
                bio_endio(dio_bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        if (!write) {
@@ -8384,15 +8386,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
 
                free_extent_map(em);
        } while (submit_len > 0);
-       return BLK_QC_T_NONE;
+       return;
 
 out_err_em:
        free_extent_map(em);
 out_err:
        dip->dio_bio->bi_status = status;
        btrfs_dio_private_put(dip);
-
-       return BLK_QC_T_NONE;
 }
 
 const struct iomap_ops btrfs_dio_iomap_ops = {
index 92424a2..02ff085 100644 (file)
@@ -1691,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        }
 
        if (!strcmp(sizestr, "max"))
-               new_size = device->bdev->bd_inode->i_size;
+               new_size = bdev_nr_bytes(device->bdev);
        else {
                if (sizestr[0] == '-') {
                        mod = -1;
@@ -1732,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                ret = -EINVAL;
                goto out_finish;
        }
-       if (new_size > device->bdev->bd_inode->i_size) {
+       if (new_size > bdev_nr_bytes(device->bdev)) {
                ret = -EFBIG;
                goto out_finish;
        }
index 00cffc1..65cb076 100644 (file)
@@ -131,6 +131,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
        u32 sector_bytes_left;
        u32 orig_out;
        struct page *cur_page;
+       char *kaddr;
 
        /*
         * We never allow a segment header crossing sector boundary, previous
@@ -147,7 +148,8 @@ static int copy_compressed_data_to_page(char *compressed_data,
                out_pages[*cur_out / PAGE_SIZE] = cur_page;
        }
 
-       write_compress_length(page_address(cur_page) + offset_in_page(*cur_out),
+       kaddr = kmap(cur_page);
+       write_compress_length(kaddr + offset_in_page(*cur_out),
                              compressed_size);
        *cur_out += LZO_LEN;
 
@@ -158,6 +160,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
                u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
                                     orig_out + compressed_size - *cur_out);
 
+               kunmap(cur_page);
                cur_page = out_pages[*cur_out / PAGE_SIZE];
                /* Allocate a new page */
                if (!cur_page) {
@@ -166,8 +169,9 @@ static int copy_compressed_data_to_page(char *compressed_data,
                                return -ENOMEM;
                        out_pages[*cur_out / PAGE_SIZE] = cur_page;
                }
+               kaddr = kmap(cur_page);
 
-               memcpy(page_address(cur_page) + offset_in_page(*cur_out),
+               memcpy(kaddr + offset_in_page(*cur_out),
                       compressed_data + *cur_out - orig_out, copy_len);
 
                *cur_out += copy_len;
@@ -179,12 +183,15 @@ static int copy_compressed_data_to_page(char *compressed_data,
         */
        sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
        if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
-               return 0;
+               goto out;
 
        /* The remaining size is not enough, pad it with zeros */
-       memset(page_address(cur_page) + offset_in_page(*cur_out), 0,
+       memset(kaddr + offset_in_page(*cur_out), 0,
               sector_bytes_left);
        *cur_out += sector_bytes_left;
+
+out:
+       kunmap(cur_page);
        return 0;
 }
 
@@ -195,6 +202,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
        struct workspace *workspace = list_entry(ws, struct workspace, list);
        const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
        struct page *page_in = NULL;
+       char *sizes_ptr;
        int ret = 0;
        /* Points to the file offset of input data */
        u64 cur_in = start;
@@ -212,6 +220,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
         */
        cur_out += LZO_LEN;
        while (cur_in < start + len) {
+               char *data_in;
                const u32 sectorsize_mask = sectorsize - 1;
                u32 sector_off = (cur_in - start) & sectorsize_mask;
                u32 in_len;
@@ -226,10 +235,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
                /* Compress at most one sector of data each time */
                in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
                ASSERT(in_len);
-               ret = lzo1x_1_compress(page_address(page_in) +
+               data_in = kmap(page_in);
+               ret = lzo1x_1_compress(data_in +
                                       offset_in_page(cur_in), in_len,
                                       workspace->cbuf, &out_len,
                                       workspace->mem);
+               kunmap(page_in);
                if (ret < 0) {
                        pr_debug("BTRFS: lzo in loop returned %d\n", ret);
                        ret = -EIO;
@@ -260,7 +271,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
        }
 
        /* Store the size of all chunks of compressed data */
-       write_compress_length(page_address(pages[0]), cur_out);
+       sizes_ptr = kmap_local_page(pages[0]);
+       write_compress_length(sizes_ptr, cur_out);
+       kunmap_local(sizes_ptr);
 
        ret = 0;
        *total_out = cur_out;
@@ -281,6 +294,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
        u32 orig_in = *cur_in;
 
        while (*cur_in < orig_in + len) {
+               char *kaddr;
                struct page *cur_page;
                u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
                                          orig_in + len - *cur_in);
@@ -288,9 +302,11 @@ static void copy_compressed_segment(struct compressed_bio *cb,
                ASSERT(copy_len);
                cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
 
+               kaddr = kmap(cur_page);
                memcpy(dest + *cur_in - orig_in,
-                       page_address(cur_page) + offset_in_page(*cur_in),
+                       kaddr + offset_in_page(*cur_in),
                        copy_len);
+               kunmap(cur_page);
 
                *cur_in += copy_len;
        }
@@ -301,6 +317,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        struct workspace *workspace = list_entry(ws, struct workspace, list);
        const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
        const u32 sectorsize = fs_info->sectorsize;
+       char *kaddr;
        int ret;
        /* Compressed data length, can be unaligned */
        u32 len_in;
@@ -309,7 +326,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        /* Bytes decompressed so far */
        u32 cur_out = 0;
 
-       len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+       kaddr = kmap(cb->compressed_pages[0]);
+       len_in = read_compress_length(kaddr);
+       kunmap(cb->compressed_pages[0]);
        cur_in += LZO_LEN;
 
        /*
@@ -343,8 +362,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                       (cur_in + LZO_LEN - 1) / sectorsize);
                cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
                ASSERT(cur_page);
-               seg_len = read_compress_length(page_address(cur_page) +
-                                              offset_in_page(cur_in));
+               kaddr = kmap(cur_page);
+               seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+               kunmap(cur_page);
                cur_in += LZO_LEN;
 
                /* Copy the compressed segment payload into workspace */
@@ -429,7 +449,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
        destlen = min_t(unsigned long, destlen, PAGE_SIZE);
        bytes = min_t(unsigned long, destlen, out_len - start_byte);
 
-       kaddr = page_address(dest_page);
+       kaddr = kmap_local_page(dest_page);
        memcpy(kaddr, workspace->buf + start_byte, bytes);
 
        /*
@@ -439,6 +459,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
         */
        if (bytes < destlen)
                memset(kaddr+bytes, 0, destlen-bytes);
+       kunmap_local(kaddr);
 out:
        return ret;
 }
index 546bf11..61ac57b 100644 (file)
@@ -509,7 +509,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
        }
 
        if (flush)
-               filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+               sync_blockdev(*bdev);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
@@ -1293,7 +1293,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
        pgoff_t index;
 
        /* make sure our super fits in the device */
-       if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+       if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
                return ERR_PTR(-EINVAL);
 
        /* make sure our super fits in the page */
@@ -2657,8 +2657,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
        device->io_width = fs_info->sectorsize;
        device->io_align = fs_info->sectorsize;
        device->sector_size = fs_info->sectorsize;
-       device->total_bytes = round_down(i_size_read(bdev->bd_inode),
-                                        fs_info->sectorsize);
+       device->total_bytes =
+               round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
        device->disk_total_bytes = device->total_bytes;
        device->commit_total_bytes = device->total_bytes;
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -7313,7 +7313,7 @@ static int read_one_dev(struct extent_buffer *leaf,
 
        fill_device_from_item(leaf, dev_item, device);
        if (device->bdev) {
-               u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+               u64 max_total_bytes = bdev_nr_bytes(device->bdev);
 
                if (device->total_bytes > max_total_bytes) {
                        btrfs_err(fs_info,
index 8afa900..767a0c6 100644 (file)
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                ret = -ENOMEM;
                goto out;
        }
-       cpage_out = page_address(out_page);
+       cpage_out = kmap(out_page);
        pages[0] = out_page;
        nr_pages = 1;
 
@@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                                int i;
 
                                for (i = 0; i < in_buf_pages; i++) {
-                                       if (in_page)
+                                       if (in_page) {
+                                               kunmap(in_page);
                                                put_page(in_page);
+                                       }
                                        in_page = find_get_page(mapping,
                                                                start >> PAGE_SHIFT);
-                                       data_in = page_address(in_page);
+                                       data_in = kmap(in_page);
                                        memcpy(workspace->buf + i * PAGE_SIZE,
                                               data_in, PAGE_SIZE);
                                        start += PAGE_SIZE;
                                }
                                workspace->strm.next_in = workspace->buf;
                        } else {
-                               if (in_page)
+                               if (in_page) {
+                                       kunmap(in_page);
                                        put_page(in_page);
+                               }
                                in_page = find_get_page(mapping,
                                                        start >> PAGE_SHIFT);
-                               data_in = page_address(in_page);
+                               data_in = kmap(in_page);
                                start += PAGE_SIZE;
                                workspace->strm.next_in = data_in;
                        }
@@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                 * the stream end if required
                 */
                if (workspace->strm.avail_out == 0) {
+                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
@@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                                ret = -ENOMEM;
                                goto out;
                        }
-                       cpage_out = page_address(out_page);
+                       cpage_out = kmap(out_page);
                        pages[nr_pages] = out_page;
                        nr_pages++;
                        workspace->strm.avail_out = PAGE_SIZE;
@@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                        goto out;
                } else if (workspace->strm.avail_out == 0) {
                        /* get another page for the stream end */
+                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
@@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                                ret = -ENOMEM;
                                goto out;
                        }
-                       cpage_out = page_address(out_page);
+                       cpage_out = kmap(out_page);
                        pages[nr_pages] = out_page;
                        nr_pages++;
                        workspace->strm.avail_out = PAGE_SIZE;
@@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
        *total_in = workspace->strm.total_in;
 out:
        *out_pages = nr_pages;
-       if (in_page)
+       if (out_page)
+               kunmap(out_page);
+
+       if (in_page) {
+               kunmap(in_page);
                put_page(in_page);
+       }
        return ret;
 }
 
@@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        unsigned long buf_start;
        struct page **pages_in = cb->compressed_pages;
 
-       data_in = page_address(pages_in[page_in_index]);
+       data_in = kmap(pages_in[page_in_index]);
        workspace->strm.next_in = data_in;
        workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
        workspace->strm.total_in = 0;
@@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
        if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
                pr_warn("BTRFS: inflateInit failed\n");
+               kunmap(pages_in[page_in_index]);
                return -EIO;
        }
        while (workspace->strm.total_in < srclen) {
@@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
                if (workspace->strm.avail_in == 0) {
                        unsigned long tmp;
-
+                       kunmap(pages_in[page_in_index]);
                        page_in_index++;
                        if (page_in_index >= total_pages_in) {
                                data_in = NULL;
                                break;
                        }
-                       data_in = page_address(pages_in[page_in_index]);
+                       data_in = kmap(pages_in[page_in_index]);
                        workspace->strm.next_in = data_in;
                        tmp = srclen - workspace->strm.total_in;
                        workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                ret = 0;
 done:
        zlib_inflateEnd(&workspace->strm);
+       if (data_in)
+               kunmap(pages_in[page_in_index]);
        if (!ret)
                zero_fill_bio(cb->orig_bio);
        return ret;
index 56dce9f..f06b680 100644 (file)
@@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
        /* map in the first page of input data */
        in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-       workspace->in_buf.src = page_address(in_page);
+       workspace->in_buf.src = kmap(in_page);
        workspace->in_buf.pos = 0;
        workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
 
@@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                goto out;
        }
        pages[nr_pages++] = out_page;
-       workspace->out_buf.dst = page_address(out_page);
+       workspace->out_buf.dst = kmap(out_page);
        workspace->out_buf.pos = 0;
        workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
 
@@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                if (workspace->out_buf.pos == workspace->out_buf.size) {
                        tot_out += PAGE_SIZE;
                        max_out -= PAGE_SIZE;
+                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
@@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                                goto out;
                        }
                        pages[nr_pages++] = out_page;
-                       workspace->out_buf.dst = page_address(out_page);
+                       workspace->out_buf.dst = kmap(out_page);
                        workspace->out_buf.pos = 0;
                        workspace->out_buf.size = min_t(size_t, max_out,
                                                        PAGE_SIZE);
@@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                /* Check if we need more input */
                if (workspace->in_buf.pos == workspace->in_buf.size) {
                        tot_in += PAGE_SIZE;
+                       kunmap(in_page);
                        put_page(in_page);
 
                        start += PAGE_SIZE;
                        len -= PAGE_SIZE;
                        in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-                       workspace->in_buf.src = page_address(in_page);
+                       workspace->in_buf.src = kmap(in_page);
                        workspace->in_buf.pos = 0;
                        workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
                }
@@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
                tot_out += PAGE_SIZE;
                max_out -= PAGE_SIZE;
+               kunmap(out_page);
                if (nr_pages == nr_dest_pages) {
                        out_page = NULL;
                        ret = -E2BIG;
@@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                        goto out;
                }
                pages[nr_pages++] = out_page;
-               workspace->out_buf.dst = page_address(out_page);
+               workspace->out_buf.dst = kmap(out_page);
                workspace->out_buf.pos = 0;
                workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
        }
@@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 out:
        *out_pages = nr_pages;
        /* Cleanup */
-       if (in_page)
+       if (in_page) {
+               kunmap(in_page);
                put_page(in_page);
+       }
+       if (out_page)
+               kunmap(out_page);
        return ret;
 }
 
@@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                goto done;
        }
 
-       workspace->in_buf.src = page_address(pages_in[page_in_index]);
+       workspace->in_buf.src = kmap(pages_in[page_in_index]);
        workspace->in_buf.pos = 0;
        workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
 
@@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                        break;
 
                if (workspace->in_buf.pos == workspace->in_buf.size) {
-                       page_in_index++;
+                       kunmap(pages_in[page_in_index++]);
                        if (page_in_index >= total_pages_in) {
                                workspace->in_buf.src = NULL;
                                ret = -EIO;
                                goto done;
                        }
                        srclen -= PAGE_SIZE;
-                       workspace->in_buf.src = page_address(pages_in[page_in_index]);
+                       workspace->in_buf.src = kmap(pages_in[page_in_index]);
                        workspace->in_buf.pos = 0;
                        workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
                }
@@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        ret = 0;
        zero_fill_bio(cb->orig_bio);
 done:
+       if (workspace->in_buf.src)
+               kunmap(pages_in[page_in_index]);
        return ret;
 }
 
index c615387..46bc589 100644 (file)
@@ -878,7 +878,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 {
        sector_t retval = ~((sector_t)0);
-       loff_t sz = i_size_read(bdev->bd_inode);
+       loff_t sz = bdev_nr_bytes(bdev);
 
        if (sz) {
                unsigned int sizebits = blksize_bits(size);
@@ -897,7 +897,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
        struct buffer_head *head = page_buffers(page);
        struct buffer_head *bh = head;
        int uptodate = PageUptodate(page);
-       sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
+       sector_t end_block = blkdev_max_block(bdev, size);
 
        do {
                if (!buffer_mapped(bh)) {
index fac2e8e..effe37e 100644 (file)
@@ -37,11 +37,11 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
 /*
  * Handle completion of a read from the cache.
  */
-static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_read_complete(struct kiocb *iocb, long ret)
 {
        struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
 
-       _enter("%ld,%ld", ret, ret2);
+       _enter("%ld", ret);
 
        if (ki->term_func) {
                if (ret >= 0)
@@ -139,7 +139,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
                fallthrough;
        default:
                ki->was_async = false;
-               cachefiles_read_complete(&ki->iocb, ret, 0);
+               cachefiles_read_complete(&ki->iocb, ret);
                if (ret > 0)
                        ret = 0;
                break;
@@ -159,12 +159,12 @@ presubmission_error:
 /*
  * Handle completion of a write to the cache.
  */
-static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_write_complete(struct kiocb *iocb, long ret)
 {
        struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
        struct inode *inode = file_inode(ki->iocb.ki_filp);
 
-       _enter("%ld,%ld", ret, ret2);
+       _enter("%ld", ret);
 
        /* Tell lockdep we inherited freeze protection from submission thread */
        __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
@@ -244,7 +244,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
                fallthrough;
        default:
                ki->was_async = false;
-               cachefiles_write_complete(&ki->iocb, ret, 0);
+               cachefiles_write_complete(&ki->iocb, ret);
                if (ret > 0)
                        ret = 0;
                break;
index 8ffc40e..fcf4f3b 100644 (file)
@@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
        struct cachefiles_object *object;
        struct fscache_retrieval *op = monitor->op;
        struct wait_page_key *key = _key;
-       struct page *page = wait->private;
+       struct folio *folio = wait->private;
 
        ASSERT(key);
 
        _enter("{%lu},%u,%d,{%p,%u}",
               monitor->netfs_page->index, mode, sync,
-              key->page, key->bit_nr);
+              key->folio, key->bit_nr);
 
-       if (key->page != page || key->bit_nr != PG_locked)
+       if (key->folio != folio || key->bit_nr != PG_locked)
                return 0;
 
-       _debug("--- monitor %p %lx ---", page, page->flags);
+       _debug("--- monitor %p %lx ---", folio, folio->flags);
 
-       if (!PageUptodate(page) && !PageError(page)) {
+       if (!folio_test_uptodate(folio) && !folio_test_error(folio)) {
                /* unlocked, not uptodate and not erronous? */
                _debug("page probably truncated");
        }
@@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
        put_page(backpage2);
 
        INIT_LIST_HEAD(&monitor->op_link);
-       add_page_wait_queue(backpage, &monitor->monitor);
+       folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
 
        if (trylock_page(backpage)) {
                ret = -EIO;
@@ -294,7 +294,7 @@ monitor_backing_page:
        get_page(backpage);
        monitor->back_page = backpage;
        monitor->monitor.private = backpage;
-       add_page_wait_queue(backpage, &monitor->monitor);
+       folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
        monitor = NULL;
 
        /* but the page may have been read before the monitor was installed, so
@@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
                get_page(backpage);
                monitor->back_page = backpage;
                monitor->monitor.private = backpage;
-               add_page_wait_queue(backpage, &monitor->monitor);
+               folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
                monitor = NULL;
 
                /* but the page may have been read before the monitor was
index e61018d..b129ea5 100644 (file)
@@ -1022,7 +1022,7 @@ static void ceph_aio_complete(struct inode *inode,
        ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
                                                CEPH_CAP_FILE_RD));
 
-       aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+       aio_req->iocb->ki_complete(aio_req->iocb, ret);
 
        ceph_free_cap_flush(aio_req->prealloc_cf);
        kfree(aio_req);
index bdeb271..d8c3106 100644 (file)
@@ -302,9 +302,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
-       /* No mandatory locks */
-       if (fl->fl_type & LOCK_MAND)
-               return -EOPNOTSUPP;
 
        dout("ceph_flock, fl_file: %p\n", fl->fl_file);
 
index 13f3182..1b855fc 100644 (file)
@@ -3184,7 +3184,7 @@ restart_loop:
        mutex_unlock(&ctx->aio_mutex);
 
        if (ctx->iocb && ctx->iocb->ki_complete)
-               ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+               ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
        else
                complete(&ctx->done);
 }
@@ -3917,7 +3917,7 @@ again:
        mutex_unlock(&ctx->aio_mutex);
 
        if (ctx->iocb && ctx->iocb->ki_complete)
-               ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+               ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
        else
                complete(&ctx->done);
 }
index 2be6526..666aa38 100644 (file)
@@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
                return read_buffers[i] + blk_offset;
        }
 
-       devsize = mapping->host->i_size >> PAGE_SHIFT;
+       devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
 
        /* Ok, read in BLKS_PER_BUF pages completely first. */
        for (i = 0; i < BLKS_PER_BUF; i++) {
index 68a2de6..bfc2a5b 100644 (file)
@@ -1,23 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * This contains encryption functions for per-file encryption.
+ * Utility functions for file contents encryption/decryption on
+ * block device-based filesystems.
  *
  * Copyright (C) 2015, Google, Inc.
  * Copyright (C) 2015, Motorola Mobility
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- *     Uday Savagaonkar, 2014
- * Encryption policy handling additions
- *     Ildar Muslukhov, 2014
- * Add fscrypt_pullback_bio_page()
- *     Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
  */
 
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 #include "fscrypt_private.h"
 
+/**
+ * fscrypt_decrypt_bio() - decrypt the contents of a bio
+ * @bio: the bio to decrypt
+ *
+ * Decrypt the contents of a "read" bio following successful completion of the
+ * underlying disk read.  The bio must be reading a whole number of blocks of an
+ * encrypted file directly into the page cache.  If the bio is reading the
+ * ciphertext into bounce pages instead of the page cache (for example, because
+ * the file is also compressed, so decompression is required after decryption),
+ * then this function isn't applicable.  This function may sleep, so it must be
+ * called from a workqueue rather than from the bio's bi_end_io callback.
+ *
+ * This function sets PG_error on any pages that contain any blocks that failed
+ * to be decrypted.  The filesystem must not mark such pages uptodate.
+ */
 void fscrypt_decrypt_bio(struct bio *bio)
 {
        struct bio_vec *bv;
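
(A minimal sketch of the calling pattern the kernel-doc above asks for; struct
my_read_done, my_read_workfn and the unlock step are hypothetical and not part
of fscrypt.)

	#include <linux/bio.h>
	#include <linux/fscrypt.h>
	#include <linux/workqueue.h>

	struct my_read_done {
		struct work_struct work;
		struct bio *bio;
	};

	/* scheduled from the bio's bi_end_io handler instead of decrypting there */
	static void my_read_workfn(struct work_struct *work)
	{
		struct my_read_done *d = container_of(work, struct my_read_done, work);

		fscrypt_decrypt_bio(d->bio);	/* may sleep; failed blocks get PG_error */
		/* ... unlock the pages, marking only the non-error ones uptodate ... */
	}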
index eb538c2..a9be4bc 100644 (file)
@@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
 
        if (fscrypt_has_encryption_key(dir)) {
                if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
-                                                 iname->len,
-                                                 dir->i_sb->s_cop->max_namelen,
+                                                 iname->len, NAME_MAX,
                                                  &fname->crypto_buf.len))
                        return -ENAMETOOLONG;
                fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
index 3fa965e..5b0a9e6 100644 (file)
 
 #define FSCRYPT_FILE_NONCE_SIZE        16
 
+/*
+ * Minimum size of an fscrypt master key.  Note: a longer key will be required
+ * if ciphers with a 256-bit security strength are used.  This is just the
+ * absolute minimum, which applies when only 128-bit encryption is used.
+ */
 #define FSCRYPT_MIN_KEY_SIZE   16
 
 #define FSCRYPT_CONTEXT_V1     1
@@ -413,7 +418,11 @@ struct fscrypt_master_key_secret {
         */
        struct fscrypt_hkdf     hkdf;
 
-       /* Size of the raw key in bytes.  Set even if ->raw isn't set. */
+       /*
+        * Size of the raw key in bytes.  This remains set even if ->raw was
+        * zeroized due to no longer being needed.  I.e. we still remember the
+        * size of the key even if we don't need to remember the key itself.
+        */
        u32                     size;
 
        /* For v1 policy keys: the raw key.  Wiped for v2 policy keys. */
@@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void);
 struct fscrypt_mode {
        const char *friendly_name;
        const char *cipher_str;
-       int keysize;
-       int ivsize;
+       int keysize;            /* key size in bytes */
+       int security_strength;  /* security strength in bytes */
+       int ivsize;             /* IV size in bytes */
        int logged_impl_name;
        enum blk_crypto_mode_num blk_crypto_mode;
 };
index e0ec210..7607d18 100644 (file)
 
 /*
  * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses
- * SHA-512 because it is reasonably secure and efficient; and since it produces
- * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of
- * entropy from the master key and requires only one iteration of HKDF-Expand.
+ * SHA-512 because it is well-established, secure, and reasonably efficient.
+ *
+ * HKDF-SHA256 was also considered, as its 256-bit security strength would be
+ * sufficient here.  A 512-bit security strength is "nice to have", though.
+ * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256.  In the
+ * common case of deriving an AES-256-XTS key (512 bits), that can result in
+ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
+ * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
  */
 #define HKDF_HMAC_ALG          "hmac(sha512)"
 #define HKDF_HASHLEN           SHA512_DIGEST_SIZE
index bca9c66..eede186 100644 (file)
@@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = {
                .friendly_name = "AES-256-XTS",
                .cipher_str = "xts(aes)",
                .keysize = 64,
+               .security_strength = 32,
                .ivsize = 16,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
        },
@@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = {
                .friendly_name = "AES-256-CTS-CBC",
                .cipher_str = "cts(cbc(aes))",
                .keysize = 32,
+               .security_strength = 32,
                .ivsize = 16,
        },
        [FSCRYPT_MODE_AES_128_CBC] = {
                .friendly_name = "AES-128-CBC-ESSIV",
                .cipher_str = "essiv(cbc(aes),sha256)",
                .keysize = 16,
+               .security_strength = 16,
                .ivsize = 16,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
        },
@@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = {
                .friendly_name = "AES-128-CTS-CBC",
                .cipher_str = "cts(cbc(aes))",
                .keysize = 16,
+               .security_strength = 16,
                .ivsize = 16,
        },
        [FSCRYPT_MODE_ADIANTUM] = {
                .friendly_name = "Adiantum",
                .cipher_str = "adiantum(xchacha12,aes)",
                .keysize = 32,
+               .security_strength = 32,
                .ivsize = 32,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
        },
@@ -117,8 +122,9 @@ err_free_tfm:
 
 /*
  * Prepare the crypto transform object or blk-crypto key in @prep_key, given the
- * raw key, encryption mode, and flag indicating which encryption implementation
- * (fs-layer or blk-crypto) will be used.
+ * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption
+ * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt),
+ * and IV generation method (@ci->ci_policy.flags).
  */
 int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
                        const u8 *raw_key, const struct fscrypt_info *ci)
@@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
 }
 
 /*
+ * Check whether the size of the given master key (@mk) is appropriate for the
+ * encryption settings which a particular file will use (@ci).
+ *
+ * If the file uses a v1 encryption policy, then the master key must be at least
+ * as long as the derived key, as this is a requirement of the v1 KDF.
+ *
+ * Otherwise, the KDF can accept any size key, so we enforce a slightly looser
+ * requirement: we require that the size of the master key be at least the
+ * maximum security strength of any algorithm whose key will be derived from it
+ * (but in practice we only need to consider @ci->ci_mode, since any other
+ * possible subkeys such as DIRHASH and INODE_HASH will never increase the
+ * required key size over @ci->ci_mode).  This allows AES-256-XTS keys to be
+ * derived from a 256-bit master key, which is cryptographically sufficient,
+ * rather than requiring a 512-bit master key which is unnecessarily long.  (We
+ * still allow 512-bit master keys if the user chooses to use them, though.)
+ */
+static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
+                                         const struct fscrypt_info *ci)
+{
+       unsigned int min_keysize;
+
+       if (ci->ci_policy.version == FSCRYPT_POLICY_V1)
+               min_keysize = ci->ci_mode->keysize;
+       else
+               min_keysize = ci->ci_mode->security_strength;
+
+       if (mk->mk_secret.size < min_keysize) {
+               fscrypt_warn(NULL,
+                            "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+                            master_key_spec_type(&mk->mk_spec),
+                            master_key_spec_len(&mk->mk_spec),
+                            (u8 *)&mk->mk_spec.u,
+                            mk->mk_secret.size, min_keysize);
+               return false;
+       }
+       return true;
+}
+
+/*
  * Find the master key, then set up the inode's actual encryption key.
  *
  * If the master key is found in the filesystem-level keyring, then the
@@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
                goto out_release_key;
        }
 
-       /*
-        * Require that the master key be at least as long as the derived key.
-        * Otherwise, the derived key cannot possibly contain as much entropy as
-        * that required by the encryption mode it will be used for.  For v1
-        * policies it's also required for the KDF to work at all.
-        */
-       if (mk->mk_secret.size < ci->ci_mode->keysize) {
-               fscrypt_warn(NULL,
-                            "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
-                            master_key_spec_type(&mk_spec),
-                            master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
-                            mk->mk_secret.size, ci->ci_mode->keysize);
+       if (!fscrypt_valid_master_key_size(mk, ci)) {
                err = -ENOKEY;
                goto out_release_key;
        }
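
Concretely, with the mode table above: AES-256-XTS has keysize 64 but
security_strength 32, so under a v2 policy a 32-byte master key is now
accepted, while a v1 policy still requires the full 64 bytes (and longer,
e.g. 512-bit, master keys remain valid either way).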
index b2e86e7..6544435 100644 (file)
@@ -119,7 +119,6 @@ struct dio {
        int flags;                      /* doesn't change */
        int op;
        int op_flags;
-       blk_qc_t bio_cookie;
        struct gendisk *bio_disk;
        struct inode *inode;
        loff_t i_size;                  /* i_size when submitted */
@@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 
                if (ret > 0 && dio->op == REQ_OP_WRITE)
                        ret = generic_write_sync(dio->iocb, ret);
-               dio->iocb->ki_complete(dio->iocb, ret, 0);
+               dio->iocb->ki_complete(dio->iocb, ret);
        }
 
        kmem_cache_free(dio_cache, dio);
@@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 
        dio->bio_disk = bio->bi_bdev->bd_disk;
 
-       if (sdio->submit_io) {
+       if (sdio->submit_io)
                sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
-               dio->bio_cookie = BLK_QC_T_NONE;
-       } else
-               dio->bio_cookie = submit_bio(bio);
+       else
+               submit_bio(bio);
 
        sdio->bio = NULL;
        sdio->boundary = 0;
@@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio)
                __set_current_state(TASK_UNINTERRUPTIBLE);
                dio->waiter = current;
                spin_unlock_irqrestore(&dio->bio_lock, flags);
-               if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
-                       blk_io_schedule();
+               blk_io_schedule();
                /* wake up sets us TASK_RUNNING */
                spin_lock_irqsave(&dio->bio_lock, flags);
                dio->waiter = NULL;
@@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
        } else {
                dio->op = REQ_OP_READ;
        }
-       if (iocb->ki_flags & IOCB_HIPRI)
-               dio->op_flags |= REQ_HIPRI;
 
        /*
         * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
index 14b7470..f57255a 100644 (file)
@@ -6,16 +6,22 @@ config EROFS_FS
        select FS_IOMAP
        select LIBCRC32C
        help
-         EROFS (Enhanced Read-Only File System) is a lightweight
-         read-only file system with modern designs (eg. page-sized
-         blocks, inline xattrs/data, etc.) for scenarios which need
-         high-performance read-only requirements, e.g. Android OS
-         for mobile phones and LIVECDs.
+         EROFS (Enhanced Read-Only File System) is a lightweight read-only
+         file system with modern designs (e.g. no buffer heads, inline
+         xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+         scenarios that need high-performance read-only solutions, e.g.
+         smartphones with Android OS, LiveCDs and high-density hosts with
+         numerous containers.
 
-         It also provides fixed-sized output compression support,
-         which improves storage density, keeps relatively higher
-         compression ratios, which is more useful to achieve high
-         performance for embedded devices with limited memory.
+         It also provides fixed-sized output compression support, which
+         improves storage density while keeping relatively high compression
+         ratios, and implements in-place decompression that temporarily
+         reuses the file page for compressed data, which helps guarantee
+         end-to-end runtime decompression performance under extreme
+         memory pressure at no extra cost.
+
+         See the documentation at <file:Documentation/filesystems/erofs.rst>
+         for more details.
 
          If unsure, say N.
 
@@ -76,3 +82,19 @@ config EROFS_FS_ZIP
          Enable fixed-sized output compression for EROFS.
 
          If you don't want to enable compression feature, say N.
+
+config EROFS_FS_ZIP_LZMA
+       bool "EROFS LZMA compressed data support"
+       depends on EROFS_FS_ZIP
+       select XZ_DEC
+       select XZ_DEC_MICROLZMA
+       help
+         Saying Y here includes support for reading EROFS file systems
+         containing LZMA compressed data, specifically called MicroLZMA. It
+         gives better compression ratios than the LZ4 algorithm, at the
+         expense of more CPU overhead.
+
+         LZMA support is an experimental feature for now, so most existing
+         file systems will remain readable without selecting this option.
+
+         If unsure, say N.
index 1f9aced..756fe2d 100644 (file)
@@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
index 3701c72..5794065 100644 (file)
@@ -8,11 +8,6 @@
 
 #include "internal.h"
 
-enum {
-       Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
-       Z_EROFS_COMPRESSION_RUNTIME_MAX
-};
-
 struct z_erofs_decompress_req {
        struct super_block *sb;
        struct page **in, **out;
@@ -25,6 +20,12 @@ struct z_erofs_decompress_req {
        bool inplace_io, partial_decoding;
 };
 
+struct z_erofs_decompressor {
+       int (*decompress)(struct z_erofs_decompress_req *rq,
+                         struct page **pagepool);
+       char *name;
+};
+
 /* some special page->private (unsigned long, see below) */
 #define Z_EROFS_SHORTLIVED_PAGE                (-1UL << 2)
 #define Z_EROFS_PREALLOCATED_PAGE      (-2UL << 2)
@@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page)
        return true;
 }
 
-static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
+static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
                                              struct page *page)
 {
        if (!z_erofs_is_shortlived_page(page))
@@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
                put_page(page);
        } else {
                /* follow the pcluster rule above. */
-               set_page_private(page, 0);
-               list_add(&page->lru, pagepool);
+               erofs_pagepool_add(pagepool, page);
        }
        return true;
 }
 
+#define MNGD_MAPPING(sbi)      ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+                                        struct page *page)
+{
+       return page->mapping == MNGD_MAPPING(sbi);
+}
+
 int z_erofs_decompress(struct z_erofs_decompress_req *rq,
-                      struct list_head *pagepool);
+                      struct page **pagepool);
 
+/* prototypes for specific algorithms */
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+                           struct page **pagepool);
 #endif
index 9db8297..808234d 100644 (file)
@@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode,
        erofs_off_t pos;
        int err = 0;
 
+       map->m_deviceid = 0;
        if (map->m_la >= inode->i_size) {
                /* leave out-of-bound access unmapped */
                map->m_flags = 0;
@@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode,
                map->m_flags = 0;
                break;
        default:
-               /* only one device is supported for now */
-               if (idx->device_id) {
-                       erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
-                                 le16_to_cpu(idx->device_id),
-                                 chunknr, vi->nid);
-                       err = -EFSCORRUPTED;
-                       goto out_unlock;
-               }
+               map->m_deviceid = le16_to_cpu(idx->device_id) &
+                       EROFS_SB(sb)->device_id_mask;
                map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
                map->m_flags = EROFS_MAP_MAPPED;
                break;
@@ -155,11 +150,55 @@ out:
        return err;
 }
 
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
+{
+       struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
+       struct erofs_device_info *dif;
+       int id;
+
+       /* primary device by default */
+       map->m_bdev = sb->s_bdev;
+       map->m_daxdev = EROFS_SB(sb)->dax_dev;
+
+       if (map->m_deviceid) {
+               down_read(&devs->rwsem);
+               dif = idr_find(&devs->tree, map->m_deviceid - 1);
+               if (!dif) {
+                       up_read(&devs->rwsem);
+                       return -ENODEV;
+               }
+               map->m_bdev = dif->bdev;
+               map->m_daxdev = dif->dax_dev;
+               up_read(&devs->rwsem);
+       } else if (devs->extra_devices) {
+               down_read(&devs->rwsem);
+               idr_for_each_entry(&devs->tree, dif, id) {
+                       erofs_off_t startoff, length;
+
+                       if (!dif->mapped_blkaddr)
+                               continue;
+                       startoff = blknr_to_addr(dif->mapped_blkaddr);
+                       length = blknr_to_addr(dif->blocks);
+
+                       if (map->m_pa >= startoff &&
+                           map->m_pa < startoff + length) {
+                               map->m_pa -= startoff;
+                               map->m_bdev = dif->bdev;
+                               map->m_daxdev = dif->dax_dev;
+                               break;
+                       }
+               }
+               up_read(&devs->rwsem);
+       }
+       return 0;
+}
+
 static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
 {
        int ret;
        struct erofs_map_blocks map;
+       struct erofs_map_dev mdev;
 
        map.m_la = offset;
        map.m_llen = length;
@@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
        if (ret < 0)
                return ret;
 
-       iomap->bdev = inode->i_sb->s_bdev;
-       iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
+       mdev = (struct erofs_map_dev) {
+               .m_deviceid = map.m_deviceid,
+               .m_pa = map.m_pa,
+       };
+       ret = erofs_map_dev(inode->i_sb, &mdev);
+       if (ret)
+               return ret;
+
+       iomap->bdev = mdev.m_bdev;
+       iomap->dax_dev = mdev.m_daxdev;
        iomap->offset = map.m_la;
        iomap->length = map.m_llen;
        iomap->flags = 0;
@@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 
                iomap->type = IOMAP_INLINE;
                ipage = erofs_get_meta_page(inode->i_sb,
-                                           erofs_blknr(map.m_pa));
+                                           erofs_blknr(mdev.m_pa));
                if (IS_ERR(ipage))
                        return PTR_ERR(ipage);
                iomap->inline_data = page_address(ipage) +
-                                       erofs_blkoff(map.m_pa);
+                                       erofs_blkoff(mdev.m_pa);
                iomap->private = ipage;
        } else {
                iomap->type = IOMAP_MAPPED;
-               iomap->addr = map.m_pa;
+               iomap->addr = mdev.m_pa;
        }
        return 0;
 }
index a5bc4b1..bf37fc7 100644 (file)
 #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize)  (((srcsize) >> 8) + 32)
 #endif
 
-struct z_erofs_decompressor {
-       /*
-        * if destpages have sparsed pages, fill them with bounce pages.
-        * it also check whether destpages indicate continuous physical memory.
-        */
-       int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
-                                struct list_head *pagepool);
-       int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
-       char *name;
-};
-
 int z_erofs_load_lz4_config(struct super_block *sb,
                            struct erofs_super_block *dsb,
                            struct z_erofs_lz4_cfgs *lz4, int size)
@@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb,
        return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
 }
 
-static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
-                                        struct list_head *pagepool)
+/*
+ * Fill all gaps with bounce pages if it's a sparse page list. Also check
+ * whether all physical pages are contiguous, as can happen for moderate CRs.
+ */
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
+                                       struct page **pagepool)
 {
        const unsigned int nr =
                PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -119,7 +112,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
        return kaddr ? 1 : 0;
 }
 
-static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq,
                        void *inpage, unsigned int *inputmargin, int *maptype,
                        bool support_0padding)
 {
@@ -189,7 +182,8 @@ docopy:
        return src;
 }
 
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
+                                     u8 *out)
 {
        unsigned int inputmargin;
        u8 *headpage, *src;
@@ -216,8 +210,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
        }
 
        rq->inputsize -= inputmargin;
-       src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
-                                       support_0padding);
+       src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin,
+                                           &maptype, support_0padding);
        if (IS_ERR(src))
                return PTR_ERR(src);
 
@@ -233,7 +227,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
                erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
                          ret, rq->inputsize, inputmargin, rq->outputsize);
 
-               WARN_ON(1);
                print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
                               16, 1, src + inputmargin, rq->inputsize, true);
                print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
@@ -242,6 +235,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
                if (ret >= 0)
                        memset(out + ret, 0, rq->outputsize - ret);
                ret = -EIO;
+       } else {
+               ret = 0;
        }
 
        if (maptype == 0) {
@@ -257,86 +252,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
        return ret;
 }
 
-static struct z_erofs_decompressor decompressors[] = {
-       [Z_EROFS_COMPRESSION_SHIFTED] = {
-               .name = "shifted"
-       },
-       [Z_EROFS_COMPRESSION_LZ4] = {
-               .prepare_destpages = z_erofs_lz4_prepare_destpages,
-               .decompress = z_erofs_lz4_decompress,
-               .name = "lz4"
-       },
-};
-
-static void copy_from_pcpubuf(struct page **out, const char *dst,
-                             unsigned short pageofs_out,
-                             unsigned int outputsize)
-{
-       const char *end = dst + outputsize;
-       const unsigned int righthalf = PAGE_SIZE - pageofs_out;
-       const char *cur = dst - pageofs_out;
-
-       while (cur < end) {
-               struct page *const page = *out++;
-
-               if (page) {
-                       char *buf = kmap_atomic(page);
-
-                       if (cur >= dst) {
-                               memcpy(buf, cur, min_t(uint, PAGE_SIZE,
-                                                      end - cur));
-                       } else {
-                               memcpy(buf + pageofs_out, cur + pageofs_out,
-                                      min_t(uint, righthalf, end - cur));
-                       }
-                       kunmap_atomic(buf);
-               }
-               cur += PAGE_SIZE;
-       }
-}
-
-static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
-                                     struct list_head *pagepool)
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+                                 struct page **pagepool)
 {
        const unsigned int nrpages_out =
                PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
-       const struct z_erofs_decompressor *alg = decompressors + rq->alg;
        unsigned int dst_maptype;
        void *dst;
        int ret;
 
-       /* two optimized fast paths only for non bigpcluster cases yet */
-       if (rq->inputsize <= PAGE_SIZE) {
-               if (nrpages_out == 1 && !rq->inplace_io) {
-                       DBG_BUGON(!*rq->out);
-                       dst = kmap_atomic(*rq->out);
-                       dst_maptype = 0;
-                       goto dstmap_out;
-               }
-
-               /*
-                * For the case of small output size (especially much less
-                * than PAGE_SIZE), memcpy the decompressed data rather than
-                * compressed data is preferred.
-                */
-               if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
-                       dst = erofs_get_pcpubuf(1);
-                       if (IS_ERR(dst))
-                               return PTR_ERR(dst);
-
-                       rq->inplace_io = false;
-                       ret = alg->decompress(rq, dst);
-                       if (!ret)
-                               copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
-                                                 rq->outputsize);
-
-                       erofs_put_pcpubuf(dst);
-                       return ret;
-               }
+       /* one optimized fast path only for non bigpcluster cases yet */
+       if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) {
+               DBG_BUGON(!*rq->out);
+               dst = kmap_atomic(*rq->out);
+               dst_maptype = 0;
+               goto dstmap_out;
        }
 
        /* general decoding path which can be used for all cases */
-       ret = alg->prepare_destpages(rq, pagepool);
+       ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
        if (ret < 0)
                return ret;
        if (ret) {
@@ -351,7 +285,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
        dst_maptype = 2;
 
 dstmap_out:
-       ret = alg->decompress(rq, dst + rq->pageofs_out);
+       ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out);
 
        if (!dst_maptype)
                kunmap_atomic(dst);
@@ -360,8 +294,8 @@ dstmap_out:
        return ret;
 }
 
-static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
-                                    struct list_head *pagepool)
+static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
+                                    struct page **pagepool)
 {
        const unsigned int nrpages_out =
                PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -399,10 +333,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
        return 0;
 }
 
+static struct z_erofs_decompressor decompressors[] = {
+       [Z_EROFS_COMPRESSION_SHIFTED] = {
+               .decompress = z_erofs_shifted_transform,
+               .name = "shifted"
+       },
+       [Z_EROFS_COMPRESSION_LZ4] = {
+               .decompress = z_erofs_lz4_decompress,
+               .name = "lz4"
+       },
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+       [Z_EROFS_COMPRESSION_LZMA] = {
+               .decompress = z_erofs_lzma_decompress,
+               .name = "lzma"
+       },
+#endif
+};
+
 int z_erofs_decompress(struct z_erofs_decompress_req *rq,
-                      struct list_head *pagepool)
+                      struct page **pagepool)
 {
-       if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
-               return z_erofs_shifted_transform(rq, pagepool);
-       return z_erofs_decompress_generic(rq, pagepool);
+       return decompressors[rq->alg].decompress(rq, pagepool);
 }
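
With the prepare_destpages split removed, z_erofs_decompress() now dispatches purely through the decompressors[] table indexed by the on-disk algorithm id. A standalone userspace illustration of that table-dispatch pattern follows (hypothetical names, not kernel code and not part of this merge):

#include <stdio.h>

/* Standalone illustration of the table-dispatch pattern used above. */
enum { ALG_SHIFTED, ALG_LZ4, ALG_MAX };

struct decompressor {
        int (*decompress)(const char *in, char *out);
        const char *name;
};

static int shifted_copy(const char *in, char *out)
{
        /* "shifted" data is stored uncompressed; just copy it */
        while ((*out++ = *in++))
                ;
        return 0;
}

static int lz4_decode(const char *in, char *out)
{
        /* placeholder for a real LZ4 decoder */
        return shifted_copy(in, out);
}

static const struct decompressor decompressors[] = {
        [ALG_SHIFTED] = { .decompress = shifted_copy, .name = "shifted" },
        [ALG_LZ4]     = { .decompress = lz4_decode,   .name = "lz4" },
};

int main(void)
{
        char out[32];
        int alg = ALG_SHIFTED;

        /* dispatch purely by algorithm id, mirroring z_erofs_decompress() */
        decompressors[alg].decompress("hello", out);
        printf("%s via %s\n", out, decompressors[alg].name);
        return 0;
}
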
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
new file mode 100644 (file)
index 0000000..5004551
--- /dev/null
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/xz.h>
+#include <linux/module.h>
+#include "compress.h"
+
+struct z_erofs_lzma {
+       struct z_erofs_lzma *next;
+       struct xz_dec_microlzma *state;
+       struct xz_buf buf;
+       u8 bounce[PAGE_SIZE];
+};
+
+/* given LZMA's performance, there is no need for a lockless list for now */
+static DEFINE_SPINLOCK(z_erofs_lzma_lock);
+static unsigned int z_erofs_lzma_max_dictsize;
+static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
+static struct z_erofs_lzma *z_erofs_lzma_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
+
+module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
+
+void z_erofs_lzma_exit(void)
+{
+       /* there should be no running fs instance */
+       while (z_erofs_lzma_avail_strms) {
+               struct z_erofs_lzma *strm;
+
+               spin_lock(&z_erofs_lzma_lock);
+               strm = z_erofs_lzma_head;
+               if (!strm) {
+                       spin_unlock(&z_erofs_lzma_lock);
+                       DBG_BUGON(1);
+                       return;
+               }
+               z_erofs_lzma_head = NULL;
+               spin_unlock(&z_erofs_lzma_lock);
+
+               while (strm) {
+                       struct z_erofs_lzma *n = strm->next;
+
+                       if (strm->state)
+                               xz_dec_microlzma_end(strm->state);
+                       kfree(strm);
+                       --z_erofs_lzma_avail_strms;
+                       strm = n;
+               }
+       }
+}
+
+int z_erofs_lzma_init(void)
+{
+       unsigned int i;
+
+       /* by default, use # of possible CPUs instead */
+       if (!z_erofs_lzma_nstrms)
+               z_erofs_lzma_nstrms = num_possible_cpus();
+
+       for (i = 0; i < z_erofs_lzma_nstrms; ++i) {
+               struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+
+               if (!strm) {
+                       z_erofs_lzma_exit();
+                       return -ENOMEM;
+               }
+               spin_lock(&z_erofs_lzma_lock);
+               strm->next = z_erofs_lzma_head;
+               z_erofs_lzma_head = strm;
+               spin_unlock(&z_erofs_lzma_lock);
+               ++z_erofs_lzma_avail_strms;
+       }
+       return 0;
+}
+
+int z_erofs_load_lzma_config(struct super_block *sb,
+                            struct erofs_super_block *dsb,
+                            struct z_erofs_lzma_cfgs *lzma, int size)
+{
+       static DEFINE_MUTEX(lzma_resize_mutex);
+       unsigned int dict_size, i;
+       struct z_erofs_lzma *strm, *head = NULL;
+       int err;
+
+       if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
+               erofs_err(sb, "invalid lzma cfgs, size=%u", size);
+               return -EINVAL;
+       }
+       if (lzma->format) {
+               erofs_err(sb, "unidentified lzma format %x, please check kernel version",
+                         le16_to_cpu(lzma->format));
+               return -EINVAL;
+       }
+       dict_size = le32_to_cpu(lzma->dict_size);
+       if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
+               erofs_err(sb, "unsupported lzma dictionary size %u",
+                         dict_size);
+               return -EINVAL;
+       }
+
+       erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
+
+       /* serialize concurrent z_erofs_load_lzma_config() calls to avoid deadlock */
+       mutex_lock(&lzma_resize_mutex);
+
+       if (z_erofs_lzma_max_dictsize >= dict_size) {
+               mutex_unlock(&lzma_resize_mutex);
+               return 0;
+       }
+
+       /* 1. collect/isolate all streams for the following check */
+       for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
+               struct z_erofs_lzma *last;
+
+again:
+               spin_lock(&z_erofs_lzma_lock);
+               strm = z_erofs_lzma_head;
+               if (!strm) {
+                       spin_unlock(&z_erofs_lzma_lock);
+                       wait_event(z_erofs_lzma_wq,
+                                  READ_ONCE(z_erofs_lzma_head));
+                       goto again;
+               }
+               z_erofs_lzma_head = NULL;
+               spin_unlock(&z_erofs_lzma_lock);
+
+               for (last = strm; last->next; last = last->next)
+                       ++i;
+               last->next = head;
+               head = strm;
+       }
+
+       err = 0;
+       /* 2. walk each isolated stream and grow max dict_size if needed */
+       for (strm = head; strm; strm = strm->next) {
+               if (strm->state)
+                       xz_dec_microlzma_end(strm->state);
+               strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
+               if (!strm->state)
+                       err = -ENOMEM;
+       }
+
+       /* 3. push back all to the global list and update max dict_size */
+       spin_lock(&z_erofs_lzma_lock);
+       DBG_BUGON(z_erofs_lzma_head);
+       z_erofs_lzma_head = head;
+       spin_unlock(&z_erofs_lzma_lock);
+
+       z_erofs_lzma_max_dictsize = dict_size;
+       mutex_unlock(&lzma_resize_mutex);
+       return err;
+}
+
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+                           struct page **pagepool)
+{
+       const unsigned int nrpages_out =
+               PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+       const unsigned int nrpages_in =
+               PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+       unsigned int inputmargin, inlen, outlen, pageofs;
+       struct z_erofs_lzma *strm;
+       u8 *kin;
+       bool bounced = false;
+       int no, ni, j, err = 0;
+
+       /* 1. get the exact LZMA compressed size */
+       kin = kmap(*rq->in);
+       inputmargin = 0;
+       while (!kin[inputmargin & ~PAGE_MASK])
+               if (!(++inputmargin & ~PAGE_MASK))
+                       break;
+
+       if (inputmargin >= PAGE_SIZE) {
+               kunmap(*rq->in);
+               return -EFSCORRUPTED;
+       }
+       rq->inputsize -= inputmargin;
+
+       /* 2. get an available lzma context */
+again:
+       spin_lock(&z_erofs_lzma_lock);
+       strm = z_erofs_lzma_head;
+       if (!strm) {
+               spin_unlock(&z_erofs_lzma_lock);
+               wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
+               goto again;
+       }
+       z_erofs_lzma_head = strm->next;
+       spin_unlock(&z_erofs_lzma_lock);
+
+       /* 3. multi-call decompress */
+       inlen = rq->inputsize;
+       outlen = rq->outputsize;
+       xz_dec_microlzma_reset(strm->state, inlen, outlen,
+                              !rq->partial_decoding);
+       pageofs = rq->pageofs_out;
+       strm->buf.in = kin + inputmargin;
+       strm->buf.in_pos = 0;
+       strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin);
+       inlen -= strm->buf.in_size;
+       strm->buf.out = NULL;
+       strm->buf.out_pos = 0;
+       strm->buf.out_size = 0;
+
+       for (ni = 0, no = -1;;) {
+               enum xz_ret xz_err;
+
+               if (strm->buf.out_pos == strm->buf.out_size) {
+                       if (strm->buf.out) {
+                               kunmap(rq->out[no]);
+                               strm->buf.out = NULL;
+                       }
+
+                       if (++no >= nrpages_out || !outlen) {
+                               erofs_err(rq->sb, "decompressed buf out of bound");
+                               err = -EFSCORRUPTED;
+                               break;
+                       }
+                       strm->buf.out_pos = 0;
+                       strm->buf.out_size = min_t(u32, outlen,
+                                                  PAGE_SIZE - pageofs);
+                       outlen -= strm->buf.out_size;
+                       if (rq->out[no])
+                               strm->buf.out = kmap(rq->out[no]) + pageofs;
+                       pageofs = 0;
+               } else if (strm->buf.in_pos == strm->buf.in_size) {
+                       kunmap(rq->in[ni]);
+
+                       if (++ni >= nrpages_in || !inlen) {
+                               erofs_err(rq->sb, "compressed buf out of bound");
+                               err = -EFSCORRUPTED;
+                               break;
+                       }
+                       strm->buf.in_pos = 0;
+                       strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
+                       inlen -= strm->buf.in_size;
+                       kin = kmap(rq->in[ni]);
+                       strm->buf.in = kin;
+                       bounced = false;
+               }
+
+               /*
+                * Handle overlapping: use the bounce buffer if the compressed
+                * data is currently being processed; otherwise, use short-lived
+                * pages from the on-stack pagepool when pages are shared within
+                * the same request.
+                */
+               if (!bounced && rq->out[no] == rq->in[ni]) {
+                       memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
+                       strm->buf.in = strm->bounce;
+                       bounced = true;
+               }
+               for (j = ni + 1; j < nrpages_in; ++j) {
+                       struct page *tmppage;
+
+                       if (rq->out[no] != rq->in[j])
+                               continue;
+
+                       DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
+                                                       rq->in[j]));
+                       tmppage = erofs_allocpage(pagepool,
+                                                 GFP_KERNEL | __GFP_NOFAIL);
+                       set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+                       copy_highpage(tmppage, rq->in[j]);
+                       rq->in[j] = tmppage;
+               }
+               xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
+               DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
+               DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+
+               if (xz_err != XZ_OK) {
+                       if (xz_err == XZ_STREAM_END && !outlen)
+                               break;
+                       erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+                                 xz_err, rq->inputsize, rq->outputsize);
+                       err = -EFSCORRUPTED;
+                       break;
+               }
+       }
+       if (no < nrpages_out && strm->buf.out)
+               kunmap(rq->out[no]);
+       if (ni < nrpages_in)
+               kunmap(rq->in[ni]);
+       /* 4. push back LZMA stream context to the global list */
+       spin_lock(&z_erofs_lzma_lock);
+       strm->next = z_erofs_lzma_head;
+       z_erofs_lzma_head = strm;
+       spin_unlock(&z_erofs_lzma_lock);
+       wake_up(&z_erofs_lzma_wq);
+       return err;
+}
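
The new decompressor is driven by the MicroLZMA multi-call interface from <linux/xz.h>: a stream is preallocated with xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size), rearmed per request with xz_dec_microlzma_reset(), and fed chunks through xz_dec_microlzma_run() until XZ_STREAM_END. Below is a minimal single-buffer sketch of the same call sequence; buffer names are hypothetical and error handling is trimmed, so treat it as an illustration rather than API documentation:

/* Minimal sketch of the MicroLZMA multi-call sequence used above.
 * comp/comp_len/out/out_len are hypothetical caller-provided buffers. */
static int microlzma_decode_once(const u8 *comp, size_t comp_len,
                                 u8 *out, size_t out_len)
{
        struct xz_dec_microlzma *s;
        struct xz_buf b = {
                .in = comp,  .in_pos = 0,  .in_size = comp_len,
                .out = out,  .out_pos = 0, .out_size = out_len,
        };
        enum xz_ret ret;

        s = xz_dec_microlzma_alloc(XZ_PREALLOC, 1 << 20);      /* 1 MiB dict */
        if (!s)
                return -ENOMEM;

        /* sizes are given up front; "true": the uncompressed size is exact */
        xz_dec_microlzma_reset(s, comp_len, out_len, true);
        ret = xz_dec_microlzma_run(s, &b);
        xz_dec_microlzma_end(s);

        return (ret == XZ_STREAM_END && b.out_pos == out_len) ? 0 : -EIO;
}
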
index b0b23f4..083997a 100644 (file)
 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS      0x00000002
 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER    0x00000002
 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE    0x00000004
+#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE    0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2     0x00000008
 #define EROFS_ALL_FEATURE_INCOMPAT             \
        (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
         EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
         EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
-        EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
+        EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+        EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
+        EROFS_FEATURE_INCOMPAT_COMPR_HEAD2)
 
 #define EROFS_SB_EXTSLOT_SIZE  16
 
+struct erofs_deviceslot {
+       union {
+               u8 uuid[16];            /* used for device manager later */
+               u8 userdata[64];        /* digest(sha256), etc. */
+       } u;
+       __le32 blocks;                  /* total fs blocks of this device */
+       __le32 mapped_blkaddr;          /* map starting at mapped_blkaddr */
+       u8 reserved[56];
+};
+#define EROFS_DEVT_SLOT_SIZE   sizeof(struct erofs_deviceslot)
+
 /* erofs on-disk super block (currently 128 bytes) */
 struct erofs_super_block {
        __le32 magic;           /* file system magic number */
@@ -54,7 +69,9 @@ struct erofs_super_block {
                /* customized sliding window size instead of 64k by default */
                __le16 lz4_max_distance;
        } __packed u1;
-       __u8 reserved2[42];
+       __le16 extra_devices;   /* # of devices besides the primary device */
+       __le16 devt_slotoff;    /* startoff = devt_slotoff * devt_slotsize */
+       __u8 reserved2[38];
 };
 
 /*
@@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
 /* 8-byte inode chunk indexes */
 struct erofs_inode_chunk_index {
        __le16 advise;          /* always 0, don't care for now */
-       __le16 device_id;       /* back-end storage id, always 0 for now */
+       __le16 device_id;       /* back-end storage id (with bits masked) */
        __le32 blkaddr;         /* start block address of this inode chunk */
 };
 
@@ -247,10 +264,11 @@ struct erofs_inode_chunk_index {
 
 /* available compression algorithm types (for h_algorithmtype) */
 enum {
-       Z_EROFS_COMPRESSION_LZ4 = 0,
+       Z_EROFS_COMPRESSION_LZ4         = 0,
+       Z_EROFS_COMPRESSION_LZMA        = 1,
        Z_EROFS_COMPRESSION_MAX
 };
-#define Z_EROFS_ALL_COMPR_ALGS         (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+#define Z_EROFS_ALL_COMPR_ALGS         ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
 
 /* 14 bytes (+ length field = 16 bytes) */
 struct z_erofs_lz4_cfgs {
@@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs {
        u8 reserved[10];
 } __packed;
 
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lzma_cfgs {
+       __le32 dict_size;
+       __le16 format;
+       u8 reserved[8];
+} __packed;
+
+#define Z_EROFS_LZMA_MAX_DICT_SIZE     (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+
 /*
  * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
  *  e.g. for 4k logical cluster size,      4B        if compacted 2B is off;
@@ -288,35 +315,34 @@ struct z_erofs_map_header {
 #define Z_EROFS_VLE_LEGACY_HEADER_PADDING       8
 
 /*
- * Fixed-sized output compression ondisk Logical Extent cluster type:
- *    0 - literal (uncompressed) cluster
- *    1 - compressed cluster (for the head logical cluster)
- *    2 - compressed cluster (for the other logical clusters)
+ * Fixed-sized output compression on-disk logical cluster type:
+ *    0   - literal (uncompressed) lcluster
+ *    1,3 - compressed lcluster (for HEAD lclusters)
+ *    2   - compressed lcluster (for NONHEAD lclusters)
  *
  * In detail,
- *    0 - literal (uncompressed) cluster,
+ *    0 - literal (uncompressed) lcluster,
  *        di_advise = 0
- *        di_clusterofs = the literal data offset of the cluster
- *        di_blkaddr = the blkaddr of the literal cluster
+ *        di_clusterofs = the literal data offset of the lcluster
+ *        di_blkaddr = the blkaddr of the literal pcluster
  *
- *    1 - compressed cluster (for the head logical cluster)
- *        di_advise = 1
- *        di_clusterofs = the decompressed data offset of the cluster
- *        di_blkaddr = the blkaddr of the compressed cluster
+ *    1,3 - compressed lcluster (for HEAD lclusters)
+ *        di_advise = 1 or 3
+ *        di_clusterofs = the decompressed data offset of the lcluster
+ *        di_blkaddr = the blkaddr of the compressed pcluster
  *
- *    2 - compressed cluster (for the other logical clusters)
+ *    2 - compressed lcluster (for NONHEAD lclusters)
  *        di_advise = 2
  *        di_clusterofs =
- *           the decompressed data offset in its own head cluster
- *        di_u.delta[0] = distance to its corresponding head cluster
- *        di_u.delta[1] = distance to its corresponding tail cluster
- *                (di_advise could be 0, 1 or 2)
+ *           the decompressed data offset in its own HEAD lcluster
+ *        di_u.delta[0] = distance to this HEAD lcluster
+ *        di_u.delta[1] = distance to the next HEAD lcluster
  */
 enum {
        Z_EROFS_VLE_CLUSTER_TYPE_PLAIN          = 0,
-       Z_EROFS_VLE_CLUSTER_TYPE_HEAD           = 1,
+       Z_EROFS_VLE_CLUSTER_TYPE_HEAD1          = 1,
        Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD        = 2,
-       Z_EROFS_VLE_CLUSTER_TYPE_RESERVED       = 3,
+       Z_EROFS_VLE_CLUSTER_TYPE_HEAD2          = 3,
        Z_EROFS_VLE_CLUSTER_TYPE_MAX
 };
 
@@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
        /* keep in sync between 2 index structures for better extendibility */
        BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
                     sizeof(struct z_erofs_vle_decompressed_index));
+       BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
 
        BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
                     Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
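
The chunk index's device_id is now interpreted through a per-superblock mask derived from the number of extra devices (see erofs_init_devices() and erofs_map_blocks() in this series), which leaves the upper bits of the on-disk field free. A small worked example with hypothetical values:

#include <stdio.h>
#include <stdint.h>

/* Worked example (hypothetical values) of the device id masking added in
 * this series: with 3 extra devices the mask covers ids 0..3. */
static uint32_t roundup_pow_of_two(uint32_t n)  /* stand-in for the kernel helper */
{
        uint32_t p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned int extra_devices = 3;                 /* from dsb->extra_devices */
        uint16_t device_id_mask = roundup_pow_of_two(extra_devices + 1) - 1;
        uint16_t ondisk_device_id = 0xf002;             /* hypothetical raw value */

        /* as in erofs_map_blocks(): only the masked bits select a device */
        printf("mask=%#x -> device %u\n", device_id_mask,
               ondisk_device_id & device_id_mask);      /* prints mask=0x3 -> device 2 */
        return 0;
}
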
index a552399..2345f1d 100644 (file)
@@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode,
        inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
 
        inode->i_flags &= ~S_DAX;
-       if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+       if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
            vi->datalayout == EROFS_INODE_FLAT_PLAIN)
                inode->i_flags |= S_DAX;
        if (!nblks)
index 9524e15..3265688 100644 (file)
@@ -47,7 +47,16 @@ typedef u64 erofs_off_t;
 /* data type for filesystem-wide blocks number */
 typedef u32 erofs_blk_t;
 
-struct erofs_fs_context {
+struct erofs_device_info {
+       char *path;
+       struct block_device *bdev;
+       struct dax_device *dax_dev;
+
+       u32 blocks;
+       u32 mapped_blkaddr;
+};
+
+struct erofs_mount_opts {
 #ifdef CONFIG_EROFS_FS_ZIP
        /* current strategy of how to use managed cache */
        unsigned char cache_strategy;
@@ -60,6 +69,18 @@ struct erofs_fs_context {
        unsigned int mount_opt;
 };
 
+struct erofs_dev_context {
+       struct idr tree;
+       struct rw_semaphore rwsem;
+
+       unsigned int extra_devices;
+};
+
+struct erofs_fs_context {
+       struct erofs_mount_opts opt;
+       struct erofs_dev_context *devs;
+};
+
 /* all filesystem-wide lz4 configurations */
 struct erofs_sb_lz4_info {
        /* # of pages needed for EROFS lz4 rolling decompression */
@@ -69,6 +90,7 @@ struct erofs_sb_lz4_info {
 };
 
 struct erofs_sb_info {
+       struct erofs_mount_opts opt;    /* options */
 #ifdef CONFIG_EROFS_FS_ZIP
        /* list for all registered superblocks, mainly for shrinker */
        struct list_head list;
@@ -85,12 +107,16 @@ struct erofs_sb_info {
 
        struct erofs_sb_lz4_info lz4;
 #endif /* CONFIG_EROFS_FS_ZIP */
+       struct erofs_dev_context *devs;
        struct dax_device *dax_dev;
-       u32 blocks;
+       u64 total_blocks;
+       u32 primarydevice_blocks;
+
        u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
        u32 xattr_blkaddr;
 #endif
+       u16 device_id_mask;     /* valid bits of device id to be used */
 
        /* inode slot unit size in bit shift */
        unsigned char islotbits;
@@ -108,8 +134,6 @@ struct erofs_sb_info {
        u8 volume_name[16];             /* volume name */
        u32 feature_compat;
        u32 feature_incompat;
-
-       struct erofs_fs_context ctx;    /* options */
 };
 
 #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -121,9 +145,9 @@ struct erofs_sb_info {
 #define EROFS_MOUNT_DAX_ALWAYS         0x00000040
 #define EROFS_MOUNT_DAX_NEVER          0x00000080
 
-#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
-#define set_opt(ctx, option)   ((ctx)->mount_opt |= EROFS_MOUNT_##option)
-#define test_opt(ctx, option)  ((ctx)->mount_opt & EROFS_MOUNT_##option)
+#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(opt, option)   ((opt)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(opt, option)  ((opt)->mount_opt & EROFS_MOUNT_##option)
 
 enum {
        EROFS_ZIP_CACHE_DISABLED,
@@ -237,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
 EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
 EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
 EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
 EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
 
 /* atomic flag definitions */
@@ -307,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
                              EROFS_I_DATALAYOUT_BITS);
 }
 
+/*
+ * Different from grab_cache_page_nowait(), reclaiming is never triggered
+ * when allocating new pages.
+ */
+static inline
+struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
+                                         pgoff_t index)
+{
+       return pagecache_get_page(mapping, index,
+                       FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+                       readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+}
+
 extern const struct super_operations erofs_sops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
@@ -338,7 +376,7 @@ extern const struct address_space_operations z_erofs_aops;
  * of the corresponding uncompressed data in the file.
  */
 enum {
-       BH_Zipped = BH_PrivateStart,
+       BH_Encoded = BH_PrivateStart,
        BH_FullMapped,
 };
 
@@ -346,8 +384,8 @@ enum {
 #define EROFS_MAP_MAPPED       (1 << BH_Mapped)
 /* Located in metadata (could be copied from bd_inode) */
 #define EROFS_MAP_META         (1 << BH_Meta)
-/* The extent has been compressed */
-#define EROFS_MAP_ZIPPED       (1 << BH_Zipped)
+/* The extent is encoded */
+#define EROFS_MAP_ENCODED      (1 << BH_Encoded)
 /* The length of extent is full */
 #define EROFS_MAP_FULL_MAPPED  (1 << BH_FullMapped)
 
@@ -355,6 +393,8 @@ struct erofs_map_blocks {
        erofs_off_t m_pa, m_la;
        u64 m_plen, m_llen;
 
+       unsigned short m_deviceid;
+       char m_algorithmformat;
        unsigned int m_flags;
 
        struct page *mpage;
@@ -367,6 +407,13 @@ struct erofs_map_blocks {
  * approach instead if possible since it's more metadata lightweight.)
  */
 #define EROFS_GET_BLOCKS_FIEMAP        0x0002
+/* Used to map the whole extent if non-negligible data is requested for LZMA */
+#define EROFS_GET_BLOCKS_READMORE      0x0004
+
+enum {
+       Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+       Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
 
 /* zmap.c */
 extern const struct iomap_ops z_erofs_iomap_report_ops;
@@ -386,9 +433,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
 }
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
+struct erofs_map_dev {
+       struct block_device *m_bdev;
+       struct dax_device *m_daxdev;
+
+       erofs_off_t m_pa;
+       unsigned int m_deviceid;
+};
+
 /* data.c */
 extern const struct file_operations erofs_file_fops;
 struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
 int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 u64 start, u64 len);
 
@@ -443,7 +499,14 @@ void erofs_pcpubuf_init(void);
 void erofs_pcpubuf_exit(void);
 
 /* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+static inline void erofs_pagepool_add(struct page **pagepool,
+               struct page *page)
+{
+       set_page_private(page, (unsigned long)*pagepool);
+       *pagepool = page;
+}
+void erofs_release_pages(struct page **pagepool);
 
 #ifdef CONFIG_EROFS_FS_ZIP
 int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -483,6 +546,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
 }
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+int z_erofs_lzma_init(void);
+void z_erofs_lzma_exit(void);
+int z_erofs_load_lzma_config(struct super_block *sb,
+                            struct erofs_super_block *dsb,
+                            struct z_erofs_lzma_cfgs *lzma, int size);
+#else
+static inline int z_erofs_lzma_init(void) { return 0; }
+static inline void z_erofs_lzma_exit(void) {}
+static inline int z_erofs_load_lzma_config(struct super_block *sb,
+                            struct erofs_super_block *dsb,
+                            struct z_erofs_lzma_cfgs *lzma, int size) {
+       if (lzma) {
+               erofs_err(sb, "lzma algorithm isn't enabled");
+               return -EINVAL;
+       }
+       return 0;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
+
 #define EFSCORRUPTED    EUCLEAN         /* Filesystem is corrupted */
 
 #endif /* __EROFS_INTERNAL_H */
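
Temporary pages are now chained through page_private() instead of being parked on a list_head via page->lru, so a pool is simply a struct page * head. A short kernel-style usage sketch of the new helpers (illustrative only, not code from this merge):

/* Sketch of how a caller drives the page-chained pool: pages borrowed
 * during decompression are parked on an on-stack head and released in
 * one go afterwards. */
static void pagepool_usage_sketch(void)
{
        struct page *pagepool = NULL;   /* an empty pool is just a NULL head */
        struct page *page;

        /* grab a page, preferring one already parked in the pool */
        page = erofs_allocpage(&pagepool, GFP_KERNEL);
        if (!page)
                return;

        /* ... use the page as a short-lived bounce buffer ... */

        /* park it for reuse; page_private() links it to the previous head */
        erofs_pagepool_add(&pagepool, page);

        /* free whatever is still chained once the whole request is done */
        erofs_release_pages(&pagepool);
}
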
index 6c88557..a2efd83 100644 (file)
@@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
 {
        static DEFINE_MUTEX(pcb_resize_mutex);
        static unsigned int pcb_nrpages;
-       LIST_HEAD(pagepool);
+       struct page *pagepool = NULL;
        int delta, cpu, ret, i;
 
        mutex_lock(&pcb_resize_mutex);
@@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
                        vunmap(old_ptr);
 free_pagearray:
                while (i)
-                       list_add(&oldpages[--i]->lru, &pagepool);
+                       erofs_pagepool_add(&pagepool, oldpages[--i]);
                kfree(oldpages);
                if (ret)
                        break;
        }
        pcb_nrpages = nrpages;
-       put_pages_list(&pagepool);
+       erofs_release_pages(&pagepool);
 out:
        mutex_unlock(&pcb_resize_mutex);
        return ret;
index 11b8855..6a969b1 100644 (file)
@@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
                case Z_EROFS_COMPRESSION_LZ4:
                        ret = z_erofs_load_lz4_config(sb, dsb, data, size);
                        break;
+               case Z_EROFS_COMPRESSION_LZMA:
+                       ret = z_erofs_load_lzma_config(sb, dsb, data, size);
+                       break;
                default:
                        DBG_BUGON(1);
                        ret = -EFAULT;
@@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
 }
 #endif
 
+static int erofs_init_devices(struct super_block *sb,
+                             struct erofs_super_block *dsb)
+{
+       struct erofs_sb_info *sbi = EROFS_SB(sb);
+       unsigned int ondisk_extradevs;
+       erofs_off_t pos;
+       struct page *page = NULL;
+       struct erofs_device_info *dif;
+       struct erofs_deviceslot *dis;
+       void *ptr;
+       int id, err = 0;
+
+       sbi->total_blocks = sbi->primarydevice_blocks;
+       if (!erofs_sb_has_device_table(sbi))
+               ondisk_extradevs = 0;
+       else
+               ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+
+       if (ondisk_extradevs != sbi->devs->extra_devices) {
+               erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
+                         ondisk_extradevs, sbi->devs->extra_devices);
+               return -EINVAL;
+       }
+       if (!ondisk_extradevs)
+               return 0;
+
+       sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
+       pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
+       down_read(&sbi->devs->rwsem);
+       idr_for_each_entry(&sbi->devs->tree, dif, id) {
+               erofs_blk_t blk = erofs_blknr(pos);
+               struct block_device *bdev;
+
+               if (!page || page->index != blk) {
+                       if (page) {
+                               kunmap(page);
+                               unlock_page(page);
+                               put_page(page);
+                       }
+
+                       page = erofs_get_meta_page(sb, blk);
+                       if (IS_ERR(page)) {
+                               up_read(&sbi->devs->rwsem);
+                               return PTR_ERR(page);
+                       }
+                       ptr = kmap(page);
+               }
+               dis = ptr + erofs_blkoff(pos);
+
+               bdev = blkdev_get_by_path(dif->path,
+                                         FMODE_READ | FMODE_EXCL,
+                                         sb->s_type);
+               if (IS_ERR(bdev)) {
+                       err = PTR_ERR(bdev);
+                       goto err_out;
+               }
+               dif->bdev = bdev;
+               dif->dax_dev = fs_dax_get_by_bdev(bdev);
+               dif->blocks = le32_to_cpu(dis->blocks);
+               dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+               sbi->total_blocks += dif->blocks;
+               pos += EROFS_DEVT_SLOT_SIZE;
+       }
+err_out:
+       up_read(&sbi->devs->rwsem);
+       if (page) {
+               kunmap(page);
+               unlock_page(page);
+               put_page(page);
+       }
+       return err;
+}
+
 static int erofs_read_superblock(struct super_block *sb)
 {
        struct erofs_sb_info *sbi;
@@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb)
                          sbi->sb_size);
                goto out;
        }
-       sbi->blocks = le32_to_cpu(dsb->blocks);
+       sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
        sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
 #ifdef CONFIG_EROFS_FS_XATTR
        sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb)
                ret = erofs_load_compr_cfgs(sb, dsb);
        else
                ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+       if (ret < 0)
+               goto out;
+
+       /* handle multiple devices */
+       ret = erofs_init_devices(sb, dsb);
 out:
        kunmap(page);
        put_page(page);
@@ -340,15 +421,15 @@ out:
 static void erofs_default_options(struct erofs_fs_context *ctx)
 {
 #ifdef CONFIG_EROFS_FS_ZIP
-       ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
-       ctx->max_sync_decompress_pages = 3;
-       ctx->readahead_sync_decompress = false;
+       ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+       ctx->opt.max_sync_decompress_pages = 3;
+       ctx->opt.readahead_sync_decompress = false;
 #endif
 #ifdef CONFIG_EROFS_FS_XATTR
-       set_opt(ctx, XATTR_USER);
+       set_opt(&ctx->opt, XATTR_USER);
 #endif
 #ifdef CONFIG_EROFS_FS_POSIX_ACL
-       set_opt(ctx, POSIX_ACL);
+       set_opt(&ctx->opt, POSIX_ACL);
 #endif
 }
 
@@ -358,6 +439,7 @@ enum {
        Opt_cache_strategy,
        Opt_dax,
        Opt_dax_enum,
+       Opt_device,
        Opt_err
 };
 
@@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
                     erofs_param_cache_strategy),
        fsparam_flag("dax",             Opt_dax),
        fsparam_enum("dax",             Opt_dax_enum, erofs_dax_param_enums),
+       fsparam_string("device",        Opt_device),
        {}
 };
 
@@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
        switch (mode) {
        case EROFS_MOUNT_DAX_ALWAYS:
                warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-               set_opt(ctx, DAX_ALWAYS);
-               clear_opt(ctx, DAX_NEVER);
+               set_opt(&ctx->opt, DAX_ALWAYS);
+               clear_opt(&ctx->opt, DAX_NEVER);
                return true;
        case EROFS_MOUNT_DAX_NEVER:
-               set_opt(ctx, DAX_NEVER);
-               clear_opt(ctx, DAX_ALWAYS);
+               set_opt(&ctx->opt, DAX_NEVER);
+               clear_opt(&ctx->opt, DAX_ALWAYS);
                return true;
        default:
                DBG_BUGON(1);
@@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
 static int erofs_fc_parse_param(struct fs_context *fc,
                                struct fs_parameter *param)
 {
-       struct erofs_fs_context *ctx __maybe_unused = fc->fs_private;
+       struct erofs_fs_context *ctx = fc->fs_private;
        struct fs_parse_result result;
-       int opt;
+       struct erofs_device_info *dif;
+       int opt, ret;
 
        opt = fs_parse(fc, erofs_fs_parameters, param, &result);
        if (opt < 0)
@@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
        case Opt_user_xattr:
 #ifdef CONFIG_EROFS_FS_XATTR
                if (result.boolean)
-                       set_opt(ctx, XATTR_USER);
+                       set_opt(&ctx->opt, XATTR_USER);
                else
-                       clear_opt(ctx, XATTR_USER);
+                       clear_opt(&ctx->opt, XATTR_USER);
 #else
                errorfc(fc, "{,no}user_xattr options not supported");
 #endif
@@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
        case Opt_acl:
 #ifdef CONFIG_EROFS_FS_POSIX_ACL
                if (result.boolean)
-                       set_opt(ctx, POSIX_ACL);
+                       set_opt(&ctx->opt, POSIX_ACL);
                else
-                       clear_opt(ctx, POSIX_ACL);
+                       clear_opt(&ctx->opt, POSIX_ACL);
 #else
                errorfc(fc, "{,no}acl options not supported");
 #endif
                break;
        case Opt_cache_strategy:
 #ifdef CONFIG_EROFS_FS_ZIP
-               ctx->cache_strategy = result.uint_32;
+               ctx->opt.cache_strategy = result.uint_32;
 #else
                errorfc(fc, "compression not supported, cache_strategy ignored");
 #endif
@@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
                if (!erofs_fc_set_dax_mode(fc, result.uint_32))
                        return -EINVAL;
                break;
+       case Opt_device:
+               dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+               if (!dif)
+                       return -ENOMEM;
+               dif->path = kstrdup(param->string, GFP_KERNEL);
+               if (!dif->path) {
+                       kfree(dif);
+                       return -ENOMEM;
+               }
+               down_write(&ctx->devs->rwsem);
+               ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
+               up_write(&ctx->devs->rwsem);
+               if (ret < 0) {
+                       kfree(dif->path);
+                       kfree(dif);
+                       return ret;
+               }
+               ++ctx->devs->extra_devices;
+               break;
        default:
                return -ENOPARAM;
        }
@@ -540,15 +643,19 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
                return -ENOMEM;
 
        sb->s_fs_info = sbi;
+       sbi->opt = ctx->opt;
        sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
+       sbi->devs = ctx->devs;
+       ctx->devs = NULL;
+
        err = erofs_read_superblock(sb);
        if (err)
                return err;
 
-       if (test_opt(ctx, DAX_ALWAYS) &&
+       if (test_opt(&sbi->opt, DAX_ALWAYS) &&
            !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
                errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
-               clear_opt(ctx, DAX_ALWAYS);
+               clear_opt(&sbi->opt, DAX_ALWAYS);
        }
        sb->s_flags |= SB_RDONLY | SB_NOATIME;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
        sb->s_op = &erofs_sops;
        sb->s_xattr = erofs_xattr_handlers;
 
-       if (test_opt(ctx, POSIX_ACL))
+       if (test_opt(&sbi->opt, POSIX_ACL))
                sb->s_flags |= SB_POSIXACL;
        else
                sb->s_flags &= ~SB_POSIXACL;
 
-       sbi->ctx = *ctx;
-
 #ifdef CONFIG_EROFS_FS_ZIP
        xa_init(&sbi->managed_pslots);
 #endif
@@ -607,20 +712,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
 
        DBG_BUGON(!sb_rdonly(sb));
 
-       if (test_opt(ctx, POSIX_ACL))
+       if (test_opt(&ctx->opt, POSIX_ACL))
                fc->sb_flags |= SB_POSIXACL;
        else
                fc->sb_flags &= ~SB_POSIXACL;
 
-       sbi->ctx = *ctx;
+       sbi->opt = ctx->opt;
 
        fc->sb_flags |= SB_RDONLY;
        return 0;
 }
 
+static int erofs_release_device_info(int id, void *ptr, void *data)
+{
+       struct erofs_device_info *dif = ptr;
+
+       fs_put_dax(dif->dax_dev);
+       if (dif->bdev)
+               blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+       kfree(dif->path);
+       kfree(dif);
+       return 0;
+}
+
+static void erofs_free_dev_context(struct erofs_dev_context *devs)
+{
+       if (!devs)
+               return;
+       idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+       idr_destroy(&devs->tree);
+       kfree(devs);
+}
+
 static void erofs_fc_free(struct fs_context *fc)
 {
-       kfree(fc->fs_private);
+       struct erofs_fs_context *ctx = fc->fs_private;
+
+       erofs_free_dev_context(ctx->devs);
+       kfree(ctx);
 }
 
 static const struct fs_context_operations erofs_context_ops = {
@@ -632,15 +761,21 @@ static const struct fs_context_operations erofs_context_ops = {
 
 static int erofs_init_fs_context(struct fs_context *fc)
 {
-       fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL);
-       if (!fc->fs_private)
-               return -ENOMEM;
+       struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 
-       /* set default mount options */
-       erofs_default_options(fc->fs_private);
+       if (!ctx)
+               return -ENOMEM;
+       ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+       if (!ctx->devs) {
+               kfree(ctx);
+               return -ENOMEM;
+       }
+       fc->fs_private = ctx;
 
+       idr_init(&ctx->devs->tree);
+       init_rwsem(&ctx->devs->rwsem);
+       erofs_default_options(ctx);
        fc->ops = &erofs_context_ops;
-
        return 0;
 }
 
@@ -659,6 +794,8 @@ static void erofs_kill_sb(struct super_block *sb)
        sbi = EROFS_SB(sb);
        if (!sbi)
                return;
+
+       erofs_free_dev_context(sbi->devs);
        fs_put_dax(sbi->dax_dev);
        kfree(sbi);
        sb->s_fs_info = NULL;
@@ -706,6 +843,10 @@ static int __init erofs_module_init(void)
        if (err)
                goto shrinker_err;
 
+       err = z_erofs_lzma_init();
+       if (err)
+               goto lzma_err;
+
        erofs_pcpubuf_init();
        err = z_erofs_init_zip_subsystem();
        if (err)
@@ -720,6 +861,8 @@ static int __init erofs_module_init(void)
 fs_err:
        z_erofs_exit_zip_subsystem();
 zip_err:
+       z_erofs_lzma_exit();
+lzma_err:
        erofs_exit_shrinker();
 shrinker_err:
        kmem_cache_destroy(erofs_inode_cachep);
@@ -730,11 +873,13 @@ icache_err:
 static void __exit erofs_module_exit(void)
 {
        unregister_filesystem(&erofs_fs_type);
-       z_erofs_exit_zip_subsystem();
-       erofs_exit_shrinker();
 
-       /* Ensure all RCU free inodes are safe before cache is destroyed. */
+       /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
        rcu_barrier();
+
+       z_erofs_exit_zip_subsystem();
+       z_erofs_lzma_exit();
+       erofs_exit_shrinker();
        kmem_cache_destroy(erofs_inode_cachep);
        erofs_pcpubuf_exit();
 }
@@ -748,7 +893,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
        buf->f_type = sb->s_magic;
        buf->f_bsize = EROFS_BLKSIZ;
-       buf->f_blocks = sbi->blocks;
+       buf->f_blocks = sbi->total_blocks;
        buf->f_bfree = buf->f_bavail = 0;
 
        buf->f_files = ULLONG_MAX;
@@ -763,31 +908,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 {
        struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
-       struct erofs_fs_context *ctx = &sbi->ctx;
+       struct erofs_mount_opts *opt = &sbi->opt;
 
 #ifdef CONFIG_EROFS_FS_XATTR
-       if (test_opt(ctx, XATTR_USER))
+       if (test_opt(opt, XATTR_USER))
                seq_puts(seq, ",user_xattr");
        else
                seq_puts(seq, ",nouser_xattr");
 #endif
 #ifdef CONFIG_EROFS_FS_POSIX_ACL
-       if (test_opt(ctx, POSIX_ACL))
+       if (test_opt(opt, POSIX_ACL))
                seq_puts(seq, ",acl");
        else
                seq_puts(seq, ",noacl");
 #endif
 #ifdef CONFIG_EROFS_FS_ZIP
-       if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
+       if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
                seq_puts(seq, ",cache_strategy=disabled");
-       else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
+       else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
                seq_puts(seq, ",cache_strategy=readahead");
-       else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
+       else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
                seq_puts(seq, ",cache_strategy=readaround");
 #endif
-       if (test_opt(ctx, DAX_ALWAYS))
+       if (test_opt(opt, DAX_ALWAYS))
                seq_puts(seq, ",dax=always");
-       if (test_opt(ctx, DAX_NEVER))
+       if (test_opt(opt, DAX_NEVER))
                seq_puts(seq, ",dax=never");
        return 0;
 }
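
A standalone model of the bit-flag option helpers used above, which now operate on the superblock's own copy of the options (sbi->opt) rather than a saved fs_context; the field and flag names below are stand-ins, not quoted kernel code.

#include <stdio.h>

struct mount_opts {
        unsigned int mount_opt;
};

#define OPT_POSIX_ACL   0x1u
#define OPT_DAX_ALWAYS  0x2u

#define test_opt(o, f)  ((o)->mount_opt & (f))
#define clear_opt(o, f) ((o)->mount_opt &= ~(f))

int main(void)
{
        struct mount_opts opt = { .mount_opt = OPT_POSIX_ACL | OPT_DAX_ALWAYS };

        /* mirror the fill_super logic above: drop DAX if the bdev cannot do it */
        if (test_opt(&opt, OPT_DAX_ALWAYS))
                clear_opt(&opt, OPT_DAX_ALWAYS);

        printf("acl=%d dax=%d\n",
               !!test_opt(&opt, OPT_POSIX_ACL),
               !!test_opt(&opt, OPT_DAX_ALWAYS));      /* acl=1 dax=0 */
        return 0;
}
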
index bd86067..84da2c2 100644
@@ -6,20 +6,29 @@
 #include "internal.h"
 #include <linux/pagevec.h>
 
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
 {
-       struct page *page;
+       struct page *page = *pagepool;
 
-       if (!list_empty(pool)) {
-               page = lru_to_page(pool);
+       if (page) {
                DBG_BUGON(page_ref_count(page) != 1);
-               list_del(&page->lru);
+               *pagepool = (struct page *)page_private(page);
        } else {
                page = alloc_page(gfp);
        }
        return page;
 }
 
+void erofs_release_pages(struct page **pagepool)
+{
+       while (*pagepool) {
+               struct page *page = *pagepool;
+
+               *pagepool = (struct page *)page_private(page);
+               put_page(page);
+       }
+}
+
 #ifdef CONFIG_EROFS_FS_ZIP
 /* global shrink count (for all mounted EROFS instances) */
 static atomic_long_t erofs_global_shrink_cnt;
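
erofs_allocpage() and erofs_release_pages() above chain free pages through page_private(): *pagepool points at the head page and each page's private word stores the next one. A standalone user-space model of that scheme, including a sketch of erofs_pagepool_add() whose body does not appear in this hunk (stand-in types, not the kernel's struct page API):

#include <stdio.h>
#include <stdlib.h>

struct fake_page {
        unsigned long private;  /* models page_private(page) */
        int refcount;
};

static void pagepool_add(struct fake_page **pagepool, struct fake_page *page)
{
        page->private = (unsigned long)*pagepool;       /* chain the old head */
        *pagepool = page;
}

static struct fake_page *pagepool_alloc(struct fake_page **pagepool)
{
        struct fake_page *page = *pagepool;

        if (page) {
                *pagepool = (struct fake_page *)page->private;  /* pop head */
                return page;
        }
        page = calloc(1, sizeof(*page));        /* models alloc_page(gfp) */
        if (page)
                page->refcount = 1;
        return page;
}

static void pagepool_release(struct fake_page **pagepool)
{
        while (*pagepool) {
                struct fake_page *page = *pagepool;

                *pagepool = (struct fake_page *)page->private;
                free(page);     /* models put_page() */
        }
}

int main(void)
{
        struct fake_page *pool = NULL;
        struct fake_page *a = pagepool_alloc(&pool);
        struct fake_page *b = pagepool_alloc(&pool);

        pagepool_add(&pool, a);         /* return pages to the pool */
        pagepool_add(&pool, b);
        printf("head of pool: %p\n", (void *)pool);     /* b is the new head */
        pagepool_release(&pool);
        return 0;
}
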
index 778f2c5..01c581e 100644
@@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
 
 static bool erofs_xattr_user_list(struct dentry *dentry)
 {
-       return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER);
+       return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
 }
 
 static bool erofs_xattr_trusted_list(struct dentry *dentry)
@@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
 
        switch (handler->flags) {
        case EROFS_XATTR_INDEX_USER:
-               if (!test_opt(&sbi->ctx, XATTR_USER))
+               if (!test_opt(&sbi->opt, XATTR_USER))
                        return -EOPNOTSUPP;
                break;
        case EROFS_XATTR_INDEX_TRUSTED:
index 11c7a1a..bcb1b91 100644
@@ -236,7 +236,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
 static void preload_compressed_pages(struct z_erofs_collector *clt,
                                     struct address_space *mc,
                                     enum z_erofs_cache_alloctype type,
-                                    struct list_head *pagepool)
+                                    struct page **pagepool)
 {
        struct z_erofs_pcluster *pcl = clt->pcl;
        bool standalone = true;
@@ -287,12 +287,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
                if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
                        continue;
 
-               if (page) {
+               if (page)
                        put_page(page);
-               } else if (newpage) {
-                       set_page_private(newpage, 0);
-                       list_add(&newpage->lru, pagepool);
-               }
+               else if (newpage)
+                       erofs_pagepool_add(pagepool, newpage);
        }
 
        /*
@@ -476,6 +474,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
        struct erofs_workgroup *grp;
        int err;
 
+       if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+               DBG_BUGON(1);
+               return -EFSCORRUPTED;
+       }
+
        /* no available pcluster, let's allocate one */
        pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
        if (IS_ERR(pcl))
@@ -483,16 +486,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
 
        atomic_set(&pcl->obj.refcount, 1);
        pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
+       pcl->algorithmformat = map->m_algorithmformat;
        pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
                (map->m_flags & EROFS_MAP_FULL_MAPPED ?
                        Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
 
-       if (map->m_flags & EROFS_MAP_ZIPPED)
-               pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
-       else
-               pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
-
        /* new pclusters should be claimed as type 1, primary and followed */
        pcl->next = clt->owned_head;
        clt->mode = COLLECT_PRIMARY_FOLLOWED;
@@ -643,7 +641,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
 }
 
 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
-                               struct page *page, struct list_head *pagepool)
+                               struct page *page, struct page **pagepool)
 {
        struct inode *const inode = fe->inode;
        struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -695,7 +693,7 @@ restart_now:
                goto err_out;
 
        /* preload all compressed pages (maybe downgrade role if necessary) */
-       if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
+       if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la))
                cache_strategy = TRYALLOC;
        else
                cache_strategy = DONTALLOC;
@@ -796,7 +794,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
        /* Use workqueue and sync decompression for atomic contexts only */
        if (in_atomic() || irqs_disabled()) {
                queue_work(z_erofs_workqueue, &io->u.work);
-               sbi->ctx.readahead_sync_decompress = true;
+               sbi->opt.readahead_sync_decompress = true;
                return;
        }
        z_erofs_decompressqueue_work(&io->u.work);
@@ -836,7 +834,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
 
 static int z_erofs_decompress_pcluster(struct super_block *sb,
                                       struct z_erofs_pcluster *pcl,
-                                      struct list_head *pagepool)
+                                      struct page **pagepool)
 {
        struct erofs_sb_info *const sbi = EROFS_SB(sb);
        struct z_erofs_pagevec_ctor ctor;
@@ -1036,7 +1034,7 @@ out:
 }
 
 static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
-                                    struct list_head *pagepool)
+                                    struct page **pagepool)
 {
        z_erofs_next_pcluster_t owned = io->head;
 
@@ -1060,18 +1058,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
 {
        struct z_erofs_decompressqueue *bgq =
                container_of(work, struct z_erofs_decompressqueue, u.work);
-       LIST_HEAD(pagepool);
+       struct page *pagepool = NULL;
 
        DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
        z_erofs_decompress_queue(bgq, &pagepool);
 
-       put_pages_list(&pagepool);
+       erofs_release_pages(&pagepool);
        kvfree(bgq);
 }
 
 static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
                                               unsigned int nr,
-                                              struct list_head *pagepool,
+                                              struct page **pagepool,
                                               struct address_space *mc,
                                               gfp_t gfp)
 {
@@ -1173,7 +1171,7 @@ repeat:
 out_allocpage:
        page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
        if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
-               list_add(&page->lru, pagepool);
+               erofs_pagepool_add(pagepool, page);
                cond_resched();
                goto repeat;
        }
@@ -1257,7 +1255,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
 
 static void z_erofs_submit_queue(struct super_block *sb,
                                 struct z_erofs_decompress_frontend *f,
-                                struct list_head *pagepool,
+                                struct page **pagepool,
                                 struct z_erofs_decompressqueue *fgq,
                                 bool *force_fg)
 {
@@ -1266,8 +1264,9 @@ static void z_erofs_submit_queue(struct super_block *sb,
        struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
        void *bi_private;
        z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
-       /* since bio will be NULL, no need to initialize last_index */
+       /* bio is NULL initially, so no need to initialize last_{index,bdev} */
        pgoff_t last_index;
+       struct block_device *last_bdev;
        unsigned int nr_bios = 0;
        struct bio *bio = NULL;
 
@@ -1279,6 +1278,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
        q[JQ_SUBMIT]->head = owned_head;
 
        do {
+               struct erofs_map_dev mdev;
                struct z_erofs_pcluster *pcl;
                pgoff_t cur, end;
                unsigned int i = 0;
@@ -1290,7 +1290,13 @@ static void z_erofs_submit_queue(struct super_block *sb,
 
                pcl = container_of(owned_head, struct z_erofs_pcluster, next);
 
-               cur = pcl->obj.index;
+               /* no device id here, thus it will always succeed */
+               mdev = (struct erofs_map_dev) {
+                       .m_pa = blknr_to_addr(pcl->obj.index),
+               };
+               (void)erofs_map_dev(sb, &mdev);
+
+               cur = erofs_blknr(mdev.m_pa);
                end = cur + pcl->pclusterpages;
 
                /* close the main owned chain at first */
@@ -1306,7 +1312,8 @@ static void z_erofs_submit_queue(struct super_block *sb,
                        if (!page)
                                continue;
 
-                       if (bio && cur != last_index + 1) {
+                       if (bio && (cur != last_index + 1 ||
+                                   last_bdev != mdev.m_bdev)) {
 submit_bio_retry:
                                submit_bio(bio);
                                bio = NULL;
@@ -1314,9 +1321,10 @@ submit_bio_retry:
 
                        if (!bio) {
                                bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
-
                                bio->bi_end_io = z_erofs_decompressqueue_endio;
-                               bio_set_dev(bio, sb->s_bdev);
+
+                               bio_set_dev(bio, mdev.m_bdev);
+                               last_bdev = mdev.m_bdev;
                                bio->bi_iter.bi_sector = (sector_t)cur <<
                                        LOG_SECTORS_PER_BLOCK;
                                bio->bi_private = bi_private;
@@ -1355,7 +1363,7 @@ submit_bio_retry:
 
 static void z_erofs_runqueue(struct super_block *sb,
                             struct z_erofs_decompress_frontend *f,
-                            struct list_head *pagepool, bool force_fg)
+                            struct page **pagepool, bool force_fg)
 {
        struct z_erofs_decompressqueue io[NR_JOBQUEUES];
 
@@ -1377,18 +1385,87 @@ static void z_erofs_runqueue(struct super_block *sb,
        z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
 }
 
+/*
+ * Since partial uptodate is still unimplemented for now, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+                                     struct readahead_control *rac,
+                                     erofs_off_t end,
+                                     struct page **pagepool,
+                                     bool backmost)
+{
+       struct inode *inode = f->inode;
+       struct erofs_map_blocks *map = &f->map;
+       erofs_off_t cur;
+       int err;
+
+       if (backmost) {
+               map->m_la = end;
+               err = z_erofs_map_blocks_iter(inode, map,
+                                             EROFS_GET_BLOCKS_READMORE);
+               if (err)
+                       return;
+
+               /* expand ra for the trailing edge if readahead */
+               if (rac) {
+                       loff_t newstart = readahead_pos(rac);
+
+                       cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+                       readahead_expand(rac, newstart, cur - newstart);
+                       return;
+               }
+               end = round_up(end, PAGE_SIZE);
+       } else {
+               end = round_up(map->m_la, PAGE_SIZE);
+
+               if (!map->m_llen)
+                       return;
+       }
+
+       cur = map->m_la + map->m_llen - 1;
+       while (cur >= end) {
+               pgoff_t index = cur >> PAGE_SHIFT;
+               struct page *page;
+
+               page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+               if (!page)
+                       goto skip;
+
+               if (PageUptodate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto skip;
+               }
+
+               err = z_erofs_do_read_page(f, page, pagepool);
+               if (err)
+                       erofs_err(inode->i_sb,
+                                 "readmore error at page %lu @ nid %llu",
+                                 index, EROFS_I(inode)->nid);
+               put_page(page);
+skip:
+               if (cur < PAGE_SIZE)
+                       break;
+               cur = (index << PAGE_SHIFT) - 1;
+       }
+}
+
 static int z_erofs_readpage(struct file *file, struct page *page)
 {
        struct inode *const inode = page->mapping->host;
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+       struct page *pagepool = NULL;
        int err;
-       LIST_HEAD(pagepool);
 
        trace_erofs_readpage(page, false);
-
        f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
 
+       z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
+                                 &pagepool, true);
        err = z_erofs_do_read_page(&f, page, &pagepool);
+       z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+
        (void)z_erofs_collector_end(&f.clt);
 
        /* if some compressed cluster ready, need submit them anyway */
@@ -1400,8 +1477,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
        if (f.map.mpage)
                put_page(f.map.mpage);
 
-       /* clean up the remaining free pages */
-       put_pages_list(&pagepool);
+       erofs_release_pages(&pagepool);
        return err;
 }
 
@@ -1409,29 +1485,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
 {
        struct inode *const inode = rac->mapping->host;
        struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
-       unsigned int nr_pages = readahead_count(rac);
-       bool sync = (sbi->ctx.readahead_sync_decompress &&
-                       nr_pages <= sbi->ctx.max_sync_decompress_pages);
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
-       struct page *page, *head = NULL;
-       LIST_HEAD(pagepool);
-
-       trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+       struct page *pagepool = NULL, *head = NULL, *page;
+       unsigned int nr_pages;
 
        f.readahead = true;
        f.headoffset = readahead_pos(rac);
 
-       while ((page = readahead_page(rac))) {
-               prefetchw(&page->flags);
-
-               /*
-                * A pure asynchronous readahead is indicated if
-                * a PG_readahead marked page is hitted at first.
-                * Let's also do asynchronous decompression for this case.
-                */
-               sync &= !(PageReadahead(page) && !head);
+       z_erofs_pcluster_readmore(&f, rac, f.headoffset +
+                                 readahead_length(rac) - 1, &pagepool, true);
+       nr_pages = readahead_count(rac);
+       trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
 
+       while ((page = readahead_page(rac))) {
                set_page_private(page, (unsigned long)head);
                head = page;
        }
@@ -1450,16 +1516,15 @@ static void z_erofs_readahead(struct readahead_control *rac)
                                  page->index, EROFS_I(inode)->nid);
                put_page(page);
        }
-
+       z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
        (void)z_erofs_collector_end(&f.clt);
 
-       z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
-
+       z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+                        sbi->opt.readahead_sync_decompress &&
+                        nr_pages <= sbi->opt.max_sync_decompress_pages);
        if (f.map.mpage)
                put_page(f.map.mpage);
-
-       /* clean up the remaining free pages */
-       put_pages_list(&pagepool);
+       erofs_release_pages(&pagepool);
 }
 
 const struct address_space_operations z_erofs_aops = {
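
z_erofs_pcluster_readmore() above walks backwards one page at a time from the last byte of the mapped extent toward a page-aligned stop offset, grabbing and decompress-queuing each page it finds. A standalone model of just that index arithmetic (no I/O, stand-in constants):

#include <stdio.h>

#define PAGE_SIZE  4096ULL
#define PAGE_SHIFT 12

static unsigned long long round_up_ull(unsigned long long x, unsigned long long a)
{
        return (x + a - 1) / a * a;
}

int main(void)
{
        unsigned long long m_la = 10000, m_llen = 20000;        /* mapped extent */
        unsigned long long end = round_up_ull(m_la, PAGE_SIZE); /* 12288 */
        unsigned long long cur = m_la + m_llen - 1;             /* 29999 */

        while (cur >= end) {
                unsigned long long index = cur >> PAGE_SHIFT;

                printf("touch page index %llu\n", index);       /* 7, 6, 5, 4, 3 */
                if (cur < PAGE_SIZE)
                        break;
                cur = (index << PAGE_SHIFT) - 1;        /* step one page back */
        }
        return 0;
}
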
index 3a008f1..879df53 100644
@@ -94,13 +94,6 @@ struct z_erofs_decompressqueue {
        } u;
 };
 
-#define MNGD_MAPPING(sbi)      ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
-                                        struct page *page)
-{
-       return page->mapping == MNGD_MAPPING(sbi);
-}
-
 #define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
 #define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
 #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
index 7a6df35..660489a 100644
@@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
 {
        struct erofs_inode *const vi = EROFS_I(inode);
        struct super_block *const sb = inode->i_sb;
-       int err;
+       int err, headnr;
        erofs_off_t pos;
        struct page *page;
        void *kaddr;
@@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
        vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
        vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
 
-       if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
-               erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
-                         vi->z_algorithmtype[0], vi->nid);
+       headnr = 0;
+       if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+           vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+               erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+                         headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
                err = -EOPNOTSUPP;
                goto unmap_done;
        }
@@ -111,7 +113,7 @@ struct z_erofs_maprecorder {
 
        unsigned long lcn;
        /* compression extent information gathered */
-       u8  type;
+       u8  type, headtype;
        u16 clusterofs;
        u16 delta[2];
        erofs_blk_t pblk, compressedlcs;
@@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
                m->clusterofs = 1 << vi->z_logical_clusterbits;
                m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
                if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
-                       if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+                       if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+                                       Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
                                DBG_BUGON(1);
                                return -EFSCORRUPTED;
                        }
@@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
                m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
                break;
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
-       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
                m->clusterofs = le16_to_cpu(di->di_clusterofs);
                m->pblk = le32_to_cpu(di->di_u.blkaddr);
                break;
@@ -446,9 +450,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
                }
                return z_erofs_extent_lookback(m, m->delta[0]);
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
-               map->m_flags &= ~EROFS_MAP_ZIPPED;
-               fallthrough;
-       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+               m->headtype = m->type;
                map->m_la = (lcn << lclusterbits) | m->clusterofs;
                break;
        default:
@@ -471,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
        int err;
 
        DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
-                 m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
-       if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
-           !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+                 m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
+                 m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
+       DBG_BUGON(m->type != m->headtype);
+
+       if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+           ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
+            !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
+           ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
+            !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
                map->m_plen = 1 << lclusterbits;
                return 0;
        }
-
        lcn = m->lcn + 1;
        if (m->compressedlcs)
                goto out;
@@ -499,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
 
        switch (m->type) {
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
-       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
                /*
                 * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
                 * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
@@ -554,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
                        DBG_BUGON(!m->delta[1] &&
                                  m->clusterofs != 1 << lclusterbits);
                } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
-                          m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
+                          m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
+                          m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
                        /* go on until the next HEAD lcluster */
                        if (lcn != headlcn)
                                break;
@@ -609,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode,
        if (err)
                goto unmap_out;
 
-       map->m_flags = EROFS_MAP_ZIPPED;        /* by default, compressed */
+       map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
        end = (m.lcn + 1ULL) << lclusterbits;
 
        switch (m.type) {
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
-               if (endoff >= m.clusterofs)
-                       map->m_flags &= ~EROFS_MAP_ZIPPED;
-               fallthrough;
-       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
                if (endoff >= m.clusterofs) {
+                       m.headtype = m.type;
                        map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
                        break;
                }
@@ -650,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode,
 
        map->m_llen = end - map->m_la;
        map->m_pa = blknr_to_addr(m.pblk);
-       map->m_flags |= EROFS_MAP_MAPPED;
 
        err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
        if (err)
                goto out;
 
-       if (flags & EROFS_GET_BLOCKS_FIEMAP) {
+       if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
+               map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+       else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+               map->m_algorithmformat = vi->z_algorithmtype[1];
+       else
+               map->m_algorithmformat = vi->z_algorithmtype[0];
+
+       if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+           ((flags & EROFS_GET_BLOCKS_READMORE) &&
+            map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
+            map->m_llen >= EROFS_BLKSIZ)) {
                err = z_erofs_get_extent_decompressedlen(&m);
                if (!err)
                        map->m_flags |= EROFS_MAP_FULL_MAPPED;
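
A standalone model of the HEAD1/HEAD2 algorithm selection introduced above: PLAIN lclusters decode as the "shifted" (uncompressed) format, HEAD2 lclusters use the inode's second per-inode algorithm slot, HEAD1 the first. The enum values below are stand-ins:

#include <stdio.h>

enum headtype { TYPE_PLAIN, TYPE_HEAD1, TYPE_HEAD2 };
enum algfmt   { ALG_SHIFTED, ALG_LZ4, ALG_LZMA };

static enum algfmt pick_algorithm(enum headtype t, const enum algfmt alg[2])
{
        if (t == TYPE_PLAIN)
                return ALG_SHIFTED;     /* stored uncompressed */
        return (t == TYPE_HEAD2) ? alg[1] : alg[0];
}

int main(void)
{
        const enum algfmt per_inode[2] = { ALG_LZ4, ALG_LZMA };

        printf("%d %d %d\n",
               pick_algorithm(TYPE_PLAIN, per_inode),   /* 0: shifted */
               pick_algorithm(TYPE_HEAD1, per_inode),   /* 1: lz4 */
               pick_algorithm(TYPE_HEAD2, per_inode));  /* 2: lzma */
        return 0;
}
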
index ca37d43..1c7aa1e 100644
@@ -604,7 +604,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
        exfat_save_attr(inode, info->attr);
 
        inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
-               ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+               ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits;
        inode->i_mtime = info->mtime;
        inode->i_ctime = info->mtime;
        ei->i_crtime = info->crtime;
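
The exfat hunk adds a (loff_t) cast so the round-up mask is built at 64-bit width; without it, the complement of the 32-bit cluster mask zero-extends and silently truncates sizes above 4 GiB. A standalone demonstration (loff_t modeled as long long):

#include <stdio.h>

int main(void)
{
        unsigned int cluster_size = 4096;
        long long size = 5LL << 30;             /* 5 GiB file */

        long long bad  = (size + (cluster_size - 1)) & ~(cluster_size - 1);
        long long good = (size + (cluster_size - 1)) &
                         ~((long long)cluster_size - 1);

        printf("without cast: %lld\n", bad);    /* 1073741824: high bits lost */
        printf("with cast:    %lld\n", good);   /* 5368709120: rounded up correctly */
        return 0;
}
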
index ac0e11b..9c5559f 100644
@@ -915,7 +915,7 @@ const struct file_operations ext4_file_operations = {
        .llseek         = ext4_llseek,
        .read_iter      = ext4_file_read_iter,
        .write_iter     = ext4_file_write_iter,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
index 88d5d27..79b6a0c 100644
@@ -1572,7 +1572,6 @@ static const struct fscrypt_operations ext4_cryptops = {
        .set_context            = ext4_set_context,
        .get_dummy_policy       = ext4_get_dummy_policy,
        .empty_dir              = ext4_empty_dir,
-       .max_namelen            = EXT4_NAME_LEN,
        .has_stable_inodes      = ext4_has_stable_inodes,
        .get_ino_and_lblk_bits  = ext4_get_ino_and_lblk_bits,
 };
@@ -4474,7 +4473,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto cantfind_ext4;
 
        /* check blocks count against device size */
-       blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+       blocks_count = sb_bdev_nr_blocks(sb);
        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
                       "exceeds size of device (%llu blocks)",
index c1bf9ad..20a083d 100644
@@ -7,6 +7,7 @@
 
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
+#include <linux/moduleparam.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/lzo.h>
index 78ebc30..cf049a0 100644
@@ -2976,7 +2976,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
        .set_context            = f2fs_set_context,
        .get_dummy_policy       = f2fs_get_dummy_policy,
        .empty_dir              = f2fs_empty_dir,
-       .max_namelen            = F2FS_NAME_LEN,
        .has_stable_inodes      = f2fs_has_stable_inodes,
        .get_ino_and_lblk_bits  = f2fs_get_ino_and_lblk_bits,
        .get_num_devices        = f2fs_get_num_devices,
index de0c9b0..a6f1c6d 100644
@@ -1536,14 +1536,11 @@ static int fat_read_static_bpb(struct super_block *sb,
        struct fat_bios_param_block *bpb)
 {
        static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
-
+       sector_t bd_sects = bdev_nr_sectors(sb->s_bdev);
        struct fat_floppy_defaults *fdefaults = NULL;
        int error = -EINVAL;
-       sector_t bd_sects;
        unsigned i;
 
-       bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
-
        /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
        if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
                if (!silent)
@@ -1943,10 +1940,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
                ret = writeback_inode(i1);
        if (!ret && i2)
                ret = writeback_inode(i2);
-       if (!ret) {
-               struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
-               ret = filemap_flush(mapping);
-       }
+       if (!ret)
+               ret = sync_blockdev_nowait(sb->s_bdev);
        return ret;
 }
 EXPORT_SYMBOL_GPL(fat_flush_inodes);
index 81ec192..4124a89 100644
@@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb,
                         * unplug, so get our IOs out the door before we
                         * give up the CPU.
                         */
-                       blk_flush_plug(current);
+                       if (current->plug)
+                               blk_flush_plug(current->plug, false);
                        cond_resched();
                }
 
@@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
         * If we are expecting writeback progress we must submit plugged IO.
         */
        if (blk_needs_flush_plug(current))
-               blk_schedule_flush_plug(current);
+               blk_flush_plug(current->plug, true);
 
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
index 11404f8..e6039f2 100644
@@ -687,7 +687,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
                        spin_unlock(&fi->lock);
                }
 
-               io->iocb->ki_complete(io->iocb, res, 0);
+               io->iocb->ki_complete(io->iocb, res);
        }
 
        kref_put(&io->refcnt, fuse_io_release);
index c559827..5436a68 100644
@@ -1338,8 +1338,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
-       if (fl->fl_type & LOCK_MAND)
-               return -EOPNOTSUPP;
 
        if (fl->fl_type == F_UNLCK) {
                do_unflock(file, fl);
@@ -1353,7 +1351,7 @@ const struct file_operations gfs2_file_fops = {
        .llseek         = gfs2_llseek,
        .read_iter      = gfs2_file_read_iter,
        .write_iter     = gfs2_file_write_iter,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .unlocked_ioctl = gfs2_ioctl,
        .compat_ioctl   = gfs2_compat_ioctl,
        .mmap           = gfs2_mmap,
@@ -1386,7 +1384,7 @@ const struct file_operations gfs2_file_fops_nolock = {
        .llseek         = gfs2_llseek,
        .read_iter      = gfs2_file_read_iter,
        .write_iter     = gfs2_file_write_iter,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .unlocked_ioctl = gfs2_ioctl,
        .compat_ioctl   = gfs2_compat_ioctl,
        .mmap           = gfs2_mmap,
index cdf0ede..5beb826 100644
@@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb,
 
        /* default values */
        *start = 0;
-       *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+       *size = bdev_nr_sectors(sb->s_bdev);
 
        if (HFS_SB(sb)->session >= 0) {
                struct cdrom_tocentry te;
index 0350dc7..51ae6f1 100644
@@ -131,7 +131,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 
        /* default values */
        *start = 0;
-       *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+       *size = bdev_nr_sectors(sb->s_bdev);
 
        if (HFSPLUS_SB(sb)->session >= 0) {
                struct cdrom_tocentry te;
index 3cd065c..cdd83d4 100644
@@ -23,22 +23,11 @@ struct pipe_inode_info;
 #ifdef CONFIG_BLOCK
 extern void __init bdev_cache_init(void);
 
-extern int __sync_blockdev(struct block_device *bdev, int wait);
-void iterate_bdevs(void (*)(struct block_device *, void *), void *);
 void emergency_thaw_bdev(struct super_block *sb);
 #else
 static inline void bdev_cache_init(void)
 {
 }
-
-static inline int __sync_blockdev(struct block_device *bdev, int wait)
-{
-       return 0;
-}
-static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
-               void *arg)
-{
-}
 static inline int emergency_thaw_bdev(struct super_block *sb)
 {
        return 0;
index 422a7ed..38b33ad 100644
@@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker);
 static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
                                        struct io_wqe_acct *acct,
                                        struct io_cb_cancel_data *match);
+static void create_worker_cb(struct callback_head *cb);
 
 static bool io_worker_get(struct io_worker *worker)
 {
@@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq)
                complete(&wq->worker_done);
 }
 
+static void io_worker_cancel_cb(struct io_worker *worker)
+{
+       struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+       struct io_wqe *wqe = worker->wqe;
+       struct io_wq *wq = wqe->wq;
+
+       atomic_dec(&acct->nr_running);
+       raw_spin_lock(&worker->wqe->lock);
+       acct->nr_workers--;
+       raw_spin_unlock(&worker->wqe->lock);
+       io_worker_ref_put(wq);
+       clear_bit_unlock(0, &worker->create_state);
+       io_worker_release(worker);
+}
+
+static bool io_task_worker_match(struct callback_head *cb, void *data)
+{
+       struct io_worker *worker;
+
+       if (cb->func != create_worker_cb)
+               return false;
+       worker = container_of(cb, struct io_worker, create_work);
+       return worker == data;
+}
+
 static void io_worker_exit(struct io_worker *worker)
 {
        struct io_wqe *wqe = worker->wqe;
+       struct io_wq *wq = wqe->wq;
 
-       if (refcount_dec_and_test(&worker->ref))
-               complete(&worker->ref_done);
+       while (1) {
+               struct callback_head *cb = task_work_cancel_match(wq->task,
+                                               io_task_worker_match, worker);
+
+               if (!cb)
+                       break;
+               io_worker_cancel_cb(worker);
+       }
+
+       io_worker_release(worker);
        wait_for_completion(&worker->ref_done);
 
        raw_spin_lock(&wqe->lock);
@@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker,
 
        init_task_work(&worker->create_work, func);
        worker->create_index = acct->index;
-       if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+       if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
+               clear_bit_unlock(0, &worker->create_state);
                return true;
+       }
        clear_bit_unlock(0, &worker->create_state);
 fail_release:
        io_worker_release(worker);
@@ -716,11 +753,8 @@ static void io_workqueue_create(struct work_struct *work)
        struct io_worker *worker = container_of(work, struct io_worker, work);
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 
-       if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
-               clear_bit_unlock(0, &worker->create_state);
-               io_worker_release(worker);
+       if (!io_queue_worker_create(worker, acct, create_worker_cont))
                kfree(worker);
-       }
 }
 
 static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq)
 
        while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
                struct io_worker *worker;
-               struct io_wqe_acct *acct;
 
                worker = container_of(cb, struct io_worker, create_work);
-               acct = io_wqe_get_acct(worker);
-               atomic_dec(&acct->nr_running);
-               raw_spin_lock(&worker->wqe->lock);
-               acct->nr_workers--;
-               raw_spin_unlock(&worker->wqe->lock);
-               io_worker_ref_put(wq);
-               clear_bit_unlock(0, &worker->create_state);
-               io_worker_release(worker);
+               io_worker_cancel_cb(worker);
        }
 
        rcu_read_lock();
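
io_worker_exit() above now drains any still-pending create_worker_cb task_work for the exiting worker by cancelling matching entries before dropping its reference. A standalone model of that cancel-by-match pattern (stand-in types; task_work_cancel_match() itself is the kernel API referenced above):

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *);
};

static void create_worker_cb(struct callback_head *cb) { (void)cb; }

/* pop the first pending callback that satisfies match(cb, data), or NULL */
static struct callback_head *cancel_match(struct callback_head **head,
                bool (*match)(struct callback_head *, void *), void *data)
{
        for (struct callback_head **p = head; *p; p = &(*p)->next) {
                if (match(*p, data)) {
                        struct callback_head *cb = *p;

                        *p = cb->next;          /* unlink from the pending list */
                        return cb;
                }
        }
        return NULL;
}

static bool match_create_cb(struct callback_head *cb, void *data)
{
        (void)data;
        return cb->func == create_worker_cb;
}

int main(void)
{
        struct callback_head a = { .func = create_worker_cb };
        struct callback_head b = { .func = NULL };
        struct callback_head *pending = &a;
        struct callback_head *cb;

        a.next = &b;
        b.next = NULL;

        while ((cb = cancel_match(&pending, match_create_cb, NULL)))
                printf("cancelled one pending create callback\n");      /* once */
        return 0;
}
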
index bf5c4c5..41bf376 100644
@@ -29,6 +29,17 @@ struct io_wq_work_list {
        struct io_wq_work_node *last;
 };
 
+#define wq_list_for_each(pos, prv, head)                       \
+       for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_for_each_resume(pos, prv)                      \
+       for (; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list)    (READ_ONCE((list)->first) == NULL)
+#define INIT_WQ_LIST(list)     do {                            \
+       (list)->first = NULL;                                   \
+} while (0)
+
 static inline void wq_list_add_after(struct io_wq_work_node *node,
                                     struct io_wq_work_node *pos,
                                     struct io_wq_work_list *list)
@@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
        }
 }
 
+static inline void wq_list_add_head(struct io_wq_work_node *node,
+                                   struct io_wq_work_list *list)
+{
+       node->next = list->first;
+       if (!node->next)
+               list->last = node;
+       WRITE_ONCE(list->first, node);
+}
+
 static inline void wq_list_cut(struct io_wq_work_list *list,
                               struct io_wq_work_node *last,
                               struct io_wq_work_node *prev)
@@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
        last->next = NULL;
 }
 
+static inline void __wq_list_splice(struct io_wq_work_list *list,
+                                   struct io_wq_work_node *to)
+{
+       list->last->next = to->next;
+       to->next = list->first;
+       INIT_WQ_LIST(list);
+}
+
+static inline bool wq_list_splice(struct io_wq_work_list *list,
+                                 struct io_wq_work_node *to)
+{
+       if (!wq_list_empty(list)) {
+               __wq_list_splice(list, to);
+               return true;
+       }
+       return false;
+}
+
+static inline void wq_stack_add_head(struct io_wq_work_node *node,
+                                    struct io_wq_work_node *stack)
+{
+       node->next = stack->next;
+       stack->next = node;
+}
+
 static inline void wq_list_del(struct io_wq_work_list *list,
                               struct io_wq_work_node *node,
                               struct io_wq_work_node *prev)
@@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
        wq_list_cut(list, node, prev);
 }
 
-#define wq_list_for_each(pos, prv, head)                       \
-       for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+static inline
+struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
+{
+       struct io_wq_work_node *node = stack->next;
 
-#define wq_list_empty(list)    (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list)     do {                            \
-       (list)->first = NULL;                                   \
-       (list)->last = NULL;                                    \
-} while (0)
+       stack->next = node->next;
+       return node;
+}
 
 struct io_wq_work {
        struct io_wq_work_node list;
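
The new wq_stack_add_head()/wq_stack_extract() helpers above form an intrusive LIFO threaded through a node embedded in each object, later used as the request cache free list. A standalone model with open-coded container_of arithmetic (stand-in types):

#include <stdio.h>
#include <stddef.h>

struct work_node { struct work_node *next; };

struct request {
        int id;
        struct work_node comp_list;     /* models io_kiocb->comp_list */
};

static void stack_add_head(struct work_node *node, struct work_node *stack)
{
        node->next = stack->next;
        stack->next = node;
}

static struct work_node *stack_extract(struct work_node *stack)
{
        struct work_node *node = stack->next;

        stack->next = node->next;       /* caller must check non-empty first */
        return node;
}

int main(void)
{
        struct work_node free_list = { .next = NULL };
        struct request a = { .id = 1 }, b = { .id = 2 };

        stack_add_head(&a.comp_list, &free_list);
        stack_add_head(&b.comp_list, &free_list);

        while (free_list.next) {
                struct work_node *n = stack_extract(&free_list);
                struct request *req = (struct request *)
                        ((char *)n - offsetof(struct request, comp_list));

                printf("popped request %d\n", req->id); /* 2 then 1 */
        }
        return 0;
}
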
index bc18af5..3a4af97 100644
 
 #define IORING_MAX_REG_BUFFERS (1U << 14)
 
-#define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
-                               IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
-                               IOSQE_BUFFER_SELECT)
+#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
+                         IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+
+#define SQE_VALID_FLAGS        (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
+
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
-                               REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+                               REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
+                               REQ_F_ASYNC_DATA)
 
 #define IO_TCTX_REFS_CACHE_NR  (1U << 10)
 
@@ -195,8 +198,10 @@ struct io_rings {
 };
 
 enum io_uring_cmd_flags {
-       IO_URING_F_NONBLOCK             = 1,
-       IO_URING_F_COMPLETE_DEFER       = 2,
+       IO_URING_F_COMPLETE_DEFER       = 1,
+       IO_URING_F_UNLOCKED             = 2,
+       /* int's last bit, sign checks are usually faster than a bit test */
+       IO_URING_F_NONBLOCK             = INT_MIN,
 };
 
 struct io_mapped_ubuf {
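
A standalone model of the INT_MIN trick above: placing IO_URING_F_NONBLOCK in int's sign bit lets the flag test compile down to a sign check. The names below are shortened stand-ins:

#include <stdio.h>
#include <limits.h>

enum cmd_flags {
        F_COMPLETE_DEFER = 1,
        F_UNLOCKED       = 2,
        F_NONBLOCK       = INT_MIN,     /* int's last bit */
};

static const char *mode(int issue_flags)
{
        /* equivalent to (issue_flags & F_NONBLOCK) != 0 for this layout */
        return issue_flags < 0 ? "nonblocking" : "blocking";
}

int main(void)
{
        printf("%s\n", mode(F_COMPLETE_DEFER));         /* blocking */
        printf("%s\n", mode(F_NONBLOCK | F_UNLOCKED));  /* nonblocking */
        return 0;
}
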
@@ -305,26 +310,16 @@ struct io_submit_link {
 };
 
 struct io_submit_state {
-       struct blk_plug         plug;
+       /* inline/task_work completion list, under ->uring_lock */
+       struct io_wq_work_node  free_list;
+       /* batch completion logic */
+       struct io_wq_work_list  compl_reqs;
        struct io_submit_link   link;
 
-       /*
-        * io_kiocb alloc cache
-        */
-       void                    *reqs[IO_REQ_CACHE_SIZE];
-       unsigned int            free_reqs;
-
        bool                    plug_started;
-
-       /*
-        * Batch completion logic
-        */
-       struct io_kiocb         *compl_reqs[IO_COMPL_BATCH];
-       unsigned int            compl_nr;
-       /* inline/task_work completion list, under ->uring_lock */
-       struct list_head        free_list;
-
-       unsigned int            ios_left;
+       bool                    need_plug;
+       unsigned short          submit_nr;
+       struct blk_plug         plug;
 };
 
 struct io_ring_ctx {
@@ -368,6 +363,7 @@ struct io_ring_ctx {
                 * uring_lock, and updated through io_uring_register(2)
                 */
                struct io_rsrc_node     *rsrc_node;
+               int                     rsrc_cached_refs;
                struct io_file_table    file_table;
                unsigned                nr_user_files;
                unsigned                nr_user_bufs;
@@ -384,7 +380,7 @@ struct io_ring_ctx {
        } ____cacheline_aligned_in_smp;
 
        /* IRQ completion list, under ->completion_lock */
-       struct list_head        locked_free_list;
+       struct io_wq_work_list  locked_free_list;
        unsigned int            locked_free_nr;
 
        const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
@@ -399,7 +395,6 @@ struct io_ring_ctx {
                unsigned                cached_cq_tail;
                unsigned                cq_entries;
                struct eventfd_ctx      *cq_ev_fd;
-               struct wait_queue_head  poll_wait;
                struct wait_queue_head  cq_wait;
                unsigned                cq_extra;
                atomic_t                cq_timeouts;
@@ -417,7 +412,7 @@ struct io_ring_ctx {
                 * For SQPOLL, only the single threaded io_sq_thread() will
                 * manipulate the list, hence no extra locking is needed there.
                 */
-               struct list_head        iopoll_list;
+               struct io_wq_work_list  iopoll_list;
                struct hlist_head       *cancel_hash;
                unsigned                cancel_hash_bits;
                bool                    poll_multi_queue;
@@ -580,7 +575,6 @@ struct io_sr_msg {
        int                             msg_flags;
        int                             bgid;
        size_t                          len;
-       struct io_buffer                *kbuf;
 };
 
 struct io_open {
@@ -692,11 +686,6 @@ struct io_hardlink {
        int                             flags;
 };
 
-struct io_completion {
-       struct file                     *file;
-       u32                             cflags;
-};
-
 struct io_async_connect {
        struct sockaddr_storage         address;
 };
@@ -710,11 +699,15 @@ struct io_async_msghdr {
        struct sockaddr_storage         addr;
 };
 
-struct io_async_rw {
-       struct iovec                    fast_iov[UIO_FASTIOV];
-       const struct iovec              *free_iovec;
+struct io_rw_state {
        struct iov_iter                 iter;
        struct iov_iter_state           iter_state;
+       struct iovec                    fast_iov[UIO_FASTIOV];
+};
+
+struct io_async_rw {
+       struct io_rw_state              s;
+       const struct iovec              *free_iovec;
        size_t                          bytes_done;
        struct wait_page_queue          wpq;
 };
@@ -741,9 +734,9 @@ enum {
        REQ_F_CREDS_BIT,
        REQ_F_REFCOUNT_BIT,
        REQ_F_ARM_LTIMEOUT_BIT,
+       REQ_F_ASYNC_DATA_BIT,
        /* keep async read/write and isreg together and in order */
-       REQ_F_NOWAIT_READ_BIT,
-       REQ_F_NOWAIT_WRITE_BIT,
+       REQ_F_SUPPORT_NOWAIT_BIT,
        REQ_F_ISREG_BIT,
 
        /* not a real bit, just to check we're not overflowing the space */
@@ -784,10 +777,8 @@ enum {
        REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
        /* caller should reissue async */
        REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
-       /* supports async reads */
-       REQ_F_NOWAIT_READ       = BIT(REQ_F_NOWAIT_READ_BIT),
-       /* supports async writes */
-       REQ_F_NOWAIT_WRITE      = BIT(REQ_F_NOWAIT_WRITE_BIT),
+       /* supports async reads/writes */
+       REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
        /* regular file */
        REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
        /* has creds assigned */
@@ -796,6 +787,8 @@ enum {
        REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
        /* there is a linked timeout that has to be armed */
        REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+       /* ->async_data allocated */
+       REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
 };
 
 struct async_poll {
@@ -852,39 +845,41 @@ struct io_kiocb {
                struct io_mkdir         mkdir;
                struct io_symlink       symlink;
                struct io_hardlink      hardlink;
-               /* use only after cleaning per-op data, see io_clean_op() */
-               struct io_completion    compl;
        };
 
-       /* opcode allocated if it needs to store data for async defer */
-       void                            *async_data;
        u8                              opcode;
        /* polled IO has completed */
        u8                              iopoll_completed;
-
        u16                             buf_index;
+       unsigned int                    flags;
+
+       u64                             user_data;
        u32                             result;
+       u32                             cflags;
 
        struct io_ring_ctx              *ctx;
-       unsigned int                    flags;
-       atomic_t                        refs;
        struct task_struct              *task;
-       u64                             user_data;
 
-       struct io_kiocb                 *link;
        struct percpu_ref               *fixed_rsrc_refs;
+       /* store used ubuf, so we can prevent reloading */
+       struct io_mapped_ubuf           *imu;
 
-       /* used with ctx->iopoll_list with reads/writes */
-       struct list_head                inflight_entry;
+       /* used by request caches, completion batching and iopoll */
+       struct io_wq_work_node          comp_list;
+       atomic_t                        refs;
+       struct io_kiocb                 *link;
        struct io_task_work             io_task_work;
        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
        struct hlist_node               hash_node;
+       /* internal polling, see IORING_FEAT_FAST_POLL */
        struct async_poll               *apoll;
+       /* opcode allocated if it needs to store data for async defer */
+       void                            *async_data;
        struct io_wq_work               work;
+       /* custom credentials, valid IFF REQ_F_CREDS is set */
        const struct cred               *creds;
-
-       /* store used ubuf, so we can prevent reloading */
-       struct io_mapped_ubuf           *imu;
+       /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+       struct io_buffer                *kbuf;
 };
 
 struct io_tctx_node {
@@ -902,12 +897,12 @@ struct io_defer_entry {
 struct io_op_def {
        /* needs req->file assigned */
        unsigned                needs_file : 1;
+       /* should block plug */
+       unsigned                plug : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
        unsigned                unbound_nonreg_file : 1;
-       /* opcode is not supported by this kernel */
-       unsigned                not_supported : 1;
        /* set if opcode supports polled "wait" */
        unsigned                pollin : 1;
        unsigned                pollout : 1;
@@ -915,8 +910,8 @@ struct io_op_def {
        unsigned                buffer_select : 1;
        /* do prep async if is going to be punted */
        unsigned                needs_async_setup : 1;
-       /* should block plug */
-       unsigned                plug : 1;
+       /* opcode is not supported by this kernel */
+       unsigned                not_supported : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
 };
@@ -1080,7 +1075,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 
 static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-                                long res, unsigned int cflags);
+                                s32 res, u32 cflags);
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req);
 static void io_dismantle_req(struct io_kiocb *req);
@@ -1095,7 +1090,7 @@ static void __io_queue_sqe(struct io_kiocb *req);
 static void io_rsrc_put_work(struct work_struct *work);
 
 static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_ring_ctx *ctx);
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 static int io_req_prep_async(struct io_kiocb *req);
 
 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
@@ -1167,6 +1162,12 @@ static inline void req_ref_get(struct io_kiocb *req)
        atomic_inc(&req->refs);
 }
 
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+       if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+               __io_submit_flush_completions(ctx);
+}
+
 static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
 {
        if (!(req->flags & REQ_F_REFCOUNT)) {
@@ -1180,13 +1181,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
        __io_req_set_refcount(req, 1);
 }
 
-static inline void io_req_set_rsrc_node(struct io_kiocb *req)
+#define IO_RSRC_REF_BATCH      100
+
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+                                         struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
 {
-       struct io_ring_ctx *ctx = req->ctx;
+       struct percpu_ref *ref = req->fixed_rsrc_refs;
+
+       if (ref) {
+               if (ref == &ctx->rsrc_node->refs)
+                       ctx->rsrc_cached_refs++;
+               else
+                       percpu_ref_put(ref);
+       }
+}
+
+static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+       if (req->fixed_rsrc_refs)
+               percpu_ref_put(req->fixed_rsrc_refs);
+}
+
+static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
+{
+       if (ctx->rsrc_cached_refs) {
+               percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+               ctx->rsrc_cached_refs = 0;
+       }
+}
+
+static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
+{
+       ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+       percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
 
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+                                       struct io_ring_ctx *ctx)
+{
        if (!req->fixed_rsrc_refs) {
                req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
-               percpu_ref_get(req->fixed_rsrc_refs);
+               ctx->rsrc_cached_refs--;
+               if (unlikely(ctx->rsrc_cached_refs < 0))
+                       io_rsrc_refs_refill(ctx);
        }
 }
 
@@ -1219,6 +1259,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
        return false;
 }
 
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+       return req->flags & REQ_F_ASYNC_DATA;
+}
+
 static inline void req_set_fail(struct io_kiocb *req)
 {
        req->flags |= REQ_F_FAIL;
@@ -1230,7 +1275,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
        req->result = res;
 }
 
-static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
 {
        struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 
@@ -1242,7 +1287,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
        return !req->timeout.off;
 }
 
-static void io_fallback_req_func(struct work_struct *work)
+static __cold void io_fallback_req_func(struct work_struct *work)
 {
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
                                                fallback_work.work);
@@ -1255,15 +1300,13 @@ static void io_fallback_req_func(struct work_struct *work)
                req->io_task_work.func(req, &locked);
 
        if (locked) {
-               if (ctx->submit_state.compl_nr)
-                       io_submit_flush_completions(ctx);
+               io_submit_flush_completions(ctx);
                mutex_unlock(&ctx->uring_lock);
        }
        percpu_ref_put(&ctx->refs);
-
 }
 
-static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
        struct io_ring_ctx *ctx;
        int hash_bits;
@@ -1300,7 +1343,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        ctx->flags = p->flags;
        init_waitqueue_head(&ctx->sqo_sq_wait);
        INIT_LIST_HEAD(&ctx->sqd_list);
-       init_waitqueue_head(&ctx->poll_wait);
        INIT_LIST_HEAD(&ctx->cq_overflow_list);
        init_completion(&ctx->ref_comp);
        xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
@@ -1309,7 +1351,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        init_waitqueue_head(&ctx->cq_wait);
        spin_lock_init(&ctx->completion_lock);
        spin_lock_init(&ctx->timeout_lock);
-       INIT_LIST_HEAD(&ctx->iopoll_list);
+       INIT_WQ_LIST(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
        INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1318,9 +1360,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
        init_llist_head(&ctx->rsrc_put_llist);
        INIT_LIST_HEAD(&ctx->tctx_list);
-       INIT_LIST_HEAD(&ctx->submit_state.free_list);
-       INIT_LIST_HEAD(&ctx->locked_free_list);
+       ctx->submit_state.free_list.next = NULL;
+       INIT_WQ_LIST(&ctx->locked_free_list);
        INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
+       INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
        return ctx;
 err:
        kfree(ctx->dummy_ubuf);
@@ -1348,21 +1391,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
        return false;
 }
 
-#define FFS_ASYNC_READ         0x1UL
-#define FFS_ASYNC_WRITE                0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG              0x4UL
-#else
-#define FFS_ISREG              0x0UL
-#endif
-#define FFS_MASK               ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+#define FFS_NOWAIT             0x1UL
+#define FFS_ISREG              0x2UL
+#define FFS_MASK               ~(FFS_NOWAIT|FFS_ISREG)
 
 static inline bool io_req_ffs_set(struct io_kiocb *req)
 {
-       return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
+       return req->flags & REQ_F_FIXED_FILE;
 }
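
With only two FFS bits left they fit in the spare low bits of a 4-byte-aligned struct file pointer even on 32-bit, which is presumably why the CONFIG_64BIT special-casing above could be dropped; FFS_MASK strips the tags before the pointer is used. A standalone sketch of that pointer-tagging idea with made-up names (assumes the pointed-to object is at least 4-byte aligned):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_NOWAIT      0x1UL
#define TAG_ISREG       0x2UL
#define TAG_MASK        (~(TAG_NOWAIT | TAG_ISREG))

struct file { int dummy; };             /* placeholder, 4-byte aligned */

/* stash per-file flag bits in the low bits of the pointer itself */
static uintptr_t pack(struct file *f, unsigned long flags)
{
        return (uintptr_t)f | flags;
}

static struct file *unpack(uintptr_t slot, unsigned long *flags)
{
        *flags = slot & ~TAG_MASK;
        return (struct file *)(slot & TAG_MASK);
}

int main(void)
{
        static struct file f;
        unsigned long flags;
        uintptr_t slot = pack(&f, TAG_NOWAIT | TAG_ISREG);

        assert(unpack(slot, &flags) == &f);
        assert(flags == (TAG_NOWAIT | TAG_ISREG));
        printf("flags=%#lx\n", flags);
        return 0;
}
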
 
-static void io_req_track_inflight(struct io_kiocb *req)
+static inline void io_req_track_inflight(struct io_kiocb *req)
 {
        if (!(req->flags & REQ_F_INFLIGHT)) {
                req->flags |= REQ_F_INFLIGHT;
@@ -1440,15 +1478,19 @@ static void io_prep_async_link(struct io_kiocb *req)
        }
 }
 
-static void io_queue_async_work(struct io_kiocb *req, bool *locked)
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+       struct io_submit_state *state = &req->ctx->submit_state;
+
+       wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
+static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
 {
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link = io_prep_linked_timeout(req);
        struct io_uring_task *tctx = req->task->io_uring;
 
-       /* must not take the lock, NULL it as a precaution */
-       locked = NULL;
-
        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);
 
@@ -1489,7 +1531,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
        }
 }
 
-static void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 {
        while (!list_empty(&ctx->defer_list)) {
                struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
@@ -1503,7 +1545,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
        }
 }
 
-static void io_flush_timeouts(struct io_ring_ctx *ctx)
+static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
        __must_hold(&ctx->completion_lock)
 {
        u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
@@ -1536,7 +1578,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
        spin_unlock_irq(&ctx->timeout_lock);
 }
 
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 {
        if (ctx->off_timeout_used)
                io_flush_timeouts(ctx);
@@ -1606,12 +1648,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
         */
        if (wq_has_sleeper(&ctx->cq_wait))
                wake_up_all(&ctx->cq_wait);
-       if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
-               wake_up(&ctx->sq_data->wait);
        if (io_should_trigger_evfd(ctx))
                eventfd_signal(ctx->cq_ev_fd, 1);
-       if (waitqueue_active(&ctx->poll_wait))
-               wake_up_interruptible(&ctx->poll_wait);
 }
 
 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1625,8 +1663,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
        }
        if (io_should_trigger_evfd(ctx))
                eventfd_signal(ctx->cq_ev_fd, 1);
-       if (waitqueue_active(&ctx->poll_wait))
-               wake_up_interruptible(&ctx->poll_wait);
 }
 
 /* Returns true if there are no backlogged entries after the flush */
@@ -1722,7 +1758,7 @@ static inline void io_get_task_refs(int nr)
 }
 
 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
-                                    long res, unsigned int cflags)
+                                    s32 res, u32 cflags)
 {
        struct io_overflow_cqe *ocqe;
 
@@ -1750,7 +1786,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 }
 
 static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-                                         long res, unsigned int cflags)
+                                         s32 res, u32 cflags)
 {
        struct io_uring_cqe *cqe;
 
@@ -1773,13 +1809,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
 
 /* not hot enough to justify bloating with inlining */
 static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-                                         long res, unsigned int cflags)
+                                         s32 res, u32 cflags)
 {
        return __io_cqring_fill_event(ctx, user_data, res, cflags);
 }
 
-static void io_req_complete_post(struct io_kiocb *req, long res,
-                                unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, s32 res,
+                                u32 cflags)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
@@ -1798,40 +1834,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
                                req->link = NULL;
                        }
                }
+               io_req_put_rsrc(req, ctx);
                io_dismantle_req(req);
                io_put_task(req->task, 1);
-               list_add(&req->inflight_entry, &ctx->locked_free_list);
+               wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
                ctx->locked_free_nr++;
-       } else {
-               if (!percpu_ref_tryget(&ctx->refs))
-                       req = NULL;
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
-
-       if (req) {
-               io_cqring_ev_posted(ctx);
-               percpu_ref_put(&ctx->refs);
-       }
-}
-
-static inline bool io_req_needs_clean(struct io_kiocb *req)
-{
-       return req->flags & IO_REQ_CLEAN_FLAGS;
+       io_cqring_ev_posted(ctx);
 }
 
-static void io_req_complete_state(struct io_kiocb *req, long res,
-                                 unsigned int cflags)
+static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
+                                        u32 cflags)
 {
-       if (io_req_needs_clean(req))
-               io_clean_op(req);
        req->result = res;
-       req->compl.cflags = cflags;
+       req->cflags = cflags;
        req->flags |= REQ_F_COMPLETE_INLINE;
 }
 
 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
-                                    long res, unsigned cflags)
+                                    s32 res, u32 cflags)
 {
        if (issue_flags & IO_URING_F_COMPLETE_DEFER)
                io_req_complete_state(req, res, cflags);
@@ -1839,12 +1862,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
                io_req_complete_post(req, res, cflags);
 }
 
-static inline void io_req_complete(struct io_kiocb *req, long res)
+static inline void io_req_complete(struct io_kiocb *req, s32 res)
 {
        __io_req_complete(req, 0, res, 0);
 }
 
-static void io_req_complete_failed(struct io_kiocb *req, long res)
+static void io_req_complete_failed(struct io_kiocb *req, s32 res)
 {
        req_set_fail(req);
        io_req_complete_post(req, res, 0);
@@ -1878,7 +1901,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
                                        struct io_submit_state *state)
 {
        spin_lock(&ctx->completion_lock);
-       list_splice_init(&ctx->locked_free_list, &state->free_list);
+       wq_list_splice(&ctx->locked_free_list, &state->free_list);
        ctx->locked_free_nr = 0;
        spin_unlock(&ctx->completion_lock);
 }
@@ -1887,7 +1910,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 {
        struct io_submit_state *state = &ctx->submit_state;
-       int nr;
 
        /*
         * If we have more than a batch's worth of requests in our IRQ side
@@ -1896,20 +1918,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
         */
        if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
                io_flush_cached_locked_reqs(ctx, state);
-
-       nr = state->free_reqs;
-       while (!list_empty(&state->free_list)) {
-               struct io_kiocb *req = list_first_entry(&state->free_list,
-                                       struct io_kiocb, inflight_entry);
-
-               list_del(&req->inflight_entry);
-               state->reqs[nr++] = req;
-               if (nr == ARRAY_SIZE(state->reqs))
-                       break;
-       }
-
-       state->free_reqs = nr;
-       return nr != 0;
+       return !!state->free_list.next;
 }
 
 /*
@@ -1918,38 +1927,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
  * Because of that, io_alloc_req() should be called only under ->uring_lock
  * and with extra caution to not get a request that is still worked on.
  */
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
 {
        struct io_submit_state *state = &ctx->submit_state;
        gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+       void *reqs[IO_REQ_ALLOC_BATCH];
+       struct io_kiocb *req;
        int ret, i;
 
-       BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
-
-       if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
-               goto got_req;
+       if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
+               return true;
 
-       ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
-                                   state->reqs);
+       ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
 
        /*
         * Bulk alloc is all-or-nothing. If we fail to get a batch,
         * retry single alloc to be on the safe side.
         */
        if (unlikely(ret <= 0)) {
-               state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
-               if (!state->reqs[0])
-                       return NULL;
+               reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+               if (!reqs[0])
+                       return false;
                ret = 1;
        }
 
-       for (i = 0; i < ret; i++)
-               io_preinit_req(state->reqs[i], ctx);
-       state->free_reqs = ret;
-got_req:
-       state->free_reqs--;
-       return state->reqs[state->free_reqs];
+       percpu_ref_get_many(&ctx->refs, ret);
+       for (i = 0; i < ret; i++) {
+               req = reqs[i];
+
+               io_preinit_req(req, ctx);
+               wq_stack_add_head(&req->comp_list, &state->free_list);
+       }
+       return true;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+       if (unlikely(!ctx->submit_state.free_list.next))
+               return __io_alloc_req_refill(ctx);
+       return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+       struct io_wq_work_node *node;
+
+       node = wq_stack_extract(&ctx->submit_state.free_list);
+       return container_of(node, struct io_kiocb, comp_list);
 }
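
The refill path above swaps the old fixed reqs[] array for an intrusive free-list stack: kmem_cache_alloc_bulk() fills the cache, a single allocation is the fallback, and io_alloc_req() just pops the head. A rough userspace analogue using malloc() and a singly linked LIFO (simplified: no all-or-nothing bulk allocator, no slab cache):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define ALLOC_BATCH 8

struct req {
        struct req *next;       /* intrusive link, like ->comp_list */
        int data;
};

static struct req *free_list;   /* LIFO cache of ready-to-use requests */

/* refill the cache: grab up to a batch, settle for whatever we got */
static bool req_cache_refill(void)
{
        int got = 0;

        for (int i = 0; i < ALLOC_BATCH; i++) {
                struct req *r = malloc(sizeof(*r));

                if (!r)
                        break;
                r->next = free_list;
                free_list = r;
                got++;
        }
        return got != 0;
}

static struct req *req_alloc(void)
{
        struct req *r;

        if (!free_list && !req_cache_refill())
                return NULL;
        r = free_list;
        free_list = r->next;
        return r;
}

/* completed requests go back on the cache instead of being free()d */
static void req_free(struct req *r)
{
        r->next = free_list;
        free_list = r;
}

int main(void)
{
        struct req *a = req_alloc();

        if (!a)
                return 1;
        a->data = 42;
        req_free(a);
        printf("%s\n", req_alloc() == a ? "recycled" : "fresh");
        return 0;
}
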
 
 static inline void io_put_file(struct file *file)
@@ -1958,35 +1983,28 @@ static inline void io_put_file(struct file *file)
                fput(file);
 }
 
-static void io_dismantle_req(struct io_kiocb *req)
+static inline void io_dismantle_req(struct io_kiocb *req)
 {
        unsigned int flags = req->flags;
 
-       if (io_req_needs_clean(req))
+       if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
                io_clean_op(req);
        if (!(flags & REQ_F_FIXED_FILE))
                io_put_file(req->file);
-       if (req->fixed_rsrc_refs)
-               percpu_ref_put(req->fixed_rsrc_refs);
-       if (req->async_data) {
-               kfree(req->async_data);
-               req->async_data = NULL;
-       }
 }
 
-static void __io_free_req(struct io_kiocb *req)
+static __cold void __io_free_req(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
+       io_req_put_rsrc(req, ctx);
        io_dismantle_req(req);
        io_put_task(req->task, 1);
 
        spin_lock(&ctx->completion_lock);
-       list_add(&req->inflight_entry, &ctx->locked_free_list);
+       wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
        ctx->locked_free_nr++;
        spin_unlock(&ctx->completion_lock);
-
-       percpu_ref_put(&ctx->refs);
 }
 
 static inline void io_remove_next_linked(struct io_kiocb *req)
@@ -2072,47 +2090,45 @@ static bool io_disarm_next(struct io_kiocb *req)
        return posted;
 }
 
-static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+static void __io_req_find_next_prep(struct io_kiocb *req)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+       bool posted;
+
+       spin_lock(&ctx->completion_lock);
+       posted = io_disarm_next(req);
+       if (posted)
+               io_commit_cqring(req->ctx);
+       spin_unlock(&ctx->completion_lock);
+       if (posted)
+               io_cqring_ev_posted(ctx);
+}
+
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 {
        struct io_kiocb *nxt;
 
+       if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
+               return NULL;
        /*
         * If LINK is set, we have dependent requests in this chain. If we
         * didn't fail this request, queue the first one up, moving any other
         * dependencies to the next request. In case of failure, fail the rest
         * of the chain.
         */
-       if (req->flags & IO_DISARM_MASK) {
-               struct io_ring_ctx *ctx = req->ctx;
-               bool posted;
-
-               spin_lock(&ctx->completion_lock);
-               posted = io_disarm_next(req);
-               if (posted)
-                       io_commit_cqring(req->ctx);
-               spin_unlock(&ctx->completion_lock);
-               if (posted)
-                       io_cqring_ev_posted(ctx);
-       }
+       if (unlikely(req->flags & IO_DISARM_MASK))
+               __io_req_find_next_prep(req);
        nxt = req->link;
        req->link = NULL;
        return nxt;
 }
 
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
-       if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
-               return NULL;
-       return __io_req_find_next(req);
-}
-
 static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 {
        if (!ctx)
                return;
        if (*locked) {
-               if (ctx->submit_state.compl_nr)
-                       io_submit_flush_completions(ctx);
+               io_submit_flush_completions(ctx);
                mutex_unlock(&ctx->uring_lock);
                *locked = false;
        }
@@ -2129,7 +2145,7 @@ static void tctx_task_work(struct callback_head *cb)
        while (1) {
                struct io_wq_work_node *node;
 
-               if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+               if (!tctx->task_list.first && locked)
                        io_submit_flush_completions(ctx);
 
                spin_lock_irq(&tctx->task_lock);
@@ -2192,8 +2208,9 @@ static void io_req_task_work_add(struct io_kiocb *req)
         * will do the job.
         */
        notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
-       if (!task_work_add(tsk, &tctx->task_work, notify)) {
-               wake_up_process(tsk);
+       if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
+               if (notify == TWA_NONE)
+                       wake_up_process(tsk);
                return;
        }
 
@@ -2271,77 +2288,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
        io_free_req(req);
 }
 
-struct req_batch {
-       struct task_struct      *task;
-       int                     task_refs;
-       int                     ctx_refs;
-};
-
-static inline void io_init_req_batch(struct req_batch *rb)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+                               struct io_wq_work_node *node)
+       __must_hold(&ctx->uring_lock)
 {
-       rb->task_refs = 0;
-       rb->ctx_refs = 0;
-       rb->task = NULL;
-}
+       struct task_struct *task = NULL;
+       int task_refs = 0;
 
-static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
-                                    struct req_batch *rb)
-{
-       if (rb->ctx_refs)
-               percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
-       if (rb->task)
-               io_put_task(rb->task, rb->task_refs);
-}
+       do {
+               struct io_kiocb *req = container_of(node, struct io_kiocb,
+                                                   comp_list);
 
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
-                             struct io_submit_state *state)
-{
-       io_queue_next(req);
-       io_dismantle_req(req);
+               if (unlikely(req->flags & REQ_F_REFCOUNT)) {
+                       node = req->comp_list.next;
+                       if (!req_ref_put_and_test(req))
+                               continue;
+               }
 
-       if (req->task != rb->task) {
-               if (rb->task)
-                       io_put_task(rb->task, rb->task_refs);
-               rb->task = req->task;
-               rb->task_refs = 0;
-       }
-       rb->task_refs++;
-       rb->ctx_refs++;
+               io_req_put_rsrc_locked(req, ctx);
+               io_queue_next(req);
+               io_dismantle_req(req);
 
-       if (state->free_reqs != ARRAY_SIZE(state->reqs))
-               state->reqs[state->free_reqs++] = req;
-       else
-               list_add(&req->inflight_entry, &state->free_list);
+               if (req->task != task) {
+                       if (task)
+                               io_put_task(task, task_refs);
+                       task = req->task;
+                       task_refs = 0;
+               }
+               task_refs++;
+               node = req->comp_list.next;
+               wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+       } while (node);
+
+       if (task)
+               io_put_task(task, task_refs);
 }
 
-static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
 {
+       struct io_wq_work_node *node, *prev;
        struct io_submit_state *state = &ctx->submit_state;
-       int i, nr = state->compl_nr;
-       struct req_batch rb;
 
        spin_lock(&ctx->completion_lock);
-       for (i = 0; i < nr; i++) {
-               struct io_kiocb *req = state->compl_reqs[i];
+       wq_list_for_each(node, prev, &state->compl_reqs) {
+               struct io_kiocb *req = container_of(node, struct io_kiocb,
+                                                   comp_list);
 
                __io_cqring_fill_event(ctx, req->user_data, req->result,
-                                       req->compl.cflags);
+                                       req->cflags);
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        io_cqring_ev_posted(ctx);
 
-       io_init_req_batch(&rb);
-       for (i = 0; i < nr; i++) {
-               struct io_kiocb *req = state->compl_reqs[i];
-
-               if (req_ref_put_and_test(req))
-                       io_req_free_batch(&rb, req, &ctx->submit_state);
-       }
-
-       io_req_free_batch_finish(ctx, &rb);
-       state->compl_nr = 0;
+       io_free_batch_list(ctx, state->compl_reqs.first);
+       INIT_WQ_LIST(&state->compl_reqs);
 }
 
 /*
@@ -2401,12 +2403,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
 
 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
 {
-       struct io_buffer *kbuf;
-
        if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
                return 0;
-       kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
-       return io_put_kbuf(req, kbuf);
+       return io_put_kbuf(req, req->kbuf);
 }
 
 static inline bool io_run_task_work(void)
@@ -2420,50 +2419,22 @@ static inline bool io_run_task_work(void)
        return false;
 }
 
-/*
- * Find and free completed poll iocbs
- */
-static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
-                              struct list_head *done)
-{
-       struct req_batch rb;
-       struct io_kiocb *req;
-
-       /* order with ->result store in io_complete_rw_iopoll() */
-       smp_rmb();
-
-       io_init_req_batch(&rb);
-       while (!list_empty(done)) {
-               req = list_first_entry(done, struct io_kiocb, inflight_entry);
-               list_del(&req->inflight_entry);
-
-               __io_cqring_fill_event(ctx, req->user_data, req->result,
-                                       io_put_rw_kbuf(req));
-               (*nr_events)++;
-
-               if (req_ref_put_and_test(req))
-                       io_req_free_batch(&rb, req, &ctx->submit_state);
-       }
-
-       io_commit_cqring(ctx);
-       io_cqring_ev_posted_iopoll(ctx);
-       io_req_free_batch_finish(ctx, &rb);
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
-                       long min)
+static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
-       struct io_kiocb *req, *tmp;
-       LIST_HEAD(done);
-       bool spin;
+       struct io_wq_work_node *pos, *start, *prev;
+       unsigned int poll_flags = BLK_POLL_NOSLEEP;
+       DEFINE_IO_COMP_BATCH(iob);
+       int nr_events = 0;
 
        /*
         * Only spin for completions if we don't have multiple devices hanging
-        * off our complete list, and we're under the requested amount.
+        * off our complete list.
         */
-       spin = !ctx->poll_multi_queue && *nr_events < min;
+       if (ctx->poll_multi_queue || force_nonspin)
+               poll_flags |= BLK_POLL_ONESHOT;
 
-       list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+       wq_list_for_each(pos, start, &ctx->iopoll_list) {
+               struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
                struct kiocb *kiocb = &req->rw.kiocb;
                int ret;
 
@@ -2472,47 +2443,62 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                 * If we find a request that requires polling, break out
                 * and complete those lists first, if we have entries there.
                 */
-               if (READ_ONCE(req->iopoll_completed)) {
-                       list_move_tail(&req->inflight_entry, &done);
-                       continue;
-               }
-               if (!list_empty(&done))
+               if (READ_ONCE(req->iopoll_completed))
                        break;
 
-               ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+               ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
                if (unlikely(ret < 0))
                        return ret;
                else if (ret)
-                       spin = false;
+                       poll_flags |= BLK_POLL_ONESHOT;
 
                /* iopoll may have completed current req */
-               if (READ_ONCE(req->iopoll_completed))
-                       list_move_tail(&req->inflight_entry, &done);
+               if (!rq_list_empty(iob.req_list) ||
+                   READ_ONCE(req->iopoll_completed))
+                       break;
        }
 
-       if (!list_empty(&done))
-               io_iopoll_complete(ctx, nr_events, &done);
+       if (!rq_list_empty(iob.req_list))
+               iob.complete(&iob);
+       else if (!pos)
+               return 0;
+
+       prev = start;
+       wq_list_for_each_resume(pos, prev) {
+               struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
 
-       return 0;
+               /* order with io_complete_rw_iopoll(), e.g. ->result updates */
+               if (!smp_load_acquire(&req->iopoll_completed))
+                       break;
+               __io_cqring_fill_event(ctx, req->user_data, req->result,
+                                       io_put_rw_kbuf(req));
+               nr_events++;
+       }
+
+       if (unlikely(!nr_events))
+               return 0;
+
+       io_commit_cqring(ctx);
+       io_cqring_ev_posted_iopoll(ctx);
+       pos = start ? start->next : ctx->iopoll_list.first;
+       wq_list_cut(&ctx->iopoll_list, prev, start);
+       io_free_batch_list(ctx, pos);
+       return nr_events;
 }
 
 /*
  * We can't just wait for polled events to come to us, we have to actively
  * find and complete them.
  */
-static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
+static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 {
        if (!(ctx->flags & IORING_SETUP_IOPOLL))
                return;
 
        mutex_lock(&ctx->uring_lock);
-       while (!list_empty(&ctx->iopoll_list)) {
-               unsigned int nr_events = 0;
-
-               io_do_iopoll(ctx, &nr_events, 0);
-
+       while (!wq_list_empty(&ctx->iopoll_list)) {
                /* let it sleep and repeat later if we can't complete a request */
-               if (nr_events == 0)
+               if (io_do_iopoll(ctx, true) == 0)
                        break;
                /*
                 * Ensure we allow local-to-the-cpu processing to take place,
@@ -2559,7 +2545,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
                 * forever, while the workqueue is stuck trying to acquire the
                 * very same mutex.
                 */
-               if (list_empty(&ctx->iopoll_list)) {
+               if (wq_list_empty(&ctx->iopoll_list)) {
                        u32 tail = ctx->cached_cq_tail;
 
                        mutex_unlock(&ctx->uring_lock);
@@ -2568,11 +2554,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 
                        /* some requests don't go through iopoll_list */
                        if (tail != ctx->cached_cq_tail ||
-                           list_empty(&ctx->iopoll_list))
+                           wq_list_empty(&ctx->iopoll_list))
                                break;
                }
-               ret = io_do_iopoll(ctx, &nr_events, min);
-       } while (!ret && nr_events < min && !need_resched());
+               ret = io_do_iopoll(ctx, !min);
+               if (ret < 0)
+                       break;
+               nr_events += ret;
+               ret = 0;
+       } while (nr_events < min && !need_resched());
 out:
        mutex_unlock(&ctx->uring_lock);
        return ret;
@@ -2597,9 +2587,9 @@ static bool io_resubmit_prep(struct io_kiocb *req)
 {
        struct io_async_rw *rw = req->async_data;
 
-       if (!rw)
+       if (!req_has_async_data(req))
                return !io_req_prep_async(req);
-       iov_iter_restore(&rw->iter, &rw->iter_state);
+       iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
        return true;
 }
 
@@ -2643,7 +2633,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 {
        if (req->rw.kiocb.ki_flags & IOCB_WRITE)
                kiocb_end_write(req);
-       if (res != req->result) {
+       if (unlikely(res != req->result)) {
                if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
                    io_rw_should_reissue(req)) {
                        req->flags |= REQ_F_REISSUE;
@@ -2658,16 +2648,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 static void io_req_task_complete(struct io_kiocb *req, bool *locked)
 {
        unsigned int cflags = io_put_rw_kbuf(req);
-       long res = req->result;
+       int res = req->result;
 
        if (*locked) {
-               struct io_ring_ctx *ctx = req->ctx;
-               struct io_submit_state *state = &ctx->submit_state;
-
                io_req_complete_state(req, res, cflags);
-               state->compl_reqs[state->compl_nr++] = req;
-               if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
-                       io_submit_flush_completions(ctx);
+               io_req_add_compl_list(req);
        } else {
                io_req_complete_post(req, res, cflags);
        }
@@ -2681,7 +2666,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
        __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
 }
 
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw(struct kiocb *kiocb, long res)
 {
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
@@ -2692,7 +2677,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
        io_req_task_work_add(req);
 }
 
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
 {
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
@@ -2703,12 +2688,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
                        req->flags |= REQ_F_REISSUE;
                        return;
                }
+               req->result = res;
        }
 
-       WRITE_ONCE(req->result, res);
-       /* order with io_iopoll_complete() checking ->result */
-       smp_wmb();
-       WRITE_ONCE(req->iopoll_completed, 1);
+       /* order with io_iopoll_complete() checking ->iopoll_completed */
+       smp_store_release(&req->iopoll_completed, 1);
 }
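
The smp_store_release() here pairs with the smp_load_acquire() of ->iopoll_completed earlier in io_do_iopoll(), replacing the old smp_wmb()/smp_rmb() pair: once the reaper sees the flag set, the preceding ->result store is guaranteed to be visible. A small userspace illustration of that release/acquire pairing using C11 atomics and pthreads (not the kernel primitives); build with -pthread:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct req {
        int result;             /* plain payload, written before the flag */
        atomic_int completed;
};

static struct req r;

static void *completer(void *arg)
{
        (void)arg;
        r.result = 42;
        /* publish: everything written above is ordered before this store */
        atomic_store_explicit(&r.completed, 1, memory_order_release);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, completer, NULL);

        /* acquire pairs with the release: seeing 1 implies seeing result=42 */
        while (!atomic_load_explicit(&r.completed, memory_order_acquire))
                ;
        printf("result=%d\n", r.result);
        pthread_join(t, NULL);
        return 0;
}
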
 
 /*
@@ -2717,13 +2701,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  * find it from a io_do_iopoll() thread before the issuer is done
  * accessing the kiocb cookie.
  */
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_ring_ctx *ctx = req->ctx;
-       const bool in_async = io_wq_current_is_worker();
+       const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
        /* workqueue context doesn't hold uring_lock, grab it now */
-       if (unlikely(in_async))
+       if (unlikely(needs_lock))
                mutex_lock(&ctx->uring_lock);
 
        /*
@@ -2731,23 +2715,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
         * how we do polling eventually, not spinning if we're on potentially
         * different devices.
         */
-       if (list_empty(&ctx->iopoll_list)) {
+       if (wq_list_empty(&ctx->iopoll_list)) {
                ctx->poll_multi_queue = false;
        } else if (!ctx->poll_multi_queue) {
                struct io_kiocb *list_req;
-               unsigned int queue_num0, queue_num1;
 
-               list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
-                                               inflight_entry);
-
-               if (list_req->file != req->file) {
+               list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
+                                       comp_list);
+               if (list_req->file != req->file)
                        ctx->poll_multi_queue = true;
-               } else {
-                       queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
-                       queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
-                       if (queue_num0 != queue_num1)
-                               ctx->poll_multi_queue = true;
-               }
        }
 
        /*
@@ -2755,11 +2731,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
         * it to the front so we find it first.
         */
        if (READ_ONCE(req->iopoll_completed))
-               list_add(&req->inflight_entry, &ctx->iopoll_list);
+               wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
        else
-               list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+               wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
 
-       if (unlikely(in_async)) {
+       if (unlikely(needs_lock)) {
                /*
                 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
                 * in sq thread task context or in io worker task context. If
@@ -2784,10 +2760,8 @@ static bool io_bdev_nowait(struct block_device *bdev)
  * any file. For now, just ensure that anything potentially problematic is done
  * inline.
  */
-static bool __io_file_supports_nowait(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, umode_t mode)
 {
-       umode_t mode = file_inode(file)->i_mode;
-
        if (S_ISBLK(mode)) {
                if (IS_ENABLED(CONFIG_BLOCK) &&
                    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
@@ -2807,28 +2781,32 @@ static bool __io_file_supports_nowait(struct file *file, int rw)
        /* any ->read/write should understand O_NONBLOCK */
        if (file->f_flags & O_NONBLOCK)
                return true;
+       return file->f_mode & FMODE_NOWAIT;
+}
 
-       if (!(file->f_mode & FMODE_NOWAIT))
-               return false;
-
-       if (rw == READ)
-               return file->f_op->read_iter != NULL;
+/*
+ * Compute the FFS_* flag bits (FFS_ISREG, FFS_NOWAIT) for a file once, so
+ * they can be cached in the fixed file table or in the request flags instead
+ * of being rechecked for every request.
+ */
+static unsigned int io_file_get_flags(struct file *file)
+{
+       umode_t mode = file_inode(file)->i_mode;
+       unsigned int res = 0;
 
-       return file->f_op->write_iter != NULL;
+       if (S_ISREG(mode))
+               res |= FFS_ISREG;
+       if (__io_file_supports_nowait(file, mode))
+               res |= FFS_NOWAIT;
+       return res;
 }
 
-static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
 {
-       if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
-               return true;
-       else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
-               return true;
-
-       return __io_file_supports_nowait(req->file, rw);
+       return req->flags & REQ_F_SUPPORT_NOWAIT;
 }
 
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-                     int rw)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_ring_ctx *ctx = req->ctx;
        struct kiocb *kiocb = &req->rw.kiocb;
@@ -2836,16 +2814,15 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        unsigned ioprio;
        int ret;
 
-       if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
-               req->flags |= REQ_F_ISREG;
+       if (!io_req_ffs_set(req))
+               req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
 
        kiocb->ki_pos = READ_ONCE(sqe->off);
        if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
                req->flags |= REQ_F_CUR_POS;
                kiocb->ki_pos = file->f_pos;
        }
-       kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
-       kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+       kiocb->ki_flags = iocb_flags(file);
        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
        if (unlikely(ret))
                return ret;
@@ -2856,22 +2833,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
         * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
         */
        if ((kiocb->ki_flags & IOCB_NOWAIT) ||
-           ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
+           ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
                req->flags |= REQ_F_NOWAIT;
 
-       ioprio = READ_ONCE(sqe->ioprio);
-       if (ioprio) {
-               ret = ioprio_check_cap(ioprio);
-               if (ret)
-                       return ret;
-
-               kiocb->ki_ioprio = ioprio;
-       } else
-               kiocb->ki_ioprio = get_current_ioprio();
-
        if (ctx->flags & IORING_SETUP_IOPOLL) {
-               if (!(kiocb->ki_flags & IOCB_DIRECT) ||
-                   !kiocb->ki_filp->f_op->iopoll)
+               if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
                        return -EOPNOTSUPP;
 
                kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
@@ -2883,12 +2849,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                kiocb->ki_complete = io_complete_rw;
        }
 
-       if (req->opcode == IORING_OP_READ_FIXED ||
-           req->opcode == IORING_OP_WRITE_FIXED) {
-               req->imu = NULL;
-               io_req_set_rsrc_node(req);
+       ioprio = READ_ONCE(sqe->ioprio);
+       if (ioprio) {
+               ret = ioprio_check_cap(ioprio);
+               if (ret)
+                       return ret;
+
+               kiocb->ki_ioprio = ioprio;
+       } else {
+               kiocb->ki_ioprio = get_current_ioprio();
        }
 
+       req->imu = NULL;
        req->rw.addr = READ_ONCE(sqe->addr);
        req->rw.len = READ_ONCE(sqe->len);
        req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2912,7 +2884,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
                ret = -EINTR;
                fallthrough;
        default:
-               kiocb->ki_complete(kiocb, ret, 0);
+               kiocb->ki_complete(kiocb, ret);
        }
 }
 
@@ -2923,7 +2895,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
        struct io_async_rw *io = req->async_data;
 
        /* add previously done IO, if any */
-       if (io && io->bytes_done > 0) {
+       if (req_has_async_data(req) && io->bytes_done > 0) {
                if (ret < 0)
                        ret = io->bytes_done;
                else
@@ -2946,7 +2918,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
                        struct io_ring_ctx *ctx = req->ctx;
 
                        req_set_fail(req);
-                       if (!(issue_flags & IO_URING_F_NONBLOCK)) {
+                       if (issue_flags & IO_URING_F_UNLOCKED) {
                                mutex_lock(&ctx->uring_lock);
                                __io_req_complete(req, issue_flags, ret, cflags);
                                mutex_unlock(&ctx->uring_lock);
@@ -3017,13 +2989,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
 
 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 {
-       struct io_ring_ctx *ctx = req->ctx;
        struct io_mapped_ubuf *imu = req->imu;
        u16 index, buf_index = req->buf_index;
 
        if (likely(!imu)) {
+               struct io_ring_ctx *ctx = req->ctx;
+
                if (unlikely(buf_index >= ctx->nr_user_bufs))
                        return -EFAULT;
+               io_req_set_rsrc_node(req, ctx);
                index = array_index_nospec(buf_index, ctx->nr_user_bufs);
                imu = READ_ONCE(ctx->user_bufs[index]);
                req->imu = imu;
@@ -3050,10 +3024,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
 }
 
 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
-                                         int bgid, struct io_buffer *kbuf,
-                                         bool needs_lock)
+                                         int bgid, unsigned int issue_flags)
 {
+       struct io_buffer *kbuf = req->kbuf;
        struct io_buffer *head;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
        if (req->flags & REQ_F_BUFFER_SELECTED)
                return kbuf;
@@ -3074,34 +3049,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
                }
                if (*len > kbuf->len)
                        *len = kbuf->len;
+               req->flags |= REQ_F_BUFFER_SELECTED;
+               req->kbuf = kbuf;
        } else {
                kbuf = ERR_PTR(-ENOBUFS);
        }
 
        io_ring_submit_unlock(req->ctx, needs_lock);
-
        return kbuf;
 }
 
 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
-                                       bool needs_lock)
+                                       unsigned int issue_flags)
 {
        struct io_buffer *kbuf;
        u16 bgid;
 
-       kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
        bgid = req->buf_index;
-       kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+       kbuf = io_buffer_select(req, len, bgid, issue_flags);
        if (IS_ERR(kbuf))
                return kbuf;
-       req->rw.addr = (u64) (unsigned long) kbuf;
-       req->flags |= REQ_F_BUFFER_SELECTED;
        return u64_to_user_ptr(kbuf->addr);
 }
 
 #ifdef CONFIG_COMPAT
 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
-                               bool needs_lock)
+                               unsigned int issue_flags)
 {
        struct compat_iovec __user *uiov;
        compat_ssize_t clen;
@@ -3117,7 +3090,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
                return -EINVAL;
 
        len = clen;
-       buf = io_rw_buffer_select(req, &len, needs_lock);
+       buf = io_rw_buffer_select(req, &len, issue_flags);
        if (IS_ERR(buf))
                return PTR_ERR(buf);
        iov[0].iov_base = buf;
@@ -3127,7 +3100,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 #endif
 
 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
-                                     bool needs_lock)
+                                     unsigned int issue_flags)
 {
        struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
        void __user *buf;
@@ -3139,7 +3112,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
        len = iov[0].iov_len;
        if (len < 0)
                return -EINVAL;
-       buf = io_rw_buffer_select(req, &len, needs_lock);
+       buf = io_rw_buffer_select(req, &len, issue_flags);
        if (IS_ERR(buf))
                return PTR_ERR(buf);
        iov[0].iov_base = buf;
@@ -3148,12 +3121,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 }
 
 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
-                                   bool needs_lock)
+                                   unsigned int issue_flags)
 {
        if (req->flags & REQ_F_BUFFER_SELECTED) {
-               struct io_buffer *kbuf;
+               struct io_buffer *kbuf = req->kbuf;
 
-               kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
                iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
                iov[0].iov_len = kbuf->len;
                return 0;
@@ -3163,52 +3135,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 
 #ifdef CONFIG_COMPAT
        if (req->ctx->compat)
-               return io_compat_import(req, iov, needs_lock);
+               return io_compat_import(req, iov, issue_flags);
 #endif
 
-       return __io_iov_buffer_select(req, iov, needs_lock);
+       return __io_iov_buffer_select(req, iov, issue_flags);
 }
 
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
-                          struct iov_iter *iter, bool needs_lock)
+static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
+                                      struct io_rw_state *s,
+                                      unsigned int issue_flags)
 {
-       void __user *buf = u64_to_user_ptr(req->rw.addr);
-       size_t sqe_len = req->rw.len;
+       struct iov_iter *iter = &s->iter;
        u8 opcode = req->opcode;
+       struct iovec *iovec;
+       void __user *buf;
+       size_t sqe_len;
        ssize_t ret;
 
-       if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-               *iovec = NULL;
-               return io_import_fixed(req, rw, iter);
-       }
+       BUILD_BUG_ON(ERR_PTR(0) != NULL);
+
+       if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
+               return ERR_PTR(io_import_fixed(req, rw, iter));
 
        /* buffer index only valid with fixed read/write, or buffer select  */
-       if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
-               return -EINVAL;
+       if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
+               return ERR_PTR(-EINVAL);
+
+       buf = u64_to_user_ptr(req->rw.addr);
+       sqe_len = req->rw.len;
 
        if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
                if (req->flags & REQ_F_BUFFER_SELECT) {
-                       buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+                       buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
                        if (IS_ERR(buf))
-                               return PTR_ERR(buf);
+                               return ERR_CAST(buf);
                        req->rw.len = sqe_len;
                }
 
-               ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
-               *iovec = NULL;
-               return ret;
+               ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
+               return ERR_PTR(ret);
        }
 
+       iovec = s->fast_iov;
        if (req->flags & REQ_F_BUFFER_SELECT) {
-               ret = io_iov_buffer_select(req, *iovec, needs_lock);
+               ret = io_iov_buffer_select(req, iovec, issue_flags);
                if (!ret)
-                       iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
-               *iovec = NULL;
-               return ret;
+                       iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
+               return ERR_PTR(ret);
        }
 
-       return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+       ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
                              req->ctx->compat);
+       if (unlikely(ret < 0))
+               return ERR_PTR(ret);
+       return iovec;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+                                 struct iovec **iovec, struct io_rw_state *s,
+                                 unsigned int issue_flags)
+{
+       *iovec = __io_import_iovec(rw, req, s, issue_flags);
+       if (unlikely(IS_ERR(*iovec)))
+               return PTR_ERR(*iovec);
+
+       iov_iter_save_state(&s->iter, &s->iter_state);
+       return 0;
 }
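
__io_import_iovec() now reports failure through the returned pointer itself via ERR_PTR()/ERR_CAST(), and the BUILD_BUG_ON records the assumption that ERR_PTR(0) is NULL. A self-contained sketch of that errno-in-pointer convention, with simplified stand-ins for the kernel helpers:

#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* encode a small negative errno value in the pointer itself */
static void *err_ptr(long err)
{
        return (void *)(intptr_t)err;
}

static long ptr_err(const void *p)
{
        return (long)(intptr_t)p;
}

/* the topmost page of the address space is reserved for error codes */
static int is_err(const void *p)
{
        return (uintptr_t)p >= (uintptr_t)-4095;
}

static void *lookup(int fail)
{
        static int object = 7;

        if (fail)
                return err_ptr(-EINVAL);        /* error travels in the pointer */
        return &object;
}

int main(void)
{
        void *ok = lookup(0);
        void *bad = lookup(1);

        assert(!is_err(ok) && *(int *)ok == 7);
        assert(is_err(bad) && ptr_err(bad) == -EINVAL);
        assert(err_ptr(0) == NULL);     /* the property the BUILD_BUG_ON checks */
        printf("bad=%ld\n", ptr_err(bad));
        return 0;
}
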
 
 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
@@ -3233,7 +3225,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
         */
        if (kiocb->ki_flags & IOCB_HIPRI)
                return -EOPNOTSUPP;
-       if (kiocb->ki_flags & IOCB_NOWAIT)
+       if ((kiocb->ki_flags & IOCB_NOWAIT) &&
+           !(kiocb->ki_filp->f_flags & O_NONBLOCK))
                return -EAGAIN;
 
        while (iov_iter_count(iter)) {
@@ -3279,7 +3272,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 {
        struct io_async_rw *rw = req->async_data;
 
-       memcpy(&rw->iter, iter, sizeof(*iter));
+       memcpy(&rw->s.iter, iter, sizeof(*iter));
        rw->free_iovec = iovec;
        rw->bytes_done = 0;
        /* can only be fixed buffers, no need to do anything */
@@ -3288,33 +3281,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
        if (!iovec) {
                unsigned iov_off = 0;
 
-               rw->iter.iov = rw->fast_iov;
+               rw->s.iter.iov = rw->s.fast_iov;
                if (iter->iov != fast_iov) {
                        iov_off = iter->iov - fast_iov;
-                       rw->iter.iov += iov_off;
+                       rw->s.iter.iov += iov_off;
                }
-               if (rw->fast_iov != fast_iov)
-                       memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
+               if (rw->s.fast_iov != fast_iov)
+                       memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
                               sizeof(struct iovec) * iter->nr_segs);
        } else {
                req->flags |= REQ_F_NEED_CLEANUP;
        }
 }
 
-static inline int io_alloc_async_data(struct io_kiocb *req)
+static inline bool io_alloc_async_data(struct io_kiocb *req)
 {
        WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
        req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
-       return req->async_data == NULL;
+       if (req->async_data) {
+               req->flags |= REQ_F_ASYNC_DATA;
+               return false;
+       }
+       return true;
 }
 
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
-                            const struct iovec *fast_iov,
-                            struct iov_iter *iter, bool force)
+                            struct io_rw_state *s, bool force)
 {
        if (!force && !io_op_defs[req->opcode].needs_async_setup)
                return 0;
-       if (!req->async_data) {
+       if (!req_has_async_data(req)) {
                struct io_async_rw *iorw;
 
                if (io_alloc_async_data(req)) {
@@ -3322,10 +3318,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                        return -ENOMEM;
                }
 
-               io_req_map_rw(req, iovec, fast_iov, iter);
+               io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
                iorw = req->async_data;
                /* we've copied and mapped the iter, ensure state is saved */
-               iov_iter_save_state(&iorw->iter, &iorw->iter_state);
+               iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
        }
        return 0;
 }
@@ -3333,10 +3329,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 {
        struct io_async_rw *iorw = req->async_data;
-       struct iovec *iov = iorw->fast_iov;
+       struct iovec *iov;
        int ret;
 
-       ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
+       /* submission path, ->uring_lock should already be taken */
+       ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
        if (unlikely(ret < 0))
                return ret;
 
@@ -3344,7 +3341,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
        iorw->free_iovec = iov;
        if (iov)
                req->flags |= REQ_F_NEED_CLEANUP;
-       iov_iter_save_state(&iorw->iter, &iorw->iter_state);
        return 0;
 }
 
@@ -3352,11 +3348,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        if (unlikely(!(req->file->f_mode & FMODE_READ)))
                return -EBADF;
-       return io_prep_rw(req, sqe, READ);
+       return io_prep_rw(req, sqe);
 }
 
 /*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through __folio_lock_async()
  * when we initially tried to do the IO with the iocb and armed our waitqueue.
  * This gets called when the page is unlocked, and we generally expect that to
  * happen when the page IO is completed and the page is now uptodate. This will
@@ -3428,7 +3424,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 
 static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 {
-       if (req->file->f_op->read_iter)
+       if (likely(req->file->f_op->read_iter))
                return call_read_iter(req->file, &req->rw.kiocb, iter);
        else if (req->file->f_op->read)
                return loop_rw_iter(READ, req, iter);
@@ -3444,43 +3440,40 @@ static bool need_read_all(struct io_kiocb *req)
 
 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
-       struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+       struct io_rw_state __s, *s = &__s;
+       struct iovec *iovec;
        struct kiocb *kiocb = &req->rw.kiocb;
-       struct iov_iter __iter, *iter = &__iter;
-       struct io_async_rw *rw = req->async_data;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-       struct iov_iter_state __state, *state;
+       struct io_async_rw *rw;
        ssize_t ret, ret2;
 
-       if (rw) {
-               iter = &rw->iter;
-               state = &rw->iter_state;
+       if (!req_has_async_data(req)) {
+               ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+               if (unlikely(ret < 0))
+                       return ret;
+       } else {
+               rw = req->async_data;
+               s = &rw->s;
                /*
                 * We come here from an earlier attempt, restore our state to
                 * match in case it doesn't. It's cheap enough that we don't
                 * need to make this conditional.
                 */
-               iov_iter_restore(iter, state);
+               iov_iter_restore(&s->iter, &s->iter_state);
                iovec = NULL;
-       } else {
-               ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
-               if (ret < 0)
-                       return ret;
-               state = &__state;
-               iov_iter_save_state(iter, state);
        }
-       req->result = iov_iter_count(iter);
+       req->result = iov_iter_count(&s->iter);
 
-       /* Ensure we clear previously set non-block flag */
-       if (!force_nonblock)
-               kiocb->ki_flags &= ~IOCB_NOWAIT;
-       else
+       if (force_nonblock) {
+               /* If the file doesn't support async, just async punt */
+               if (unlikely(!io_file_supports_nowait(req))) {
+                       ret = io_setup_async_rw(req, iovec, s, true);
+                       return ret ?: -EAGAIN;
+               }
                kiocb->ki_flags |= IOCB_NOWAIT;
-
-       /* If the file doesn't support async, just async punt */
-       if (force_nonblock && !io_file_supports_nowait(req, READ)) {
-               ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-               return ret ?: -EAGAIN;
+       } else {
+               /* Ensure we clear previously set non-block flag */
+               kiocb->ki_flags &= ~IOCB_NOWAIT;
        }
 
        ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
@@ -3489,7 +3482,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                return ret;
        }
 
-       ret = io_iter_do_read(req, iter);
+       ret = io_iter_do_read(req, &s->iter);
 
        if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                req->flags &= ~REQ_F_REISSUE;
@@ -3502,7 +3495,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                ret = 0;
        } else if (ret == -EIOCBQUEUED) {
                goto out_free;
-       } else if (ret <= 0 || ret == req->result || !force_nonblock ||
+       } else if (ret == req->result || ret <= 0 || !force_nonblock ||
                   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
                /* read all, failed, already did sync or don't want to retry */
                goto done;
@@ -3513,22 +3506,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
         * untouched in case of error. Restore it and we'll advance it
         * manually if we need to.
         */
-       iov_iter_restore(iter, state);
+       iov_iter_restore(&s->iter, &s->iter_state);
 
-       ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+       ret2 = io_setup_async_rw(req, iovec, s, true);
        if (ret2)
                return ret2;
 
        iovec = NULL;
        rw = req->async_data;
+       s = &rw->s;
        /*
         * Now use our persistent iterator and state, if we aren't already.
         * We've restored and mapped the iter to match.
         */
-       if (iter != &rw->iter) {
-               iter = &rw->iter;
-               state = &rw->iter_state;
-       }
 
        do {
                /*
@@ -3536,11 +3526,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                 * above or inside this loop. Advance the iter by the bytes
                 * that were consumed.
                 */
-               iov_iter_advance(iter, ret);
-               if (!iov_iter_count(iter))
+               iov_iter_advance(&s->iter, ret);
+               if (!iov_iter_count(&s->iter))
                        break;
                rw->bytes_done += ret;
-               iov_iter_save_state(iter, state);
+               iov_iter_save_state(&s->iter, &s->iter_state);
 
                /* if we can retry, do so with the callbacks armed */
                if (!io_rw_should_retry(req)) {
@@ -3554,12 +3544,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                 * desired page gets unlocked. We can also get a partial read
                 * here, and if we do, then just retry at the new offset.
                 */
-               ret = io_iter_do_read(req, iter);
+               ret = io_iter_do_read(req, &s->iter);
                if (ret == -EIOCBQUEUED)
                        return 0;
                /* we got some bytes, but not all. retry. */
                kiocb->ki_flags &= ~IOCB_WAITQ;
-               iov_iter_restore(iter, state);
+               iov_iter_restore(&s->iter, &s->iter_state);
        } while (ret > 0);
 done:
        kiocb_done(kiocb, ret, issue_flags);
@@ -3574,47 +3564,46 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
                return -EBADF;
-       return io_prep_rw(req, sqe, WRITE);
+       req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
+       return io_prep_rw(req, sqe);
 }
 
 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 {
-       struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+       struct io_rw_state __s, *s = &__s;
+       struct iovec *iovec;
        struct kiocb *kiocb = &req->rw.kiocb;
-       struct iov_iter __iter, *iter = &__iter;
-       struct io_async_rw *rw = req->async_data;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-       struct iov_iter_state __state, *state;
        ssize_t ret, ret2;
 
-       if (rw) {
-               iter = &rw->iter;
-               state = &rw->iter_state;
-               iov_iter_restore(iter, state);
+       if (!req_has_async_data(req)) {
+               ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+               if (unlikely(ret < 0))
+                       return ret;
+       } else {
+               struct io_async_rw *rw = req->async_data;
+
+               s = &rw->s;
+               iov_iter_restore(&s->iter, &s->iter_state);
                iovec = NULL;
-       } else {
-               ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
-               if (ret < 0)
-                       return ret;
-               state = &__state;
-               iov_iter_save_state(iter, state);
        }
-       req->result = iov_iter_count(iter);
+       req->result = iov_iter_count(&s->iter);
 
-       /* Ensure we clear previously set non-block flag */
-       if (!force_nonblock)
-               kiocb->ki_flags &= ~IOCB_NOWAIT;
-       else
-               kiocb->ki_flags |= IOCB_NOWAIT;
+       if (force_nonblock) {
+               /* If the file doesn't support async, just async punt */
+               if (unlikely(!io_file_supports_nowait(req)))
+                       goto copy_iov;
 
-       /* If the file doesn't support async, just async punt */
-       if (force_nonblock && !io_file_supports_nowait(req, WRITE))
-               goto copy_iov;
+               /* file path doesn't support NOWAIT for non-direct_IO */
+               if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+                   (req->flags & REQ_F_ISREG))
+                       goto copy_iov;
 
-       /* file path doesn't support NOWAIT for non-direct_IO */
-       if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
-           (req->flags & REQ_F_ISREG))
-               goto copy_iov;
+               kiocb->ki_flags |= IOCB_NOWAIT;
+       } else {
+               /* Ensure we clear previously set non-block flag */
+               kiocb->ki_flags &= ~IOCB_NOWAIT;
+       }
 
        ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
        if (unlikely(ret))
@@ -3634,10 +3623,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
        }
        kiocb->ki_flags |= IOCB_WRITE;
 
-       if (req->file->f_op->write_iter)
-               ret2 = call_write_iter(req->file, kiocb, iter);
+       if (likely(req->file->f_op->write_iter))
+               ret2 = call_write_iter(req->file, kiocb, &s->iter);
        else if (req->file->f_op->write)
-               ret2 = loop_rw_iter(WRITE, req, iter);
+               ret2 = loop_rw_iter(WRITE, req, &s->iter);
        else
                ret2 = -EINVAL;
 
@@ -3657,14 +3646,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
                goto done;
        if (!force_nonblock || ret2 != -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
-               if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+               if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
                        goto copy_iov;
 done:
                kiocb_done(kiocb, ret2, issue_flags);
        } else {
 copy_iov:
-               iov_iter_restore(iter, state);
-               ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+               iov_iter_restore(&s->iter, &s->iter_state);
+               ret = io_setup_async_rw(req, iovec, s, false);
                return ret ?: -EAGAIN;
        }
 out_free:
@@ -3800,7 +3789,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
        return 0;
 }
 
-static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_mkdir *mkd = &req->mkdir;
        int ret;
@@ -3849,7 +3838,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
        return 0;
 }
 
-static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_symlink *sl = &req->symlink;
        int ret;
@@ -3899,7 +3888,7 @@ static int io_linkat_prep(struct io_kiocb *req,
        return 0;
 }
 
-static int io_linkat(struct io_kiocb *req, int issue_flags)
+static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_hardlink *lnk = &req->hardlink;
        int ret;
@@ -4318,9 +4307,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer *head;
        int ret = 0;
-       bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
-       io_ring_submit_lock(ctx, !force_nonblock);
+       io_ring_submit_lock(ctx, needs_lock);
 
        lockdep_assert_held(&ctx->uring_lock);
 
@@ -4333,7 +4322,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 
        /* complete before unlock, IOPOLL may need the lock */
        __io_req_complete(req, issue_flags, ret, 0);
-       io_ring_submit_unlock(ctx, !force_nonblock);
+       io_ring_submit_unlock(ctx, needs_lock);
        return 0;
 }
 
@@ -4405,9 +4394,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer *head, *list;
        int ret = 0;
-       bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
-       io_ring_submit_lock(ctx, !force_nonblock);
+       io_ring_submit_lock(ctx, needs_lock);
 
        lockdep_assert_held(&ctx->uring_lock);
 
@@ -4423,7 +4412,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
                req_set_fail(req);
        /* complete before unlock, IOPOLL may need the lock */
        __io_req_complete(req, issue_flags, ret, 0);
-       io_ring_submit_unlock(ctx, !force_nonblock);
+       io_ring_submit_unlock(ctx, needs_lock);
        return 0;
 }
 
@@ -4756,8 +4745,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
        if (unlikely(!sock))
                return -ENOTSOCK;
 
-       kmsg = req->async_data;
-       if (!kmsg) {
+       if (req_has_async_data(req)) {
+               kmsg = req->async_data;
+       } else {
                ret = io_sendmsg_copy_hdr(req, &iomsg);
                if (ret)
                        return ret;
@@ -4916,23 +4906,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 }
 
 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
-                                              bool needs_lock)
+                                              unsigned int issue_flags)
 {
        struct io_sr_msg *sr = &req->sr_msg;
-       struct io_buffer *kbuf;
-
-       kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
-       if (IS_ERR(kbuf))
-               return kbuf;
 
-       sr->kbuf = kbuf;
-       req->flags |= REQ_F_BUFFER_SELECTED;
-       return kbuf;
+       return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
 }
 
 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
 {
-       return io_put_kbuf(req, req->sr_msg.kbuf);
+       return io_put_kbuf(req, req->kbuf);
 }
 
 static int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -4980,8 +4963,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
        if (unlikely(!sock))
                return -ENOTSOCK;
 
-       kmsg = req->async_data;
-       if (!kmsg) {
+       if (req_has_async_data(req)) {
+               kmsg = req->async_data;
+       } else {
                ret = io_recvmsg_copy_hdr(req, &iomsg);
                if (ret)
                        return ret;
@@ -4989,7 +4973,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
        }
 
        if (req->flags & REQ_F_BUFFER_SELECT) {
-               kbuf = io_recv_buffer_select(req, !force_nonblock);
+               kbuf = io_recv_buffer_select(req, issue_flags);
                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -5041,7 +5025,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
                return -ENOTSOCK;
 
        if (req->flags & REQ_F_BUFFER_SELECT) {
-               kbuf = io_recv_buffer_select(req, !force_nonblock);
+               kbuf = io_recv_buffer_select(req, issue_flags);
                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                buf = u64_to_user_ptr(kbuf->addr);
@@ -5172,7 +5156,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
        int ret;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
-       if (req->async_data) {
+       if (req_has_async_data(req)) {
                io = req->async_data;
        } else {
                ret = move_addr_to_kernel(req->connect.addr,
@@ -5188,7 +5172,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
        ret = __sys_connect_file(req->file, &io->address,
                                        req->connect.addr_len, file_flags);
        if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
-               if (req->async_data)
+               if (req_has_async_data(req))
                        return -EAGAIN;
                if (io_alloc_async_data(req)) {
                        ret = -ENOMEM;
@@ -5348,16 +5332,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
        return !(flags & IORING_CQE_F_MORE);
 }
 
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
-       __must_hold(&req->ctx->completion_lock)
-{
-       bool done;
-
-       done = __io_poll_complete(req, mask);
-       io_commit_cqring(req->ctx);
-       return done;
-}
-
 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
        struct io_ring_ctx *ctx = req->ctx;
@@ -5479,7 +5453,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
                req_ref_get(req);
                poll->wait.private = req;
+
                *poll_ptr = poll;
+               if (req->opcode == IORING_OP_POLL_ADD)
+                       req->flags |= REQ_F_ASYNC_DATA;
        }
 
        pt->nr_entries++;
@@ -5603,17 +5580,13 @@ static int io_arm_poll_handler(struct io_kiocb *req)
        struct async_poll *apoll;
        struct io_poll_table ipt;
        __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
-       int rw;
 
-       if (!req->file || !file_can_poll(req->file))
-               return IO_APOLL_ABORTED;
-       if (req->flags & REQ_F_POLLED)
-               return IO_APOLL_ABORTED;
        if (!def->pollin && !def->pollout)
                return IO_APOLL_ABORTED;
+       if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+               return IO_APOLL_ABORTED;
 
        if (def->pollin) {
-               rw = READ;
                mask |= POLLIN | POLLRDNORM;
 
                /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
@@ -5621,14 +5594,9 @@ static int io_arm_poll_handler(struct io_kiocb *req)
                    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
                        mask &= ~POLLIN;
        } else {
-               rw = WRITE;
                mask |= POLLOUT | POLLWRNORM;
        }
 
-       /* if we can't nonblock try, then no point in arming a poll handler */
-       if (!io_file_supports_nowait(req, rw))
-               return IO_APOLL_ABORTED;
-
        apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
        if (unlikely(!apoll))
                return IO_APOLL_ABORTED;
@@ -5689,8 +5657,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 /*
  * Returns true if we found and killed one or more poll requests
  */
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
-                              bool cancel_all)
+static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
+                                     struct task_struct *tsk, bool cancel_all)
 {
        struct hlist_node *tmp;
        struct io_kiocb *req;
@@ -5844,7 +5812,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
        if (mask) { /* no async, we'd stolen it */
                ipt.error = 0;
-               done = io_poll_complete(req, mask);
+               done = __io_poll_complete(req, mask);
+               io_commit_cqring(req->ctx);
        }
        spin_unlock(&ctx->completion_lock);
 
@@ -5920,7 +5889,10 @@ err:
 
 static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
 {
-       req_set_fail(req);
+       struct io_timeout_data *data = req->async_data;
+
+       if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
+               req_set_fail(req);
        io_req_complete_post(req, -ETIME, 0);
 }
 
@@ -6126,7 +6098,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        if (off && is_timeout_link)
                return -EINVAL;
        flags = READ_ONCE(sqe->timeout_flags);
-       if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+       if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
+                     IORING_TIMEOUT_ETIME_SUCCESS))
                return -EINVAL;
        /* more than one clock specified is invalid, obviously */
        if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -6137,7 +6110,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        if (unlikely(off && !req->ctx->off_timeout_used))
                req->ctx->off_timeout_used = true;
 
-       if (!req->async_data && io_alloc_async_data(req))
+       if (WARN_ON_ONCE(req_has_async_data(req)))
+               return -EFAULT;
+       if (io_alloc_async_data(req))
                return -ENOMEM;
 
        data = req->async_data;
@@ -6294,6 +6269,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_ring_ctx *ctx = req->ctx;
        u64 sqe_addr = req->cancel.addr;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
        struct io_tctx_node *node;
        int ret;
 
@@ -6302,7 +6278,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
                goto done;
 
        /* slow path, try all io-wq's */
-       io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_lock(ctx, needs_lock);
        ret = -ENOENT;
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *tctx = node->task->io_uring;
@@ -6311,7 +6287,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
                if (ret != -ENOENT)
                        break;
        }
-       io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_unlock(ctx, needs_lock);
 done:
        if (ret < 0)
                req_set_fail(req);
@@ -6338,6 +6314,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_ring_ctx *ctx = req->ctx;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
        struct io_uring_rsrc_update2 up;
        int ret;
 
@@ -6347,10 +6324,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
        up.tags = 0;
        up.resv = 0;
 
-       io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_lock(ctx, needs_lock);
        ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
                                        &up, req->rsrc_update.nr_args);
-       io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_unlock(ctx, needs_lock);
 
        if (ret < 0)
                req_set_fail(req);
@@ -6446,7 +6423,7 @@ static int io_req_prep_async(struct io_kiocb *req)
 {
        if (!io_op_defs[req->opcode].needs_async_setup)
                return 0;
-       if (WARN_ON_ONCE(req->async_data))
+       if (WARN_ON_ONCE(req_has_async_data(req)))
                return -EFAULT;
        if (io_alloc_async_data(req))
                return -EAGAIN;
@@ -6478,68 +6455,39 @@ static u32 io_get_sequence(struct io_kiocb *req)
        return seq;
 }
 
-static bool io_drain_req(struct io_kiocb *req)
+static __cold void io_drain_req(struct io_kiocb *req)
 {
-       struct io_kiocb *pos;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_defer_entry *de;
        int ret;
-       u32 seq;
-
-       if (req->flags & REQ_F_FAIL) {
-               io_req_complete_fail_submit(req);
-               return true;
-       }
-
-       /*
-        * If we need to drain a request in the middle of a link, drain the
-        * head request and the next request/link after the current link.
-        * Considering sequential execution of links, IOSQE_IO_DRAIN will be
-        * maintained for every request of our link.
-        */
-       if (ctx->drain_next) {
-               req->flags |= REQ_F_IO_DRAIN;
-               ctx->drain_next = false;
-       }
-       /* not interested in head, start from the first linked */
-       io_for_each_link(pos, req->link) {
-               if (pos->flags & REQ_F_IO_DRAIN) {
-                       ctx->drain_next = true;
-                       req->flags |= REQ_F_IO_DRAIN;
-                       break;
-               }
-       }
+       u32 seq = io_get_sequence(req);
 
        /* Still need defer if there is pending req in defer list. */
-       if (likely(list_empty_careful(&ctx->defer_list) &&
-               !(req->flags & REQ_F_IO_DRAIN))) {
+       if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
+queue:
                ctx->drain_active = false;
-               return false;
+               io_req_task_queue(req);
+               return;
        }
 
-       seq = io_get_sequence(req);
-       /* Still a chance to pass the sequence check */
-       if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
-               return false;
-
        ret = io_req_prep_async(req);
-       if (ret)
-               goto fail;
+       if (ret) {
+fail:
+               io_req_complete_failed(req, ret);
+               return;
+       }
        io_prep_async_link(req);
        de = kmalloc(sizeof(*de), GFP_KERNEL);
        if (!de) {
                ret = -ENOMEM;
-fail:
-               io_req_complete_failed(req, ret);
-               return true;
+               goto fail;
        }
 
        spin_lock(&ctx->completion_lock);
        if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
                spin_unlock(&ctx->completion_lock);
                kfree(de);
-               io_queue_async_work(req, NULL);
-               return true;
+               goto queue;
        }
 
        trace_io_uring_defer(ctx, req, req->user_data);
@@ -6547,23 +6495,13 @@ fail:
        de->seq = seq;
        list_add_tail(&de->list, &ctx->defer_list);
        spin_unlock(&ctx->completion_lock);
-       return true;
 }
 
 static void io_clean_op(struct io_kiocb *req)
 {
        if (req->flags & REQ_F_BUFFER_SELECTED) {
-               switch (req->opcode) {
-               case IORING_OP_READV:
-               case IORING_OP_READ_FIXED:
-               case IORING_OP_READ:
-                       kfree((void *)(unsigned long)req->rw.addr);
-                       break;
-               case IORING_OP_RECVMSG:
-               case IORING_OP_RECV:
-                       kfree(req->sr_msg.kbuf);
-                       break;
-               }
+               kfree(req->kbuf);
+               req->kbuf = NULL;
        }
 
        if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6628,17 +6566,19 @@ static void io_clean_op(struct io_kiocb *req)
        }
        if (req->flags & REQ_F_CREDS)
                put_cred(req->creds);
-
+       if (req->flags & REQ_F_ASYNC_DATA) {
+               kfree(req->async_data);
+               req->async_data = NULL;
+       }
        req->flags &= ~IO_REQ_CLEAN_FLAGS;
 }
 
 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 {
-       struct io_ring_ctx *ctx = req->ctx;
        const struct cred *creds = NULL;
        int ret;
 
-       if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+       if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
                creds = override_creds(req->creds);
 
        switch (req->opcode) {
@@ -6761,8 +6701,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
        if (ret)
                return ret;
        /* If the op doesn't have a file, we're not polling for it */
-       if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
-               io_iopoll_req_issued(req);
+       if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+               io_iopoll_req_issued(req, issue_flags);
 
        return 0;
 }
@@ -6778,6 +6718,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 static void io_wq_submit_work(struct io_wq_work *work)
 {
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+       unsigned int issue_flags = IO_URING_F_UNLOCKED;
+       bool needs_poll = false;
        struct io_kiocb *timeout;
        int ret = 0;
 
@@ -6792,23 +6734,42 @@ static void io_wq_submit_work(struct io_wq_work *work)
                io_queue_linked_timeout(timeout);
 
        /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
-       if (work->flags & IO_WQ_WORK_CANCEL)
-               ret = -ECANCELED;
+       if (work->flags & IO_WQ_WORK_CANCEL) {
+               io_req_task_queue_fail(req, -ECANCELED);
+               return;
+       }
 
-       if (!ret) {
-               do {
-                       ret = io_issue_sqe(req, 0);
-                       /*
-                        * We can get EAGAIN for polled IO even though we're
-                        * forcing a sync submission from here, since we can't
-                        * wait for request slots on the block side.
-                        */
-                       if (ret != -EAGAIN)
-                               break;
-                       cond_resched();
-               } while (1);
+       if (req->flags & REQ_F_FORCE_ASYNC) {
+               const struct io_op_def *def = &io_op_defs[req->opcode];
+               bool opcode_poll = def->pollin || def->pollout;
+
+               if (opcode_poll && file_can_poll(req->file)) {
+                       needs_poll = true;
+                       issue_flags |= IO_URING_F_NONBLOCK;
+               }
        }
 
+       do {
+               ret = io_issue_sqe(req, issue_flags);
+               if (ret != -EAGAIN)
+                       break;
+               /*
+                * We can get EAGAIN for iopolled IO even though we're
+                * forcing a sync submission from here, since we can't
+                * wait for request slots on the block side.
+                */
+               if (!needs_poll) {
+                       cond_resched();
+                       continue;
+               }
+
+               if (io_arm_poll_handler(req) == IO_APOLL_OK)
+                       return;
+               /* aborted or ready, in either case retry blocking */
+               needs_poll = false;
+               issue_flags &= ~IO_URING_F_NONBLOCK;
+       } while (1);
+
        /* avoid locking problems by failing it from a clean context */
        if (ret)
                io_req_task_queue_fail(req, ret);
@@ -6832,12 +6793,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
 {
        unsigned long file_ptr = (unsigned long) file;
 
-       if (__io_file_supports_nowait(file, READ))
-               file_ptr |= FFS_ASYNC_READ;
-       if (__io_file_supports_nowait(file, WRITE))
-               file_ptr |= FFS_ASYNC_WRITE;
-       if (S_ISREG(file_inode(file)->i_mode))
-               file_ptr |= FFS_ISREG;
+       file_ptr |= io_file_get_flags(file);
        file_slot->file_ptr = file_ptr;
 }
 
@@ -6854,8 +6810,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
        file = (struct file *) (file_ptr & FFS_MASK);
        file_ptr &= ~FFS_MASK;
        /* mask in overlapping REQ_F and FFS bits */
-       req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
-       io_req_set_rsrc_node(req);
+       req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+       io_req_set_rsrc_node(req, ctx);
        return file;
 }
 
@@ -6947,67 +6903,66 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
        io_put_req(req);
 }
 
-static void __io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+       __must_hold(&req->ctx->uring_lock)
+{
+       struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+
+       switch (io_arm_poll_handler(req)) {
+       case IO_APOLL_READY:
+               if (linked_timeout) {
+                       io_queue_linked_timeout(linked_timeout);
+                       linked_timeout = NULL;
+               }
+               io_req_task_queue(req);
+               break;
+       case IO_APOLL_ABORTED:
+               /*
+                * Queued up for async execution, worker will release
+                * submit reference when the iocb is actually submitted.
+                */
+               io_queue_async_work(req, NULL);
+               break;
+       }
+
+       if (linked_timeout)
+               io_queue_linked_timeout(linked_timeout);
+}
+
+static inline void __io_queue_sqe(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
 {
        struct io_kiocb *linked_timeout;
        int ret;
 
-issue_sqe:
        ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 
+       if (req->flags & REQ_F_COMPLETE_INLINE) {
+               io_req_add_compl_list(req);
+               return;
+       }
        /*
         * We async punt it if the file wasn't marked NOWAIT, or if the file
         * doesn't support non-blocking read/write attempts
         */
        if (likely(!ret)) {
-               if (req->flags & REQ_F_COMPLETE_INLINE) {
-                       struct io_ring_ctx *ctx = req->ctx;
-                       struct io_submit_state *state = &ctx->submit_state;
-
-                       state->compl_reqs[state->compl_nr++] = req;
-                       if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
-                               io_submit_flush_completions(ctx);
-                       return;
-               }
-
                linked_timeout = io_prep_linked_timeout(req);
                if (linked_timeout)
                        io_queue_linked_timeout(linked_timeout);
        } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
-               linked_timeout = io_prep_linked_timeout(req);
-
-               switch (io_arm_poll_handler(req)) {
-               case IO_APOLL_READY:
-                       if (linked_timeout)
-                               io_queue_linked_timeout(linked_timeout);
-                       goto issue_sqe;
-               case IO_APOLL_ABORTED:
-                       /*
-                        * Queued up for async execution, worker will release
-                        * submit reference when the iocb is actually submitted.
-                        */
-                       io_queue_async_work(req, NULL);
-                       break;
-               }
-
-               if (linked_timeout)
-                       io_queue_linked_timeout(linked_timeout);
+               io_queue_sqe_arm_apoll(req);
        } else {
                io_req_complete_failed(req, ret);
        }
 }
 
-static inline void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_fallback(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
 {
-       if (unlikely(req->ctx->drain_active) && io_drain_req(req))
-               return;
-
-       if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
-               __io_queue_sqe(req);
-       } else if (req->flags & REQ_F_FAIL) {
+       if (req->flags & REQ_F_FAIL) {
                io_req_complete_fail_submit(req);
+       } else if (unlikely(req->ctx->drain_active)) {
+               io_drain_req(req);
        } else {
                int ret = io_req_prep_async(req);
 
@@ -7018,6 +6973,15 @@ static inline void io_queue_sqe(struct io_kiocb *req)
        }
 }
 
+static inline void io_queue_sqe(struct io_kiocb *req)
+       __must_hold(&req->ctx->uring_lock)
+{
+       if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
+               __io_queue_sqe(req);
+       else
+               io_queue_sqe_fallback(req);
+}
+
 /*
  * Check SQE restrictions (opcode and flags).
  *
@@ -7027,9 +6991,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
                                        struct io_kiocb *req,
                                        unsigned int sqe_flags)
 {
-       if (likely(!ctx->restricted))
-               return true;
-
        if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
                return false;
 
@@ -7044,16 +7005,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
        return true;
 }
 
+static void io_init_req_drain(struct io_kiocb *req)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+       struct io_kiocb *head = ctx->submit_state.link.head;
+
+       ctx->drain_active = true;
+       if (head) {
+               /*
+                * If we need to drain a request in the middle of a link, drain
+                * the head request and the next request/link after the current
+                * link. Considering sequential execution of links,
+                * IOSQE_IO_DRAIN will be maintained for every request of our
+                * link.
+                */
+               head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+               ctx->drain_next = true;
+       }
+}
+
 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
                       const struct io_uring_sqe *sqe)
        __must_hold(&ctx->uring_lock)
 {
-       struct io_submit_state *state;
        unsigned int sqe_flags;
-       int personality, ret = 0;
+       int personality;
+       u8 opcode;
 
        /* req is partially pre-initialised, see io_preinit_req() */
-       req->opcode = READ_ONCE(sqe->opcode);
+       req->opcode = opcode = READ_ONCE(sqe->opcode);
        /* same numerical values with corresponding REQ_F_*, safe to copy */
        req->flags = sqe_flags = READ_ONCE(sqe->flags);
        req->user_data = READ_ONCE(sqe->user_data);
@@ -7061,19 +7041,52 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        req->fixed_rsrc_refs = NULL;
        req->task = current;
 
-       /* enforce forwards compatibility on users */
-       if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+       if (unlikely(opcode >= IORING_OP_LAST)) {
+               req->opcode = 0;
                return -EINVAL;
-       if (unlikely(req->opcode >= IORING_OP_LAST))
-               return -EINVAL;
-       if (!io_check_restriction(ctx, req, sqe_flags))
-               return -EACCES;
+       }
+       if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
+               /* enforce forwards compatibility on users */
+               if (sqe_flags & ~SQE_VALID_FLAGS)
+                       return -EINVAL;
+               if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+                   !io_op_defs[opcode].buffer_select)
+                       return -EOPNOTSUPP;
+               if (sqe_flags & IOSQE_IO_DRAIN)
+                       io_init_req_drain(req);
+       }
+       if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+               if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
+                       return -EACCES;
+               /* knock it to the slow queue path, will be drained there */
+               if (ctx->drain_active)
+                       req->flags |= REQ_F_FORCE_ASYNC;
+               /* if there is no link, we're at "next" request and need to drain */
+               if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
+                       ctx->drain_next = false;
+                       ctx->drain_active = true;
+                       req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+               }
+       }
 
-       if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
-           !io_op_defs[req->opcode].buffer_select)
-               return -EOPNOTSUPP;
-       if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
-               ctx->drain_active = true;
+       if (io_op_defs[opcode].needs_file) {
+               struct io_submit_state *state = &ctx->submit_state;
+
+               /*
+                * Plug now if we have more than 2 IO left after this, and the
+                * target is potentially a read/write to block based storage.
+                */
+               if (state->need_plug && io_op_defs[opcode].plug) {
+                       state->plug_started = true;
+                       state->need_plug = false;
+                       blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+               }
+
+               req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
+                                       (sqe_flags & IOSQE_FIXED_FILE));
+               if (unlikely(!req->file))
+                       return -EBADF;
+       }
 
        personality = READ_ONCE(sqe->personality);
        if (personality) {
@@ -7083,27 +7096,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
                get_cred(req->creds);
                req->flags |= REQ_F_CREDS;
        }
-       state = &ctx->submit_state;
-
-       /*
-        * Plug now if we have more than 1 IO left after this, and the target
-        * is potentially a read/write to block based storage.
-        */
-       if (!state->plug_started && state->ios_left > 1 &&
-           io_op_defs[req->opcode].plug) {
-               blk_start_plug(&state->plug);
-               state->plug_started = true;
-       }
-
-       if (io_op_defs[req->opcode].needs_file) {
-               req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
-                                       (sqe_flags & IOSQE_FIXED_FILE));
-               if (unlikely(!req->file))
-                       ret = -EBADF;
-       }
 
-       state->ios_left--;
-       return ret;
+       return io_req_prep(req, sqe);
 }
 
 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -7115,7 +7109,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
        ret = io_init_req(ctx, req, sqe);
        if (unlikely(ret)) {
-fail_req:
+               trace_io_uring_req_failed(sqe, ret);
+
                /* fail even hard links since we don't submit */
                if (link->head) {
                        /*
@@ -7138,10 +7133,6 @@ fail_req:
                        return ret;
                }
                req_fail_link_node(req, ret);
-       } else {
-               ret = io_req_prep(req, sqe);
-               if (unlikely(ret))
-                       goto fail_req;
        }
 
        /* don't need @sqe from now on */
@@ -7171,33 +7162,32 @@ fail_req:
                link->last->link = req;
                link->last = req;
 
+               if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+                       return 0;
                /* last request of a link, enqueue the link */
-               if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
-                       link->head = NULL;
-                       io_queue_sqe(head);
-               }
-       } else {
-               if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
-                       link->head = req;
-                       link->last = req;
-               } else {
-                       io_queue_sqe(req);
-               }
+               link->head = NULL;
+               req = head;
+       } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+               link->head = req;
+               link->last = req;
+               return 0;
        }
 
+       io_queue_sqe(req);
        return 0;
 }
 
 /*
  * Batched submission is done, ensure local IO is flushed out.
  */
-static void io_submit_state_end(struct io_submit_state *state,
-                               struct io_ring_ctx *ctx)
+static void io_submit_state_end(struct io_ring_ctx *ctx)
 {
+       struct io_submit_state *state = &ctx->submit_state;
+
        if (state->link.head)
                io_queue_sqe(state->link.head);
-       if (state->compl_nr)
-               io_submit_flush_completions(ctx);
+       /* flush only after queuing links as they can generate completions */
+       io_submit_flush_completions(ctx);
        if (state->plug_started)
                blk_finish_plug(&state->plug);
 }
@@ -7209,7 +7199,8 @@ static void io_submit_state_start(struct io_submit_state *state,
                                  unsigned int max_ios)
 {
        state->plug_started = false;
-       state->ios_left = max_ios;
+       state->need_plug = max_ios > 2;
+       state->submit_nr = max_ios;
        /* set only head, no need to init link_last in advance */
        state->link.head = NULL;
 }
@@ -7261,45 +7252,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
        __must_hold(&ctx->uring_lock)
 {
+       unsigned int entries = io_sqring_entries(ctx);
        int submitted = 0;
 
+       if (unlikely(!entries))
+               return 0;
        /* make sure SQ entry isn't read before tail */
-       nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
-       if (!percpu_ref_tryget_many(&ctx->refs, nr))
-               return -EAGAIN;
+       nr = min3(nr, ctx->sq_entries, entries);
        io_get_task_refs(nr);
 
        io_submit_state_start(&ctx->submit_state, nr);
-       while (submitted < nr) {
+       do {
                const struct io_uring_sqe *sqe;
                struct io_kiocb *req;
 
-               req = io_alloc_req(ctx);
-               if (unlikely(!req)) {
+               if (unlikely(!io_alloc_req_refill(ctx))) {
                        if (!submitted)
                                submitted = -EAGAIN;
                        break;
                }
+               req = io_alloc_req(ctx);
                sqe = io_get_sqe(ctx);
                if (unlikely(!sqe)) {
-                       list_add(&req->inflight_entry, &ctx->submit_state.free_list);
+                       wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
                        break;
                }
                /* will complete beyond this point, count as submitted */
                submitted++;
                if (io_submit_sqe(ctx, req, sqe))
                        break;
-       }
+       } while (submitted < nr);
 
        if (unlikely(submitted != nr)) {
                int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
                int unused = nr - ref_used;
 
                current->io_uring->cached_refs += unused;
-               percpu_ref_put_many(&ctx->refs, unused);
        }
 
-       io_submit_state_end(&ctx->submit_state, ctx);
+       io_submit_state_end(ctx);
         /* Commit SQ ring head once we've consumed and submitted all SQEs */
        io_commit_sqring(ctx);
 
@@ -7338,16 +7329,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
        if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
                to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
 
-       if (!list_empty(&ctx->iopoll_list) || to_submit) {
-               unsigned nr_events = 0;
+       if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
                const struct cred *creds = NULL;
 
                if (ctx->sq_creds != current_cred())
                        creds = override_creds(ctx->sq_creds);
 
                mutex_lock(&ctx->uring_lock);
-               if (!list_empty(&ctx->iopoll_list))
-                       io_do_iopoll(ctx, &nr_events, 0);
+               if (!wq_list_empty(&ctx->iopoll_list))
+                       io_do_iopoll(ctx, true);
 
                /*
                 * Don't submit if refs are dying, good for io_uring_register(),
@@ -7367,7 +7357,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
        return ret;
 }
 
-static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 {
        struct io_ring_ctx *ctx;
        unsigned sq_thread_idle = 0;
@@ -7424,7 +7414,7 @@ static int io_sq_thread(void *data)
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        int ret = __io_sq_thread(ctx, cap_entries);
 
-                       if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+                       if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
                }
                if (io_run_task_work())
@@ -7445,7 +7435,7 @@ static int io_sq_thread(void *data)
                                io_ring_set_wakeup_flag(ctx);
 
                                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
-                                   !list_empty_careful(&ctx->iopoll_list)) {
+                                   !wq_list_empty(&ctx->iopoll_list)) {
                                        needs_sched = false;
                                        break;
                                }
@@ -7621,7 +7611,7 @@ static void io_free_page_table(void **table, size_t size)
        kfree(table);
 }
 
-static void **io_alloc_page_table(size_t size)
+static __cold void **io_alloc_page_table(size_t size)
 {
        unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
        size_t init_size = size;
@@ -7650,7 +7640,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
        kfree(ref_node);
 }
 
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 {
        struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
        struct io_ring_ctx *ctx = node->rsrc_data->ctx;
@@ -7696,10 +7686,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 
 static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
                                struct io_rsrc_data *data_to_kill)
+       __must_hold(&ctx->uring_lock)
 {
        WARN_ON_ONCE(!ctx->rsrc_backup_node);
        WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
 
+       io_rsrc_refs_drop(ctx);
+
        if (data_to_kill) {
                struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
 
@@ -7727,7 +7720,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
        return ctx->rsrc_backup_node ? 0 : -ENOMEM;
 }
 
-static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
+static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+                                     struct io_ring_ctx *ctx)
 {
        int ret;
 
@@ -7783,9 +7777,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
        kfree(data);
 }
 
-static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
-                             u64 __user *utags, unsigned nr,
-                             struct io_rsrc_data **pdata)
+static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+                                    u64 __user *utags, unsigned nr,
+                                    struct io_rsrc_data **pdata)
 {
        struct io_rsrc_data *data;
        int ret = -ENOMEM;
@@ -8353,12 +8347,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
                                 unsigned int issue_flags, u32 slot_index)
 {
        struct io_ring_ctx *ctx = req->ctx;
-       bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
        bool needs_switch = false;
        struct io_fixed_file *file_slot;
        int ret = -EBADF;
 
-       io_ring_submit_lock(ctx, !force_nonblock);
+       io_ring_submit_lock(ctx, needs_lock);
        if (file->f_op == &io_uring_fops)
                goto err;
        ret = -ENXIO;
@@ -8399,7 +8393,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 err:
        if (needs_switch)
                io_rsrc_node_switch(ctx, ctx->file_data);
-       io_ring_submit_unlock(ctx, !force_nonblock);
+       io_ring_submit_unlock(ctx, needs_lock);
        if (ret)
                fput(file);
        return ret;
@@ -8409,11 +8403,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
 {
        unsigned int offset = req->close.file_slot - 1;
        struct io_ring_ctx *ctx = req->ctx;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
        struct io_fixed_file *file_slot;
        struct file *file;
        int ret, i;
 
-       io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_lock(ctx, needs_lock);
        ret = -ENXIO;
        if (unlikely(!ctx->file_data))
                goto out;
@@ -8439,7 +8434,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
        io_rsrc_node_switch(ctx, ctx->file_data);
        ret = 0;
 out:
-       io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_unlock(ctx, needs_lock);
        return ret;
 }
 
@@ -8555,8 +8550,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
        return io_wq_create(concurrency, &data);
 }
 
-static int io_uring_alloc_task_context(struct task_struct *task,
-                                      struct io_ring_ctx *ctx)
+static __cold int io_uring_alloc_task_context(struct task_struct *task,
+                                             struct io_ring_ctx *ctx)
 {
        struct io_uring_task *tctx;
        int ret;
@@ -8603,8 +8598,8 @@ void __io_uring_free(struct task_struct *tsk)
        tsk->io_uring = NULL;
 }
 
-static int io_sq_offload_create(struct io_ring_ctx *ctx,
-                               struct io_uring_params *p)
+static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+                                      struct io_uring_params *p)
 {
        int ret;
 
@@ -9215,29 +9210,25 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
        }
 }
 
-static void io_req_cache_free(struct list_head *list)
-{
-       struct io_kiocb *req, *nxt;
-
-       list_for_each_entry_safe(req, nxt, list, inflight_entry) {
-               list_del(&req->inflight_entry);
-               kmem_cache_free(req_cachep, req);
-       }
-}
-
 static void io_req_caches_free(struct io_ring_ctx *ctx)
 {
        struct io_submit_state *state = &ctx->submit_state;
+       int nr = 0;
 
        mutex_lock(&ctx->uring_lock);
+       io_flush_cached_locked_reqs(ctx, state);
 
-       if (state->free_reqs) {
-               kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
-               state->free_reqs = 0;
-       }
+       while (state->free_list.next) {
+               struct io_wq_work_node *node;
+               struct io_kiocb *req;
 
-       io_flush_cached_locked_reqs(ctx, state);
-       io_req_cache_free(&state->free_list);
+               node = wq_stack_extract(&state->free_list);
+               req = container_of(node, struct io_kiocb, comp_list);
+               kmem_cache_free(req_cachep, req);
+               nr++;
+       }
+       if (nr)
+               percpu_ref_put_many(&ctx->refs, nr);
        mutex_unlock(&ctx->uring_lock);
 }
 
@@ -9247,7 +9238,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
                wait_for_completion(&data->done);
 }
 
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
        io_sq_thread_finish(ctx);
 
@@ -9256,6 +9247,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
                ctx->mm_account = NULL;
        }
 
+       io_rsrc_refs_drop(ctx);
        /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
        io_wait_rsrc_data(ctx->buf_data);
        io_wait_rsrc_data(ctx->file_data);
@@ -9279,6 +9271,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
        if (ctx->rsrc_backup_node)
                io_rsrc_node_destroy(ctx->rsrc_backup_node);
        flush_delayed_work(&ctx->rsrc_put_work);
+       flush_delayed_work(&ctx->fallback_work);
 
        WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
        WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
@@ -9309,7 +9302,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
        struct io_ring_ctx *ctx = file->private_data;
        __poll_t mask = 0;
 
-       poll_wait(file, &ctx->poll_wait, wait);
+       poll_wait(file, &ctx->cq_wait, wait);
        /*
         * synchronizes with barrier from wq_has_sleeper call in
         * io_commit_cqring
@@ -9356,7 +9349,7 @@ struct io_tctx_exit {
        struct io_ring_ctx              *ctx;
 };
 
-static void io_tctx_exit_cb(struct callback_head *cb)
+static __cold void io_tctx_exit_cb(struct callback_head *cb)
 {
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_exit *work;
@@ -9371,14 +9364,14 @@ static void io_tctx_exit_cb(struct callback_head *cb)
        complete(&work->completion);
 }
 
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
 {
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 
        return req->ctx == data;
 }
 
-static void io_ring_exit_work(struct work_struct *work)
+static __cold void io_ring_exit_work(struct work_struct *work)
 {
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
        unsigned long timeout = jiffies + HZ * 60 * 5;
@@ -9407,6 +9400,8 @@ static void io_ring_exit_work(struct work_struct *work)
                        io_sq_thread_unpark(sqd);
                }
 
+               io_req_caches_free(ctx);
+
                if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
                        /* there is little hope left, don't run it too often */
                        interval = HZ * 60;
@@ -9433,7 +9428,6 @@ static void io_ring_exit_work(struct work_struct *work)
                ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                if (WARN_ON_ONCE(ret))
                        continue;
-               wake_up_process(node->task);
 
                mutex_unlock(&ctx->uring_lock);
                wait_for_completion(&exit.completion);
@@ -9447,8 +9441,8 @@ static void io_ring_exit_work(struct work_struct *work)
 }
 
 /* Returns true if we found and killed one or more timeouts */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
-                            bool cancel_all)
+static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
+                                   struct task_struct *tsk, bool cancel_all)
 {
        struct io_kiocb *req, *tmp;
        int canceled = 0;
@@ -9470,7 +9464,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
        return canceled != 0;
 }
 
-static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
        unsigned long index;
        struct creds *creds;
@@ -9532,8 +9526,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
        return ret;
 }
 
-static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
-                                 struct task_struct *task, bool cancel_all)
+static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
+                                        struct task_struct *task,
+                                        bool cancel_all)
 {
        struct io_defer_entry *de;
        LIST_HEAD(list);
@@ -9558,7 +9553,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
        return true;
 }
 
-static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 {
        struct io_tctx_node *node;
        enum io_wq_cancel cret;
@@ -9582,9 +9577,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
        return ret;
 }
 
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
-                                        struct task_struct *task,
-                                        bool cancel_all)
+static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+                                               struct task_struct *task,
+                                               bool cancel_all)
 {
        struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
        struct io_uring_task *tctx = task ? task->io_uring : NULL;
@@ -9608,7 +9603,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                /* SQPOLL thread does its own polling */
                if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
                    (ctx->sq_data && ctx->sq_data->thread == current)) {
-                       while (!list_empty_careful(&ctx->iopoll_list)) {
+                       while (!wq_list_empty(&ctx->iopoll_list)) {
                                io_iopoll_try_reap_events(ctx);
                                ret = true;
                        }
@@ -9683,7 +9678,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 /*
  * Remove this io_uring_file -> task mapping.
  */
-static void io_uring_del_tctx_node(unsigned long index)
+static __cold void io_uring_del_tctx_node(unsigned long index)
 {
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;
@@ -9706,7 +9701,7 @@ static void io_uring_del_tctx_node(unsigned long index)
        kfree(node);
 }
 
-static void io_uring_clean_tctx(struct io_uring_task *tctx)
+static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
 {
        struct io_wq *wq = tctx->io_wq;
        struct io_tctx_node *node;
@@ -9733,7 +9728,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
        return percpu_counter_sum(&tctx->inflight);
 }
 
-static void io_uring_drop_tctx_refs(struct task_struct *task)
+static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 {
        struct io_uring_task *tctx = task->io_uring;
        unsigned int refs = tctx->cached_refs;
@@ -9749,7 +9744,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
  * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be non-NULL IFF it's an SQPOLL thread cancellation.
  */
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+static __cold void io_uring_cancel_generic(bool cancel_all,
+                                          struct io_sq_data *sqd)
 {
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx;
@@ -9842,7 +9838,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
 
 #ifdef CONFIG_MMU
 
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
        size_t sz = vma->vm_end - vma->vm_start;
        unsigned long pfn;
@@ -10027,7 +10023,7 @@ out_fput:
 }
 
 #ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
                const struct cred *cred)
 {
        struct user_namespace *uns = seq_user_ns(m);
@@ -10059,11 +10055,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
        return 0;
 }
 
-static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+                                         struct seq_file *m)
 {
        struct io_sq_data *sq = NULL;
+       struct io_overflow_cqe *ocqe;
+       struct io_rings *r = ctx->rings;
+       unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+       unsigned int sq_head = READ_ONCE(r->sq.head);
+       unsigned int sq_tail = READ_ONCE(r->sq.tail);
+       unsigned int cq_head = READ_ONCE(r->cq.head);
+       unsigned int cq_tail = READ_ONCE(r->cq.tail);
+       unsigned int sq_entries, cq_entries;
        bool has_lock;
-       int i;
+       unsigned int i;
+
+       /*
+        * We may get imprecise sqe and cqe info if the ring is actively
+        * running: cached_sq_head and cached_cq_tail are read without
+        * uring_lock, and sq_tail and cq_head are updated by userspace.
+        * That's fine, as this information is usually only looked at when
+        * the ring is stuck.
+        */
+       seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
+       seq_printf(m, "SqHead:\t%u\n", sq_head);
+       seq_printf(m, "SqTail:\t%u\n", sq_tail);
+       seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
+       seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
+       seq_printf(m, "CqHead:\t%u\n", cq_head);
+       seq_printf(m, "CqTail:\t%u\n", cq_tail);
+       seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
+       seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+       sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+       for (i = 0; i < sq_entries; i++) {
+               unsigned int entry = i + sq_head;
+               unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+               struct io_uring_sqe *sqe;
+
+               if (sq_idx > sq_mask)
+                       continue;
+               sqe = &ctx->sq_sqes[sq_idx];
+               seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
+                          sq_idx, sqe->opcode, sqe->fd, sqe->flags,
+                          sqe->user_data);
+       }
+       seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
+       cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
+       for (i = 0; i < cq_entries; i++) {
+               unsigned int entry = i + cq_head;
+               struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+
+               seq_printf(m, "%5u: user_data:%llu, res:%d, flags:%x\n",
+                          entry & cq_mask, cqe->user_data, cqe->res,
+                          cqe->flags);
+       }
 
        /*
         * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
@@ -10105,7 +10149,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
                xa_for_each(&ctx->personalities, index, cred)
                        io_uring_show_cred(m, index, cred);
        }
-       seq_printf(m, "PollList:\n");
+       if (has_lock)
+               mutex_unlock(&ctx->uring_lock);
+
+       seq_puts(m, "PollList:\n");
        spin_lock(&ctx->completion_lock);
        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
                struct hlist_head *list = &ctx->cancel_hash[i];
@@ -10115,12 +10162,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
                        seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
                                        req->task->task_works != NULL);
        }
+
+       seq_puts(m, "CqOverflowList:\n");
+       list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+               struct io_uring_cqe *cqe = &ocqe->cqe;
+
+               seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
+                          cqe->user_data, cqe->res, cqe->flags);
+       }
+
        spin_unlock(&ctx->completion_lock);
-       if (has_lock)
-               mutex_unlock(&ctx->uring_lock);
 }
 
-static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 {
        struct io_ring_ctx *ctx = f->private_data;
 
@@ -10144,8 +10199,8 @@ static const struct file_operations io_uring_fops = {
 #endif
 };
 
-static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
-                                 struct io_uring_params *p)
+static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+                                        struct io_uring_params *p)
 {
        struct io_rings *rings;
        size_t size, sq_array_offset;
@@ -10234,8 +10289,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
        return file;
 }
 
-static int io_uring_create(unsigned entries, struct io_uring_params *p,
-                          struct io_uring_params __user *params)
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+                                 struct io_uring_params __user *params)
 {
        struct io_ring_ctx *ctx;
        struct file *file;
@@ -10393,7 +10448,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
        return io_uring_setup(entries, params);
 }
 
-static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+                          unsigned nr_args)
 {
        struct io_uring_probe *p;
        size_t size;
@@ -10449,8 +10505,8 @@ static int io_register_personality(struct io_ring_ctx *ctx)
        return id;
 }
 
-static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
-                                   unsigned int nr_args)
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+                                          void __user *arg, unsigned int nr_args)
 {
        struct io_uring_restriction *res;
        size_t size;
@@ -10584,7 +10640,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
        return __io_register_rsrc_update(ctx, type, &up, up.nr);
 }
 
-static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int size, unsigned int type)
 {
        struct io_uring_rsrc_register rr;
@@ -10610,8 +10666,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
        return -EINVAL;
 }
 
-static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
-                               unsigned len)
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+                                      void __user *arg, unsigned len)
 {
        struct io_uring_task *tctx = current->io_uring;
        cpumask_var_t new_mask;
@@ -10637,7 +10693,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
        return ret;
 }
 
-static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
 {
        struct io_uring_task *tctx = current->io_uring;
 
@@ -10647,8 +10703,8 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
        return io_wq_cpu_affinity(tctx->io_wq, NULL);
 }
 
-static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
-                                       void __user *arg)
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+                                              void __user *arg)
        __must_hold(&ctx->uring_lock)
 {
        struct io_tctx_node *node;
@@ -10753,7 +10809,7 @@ static bool io_register_op_must_quiesce(int op)
        }
 }
 
-static int io_ctx_quiesce(struct io_ring_ctx *ctx)
+static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
 {
        long ret;
 
@@ -10768,10 +10824,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx)
         */
        mutex_unlock(&ctx->uring_lock);
        do {
-               ret = wait_for_completion_interruptible(&ctx->ref_comp);
-               if (!ret)
+               ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
+               if (ret) {
+                       ret = min(0L, ret);
                        break;
+               }
+
                ret = io_run_task_work_sig();
+               io_req_caches_free(ctx);
        } while (ret >= 0);
        mutex_lock(&ctx->uring_lock);
 
@@ -11002,6 +11062,8 @@ static int __init io_uring_init(void)
 
        /* should fit into one byte */
        BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+       BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
+       BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
 
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
        BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
index 4ecd255..811c898 100644 (file)
@@ -38,8 +38,7 @@ struct iomap_dio {
                struct {
                        struct iov_iter         *iter;
                        struct task_struct      *waiter;
-                       struct request_queue    *last_queue;
-                       blk_qc_t                cookie;
+                       struct bio              *poll_bio;
                } submit;
 
                /* used for aio completion: */
@@ -49,29 +48,20 @@ struct iomap_dio {
        };
 };
 
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
-       struct request_queue *q = READ_ONCE(kiocb->private);
-
-       if (!q)
-               return 0;
-       return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
 static void iomap_dio_submit_bio(const struct iomap_iter *iter,
                struct iomap_dio *dio, struct bio *bio, loff_t pos)
 {
        atomic_inc(&dio->ref);
 
-       if (dio->iocb->ki_flags & IOCB_HIPRI)
+       if (dio->iocb->ki_flags & IOCB_HIPRI) {
                bio_set_polled(bio, dio->iocb);
+               dio->submit.poll_bio = bio;
+       }
 
-       dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev);
        if (dio->dops && dio->dops->submit_io)
-               dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
+               dio->dops->submit_io(iter, bio, pos);
        else
-               dio->submit.cookie = submit_bio(bio);
+               submit_bio(bio);
 }
 
 ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -135,7 +125,7 @@ static void iomap_dio_complete_work(struct work_struct *work)
        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
        struct kiocb *iocb = dio->iocb;
 
-       iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+       iocb->ki_complete(iocb, iomap_dio_complete(dio));
 }
 
 /*
@@ -164,9 +154,11 @@ static void iomap_dio_bio_end_io(struct bio *bio)
                } else if (dio->flags & IOMAP_DIO_WRITE) {
                        struct inode *inode = file_inode(dio->iocb->ki_filp);
 
+                       WRITE_ONCE(dio->iocb->private, NULL);
                        INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
                        queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
                } else {
+                       WRITE_ONCE(dio->iocb->private, NULL);
                        iomap_dio_complete_work(&dio->aio.work);
                }
        }
@@ -282,6 +274,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
        if (!iov_iter_count(dio->submit.iter))
                goto out;
 
+       /*
+        * We can only poll for single bio I/Os.
+        */
+       if (need_zeroout ||
+           ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+               dio->iocb->ki_flags &= ~IOCB_HIPRI;
+
        if (need_zeroout) {
                /* zero out from the start of the block to the write offset */
                pad = pos & (fs_block_size - 1);
@@ -339,6 +338,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 
                nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
                                                 BIO_MAX_VECS);
+               /*
+                * We can only poll for single bio I/Os.
+                */
+               if (nr_pages)
+                       dio->iocb->ki_flags &= ~IOCB_HIPRI;
                iomap_dio_submit_bio(iter, dio, bio, pos);
                pos += n;
        } while (nr_pages);
@@ -485,8 +489,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 
        dio->submit.iter = iter;
        dio->submit.waiter = current;
-       dio->submit.cookie = BLK_QC_T_NONE;
-       dio->submit.last_queue = NULL;
+       dio->submit.poll_bio = NULL;
 
        if (iov_iter_rw(iter) == READ) {
                if (iomi.pos >= dio->i_size)
@@ -565,8 +568,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        inode_dio_begin(inode);
 
        blk_start_plug(&plug);
-       while ((ret = iomap_iter(&iomi, ops)) > 0)
+       while ((ret = iomap_iter(&iomi, ops)) > 0) {
                iomi.processed = iomap_dio_iter(&iomi, dio);
+
+               /*
+                * We can only poll for single bio I/Os.
+                */
+               iocb->ki_flags &= ~IOCB_HIPRI;
+       }
+
        blk_finish_plug(&plug);
 
        /*
@@ -592,8 +602,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        if (dio->flags & IOMAP_DIO_WRITE_FUA)
                dio->flags &= ~IOMAP_DIO_NEED_SYNC;
 
-       WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
-       WRITE_ONCE(iocb->private, dio->submit.last_queue);
+       WRITE_ONCE(iocb->private, dio->submit.poll_bio);
 
        /*
         * We are about to drop our additional submission reference, which
@@ -620,10 +629,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                        if (!READ_ONCE(dio->submit.waiter))
                                break;
 
-                       if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                           !dio->submit.last_queue ||
-                           !blk_poll(dio->submit.last_queue,
-                                        dio->submit.cookie, true))
+                       if (!dio->submit.poll_bio ||
+                           !bio_poll(dio->submit.poll_bio, NULL, 0))
                                blk_io_schedule();
                }
                __set_current_state(TASK_RUNNING);
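
With the submission cookie and last_queue gone, the only state iomap keeps for polled direct I/O is the bio itself, stashed in iocb->private and polled through bio_poll(). Filesystems no longer need iomap_dio_iopoll(); they point ->iopoll at the block layer's iocb_bio_iopoll() helper instead, as the xfs and zonefs hunks further down do. A minimal sketch of that wiring, with hypothetical myfs_* names:

    /* Illustrative kernel-context sketch; "myfs" is a hypothetical filesystem. */
    #include <linux/blkdev.h>
    #include <linux/fs.h>

    static const struct file_operations myfs_file_operations = {
        .read_iter      = generic_file_read_iter,
        .write_iter     = generic_file_write_iter,
        /*
         * Polled direct I/O: iocb_bio_iopoll() picks up the bio that
         * __iomap_dio_rw() stored in iocb->private and hands it to bio_poll().
         */
        .iopoll         = iocb_bio_iopoll,
    };
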
index 176580f..104ae69 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mempool.h>
 #include <linux/seq_file.h>
+#include <linux/writeback.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_filsys.h"
index bde787c..8b9a72a 100644 (file)
@@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
                goto out;
        }
 
-       VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits;
-
+       VolumeSize = sb_bdev_nr_blocks(sb);
        if (VolumeSize) {
                if (newLVSize > VolumeSize) {
                        printk(KERN_WARNING "jfs_extendfs: invalid size\n");
@@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
        txQuiesce(sb);
 
        /* Reset size of direct inode */
-       sbi->direct_inode->i_size =  i_size_read(sb->s_bdev->bd_inode);
+       sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev);
 
        if (sbi->mntflag & JFS_INLINELOG) {
                /*
index 9030aea..24cbc99 100644 (file)
@@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
                }
                case Opt_resize_nosize:
                {
-                       *newLVSize = i_size_read(sb->s_bdev->bd_inode) >>
-                               sb->s_blocksize_bits;
+                       *newLVSize = sb_bdev_nr_blocks(sb);
                        if (*newLVSize == 0)
                                pr_err("JFS: Cannot determine volume size\n");
                        break;
@@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
                ret = -ENOMEM;
                goto out_unload;
        }
-       inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+       inode->i_size = bdev_nr_bytes(sb->s_bdev);
        inode->i_mapping->a_ops = &jfs_metapage_aops;
        inode_fake_hash(inode);
        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
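
The jfs conversions above follow a pattern repeated across this series (nilfs, ntfs, reiserfs, squashfs, udf and others): open-coded reads of the block device inode size are replaced by bdev_nr_bytes() and sb_bdev_nr_blocks(). Judging only from the expressions they replace, the helpers are equivalent to the sketch below; the real definitions live in the block headers and are not shown in this excerpt.

    /*
     * Illustrative kernel-context sketch of the helper semantics implied by
     * the conversions in this series; the sketch_* names are not real kernel
     * symbols.
     */
    #include <linux/blkdev.h>
    #include <linux/fs.h>

    static inline loff_t sketch_bdev_nr_bytes(struct block_device *bdev)
    {
        /* replaces open-coded i_size_read(bdev->bd_inode) */
        return i_size_read(bdev->bd_inode);
    }

    static inline u64 sketch_sb_bdev_nr_blocks(struct super_block *sb)
    {
        /* replaces i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits */
        return sketch_bdev_nr_bytes(sb->s_bdev) >> sb->s_blocksize_bits;
    }
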
index 3d6fb4a..0fca9d6 100644 (file)
 /*
  *  linux/fs/locks.c
  *
- *  Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
- *  Doug Evans (dje@spiff.uucp), August 07, 1992
+ * We implement four types of file locks: BSD locks, posix locks, open
+ * file description locks, and leases.  For details about BSD locks,
+ * see the flock(2) man page; for details about the other three, see
+ * fcntl(2).
  *
- *  Deadlock detection added.
- *  FIXME: one thing isn't handled yet:
- *     - mandatory locks (requires lots of changes elsewhere)
- *  Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
- *
- *  Miscellaneous edits, and a total rewrite of posix_lock_file() code.
- *  Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *
- *  Converted file_lock_table to a linked list from an array, which eliminates
- *  the limits on how many active file locks are open.
- *  Chad Page (pageone@netcom.com), November 27, 1994
- *
- *  Removed dependency on file descriptors. dup()'ed file descriptors now
- *  get the same locks as the original file descriptors, and a close() on
- *  any file descriptor removes ALL the locks on the file for the current
- *  process. Since locks still depend on the process id, locks are inherited
- *  after an exec() but not after a fork(). This agrees with POSIX, and both
- *  BSD and SVR4 practice.
- *  Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
- *
- *  Scrapped free list which is redundant now that we allocate locks
- *  dynamically with kmalloc()/kfree().
- *  Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
- *
- *  Implemented two lock personalities - FL_FLOCK and FL_POSIX.
- *
- *  FL_POSIX locks are created with calls to fcntl() and lockf() through the
- *  fcntl() system call. They have the semantics described above.
- *
- *  FL_FLOCK locks are created with calls to flock(), through the flock()
- *  system call, which is new. Old C libraries implement flock() via fcntl()
- *  and will continue to use the old, broken implementation.
- *
- *  FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
- *  with a file pointer (filp). As a result they can be shared by a parent
- *  process and its children after a fork(). They are removed when the last
- *  file descriptor referring to the file pointer is closed (unless explicitly
- *  unlocked).
- *
- *  FL_FLOCK locks never deadlock, an existing lock is always removed before
- *  upgrading from shared to exclusive (or vice versa). When this happens
- *  any processes blocked by the current lock are woken up and allowed to
- *  run before the new lock is applied.
- *  Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
- *
- *  Removed some race conditions in flock_lock_file(), marked other possible
- *  races. Just grep for FIXME to see them.
- *  Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
- *
- *  Addressed Dmitry's concerns. Deadlock checking no longer recursive.
- *  Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
- *  once we've checked for blocking and deadlocking.
- *  Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
- *
- *  Initial implementation of mandatory locks. SunOS turned out to be
- *  a rotten model, so I implemented the "obvious" semantics.
- *  See 'Documentation/filesystems/mandatory-locking.rst' for details.
- *  Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
- *
- *  Don't allow mandatory locks on mmap()'ed files. Added simple functions to
- *  check if a file has mandatory locks, used by mmap(), open() and creat() to
- *  see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
- *  Manual, Section 2.
- *  Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
- *
- *  Tidied up block list handling. Added '/proc/locks' interface.
- *  Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
- *
- *  Fixed deadlock condition for pathological code that mixes calls to
- *  flock() and fcntl().
- *  Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
- *
- *  Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
- *  for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
- *  guarantee sensible behaviour in the case where file system modules might
- *  be compiled with different options than the kernel itself.
- *  Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- *  Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
- *  (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
- *  Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- *  Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
- *  locks. Changed process synchronisation to avoid dereferencing locks that
- *  have already been freed.
- *  Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
- *
- *  Made the block list a circular list to minimise searching in the list.
- *  Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
- *
- *  Made mandatory locking a mount option. Default is not to allow mandatory
- *  locking.
- *  Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
- *
- *  Some adaptations for NFS support.
- *  Olaf Kirch (okir@monad.swb.de), Dec 1996,
- *
- *  Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
- *  Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
- *
- *  Use slab allocator instead of kmalloc/kfree.
- *  Use generic list implementation from <linux/list.h>.
- *  Sped up posix_locks_deadlock by only considering blocked locks.
- *  Matthew Wilcox <willy@debian.org>, March, 2000.
- *
- *  Leases and LOCK_MAND
- *  Matthew Wilcox <willy@debian.org>, June, 2000.
- *  Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
  *
  * Locking conflicts and dependencies:
  * If multiple threads attempt to lock the same byte (or flock the same file)
@@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
 }
 
 static inline int flock_translate_cmd(int cmd) {
-       if (cmd & LOCK_MAND)
-               return cmd & (LOCK_MAND | LOCK_RW);
        switch (cmd) {
        case LOCK_SH:
                return F_RDLCK;
@@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl,
         */
        if (caller_fl->fl_file == sys_fl->fl_file)
                return false;
-       if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
-               return false;
 
        return locks_conflict(caller_fl, sys_fl);
 }
@@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
  *     - %LOCK_SH -- a shared lock.
  *     - %LOCK_EX -- an exclusive lock.
  *     - %LOCK_UN -- remove an existing lock.
- *     - %LOCK_MAND -- a 'mandatory' flock.
- *       This exists to emulate Windows Share Modes.
+ *     - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
  *
- *     %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
- *     processes read and write access respectively.
+ *     %LOCK_MAND support has been removed from the kernel.
  */
 SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 {
@@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
        cmd &= ~LOCK_NB;
        unlock = (cmd == LOCK_UN);
 
-       if (!unlock && !(cmd & LOCK_MAND) &&
-           !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+       if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+               goto out_putf;
+
+       /*
+        * LOCK_MAND locks were broken for a long time in that they never
+        * conflicted with one another and didn't prevent any sort of open,
+        * read or write activity.
+        *
+        * Just ignore these requests now, to preserve legacy behavior, but
+        * throw a warning to let people know that they don't actually work.
+        */
+       if (cmd & LOCK_MAND) {
+               pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+               error = 0;
                goto out_putf;
+       }
 
        lock = flock_make_lock(f.file, cmd, NULL);
        if (IS_ERR(lock)) {
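
As the hunk above shows, a LOCK_MAND request via flock(2) is now accepted, warned about once, and otherwise ignored rather than installing a lock that never conflicted with anything. A small userspace check (the temporary file path is arbitrary):

    /* Illustrative userspace sketch, not from this series. */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/file.h>
    #include <unistd.h>

    /* Values from include/uapi/asm-generic/fcntl.h, in case libc hides them. */
    #ifndef LOCK_MAND
    #define LOCK_MAND 32
    #define LOCK_READ 64
    #endif

    int main(void)
    {
        int fd = open("/tmp/lock-mand-test", O_CREAT | O_RDWR, 0600);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /*
         * Previously this installed a "mandatory" flock that never conflicted
         * with anything; now the kernel returns 0, prints a one-time warning
         * and takes no lock at all.
         */
        if (flock(fd, LOCK_MAND | LOCK_READ) == 0)
            printf("LOCK_MAND accepted (and ignored by the kernel)\n");
        else
            printf("LOCK_MAND rejected: errno=%d\n", errno);
        close(fd);
        return 0;
    }
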
@@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
        struct inode *inode = NULL;
        unsigned int fl_pid;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+       int type;
 
        fl_pid = locks_translate_pid(fl, proc_pidns);
        /*
@@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
                seq_printf(f, " %s ",
                             (inode == NULL) ? "*NOINODE*" : "ADVISORY ");
        } else if (IS_FLOCK(fl)) {
-               if (fl->fl_type & LOCK_MAND) {
-                       seq_puts(f, "FLOCK  MSNFS     ");
-               } else {
-                       seq_puts(f, "FLOCK  ADVISORY  ");
-               }
+               seq_puts(f, "FLOCK  ADVISORY  ");
        } else if (IS_LEASE(fl)) {
                if (fl->fl_flags & FL_DELEG)
                        seq_puts(f, "DELEG  ");
@@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
        } else {
                seq_puts(f, "UNKNOWN UNKNOWN  ");
        }
-       if (fl->fl_type & LOCK_MAND) {
-               seq_printf(f, "%s ",
-                              (fl->fl_type & LOCK_READ)
-                              ? (fl->fl_type & LOCK_WRITE) ? "RW   " : "READ "
-                              : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
-       } else {
-               int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+       type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
 
-               seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
-                                    (type == F_RDLCK) ? "READ" : "UNLCK");
-       }
+       seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+                            (type == F_RDLCK) ? "READ" : "UNLCK");
        if (inode) {
                /* userspace relies on this representation of dev_t */
                seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
index 1946d96..1f9d218 100644 (file)
@@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
        int error = get_write_access(inode);
        if (error)
                return error;
-       /*
-        * Refuse to truncate files with mandatory locks held on them.
-        */
+
        error = security_path_truncate(path);
        if (!error) {
                error = do_truncate(mnt_userns, path->dentry, 0,
index acb1d22..5e56da7 100644 (file)
@@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
        d->bdev = bdev;
 
 
-       d->len = i_size_read(d->bdev->bd_inode);
+       d->len = bdev_nr_bytes(d->bdev);
        d->map = bl_map_simple;
 
        printk(KERN_INFO "pNFS: using block device %s\n",
@@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
                return PTR_ERR(bdev);
        d->bdev = bdev;
 
-       d->len = i_size_read(d->bdev->bd_inode);
+       d->len = bdev_nr_bytes(d->bdev);
        d->map = bl_map_simple;
        d->pr_key = v->scsi.pr_key;
 
index 2e894fe..7a5f287 100644 (file)
@@ -275,7 +275,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
                        res = (long) dreq->count;
                        WARN_ON_ONCE(dreq->count < 0);
                }
-               dreq->iocb->ki_complete(dreq->iocb, res, 0);
+               dreq->iocb->ki_complete(dreq->iocb, res);
        }
 
        complete(&dreq->completion);
index aa353fd..24e7dcc 100644 (file)
@@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
 
-       /*
-        * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
-        * any standard. In principle we might be able to support LOCK_MAND
-        * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
-        * NFS code is not set up for it.
-        */
-       if (fl->fl_type & LOCK_MAND)
-               return -EINVAL;
-
        if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
                is_local = 1;
 
index 6e9ea4e..3d1d172 100644 (file)
@@ -109,7 +109,6 @@ config NFSD_SCSILAYOUT
        depends on NFSD_V4 && BLOCK
        select NFSD_PNFS
        select EXPORTFS_BLOCK_OPS
-       select SCSI_COMMON
        help
          This option enables support for the exporting pNFS SCSI layouts
          in the kernel's NFS server. The pNFS SCSI layout enables NFS
index c99dee9..e5c0982 100644 (file)
@@ -9,9 +9,6 @@
 #include <linux/pr.h>
 
 #include <linux/nfsd/debug.h>
-#include <scsi/scsi_proto.h>
-#include <scsi/scsi_common.h>
-#include <scsi/scsi_request.h>
 
 #include "blocklayoutxdr.h"
 #include "pnfs.h"
@@ -211,109 +208,6 @@ const struct nfsd4_layout_ops bl_layout_ops = {
 #endif /* CONFIG_NFSD_BLOCKLAYOUT */
 
 #ifdef CONFIG_NFSD_SCSILAYOUT
-static int nfsd4_scsi_identify_device(struct block_device *bdev,
-               struct pnfs_block_volume *b)
-{
-       struct request_queue *q = bdev->bd_disk->queue;
-       struct request *rq;
-       struct scsi_request *req;
-       /*
-        * The allocation length (passed in bytes 3 and 4 of the INQUIRY
-        * command descriptor block) specifies the number of bytes that have
-        * been allocated for the data-in buffer.
-        * 252 is the highest one-byte value that is a multiple of 4.
-        * 65532 is the highest two-byte value that is a multiple of 4.
-        */
-       size_t bufflen = 252, maxlen = 65532, len, id_len;
-       u8 *buf, *d, type, assoc;
-       int retries = 1, error;
-
-       if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q)))
-               return -EINVAL;
-
-again:
-       buf = kzalloc(bufflen, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-
-       rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
-       if (IS_ERR(rq)) {
-               error = -ENOMEM;
-               goto out_free_buf;
-       }
-       req = scsi_req(rq);
-
-       error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
-       if (error)
-               goto out_put_request;
-
-       req->cmd[0] = INQUIRY;
-       req->cmd[1] = 1;
-       req->cmd[2] = 0x83;
-       req->cmd[3] = bufflen >> 8;
-       req->cmd[4] = bufflen & 0xff;
-       req->cmd_len = COMMAND_SIZE(INQUIRY);
-
-       blk_execute_rq(NULL, rq, 1);
-       if (req->result) {
-               pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
-                       req->result);
-               error = -EIO;
-               goto out_put_request;
-       }
-
-       len = (buf[2] << 8) + buf[3] + 4;
-       if (len > bufflen) {
-               if (len <= maxlen && retries--) {
-                       blk_put_request(rq);
-                       kfree(buf);
-                       bufflen = len;
-                       goto again;
-               }
-               pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
-                       len);
-               goto out_put_request;
-       }
-
-       d = buf + 4;
-       for (d = buf + 4; d < buf + len; d += id_len + 4) {
-               id_len = d[3];
-               type = d[1] & 0xf;
-               assoc = (d[1] >> 4) & 0x3;
-
-               /*
-                * We only care about a EUI-64 and NAA designator types
-                * with LU association.
-                */
-               if (assoc != 0x00)
-                       continue;
-               if (type != 0x02 && type != 0x03)
-                       continue;
-               if (id_len != 8 && id_len != 12 && id_len != 16)
-                       continue;
-
-               b->scsi.code_set = PS_CODE_SET_BINARY;
-               b->scsi.designator_type = type == 0x02 ?
-                       PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
-               b->scsi.designator_len = id_len;
-               memcpy(b->scsi.designator, d + 4, id_len);
-
-               /*
-                * If we found a 8 or 12 byte descriptor continue on to
-                * see if a 16 byte one is available.  If we find a
-                * 16 byte descriptor we're done.
-                */
-               if (id_len == 16)
-                       break;
-       }
-
-out_put_request:
-       blk_put_request(rq);
-out_free_buf:
-       kfree(buf);
-       return error;
-}
-
 #define NFSD_MDS_PR_KEY                0x0100000000000000ULL
 
 /*
@@ -325,6 +219,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
        return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
 }
 
+static const u8 designator_types[] = {
+       PS_DESIGNATOR_EUI64,
+       PS_DESIGNATOR_NAA,
+};
+
+static int
+nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b)
+{
+       int ret, i;
+
+       for (i = 0; i < ARRAY_SIZE(designator_types); i++) {
+               u8 type = designator_types[i];
+
+               ret = disk->fops->get_unique_id(disk, b->scsi.designator, type);
+               if (ret > 0) {
+                       b->scsi.code_set = PS_CODE_SET_BINARY;
+                       b->scsi.designator_type = type;
+                       b->scsi.designator_len = ret;
+                       return 0;
+               }
+       }
+
+       return -EINVAL;
+}
+
 static int
 nfsd4_block_get_device_info_scsi(struct super_block *sb,
                struct nfs4_client *clp,
@@ -333,7 +252,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
        struct pnfs_block_deviceaddr *dev;
        struct pnfs_block_volume *b;
        const struct pr_ops *ops;
-       int error;
+       int ret;
 
        dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
                      sizeof(struct pnfs_block_volume), GFP_KERNEL);
@@ -347,33 +266,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
        b->type = PNFS_BLOCK_VOLUME_SCSI;
        b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
 
-       error = nfsd4_scsi_identify_device(sb->s_bdev, b);
-       if (error)
-               return error;
+       ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b);
+       if (ret < 0)
+               goto out_free_dev;
 
+       ret = -EINVAL;
        ops = sb->s_bdev->bd_disk->fops->pr_ops;
        if (!ops) {
                pr_err("pNFS: device %s does not support PRs.\n",
                        sb->s_id);
-               return -EINVAL;
+               goto out_free_dev;
        }
 
-       error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
-       if (error) {
+       ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+       if (ret) {
                pr_err("pNFS: failed to register key for device %s.\n",
                        sb->s_id);
-               return -EINVAL;
+               goto out_free_dev;
        }
 
-       error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+       ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
                        PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
-       if (error) {
+       if (ret) {
                pr_err("pNFS: failed to reserve device %s.\n",
                        sb->s_id);
-               return -EINVAL;
+               goto out_free_dev;
        }
 
        return 0;
+
+out_free_dev:
+       kfree(dev);
+       return ret;
 }
 
 static __be32
index a97873f..6d1b5bb 100644 (file)
@@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 #ifdef CONFIG_NFSD_SCSILAYOUT
        if (sb->s_export_op->map_blocks &&
            sb->s_export_op->commit_blocks &&
-           sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops &&
-               blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue))
+           sb->s_bdev &&
+           sb->s_bdev->bd_disk->fops->pr_ops &&
+           sb->s_bdev->bd_disk->fops->get_unique_id)
                exp->ex_layout_types |= 1 << LAYOUT_SCSI;
 #endif
 }
index 640ac8f..1d0583c 100644 (file)
@@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
                goto out;
 
        ret = -ERANGE;
-       if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+       if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev))
                goto out;
 
        segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
index f6b2d28..3134c0e 100644 (file)
@@ -403,7 +403,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
        int ret;
 
        ret = -ERANGE;
-       devsize = i_size_read(sb->s_bdev->bd_inode);
+       devsize = bdev_nr_bytes(sb->s_bdev);
        if (newsize > devsize)
                goto out;
 
index c8bfc01..1bfcb5d 100644 (file)
@@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 {
        struct nilfs_super_block **sbp = nilfs->ns_sbp;
        struct buffer_head **sbh = nilfs->ns_sbh;
-       u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+       u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev));
        int valid[2], swp = 0;
 
        sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
index ab4f336..373dbb6 100644 (file)
@@ -5,6 +5,7 @@
  * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
  */
 
+#include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
index 0d7e948..5ae8de0 100644 (file)
@@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
        ntfs_debug("Set device block size to %i bytes (block size bits %i).",
                        blocksize, sb->s_blocksize_bits);
        /* Determine the size of the device in units of block_size bytes. */
-       if (!i_size_read(sb->s_bdev->bd_inode)) {
+       vol->nr_blocks = sb_bdev_nr_blocks(sb);
+       if (!vol->nr_blocks) {
                if (!silent)
                        ntfs_error(sb, "Unable to determine device size.");
                goto err_out_now;
        }
-       vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
-                       sb->s_blocksize_bits;
        /* Read the boot sector and return unlocked buffer head to it. */
        if (!(bh = read_ntfs_boot_sector(sb, silent))) {
                if (!silent)
@@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
                        goto err_out_now;
                }
                BUG_ON(blocksize != sb->s_blocksize);
-               vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
-                               sb->s_blocksize_bits;
+               vol->nr_blocks = sb_bdev_nr_blocks(sb);
                ntfs_debug("Changed device block size to %i bytes (block size "
                                "bits %i) to match volume sector size.",
                                blocksize, sb->s_blocksize_bits);
index 43b1451..a3cd3c3 100644 (file)
@@ -8,6 +8,7 @@
  */
 
 #include <linux/backing-dev.h>
+#include <linux/blkdev.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <linux/falloc.h>
index 859951d..a87ab3a 100644 (file)
@@ -1046,7 +1046,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
        if (!ret && i2)
                ret = writeback_inode(i2);
        if (!ret)
-               ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping);
+               ret = sync_blockdev_nowait(sb->s_bdev);
        return ret;
 }
 
index d41d769..2981320 100644 (file)
@@ -921,7 +921,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
        /* Parse boot. */
        err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
-                                 bdev->bd_inode->i_size);
+                                 bdev_nr_bytes(bdev));
        if (err)
                goto out;
 
index 8521942..481017e 100644 (file)
@@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 {
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
        struct journal_head *jh;
-       int ret;
+       int ret = 1;
 
        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
                return 0;
@@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
        if (!buffer_jbd(bg_bh))
                return 1;
 
-       jh = bh2jh(bg_bh);
-       spin_lock(&jh->b_state_lock);
-       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
-       if (bg)
-               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
-       else
-               ret = 1;
-       spin_unlock(&jh->b_state_lock);
+       jbd_lock_bh_journal_head(bg_bh);
+       if (buffer_jbd(bg_bh)) {
+               jh = bh2jh(bg_bh);
+               spin_lock(&jh->b_state_lock);
+               bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+               if (bg)
+                       ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+               else
+                       ret = 1;
+               spin_unlock(&jh->b_state_lock);
+       }
+       jbd_unlock_bh_journal_head(bg_bh);
 
        return ret;
 }
index c1bb4c4..e5e3e50 100644 (file)
@@ -10,7 +10,7 @@
  *  Linux VFS inode operations.
  */
 
-#include <linux/bvec.h>
+#include <linux/blkdev.h>
 #include <linux/fileattr.h>
 #include "protocol.h"
 #include "orangefs-kernel.h"
index 2f2e430..8bb0a53 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/parser.h>
 #include <linux/hashtable.h>
+#include <linux/seq_file.h>
 
 /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
 static struct kmem_cache *orangefs_inode_cache;
index c88ac57..ac461a4 100644 (file)
@@ -272,14 +272,14 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
        kmem_cache_free(ovl_aio_request_cachep, aio_req);
 }
 
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
 {
        struct ovl_aio_req *aio_req = container_of(iocb,
                                                   struct ovl_aio_req, iocb);
        struct kiocb *orig_iocb = aio_req->orig_iocb;
 
        ovl_aio_cleanup_handler(aio_req);
-       orig_iocb->ki_complete(orig_iocb, res, res2);
+       orig_iocb->ki_complete(orig_iocb, res);
 }
 
 static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
index 04ce58c..5d1fbaf 100644 (file)
@@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
 static int __register_pstore_blk(struct pstore_device_info *dev,
                                 const char *devpath)
 {
-       struct inode *inode;
        int ret = -ENODEV;
 
        lockdep_assert_held(&pstore_blk_lock);
@@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev,
                goto err;
        }
 
-       inode = file_inode(psblk_file);
-       if (!S_ISBLK(inode->i_mode)) {
+       if (!S_ISBLK(file_inode(psblk_file)->i_mode)) {
                pr_err("'%s' is not block device!\n", devpath);
                goto err_fput;
        }
 
-       inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode;
-       dev->zone.total_size = i_size_read(inode);
+       dev->zone.total_size =
+               bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host));
 
        ret = __register_pstore_device(dev);
        if (ret)
index 2bcc9a6..052f143 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/namei.h>
 #include <linux/slab.h>
 #include <asm/current.h>
+#include <linux/blkdev.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/security.h>
index 65e7e56..e230234 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/uaccess.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 struct ramfs_mount_opts {
index af057c5..0074afa 100644 (file)
@@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
        if (unlikely((ssize_t) count < 0))
                return -EINVAL;
 
-       /*
-        * ranged mandatory locking does not apply to streams - it makes sense
-        * only for files where position has a meaning.
-        */
        if (ppos) {
                loff_t pos = *ppos;
 
index 58481f8..076f9ab 100644 (file)
@@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s,
 
                        if (!strcmp(arg, "auto")) {
                                /* From JFS code, to auto-get the size. */
-                               *blocks =
-                                   i_size_read(s->s_bdev->bd_inode) >> s->
-                                   s_blocksize_bits;
+                               *blocks = sb_bdev_nr_blocks(s);
                        } else {
                                *blocks = simple_strtoul(arg, &p, 0);
                                if (*p != '\0') {
@@ -1986,9 +1984,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
         * smaller than the filesystem. If the check fails then abort and
         * scream, because bad stuff will happen otherwise.
         */
-       if (s->s_bdev && s->s_bdev->bd_inode
-           && i_size_read(s->s_bdev->bd_inode) <
-           sb_block_count(rs) * sb_blocksize(rs)) {
+       if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
                SWARN(silent, s, "", "Filesystem cannot be "
                      "mounted because it is bigger than the device");
                SWARN(silent, s, "", "You may need to run fsck "
index 60d6951..bb44ff4 100644 (file)
@@ -16,6 +16,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
@@ -179,8 +180,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
        /* Check the filesystem does not extend beyond the end of the
           block device */
        msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
-       if (msblk->bytes_used < 0 || msblk->bytes_used >
-                       i_size_read(sb->s_bdev->bd_inode))
+       if (msblk->bytes_used < 0 ||
+           msblk->bytes_used > bdev_nr_bytes(sb->s_bdev))
                goto failed_mount;
 
        /* Check block size for sanity */
index 1373a61..3ce8e21 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -3,6 +3,7 @@
  * High-level sync()-related operations
  */
 
+#include <linux/blkdev.h>
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
                        SYNC_FILE_RANGE_WAIT_AFTER)
 
 /*
- * Do the filesystem syncing work. For simple filesystems
- * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
- * submit IO for these buffers via __sync_blockdev(). This also speeds up the
- * wait == 1 case since in that case write_inode() functions do
- * sync_dirty_buffer() and thus effectively write one block at a time.
- */
-static int __sync_filesystem(struct super_block *sb, int wait)
-{
-       if (wait)
-               sync_inodes_sb(sb);
-       else
-               writeback_inodes_sb(sb, WB_REASON_SYNC);
-
-       if (sb->s_op->sync_fs)
-               sb->s_op->sync_fs(sb, wait);
-       return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
  * Write out and wait upon all dirty data associated with this
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
@@ -61,10 +43,25 @@ int sync_filesystem(struct super_block *sb)
        if (sb_rdonly(sb))
                return 0;
 
-       ret = __sync_filesystem(sb, 0);
+       /*
+        * Do the filesystem syncing work.  For simple filesystems
+        * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
+        * to submit I/O for these buffers via sync_blockdev().  This also
+        * speeds up the wait == 1 case since in that case write_inode()
+        * methods call sync_dirty_buffer() and thus effectively write one block
+        * at a time.
+        */
+       writeback_inodes_sb(sb, WB_REASON_SYNC);
+       if (sb->s_op->sync_fs)
+               sb->s_op->sync_fs(sb, 0);
+       ret = sync_blockdev_nowait(sb->s_bdev);
        if (ret < 0)
                return ret;
-       return __sync_filesystem(sb, 1);
+
+       sync_inodes_sb(sb);
+       if (sb->s_op->sync_fs)
+               sb->s_op->sync_fs(sb, 1);
+       return sync_blockdev(sb->s_bdev);
 }
 EXPORT_SYMBOL(sync_filesystem);
 
@@ -81,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
                sb->s_op->sync_fs(sb, *(int *)arg);
 }
 
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
-       filemap_fdatawrite(bdev->bd_inode->i_mapping);
-}
-
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
-{
-       /*
-        * We keep the error status of individual mapping so that
-        * applications can catch the writeback error using fsync(2).
-        * See filemap_fdatawait_keep_errors() for details.
-        */
-       filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
-}
-
 /*
  * Sync everything. We start by waking flusher threads so that most of
  * writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -114,8 +96,8 @@ void ksys_sync(void)
        iterate_supers(sync_inodes_one_sb, NULL);
        iterate_supers(sync_fs_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &wait);
-       iterate_bdevs(fdatawrite_one_bdev, NULL);
-       iterate_bdevs(fdatawait_one_bdev, NULL);
+       sync_bdevs(false);
+       sync_bdevs(true);
        if (unlikely(laptop_mode))
                laptop_sync_completion();
 }
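
The sync path changes above fold __sync_filesystem() into sync_filesystem(), which now runs the non-blocking pass (writeback_inodes_sb(), ->sync_fs(sb, 0), sync_blockdev_nowait()) before the waiting pass, while ksys_sync() swaps the per-bdev iteration for sync_bdevs(). Userspace entry points are unchanged; syncfs(2), for instance, still funnels into sync_filesystem() for a single superblock:

    /* Illustrative userspace sketch, not from this series. */
    #define _GNU_SOURCE        /* for syncfs() */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        const char *path = argc > 1 ? argv[1] : ".";
        int fd = open(path, O_RDONLY);

        if (fd < 0 || syncfs(fd) < 0) {
            perror(path);
            return 1;
        }
        /* Both passes of sync_filesystem() have completed by the time we return. */
        close(fd);
        return 0;
    }
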
@@ -136,10 +118,10 @@ static void do_sync_work(struct work_struct *work)
         */
        iterate_supers(sync_inodes_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &nowait);
-       iterate_bdevs(fdatawrite_one_bdev, NULL);
+       sync_bdevs(false);
        iterate_supers(sync_inodes_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &nowait);
-       iterate_bdevs(fdatawrite_one_bdev, NULL);
+       sync_bdevs(false);
        printk("Emergency Sync complete\n");
        kfree(work);
 }
index 22be7ae..c57b46a 100644 (file)
@@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = {
        .get_context            = ubifs_crypt_get_context,
        .set_context            = ubifs_crypt_set_context,
        .empty_dir              = ubifs_crypt_empty_dir,
-       .max_namelen            = UBIFS_MAX_NLEN,
 };
index f1094cd..46d6971 100644 (file)
@@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
 
 unsigned long udf_get_last_block(struct super_block *sb)
 {
-       struct block_device *bdev = sb->s_bdev;
-       struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk);
+       struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);
        unsigned long lblock = 0;
 
        /*
@@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb)
         * Try using the device size...
         */
        if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0)
-               lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits;
+               lblock = sb_bdev_nr_blocks(sb);
 
        if (lblock)
                return lblock - 1;
index b2d7c57..34247fb 100644 (file)
@@ -1175,8 +1175,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
        struct udf_inode_info *vati;
        uint32_t pos;
        struct virtualAllocationTable20 *vat20;
-       sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >>
-                         sb->s_blocksize_bits;
+       sector_t blocks = sb_bdev_nr_blocks(sb);
 
        udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
        if (!sbi->s_vat_inode &&
@@ -1838,8 +1837,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
        int ret;
 
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
-           udf_fixed_to_variable(block) >=
-           i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits)
+           udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb))
                return -EAGAIN;
 
        bh = udf_read_tagged(sb, block, block, &ident);
@@ -1901,8 +1899,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
                last[last_count++] = *lastblock - 152;
 
        for (i = 0; i < last_count; i++) {
-               if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >>
-                               sb->s_blocksize_bits)
+               if (last[i] >= sb_bdev_nr_blocks(sb))
                        continue;
                ret = udf_check_anchor_block(sb, last[i], fileset);
                if (ret != -EAGAIN) {
index 7aa943e..62e7fbe 100644 (file)
@@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = {
        .write_iter     = xfs_file_write_iter,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .unlocked_ioctl = xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl,
index ddc346a..3ce5f47 100644 (file)
@@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = {
        .write_iter     = zonefs_file_write_iter,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
 };
 
 static struct kmem_cache *zonefs_inode_cachep;
index 4a674db..fedc0df 100644 (file)
@@ -49,9 +49,15 @@ static inline void flush_cache_page(struct vm_area_struct *vma,
 static inline void flush_dcache_page(struct page *page)
 {
 }
+
+static inline void flush_dcache_folio(struct folio *folio) { }
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
 #endif
 
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+void flush_dcache_folio(struct folio *folio);
+#endif
 
 #ifndef flush_dcache_mmap_lock
 static inline void flush_dcache_mmap_lock(struct address_space *mapping)
index 1b44f40..199e47e 100644 (file)
@@ -329,6 +329,7 @@ enum {
        ATA_LOG_SECURITY          = 0x06,
        ATA_LOG_SATA_SETTINGS     = 0x08,
        ATA_LOG_ZONED_INFORMATION = 0x09,
+       ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47,
 
        /* Identify device SATA settings log:*/
        ATA_LOG_DEVSLP_OFFSET     = 0x30,
index ac7f231..9c14f0a 100644 (file)
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/blkdev.h>
 #include <linux/device.h>
 #include <linux/writeback.h>
-#include <linux/blk-cgroup.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/slab.h>
 
+struct blkcg;
+
 static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 {
        kref_get(&bdi->refcnt);
@@ -64,7 +64,7 @@ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
        return atomic_long_read(&bdi->tot_write_bandwidth);
 }
 
-static inline void __add_wb_stat(struct bdi_writeback *wb,
+static inline void wb_stat_mod(struct bdi_writeback *wb,
                                 enum wb_stat_item item, s64 amount)
 {
        percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
@@ -72,12 +72,12 @@ static inline void __add_wb_stat(struct bdi_writeback *wb,
 
 static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-       __add_wb_stat(wb, item, 1);
+       wb_stat_mod(wb, item, 1);
 }
 
 static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-       __add_wb_stat(wb, item, -1);
+       wb_stat_mod(wb, item, -1);
 }
 
 static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
@@ -133,20 +133,7 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb)
        return test_bit(WB_writeback_running, &wb->state);
 }
 
-static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
-       struct super_block *sb;
-
-       if (!inode)
-               return &noop_backing_dev_info;
-
-       sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
-       if (sb_is_blkdev_sb(sb))
-               return I_BDEV(inode)->bd_disk->bdi;
-#endif
-       return sb->s_bdi;
-}
+struct backing_dev_info *inode_to_bdi(struct inode *inode);
 
 static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 {
index 00952e9..fe6bdfb 100644 (file)
@@ -6,19 +6,10 @@
 #define __LINUX_BIO_H
 
 #include <linux/mempool.h>
-#include <linux/ioprio.h>
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
 #include <linux/blk_types.h>
 #include <linux/uio.h>
 
-#define BIO_DEBUG
-
-#ifdef BIO_DEBUG
-#define BIO_BUG_ON     BUG_ON
-#else
-#define BIO_BUG_ON
-#endif
-
 #define BIO_MAX_VECS           256U
 
 static inline unsigned int bio_max_segs(unsigned int nr_segs)
@@ -78,22 +69,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
               bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
-static inline bool bio_mergeable(struct bio *bio)
-{
-       if (bio->bi_opf & REQ_NOMERGE_FLAGS)
-               return false;
-
-       return true;
-}
-
-static inline unsigned int bio_cur_bytes(struct bio *bio)
-{
-       if (bio_has_data(bio))
-               return bio_iovec(bio).bv_len;
-       else /* dataless requests such as discard */
-               return bio->bi_iter.bi_size;
-}
-
 static inline void *bio_data(struct bio *bio)
 {
        if (bio_has_data(bio))
@@ -102,25 +77,6 @@ static inline void *bio_data(struct bio *bio)
        return NULL;
 }
 
-/**
- * bio_full - check if the bio is full
- * @bio:       bio to check
- * @len:       length of one segment to be added
- *
- * Return true if @bio is full and one segment with @len bytes can't be
- * added to the bio, otherwise return false
- */
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
-       if (bio->bi_vcnt >= bio->bi_max_vecs)
-               return true;
-
-       if (bio->bi_iter.bi_size > UINT_MAX - len)
-               return true;
-
-       return false;
-}
-
 static inline bool bio_next_segment(const struct bio *bio,
                                    struct bvec_iter_all *iter)
 {
@@ -163,6 +119,28 @@ static inline void bio_advance_iter_single(const struct bio *bio,
                bvec_iter_advance_single(bio->bi_io_vec, iter, bytes);
 }
 
+void __bio_advance(struct bio *, unsigned bytes);
+
+/**
+ * bio_advance - increment/complete a bio by some number of bytes
+ * @bio:       bio to advance
+ * @bytes:     number of bytes to complete
+ *
+ * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+ * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+ * be updated on the last bvec as well.
+ *
+ * @bio will then represent the remaining, uncompleted portion of the io.
+ */
+static inline void bio_advance(struct bio *bio, unsigned int nbytes)
+{
+       if (nbytes == bio->bi_iter.bi_size) {
+               bio->bi_iter.bi_size = 0;
+               return;
+       }
+       __bio_advance(bio, nbytes);
+}
+
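/*
 * Editorial sketch, not part of this patch: one way a caller that owns a
 * bio might consume it in fixed-size steps with bio_advance().  The chunk
 * size and the per-chunk work are hypothetical.
 */
static void consume_bio_in_chunks(struct bio *bio, unsigned int chunk)
{
	while (bio->bi_iter.bi_size) {
		unsigned int step = chunk;

		if (step > bio->bi_iter.bi_size)
			step = bio->bi_iter.bi_size;
		/* ... process 'step' bytes of the bio here ... */
		bio_advance(bio, step);
	}
}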
 #define __bio_for_each_segment(bvl, bio, iter, start)                  \
        for (iter = (start);                                            \
             (iter).bi_size &&                                          \
@@ -265,37 +243,6 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
        bio->bi_flags &= ~(1U << bit);
 }
 
-static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
-{
-       *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-}
-
-static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
-{
-       struct bvec_iter iter = bio->bi_iter;
-       int idx;
-
-       bio_get_first_bvec(bio, bv);
-       if (bv->bv_len == bio->bi_iter.bi_size)
-               return;         /* this bio only has a single bvec */
-
-       bio_advance_iter(bio, &iter, iter.bi_size);
-
-       if (!iter.bi_bvec_done)
-               idx = iter.bi_idx - 1;
-       else    /* in the middle of bvec */
-               idx = iter.bi_idx;
-
-       *bv = bio->bi_io_vec[idx];
-
-       /*
-        * iter.bi_bvec_done records actual length of the last bvec
-        * if this bio ends in the middle of one io vector
-        */
-       if (iter.bi_bvec_done)
-               bv->bv_len = iter.bi_bvec_done;
-}
-
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
        WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
@@ -424,7 +371,7 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs)
        return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
 }
 
-extern blk_qc_t submit_bio(struct bio *);
+void submit_bio(struct bio *bio);
 
 extern void bio_endio(struct bio *);
 
@@ -456,8 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 struct request_queue;
 
 extern int submit_bio_wait(struct bio *bio);
-extern void bio_advance(struct bio *, unsigned);
-
 extern void bio_init(struct bio *bio, struct bio_vec *table,
                     unsigned short max_vecs);
 extern void bio_uninit(struct bio *);
@@ -469,12 +414,11 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
                           unsigned int, unsigned int);
 int bio_add_zone_append_page(struct bio *bio, struct page *page,
                             unsigned int len, unsigned int offset);
-bool __bio_try_merge_page(struct bio *bio, struct page *page,
-               unsigned int len, unsigned int off, bool *same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-void bio_release_pages(struct bio *bio, bool mark_dirty);
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
+void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
@@ -482,27 +426,16 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                               struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern void bio_free_pages(struct bio *bio);
-void bio_truncate(struct bio *bio, unsigned new_size);
 void guard_bio_eod(struct bio *bio);
 void zero_fill_bio(struct bio *bio);
 
-extern const char *bio_devname(struct bio *bio, char *buffer);
+static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
+{
+       if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+               __bio_release_pages(bio, mark_dirty);
+}
 
-#define bio_set_dev(bio, bdev)                                 \
-do {                                                   \
-       bio_clear_flag(bio, BIO_REMAPPED);              \
-       if ((bio)->bi_bdev != (bdev))                   \
-               bio_clear_flag(bio, BIO_THROTTLED);     \
-       (bio)->bi_bdev = (bdev);                        \
-       bio_associate_blkg(bio);                        \
-} while (0)
-
-#define bio_copy_dev(dst, src)                 \
-do {                                           \
-       bio_clear_flag(dst, BIO_REMAPPED);              \
-       (dst)->bi_bdev = (src)->bi_bdev;        \
-       bio_clone_blkg_association(dst, src);   \
-} while (0)
+extern const char *bio_devname(struct bio *bio, char *buffer);
 
 #define bio_dev(bio) \
        disk_devt((bio)->bi_bdev->bd_disk)
@@ -521,6 +454,22 @@ static inline void bio_clone_blkg_association(struct bio *dst,
                                              struct bio *src) { }
 #endif /* CONFIG_BLK_CGROUP */
 
+static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+       bio_clear_flag(bio, BIO_REMAPPED);
+       if (bio->bi_bdev != bdev)
+               bio_clear_flag(bio, BIO_THROTTLED);
+       bio->bi_bdev = bdev;
+       bio_associate_blkg(bio);
+}
+
+static inline void bio_copy_dev(struct bio *dst, struct bio *src)
+{
+       bio_clear_flag(dst, BIO_REMAPPED);
+       dst->bi_bdev = src->bi_bdev;
+       bio_clone_blkg_association(dst, src);
+}
+
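/*
 * Editorial sketch, not part of this patch: minimal write submission using
 * the inline helpers above.  Assumes the caller owns @bdev and @page and
 * handles completion elsewhere; error handling is omitted.
 */
static void submit_one_page_write(struct block_device *bdev,
				  struct page *page, sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio_set_dev(bio, bdev);		/* clears BIO_REMAPPED, associates blkg */
	bio->bi_opf = REQ_OP_WRITE;
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);		/* note: returns void after this series */
}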
 /*
  * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
  *
@@ -784,7 +733,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
  */
 static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
 {
-       bio->bi_opf |= REQ_HIPRI;
+       bio->bi_opf |= REQ_POLLED;
        if (!is_sync_kiocb(kiocb))
                bio->bi_opf |= REQ_NOWAIT;
 }
diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h
new file mode 100644 (file)
index 0000000..bbab65b
--- /dev/null
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef __LINUX_BLK_CRYPTO_PROFILE_H
+#define __LINUX_BLK_CRYPTO_PROFILE_H
+
+#include <linux/bio.h>
+#include <linux/blk-crypto.h>
+
+struct blk_crypto_profile;
+
+/**
+ * struct blk_crypto_ll_ops - functions to control inline encryption hardware
+ *
+ * Low-level operations for controlling inline encryption hardware.  This
+ * interface must be implemented by storage drivers that support inline
+ * encryption.  All functions may sleep, are serialized by profile->lock, and
+ * are never called while profile->dev (if set) is runtime-suspended.
+ */
+struct blk_crypto_ll_ops {
+
+       /**
+        * @keyslot_program: Program a key into the inline encryption hardware.
+        *
+        * Program @key into the specified @slot in the inline encryption
+        * hardware, overwriting any key that the keyslot may already contain.
+        * The keyslot is guaranteed to not be in-use by any I/O.
+        *
+        * This is required if the device has keyslots.  Otherwise (i.e. if the
+        * device is a layered device, or if the device is real hardware that
+        * simply doesn't have the concept of keyslots) it is never called.
+        *
+        * Must return 0 on success, or -errno on failure.
+        */
+       int (*keyslot_program)(struct blk_crypto_profile *profile,
+                              const struct blk_crypto_key *key,
+                              unsigned int slot);
+
+       /**
+        * @keyslot_evict: Evict a key from the inline encryption hardware.
+        *
+        * If the device has keyslots, this function must evict the key from the
+        * specified @slot.  The slot will contain @key, but there should be no
+        * need for the @key argument to be used as @slot should be sufficient.
+        * The keyslot is guaranteed to not be in-use by any I/O.
+        *
+        * If the device doesn't have keyslots itself, this function must evict
+        * @key from any underlying devices.  @slot won't be valid in this case.
+        *
+        * If there are no keyslots and no underlying devices, this function
+        * isn't required.
+        *
+        * Must return 0 on success, or -errno on failure.
+        */
+       int (*keyslot_evict)(struct blk_crypto_profile *profile,
+                            const struct blk_crypto_key *key,
+                            unsigned int slot);
+};
+
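/*
 * Editorial sketch, not part of this patch: the shape of a driver's
 * blk_crypto_ll_ops.  The commented-out foo_hw_* calls are hypothetical
 * placeholders for real keyslot register programming.
 */
static int foo_keyslot_program(struct blk_crypto_profile *profile,
			       const struct blk_crypto_key *key,
			       unsigned int slot)
{
	/* foo_hw_program_key(profile->dev, key, slot); */
	return 0;
}

static int foo_keyslot_evict(struct blk_crypto_profile *profile,
			     const struct blk_crypto_key *key,
			     unsigned int slot)
{
	/* foo_hw_clear_keyslot(profile->dev, slot); */
	return 0;
}

static const struct blk_crypto_ll_ops foo_crypto_ops = {
	.keyslot_program	= foo_keyslot_program,
	.keyslot_evict		= foo_keyslot_evict,
};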
+/**
+ * struct blk_crypto_profile - inline encryption profile for a device
+ *
+ * This struct contains a storage device's inline encryption capabilities (e.g.
+ * the supported crypto algorithms), driver-provided functions to control the
+ * inline encryption hardware (e.g. programming and evicting keys), and optional
+ * device-independent keyslot management data.
+ */
+struct blk_crypto_profile {
+
+       /* public: Drivers must initialize the following fields. */
+
+       /**
+        * @ll_ops: Driver-provided functions to control the inline encryption
+        * hardware, e.g. program and evict keys.
+        */
+       struct blk_crypto_ll_ops ll_ops;
+
+       /**
+        * @max_dun_bytes_supported: The maximum number of bytes supported for
+        * specifying the data unit number (DUN).  Specifically, the range of
+        * supported DUNs is 0 through (1 << (8 * max_dun_bytes_supported)) - 1.
+        */
+       unsigned int max_dun_bytes_supported;
+
+       /**
+        * @modes_supported: Array of bitmasks that specifies whether each
+        * combination of crypto mode and data unit size is supported.
+        * Specifically, the i'th bit of modes_supported[crypto_mode] is set if
+        * crypto_mode can be used with a data unit size of (1 << i).  Note that
+        * only data unit sizes that are powers of 2 can be supported.
+        */
+       unsigned int modes_supported[BLK_ENCRYPTION_MODE_MAX];
+
+       /**
+        * @dev: An optional device for runtime power management.  If the driver
+        * provides this device, it will be runtime-resumed before any function
+        * in @ll_ops is called and will remain resumed during the call.
+        */
+       struct device *dev;
+
+       /* private: The following fields shouldn't be accessed by drivers. */
+
+       /* Number of keyslots, or 0 if not applicable */
+       unsigned int num_slots;
+
+       /*
+        * Serializes all calls to functions in @ll_ops as well as all changes
+        * to @slot_hashtable.  This can also be taken in read mode to look up
+        * keyslots while ensuring that they can't be changed concurrently.
+        */
+       struct rw_semaphore lock;
+
+       /* List of idle slots, with least recently used slot at front */
+       wait_queue_head_t idle_slots_wait_queue;
+       struct list_head idle_slots;
+       spinlock_t idle_slots_lock;
+
+       /*
+        * Hash table which maps blk_crypto_key pointers to keyslots, so that we
+        * can find a key's keyslot in O(1) time rather than O(num_slots).
+        * Protected by 'lock'.
+        */
+       struct hlist_head *slot_hashtable;
+       unsigned int log_slot_ht_size;
+
+       /* Per-keyslot data */
+       struct blk_crypto_keyslot *slots;
+};
+
+int blk_crypto_profile_init(struct blk_crypto_profile *profile,
+                           unsigned int num_slots);
+
+int devm_blk_crypto_profile_init(struct device *dev,
+                                struct blk_crypto_profile *profile,
+                                unsigned int num_slots);
+
+unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot);
+
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+                                   const struct blk_crypto_key *key,
+                                   struct blk_crypto_keyslot **slot_ptr);
+
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
+
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+                               const struct blk_crypto_config *cfg);
+
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+                          const struct blk_crypto_key *key);
+
+void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile);
+
+void blk_crypto_profile_destroy(struct blk_crypto_profile *profile);
+
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+                                      const struct blk_crypto_profile *child);
+
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+                                const struct blk_crypto_profile *reference);
+
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+                                   const struct blk_crypto_profile *src);
+
+#endif /* __LINUX_BLK_CRYPTO_PROFILE_H */
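/*
 * Editorial sketch, not part of this patch: initializing a profile for a
 * hypothetical device with 32 keyslots, a 64-bit DUN and AES-256-XTS
 * support at a 4K data unit size.  foo_crypto_ops is the driver's ll_ops
 * from the sketch above.
 */
static int foo_init_crypto_profile(struct device *dev,
				   struct blk_crypto_profile *profile)
{
	int err = devm_blk_crypto_profile_init(dev, profile, 32);

	if (err)
		return err;

	profile->ll_ops = foo_crypto_ops;
	profile->dev = dev;
	profile->max_dun_bytes_supported = 8;
	/* bit i set <=> data unit size (1 << i) supported */
	profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] = 1 << 12;
	return 0;
}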
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
new file mode 100644 (file)
index 0000000..8a038ea
--- /dev/null
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BLK_INTEGRITY_H
+#define _LINUX_BLK_INTEGRITY_H
+
+#include <linux/blk-mq.h>
+
+struct request;
+
+enum blk_integrity_flags {
+       BLK_INTEGRITY_VERIFY            = 1 << 0,
+       BLK_INTEGRITY_GENERATE          = 1 << 1,
+       BLK_INTEGRITY_DEVICE_CAPABLE    = 1 << 2,
+       BLK_INTEGRITY_IP_CHECKSUM       = 1 << 3,
+};
+
+struct blk_integrity_iter {
+       void                    *prot_buf;
+       void                    *data_buf;
+       sector_t                seed;
+       unsigned int            data_size;
+       unsigned short          interval;
+       const char              *disk_name;
+};
+
+typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
+typedef void (integrity_prepare_fn) (struct request *);
+typedef void (integrity_complete_fn) (struct request *, unsigned int);
+
+struct blk_integrity_profile {
+       integrity_processing_fn         *generate_fn;
+       integrity_processing_fn         *verify_fn;
+       integrity_prepare_fn            *prepare_fn;
+       integrity_complete_fn           *complete_fn;
+       const char                      *name;
+};
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+void blk_integrity_register(struct gendisk *, struct blk_integrity *);
+void blk_integrity_unregister(struct gendisk *);
+int blk_integrity_compare(struct gendisk *, struct gendisk *);
+int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
+                                  struct scatterlist *);
+int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+       struct blk_integrity *bi = &disk->queue->integrity;
+
+       if (!bi->profile)
+               return NULL;
+
+       return bi;
+}
+
+static inline struct blk_integrity *
+bdev_get_integrity(struct block_device *bdev)
+{
+       return blk_get_integrity(bdev->bd_disk);
+}
+
+static inline bool
+blk_integrity_queue_supports_integrity(struct request_queue *q)
+{
+       return q->integrity.profile;
+}
+
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+                                                   unsigned int segs)
+{
+       q->limits.max_integrity_segments = segs;
+}
+
+static inline unsigned short
+queue_max_integrity_segments(const struct request_queue *q)
+{
+       return q->limits.max_integrity_segments;
+}
+
+/**
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
+ * @bi:                blk_integrity profile for device
+ * @sectors:   Size of the bio in 512-byte sectors
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device.  Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
+ */
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+                                                  unsigned int sectors)
+{
+       return sectors >> (bi->interval_exp - 9);
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+                                              unsigned int sectors)
+{
+       return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+}
+
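/*
 * Editorial worked example, not part of this patch: for a device with a
 * 4096-byte protection interval (interval_exp = 12) and 8-byte tuples,
 * a 16 KiB bio spans 32 512-byte sectors, so
 *   bio_integrity_intervals(bi, 32) = 32 >> (12 - 9) = 4 intervals
 *   bio_integrity_bytes(bi, 32)     = 4 * 8          = 32 bytes of PI
 */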
+static inline bool blk_integrity_rq(struct request *rq)
+{
+       return rq->cmd_flags & REQ_INTEGRITY;
+}
+
+/*
+ * Return the first bvec that contains integrity data.  Only drivers that are
+ * limited to a single integrity segment should use this helper.
+ */
+static inline struct bio_vec *rq_integrity_vec(struct request *rq)
+{
+       if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
+               return NULL;
+       return rq->bio->bi_integrity->bip_vec;
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline int blk_rq_count_integrity_sg(struct request_queue *q,
+                                           struct bio *b)
+{
+       return 0;
+}
+static inline int blk_rq_map_integrity_sg(struct request_queue *q,
+                                         struct bio *b,
+                                         struct scatterlist *s)
+{
+       return 0;
+}
+static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
+{
+       return NULL;
+}
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+       return NULL;
+}
+static inline bool
+blk_integrity_queue_supports_integrity(struct request_queue *q)
+{
+       return false;
+}
+static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
+{
+       return 0;
+}
+static inline void blk_integrity_register(struct gendisk *d,
+                                        struct blk_integrity *b)
+{
+}
+static inline void blk_integrity_unregister(struct gendisk *d)
+{
+}
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+                                                   unsigned int segs)
+{
+}
+static inline unsigned short
+queue_max_integrity_segments(const struct request_queue *q)
+{
+       return 0;
+}
+
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+                                                  unsigned int sectors)
+{
+       return 0;
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+                                              unsigned int sectors)
+{
+       return 0;
+}
+static inline int blk_integrity_rq(struct request *rq)
+{
+       return 0;
+}
+
+static inline struct bio_vec *rq_integrity_vec(struct request *rq)
+{
+       return NULL;
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+#endif /* _LINUX_BLK_INTEGRITY_H */
index 13ba186..8682663 100644 (file)
 #include <linux/sbitmap.h>
 #include <linux/srcu.h>
 #include <linux/lockdep.h>
+#include <linux/scatterlist.h>
+#include <linux/prefetch.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
 
+#define BLKDEV_MIN_RQ  4
+#define BLKDEV_DEFAULT_RQ      128
+
+typedef void (rq_end_io_fn)(struct request *, blk_status_t);
+
+/*
+ * request flags */
+typedef __u32 __bitwise req_flags_t;
+
+/* drive already may have started this one */
+#define RQF_STARTED            ((__force req_flags_t)(1 << 1))
+/* may not be passed by ioscheduler */
+#define RQF_SOFTBARRIER                ((__force req_flags_t)(1 << 3))
+/* request for flush sequence */
+#define RQF_FLUSH_SEQ          ((__force req_flags_t)(1 << 4))
+/* merge of different types, fail separately */
+#define RQF_MIXED_MERGE                ((__force req_flags_t)(1 << 5))
+/* track inflight for MQ */
+#define RQF_MQ_INFLIGHT                ((__force req_flags_t)(1 << 6))
+/* don't call prep for this one */
+#define RQF_DONTPREP           ((__force req_flags_t)(1 << 7))
+/* vaguely specified driver internal error.  Ignored by the block layer */
+#define RQF_FAILED             ((__force req_flags_t)(1 << 10))
+/* don't warn about errors */
+#define RQF_QUIET              ((__force req_flags_t)(1 << 11))
+/* elevator private data attached */
+#define RQF_ELVPRIV            ((__force req_flags_t)(1 << 12))
+/* account into disk and partition IO statistics */
+#define RQF_IO_STAT            ((__force req_flags_t)(1 << 13))
+/* runtime pm request */
+#define RQF_PM                 ((__force req_flags_t)(1 << 15))
+/* on IO scheduler merge hash */
+#define RQF_HASHED             ((__force req_flags_t)(1 << 16))
+/* track IO completion time */
+#define RQF_STATS              ((__force req_flags_t)(1 << 17))
+/* Look at ->special_vec for the actual data payload instead of the
+   bio chain. */
+#define RQF_SPECIAL_PAYLOAD    ((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
+/* already slept for hybrid poll */
+#define RQF_MQ_POLL_SLEPT      ((__force req_flags_t)(1 << 20))
+/* ->timeout has been called, don't expire again */
+#define RQF_TIMED_OUT          ((__force req_flags_t)(1 << 21))
+/* queue has elevator attached */
+#define RQF_ELV                        ((__force req_flags_t)(1 << 22))
+
+/* flags that prevent us from merging requests: */
+#define RQF_NOMERGE_FLAGS \
+       (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+
+enum mq_rq_state {
+       MQ_RQ_IDLE              = 0,
+       MQ_RQ_IN_FLIGHT         = 1,
+       MQ_RQ_COMPLETE          = 2,
+};
+
+/*
+ * Try to put the fields that are referenced together in the same cacheline.
+ *
+ * If you modify this structure, make sure to update blk_rq_init() and
+ * especially blk_mq_rq_ctx_init() to take care of the added fields.
+ */
+struct request {
+       struct request_queue *q;
+       struct blk_mq_ctx *mq_ctx;
+       struct blk_mq_hw_ctx *mq_hctx;
+
+       unsigned int cmd_flags;         /* op and common flags */
+       req_flags_t rq_flags;
+
+       int tag;
+       int internal_tag;
+
+       unsigned int timeout;
+
+       /* the following two fields are internal, NEVER access directly */
+       unsigned int __data_len;        /* total data len */
+       sector_t __sector;              /* sector cursor */
+
+       struct bio *bio;
+       struct bio *biotail;
+
+       union {
+               struct list_head queuelist;
+               struct request *rq_next;
+       };
+
+       struct gendisk *rq_disk;
+       struct block_device *part;
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+       /* Time that the first bio started allocating this request. */
+       u64 alloc_time_ns;
+#endif
+       /* Time that this request was allocated for this IO. */
+       u64 start_time_ns;
+       /* Time that I/O was submitted to the device. */
+       u64 io_start_time_ns;
+
+#ifdef CONFIG_BLK_WBT
+       unsigned short wbt_flags;
+#endif
+       /*
+        * rq sectors used for blk stats. It has the same value
+        * rq sectors used for blk stats. It has the same value as
+        * blk_rq_sectors(rq), except that it is never zeroed
+        */
+       unsigned short stats_sectors;
+
+       /*
+        * Number of scatter-gather DMA addr+len pairs after
+        * physical address coalescing is performed.
+        */
+       unsigned short nr_phys_segments;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+       unsigned short nr_integrity_segments;
+#endif
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+       struct bio_crypt_ctx *crypt_ctx;
+       struct blk_crypto_keyslot *crypt_keyslot;
+#endif
+
+       unsigned short write_hint;
+       unsigned short ioprio;
+
+       enum mq_rq_state state;
+       refcount_t ref;
+
+       unsigned long deadline;
+
+       /*
+        * The hash is used inside the scheduler, and killed once the
+        * request reaches the dispatch list. The ipi_list is only used
+        * to queue the request for softirq completion, which is long
+        * after the request has been unhashed (and even removed from
+        * the dispatch list).
+        */
+       union {
+               struct hlist_node hash; /* merge hash */
+               struct llist_node ipi_list;
+       };
+
+       /*
+        * The rb_node is only used inside the io scheduler, requests
+        * are pruned when moved to the dispatch queue. So let the
+        * completion_data share space with the rb_node.
+        */
+       union {
+               struct rb_node rb_node; /* sort/lookup */
+               struct bio_vec special_vec;
+               void *completion_data;
+               int error_count; /* for legacy drivers, don't use */
+       };
+
+
+       /*
+        * Three pointers are available for the IO schedulers, if they need
+        * more they have to dynamically allocate it.  Flush requests are
+        * never put on the IO scheduler. So let the flush fields share
+        * space with the elevator data.
+        */
+       union {
+               struct {
+                       struct io_cq            *icq;
+                       void                    *priv[2];
+               } elv;
+
+               struct {
+                       unsigned int            seq;
+                       struct list_head        list;
+                       rq_end_io_fn            *saved_end_io;
+               } flush;
+       };
+
+       union {
+               struct __call_single_data csd;
+               u64 fifo_time;
+       };
+
+       /*
+        * completion callback.
+        */
+       rq_end_io_fn *end_io;
+       void *end_io_data;
+};
+
+#define req_op(req) \
+       ((req)->cmd_flags & REQ_OP_MASK)
+
+static inline bool blk_rq_is_passthrough(struct request *rq)
+{
+       return blk_op_is_passthrough(req_op(rq));
+}
+
+static inline unsigned short req_get_ioprio(struct request *req)
+{
+       return req->ioprio;
+}
+
+#define rq_data_dir(rq)                (op_is_write(req_op(rq)) ? WRITE : READ)
+
+#define rq_dma_dir(rq) \
+       (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
+
+enum blk_eh_timer_return {
+       BLK_EH_DONE,            /* driver has completed the command */
+       BLK_EH_RESET_TIMER,     /* reset timer and try again */
+};
+
+#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
+#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
+
 /**
  * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
  * block device
@@ -126,9 +342,6 @@ struct blk_mq_hw_ctx {
        unsigned long           queued;
        /** @run: Number of dispatched requests. */
        unsigned long           run;
-#define BLK_MQ_MAX_DISPATCH_ORDER      7
-       /** @dispatched: Number of dispatch requests by queue. */
-       unsigned long           dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
        /** @numa_node: NUMA node the storage adapter has been connected to. */
        unsigned int            numa_node;
@@ -148,13 +361,6 @@ struct blk_mq_hw_ctx {
        /** @kobj: Kernel object for sysfs. */
        struct kobject          kobj;
 
-       /** @poll_considered: Count times blk_poll() was called. */
-       unsigned long           poll_considered;
-       /** @poll_invoked: Count how many requests blk_poll() polled. */
-       unsigned long           poll_invoked;
-       /** @poll_success: Count how many polled requests were completed. */
-       unsigned long           poll_success;
-
 #ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @debugfs_dir: debugfs directory for this hardware queue. Named
@@ -232,13 +438,11 @@ enum hctx_type {
  * @flags:        Zero or more BLK_MQ_F_* flags.
  * @driver_data:   Pointer to data owned by the block driver that created this
  *                tag set.
- * @active_queues_shared_sbitmap:
- *                number of active request queues per tag set.
- * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
- * @__breserved_tags:
- *                A shared reserved tags sbitmap, used over all hctx's
  * @tags:         Tag sets. One tag set per hardware queue. Has @nr_hw_queues
  *                elements.
+ * @shared_tags:
+ *                Shared set of tags. Has @nr_hw_queues elements. If set,
+ *                shared by all @tags.
  * @tag_list_lock: Serializes tag_list accesses.
  * @tag_list:     List of the request queues that use this tag set. See also
  *                request_queue.tag_set_list.
@@ -255,12 +459,11 @@ struct blk_mq_tag_set {
        unsigned int            timeout;
        unsigned int            flags;
        void                    *driver_data;
-       atomic_t                active_queues_shared_sbitmap;
 
-       struct sbitmap_queue    __bitmap_tags;
-       struct sbitmap_queue    __breserved_tags;
        struct blk_mq_tags      **tags;
 
+       struct blk_mq_tags      *shared_tags;
+
        struct mutex            tag_list_lock;
        struct list_head        tag_list;
 };
@@ -330,7 +533,7 @@ struct blk_mq_ops {
        /**
         * @poll: Called to poll for completion of a specific tag.
         */
-       int (*poll)(struct blk_mq_hw_ctx *);
+       int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);
 
        /**
         * @complete: Mark the request as complete.
@@ -364,11 +567,6 @@ struct blk_mq_ops {
                             unsigned int);
 
        /**
-        * @initialize_rq_fn: Called from inside blk_get_request().
-        */
-       void (*initialize_rq_fn)(struct request *rq);
-
-       /**
         * @cleanup_rq: Called before freeing one request which isn't completed
         * yet, and usually for freeing the driver private data.
         */
@@ -432,6 +630,8 @@ enum {
        ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
                << BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
+#define BLK_MQ_NO_HCTX_IDX     (-1U)
+
 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
                struct lock_class_key *lkclass);
 #define blk_mq_alloc_disk(set, queuedata)                              \
@@ -451,8 +651,6 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
                unsigned int set_flags);
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
-void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
-
 void blk_mq_free_request(struct request *rq);
 
 bool blk_mq_queue_inflight(struct request_queue *q);
@@ -471,7 +669,40 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                unsigned int op, blk_mq_req_flags_t flags,
                unsigned int hctx_idx);
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+       unsigned int nr_tags;
+       unsigned int nr_reserved_tags;
+
+       atomic_t active_queues;
+
+       struct sbitmap_queue bitmap_tags;
+       struct sbitmap_queue breserved_tags;
+
+       struct request **rqs;
+       struct request **static_rqs;
+       struct list_head page_list;
+
+       /*
+        * used to clear request reference in rqs[] before freeing one
+        * request pool
+        */
+       spinlock_t lock;
+};
+
+static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
+                                              unsigned int tag)
+{
+       if (tag < tags->nr_tags) {
+               prefetch(tags->rqs[tag]);
+               return tags->rqs[tag];
+       }
+
+       return NULL;
+}
 
 enum {
        BLK_MQ_UNIQUE_TAG_BITS = 16,
@@ -524,6 +755,35 @@ static inline void blk_mq_set_request_complete(struct request *rq)
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
+void blk_mq_end_request_batch(struct io_comp_batch *ib);
+
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
+{
+       return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
+}
+
+/*
+ * Batched completions only work when there is no I/O error and no special
+ * ->end_io handler.
+ */
+static inline bool blk_mq_add_to_batch(struct request *req,
+                                      struct io_comp_batch *iob, int ioerror,
+                                      void (*complete)(struct io_comp_batch *))
+{
+       if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
+               return false;
+       if (!iob->complete)
+               iob->complete = complete;
+       else if (iob->complete != complete)
+               return false;
+       iob->need_ts |= blk_mq_need_time_stamp(req);
+       rq_list_add(&iob->req_list, req);
+       return true;
+}
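/*
 * Editorial sketch, not part of this patch: how a driver's ->poll handler
 * might use the batching helpers above.  foo_poll_one_completion() and the
 * per-request teardown are hypothetical stand-ins for driver specifics.
 */
static struct request *foo_poll_one_completion(struct blk_mq_hw_ctx *hctx)
{
	/* A real driver would consume its hardware completion queue here. */
	return NULL;
}

static void foo_complete_batch(struct io_comp_batch *iob)
{
	/* driver-private per-request teardown would go here, then: */
	blk_mq_end_request_batch(iob);
}

static int foo_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct request *req;
	int found = 0;

	while ((req = foo_poll_one_completion(hctx)) != NULL) {
		found++;
		/* Fall back to one-by-one completion when batching is refused. */
		if (!blk_mq_add_to_batch(req, iob, 0, foo_complete_batch))
			blk_mq_end_request(req, BLK_STS_OK);
	}
	return found;
}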
 
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
@@ -605,16 +865,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
        for ((i) = 0; (i) < (hctx)->nr_ctx &&                           \
             ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
 
-static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
-               struct request *rq)
-{
-       if (rq->tag != -1)
-               return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
-
-       return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
-                       BLK_QC_T_INTERNAL;
-}
-
 static inline void blk_mq_cleanup_rq(struct request *rq)
 {
        if (rq->q->mq_ops->cleanup_rq)
@@ -633,8 +883,265 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
                rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
-blk_qc_t blk_mq_submit_bio(struct bio *bio);
 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
                struct lock_class_key *key);
 
+static inline bool rq_is_sync(struct request *rq)
+{
+       return op_is_sync(rq->cmd_flags);
+}
+
+void blk_rq_init(struct request_queue *q, struct request *rq);
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+               struct bio_set *bs, gfp_t gfp_mask,
+               int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
+void blk_rq_unprep_clone(struct request *rq);
+blk_status_t blk_insert_cloned_request(struct request_queue *q,
+               struct request *rq);
+
+struct rq_map_data {
+       struct page **pages;
+       int page_order;
+       int nr_entries;
+       unsigned long offset;
+       int null_mapped;
+       int from_user;
+};
+
+int blk_rq_map_user(struct request_queue *, struct request *,
+               struct rq_map_data *, void __user *, unsigned long, gfp_t);
+int blk_rq_map_user_iov(struct request_queue *, struct request *,
+               struct rq_map_data *, const struct iov_iter *, gfp_t);
+int blk_rq_unmap_user(struct bio *);
+int blk_rq_map_kern(struct request_queue *, struct request *, void *,
+               unsigned int, gfp_t);
+int blk_rq_append_bio(struct request *rq, struct bio *bio);
+void blk_execute_rq_nowait(struct gendisk *, struct request *, int,
+               rq_end_io_fn *);
+blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
+               int at_head);
+
+struct req_iterator {
+       struct bvec_iter iter;
+       struct bio *bio;
+};
+
+#define __rq_for_each_bio(_bio, rq)    \
+       if ((rq->bio))                  \
+               for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
+
+#define rq_for_each_segment(bvl, _rq, _iter)                   \
+       __rq_for_each_bio(_iter.bio, _rq)                       \
+               bio_for_each_segment(bvl, _iter.bio, _iter.iter)
+
+#define rq_for_each_bvec(bvl, _rq, _iter)                      \
+       __rq_for_each_bio(_iter.bio, _rq)                       \
+               bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
+
+#define rq_iter_last(bvec, _iter)                              \
+               (_iter.bio->bi_next == NULL &&                  \
+                bio_iter_last(bvec, _iter.iter))
+
+/*
+ * blk_rq_pos()                        : the current sector
+ * blk_rq_bytes()              : bytes left in the entire request
+ * blk_rq_cur_bytes()          : bytes left in the current segment
+ * blk_rq_err_bytes()          : bytes left till the next error boundary
+ * blk_rq_sectors()            : sectors left in the entire request
+ * blk_rq_cur_sectors()                : sectors left in the current segment
+ * blk_rq_stats_sectors()      : sectors of the entire request used for stats
+ */
+static inline sector_t blk_rq_pos(const struct request *rq)
+{
+       return rq->__sector;
+}
+
+static inline unsigned int blk_rq_bytes(const struct request *rq)
+{
+       return rq->__data_len;
+}
+
+static inline int blk_rq_cur_bytes(const struct request *rq)
+{
+       if (!rq->bio)
+               return 0;
+       if (!bio_has_data(rq->bio))     /* dataless requests such as discard */
+               return rq->bio->bi_iter.bi_size;
+       return bio_iovec(rq->bio).bv_len;
+}
+
+unsigned int blk_rq_err_bytes(const struct request *rq);
+
+static inline unsigned int blk_rq_sectors(const struct request *rq)
+{
+       return blk_rq_bytes(rq) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
+{
+       return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
+{
+       return rq->stats_sectors;
+}
+
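/*
 * Editorial sketch, not part of this patch: typical use of the accessors
 * above when a driver translates a request into a hardware command.  The
 * foo_cmd layout is hypothetical.
 */
struct foo_cmd {
	u64 lba;
	u32 nr_sectors;
};

static void foo_setup_rw_cmd(struct request *rq, struct foo_cmd *cmd)
{
	cmd->lba = blk_rq_pos(rq);		/* current 512-byte sector */
	cmd->nr_sectors = blk_rq_sectors(rq);	/* sectors left in the request */
}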
+/*
+ * Some commands like WRITE SAME have a payload or data transfer size which
+ * is different from the size of the request.  Any driver that supports such
+ * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
+ * calculate the data transfer size.
+ */
+static inline unsigned int blk_rq_payload_bytes(struct request *rq)
+{
+       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+               return rq->special_vec.bv_len;
+       return blk_rq_bytes(rq);
+}
+
+/*
+ * Return the first full biovec in the request.  The caller needs to check that
+ * there are any bvecs before calling this helper.
+ */
+static inline struct bio_vec req_bvec(struct request *rq)
+{
+       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+               return rq->special_vec;
+       return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
+}
+
+static inline unsigned int blk_rq_count_bios(struct request *rq)
+{
+       unsigned int nr_bios = 0;
+       struct bio *bio;
+
+       __rq_for_each_bio(bio, rq)
+               nr_bios++;
+
+       return nr_bios;
+}
+
+void blk_steal_bios(struct bio_list *list, struct request *rq);
+
+/*
+ * Request completion related functions.
+ *
+ * blk_update_request() completes the given number of bytes and updates
+ * the request without completing it.
+ */
+bool blk_update_request(struct request *rq, blk_status_t error,
+                              unsigned int nr_bytes);
+void blk_abort_request(struct request *);
+
+/*
+ * Number of physical segments as sent to the device.
+ *
+ * Normally this is the number of discontiguous data segments sent by the
+ * submitter.  But for a data-less command like discard we might have no
+ * actual data segments submitted, yet the driver might have to add its
+ * own special payload.  In that case we still return 1 here so that this
+ * special payload will be mapped.
+ */
+static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
+{
+       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+               return 1;
+       return rq->nr_phys_segments;
+}
+
+/*
+ * Number of discard segments (or ranges) the driver needs to fill in.
+ * Each discard bio merged into a request is counted as one segment.
+ */
+static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
+{
+       return max_t(unsigned short, rq->nr_phys_segments, 1);
+}
+
+int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
+               struct scatterlist *sglist, struct scatterlist **last_sg);
+static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
+               struct scatterlist *sglist)
+{
+       struct scatterlist *last_sg = NULL;
+
+       return __blk_rq_map_sg(q, rq, sglist, &last_sg);
+}
+void blk_dump_rq_flags(struct request *, char *);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline unsigned int blk_rq_zone_no(struct request *rq)
+{
+       return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+}
+
+static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+{
+       return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+}
+
+bool blk_req_needs_zone_write_lock(struct request *rq);
+bool blk_req_zone_write_trylock(struct request *rq);
+void __blk_req_zone_write_lock(struct request *rq);
+void __blk_req_zone_write_unlock(struct request *rq);
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+       if (blk_req_needs_zone_write_lock(rq))
+               __blk_req_zone_write_lock(rq);
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+       if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+               __blk_req_zone_write_unlock(rq);
+}
+
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+       return rq->q->seq_zones_wlock &&
+               test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+       if (!blk_req_needs_zone_write_lock(rq))
+               return true;
+       return !blk_req_zone_is_write_locked(rq);
+}
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+       return false;
+}
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+}
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+       return false;
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+       return true;
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
+
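/*
 * Editorial sketch, not part of this patch: how a dispatch/completion pair
 * might use the zone write lock helpers above to keep writes to a
 * sequential zone ordered.  Hardware submission is elided.
 */
static blk_status_t foo_queue_rq(struct request *rq)
{
	/* Let the scheduler retry later if the target zone is write-locked. */
	if (!blk_req_can_dispatch_to_zone(rq))
		return BLK_STS_RESOURCE;

	blk_req_zone_write_lock(rq);
	/* issue rq to hardware here */
	return BLK_STS_OK;
}

static void foo_complete_rq(struct request *rq)
{
	blk_req_zone_write_unlock(rq);
	blk_mq_end_request(rq, BLK_STS_OK);
}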
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+# error        "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
 #endif
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void rq_flush_dcache_pages(struct request *rq);
+#else
+static inline void rq_flush_dcache_pages(struct request *rq)
+{
+}
+#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
+#endif /* BLK_MQ_H */
index be622b5..fe065c3 100644 (file)
@@ -20,8 +20,26 @@ struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
 struct bio_crypt_ctx;
 
+/*
+ * The basic unit of block I/O is a sector. It is used in a number of contexts
+ * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9
+ * bytes. Variables of type sector_t represent an offset or size that is a
+ * multiple of 512 bytes. Hence these two constants.
+ */
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+#ifndef SECTOR_SIZE
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+#endif
+
+#define PAGE_SECTORS_SHIFT     (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_MASK            (PAGE_SECTORS - 1)
+
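/*
 * Editorial worked example, not part of this patch: with SECTOR_SHIFT = 9
 * and 4 KiB pages, PAGE_SECTORS_SHIFT = 12 - 9 = 3, PAGE_SECTORS = 8 and
 * SECTOR_MASK = 7; a 1 MiB transfer therefore covers
 * 1048576 >> SECTOR_SHIFT = 2048 sectors.
 */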
 struct block_device {
        sector_t                bd_start_sect;
+       sector_t                bd_nr_sectors;
        struct disk_stats __percpu *bd_stats;
        unsigned long           bd_stamp;
        bool                    bd_read_only;   /* read-only policy */
@@ -38,6 +56,7 @@ struct block_device {
        u8                      bd_partno;
        spinlock_t              bd_size_lock; /* for bd_inode->i_size updates */
        struct gendisk *        bd_disk;
+       struct request_queue *  bd_queue;
 
        /* The counter of freeze processes */
        int                     bd_fsfreeze_count;
@@ -208,6 +227,9 @@ static inline void bio_issue_init(struct bio_issue *issue,
                        ((u64)size << BIO_ISSUE_SIZE_SHIFT));
 }
 
+typedef unsigned int blk_qc_t;
+#define BLK_QC_T_NONE          -1U
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -227,8 +249,8 @@ struct bio {
 
        struct bvec_iter        bi_iter;
 
+       blk_qc_t                bi_cookie;
        bio_end_io_t            *bi_end_io;
-
        void                    *bi_private;
 #ifdef CONFIG_BLK_CGROUP
        /*
@@ -384,7 +406,7 @@ enum req_flag_bits {
        /* command specific flags for REQ_OP_WRITE_ZEROES: */
        __REQ_NOUNMAP,          /* do not free blocks when zeroing */
 
-       __REQ_HIPRI,
+       __REQ_POLLED,           /* caller polls for completion using bio_poll */
 
        /* for driver use */
        __REQ_DRV,
@@ -409,7 +431,7 @@ enum req_flag_bits {
 #define REQ_CGROUP_PUNT                (1ULL << __REQ_CGROUP_PUNT)
 
 #define REQ_NOUNMAP            (1ULL << __REQ_NOUNMAP)
-#define REQ_HIPRI              (1ULL << __REQ_HIPRI)
+#define REQ_POLLED             (1ULL << __REQ_POLLED)
 
 #define REQ_DRV                        (1ULL << __REQ_DRV)
 #define REQ_SWAP               (1ULL << __REQ_SWAP)
@@ -431,8 +453,6 @@ enum stat_group {
 
 #define bio_op(bio) \
        ((bio)->bi_opf & REQ_OP_MASK)
-#define req_op(req) \
-       ((req)->cmd_flags & REQ_OP_MASK)
 
 /* obsolete, don't use in new code */
 static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
@@ -497,31 +517,6 @@ static inline int op_stat_group(unsigned int op)
        return op_is_write(op);
 }
 
-typedef unsigned int blk_qc_t;
-#define BLK_QC_T_NONE          -1U
-#define BLK_QC_T_SHIFT         16
-#define BLK_QC_T_INTERNAL      (1U << 31)
-
-static inline bool blk_qc_t_valid(blk_qc_t cookie)
-{
-       return cookie != BLK_QC_T_NONE;
-}
-
-static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
-{
-       return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
-}
-
-static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
-{
-       return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
-}
-
-static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
-{
-       return (cookie & BLK_QC_T_INTERNAL) != 0;
-}
-
 struct blk_rq_stat {
        u64 mean;
        u64 min;
index 12b9dbc..bd4370b 100644 (file)
@@ -3,8 +3,6 @@
 #define _LINUX_BLKDEV_H
 
 #include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/major.h>
 #include <linux/genhd.h>
 #include <linux/list.h>
 #include <linux/llist.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/wait.h>
-#include <linux/mempool.h>
-#include <linux/pfn.h>
 #include <linux/bio.h>
-#include <linux/stringify.h>
 #include <linux/gfp.h>
-#include <linux/smp.h>
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
-#include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
-#include <linux/pm.h>
 #include <linux/sbitmap.h>
 
 struct module;
@@ -33,14 +25,12 @@ struct request;
 struct sg_io_hdr;
 struct blkcg_gq;
 struct blk_flush_queue;
+struct kiocb;
 struct pr_ops;
 struct rq_qos;
 struct blk_queue_stats;
 struct blk_stat_callback;
-struct blk_keyslot_manager;
-
-#define BLKDEV_MIN_RQ  4
-#define BLKDEV_MAX_RQ  128     /* Default maximum */
+struct blk_crypto_profile;
 
 /* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
@@ -54,186 +44,13 @@ struct blk_keyslot_manager;
  */
 #define BLKCG_MAX_POLS         6
 
-typedef void (rq_end_io_fn)(struct request *, blk_status_t);
-
-/*
- * request flags */
-typedef __u32 __bitwise req_flags_t;
-
-/* drive already may have started this one */
-#define RQF_STARTED            ((__force req_flags_t)(1 << 1))
-/* may not be passed by ioscheduler */
-#define RQF_SOFTBARRIER                ((__force req_flags_t)(1 << 3))
-/* request for flush sequence */
-#define RQF_FLUSH_SEQ          ((__force req_flags_t)(1 << 4))
-/* merge of different types, fail separately */
-#define RQF_MIXED_MERGE                ((__force req_flags_t)(1 << 5))
-/* track inflight for MQ */
-#define RQF_MQ_INFLIGHT                ((__force req_flags_t)(1 << 6))
-/* don't call prep for this one */
-#define RQF_DONTPREP           ((__force req_flags_t)(1 << 7))
-/* vaguely specified driver internal error.  Ignored by the block layer */
-#define RQF_FAILED             ((__force req_flags_t)(1 << 10))
-/* don't warn about errors */
-#define RQF_QUIET              ((__force req_flags_t)(1 << 11))
-/* elevator private data attached */
-#define RQF_ELVPRIV            ((__force req_flags_t)(1 << 12))
-/* account into disk and partition IO statistics */
-#define RQF_IO_STAT            ((__force req_flags_t)(1 << 13))
-/* runtime pm request */
-#define RQF_PM                 ((__force req_flags_t)(1 << 15))
-/* on IO scheduler merge hash */
-#define RQF_HASHED             ((__force req_flags_t)(1 << 16))
-/* track IO completion time */
-#define RQF_STATS              ((__force req_flags_t)(1 << 17))
-/* Look at ->special_vec for the actual data payload instead of the
-   bio chain. */
-#define RQF_SPECIAL_PAYLOAD    ((__force req_flags_t)(1 << 18))
-/* The per-zone write lock is held for this request */
-#define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT      ((__force req_flags_t)(1 << 20))
-/* ->timeout has been called, don't expire again */
-#define RQF_TIMED_OUT          ((__force req_flags_t)(1 << 21))
-
-/* flags that prevent us from merging requests: */
-#define RQF_NOMERGE_FLAGS \
-       (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
-
-/*
- * Request state for blk-mq.
- */
-enum mq_rq_state {
-       MQ_RQ_IDLE              = 0,
-       MQ_RQ_IN_FLIGHT         = 1,
-       MQ_RQ_COMPLETE          = 2,
-};
-
-/*
- * Try to put the fields that are referenced together in the same cacheline.
- *
- * If you modify this structure, make sure to update blk_rq_init() and
- * especially blk_mq_rq_ctx_init() to take care of the added fields.
- */
-struct request {
-       struct request_queue *q;
-       struct blk_mq_ctx *mq_ctx;
-       struct blk_mq_hw_ctx *mq_hctx;
-
-       unsigned int cmd_flags;         /* op and common flags */
-       req_flags_t rq_flags;
-
-       int tag;
-       int internal_tag;
-
-       /* the following two fields are internal, NEVER access directly */
-       unsigned int __data_len;        /* total data len */
-       sector_t __sector;              /* sector cursor */
-
-       struct bio *bio;
-       struct bio *biotail;
-
-       struct list_head queuelist;
-
-       /*
-        * The hash is used inside the scheduler, and killed once the
-        * request reaches the dispatch list. The ipi_list is only used
-        * to queue the request for softirq completion, which is long
-        * after the request has been unhashed (and even removed from
-        * the dispatch list).
-        */
-       union {
-               struct hlist_node hash; /* merge hash */
-               struct llist_node ipi_list;
-       };
-
-       /*
-        * The rb_node is only used inside the io scheduler, requests
-        * are pruned when moved to the dispatch queue. So let the
-        * completion_data share space with the rb_node.
-        */
-       union {
-               struct rb_node rb_node; /* sort/lookup */
-               struct bio_vec special_vec;
-               void *completion_data;
-               int error_count; /* for legacy drivers, don't use */
-       };
-
-       /*
-        * Three pointers are available for the IO schedulers, if they need
-        * more they have to dynamically allocate it.  Flush requests are
-        * never put on the IO scheduler. So let the flush fields share
-        * space with the elevator data.
-        */
-       union {
-               struct {
-                       struct io_cq            *icq;
-                       void                    *priv[2];
-               } elv;
-
-               struct {
-                       unsigned int            seq;
-                       struct list_head        list;
-                       rq_end_io_fn            *saved_end_io;
-               } flush;
-       };
-
-       struct gendisk *rq_disk;
-       struct block_device *part;
-#ifdef CONFIG_BLK_RQ_ALLOC_TIME
-       /* Time that the first bio started allocating this request. */
-       u64 alloc_time_ns;
-#endif
-       /* Time that this request was allocated for this IO. */
-       u64 start_time_ns;
-       /* Time that I/O was submitted to the device. */
-       u64 io_start_time_ns;
-
-#ifdef CONFIG_BLK_WBT
-       unsigned short wbt_flags;
-#endif
-       /*
-        * rq sectors used for blk stats. It has the same value
-        * with blk_rq_sectors(rq), except that it never be zeroed
-        * by completion.
-        */
-       unsigned short stats_sectors;
-
-       /*
-        * Number of scatter-gather DMA addr+len pairs after
-        * physical address coalescing is performed.
-        */
-       unsigned short nr_phys_segments;
-
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-       unsigned short nr_integrity_segments;
-#endif
-
-#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-       struct bio_crypt_ctx *crypt_ctx;
-       struct blk_ksm_keyslot *crypt_keyslot;
-#endif
-
-       unsigned short write_hint;
-       unsigned short ioprio;
-
-       enum mq_rq_state state;
-       refcount_t ref;
-
-       unsigned int timeout;
-       unsigned long deadline;
-
-       union {
-               struct __call_single_data csd;
-               u64 fifo_time;
-       };
+static inline int blk_validate_block_size(unsigned int bsize)
+{
+       if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
+               return -EINVAL;
 
-       /*
-        * completion callback.
-        */
-       rq_end_io_fn *end_io;
-       void *end_io_data;
-};
+       return 0;
+}
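
blk_validate_block_size() gives drivers one place to reject block sizes that are below 512 bytes, above PAGE_SIZE, or not a power of two. A minimal sketch of how a driver might use it when accepting a user-configured block size; struct my_dev and my_set_block_size() are hypothetical names, only the two blk_* calls are real kernel API:

struct my_dev {                                 /* hypothetical driver state */
        struct gendisk *disk;
};

static int my_set_block_size(struct my_dev *dev, unsigned int bsize)
{
        int err;

        /* rejects bsize < 512, bsize > PAGE_SIZE and non-power-of-two sizes */
        err = blk_validate_block_size(bsize);
        if (err)
                return err;

        blk_queue_logical_block_size(dev->disk->queue, bsize);
        return 0;
}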
 
 static inline bool blk_op_is_passthrough(unsigned int op)
 {
@@ -241,35 +58,6 @@ static inline bool blk_op_is_passthrough(unsigned int op)
        return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
 }
 
-static inline bool blk_rq_is_passthrough(struct request *rq)
-{
-       return blk_op_is_passthrough(req_op(rq));
-}
-
-static inline unsigned short req_get_ioprio(struct request *req)
-{
-       return req->ioprio;
-}
-
-#include <linux/elevator.h>
-
-struct blk_queue_ctx;
-
-struct bio_vec;
-
-enum blk_eh_timer_return {
-       BLK_EH_DONE,            /* drivers has completed the command */
-       BLK_EH_RESET_TIMER,     /* reset timer and try again */
-};
-
-enum blk_queue_state {
-       Queue_down,
-       Queue_up,
-};
-
-#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
-#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
-
 /*
  * Zoned block device models (zoned limit).
  *
@@ -370,6 +158,34 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
 
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+/*
+ * Independent access ranges: struct blk_independent_access_range describes
+ * a range of contiguous sectors that can be accessed using device command
+ * execution resources that are independent from the resources used for
+ * other access ranges. This is typically found with single-LUN multi-actuator
+ * HDDs where each access range is served by a different set of heads.
+ * The set of independent ranges supported by the device is defined using
+ * struct blk_independent_access_ranges. The independent ranges must not overlap
+ * and must include all sectors within the disk capacity (no sector holes
+ * allowed).
+ * For a device with multiple ranges, requests targeting sectors in different
+ * ranges can be executed in parallel. A request can straddle an access range
+ * boundary.
+ */
+struct blk_independent_access_range {
+       struct kobject          kobj;
+       struct request_queue    *queue;
+       sector_t                sector;
+       sector_t                nr_sectors;
+};
+
+struct blk_independent_access_ranges {
+       struct kobject                          kobj;
+       bool                                    sysfs_registered;
+       unsigned int                            nr_ia_ranges;
+       struct blk_independent_access_range     ia_range[];
+};
+
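
Independent access ranges are exposed to drivers through disk_alloc_independent_access_ranges() and disk_set_independent_access_ranges(), declared later in this header. A sketch of a hypothetical dual-actuator driver splitting its capacity into two non-overlapping ranges; my_register_access_ranges() is a made-up name:

/* Sketch: advertise two non-overlapping ranges covering the whole disk. */
static int my_register_access_ranges(struct gendisk *disk, sector_t capacity)
{
        struct blk_independent_access_ranges *iars;

        iars = disk_alloc_independent_access_ranges(disk, 2);
        if (!iars)
                return -ENOMEM;

        iars->ia_range[0].sector = 0;
        iars->ia_range[0].nr_sectors = capacity / 2;
        iars->ia_range[1].sector = capacity / 2;
        iars->ia_range[1].nr_sectors = capacity - capacity / 2;

        disk_set_independent_access_ranges(disk, iars);
        return 0;
}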
 struct request_queue {
        struct request          *last_merge;
        struct elevator_queue   *elevator;
@@ -444,8 +260,7 @@ struct request_queue {
        unsigned int            dma_alignment;
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-       /* Inline crypto capabilities */
-       struct blk_keyslot_manager *ksm;
+       struct blk_crypto_profile *crypto_profile;
 #endif
 
        unsigned int            rq_timeout;
@@ -457,10 +272,9 @@ struct request_queue {
        struct timer_list       timeout;
        struct work_struct      timeout_work;
 
-       atomic_t                nr_active_requests_shared_sbitmap;
+       atomic_t                nr_active_requests_shared_tags;
 
-       struct sbitmap_queue    sched_bitmap_tags;
-       struct sbitmap_queue    sched_breserved_tags;
+       struct blk_mq_tags      *sched_shared_tags;
 
        struct list_head        icq_list;
 #ifdef CONFIG_BLK_CGROUP
@@ -536,6 +350,8 @@ struct request_queue {
         */
        struct mutex            mq_freeze_lock;
 
+       int                     quiesce_depth;
+
        struct blk_mq_tag_set   *tag_set;
        struct list_head        tag_set_list;
        struct bio_set          bio_split;
@@ -549,10 +365,14 @@ struct request_queue {
 
        bool                    mq_sysfs_init_done;
 
-       size_t                  cmd_size;
-
 #define BLK_MAX_WRITE_HINTS    5
        u64                     write_hints[BLK_MAX_WRITE_HINTS];
+
+       /*
+        * Independent sector access ranges. This is always NULL for
+        * devices that do not have multiple independent access ranges.
+        */
+       struct blk_independent_access_ranges *ia_ranges;
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
@@ -579,7 +399,6 @@ struct request_queue {
 #define QUEUE_FLAG_STATS       20      /* track IO start and completion times */
 #define QUEUE_FLAG_POLL_STATS  21      /* collecting stats for hybrid polling */
 #define QUEUE_FLAG_REGISTERED  22      /* queue has been registered to a disk */
-#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */
 #define QUEUE_FLAG_QUIESCED    24      /* queue has been quiesced */
 #define QUEUE_FLAG_PCI_P2PDMA  25      /* device supports PCI p2p requests */
 #define QUEUE_FLAG_ZONE_RESETALL 26    /* supports Zone Reset All */
@@ -613,8 +432,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_secure_erase(q) \
        (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
 #define blk_queue_dax(q)       test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
-#define blk_queue_scsi_passthrough(q)  \
-       test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
 #define blk_queue_pci_p2pdma(q)        \
        test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
@@ -638,11 +455,6 @@ extern void blk_clear_pm_only(struct request_queue *q);
 
 #define list_entry_rq(ptr)     list_entry((ptr), struct request, queuelist)
 
-#define rq_data_dir(rq)                (op_is_write(req_op(rq)) ? WRITE : READ)
-
-#define rq_dma_dir(rq) \
-       (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
-
 #define dma_map_bvec(dev, bv, dir, attrs) \
        dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
        (dir), (attrs))
@@ -758,42 +570,6 @@ static inline unsigned int queue_max_active_zones(const struct request_queue *q)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-static inline bool rq_is_sync(struct request *rq)
-{
-       return op_is_sync(rq->cmd_flags);
-}
-
-static inline bool rq_mergeable(struct request *rq)
-{
-       if (blk_rq_is_passthrough(rq))
-               return false;
-
-       if (req_op(rq) == REQ_OP_FLUSH)
-               return false;
-
-       if (req_op(rq) == REQ_OP_WRITE_ZEROES)
-               return false;
-
-       if (req_op(rq) == REQ_OP_ZONE_APPEND)
-               return false;
-
-       if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
-               return false;
-       if (rq->rq_flags & RQF_NOMERGE_FLAGS)
-               return false;
-
-       return true;
-}
-
-static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
-{
-       if (bio_page(a) == bio_page(b) &&
-           bio_offset(a) == bio_offset(b))
-               return true;
-
-       return false;
-}
-
 static inline unsigned int blk_queue_depth(struct request_queue *q)
 {
        if (q->queue_depth)
@@ -808,83 +584,20 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ)
 #define BLK_MIN_SG_TIMEOUT     (7 * HZ)
 
-struct rq_map_data {
-       struct page **pages;
-       int page_order;
-       int nr_entries;
-       unsigned long offset;
-       int null_mapped;
-       int from_user;
-};
-
-struct req_iterator {
-       struct bvec_iter iter;
-       struct bio *bio;
-};
-
 /* This should not be used directly - use rq_for_each_segment */
 #define for_each_bio(_bio)             \
        for (; _bio; _bio = _bio->bi_next)
-#define __rq_for_each_bio(_bio, rq)    \
-       if ((rq->bio))                  \
-               for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
 
-#define rq_for_each_segment(bvl, _rq, _iter)                   \
-       __rq_for_each_bio(_iter.bio, _rq)                       \
-               bio_for_each_segment(bvl, _iter.bio, _iter.iter)
-
-#define rq_for_each_bvec(bvl, _rq, _iter)                      \
-       __rq_for_each_bio(_iter.bio, _rq)                       \
-               bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
-
-#define rq_iter_last(bvec, _iter)                              \
-               (_iter.bio->bi_next == NULL &&                  \
-                bio_iter_last(bvec, _iter.iter))
-
-#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-# error        "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
-#endif
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-extern void rq_flush_dcache_pages(struct request *rq);
-#else
-static inline void rq_flush_dcache_pages(struct request *rq)
-{
-}
-#endif
 
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
-blk_qc_t submit_bio_noacct(struct bio *bio);
-extern void blk_rq_init(struct request_queue *q, struct request *rq);
-extern void blk_put_request(struct request *);
-extern struct request *blk_get_request(struct request_queue *, unsigned int op,
-                                      blk_mq_req_flags_t flags);
+void submit_bio_noacct(struct bio *bio);
+
 extern int blk_lld_busy(struct request_queue *q);
-extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-                            struct bio_set *bs, gfp_t gfp_mask,
-                            int (*bio_ctr)(struct bio *, struct bio *, void *),
-                            void *data);
-extern void blk_rq_unprep_clone(struct request *rq);
-extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
-                                    struct request *rq);
-int blk_rq_append_bio(struct request *rq, struct bio *bio);
 extern void blk_queue_split(struct bio **);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
-extern int blk_rq_map_user(struct request_queue *, struct request *,
-                          struct rq_map_data *, void __user *, unsigned long,
-                          gfp_t);
-extern int blk_rq_unmap_user(struct bio *);
-extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
-extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
-                              struct rq_map_data *, const struct iov_iter *,
-                              gfp_t);
-extern void blk_execute_rq_nowait(struct gendisk *,
-                                 struct request *, int, rq_end_io_fn *);
-
-blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
-                           int at_head);
 
 /* Helper to convert REQ_OP_XXX to its string format XXX */
 extern const char *blk_op_str(unsigned int op);
@@ -892,69 +605,17 @@ extern const char *blk_op_str(unsigned int op);
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
+/* only poll the hardware once, don't continue until a completion was found */
+#define BLK_POLL_ONESHOT               (1 << 0)
+/* do not sleep to wait for the expected completion time */
+#define BLK_POLL_NOSLEEP               (1 << 1)
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+                       unsigned int flags);
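
bio_poll() and iocb_bio_iopoll() replace the old cookie-based blk_poll() interface: polling is now keyed off the bio (or kiocb) itself and can hand completions back through an optional io_comp_batch. A rough sketch of a synchronous caller busy-polling for its own completion; my_wait_for_bio() and the done flag (assumed to be set by the bio's ->bi_end_io) are hypothetical, and real users would also manage task state and sleep when polling finds nothing:

/* 'done' is assumed to be set to true by the bio's ->bi_end_io handler. */
static void my_wait_for_bio(struct bio *bio, bool *done)
{
        while (!READ_ONCE(*done)) {
                /* poll the hardware once; otherwise just relax the CPU */
                if (!bio_poll(bio, NULL, BLK_POLL_ONESHOT))
                        cpu_relax();
        }
}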
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
-       return bdev->bd_disk->queue;    /* this is never NULL */
-}
-
-/*
- * The basic unit of block I/O is a sector. It is used in a number of contexts
- * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9
- * bytes. Variables of type sector_t represent an offset or size that is a
- * multiple of 512 bytes. Hence these two constants.
- */
-#ifndef SECTOR_SHIFT
-#define SECTOR_SHIFT 9
-#endif
-#ifndef SECTOR_SIZE
-#define SECTOR_SIZE (1 << SECTOR_SHIFT)
-#endif
-
-#define PAGE_SECTORS_SHIFT     (PAGE_SHIFT - SECTOR_SHIFT)
-#define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
-#define SECTOR_MASK            (PAGE_SECTORS - 1)
-
-/*
- * blk_rq_pos()                        : the current sector
- * blk_rq_bytes()              : bytes left in the entire request
- * blk_rq_cur_bytes()          : bytes left in the current segment
- * blk_rq_err_bytes()          : bytes left till the next error boundary
- * blk_rq_sectors()            : sectors left in the entire request
- * blk_rq_cur_sectors()                : sectors left in the current segment
- * blk_rq_stats_sectors()      : sectors of the entire request used for stats
- */
-static inline sector_t blk_rq_pos(const struct request *rq)
-{
-       return rq->__sector;
-}
-
-static inline unsigned int blk_rq_bytes(const struct request *rq)
-{
-       return rq->__data_len;
-}
-
-static inline int blk_rq_cur_bytes(const struct request *rq)
-{
-       return rq->bio ? bio_cur_bytes(rq->bio) : 0;
-}
-
-extern unsigned int blk_rq_err_bytes(const struct request *rq);
-
-static inline unsigned int blk_rq_sectors(const struct request *rq)
-{
-       return blk_rq_bytes(rq) >> SECTOR_SHIFT;
-}
-
-static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
-{
-       return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
-}
-
-static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
-{
-       return rq->stats_sectors;
+       return bdev->bd_queue;  /* this is never NULL */
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -973,42 +634,8 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio)
        return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
                                     bio->bi_iter.bi_sector);
 }
-
-static inline unsigned int blk_rq_zone_no(struct request *rq)
-{
-       return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
-}
-
-static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
-{
-       return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
-}
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-/*
- * Some commands like WRITE SAME have a payload or data transfer size which
- * is different from the size of the request.  Any driver that supports such
- * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
- * calculate the data transfer size.
- */
-static inline unsigned int blk_rq_payload_bytes(struct request *rq)
-{
-       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-               return rq->special_vec.bv_len;
-       return blk_rq_bytes(rq);
-}
-
-/*
- * Return the first full biovec in the request.  The caller needs to check that
- * there are any bvecs before calling this helper.
- */
-static inline struct bio_vec req_bvec(struct request *rq)
-{
-       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-               return rq->special_vec;
-       return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
-}
-
 static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
                                                     int op)
 {
@@ -1048,47 +675,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q,
        return min(q->limits.max_sectors, chunk_sectors);
 }
 
-static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
-                                                 sector_t offset)
-{
-       struct request_queue *q = rq->q;
-
-       if (blk_rq_is_passthrough(rq))
-               return q->limits.max_hw_sectors;
-
-       if (!q->limits.chunk_sectors ||
-           req_op(rq) == REQ_OP_DISCARD ||
-           req_op(rq) == REQ_OP_SECURE_ERASE)
-               return blk_queue_get_max_sectors(q, req_op(rq));
-
-       return min(blk_max_size_offset(q, offset, 0),
-                       blk_queue_get_max_sectors(q, req_op(rq)));
-}
-
-static inline unsigned int blk_rq_count_bios(struct request *rq)
-{
-       unsigned int nr_bios = 0;
-       struct bio *bio;
-
-       __rq_for_each_bio(bio, rq)
-               nr_bios++;
-
-       return nr_bios;
-}
-
-void blk_steal_bios(struct bio_list *list, struct request *rq);
-
-/*
- * Request completion related functions.
- *
- * blk_update_request() completes given number of bytes and updates
- * the request without completing it.
- */
-extern bool blk_update_request(struct request *rq, blk_status_t error,
-                              unsigned int nr_bytes);
-
-extern void blk_abort_request(struct request *);
-
 /*
  * Access functions for manipulating queue properties
  */
@@ -1133,46 +719,24 @@ extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
-extern void blk_queue_required_elevator_features(struct request_queue *q,
-                                                unsigned int features);
-extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
-                                             struct device *dev);
 
-/*
- * Number of physical segments as sent to the device.
- *
- * Normally this is the number of discontiguous data segments sent by the
- * submitter.  But for data-less command like discard we might have no
- * actual data segments submitted, but the driver might have to add it's
- * own special payload.  In that case we still return 1 here so that this
- * special payload will be mapped.
- */
-static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
-{
-       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-               return 1;
-       return rq->nr_phys_segments;
-}
+struct blk_independent_access_ranges *
+disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
+void disk_set_independent_access_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *iars);
 
 /*
- * Number of discard segments (or ranges) the driver needs to fill in.
- * Each discard bio merged into a request is counted as one segment.
+ * Elevator features for blk_queue_required_elevator_features:
  */
-static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
-{
-       return max_t(unsigned short, rq->nr_phys_segments, 1);
-}
-
-int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
-               struct scatterlist *sglist, struct scatterlist **last_sg);
-static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
-               struct scatterlist *sglist)
-{
-       struct scatterlist *last_sg = NULL;
+/* Supports zoned block devices sequential write constraint */
+#define ELEVATOR_F_ZBD_SEQ_WRITE       (1U << 0)
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE            (1U << 1)
 
-       return __blk_rq_map_sg(q, rq, sglist, &last_sg);
-}
-extern void blk_dump_rq_flags(struct request *, char *);
+extern void blk_queue_required_elevator_features(struct request_queue *q,
+                                                unsigned int features);
+extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
+                                             struct device *dev);
 
 bool __must_check blk_get_queue(struct request_queue *);
 extern void blk_put_queue(struct request_queue *);
@@ -1187,19 +751,24 @@ extern void blk_set_queue_dying(struct request_queue *);
  * as the lock contention for request_queue lock is reduced.
  *
  * It is ok not to disable preemption when adding the request to the plug list
- * or when attempting a merge, because blk_schedule_flush_list() will only flush
- * the plug list when the task sleeps by itself. For details, please see
- * schedule() where blk_schedule_flush_plug() is called.
+ * or when attempting a merge. For details, please see schedule() where
+ * blk_flush_plug() is called.
  */
 struct blk_plug {
-       struct list_head mq_list; /* blk-mq requests */
-       struct list_head cb_list; /* md requires an unplug callback */
+       struct request *mq_list; /* blk-mq requests */
+
+       /* if ios_left is > 1, we can batch tag/rq allocations */
+       struct request *cached_rq;
+       unsigned short nr_ios;
+
        unsigned short rq_count;
+
        bool multiple_queues;
+       bool has_elevator;
        bool nowait;
+
+       struct list_head cb_list; /* md requires an unplug callback */
 };
-#define BLK_MAX_REQUEST_COUNT 16
-#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
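
struct blk_plug now chains blk-mq requests through a singly linked mq_list and can cache pre-allocated requests (cached_rq/nr_ios). The plugging API is unchanged for ordinary callers; a minimal sketch of batching several bio submissions under one plug, where my_submit_all() is a made-up helper:

/* Submit a batch of bios under a single plug so they can be merged
 * and dispatched together once the plug is finished. */
static void my_submit_all(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++)
                submit_bio(bios[i]);
        blk_finish_plug(&plug);
}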
 
 struct blk_plug_cb;
 typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool);
@@ -1211,32 +780,17 @@ struct blk_plug_cb {
 extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
                                             void *data, int size);
 extern void blk_start_plug(struct blk_plug *);
+extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
 extern void blk_finish_plug(struct blk_plug *);
-extern void blk_flush_plug_list(struct blk_plug *, bool);
-
-static inline void blk_flush_plug(struct task_struct *tsk)
-{
-       struct blk_plug *plug = tsk->plug;
 
-       if (plug)
-               blk_flush_plug_list(plug, false);
-}
-
-static inline void blk_schedule_flush_plug(struct task_struct *tsk)
-{
-       struct blk_plug *plug = tsk->plug;
-
-       if (plug)
-               blk_flush_plug_list(plug, true);
-}
+void blk_flush_plug(struct blk_plug *plug, bool from_schedule);
 
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
        struct blk_plug *plug = tsk->plug;
 
        return plug &&
-                (!list_empty(&plug->mq_list) ||
-                !list_empty(&plug->cb_list));
+                (plug->mq_list || !list_empty(&plug->cb_list));
 }
 
 int blkdev_issue_flush(struct block_device *bdev);
@@ -1245,23 +799,23 @@ long nr_blockdev_pages(void);
 struct blk_plug {
 };
 
-static inline void blk_start_plug(struct blk_plug *plug)
+static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
+                                        unsigned short nr_ios)
 {
 }
 
-static inline void blk_finish_plug(struct blk_plug *plug)
+static inline void blk_start_plug(struct blk_plug *plug)
 {
 }
 
-static inline void blk_flush_plug(struct task_struct *task)
+static inline void blk_finish_plug(struct blk_plug *plug)
 {
 }
 
-static inline void blk_schedule_flush_plug(struct task_struct *task)
+static inline void blk_flush_plug(struct blk_plug *plug, bool async)
 {
 }
 
-
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
        return false;
@@ -1499,22 +1053,6 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
        return offset << SECTOR_SHIFT;
 }
 
-/*
- * Two cases of handling DISCARD merge:
- * If max_discard_segments > 1, the driver takes every bio
- * as a range and send them to controller together. The ranges
- * needn't to be contiguous.
- * Otherwise, the bios/requests will be handled as same as
- * others which should be contiguous.
- */
-static inline bool blk_discard_mergable(struct request *req)
-{
-       if (req_op(req) == REQ_OP_DISCARD &&
-           queue_max_discard_segments(req->q) > 1)
-               return true;
-       return false;
-}
-
 static inline int bdev_discard_alignment(struct block_device *bdev)
 {
        struct request_queue *q = bdev_get_queue(bdev);
@@ -1628,210 +1166,36 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
        MODULE_ALIAS("block-major-" __stringify(major) "-*")
 
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-enum blk_integrity_flags {
-       BLK_INTEGRITY_VERIFY            = 1 << 0,
-       BLK_INTEGRITY_GENERATE          = 1 << 1,
-       BLK_INTEGRITY_DEVICE_CAPABLE    = 1 << 2,
-       BLK_INTEGRITY_IP_CHECKSUM       = 1 << 3,
-};
-
-struct blk_integrity_iter {
-       void                    *prot_buf;
-       void                    *data_buf;
-       sector_t                seed;
-       unsigned int            data_size;
-       unsigned short          interval;
-       const char              *disk_name;
-};
-
-typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
-typedef void (integrity_prepare_fn) (struct request *);
-typedef void (integrity_complete_fn) (struct request *, unsigned int);
-
-struct blk_integrity_profile {
-       integrity_processing_fn         *generate_fn;
-       integrity_processing_fn         *verify_fn;
-       integrity_prepare_fn            *prepare_fn;
-       integrity_complete_fn           *complete_fn;
-       const char                      *name;
-};
-
-extern void blk_integrity_register(struct gendisk *, struct blk_integrity *);
-extern void blk_integrity_unregister(struct gendisk *);
-extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
-extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
-                                  struct scatterlist *);
-extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
-
-static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
-{
-       struct blk_integrity *bi = &disk->queue->integrity;
-
-       if (!bi->profile)
-               return NULL;
-
-       return bi;
-}
-
-static inline
-struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
-{
-       return blk_get_integrity(bdev->bd_disk);
-}
-
-static inline bool
-blk_integrity_queue_supports_integrity(struct request_queue *q)
-{
-       return q->integrity.profile;
-}
-
-static inline bool blk_integrity_rq(struct request *rq)
-{
-       return rq->cmd_flags & REQ_INTEGRITY;
-}
-
-static inline void blk_queue_max_integrity_segments(struct request_queue *q,
-                                                   unsigned int segs)
-{
-       q->limits.max_integrity_segments = segs;
-}
-
-static inline unsigned short
-queue_max_integrity_segments(const struct request_queue *q)
-{
-       return q->limits.max_integrity_segments;
-}
-
-/**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi:                blk_integrity profile for device
- * @sectors:   Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device.  Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-                                                  unsigned int sectors)
-{
-       return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-                                              unsigned int sectors)
-{
-       return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
-/*
- * Return the first bvec that contains integrity data.  Only drivers that are
- * limited to a single integrity segment should use this helper.
- */
-static inline struct bio_vec *rq_integrity_vec(struct request *rq)
-{
-       if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
-               return NULL;
-       return rq->bio->bi_integrity->bip_vec;
-}
-
-#else /* CONFIG_BLK_DEV_INTEGRITY */
-
-struct bio;
-struct block_device;
-struct gendisk;
-struct blk_integrity;
-
-static inline int blk_integrity_rq(struct request *rq)
-{
-       return 0;
-}
-static inline int blk_rq_count_integrity_sg(struct request_queue *q,
-                                           struct bio *b)
-{
-       return 0;
-}
-static inline int blk_rq_map_integrity_sg(struct request_queue *q,
-                                         struct bio *b,
-                                         struct scatterlist *s)
-{
-       return 0;
-}
-static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
-{
-       return NULL;
-}
-static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
-{
-       return NULL;
-}
-static inline bool
-blk_integrity_queue_supports_integrity(struct request_queue *q)
-{
-       return false;
-}
-static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
-{
-       return 0;
-}
-static inline void blk_integrity_register(struct gendisk *d,
-                                        struct blk_integrity *b)
-{
-}
-static inline void blk_integrity_unregister(struct gendisk *d)
-{
-}
-static inline void blk_queue_max_integrity_segments(struct request_queue *q,
-                                                   unsigned int segs)
-{
-}
-static inline unsigned short queue_max_integrity_segments(const struct request_queue *q)
-{
-       return 0;
-}
-
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-                                                  unsigned int sectors)
-{
-       return 0;
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-                                              unsigned int sectors)
-{
-       return 0;
-}
-
-static inline struct bio_vec *rq_integrity_vec(struct request *rq)
-{
-       return NULL;
-}
-
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 
-bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q);
+bool blk_crypto_register(struct blk_crypto_profile *profile,
+                        struct request_queue *q);
 
-void blk_ksm_unregister(struct request_queue *q);
+void blk_crypto_unregister(struct request_queue *q);
 
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
 
-static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm,
-                                   struct request_queue *q)
+static inline bool blk_crypto_register(struct blk_crypto_profile *profile,
+                                      struct request_queue *q)
 {
        return true;
 }
 
-static inline void blk_ksm_unregister(struct request_queue *q) { }
+static inline void blk_crypto_unregister(struct request_queue *q) { }
 
 #endif /* CONFIG_BLK_INLINE_ENCRYPTION */
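
The keyslot-manager API is renamed to the blk-crypto-profile API; blk_ksm_register()/blk_ksm_unregister() become blk_crypto_register()/blk_crypto_unregister(). A sketch of a driver publishing its inline-encryption capabilities, assuming a hypothetical my_crypto_dev that embeds an already initialized profile:

struct my_crypto_dev {                          /* hypothetical driver state */
        struct blk_crypto_profile profile;      /* initialized elsewhere */
        struct request_queue *queue;
};

static int my_enable_inline_crypto(struct my_crypto_dev *dev)
{
        /* expose the device's inline encryption capabilities on the queue */
        if (!blk_crypto_register(&dev->profile, dev->queue))
                return -EINVAL;
        return 0;
}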
 
+enum blk_unique_id {
+       /* these match the Designator Types specified in SPC */
+       BLK_UID_T10     = 1,
+       BLK_UID_EUI64   = 2,
+       BLK_UID_NAA     = 3,
+};
+
+#define NFL4_UFLG_MASK                 0x0000003F
 
 struct block_device_operations {
-       blk_qc_t (*submit_bio) (struct bio *bio);
+       void (*submit_bio)(struct bio *bio);
        int (*open) (struct block_device *, fmode_t);
        void (*release) (struct gendisk *, fmode_t);
        int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
@@ -1847,6 +1211,9 @@ struct block_device_operations {
        int (*report_zones)(struct gendisk *, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data);
        char *(*devnode)(struct gendisk *disk, umode_t *mode);
+       /* returns the length of the identifier or a negative errno: */
+       int (*get_unique_id)(struct gendisk *disk, u8 id[16],
+                       enum blk_unique_id id_type);
        struct module *owner;
        const struct pr_ops *pr_ops;
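
With bio-based polling, ->submit_bio no longer returns a blk_qc_t cookie, and the new ->get_unique_id hook lets a driver report an SPC-style designator. A sketch of how a simple bio-based driver might fill in the updated operations; every my_* name is hypothetical and the submit path is reduced to an immediate completion for illustration only:

static void my_submit_bio(struct bio *bio)
{
        /* process the I/O, then complete it; no cookie is returned anymore */
        bio_endio(bio);
}

static int my_get_unique_id(struct gendisk *disk, u8 id[16],
                            enum blk_unique_id id_type)
{
        if (id_type != BLK_UID_T10)
                return -EINVAL;
        memcpy(id, "MYDISK0000000001", 16);     /* fixed 16-byte T10 id */
        return 16;                              /* length of the identifier */
}

static const struct block_device_operations my_fops = {
        .owner          = THIS_MODULE,
        .submit_bio     = my_submit_bio,
        .get_unique_id  = my_get_unique_id,
};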
 
@@ -1869,60 +1236,6 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
 
-#ifdef CONFIG_BLK_DEV_ZONED
-bool blk_req_needs_zone_write_lock(struct request *rq);
-bool blk_req_zone_write_trylock(struct request *rq);
-void __blk_req_zone_write_lock(struct request *rq);
-void __blk_req_zone_write_unlock(struct request *rq);
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-       if (blk_req_needs_zone_write_lock(rq))
-               __blk_req_zone_write_lock(rq);
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-       if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
-               __blk_req_zone_write_unlock(rq);
-}
-
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
-       return rq->q->seq_zones_wlock &&
-               test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
-       if (!blk_req_needs_zone_write_lock(rq))
-               return true;
-       return !blk_req_zone_is_write_locked(rq);
-}
-#else
-static inline bool blk_req_needs_zone_write_lock(struct request *rq)
-{
-       return false;
-}
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-}
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
-       return false;
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
-       return true;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
-
 static inline void blk_wake_io_task(struct task_struct *waiter)
 {
        /*
@@ -1991,6 +1304,8 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
 #ifdef CONFIG_BLOCK
 void invalidate_bdev(struct block_device *bdev);
 int sync_blockdev(struct block_device *bdev);
+int sync_blockdev_nowait(struct block_device *bdev);
+void sync_bdevs(bool wait);
 #else
 static inline void invalidate_bdev(struct block_device *bdev)
 {
@@ -1999,10 +1314,54 @@ static inline int sync_blockdev(struct block_device *bdev)
 {
        return 0;
 }
+static inline int sync_blockdev_nowait(struct block_device *bdev)
+{
+       return 0;
+}
+static inline void sync_bdevs(bool wait)
+{
+}
 #endif
 int fsync_bdev(struct block_device *bdev);
 
 int freeze_bdev(struct block_device *bdev);
 int thaw_bdev(struct block_device *bdev);
 
+struct io_comp_batch {
+       struct request *req_list;
+       bool need_ts;
+       void (*complete)(struct io_comp_batch *);
+};
+
+#define DEFINE_IO_COMP_BATCH(name)     struct io_comp_batch name = { }
+
+#define rq_list_add(listptr, rq)       do {            \
+       (rq)->rq_next = *(listptr);                     \
+       *(listptr) = rq;                                \
+} while (0)
+
+#define rq_list_pop(listptr)                           \
+({                                                     \
+       struct request *__req = NULL;                   \
+       if ((listptr) && *(listptr))    {               \
+               __req = *(listptr);                     \
+               *(listptr) = __req->rq_next;            \
+       }                                               \
+       __req;                                          \
+})
+
+#define rq_list_peek(listptr)                          \
+({                                                     \
+       struct request *__req = NULL;                   \
+       if ((listptr) && *(listptr))                    \
+               __req = *(listptr);                     \
+       __req;                                          \
+})
+
+#define rq_list_for_each(listptr, pos)                 \
+       for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \
+
+#define rq_list_next(rq)       (rq)->rq_next
+#define rq_list_empty(list)    ((list) == (struct request *) NULL)
+
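
The rq_list_* macros operate on a singly linked list threaded through rq->rq_next, which is also how struct io_comp_batch carries completed requests. A sketch of draining such a list, assuming completion code has already populated it with rq_list_add(); my_drain() is a made-up name:

static void my_drain(struct request **pending)
{
        struct request *rq;

        /* rq_list_add() pushes to the front, so this completes newest-first */
        while ((rq = rq_list_pop(pending)) != NULL)
                blk_mq_end_request(rq, BLK_STS_OK);
}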
 #endif /* _LINUX_BLKDEV_H */
index a083e15..22501a2 100644
@@ -2,7 +2,7 @@
 #ifndef BLKTRACE_H
 #define BLKTRACE_H
 
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/relay.h>
 #include <linux/compat.h>
 #include <uapi/linux/blktrace_api.h>
index 020a7d5..3db6f6c 100644
@@ -929,8 +929,11 @@ struct bpf_array_aux {
         * stored in the map to make sure that all callers and callees have
         * the same prog type and JITed flag.
         */
-       enum bpf_prog_type type;
-       bool jited;
+       struct {
+               spinlock_t lock;
+               enum bpf_prog_type type;
+               bool jited;
+       } owner;
        /* Programs with direct jumps into programs part of this array. */
        struct list_head poke_progs;
        struct bpf_map *map;
index 9c81724..bbe1eef 100644
@@ -101,14 +101,14 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
-#ifdef CONFIG_NET
-BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 #ifdef CONFIG_BPF_LSM
 BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
+#ifdef CONFIG_NET
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
index 0e9bdd4..35c25df 100644
@@ -44,7 +44,7 @@ struct bvec_iter {
 
        unsigned int            bi_bvec_done;   /* number of bytes completed in
                                                   current bvec */
-};
+} __packed;
 
 struct bvec_iter_all {
        struct bio_vec  bv;
index c4fef00..0a89f11 100644
@@ -64,6 +64,7 @@ struct cdrom_device_info {
        int for_data;
        int (*exit)(struct cdrom_device_info *);
        int mrw_mode_page;
+       __s64 last_media_change_ms;
 };
 
 struct cdrom_device_ops {
index 114553b..a7df155 100644
@@ -576,9 +576,9 @@ struct dm_table *dm_swap_table(struct mapped_device *md,
                               struct dm_table *t);
 
 /*
- * Table keyslot manager functions
+ * Table blk_crypto_profile functions
  */
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm);
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile);
 
 /*-----------------------------------------------------------------
  * Macros.
index 4a93c12..ef03ff3 100644
@@ -1051,6 +1051,7 @@ extern int bpf_jit_enable;
 extern int bpf_jit_harden;
 extern int bpf_jit_kallsyms;
 extern long bpf_jit_limit;
+extern long bpf_jit_limit_max;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
index c12df59..3e378b1 100644
@@ -83,9 +83,10 @@ struct fprop_local_percpu {
 
 int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp);
 void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
-void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
-void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
-                           int max_frac);
+void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl,
+               long nr);
+void __fprop_add_percpu_max(struct fprop_global *p,
+               struct fprop_local_percpu *pl, int max_frac, long nr);
 void fprop_fraction_percpu(struct fprop_global *p,
        struct fprop_local_percpu *pl, unsigned long *numerator,
        unsigned long *denominator);
@@ -96,7 +97,7 @@ void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
        unsigned long flags;
 
        local_irq_save(flags);
-       __fprop_inc_percpu(p, pl);
+       __fprop_add_percpu(p, pl, 1);
        local_irq_restore(flags);
 }
 
index 56eba72..f3cfca5 100644
@@ -48,6 +48,7 @@
 struct backing_dev_info;
 struct bdi_writeback;
 struct bio;
+struct io_comp_batch;
 struct export_operations;
 struct fiemap_extent_info;
 struct hd_geometry;
@@ -329,16 +330,12 @@ struct kiocb {
        randomized_struct_fields_start
 
        loff_t                  ki_pos;
-       void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+       void (*ki_complete)(struct kiocb *iocb, long ret);
        void                    *private;
        int                     ki_flags;
        u16                     ki_hint;
        u16                     ki_ioprio; /* See linux/ioprio.h */
-       union {
-               unsigned int            ki_cookie; /* for ->iopoll */
-               struct wait_page_queue  *ki_waitq; /* for async buffered IO */
-       };
-
+       struct wait_page_queue  *ki_waitq; /* for async buffered IO */
        randomized_struct_fields_end
 };
 
@@ -2075,7 +2072,8 @@ struct file_operations {
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
-       int (*iopoll)(struct kiocb *kiocb, bool spin);
+       int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
+                       unsigned int flags);
        int (*iterate) (struct file *, struct dir_context *);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
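
->iopoll now takes an io_comp_batch and flags instead of the old spin boolean. For a file backed directly by a bio-based submission path, a sketch of the hook is little more than a forward to iocb_bio_iopoll(); my_file_iopoll() is a hypothetical name:

static int my_file_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
                          unsigned int flags)
{
        /* let the block layer poll the bio(s) behind this kiocb */
        return iocb_bio_iopoll(kiocb, iob, flags);
}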
index e912ed9..91ea947 100644
@@ -118,9 +118,6 @@ struct fscrypt_operations {
         */
        bool (*empty_dir)(struct inode *inode);
 
-       /* The filesystem's maximum ciphertext filename length, in bytes */
-       unsigned int max_namelen;
-
        /*
         * Check whether the filesystem's inode numbers and UUID are stable,
         * meaning that they will never be changed even by offline operations
index 0f5315c..59eabbc 100644
 
 #include <linux/types.h>
 #include <linux/kdev_t.h>
-#include <linux/rcupdate.h>
-#include <linux/slab.h>
-#include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
 #include <linux/blk_types.h>
-#include <asm/local.h>
+#include <linux/device.h>
+#include <linux/xarray.h>
 
 extern const struct device_type disk_type;
 extern struct device_type part_type;
@@ -26,14 +24,6 @@ extern struct class block_class;
 #define DISK_MAX_PARTS                 256
 #define DISK_NAME_LEN                  32
 
-#include <linux/major.h>
-#include <linux/device.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/workqueue.h>
-#include <linux/xarray.h>
-
 #define PARTITION_META_INFO_VOLNAMELTH 64
 /*
  * Enough for the string representation of any kind of UUID plus NULL.
@@ -223,6 +213,8 @@ static inline int add_disk(struct gendisk *disk)
 }
 extern void del_gendisk(struct gendisk *gp);
 
+void invalidate_disk(struct gendisk *disk);
+
 void set_disk_ro(struct gendisk *disk, bool read_only);
 
 static inline int get_disk_ro(struct gendisk *disk)
@@ -231,6 +223,11 @@ static inline int get_disk_ro(struct gendisk *disk)
                test_bit(GD_READ_ONLY, &disk->state);
 }
 
+static inline int bdev_read_only(struct block_device *bdev)
+{
+       return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
+}
+
 extern void disk_block_events(struct gendisk *disk);
 extern void disk_unblock_events(struct gendisk *disk);
 extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
@@ -248,7 +245,12 @@ static inline sector_t get_start_sect(struct block_device *bdev)
 
 static inline sector_t bdev_nr_sectors(struct block_device *bdev)
 {
-       return i_size_read(bdev->bd_inode) >> 9;
+       return bdev->bd_nr_sectors;
+}
+
+static inline loff_t bdev_nr_bytes(struct block_device *bdev)
+{
+       return bdev_nr_sectors(bdev) << SECTOR_SHIFT;
 }
 
 static inline sector_t get_capacity(struct gendisk *disk)
@@ -256,6 +258,12 @@ static inline sector_t get_capacity(struct gendisk *disk)
        return bdev_nr_sectors(disk->part0);
 }
 
+static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
+{
+       return bdev_nr_sectors(sb->s_bdev) >>
+               (sb->s_blocksize_bits - SECTOR_SHIFT);
+}
+
 int bdev_disk_changed(struct gendisk *disk, bool invalidate);
 void blk_drop_partitions(struct gendisk *disk);
 
@@ -291,10 +299,6 @@ bool bdev_check_media_change(struct block_device *bdev);
 int __invalidate_device(struct block_device *bdev, bool kill_dirty);
 void set_capacity(struct gendisk *disk, sector_t size);
 
-/* for drivers/char/raw.c: */
-int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
-long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
-
 #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
index 55b2ec1..3745efd 100644
@@ -520,15 +520,11 @@ static inline void arch_free_page(struct page *page, int order) { }
 #ifndef HAVE_ARCH_ALLOC_PAGE
 static inline void arch_alloc_page(struct page *page, int order) { }
 #endif
-#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-static inline int arch_make_page_accessible(struct page *page)
-{
-       return 0;
-}
-#endif
 
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
+struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+               nodemask_t *nodemask);
 
 unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                                nodemask_t *nodemask, int nr_pages,
@@ -570,6 +566,15 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
        return __alloc_pages(gfp_mask, order, nid, NULL);
 }
 
+static inline
+struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid)
+{
+       VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+       VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid));
+
+       return __folio_alloc(gfp, order, nid, NULL);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
  * prefer the current CPU's closest node. Otherwise node must be valid and
@@ -586,6 +591,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 
 #ifdef CONFIG_NUMA
 struct page *alloc_pages(gfp_t gfp, unsigned int order);
+struct folio *folio_alloc(gfp_t gfp, unsigned order);
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                        struct vm_area_struct *vma, unsigned long addr,
                        int node, bool hugepage);
@@ -596,6 +602,10 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
        return alloc_pages_node(numa_node_id(), gfp_mask, order);
 }
+static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
+{
+       return __folio_alloc_node(gfp, order, numa_node_id());
+}
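
folio_alloc()/__folio_alloc_node() are the folio counterparts of alloc_pages(), and pair naturally with kmap_local_folio() added further down. A sketch that allocates an order-0 folio and zeroes it through a temporary mapping; my_alloc_zeroed_folio() is a made-up name:

static struct folio *my_alloc_zeroed_folio(gfp_t gfp)
{
        struct folio *folio = folio_alloc(gfp, 0);      /* single-page folio */
        void *addr;

        if (!folio)
                return NULL;

        addr = kmap_local_folio(folio, 0);
        memset(addr, 0, PAGE_SIZE);
        kunmap_local(addr);
        return folio;
}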
 #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
        alloc_pages(gfp_mask, order)
 #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
index 4aa1031..0a0b2b0 100644
@@ -73,6 +73,12 @@ static inline void *kmap_local_page(struct page *page)
        return __kmap_local_page_prot(page, kmap_prot);
 }
 
+static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+{
+       struct page *page = folio_page(folio, offset / PAGE_SIZE);
+       return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE;
+}
+
 static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
 {
        return __kmap_local_page_prot(page, prot);
@@ -171,6 +177,11 @@ static inline void *kmap_local_page(struct page *page)
        return page_address(page);
 }
 
+static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+{
+       return page_address(&folio->page) + offset;
+}
+
 static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
 {
        return kmap_local_page(page);
index b4c49f9..27cdd71 100644
@@ -97,6 +97,43 @@ static inline void kmap_flush_unused(void);
 static inline void *kmap_local_page(struct page *page);
 
 /**
+ * kmap_local_folio - Map a page in this folio for temporary usage
+ * @folio: The folio containing the page.
+ * @offset: The byte offset within the folio which identifies the page.
+ *
+ * Requires careful handling when nesting multiple mappings because the map
+ * management is stack based. The unmap has to be in the reverse order of
+ * the map operation::
+ *
+ *   addr1 = kmap_local_folio(folio1, offset1);
+ *   addr2 = kmap_local_folio(folio2, offset2);
+ *   ...
+ *   kunmap_local(addr2);
+ *   kunmap_local(addr1);
+ *
+ * Unmapping addr1 before addr2 is invalid and causes malfunction.
+ *
+ * Contrary to kmap() mappings the mapping is only valid in the context of
+ * the caller and cannot be handed to other contexts.
+ *
+ * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
+ * virtual address of the direct mapping. Only real highmem pages are
+ * temporarily mapped.
+ *
+ * While it is significantly faster than kmap() for the highmem case it
+ * comes with restrictions about the pointer validity. Only use when really
+ * necessary.
+ *
+ * On HIGHMEM enabled systems mapping a highmem page has the side effect of
+ * disabling migration in order to keep the virtual address stable across
+ * preemption. No caller of kmap_local_folio() can rely on this side effect.
+ *
+ * Context: Can be invoked from any context.
+ * Return: The virtual address of @offset.
+ */
+static inline void *kmap_local_folio(struct folio *folio, size_t offset);
+
+/**
  * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
  * @page:      Pointer to the page to be mapped
  *
index f123e15..f280f33 100644
@@ -251,15 +251,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 }
 
 /**
- * thp_head - Head page of a transparent huge page.
- * @page: Any page (tail, head or regular) found in the page cache.
- */
-static inline struct page *thp_head(struct page *page)
-{
-       return compound_head(page);
-}
-
-/**
  * thp_order - Order of a transparent huge page.
  * @page: Head page of a transparent huge page.
  */
@@ -336,12 +327,6 @@ static inline struct list_head *page_deferred_list(struct page *page)
 #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
 #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
 
-static inline struct page *thp_head(struct page *page)
-{
-       VM_BUG_ON_PGFLAGS(PageTail(page), page);
-       return page;
-}
-
 static inline unsigned int thp_order(struct page *page)
 {
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
index 24f8489..63f4ea4 100644
@@ -313,8 +313,8 @@ int iomap_writepages(struct address_space *mapping,
 struct iomap_dio_ops {
        int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
                      unsigned flags);
-       blk_qc_t (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
-                             loff_t file_offset);
+       void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
+                         loff_t file_offset);
 };
 
 /*
@@ -337,7 +337,6 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                unsigned int dio_flags);
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
 
 #ifdef CONFIG_SWAP
 struct file;
diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h
deleted file mode 100644
index a27605e..0000000
+++ /dev/null
@@ -1,120 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2019 Google LLC
- */
-
-#ifndef __LINUX_KEYSLOT_MANAGER_H
-#define __LINUX_KEYSLOT_MANAGER_H
-
-#include <linux/bio.h>
-#include <linux/blk-crypto.h>
-
-struct blk_keyslot_manager;
-
-/**
- * struct blk_ksm_ll_ops - functions to manage keyslots in hardware
- * @keyslot_program:   Program the specified key into the specified slot in the
- *                     inline encryption hardware.
- * @keyslot_evict:     Evict key from the specified keyslot in the hardware.
- *                     The key is provided so that e.g. dm layers can evict
- *                     keys from the devices that they map over.
- *                     Returns 0 on success, -errno otherwise.
- *
- * This structure should be provided by storage device drivers when they set up
- * a keyslot manager - this structure holds the function ptrs that the keyslot
- * manager will use to manipulate keyslots in the hardware.
- */
-struct blk_ksm_ll_ops {
-       int (*keyslot_program)(struct blk_keyslot_manager *ksm,
-                              const struct blk_crypto_key *key,
-                              unsigned int slot);
-       int (*keyslot_evict)(struct blk_keyslot_manager *ksm,
-                            const struct blk_crypto_key *key,
-                            unsigned int slot);
-};
-
-struct blk_keyslot_manager {
-       /*
-        * The struct blk_ksm_ll_ops that this keyslot manager will use
-        * to perform operations like programming and evicting keys on the
-        * device
-        */
-       struct blk_ksm_ll_ops ksm_ll_ops;
-
-       /*
-        * The maximum number of bytes supported for specifying the data unit
-        * number.
-        */
-       unsigned int max_dun_bytes_supported;
-
-       /*
-        * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents
-        * whether a crypto mode and data unit size are supported. The i'th
-        * bit of crypto_mode_supported[crypto_mode] is set iff a data unit
-        * size of (1 << i) is supported. We only support data unit sizes
-        * that are powers of 2.
-        */
-       unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX];
-
-       /* Device for runtime power management (NULL if none) */
-       struct device *dev;
-
-       /* Here onwards are *private* fields for internal keyslot manager use */
-
-       unsigned int num_slots;
-
-       /* Protects programming and evicting keys from the device */
-       struct rw_semaphore lock;
-
-       /* List of idle slots, with least recently used slot at front */
-       wait_queue_head_t idle_slots_wait_queue;
-       struct list_head idle_slots;
-       spinlock_t idle_slots_lock;
-
-       /*
-        * Hash table which maps struct *blk_crypto_key to keyslots, so that we
-        * can find a key's keyslot in O(1) time rather than O(num_slots).
-        * Protected by 'lock'.
-        */
-       struct hlist_head *slot_hashtable;
-       unsigned int log_slot_ht_size;
-
-       /* Per-keyslot data */
-       struct blk_ksm_keyslot *slots;
-};
-
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots);
-
-int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
-                     unsigned int num_slots);
-
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
-                                     const struct blk_crypto_key *key,
-                                     struct blk_ksm_keyslot **slot_ptr);
-
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot);
-
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot);
-
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
-                                 const struct blk_crypto_config *cfg);
-
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
-                     const struct blk_crypto_key *key);
-
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
-
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
-
-void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
-                            const struct blk_keyslot_manager *child);
-
-void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
-
-bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
-                        struct blk_keyslot_manager *ksm_subset);
-
-void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
-                                struct blk_keyslot_manager *reference_ksm);
-
-#endif /* __LINUX_KEYSLOT_MANAGER_H */
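For illustration, this is roughly how a driver consumed the interface declared above before the keyslot-manager to blk-crypto-profile rename: a minimal sketch using only declarations from the deleted header. The my_hw_* callbacks, the slot count and the 4 KiB data-unit size are hypothetical, not taken from any in-tree driver.

/* Sketch of the removed API: program/evict callbacks plus capability setup. */
static int my_hw_program_key(struct blk_keyslot_manager *ksm,
			     const struct blk_crypto_key *key,
			     unsigned int slot)
{
	/* hypothetical: write the key material into hardware keyslot @slot */
	return 0;
}

static int my_hw_evict_key(struct blk_keyslot_manager *ksm,
			   const struct blk_crypto_key *key,
			   unsigned int slot)
{
	/* hypothetical: clear hardware keyslot @slot */
	return 0;
}

static int my_driver_init_ksm(struct device *dev, struct blk_keyslot_manager *ksm)
{
	int err = devm_blk_ksm_init(dev, ksm, 32);

	if (err)
		return err;
	ksm->ksm_ll_ops.keyslot_program = my_hw_program_key;
	ksm->ksm_ll_ops.keyslot_evict = my_hw_evict_key;
	ksm->max_dun_bytes_supported = 8;
	/* bit 12 set: only a 4096-byte data unit size is supported */
	ksm->crypto_modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] = 4096;
	return 0;
}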
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 161e816..a38a5bc 100644 (file)
@@ -52,7 +52,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
                        struct vm_area_struct *vma, unsigned long address);
 
 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
-void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
 
 #else  /* !CONFIG_KSM */
 
@@ -83,7 +83,7 @@ static inline void rmap_walk_ksm(struct page *page,
 {
 }
 
-static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old)
 {
 }
 #endif /* CONFIG_MMU */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c0c64f0..236ec68 100644 (file)
@@ -676,6 +676,18 @@ struct ata_ering {
        struct ata_ering_entry  ring[ATA_ERING_SIZE];
 };
 
+struct ata_cpr {
+       u8                      num;
+       u8                      num_storage_elements;
+       u64                     start_lba;
+       u64                     num_lbas;
+};
+
+struct ata_cpr_log {
+       u8                      nr_cpr;
+       struct ata_cpr          cpr[];
+};
+
 struct ata_device {
        struct ata_link         *link;
        unsigned int            devno;          /* 0 or 1 */
@@ -735,6 +747,9 @@ struct ata_device {
        u32                     zac_zones_optimal_nonseq;
        u32                     zac_zones_max_open;
 
+       /* Concurrent positioning ranges */
+       struct ata_cpr_log      *cpr_log;
+
        /* error history */
        int                     spdn_cnt;
        /* ering is CLEAR_END, read comment above CLEAR_END */
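The new cpr_log ends in a flexible array member, so it has to be sized with the range count at allocation time. A minimal sketch, assuming a hypothetical alloc_cpr_log() helper (the actual allocation in this series lives elsewhere in libata):

#include <linux/overflow.h>
#include <linux/slab.h>

/* Sketch: allocate an ata_cpr_log holding @nr concurrent positioning ranges. */
static struct ata_cpr_log *alloc_cpr_log(u8 nr)
{
	struct ata_cpr_log *log;

	log = kzalloc(struct_size(log, cpr, nr), GFP_KERNEL);
	if (!log)
		return NULL;
	log->nr_cpr = nr;
	return log;
}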
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3096c9a..e34bf0c 100644 (file)
@@ -369,7 +369,7 @@ enum page_memcg_data_flags {
 
 #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
 
-static inline bool PageMemcgKmem(struct page *page);
+static inline bool folio_memcg_kmem(struct folio *folio);
 
 /*
  * After the initialization objcg->memcg is always pointing at
@@ -384,89 +384,95 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
 }
 
 /*
- * __page_memcg - get the memory cgroup associated with a non-kmem page
- * @page: a pointer to the page struct
+ * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the memory cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages or
- * kmem pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios or
+ * kmem folios.
  */
-static inline struct mem_cgroup *__page_memcg(struct page *page)
+static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
 {
-       unsigned long memcg_data = page->memcg_data;
+       unsigned long memcg_data = folio->memcg_data;
 
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-       VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
+       VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
+       VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+       VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
 
        return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
 /*
- * __page_objcg - get the object cgroup associated with a kmem page
- * @page: a pointer to the page struct
+ * __folio_objcg - get the object cgroup associated with a kmem folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the object cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the object cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper object cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages or
- * LRU pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios or
+ * LRU folios.
  */
-static inline struct obj_cgroup *__page_objcg(struct page *page)
+static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
 {
-       unsigned long memcg_data = page->memcg_data;
+       unsigned long memcg_data = folio->memcg_data;
 
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-       VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+       VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
+       VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+       VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
 
        return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
 /*
- * page_memcg - get the memory cgroup associated with a page
- * @page: a pointer to the page struct
+ * folio_memcg - Get the memory cgroup associated with a folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the memory cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios.
  *
- * For a non-kmem page any of the following ensures page and memcg binding
+ * For a non-kmem folio any of the following ensures folio and memcg binding
  * stability:
  *
- * - the page lock
+ * - the folio lock
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
  *
- * For a kmem page a caller should hold an rcu read lock to protect memcg
- * associated with a kmem page from being released.
+ * For a kmem folio a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem folio from being released.
  */
+static inline struct mem_cgroup *folio_memcg(struct folio *folio)
+{
+       if (folio_memcg_kmem(folio))
+               return obj_cgroup_memcg(__folio_objcg(folio));
+       return __folio_memcg(folio);
+}
+
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
-       if (PageMemcgKmem(page))
-               return obj_cgroup_memcg(__page_objcg(page));
-       else
-               return __page_memcg(page);
+       return folio_memcg(page_folio(page));
 }
 
-/*
- * page_memcg_rcu - locklessly get the memory cgroup associated with a page
- * @page: a pointer to the page struct
+/**
+ * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios.
+ *
+ * Return: A pointer to the memory cgroup associated with the folio,
+ * or NULL.
  */
-static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 {
-       unsigned long memcg_data = READ_ONCE(page->memcg_data);
+       unsigned long memcg_data = READ_ONCE(folio->memcg_data);
 
-       VM_BUG_ON_PAGE(PageSlab(page), page);
+       VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        WARN_ON_ONCE(!rcu_read_lock_held());
 
        if (memcg_data & MEMCG_DATA_KMEM) {
@@ -523,17 +529,18 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 
 #ifdef CONFIG_MEMCG_KMEM
 /*
- * PageMemcgKmem - check if the page has MemcgKmem flag set
- * @page: a pointer to the page struct
+ * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
+ * @folio: Pointer to the folio.
  *
- * Checks if the page has MemcgKmem flag set. The caller must ensure that
- * the page has an associated memory cgroup. It's not safe to call this function
- * against some types of pages, e.g. slab pages.
+ * Checks if the folio has MemcgKmem flag set. The caller must ensure
+ * that the folio has an associated memory cgroup. It's not safe to call
+ * this function against some types of folios, e.g. slab folios.
  */
-static inline bool PageMemcgKmem(struct page *page)
+static inline bool folio_memcg_kmem(struct folio *folio)
 {
-       VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page);
-       return page->memcg_data & MEMCG_DATA_KMEM;
+       VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
+       VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio);
+       return folio->memcg_data & MEMCG_DATA_KMEM;
 }
 
 /*
@@ -577,7 +584,7 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 }
 
 #else
-static inline bool PageMemcgKmem(struct page *page)
+static inline bool folio_memcg_kmem(struct folio *folio)
 {
        return false;
 }
@@ -593,6 +600,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 }
 #endif
 
+static inline bool PageMemcgKmem(struct page *page)
+{
+       return folio_memcg_kmem(page_folio(page));
+}
+
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
        return (memcg == root_mem_cgroup);
@@ -684,26 +696,47 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
                page_counter_read(&memcg->memory);
 }
 
-int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-                       gfp_t gfp_mask);
-static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-                                   gfp_t gfp_mask)
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
+
+/**
+ * mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
+ * @folio: Folio to charge.
+ * @mm: mm context of the allocating task.
+ * @gfp: Reclaim mode.
+ *
+ * Try to charge @folio to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp if necessary.  If @mm is NULL, try to
+ * charge to the active memcg.
+ *
+ * Do not use this for folios allocated for swapin.
+ *
+ * Return: 0 on success. Otherwise, an error code is returned.
+ */
+static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
+                                   gfp_t gfp)
 {
        if (mem_cgroup_disabled())
                return 0;
-       return __mem_cgroup_charge(page, mm, gfp_mask);
+       return __mem_cgroup_charge(folio, mm, gfp);
 }
 
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
 
-void __mem_cgroup_uncharge(struct page *page);
-static inline void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct folio *folio);
+
+/**
+ * mem_cgroup_uncharge - Uncharge a folio.
+ * @folio: Folio to uncharge.
+ *
+ * Uncharge a folio previously charged with mem_cgroup_charge().
+ */
+static inline void mem_cgroup_uncharge(struct folio *folio)
 {
        if (mem_cgroup_disabled())
                return;
-       __mem_cgroup_uncharge(page);
+       __mem_cgroup_uncharge(folio);
 }
 
 void __mem_cgroup_uncharge_list(struct list_head *page_list);
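As a usage sketch of the charge/uncharge pair documented above: charge a newly allocated folio before exposing it, and uncharge if the subsequent step fails. The surrounding function and my_insert_into_cache() are hypothetical.

/* Sketch: pair mem_cgroup_charge() with mem_cgroup_uncharge() on failure. */
static int my_add_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
{
	int err = mem_cgroup_charge(folio, mm, gfp);

	if (err)
		return err;

	err = my_insert_into_cache(folio);	/* hypothetical next step */
	if (err)
		mem_cgroup_uncharge(folio);
	return err;
}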
@@ -714,7 +747,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
        __mem_cgroup_uncharge_list(page_list);
 }
 
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
+void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
  * mem_cgroup_lruvec - get the lru list vector for a memcg & node
@@ -753,33 +786,33 @@ out:
 }
 
 /**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
- * @page: the page
+ * folio_lruvec - return lruvec for isolating/putting an LRU folio
+ * @folio: Pointer to the folio.
  *
- * This function relies on page->mem_cgroup being stable.
+ * This function relies on folio->mem_cgroup being stable.
  */
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec(struct folio *folio)
 {
-       pg_data_t *pgdat = page_pgdat(page);
-       struct mem_cgroup *memcg = page_memcg(page);
+       struct mem_cgroup *memcg = folio_memcg(folio);
 
-       VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
-       return mem_cgroup_lruvec(memcg, pgdat);
+       VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio);
+       return mem_cgroup_lruvec(memcg, folio_pgdat(folio));
 }
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
-struct lruvec *lock_page_lruvec(struct page *page);
-struct lruvec *lock_page_lruvec_irq(struct page *page);
-struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+struct lruvec *folio_lruvec_lock(struct folio *folio);
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                                                unsigned long *flags);
 
 #ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
 #else
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
 }
 #endif
@@ -947,6 +980,8 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
 extern bool cgroup_memory_noswap;
 #endif
 
+void folio_memcg_lock(struct folio *folio);
+void folio_memcg_unlock(struct folio *folio);
 void lock_page_memcg(struct page *page);
 void unlock_page_memcg(struct page *page);
 
@@ -1115,12 +1150,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 #define MEM_CGROUP_ID_SHIFT    0
 #define MEM_CGROUP_ID_MAX      0
 
+static inline struct mem_cgroup *folio_memcg(struct folio *folio)
+{
+       return NULL;
+}
+
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
        return NULL;
 }
 
-static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 {
        WARN_ON_ONCE(!rcu_read_lock_held());
        return NULL;
@@ -1131,6 +1171,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
        return NULL;
 }
 
+static inline bool folio_memcg_kmem(struct folio *folio)
+{
+       return false;
+}
+
 static inline bool PageMemcgKmem(struct page *page)
 {
        return false;
@@ -1179,8 +1224,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
        return false;
 }
 
-static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-                                   gfp_t gfp_mask)
+static inline int mem_cgroup_charge(struct folio *folio,
+               struct mm_struct *mm, gfp_t gfp)
 {
        return 0;
 }
@@ -1195,7 +1240,7 @@ static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
 {
 }
 
-static inline void mem_cgroup_uncharge(struct page *page)
+static inline void mem_cgroup_uncharge(struct folio *folio)
 {
 }
 
@@ -1203,7 +1248,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
-static inline void mem_cgroup_migrate(struct page *old, struct page *new)
+static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
 
@@ -1213,14 +1258,14 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
        return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec(struct folio *folio)
 {
-       pg_data_t *pgdat = page_pgdat(page);
-
+       struct pglist_data *pgdat = folio_pgdat(folio);
        return &pgdat->__lruvec;
 }
 
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
 }
 
@@ -1250,26 +1295,26 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 
-static inline struct lruvec *lock_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
 {
-       struct pglist_data *pgdat = page_pgdat(page);
+       struct pglist_data *pgdat = folio_pgdat(folio);
 
        spin_lock(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
 {
-       struct pglist_data *pgdat = page_pgdat(page);
+       struct pglist_data *pgdat = folio_pgdat(folio);
 
        spin_lock_irq(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                unsigned long *flagsp)
 {
-       struct pglist_data *pgdat = page_pgdat(page);
+       struct pglist_data *pgdat = folio_pgdat(folio);
 
        spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
        return &pgdat->__lruvec;
@@ -1356,6 +1401,14 @@ static inline void unlock_page_memcg(struct page *page)
 {
 }
 
+static inline void folio_memcg_lock(struct folio *folio)
+{
+}
+
+static inline void folio_memcg_unlock(struct folio *folio)
+{
+}
+
 static inline void mem_cgroup_handle_over_high(void)
 {
 }
@@ -1517,38 +1570,39 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
 }
 
 /* Test requires a stable page->memcg binding, see page_memcg() */
-static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec)
+static inline bool folio_matches_lruvec(struct folio *folio,
+               struct lruvec *lruvec)
 {
-       return lruvec_pgdat(lruvec) == page_pgdat(page) &&
-              lruvec_memcg(lruvec) == page_memcg(page);
+       return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
+              lruvec_memcg(lruvec) == folio_memcg(folio);
 }
 
 /* Don't lock again iff page's lruvec locked */
-static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
+static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
                struct lruvec *locked_lruvec)
 {
        if (locked_lruvec) {
-               if (page_matches_lruvec(page, locked_lruvec))
+               if (folio_matches_lruvec(folio, locked_lruvec))
                        return locked_lruvec;
 
                unlock_page_lruvec_irq(locked_lruvec);
        }
 
-       return lock_page_lruvec_irq(page);
+       return folio_lruvec_lock_irq(folio);
 }
 
 /* Don't lock again iff page's lruvec locked */
-static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
+static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio,
                struct lruvec *locked_lruvec, unsigned long *flags)
 {
        if (locked_lruvec) {
-               if (page_matches_lruvec(page, locked_lruvec))
+               if (folio_matches_lruvec(folio, locked_lruvec))
                        return locked_lruvec;
 
                unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
        }
 
-       return lock_page_lruvec_irqsave(page, flags);
+       return folio_lruvec_lock_irqsave(folio, flags);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -1558,17 +1612,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback);
 
-void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb);
 
-static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
 {
        if (mem_cgroup_disabled())
                return;
 
-       if (unlikely(&page_memcg(page)->css != wb->memcg_css))
-               mem_cgroup_track_foreign_dirty_slowpath(page, wb);
+       if (unlikely(&folio_memcg(folio)->css != wb->memcg_css))
+               mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
 }
 
 void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
@@ -1588,7 +1642,7 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 {
 }
 
-static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
 {
 }
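The relock helpers above exist to amortise lru_lock acquisition while walking a list of folios: the lock is only dropped and retaken when the next folio belongs to a different lruvec. A minimal sketch of that pattern; my_walk_folios() is a hypothetical caller.

/* Sketch: keep the lru lock across folios that share a lruvec. */
static void my_walk_folios(struct list_head *folios)
{
	struct lruvec *lruvec = NULL;
	unsigned long flags;
	struct folio *folio;

	list_for_each_entry(folio, folios, lru) {
		lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
		/* ... operate on @folio under its lru lock ... */
	}
	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);
}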
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c8077e9..0d2aeb9 100644 (file)
@@ -57,6 +57,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
                                  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page, int extra_count);
+void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
+void folio_migrate_copy(struct folio *newfolio, struct folio *folio);
+int folio_migrate_mapping(struct address_space *mapping,
+               struct folio *newfolio, struct folio *folio, int extra_count);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 73a52ab..40ff114 100644 (file)
 struct mempolicy;
 struct anon_vma;
 struct anon_vma_chain;
-struct file_ra_state;
 struct user_struct;
-struct writeback_control;
-struct bdi_writeback;
 struct pt_regs;
 
 extern int sysctl_page_lock_unfairness;
@@ -216,13 +213,6 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
 int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
-/*
- * Any attempt to mark this function as static leads to build failure
- * when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked()
- * is referred to by BPF code. This must be visible for error injection.
- */
-int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-               pgoff_t index, gfp_t gfp, void **shadowp);
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
@@ -748,13 +738,18 @@ static inline int put_page_testzero(struct page *page)
        return page_ref_dec_and_test(page);
 }
 
+static inline int folio_put_testzero(struct folio *folio)
+{
+       return put_page_testzero(&folio->page);
+}
+
 /*
  * Try to grab a ref unless the page has a refcount of zero, return false if
  * that is the case.
  * This can be called when MMU is off so it must not access
  * any of the virtual mappings.
  */
-static inline int get_page_unless_zero(struct page *page)
+static inline bool get_page_unless_zero(struct page *page)
 {
        return page_ref_add_unless(page, 1, 0);
 }
@@ -907,7 +902,7 @@ void __put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
-void copy_huge_page(struct page *dst, struct page *src);
+void folio_copy(struct folio *dst, struct folio *src);
 
 /*
  * Compound pages have a destructor function.  Provide a
@@ -950,6 +945,20 @@ static inline unsigned int compound_order(struct page *page)
        return page[1].compound_order;
 }
 
+/**
+ * folio_order - The allocation order of a folio.
+ * @folio: The folio.
+ *
+ * A folio is composed of 2^order pages.  See get_order() for the definition
+ * of order.
+ *
+ * Return: The order of the folio.
+ */
+static inline unsigned int folio_order(struct folio *folio)
+{
+       return compound_order(&folio->page);
+}
+
 static inline bool hpage_pincount_available(struct page *page)
 {
        /*
@@ -1131,6 +1140,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
+static inline enum zone_type folio_zonenum(const struct folio *folio)
+{
+       return page_zonenum(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 static inline bool is_zone_device_page(const struct page *page)
 {
@@ -1200,18 +1214,26 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 }
 
 /* 127: arbitrary random number, small enough to assemble well */
-#define page_ref_zero_or_close_to_overflow(page) \
-       ((unsigned int) page_ref_count(page) + 127u <= 127u)
+#define folio_ref_zero_or_close_to_overflow(folio) \
+       ((unsigned int) folio_ref_count(folio) + 127u <= 127u)
+
+/**
+ * folio_get - Increment the reference count on a folio.
+ * @folio: The folio.
+ *
+ * Context: May be called in any context, as long as you know that
+ * you have a refcount on the folio.  If you do not already have one,
+ * folio_try_get() may be the right interface for you to use.
+ */
+static inline void folio_get(struct folio *folio)
+{
+       VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
+       folio_ref_inc(folio);
+}
 
 static inline void get_page(struct page *page)
 {
-       page = compound_head(page);
-       /*
-        * Getting a normal page or the head of a compound page
-        * requires to already have an elevated page->_refcount.
-        */
-       VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
-       page_ref_inc(page);
+       folio_get(page_folio(page));
 }
 
 bool __must_check try_grab_page(struct page *page, unsigned int flags);
@@ -1228,9 +1250,28 @@ static inline __must_check bool try_get_page(struct page *page)
        return true;
 }
 
+/**
+ * folio_put - Decrement the reference count on a folio.
+ * @folio: The folio.
+ *
+ * If the folio's reference count reaches zero, the memory will be
+ * released back to the page allocator and may be used by another
+ * allocation immediately.  Do not access the memory or the struct folio
+ * after calling folio_put() unless you can be sure that it wasn't the
+ * last reference.
+ *
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context.  May be called while holding a spinlock.
+ */
+static inline void folio_put(struct folio *folio)
+{
+       if (folio_put_testzero(folio))
+               __put_page(&folio->page);
+}
+
 static inline void put_page(struct page *page)
 {
-       page = compound_head(page);
+       struct folio *folio = page_folio(page);
 
        /*
         * For devmap managed pages we need to catch refcount transition from
@@ -1238,13 +1279,12 @@ static inline void put_page(struct page *page)
         * need to inform the device driver through callback. See
         * include/linux/memremap.h and HMM for details.
         */
-       if (page_is_devmap_managed(page)) {
-               put_devmap_managed_page(page);
+       if (page_is_devmap_managed(&folio->page)) {
+               put_devmap_managed_page(&folio->page);
                return;
        }
 
-       if (put_page_testzero(page))
-               __put_page(page);
+       folio_put(folio);
 }
 
 /*
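A short usage note on the folio_get()/folio_put() pair added above; the surrounding function is hypothetical.

/* Sketch: hold a folio reference across an operation, then drop it. */
static void my_process_folio(struct folio *folio)
{
	folio_get(folio);	/* caller must already hold a reference */
	/* ... may sleep or hand the folio to another context ... */
	folio_put(folio);	/* may free the folio; do not touch it afterwards */
}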
@@ -1379,6 +1419,11 @@ static inline int page_to_nid(const struct page *page)
 }
 #endif
 
+static inline int folio_nid(const struct folio *folio)
+{
+       return page_to_nid(&folio->page);
+}
+
 #ifdef CONFIG_NUMA_BALANCING
 static inline int cpu_pid_to_cpupid(int cpu, int pid)
 {
@@ -1546,6 +1591,16 @@ static inline pg_data_t *page_pgdat(const struct page *page)
        return NODE_DATA(page_to_nid(page));
 }
 
+static inline struct zone *folio_zone(const struct folio *folio)
+{
+       return page_zone(&folio->page);
+}
+
+static inline pg_data_t *folio_pgdat(const struct folio *folio)
+{
+       return page_pgdat(&folio->page);
+}
+
 #ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
@@ -1559,6 +1614,20 @@ static inline unsigned long page_to_section(const struct page *page)
 }
 #endif
 
+/**
+ * folio_pfn - Return the Page Frame Number of a folio.
+ * @folio: The folio.
+ *
+ * A folio may contain multiple pages.  The pages have consecutive
+ * Page Frame Numbers.
+ *
+ * Return: The Page Frame Number of the first page in the folio.
+ */
+static inline unsigned long folio_pfn(struct folio *folio)
+{
+       return page_to_pfn(&folio->page);
+}
+
 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
 #ifdef CONFIG_MIGRATION
 static inline bool is_pinnable_page(struct page *page)
@@ -1595,6 +1664,89 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
 #endif
 }
 
+/**
+ * folio_nr_pages - The number of pages in the folio.
+ * @folio: The folio.
+ *
+ * Return: A positive power of two.
+ */
+static inline long folio_nr_pages(struct folio *folio)
+{
+       return compound_nr(&folio->page);
+}
+
+/**
+ * folio_next - Move to the next physical folio.
+ * @folio: The folio we're currently operating on.
+ *
+ * If you have physically contiguous memory which may span more than
+ * one folio (eg a &struct bio_vec), use this function to move from one
+ * folio to the next.  Do not use it if the memory is only virtually
+ * contiguous as the folios are almost certainly not adjacent to each
+ * other.  This is the folio equivalent to writing ``page++``.
+ *
+ * Context: We assume that the folios are refcounted and/or locked at a
+ * higher level and do not adjust the reference counts.
+ * Return: The next struct folio.
+ */
+static inline struct folio *folio_next(struct folio *folio)
+{
+       return (struct folio *)folio_page(folio, folio_nr_pages(folio));
+}
+
+/**
+ * folio_shift - The size of the memory described by this folio.
+ * @folio: The folio.
+ *
+ * A folio represents a number of bytes which is a power-of-two in size.
+ * This function tells you which power-of-two the folio is.  See also
+ * folio_size() and folio_order().
+ *
+ * Context: The caller should have a reference on the folio to prevent
+ * it from being split.  It is not necessary for the folio to be locked.
+ * Return: The base-2 logarithm of the size of this folio.
+ */
+static inline unsigned int folio_shift(struct folio *folio)
+{
+       return PAGE_SHIFT + folio_order(folio);
+}
+
+/**
+ * folio_size - The number of bytes in a folio.
+ * @folio: The folio.
+ *
+ * Context: The caller should have a reference on the folio to prevent
+ * it from being split.  It is not necessary for the folio to be locked.
+ * Return: The number of bytes in this folio.
+ */
+static inline size_t folio_size(struct folio *folio)
+{
+       return PAGE_SIZE << folio_order(folio);
+}
+
+#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
+static inline int arch_make_page_accessible(struct page *page)
+{
+       return 0;
+}
+#endif
+
+#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
+static inline int arch_make_folio_accessible(struct folio *folio)
+{
+       int ret;
+       long i, nr = folio_nr_pages(folio);
+
+       for (i = 0; i < nr; i++) {
+               ret = arch_make_page_accessible(folio_page(folio, i));
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+#endif
+
 /*
  * Some inline functions in vmstat.h depend on page_zone()
  */
@@ -1635,19 +1787,6 @@ void page_address_init(void);
 
 extern void *page_rmapping(struct page *page);
 extern struct anon_vma *page_anon_vma(struct page *page);
-extern struct address_space *page_mapping(struct page *page);
-
-extern struct address_space *__page_file_mapping(struct page *);
-
-static inline
-struct address_space *page_file_mapping(struct page *page)
-{
-       if (unlikely(PageSwapCache(page)))
-               return __page_file_mapping(page);
-
-       return page->mapping;
-}
-
 extern pgoff_t __page_file_index(struct page *page);
 
 /*
@@ -1662,7 +1801,7 @@ static inline pgoff_t page_index(struct page *page)
 }
 
 bool page_mapped(struct page *page);
-struct address_space *page_mapping(struct page *page);
+bool folio_mapped(struct folio *folio);
 
 /*
  * Return true only if the page has been allocated with
@@ -1700,6 +1839,7 @@ extern void pagefault_out_of_memory(void);
 
 #define offset_in_page(p)      ((unsigned long)(p) & ~PAGE_MASK)
 #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
+#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
 
 /*
  * Flags passed to show_mem() and show_free_areas() to suppress output in
@@ -1854,20 +1994,9 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned int offset,
                              unsigned int length);
 
-int redirty_page_for_writepage(struct writeback_control *wbc,
-                               struct page *page);
-void account_page_cleaned(struct page *page, struct address_space *mapping,
-                         struct bdi_writeback *wb);
-int set_page_dirty(struct page *page);
+bool folio_mark_dirty(struct folio *folio);
+bool set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
-void __cancel_dirty_page(struct page *page);
-static inline void cancel_dirty_page(struct page *page)
-{
-       /* Avoid atomic ops, locking, etc. when not actually needed. */
-       if (PageDirty(page))
-               __cancel_dirty_page(page);
-}
-int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 
@@ -2659,10 +2788,6 @@ extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
 
-/* mm/page-writeback.c */
-int __must_check write_one_page(struct page *page);
-void task_dirty_inc(struct task_struct *tsk);
-
 extern unsigned long stack_guard_gap;
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
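To make the geometry helpers added above concrete (folio_order(), folio_shift(), folio_size(), folio_nr_pages(), folio_next()), a sketch that walks physically contiguous folios and sums the bytes they cover; the function name and the caller-supplied count are hypothetical.

/* Sketch: total bytes covered by @nr_folios physically contiguous folios. */
static size_t my_span_bytes(struct folio *first, unsigned long nr_folios)
{
	struct folio *folio = first;
	size_t bytes = 0;
	unsigned long i;

	for (i = 0; i < nr_folios; i++) {
		bytes += folio_size(folio);	/* PAGE_SIZE << folio_order() */
		folio = folio_next(folio);	/* valid only for contiguous memory */
	}
	return bytes;
}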
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 355ea1e..e2ec68b 100644 (file)
@@ -6,27 +6,33 @@
 #include <linux/swap.h>
 
 /**
- * page_is_file_lru - should the page be on a file LRU or anon LRU?
- * @page: the page to test
- *
- * Returns 1 if @page is a regular filesystem backed page cache page or a lazily
- * freed anonymous page (e.g. via MADV_FREE).  Returns 0 if @page is a normal
- * anonymous page, a tmpfs page or otherwise ram or swap backed page.  Used by
- * functions that manipulate the LRU lists, to sort a page onto the right LRU
- * list.
+ * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
+ * @folio: The folio to test.
  *
  * We would like to get this info without a page flag, but the state
- * needs to survive until the page is last deleted from the LRU, which
+ * needs to survive until the folio is last deleted from the LRU, which
  * could be as far down as __page_cache_release.
+ *
+ * Return: An integer (not a boolean!) used to sort a folio onto the
+ * right LRU list and to account folios correctly.
+ * 1 if @folio is a regular filesystem backed page cache folio
+ * or a lazily freed anonymous folio (e.g. via MADV_FREE).
+ * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
+ * ram or swap backed folio.
  */
+static inline int folio_is_file_lru(struct folio *folio)
+{
+       return !folio_test_swapbacked(folio);
+}
+
 static inline int page_is_file_lru(struct page *page)
 {
-       return !PageSwapBacked(page);
+       return folio_is_file_lru(page_folio(page));
 }
 
 static __always_inline void update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
-                               int nr_pages)
+                               long nr_pages)
 {
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
@@ -39,69 +45,94 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 }
 
 /**
- * __clear_page_lru_flags - clear page lru flags before releasing a page
- * @page: the page that was on lru and now has a zero reference
+ * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
+ * @folio: The folio that was on lru and now has a zero reference.
  */
-static __always_inline void __clear_page_lru_flags(struct page *page)
+static __always_inline void __folio_clear_lru_flags(struct folio *folio)
 {
-       VM_BUG_ON_PAGE(!PageLRU(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);
 
-       __ClearPageLRU(page);
+       __folio_clear_lru(folio);
 
        /* this shouldn't happen, so leave the flags to bad_page() */
-       if (PageActive(page) && PageUnevictable(page))
+       if (folio_test_active(folio) && folio_test_unevictable(folio))
                return;
 
-       __ClearPageActive(page);
-       __ClearPageUnevictable(page);
+       __folio_clear_active(folio);
+       __folio_clear_unevictable(folio);
+}
+
+static __always_inline void __clear_page_lru_flags(struct page *page)
+{
+       __folio_clear_lru_flags(page_folio(page));
 }
 
 /**
- * page_lru - which LRU list should a page be on?
- * @page: the page to test
+ * folio_lru_list - Which LRU list should a folio be on?
+ * @folio: The folio to test.
  *
- * Returns the LRU list a page should be on, as an index
+ * Return: The LRU list a folio should be on, as an index
  * into the array of LRU lists.
  */
-static __always_inline enum lru_list page_lru(struct page *page)
+static __always_inline enum lru_list folio_lru_list(struct folio *folio)
 {
        enum lru_list lru;
 
-       VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+       VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
 
-       if (PageUnevictable(page))
+       if (folio_test_unevictable(folio))
                return LRU_UNEVICTABLE;
 
-       lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
-       if (PageActive(page))
+       lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+       if (folio_test_active(folio))
                lru += LRU_ACTIVE;
 
        return lru;
 }
 
+static __always_inline
+void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
+{
+       enum lru_list lru = folio_lru_list(folio);
+
+       update_lru_size(lruvec, lru, folio_zonenum(folio),
+                       folio_nr_pages(folio));
+       list_add(&folio->lru, &lruvec->lists[lru]);
+}
+
 static __always_inline void add_page_to_lru_list(struct page *page,
                                struct lruvec *lruvec)
 {
-       enum lru_list lru = page_lru(page);
+       lruvec_add_folio(lruvec, page_folio(page));
+}
 
-       update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-       list_add(&page->lru, &lruvec->lists[lru]);
+static __always_inline
+void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
+{
+       enum lru_list lru = folio_lru_list(folio);
+
+       update_lru_size(lruvec, lru, folio_zonenum(folio),
+                       folio_nr_pages(folio));
+       list_add_tail(&folio->lru, &lruvec->lists[lru]);
 }
 
 static __always_inline void add_page_to_lru_list_tail(struct page *page,
                                struct lruvec *lruvec)
 {
-       enum lru_list lru = page_lru(page);
+       lruvec_add_folio_tail(lruvec, page_folio(page));
+}
 
-       update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-       list_add_tail(&page->lru, &lruvec->lists[lru]);
+static __always_inline
+void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
+{
+       list_del(&folio->lru);
+       update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio),
+                       -folio_nr_pages(folio));
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
                                struct lruvec *lruvec)
 {
-       list_del(&page->lru);
-       update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-                       -thp_nr_pages(page));
+       lruvec_del_folio(lruvec, page_folio(page));
 }
 #endif
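The lruvec helpers above bundle list manipulation with LRU size accounting, so moving a folio between positions is a del/add pair. A minimal sketch, assuming the caller holds the lruvec lock and the folio is on an LRU list; my_rotate_folio() is hypothetical.

/* Sketch: rotate a folio to the tail of its LRU list (lruvec lock held). */
static void my_rotate_folio(struct lruvec *lruvec, struct folio *folio)
{
	lruvec_del_folio(lruvec, folio);
	lruvec_add_folio_tail(lruvec, folio);
}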
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09..82dab23 100644 (file)
@@ -239,6 +239,72 @@ struct page {
 #endif
 } _struct_page_alignment;
 
+/**
+ * struct folio - Represents a contiguous set of bytes.
+ * @flags: Identical to the page flags.
+ * @lru: Least Recently Used list; tracks how recently this folio was used.
+ * @mapping: The file this page belongs to, or refers to the anon_vma for
+ *    anonymous memory.
+ * @index: Offset within the file, in units of pages.  For anonymous memory,
+ *    this is the index from the beginning of the mmap.
+ * @private: Filesystem per-folio data (see folio_attach_private()).
+ *    Used for swp_entry_t if folio_test_swapcache().
+ * @_mapcount: Do not access this member directly.  Use folio_mapcount() to
+ *    find out how many times this folio is mapped by userspace.
+ * @_refcount: Do not access this member directly.  Use folio_ref_count()
+ *    to find how many references there are to this folio.
+ * @memcg_data: Memory Control Group data.
+ *
+ * A folio is a physically, virtually and logically contiguous set
+ * of bytes.  It is a power-of-two in size, and it is aligned to that
+ * same power-of-two.  It is at least as large as %PAGE_SIZE.  If it is
+ * in the page cache, it is at a file offset which is a multiple of that
+ * power-of-two.  It may be mapped into userspace at an address which is
+ * at an arbitrary page offset, but its kernel virtual address is aligned
+ * to its size.
+ */
+struct folio {
+       /* private: don't document the anon union */
+       union {
+               struct {
+       /* public: */
+                       unsigned long flags;
+                       struct list_head lru;
+                       struct address_space *mapping;
+                       pgoff_t index;
+                       void *private;
+                       atomic_t _mapcount;
+                       atomic_t _refcount;
+#ifdef CONFIG_MEMCG
+                       unsigned long memcg_data;
+#endif
+       /* private: the union with struct page is transitional */
+               };
+               struct page page;
+       };
+};
+
+static_assert(sizeof(struct page) == sizeof(struct folio));
+#define FOLIO_MATCH(pg, fl)                                            \
+       static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl))
+FOLIO_MATCH(flags, flags);
+FOLIO_MATCH(lru, lru);
+FOLIO_MATCH(compound_head, lru);
+FOLIO_MATCH(index, index);
+FOLIO_MATCH(private, private);
+FOLIO_MATCH(_mapcount, _mapcount);
+FOLIO_MATCH(_refcount, _refcount);
+#ifdef CONFIG_MEMCG
+FOLIO_MATCH(memcg_data, memcg_data);
+#endif
+#undef FOLIO_MATCH
+
+static inline atomic_t *folio_mapcount_ptr(struct folio *folio)
+{
+       struct page *tail = &folio->page + 1;
+       return &tail->compound_mapcount;
+}
+
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
        return &page[1].compound_mapcount;
@@ -257,6 +323,12 @@ static inline atomic_t *compound_pincount_ptr(struct page *page)
 #define PAGE_FRAG_CACHE_MAX_SIZE       __ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER      get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 
+/*
+ * page_private can be used on tail pages.  However, PagePrivate is only
+ * checked by the VM on the head page.  So page_private on the tail pages
+ * should be used for data that's ancillary to the head page (eg attaching
+ * buffer heads to tail pages after attaching buffer heads to the head page)
+ */
 #define page_private(page)             ((page)->private)
 
 static inline void set_page_private(struct page *page, unsigned long private)
@@ -264,6 +336,11 @@ static inline void set_page_private(struct page *page, unsigned long private)
        page->private = private;
 }
 
+static inline void *folio_get_private(struct folio *folio)
+{
+       return folio->private;
+}
+
 struct page_frag_cache {
        void * va;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
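Because struct folio overlays struct page (enforced by the FOLIO_MATCH asserts above), its named fields can be used directly where page-based code went through page_private() and page->index. A small sketch; my_folio_fs_data() is hypothetical.

/* Sketch: per-folio filesystem data plus the folio's byte offset in its file. */
static void *my_folio_fs_data(struct folio *folio, loff_t *pos)
{
	*pos = (loff_t)folio->index << PAGE_SHIFT;	/* @index is in pages */
	return folio_get_private(folio);
}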
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0c0c9a0..52eae8c 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/mmc/card.h>
 #include <linux/mmc/pm.h>
 #include <linux/dma-direction.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 
 struct mmc_ios {
        unsigned int    clock;                  /* clock rate */
@@ -492,7 +492,7 @@ struct mmc_host {
 
        /* Inline encryption support */
 #ifdef CONFIG_MMC_CRYPTO
-       struct blk_keyslot_manager ksm;
+       struct blk_crypto_profile crypto_profile;
 #endif
 
        /* Host Software Queue support */
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 1935d4c..d7285f8 100644 (file)
@@ -22,6 +22,13 @@ void dump_mm(const struct mm_struct *mm);
                        BUG();                                          \
                }                                                       \
        } while (0)
+#define VM_BUG_ON_FOLIO(cond, folio)                                   \
+       do {                                                            \
+               if (unlikely(cond)) {                                   \
+                       dump_page(&folio->page, "VM_BUG_ON_FOLIO(" __stringify(cond)")");\
+                       BUG();                                          \
+               }                                                       \
+       } while (0)
 #define VM_BUG_ON_VMA(cond, vma)                                       \
        do {                                                            \
                if (unlikely(cond)) {                                   \
@@ -47,6 +54,17 @@ void dump_mm(const struct mm_struct *mm);
        }                                                               \
        unlikely(__ret_warn_once);                                      \
 })
+#define VM_WARN_ON_ONCE_FOLIO(cond, folio)     ({                      \
+       static bool __section(".data.once") __warned;                   \
+       int __ret_warn_once = !!(cond);                                 \
+                                                                       \
+       if (unlikely(__ret_warn_once && !__warned)) {                   \
+               dump_page(&folio->page, "VM_WARN_ON_ONCE_FOLIO(" __stringify(cond)")");\
+               __warned = true;                                        \
+               WARN_ON(1);                                             \
+       }                                                               \
+       unlikely(__ret_warn_once);                                      \
+})
 
 #define VM_WARN_ON(cond) (void)WARN_ON(cond)
 #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
@@ -55,11 +73,13 @@ void dump_mm(const struct mm_struct *mm);
 #else
 #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
+#define VM_BUG_ON_FOLIO(cond, folio) VM_BUG_ON(cond)
 #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
 #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_PAGE(cond, page)  BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #endif
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 5d6a415..12c4177 100644 (file)
@@ -22,6 +22,7 @@
  * Overload PG_private_2 to give us PG_fscache - this is used to indicate that
  * a page is currently backed by a local disk cache
  */
+#define folio_test_fscache(folio)      folio_test_private_2(folio)
 #define PageFsCache(page)              PagePrivate2((page))
 #define SetPageFsCache(page)           SetPagePrivate2((page))
 #define ClearPageFsCache(page)         ClearPagePrivate2((page))
 #define TestClearPageFsCache(page)     TestClearPagePrivate2((page))
 
 /**
- * set_page_fscache - Set PG_fscache on a page and take a ref
- * @page: The page.
+ * folio_start_fscache - Start an fscache write on a folio.
+ * @folio: The folio.
  *
- * Set the PG_fscache (PG_private_2) flag on a page and take the reference
- * needed for the VM to handle its lifetime correctly.  This sets the flag and
- * takes the reference unconditionally, so care must be taken not to set the
- * flag again if it's already set.
+ * Call this function before writing a folio to a local cache.  Starting a
+ * second write before the first one finishes is not allowed.
  */
-static inline void set_page_fscache(struct page *page)
+static inline void folio_start_fscache(struct folio *folio)
 {
-       set_page_private_2(page);
+       VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio);
+       folio_get(folio);
+       folio_set_private_2(folio);
 }
 
 /**
- * end_page_fscache - Clear PG_fscache and release any waiters
- * @page: The page
- *
- * Clear the PG_fscache (PG_private_2) bit on a page and wake up any sleepers
- * waiting for this.  The page ref held for PG_private_2 being set is released.
+ * folio_end_fscache - End an fscache write on a folio.
+ * @folio: The folio.
  *
- * This is, for example, used when a netfs page is being written to a local
- * disk cache, thereby allowing writes to the cache for the same page to be
- * serialised.
+ * Call this function after the folio has been written to the local cache.
+ * This will wake any sleepers waiting on this folio.
  */
-static inline void end_page_fscache(struct page *page)
+static inline void folio_end_fscache(struct folio *folio)
 {
-       end_page_private_2(page);
+       folio_end_private_2(folio);
 }
 
 /**
- * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_fscache - Wait for an fscache write on this folio to end.
+ * @folio: The folio.
  *
- * Wait for PG_fscache (aka PG_private_2) to be cleared on a page.
+ * If this folio is currently being written to a local cache, wait for
+ * the write to finish.  Another write may start after this one finishes,
+ * unless the caller holds the folio lock.
  */
-static inline void wait_on_page_fscache(struct page *page)
+static inline void folio_wait_fscache(struct folio *folio)
 {
-       wait_on_page_private_2(page);
+       folio_wait_private_2(folio);
 }
 
 /**
- * wait_on_page_fscache_killable - Wait for PG_fscache to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_fscache_killable - Wait for an fscache write on this folio to end.
+ * @folio: The folio.
  *
- * Wait for PG_fscache (aka PG_private_2) to be cleared on a page or until a
- * fatal signal is received by the calling task.
+ * If this folio is currently being written to a local cache, wait
+ * for the write to finish or for a fatal signal to be received.
+ * Another write may start after this one finishes, unless the caller
+ * holds the folio lock.
  *
  * Return:
  * - 0 if successful.
  * - -EINTR if a fatal signal was encountered.
  */
+static inline int folio_wait_fscache_killable(struct folio *folio)
+{
+       return folio_wait_private_2_killable(folio);
+}
+
+static inline void set_page_fscache(struct page *page)
+{
+       folio_start_fscache(page_folio(page));
+}
+
+static inline void end_page_fscache(struct page *page)
+{
+       folio_end_private_2(page_folio(page));
+}
+
+static inline void wait_on_page_fscache(struct page *page)
+{
+       folio_wait_private_2(page_folio(page));
+}
+
 static inline int wait_on_page_fscache_killable(struct page *page)
 {
-       return wait_on_page_private_2_killable(page);
+       return folio_wait_private_2_killable(page_folio(page));
 }
 
 enum netfs_read_source {
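A sketch of how a network filesystem brackets a cache write with the folio fscache helpers documented above; the submission and completion functions are hypothetical stand-ins for the filesystem's own I/O path.

/* Sketch: mark a folio as being written to the local cache, clear on completion. */
static void my_write_folio_to_cache(struct folio *folio)
{
	folio_start_fscache(folio);	/* takes a ref and sets PG_private_2 */
	my_submit_cache_write(folio);	/* hypothetical async write submission */
}

static void my_cache_write_done(struct folio *folio)
{
	folio_end_fscache(folio);	/* releases the ref and wakes any waiters */
}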
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 2a38f2b..cb909ed 100644 (file)
@@ -7,6 +7,7 @@
 #define _NVME_FC_DRIVER_H 1
 
 #include <linux/scatterlist.h>
+#include <linux/blk-mq.h>
 
 
 /*
@@ -497,6 +498,8 @@ struct nvme_fc_port_template {
        int     (*xmt_ls_rsp)(struct nvme_fc_local_port *localport,
                                struct nvme_fc_remote_port *rport,
                                struct nvmefc_ls_rsp *ls_rsp);
+       void    (*map_queues)(struct nvme_fc_local_port *localport,
+                             struct blk_mq_queue_map *map);
 
        u32     max_hw_queues;
        u16     max_sgl_segments;
@@ -779,6 +782,10 @@ struct nvmet_fc_target_port {
  *       LS received.
  *       Entrypoint is Mandatory.
  *
+ * @map_queues: This function lets the driver expose the queue mapping
+ *      to the block layer.
+ *       Entrypoint is Optional.
+ *
  * @fcp_op:  Called to perform a data transfer or transmit a response.
  *       The nvmefc_tgt_fcp_req structure is the same LLDD-supplied
  *       exchange structure specified in the nvmet_fc_rcv_fcp_req() call
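A sketch of an LLDD filling in the new optional map_queues entrypoint; with no device-specific IRQ affinity to report, deferring to the block layer's default mapping is a reasonable choice. The template and function names are hypothetical.

/* Sketch: default queue mapping for an FC LLDD. */
static void my_fc_map_queues(struct nvme_fc_local_port *localport,
			     struct blk_mq_queue_map *map)
{
	blk_mq_map_queues(map);		/* fall back to the generic mapping */
}

static struct nvme_fc_port_template my_fc_port_template = {
	/* ... mandatory entrypoints elided ... */
	.map_queues	= my_fc_map_queues,
};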
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index 3ec8e50..4dd7e6f 100644 (file)
@@ -6,6 +6,8 @@
 #ifndef _LINUX_NVME_RDMA_H
 #define _LINUX_NVME_RDMA_H
 
+#define NVME_RDMA_MAX_QUEUE_SIZE       128
+
 enum nvme_rdma_cm_fmt {
        NVME_RDMA_CM_FMT_1_0 = 0x0,
 };
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b7c4c41..855dd9b 100644 (file)
 #define NVME_NSID_ALL          0xffffffff
 
 enum nvme_subsys_type {
-       NVME_NQN_DISC   = 1,            /* Discovery type target subsystem */
-       NVME_NQN_NVME   = 2,            /* NVME type target subsystem */
+       /* Referral to another discovery type target subsystem */
+       NVME_NQN_DISC   = 1,
+
+       /* NVME type target subsystem */
+       NVME_NQN_NVME   = 2,
+
+       /* Current discovery type target subsystem */
+       NVME_NQN_CURR   = 3,
+};
+
+enum nvme_ctrl_type {
+       NVME_CTRL_IO    = 1,            /* I/O controller */
+       NVME_CTRL_DISC  = 2,            /* Discovery controller */
+       NVME_CTRL_ADMIN = 3,            /* Administrative controller */
 };
 
 /* Address Family codes for Discovery Log Page entry ADRFAM field */
@@ -244,7 +256,9 @@ struct nvme_id_ctrl {
        __le32                  rtd3e;
        __le32                  oaes;
        __le32                  ctratt;
-       __u8                    rsvd100[28];
+       __u8                    rsvd100[11];
+       __u8                    cntrltype;
+       __u8                    fguid[16];
        __le16                  crdt1;
        __le16                  crdt2;
        __le16                  crdt3;
@@ -312,6 +326,7 @@ struct nvme_id_ctrl {
 };
 
 enum {
+       NVME_CTRL_CMIC_MULTI_PORT               = 1 << 0,
        NVME_CTRL_CMIC_MULTI_CTRL               = 1 << 1,
        NVME_CTRL_CMIC_ANA                      = 1 << 3,
        NVME_CTRL_ONCS_COMPARE                  = 1 << 0,
@@ -1303,6 +1318,12 @@ struct nvmf_common_command {
 
 #define MAX_DISC_LOGS  255
 
+/* Discovery log page entry flags (EFLAGS): */
+enum {
+       NVME_DISC_EFLAGS_EPCSD          = (1 << 1),
+       NVME_DISC_EFLAGS_DUPRETINFO     = (1 << 0),
+};
+
 /* Discovery log page entry */
 struct nvmf_disc_rsp_page_entry {
        __u8            trtype;
@@ -1312,7 +1333,8 @@ struct nvmf_disc_rsp_page_entry {
        __le16          portid;
        __le16          cntlid;
        __le16          asqsz;
-       __u8            resv8[22];
+       __le16          eflags;
+       __u8            resv10[20];
        char            trsvcid[NVMF_TRSVCID_SIZE];
        __u8            resv64[192];
        char            subnqn[NVMF_NQN_FIELD_LEN];
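A sketch of consuming the new fields: testing an EFLAGS bit in a discovery log entry and the controller type reported in the Identify Controller data. The helper names are hypothetical.

/* Sketch: test the EPCSD flag of a discovery log page entry. */
static bool my_entry_has_epcsd(const struct nvmf_disc_rsp_page_entry *e)
{
	return le16_to_cpu(e->eflags) & NVME_DISC_EFLAGS_EPCSD;
}

/* Sketch: is this controller a discovery controller? */
static bool my_ctrl_is_discovery(const struct nvme_id_ctrl *id)
{
	return id->cntrltype == NVME_CTRL_DISC;
}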
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a558d67..d8623d6 100644 (file)
@@ -143,6 +143,8 @@ enum pageflags {
 #endif
        __NR_PAGEFLAGS,
 
+       PG_readahead = PG_reclaim,
+
        /* Filesystems */
        PG_checked = PG_owner_priv_1,
 
@@ -171,6 +173,15 @@ enum pageflags {
        /* Compound pages. Stored in first tail page's flags */
        PG_double_map = PG_workingset,
 
+#ifdef CONFIG_MEMORY_FAILURE
+       /*
+        * Compound pages. Stored in first tail page's flags.
+        * Indicates that at least one subpage is hwpoisoned in the
+        * THP.
+        */
+       PG_has_hwpoisoned = PG_mappedtodisk,
+#endif
+
        /* non-lru isolated movable page */
        PG_isolated = PG_reclaim,
 
@@ -193,6 +204,34 @@ static inline unsigned long _compound_head(const struct page *page)
 
 #define compound_head(page)    ((typeof(page))_compound_head(page))
 
+/**
+ * page_folio - Converts from page to folio.
+ * @p: The page.
+ *
+ * Every page is part of a folio.  This function cannot be called on a
+ * NULL pointer.
+ *
+ * Context: Neither a reference nor a lock is required on @p.  If the caller
+ * does not hold a reference, this call may race with a folio split, so
+ * it should re-check the folio still contains this page after gaining
+ * a reference on the folio.
+ * Return: The folio which contains this page.
+ */
+#define page_folio(p)          (_Generic((p),                          \
+       const struct page *:    (const struct folio *)_compound_head(p), \
+       struct page *:          (struct folio *)_compound_head(p)))
+
+/**
+ * folio_page - Return a page from a folio.
+ * @folio: The folio.
+ * @n: The page number to return.
+ *
+ * @n is relative to the start of the folio.  This function does not
+ * check that the page number lies within @folio; the caller is presumed
+ * to have a reference to the page.
+ */
+#define folio_page(folio, n)   nth_page(&(folio)->page, n)
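A minimal sketch of the lookup-then-recheck pattern that the page_folio() kernel-doc above calls for; the helper name is hypothetical, folio_try_get() is added further down in page_ref.h, and folio_put() lives in <linux/mm.h>.

#include <linux/mm.h>

/* Hypothetical: pin the folio that currently contains @page. */
static struct folio *example_pin_folio(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (!folio_try_get(folio))
                return NULL;

        /* The folio may have been split after we looked it up. */
        if (unlikely(page_folio(page) != folio)) {
                folio_put(folio);
                return NULL;
        }
        return folio;
}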
+
 static __always_inline int PageTail(struct page *page)
 {
        return READ_ONCE(page->compound_head) & 1;
@@ -217,6 +256,15 @@ static inline void page_init_poison(struct page *page, size_t size)
 }
 #endif
 
+static unsigned long *folio_flags(struct folio *folio, unsigned n)
+{
+       struct page *page = &folio->page;
+
+       VM_BUG_ON_PGFLAGS(PageTail(page), page);
+       VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
+       return &page[n].flags;
+}
+
 /*
  * Page flags policies wrt compound pages
  *
@@ -261,36 +309,64 @@ static inline void page_init_poison(struct page *page, size_t size)
                VM_BUG_ON_PGFLAGS(!PageHead(page), page);               \
                PF_POISONED_CHECK(&page[1]); })
 
+/* Which page is the flag stored in */
+#define FOLIO_PF_ANY           0
+#define FOLIO_PF_HEAD          0
+#define FOLIO_PF_ONLY_HEAD     0
+#define FOLIO_PF_NO_TAIL       0
+#define FOLIO_PF_NO_COMPOUND   0
+#define FOLIO_PF_SECOND                1
+
 /*
  * Macros to create function definitions for page flags
  */
 #define TESTPAGEFLAG(uname, lname, policy)                             \
+static __always_inline bool folio_test_##lname(struct folio *folio)    \
+{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }   \
 static __always_inline int Page##uname(struct page *page)              \
-       { return test_bit(PG_##lname, &policy(page, 0)->flags); }
+{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
 
 #define SETPAGEFLAG(uname, lname, policy)                              \
+static __always_inline                                                 \
+void folio_set_##lname(struct folio *folio)                            \
+{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }           \
 static __always_inline void SetPage##uname(struct page *page)          \
-       { set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define CLEARPAGEFLAG(uname, lname, policy)                            \
+static __always_inline                                                 \
+void folio_clear_##lname(struct folio *folio)                          \
+{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }         \
 static __always_inline void ClearPage##uname(struct page *page)                \
-       { clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define __SETPAGEFLAG(uname, lname, policy)                            \
+static __always_inline                                                 \
+void __folio_set_##lname(struct folio *folio)                          \
+{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }         \
 static __always_inline void __SetPage##uname(struct page *page)                \
-       { __set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define __CLEARPAGEFLAG(uname, lname, policy)                          \
+static __always_inline                                                 \
+void __folio_clear_##lname(struct folio *folio)                                \
+{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }       \
 static __always_inline void __ClearPage##uname(struct page *page)      \
-       { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define TESTSETFLAG(uname, lname, policy)                              \
+static __always_inline                                                 \
+bool folio_test_set_##lname(struct folio *folio)                       \
+{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
 static __always_inline int TestSetPage##uname(struct page *page)       \
-       { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define TESTCLEARFLAG(uname, lname, policy)                            \
+static __always_inline                                                 \
+bool folio_test_clear_##lname(struct folio *folio)                     \
+{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
 static __always_inline int TestClearPage##uname(struct page *page)     \
-       { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
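For orientation, roughly what one invocation of these generators produces, using the existing Dirty flag (TESTPAGEFLAG(Dirty, dirty, PF_HEAD)); the expansion below is paraphrased rather than copied from preprocessor output.

static __always_inline bool folio_test_dirty(struct folio *folio)
{
        /* FOLIO_PF_HEAD == 0: the bit lives in the folio's first page */
        return test_bit(PG_dirty, folio_flags(folio, FOLIO_PF_HEAD));
}

static __always_inline int PageDirty(struct page *page)
{
        /* PF_HEAD() redirects a tail page to its head page first */
        return test_bit(PG_dirty, &PF_HEAD(page, 0)->flags);
}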
 
 #define PAGEFLAG(uname, lname, policy)                                 \
        TESTPAGEFLAG(uname, lname, policy)                              \
@@ -306,29 +382,37 @@ static __always_inline int TestClearPage##uname(struct page *page)        \
        TESTSETFLAG(uname, lname, policy)                               \
        TESTCLEARFLAG(uname, lname, policy)
 
-#define TESTPAGEFLAG_FALSE(uname)                                      \
+#define TESTPAGEFLAG_FALSE(uname, lname)                               \
+static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \
 static inline int Page##uname(const struct page *page) { return 0; }
 
-#define SETPAGEFLAG_NOOP(uname)                                                \
+#define SETPAGEFLAG_NOOP(uname, lname)                                 \
+static inline void folio_set_##lname(struct folio *folio) { }          \
 static inline void SetPage##uname(struct page *page) {  }
 
-#define CLEARPAGEFLAG_NOOP(uname)                                      \
+#define CLEARPAGEFLAG_NOOP(uname, lname)                               \
+static inline void folio_clear_##lname(struct folio *folio) { }                \
 static inline void ClearPage##uname(struct page *page) {  }
 
-#define __CLEARPAGEFLAG_NOOP(uname)                                    \
+#define __CLEARPAGEFLAG_NOOP(uname, lname)                             \
+static inline void __folio_clear_##lname(struct folio *folio) { }      \
 static inline void __ClearPage##uname(struct page *page) {  }
 
-#define TESTSETFLAG_FALSE(uname)                                       \
+#define TESTSETFLAG_FALSE(uname, lname)                                        \
+static inline bool folio_test_set_##lname(struct folio *folio)         \
+{ return 0; }                                                          \
 static inline int TestSetPage##uname(struct page *page) { return 0; }
 
-#define TESTCLEARFLAG_FALSE(uname)                                     \
+#define TESTCLEARFLAG_FALSE(uname, lname)                              \
+static inline bool folio_test_clear_##lname(struct folio *folio)       \
+{ return 0; }                                                          \
 static inline int TestClearPage##uname(struct page *page) { return 0; }
 
-#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname)                        \
-       SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
+#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname)  \
+       SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname)
 
-#define TESTSCFLAG_FALSE(uname)                                                \
-       TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
+#define TESTSCFLAG_FALSE(uname, lname)                                 \
+       TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
 PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
@@ -384,8 +468,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
 PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
        TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
-PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
-       TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND)
+       TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND)
 
 #ifdef CONFIG_HIGHMEM
 /*
@@ -394,22 +478,25 @@ PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
  */
 #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
 #else
-PAGEFLAG_FALSE(HighMem)
+PAGEFLAG_FALSE(HighMem, highmem)
 #endif
 
 #ifdef CONFIG_SWAP
-static __always_inline int PageSwapCache(struct page *page)
+static __always_inline bool folio_test_swapcache(struct folio *folio)
 {
-#ifdef CONFIG_THP_SWAP
-       page = compound_head(page);
-#endif
-       return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags);
+       return folio_test_swapbacked(folio) &&
+                       test_bit(PG_swapcache, folio_flags(folio, 0));
+}
 
+static __always_inline bool PageSwapCache(struct page *page)
+{
+       return folio_test_swapcache(page_folio(page));
 }
+
 SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
 CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
 #else
-PAGEFLAG_FALSE(SwapCache)
+PAGEFLAG_FALSE(SwapCache, swapcache)
 #endif
 
 PAGEFLAG(Unevictable, unevictable, PF_HEAD)
@@ -421,14 +508,14 @@ PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
 #else
-PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
-       TESTSCFLAG_FALSE(Mlocked)
+PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked)
+       TESTSCFLAG_FALSE(Mlocked, mlocked)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
 PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
 #else
-PAGEFLAG_FALSE(Uncached)
+PAGEFLAG_FALSE(Uncached, uncached)
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
@@ -437,7 +524,7 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define __PG_HWPOISON (1UL << PG_hwpoison)
 extern bool take_page_off_buddy(struct page *page);
 #else
-PAGEFLAG_FALSE(HWPoison)
+PAGEFLAG_FALSE(HWPoison, hwpoison)
 #define __PG_HWPOISON 0
 #endif
 
@@ -451,7 +538,7 @@ PAGEFLAG(Idle, idle, PF_ANY)
 #ifdef CONFIG_KASAN_HW_TAGS
 PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
 #else
-PAGEFLAG_FALSE(SkipKASanPoison)
+PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
 #endif
 
 /*
@@ -489,10 +576,14 @@ static __always_inline int PageMappingFlags(struct page *page)
        return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
 }
 
-static __always_inline int PageAnon(struct page *page)
+static __always_inline bool folio_test_anon(struct folio *folio)
+{
+       return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
+static __always_inline bool PageAnon(struct page *page)
 {
-       page = compound_head(page);
-       return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+       return folio_test_anon(page_folio(page));
 }
 
 static __always_inline int __PageMovable(struct page *page)
@@ -508,30 +599,32 @@ static __always_inline int __PageMovable(struct page *page)
  * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
  * anon_vma, but to that page's node of the stable tree.
  */
-static __always_inline int PageKsm(struct page *page)
+static __always_inline bool folio_test_ksm(struct folio *folio)
 {
-       page = compound_head(page);
-       return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
+       return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
                                PAGE_MAPPING_KSM;
 }
+
+static __always_inline bool PageKsm(struct page *page)
+{
+       return folio_test_ksm(page_folio(page));
+}
 #else
-TESTPAGEFLAG_FALSE(Ksm)
+TESTPAGEFLAG_FALSE(Ksm, ksm)
 #endif
 
 u64 stable_page_flags(struct page *page);
 
-static inline int PageUptodate(struct page *page)
+static inline bool folio_test_uptodate(struct folio *folio)
 {
-       int ret;
-       page = compound_head(page);
-       ret = test_bit(PG_uptodate, &(page)->flags);
+       bool ret = test_bit(PG_uptodate, folio_flags(folio, 0));
        /*
-        * Must ensure that the data we read out of the page is loaded
-        * _after_ we've loaded page->flags to check for PageUptodate.
-        * We can skip the barrier if the page is not uptodate, because
+        * Must ensure that the data we read out of the folio is loaded
+        * _after_ we've loaded folio->flags to check the uptodate bit.
+        * We can skip the barrier if the folio is not uptodate, because
         * we wouldn't be reading anything from it.
         *
-        * See SetPageUptodate() for the other side of the story.
+        * See folio_mark_uptodate() for the other side of the story.
         */
        if (ret)
                smp_rmb();
@@ -539,47 +632,71 @@ static inline int PageUptodate(struct page *page)
        return ret;
 }
 
-static __always_inline void __SetPageUptodate(struct page *page)
+static inline int PageUptodate(struct page *page)
+{
+       return folio_test_uptodate(page_folio(page));
+}
+
+static __always_inline void __folio_mark_uptodate(struct folio *folio)
 {
-       VM_BUG_ON_PAGE(PageTail(page), page);
        smp_wmb();
-       __set_bit(PG_uptodate, &page->flags);
+       __set_bit(PG_uptodate, folio_flags(folio, 0));
 }
 
-static __always_inline void SetPageUptodate(struct page *page)
+static __always_inline void folio_mark_uptodate(struct folio *folio)
 {
-       VM_BUG_ON_PAGE(PageTail(page), page);
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
-        * so that all previous stores issued in order to bring the page
-        * uptodate are actually visible before PageUptodate becomes true.
+        * so that all previous stores issued in order to bring the folio
+        * uptodate are actually visible before folio_test_uptodate becomes true.
         */
        smp_wmb();
-       set_bit(PG_uptodate, &page->flags);
+       set_bit(PG_uptodate, folio_flags(folio, 0));
+}
+
+static __always_inline void __SetPageUptodate(struct page *page)
+{
+       __folio_mark_uptodate((struct folio *)page);
+}
+
+static __always_inline void SetPageUptodate(struct page *page)
+{
+       folio_mark_uptodate((struct folio *)page);
 }
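A sketch of the write-then-publish ordering these barriers enforce, as it might appear in a read-completion path; the handler name is hypothetical, while folio_mark_uptodate() and folio_unlock() are the helpers introduced in these headers.

/* Hypothetical completion handler for a freshly filled folio. */
static void example_read_folio_done(struct folio *folio, int err)
{
        if (!err) {
                /*
                 * All stores that filled the folio are ordered before the
                 * smp_wmb() in folio_mark_uptodate(), so any reader that
                 * sees the uptodate bit also sees the data.
                 */
                folio_mark_uptodate(folio);
        }
        folio_unlock(folio);
}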
 
 CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
 
-int test_clear_page_writeback(struct page *page);
-int __test_set_page_writeback(struct page *page, bool keep_write);
+bool __folio_start_writeback(struct folio *folio, bool keep_write);
+bool set_page_writeback(struct page *page);
 
-#define test_set_page_writeback(page)                  \
-       __test_set_page_writeback(page, false)
-#define test_set_page_writeback_keepwrite(page)        \
-       __test_set_page_writeback(page, true)
+#define folio_start_writeback(folio)                   \
+       __folio_start_writeback(folio, false)
+#define folio_start_writeback_keepwrite(folio) \
+       __folio_start_writeback(folio, true)
 
-static inline void set_page_writeback(struct page *page)
+static inline void set_page_writeback_keepwrite(struct page *page)
 {
-       test_set_page_writeback(page);
+       folio_start_writeback_keepwrite(page_folio(page));
 }
 
-static inline void set_page_writeback_keepwrite(struct page *page)
+static inline bool test_set_page_writeback(struct page *page)
 {
-       test_set_page_writeback_keepwrite(page);
+       return set_page_writeback(page);
 }
 
 __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
 
+/* Whether there are one or multiple pages in a folio */
+static inline bool folio_test_single(struct folio *folio)
+{
+       return !folio_test_head(folio);
+}
+
+static inline bool folio_test_multi(struct folio *folio)
+{
+       return folio_test_head(folio);
+}
+
 static __always_inline void set_compound_head(struct page *page, struct page *head)
 {
        WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
@@ -603,12 +720,15 @@ static inline void ClearPageCompound(struct page *page)
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
 int PageHeadHuge(struct page *page);
+static inline bool folio_test_hugetlb(struct folio *folio)
+{
+       return PageHeadHuge(&folio->page);
+}
 #else
-TESTPAGEFLAG_FALSE(Huge)
-TESTPAGEFLAG_FALSE(HeadHuge)
+TESTPAGEFLAG_FALSE(Huge, hugetlb)
+TESTPAGEFLAG_FALSE(HeadHuge, headhuge)
 #endif
 
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * PageHuge() only returns true for hugetlbfs pages, but not for
@@ -624,6 +744,11 @@ static inline int PageTransHuge(struct page *page)
        return PageHead(page);
 }
 
+static inline bool folio_test_transhuge(struct folio *folio)
+{
+       return folio_test_head(folio);
+}
+
 /*
  * PageTransCompound returns true for both transparent huge pages
  * and hugetlbfs pages, so it should only be called when it's known
@@ -660,12 +785,26 @@ static inline int PageTransTail(struct page *page)
 PAGEFLAG(DoubleMap, double_map, PF_SECOND)
        TESTSCFLAG(DoubleMap, double_map, PF_SECOND)
 #else
-TESTPAGEFLAG_FALSE(TransHuge)
-TESTPAGEFLAG_FALSE(TransCompound)
-TESTPAGEFLAG_FALSE(TransCompoundMap)
-TESTPAGEFLAG_FALSE(TransTail)
-PAGEFLAG_FALSE(DoubleMap)
-       TESTSCFLAG_FALSE(DoubleMap)
+TESTPAGEFLAG_FALSE(TransHuge, transhuge)
+TESTPAGEFLAG_FALSE(TransCompound, transcompound)
+TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap)
+TESTPAGEFLAG_FALSE(TransTail, transtail)
+PAGEFLAG_FALSE(DoubleMap, double_map)
+       TESTSCFLAG_FALSE(DoubleMap, double_map)
+#endif
+
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+/*
+ * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the
+ * compound page.
+ *
+ * This flag is set by hwpoison handler.  Cleared by THP split or free page.
+ */
+PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
+       TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
+#else
+PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
+	TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
 #endif
 
 /*
@@ -849,6 +988,11 @@ static inline int page_has_private(struct page *page)
        return !!(page->flags & PAGE_FLAGS_PRIVATE);
 }
 
+static inline bool folio_has_private(struct folio *folio)
+{
+       return page_has_private(&folio->page);
+}
+
 #undef PF_ANY
 #undef PF_HEAD
 #undef PF_ONLY_HEAD
index d8a6aec..83abf95 100644 (file)
@@ -8,46 +8,16 @@
 
 #ifdef CONFIG_PAGE_IDLE_FLAG
 
-#ifdef CONFIG_64BIT
-static inline bool page_is_young(struct page *page)
-{
-       return PageYoung(page);
-}
-
-static inline void set_page_young(struct page *page)
-{
-       SetPageYoung(page);
-}
-
-static inline bool test_and_clear_page_young(struct page *page)
-{
-       return TestClearPageYoung(page);
-}
-
-static inline bool page_is_idle(struct page *page)
-{
-       return PageIdle(page);
-}
-
-static inline void set_page_idle(struct page *page)
-{
-       SetPageIdle(page);
-}
-
-static inline void clear_page_idle(struct page *page)
-{
-       ClearPageIdle(page);
-}
-#else /* !CONFIG_64BIT */
+#ifndef CONFIG_64BIT
 /*
  * If there is not enough space to store Idle and Young bits in page flags, use
  * page ext flags instead.
  */
 extern struct page_ext_operations page_idle_ops;
 
-static inline bool page_is_young(struct page *page)
+static inline bool folio_test_young(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return false;
@@ -55,9 +25,9 @@ static inline bool page_is_young(struct page *page)
        return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline void set_page_young(struct page *page)
+static inline void folio_set_young(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return;
@@ -65,9 +35,9 @@ static inline void set_page_young(struct page *page)
        set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline bool test_and_clear_page_young(struct page *page)
+static inline bool folio_test_clear_young(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return false;
@@ -75,9 +45,9 @@ static inline bool test_and_clear_page_young(struct page *page)
        return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline bool page_is_idle(struct page *page)
+static inline bool folio_test_idle(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return false;
@@ -85,9 +55,9 @@ static inline bool page_is_idle(struct page *page)
        return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 
-static inline void set_page_idle(struct page *page)
+static inline void folio_set_idle(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return;
@@ -95,46 +65,75 @@ static inline void set_page_idle(struct page *page)
        set_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 
-static inline void clear_page_idle(struct page *page)
+static inline void folio_clear_idle(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return;
 
        clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
-#endif /* CONFIG_64BIT */
+#endif /* !CONFIG_64BIT */
 
 #else /* !CONFIG_PAGE_IDLE_FLAG */
 
-static inline bool page_is_young(struct page *page)
+static inline bool folio_test_young(struct folio *folio)
 {
        return false;
 }
 
-static inline void set_page_young(struct page *page)
+static inline void folio_set_young(struct folio *folio)
 {
 }
 
-static inline bool test_and_clear_page_young(struct page *page)
+static inline bool folio_test_clear_young(struct folio *folio)
 {
        return false;
 }
 
-static inline bool page_is_idle(struct page *page)
+static inline bool folio_test_idle(struct folio *folio)
 {
        return false;
 }
 
-static inline void set_page_idle(struct page *page)
+static inline void folio_set_idle(struct folio *folio)
 {
 }
 
-static inline void clear_page_idle(struct page *page)
+static inline void folio_clear_idle(struct folio *folio)
 {
 }
 
 #endif /* CONFIG_PAGE_IDLE_FLAG */
 
+static inline bool page_is_young(struct page *page)
+{
+       return folio_test_young(page_folio(page));
+}
+
+static inline void set_page_young(struct page *page)
+{
+       folio_set_young(page_folio(page));
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return folio_test_clear_young(page_folio(page));
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return folio_test_idle(page_folio(page));
+}
+
+static inline void set_page_idle(struct page *page)
+{
+       folio_set_idle(page_folio(page));
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+       folio_clear_idle(page_folio(page));
+}
 #endif /* _LINUX_MM_PAGE_IDLE_H */
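A hedged sketch of the two-pass usage these helpers are designed for; the exact conditions under which the young and idle bits flip depend on the rmap and reclaim code, so this is illustrative only and both function names are hypothetical.

#include <linux/page_idle.h>

/* Pass one (hypothetical): mark the folio idle. */
static void example_mark_idle(struct folio *folio)
{
        folio_set_idle(folio);
}

/* Pass two (hypothetical): was it touched since example_mark_idle()? */
static bool example_was_accessed(struct folio *folio)
{
        return folio_test_clear_young(folio) || !folio_test_idle(folio);
}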
index 719bfe5..43c638c 100644 (file)
@@ -12,7 +12,7 @@ extern void __reset_page_owner(struct page *page, unsigned int order);
 extern void __set_page_owner(struct page *page,
                        unsigned int order, gfp_t gfp_mask);
 extern void __split_page_owner(struct page *page, unsigned int nr);
-extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
+extern void __folio_copy_owner(struct folio *newfolio, struct folio *old);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
 extern void __dump_page_owner(const struct page *page);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -36,10 +36,10 @@ static inline void split_page_owner(struct page *page, unsigned int nr)
        if (static_branch_unlikely(&page_owner_inited))
                __split_page_owner(page, nr);
 }
-static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+static inline void folio_copy_owner(struct folio *newfolio, struct folio *old)
 {
        if (static_branch_unlikely(&page_owner_inited))
-               __copy_page_owner(oldpage, newpage);
+               __folio_copy_owner(newfolio, old);
 }
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 {
@@ -63,7 +63,7 @@ static inline void split_page_owner(struct page *page,
                        unsigned int order)
 {
 }
-static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio)
 {
 }
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
index 7ad46f4..2e677e6 100644 (file)
@@ -67,9 +67,31 @@ static inline int page_ref_count(const struct page *page)
        return atomic_read(&page->_refcount);
 }
 
+/**
+ * folio_ref_count - The reference count on this folio.
+ * @folio: The folio.
+ *
+ * The refcount is usually incremented by calls to folio_get() and
+ * decremented by calls to folio_put().  Some typical users of the
+ * folio refcount:
+ *
+ * - Each reference from a page table
+ * - The page cache
+ * - Filesystem private data
+ * - The LRU list
+ * - Pipes
+ * - Direct IO which references this page in the process address space
+ *
+ * Return: The number of references to this folio.
+ */
+static inline int folio_ref_count(const struct folio *folio)
+{
+       return page_ref_count(&folio->page);
+}
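As a usage note, a small sketch pairing a temporary reference with the new counter accessor; the function is hypothetical, and folio_get()/folio_put() come from <linux/mm.h> rather than this header.

#include <linux/mm.h>

/* Hypothetical: briefly pin a folio while inspecting it. */
static void example_inspect_folio(struct folio *folio)
{
        folio_get(folio);                               /* +1 reference */
        VM_BUG_ON_FOLIO(folio_ref_count(folio) <= 0, folio);
        /* ... read-only inspection ... */
        folio_put(folio);                               /* drop our reference */
}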
+
 static inline int page_count(const struct page *page)
 {
-       return atomic_read(&compound_head(page)->_refcount);
+       return folio_ref_count(page_folio(page));
 }
 
 static inline void set_page_count(struct page *page, int v)
@@ -79,6 +101,11 @@ static inline void set_page_count(struct page *page, int v)
                __page_ref_set(page, v);
 }
 
+static inline void folio_set_count(struct folio *folio, int v)
+{
+       set_page_count(&folio->page, v);
+}
+
 /*
  * Setup the page count before being freed into the page allocator for
  * the first time (boot or memory hotplug)
@@ -95,6 +122,11 @@ static inline void page_ref_add(struct page *page, int nr)
                __page_ref_mod(page, nr);
 }
 
+static inline void folio_ref_add(struct folio *folio, int nr)
+{
+       page_ref_add(&folio->page, nr);
+}
+
 static inline void page_ref_sub(struct page *page, int nr)
 {
        atomic_sub(nr, &page->_refcount);
@@ -102,6 +134,11 @@ static inline void page_ref_sub(struct page *page, int nr)
                __page_ref_mod(page, -nr);
 }
 
+static inline void folio_ref_sub(struct folio *folio, int nr)
+{
+       page_ref_sub(&folio->page, nr);
+}
+
 static inline int page_ref_sub_return(struct page *page, int nr)
 {
        int ret = atomic_sub_return(nr, &page->_refcount);
@@ -111,6 +148,11 @@ static inline int page_ref_sub_return(struct page *page, int nr)
        return ret;
 }
 
+static inline int folio_ref_sub_return(struct folio *folio, int nr)
+{
+       return page_ref_sub_return(&folio->page, nr);
+}
+
 static inline void page_ref_inc(struct page *page)
 {
        atomic_inc(&page->_refcount);
@@ -118,6 +160,11 @@ static inline void page_ref_inc(struct page *page)
                __page_ref_mod(page, 1);
 }
 
+static inline void folio_ref_inc(struct folio *folio)
+{
+       page_ref_inc(&folio->page);
+}
+
 static inline void page_ref_dec(struct page *page)
 {
        atomic_dec(&page->_refcount);
@@ -125,6 +172,11 @@ static inline void page_ref_dec(struct page *page)
                __page_ref_mod(page, -1);
 }
 
+static inline void folio_ref_dec(struct folio *folio)
+{
+       page_ref_dec(&folio->page);
+}
+
 static inline int page_ref_sub_and_test(struct page *page, int nr)
 {
        int ret = atomic_sub_and_test(nr, &page->_refcount);
@@ -134,6 +186,11 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
        return ret;
 }
 
+static inline int folio_ref_sub_and_test(struct folio *folio, int nr)
+{
+       return page_ref_sub_and_test(&folio->page, nr);
+}
+
 static inline int page_ref_inc_return(struct page *page)
 {
        int ret = atomic_inc_return(&page->_refcount);
@@ -143,6 +200,11 @@ static inline int page_ref_inc_return(struct page *page)
        return ret;
 }
 
+static inline int folio_ref_inc_return(struct folio *folio)
+{
+       return page_ref_inc_return(&folio->page);
+}
+
 static inline int page_ref_dec_and_test(struct page *page)
 {
        int ret = atomic_dec_and_test(&page->_refcount);
@@ -152,6 +214,11 @@ static inline int page_ref_dec_and_test(struct page *page)
        return ret;
 }
 
+static inline int folio_ref_dec_and_test(struct folio *folio)
+{
+       return page_ref_dec_and_test(&folio->page);
+}
+
 static inline int page_ref_dec_return(struct page *page)
 {
        int ret = atomic_dec_return(&page->_refcount);
@@ -161,15 +228,91 @@ static inline int page_ref_dec_return(struct page *page)
        return ret;
 }
 
-static inline int page_ref_add_unless(struct page *page, int nr, int u)
+static inline int folio_ref_dec_return(struct folio *folio)
+{
+       return page_ref_dec_return(&folio->page);
+}
+
+static inline bool page_ref_add_unless(struct page *page, int nr, int u)
 {
-       int ret = atomic_add_unless(&page->_refcount, nr, u);
+       bool ret = atomic_add_unless(&page->_refcount, nr, u);
 
        if (page_ref_tracepoint_active(page_ref_mod_unless))
                __page_ref_mod_unless(page, nr, ret);
        return ret;
 }
 
+static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
+{
+       return page_ref_add_unless(&folio->page, nr, u);
+}
+
+/**
+ * folio_try_get - Attempt to increase the refcount on a folio.
+ * @folio: The folio.
+ *
+ * If you do not already have a reference to a folio, you can attempt to
+ * get one using this function.  It may fail if, for example, the folio
+ * has been freed since you found a pointer to it, or it is frozen for
+ * the purposes of splitting or migration.
+ *
+ * Return: True if the reference count was successfully incremented.
+ */
+static inline bool folio_try_get(struct folio *folio)
+{
+       return folio_ref_add_unless(folio, 1, 0);
+}
+
+static inline bool folio_ref_try_add_rcu(struct folio *folio, int count)
+{
+#ifdef CONFIG_TINY_RCU
+       /*
+        * The caller guarantees the folio will not be freed from interrupt
+        * context, so (on !SMP) we only need preemption to be disabled
+        * and TINY_RCU does that for us.
+        */
+# ifdef CONFIG_PREEMPT_COUNT
+       VM_BUG_ON(!in_atomic() && !irqs_disabled());
+# endif
+       VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
+       folio_ref_add(folio, count);
+#else
+       if (unlikely(!folio_ref_add_unless(folio, count, 0))) {
+               /* Either the folio has been freed, or will be freed. */
+               return false;
+       }
+#endif
+       return true;
+}
+
+/**
+ * folio_try_get_rcu - Attempt to increase the refcount on a folio.
+ * @folio: The folio.
+ *
+ * This is a version of folio_try_get() optimised for non-SMP kernels.
+ * If you are still holding the rcu_read_lock() after looking up the
+ * page and know that the page cannot have its refcount decreased to
+ * zero in interrupt context, you can use this instead of folio_try_get().
+ *
+ * Example users include get_user_pages_fast() (as pages are not unmapped
+ * from interrupt context) and the page cache lookups (as pages are not
+ * truncated from interrupt context).  We also know that pages are not
+ * frozen in interrupt context for the purposes of splitting or migration.
+ *
+ * You can also use this function if you're holding a lock that prevents
+ * pages being frozen & removed; eg the i_pages lock for the page cache
+ * or the mmap_lock or page table lock for page tables.  In this case,
+ * it will always succeed, and you could have used a plain folio_get(),
+ * but it's sometimes more convenient to have a common function called
+ * from both locked and RCU-protected contexts.
+ *
+ * Return: True if the reference count was successfully incremented.
+ */
+static inline bool folio_try_get_rcu(struct folio *folio)
+{
+       return folio_ref_try_add_rcu(folio, 1);
+}
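A sketch of the lockless lookup loop that folio_try_get_rcu() is intended for, in the spirit of the comment above; the function name is hypothetical, error handling is trimmed, and the xas_*() helpers come from <linux/xarray.h>.

#include <linux/pagemap.h>
#include <linux/xarray.h>

/* Hypothetical: find and pin the folio cached at @index, or return NULL. */
static struct folio *example_get_folio_rcu(struct address_space *mapping,
                                           pgoff_t index)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
repeat:
        xas_reset(&xas);
        folio = xas_load(&xas);
        if (xa_is_value(folio))
                folio = NULL;           /* shadow/DAX entry, not a folio */
        if (!folio)
                goto out;

        if (!folio_try_get_rcu(folio))
                goto repeat;            /* being freed; look again */

        /* The folio may have been removed or replaced after the load. */
        if (unlikely(folio != xas_reload(&xas))) {
                folio_put(folio);
                goto repeat;
        }
out:
        rcu_read_unlock();
        return folio;
}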
+
 static inline int page_ref_freeze(struct page *page, int count)
 {
        int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
@@ -179,6 +322,11 @@ static inline int page_ref_freeze(struct page *page, int count)
        return ret;
 }
 
+static inline int folio_ref_freeze(struct folio *folio, int count)
+{
+       return page_ref_freeze(&folio->page, count);
+}
+
 static inline void page_ref_unfreeze(struct page *page, int count)
 {
        VM_BUG_ON_PAGE(page_count(page) != 0, page);
@@ -189,4 +337,8 @@ static inline void page_ref_unfreeze(struct page *page, int count)
                __page_ref_unfreeze(page, count);
 }
 
+static inline void folio_ref_unfreeze(struct folio *folio, int count)
+{
+       page_ref_unfreeze(&folio->page, count);
+}
 #endif
index 62db6b0..013cdc9 100644 (file)
@@ -162,149 +162,119 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
 
 void release_pages(struct page **pages, int nr);
 
-/*
- * For file cache pages, return the address_space, otherwise return NULL
+struct address_space *page_mapping(struct page *);
+struct address_space *folio_mapping(struct folio *);
+struct address_space *swapcache_mapping(struct folio *);
+
+/**
+ * folio_file_mapping - Find the mapping this folio belongs to.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to.  Folios in the swap cache return the mapping of the
+ * swap file or swap device where the data is stored.  This is different
+ * from the mapping returned by folio_mapping().  The only reason to
+ * use it is if, like NFS, you return 0 from ->swap_activate.
+ *
+ * Do not call this for folios which aren't in the page cache or swap cache.
  */
-static inline struct address_space *page_mapping_file(struct page *page)
+static inline struct address_space *folio_file_mapping(struct folio *folio)
 {
-       if (unlikely(PageSwapCache(page)))
-               return NULL;
-       return page_mapping(page);
+       if (unlikely(folio_test_swapcache(folio)))
+               return swapcache_mapping(folio);
+
+       return folio->mapping;
+}
+
+static inline struct address_space *page_file_mapping(struct page *page)
+{
+       return folio_file_mapping(page_folio(page));
 }
 
 /*
- * speculatively take a reference to a page.
- * If the page is free (_refcount == 0), then _refcount is untouched, and 0
- * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
- *
- * This function must be called inside the same rcu_read_lock() section as has
- * been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
- *
- * Unless an RCU grace period has passed, the count of all pages coming out
- * of the allocator must be considered unstable. page_count may return higher
- * than expected, and put_page must be able to do the right thing when the
- * page has been finished with, no matter what it is subsequently allocated
- * for (because put_page is what is used here to drop an invalid speculative
- * reference).
- *
- * This is the interesting part of the lockless pagecache (and lockless
- * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
- * has the following pattern:
- * 1. find page in radix tree
- * 2. conditionally increment refcount
- * 3. check the page is still in pagecache (if no, goto 1)
- *
- * Remove-side that cares about stability of _refcount (eg. reclaim) has the
- * following (with the i_pages lock held):
- * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
- * B. remove page from pagecache
- * C. free the page
- *
- * There are 2 critical interleavings that matter:
- * - 2 runs before A: in this case, A sees elevated refcount and bails out
- * - A runs before 2: in this case, 2 sees zero refcount and retries;
- *   subsequently, B will complete and 1 will find no page, causing the
- *   lookup to return NULL.
- *
- * It is possible that between 1 and 2, the page is removed then the exact same
- * page is inserted into the same position in pagecache. That's OK: the
- * old find_get_page using a lock could equally have run before or after
- * such a re-insertion, depending on order that locks are granted.
- *
- * Lookups racing against pagecache insertion isn't a big problem: either 1
- * will find the page or it will not. Likewise, the old find_get_page could run
- * either before the insertion or afterwards, depending on timing.
+ * For file cache pages, return the address_space, otherwise return NULL
  */
-static inline int __page_cache_add_speculative(struct page *page, int count)
+static inline struct address_space *page_mapping_file(struct page *page)
 {
-#ifdef CONFIG_TINY_RCU
-# ifdef CONFIG_PREEMPT_COUNT
-       VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
-       /*
-        * Preempt must be disabled here - we rely on rcu_read_lock doing
-        * this for us.
-        *
-        * Pagecache won't be truncated from interrupt context, so if we have
-        * found a page in the radix tree here, we have pinned its refcount by
-        * disabling preempt, and hence no need for the "speculative get" that
-        * SMP requires.
-        */
-       VM_BUG_ON_PAGE(page_count(page) == 0, page);
-       page_ref_add(page, count);
+       struct folio *folio = page_folio(page);
 
-#else
-       if (unlikely(!page_ref_add_unless(page, count, 0))) {
-               /*
-                * Either the page has been freed, or will be freed.
-                * In either case, retry here and the caller should
-                * do the right thing (see comments above).
-                */
-               return 0;
-       }
-#endif
-       VM_BUG_ON_PAGE(PageTail(page), page);
-
-       return 1;
+       if (unlikely(folio_test_swapcache(folio)))
+               return NULL;
+       return folio_mapping(folio);
 }
 
-static inline int page_cache_get_speculative(struct page *page)
+static inline bool page_cache_add_speculative(struct page *page, int count)
 {
-       return __page_cache_add_speculative(page, 1);
+       VM_BUG_ON_PAGE(PageTail(page), page);
+       return folio_ref_try_add_rcu((struct folio *)page, count);
 }
 
-static inline int page_cache_add_speculative(struct page *page, int count)
+static inline bool page_cache_get_speculative(struct page *page)
 {
-       return __page_cache_add_speculative(page, count);
+       return page_cache_add_speculative(page, 1);
 }
 
 /**
- * attach_page_private - Attach private data to a page.
- * @page: Page to attach data to.
- * @data: Data to attach to page.
+ * folio_attach_private - Attach private data to a folio.
+ * @folio: Folio to attach data to.
+ * @data: Data to attach to folio.
  *
- * Attaching private data to a page increments the page's reference count.
- * The data must be detached before the page will be freed.
+ * Attaching private data to a folio increments the folio's reference count.
+ * The data must be detached before the folio will be freed.
  */
-static inline void attach_page_private(struct page *page, void *data)
+static inline void folio_attach_private(struct folio *folio, void *data)
 {
-       get_page(page);
-       set_page_private(page, (unsigned long)data);
-       SetPagePrivate(page);
+       folio_get(folio);
+       folio->private = data;
+       folio_set_private(folio);
 }
 
 /**
- * detach_page_private - Detach private data from a page.
- * @page: Page to detach data from.
+ * folio_detach_private - Detach private data from a folio.
+ * @folio: Folio to detach data from.
  *
- * Removes the data that was previously attached to the page and decrements
+ * Removes the data that was previously attached to the folio and decrements
  * the refcount on the page.
  *
- * Return: Data that was attached to the page.
+ * Return: Data that was attached to the folio.
  */
-static inline void *detach_page_private(struct page *page)
+static inline void *folio_detach_private(struct folio *folio)
 {
-       void *data = (void *)page_private(page);
+       void *data = folio_get_private(folio);
 
-       if (!PagePrivate(page))
+       if (!folio_test_private(folio))
                return NULL;
-       ClearPagePrivate(page);
-       set_page_private(page, 0);
-       put_page(page);
+       folio_clear_private(folio);
+       folio->private = NULL;
+       folio_put(folio);
 
        return data;
 }
 
+static inline void attach_page_private(struct page *page, void *data)
+{
+       folio_attach_private(page_folio(page), data);
+}
+
+static inline void *detach_page_private(struct page *page)
+{
+       return folio_detach_private(page_folio(page));
+}
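A minimal sketch of the attach/detach pairing for per-folio filesystem state; the structure and both functions are hypothetical.

#include <linux/pagemap.h>
#include <linux/slab.h>

/* Hypothetical per-folio state a filesystem might track. */
struct example_folio_state {
        unsigned long   io_cookie;
};

static int example_init_folio(struct folio *folio)
{
        struct example_folio_state *state;

        state = kzalloc(sizeof(*state), GFP_KERNEL);
        if (!state)
                return -ENOMEM;
        folio_attach_private(folio, state);     /* takes a folio reference */
        return 0;
}

static void example_release_folio(struct folio *folio)
{
        /* Drops the reference taken at attach time and clears the flag. */
        kfree(folio_detach_private(folio));
}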
+
 #ifdef CONFIG_NUMA
-extern struct page *__page_cache_alloc(gfp_t gfp);
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
-static inline struct page *__page_cache_alloc(gfp_t gfp)
+static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
 {
-       return alloc_pages(gfp, 0);
+       return folio_alloc(gfp, order);
 }
 #endif
 
+static inline struct page *__page_cache_alloc(gfp_t gfp)
+{
+       return &filemap_alloc_folio(gfp, 0)->page;
+}
+
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
        return __page_cache_alloc(mapping_gfp_mask(x));
@@ -331,9 +301,28 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_FOR_MMAP           0x00000040
 #define FGP_HEAD               0x00000080
 #define FGP_ENTRY              0x00000100
+#define FGP_STABLE             0x00000200
 
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
-               int fgp_flags, gfp_t cache_gfp_mask);
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+               int fgp_flags, gfp_t gfp);
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+               int fgp_flags, gfp_t gfp);
+
+/**
+ * filemap_get_folio - Find and get a folio.
+ * @mapping: The address_space to search.
+ * @index: The page index.
+ *
+ * Looks up the page cache entry at @mapping & @index.  If a folio is
+ * present, it is returned with an increased refcount.
+ *
+ * Otherwise, %NULL is returned.
+ */
+static inline struct folio *filemap_get_folio(struct address_space *mapping,
+                                       pgoff_t index)
+{
+       return __filemap_get_folio(mapping, index, 0, 0);
+}
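A short usage sketch for the new lookup helper; the function is hypothetical and folio_put() comes from <linux/mm.h>.

#include <linux/pagemap.h>

/* Hypothetical: is the folio cached at @index already uptodate? */
static bool example_index_uptodate(struct address_space *mapping, pgoff_t index)
{
        struct folio *folio = filemap_get_folio(mapping, index);
        bool ret;

        if (!folio)
                return false;
        ret = folio_test_uptodate(folio);
        folio_put(folio);               /* drop the lookup reference */
        return ret;
}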
 
 /**
  * find_get_page - find and get a page reference
@@ -377,25 +366,6 @@ static inline struct page *find_lock_page(struct address_space *mapping,
 }
 
 /**
- * find_lock_head - Locate, pin and lock a pagecache page.
- * @mapping: The address_space to search.
- * @index: The page index.
- *
- * Looks up the page cache entry at @mapping & @index.  If there is a
- * page cache page, its head page is returned locked and with an increased
- * refcount.
- *
- * Context: May sleep.
- * Return: A struct page which is !PageTail, or %NULL if there is no page
- * in the cache for this index.
- */
-static inline struct page *find_lock_head(struct address_space *mapping,
-                                       pgoff_t index)
-{
-       return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
-}
-
-/**
  * find_or_create_page - locate or add a pagecache page
  * @mapping: the page's address_space
  * @index: the page's index into the mapping
@@ -452,6 +422,73 @@ static inline bool thp_contains(struct page *head, pgoff_t index)
        return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL));
 }
 
+#define swapcache_index(folio) __page_file_index(&(folio)->page)
+
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to.  If you know
+ * the page is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+        if (unlikely(folio_test_swapcache(folio)))
+                return swapcache_index(folio);
+        return folio->index;
+}
+
+/**
+ * folio_next_index - Get the index of the next folio.
+ * @folio: The current folio.
+ *
+ * Return: The index of the folio which follows this folio in the file.
+ */
+static inline pgoff_t folio_next_index(struct folio *folio)
+{
+       return folio->index + folio_nr_pages(folio);
+}
+
+/**
+ * folio_file_page - The page for a particular index.
+ * @folio: The folio which contains this index.
+ * @index: The index we want to look up.
+ *
+ * Sometimes after looking up a folio in the page cache, we need to
+ * obtain the specific page for an index (eg a page fault).
+ *
+ * Return: The page containing the file data for this index.
+ */
+static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
+{
+       /* HugeTLBfs indexes the page cache in units of hpage_size */
+       if (folio_test_hugetlb(folio))
+               return &folio->page;
+       return folio_page(folio, index & (folio_nr_pages(folio) - 1));
+}
+
+/**
+ * folio_contains - Does this folio contain this index?
+ * @folio: The folio.
+ * @index: The page index within the file.
+ *
+ * Context: The caller should have the page locked in order to prevent
+ * (eg) shmem from moving the page between the page cache and swap cache
+ * and changing its index in the middle of the operation.
+ * Return: true or false.
+ */
+static inline bool folio_contains(struct folio *folio, pgoff_t index)
+{
+       /* HugeTLBfs indexes the page cache in units of hpage_size */
+       if (folio_test_hugetlb(folio))
+               return folio->index == index;
+       return index - folio_index(folio) < folio_nr_pages(folio);
+}
+
 /*
  * Given the page we found in the page cache, return the page corresponding
  * to this index in the file
@@ -560,6 +597,27 @@ static inline loff_t page_file_offset(struct page *page)
        return ((loff_t)page_index(page)) << PAGE_SHIFT;
 }
 
+/**
+ * folio_pos - Returns the byte position of this folio in its file.
+ * @folio: The folio.
+ */
+static inline loff_t folio_pos(struct folio *folio)
+{
+       return page_offset(&folio->page);
+}
+
+/**
+ * folio_file_pos - Returns the byte position of this folio in its file.
+ * @folio: The folio.
+ *
+ * This differs from folio_pos() for folios which belong to a swap file.
+ * NFS is the only filesystem today which needs to use folio_file_pos().
+ */
+static inline loff_t folio_file_pos(struct folio *folio)
+{
+       return page_file_offset(&folio->page);
+}
+
 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
                                     unsigned long address);
 
@@ -575,13 +633,13 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 }
 
 struct wait_page_key {
-       struct page *page;
+       struct folio *folio;
        int bit_nr;
        int page_match;
 };
 
 struct wait_page_queue {
-       struct page *page;
+       struct folio *folio;
        int bit_nr;
        wait_queue_entry_t wait;
 };
@@ -589,7 +647,7 @@ struct wait_page_queue {
 static inline bool wake_page_match(struct wait_page_queue *wait_page,
                                  struct wait_page_key *key)
 {
-       if (wait_page->page != key->page)
+       if (wait_page->folio != key->folio)
               return false;
        key->page_match = 1;
 
@@ -599,20 +657,31 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
        return true;
 }
 
-extern void __lock_page(struct page *page);
-extern int __lock_page_killable(struct page *page);
-extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
-extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+void __folio_lock(struct folio *folio);
+int __folio_lock_killable(struct folio *folio);
+bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
                                unsigned int flags);
-extern void unlock_page(struct page *page);
+void unlock_page(struct page *page);
+void folio_unlock(struct folio *folio);
+
+static inline bool folio_trylock(struct folio *folio)
+{
+       return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0)));
+}
 
 /*
  * Return true if the page was successfully locked
  */
 static inline int trylock_page(struct page *page)
 {
-       page = compound_head(page);
-       return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
+       return folio_trylock(page_folio(page));
+}
+
+static inline void folio_lock(struct folio *folio)
+{
+       might_sleep();
+       if (!folio_trylock(folio))
+               __folio_lock(folio);
 }
 
 /*
@@ -620,38 +689,30 @@ static inline int trylock_page(struct page *page)
  */
 static inline void lock_page(struct page *page)
 {
+       struct folio *folio;
        might_sleep();
-       if (!trylock_page(page))
-               __lock_page(page);
+
+       folio = page_folio(page);
+       if (!folio_trylock(folio))
+               __folio_lock(folio);
 }
 
-/*
- * lock_page_killable is like lock_page but can be interrupted by fatal
- * signals.  It returns 0 if it locked the page and -EINTR if it was
- * killed while waiting.
- */
-static inline int lock_page_killable(struct page *page)
+static inline int folio_lock_killable(struct folio *folio)
 {
        might_sleep();
-       if (!trylock_page(page))
-               return __lock_page_killable(page);
+       if (!folio_trylock(folio))
+               return __folio_lock_killable(folio);
        return 0;
 }
 
 /*
- * lock_page_async - Lock the page, unless this would block. If the page
- * is already locked, then queue a callback when the page becomes unlocked.
- * This callback can then retry the operation.
- *
- * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page
- * was already locked and the callback defined in 'wait' was queued.
+ * lock_page_killable is like lock_page but can be interrupted by fatal
+ * signals.  It returns 0 if it locked the page and -EINTR if it was
+ * killed while waiting.
  */
-static inline int lock_page_async(struct page *page,
-                                 struct wait_page_queue *wait)
+static inline int lock_page_killable(struct page *page)
 {
-       if (!trylock_page(page))
-               return __lock_page_async(page, wait);
-       return 0;
+       return folio_lock_killable(page_folio(page));
 }
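A sketch of the basic locking discipline with the folio variants; the caller is hypothetical, is assumed to already hold a reference on the folio, and folio_mark_dirty() is declared in <linux/mm.h>.

#include <linux/pagemap.h>

/* Hypothetical: dirty a folio while serialized against truncation. */
static int example_dirty_folio(struct folio *folio)
{
        int err;

        err = folio_lock_killable(folio);       /* -EINTR on fatal signal */
        if (err)
                return err;

        if (folio->mapping)                     /* still in the page cache? */
                folio_mark_dirty(folio);

        folio_unlock(folio);
        return 0;
}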
 
 /*
@@ -659,78 +720,108 @@ static inline int lock_page_async(struct page *page,
  * caller indicated that it can handle a retry.
  *
  * Return value and mmap_lock implications depend on flags; see
- * __lock_page_or_retry().
+ * __folio_lock_or_retry().
  */
-static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
+static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                     unsigned int flags)
 {
+       struct folio *folio;
        might_sleep();
-       return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
+
+       folio = page_folio(page);
+       return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags);
 }
 
 /*
- * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
+ * This is exported only for folio_wait_locked/folio_wait_writeback, etc.,
  * and should not be used directly.
  */
-extern void wait_on_page_bit(struct page *page, int bit_nr);
-extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
+void folio_wait_bit(struct folio *folio, int bit_nr);
+int folio_wait_bit_killable(struct folio *folio, int bit_nr);
 
 /* 
- * Wait for a page to be unlocked.
+ * Wait for a folio to be unlocked.
  *
- * This must be called with the caller "holding" the page,
- * ie with increased "page->count" so that the page won't
+ * This must be called with the caller "holding" the folio,
+ * i.e. with an elevated refcount so that the folio won't
  * go away during the wait.
  */
+static inline void folio_wait_locked(struct folio *folio)
+{
+       if (folio_test_locked(folio))
+               folio_wait_bit(folio, PG_locked);
+}
+
+static inline int folio_wait_locked_killable(struct folio *folio)
+{
+       if (!folio_test_locked(folio))
+               return 0;
+       return folio_wait_bit_killable(folio, PG_locked);
+}
+
 static inline void wait_on_page_locked(struct page *page)
 {
-       if (PageLocked(page))
-               wait_on_page_bit(compound_head(page), PG_locked);
+       folio_wait_locked(page_folio(page));
 }
 
 static inline int wait_on_page_locked_killable(struct page *page)
 {
-       if (!PageLocked(page))
-               return 0;
-       return wait_on_page_bit_killable(compound_head(page), PG_locked);
+       return folio_wait_locked_killable(page_folio(page));
 }
 
 int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
-int wait_on_page_writeback_killable(struct page *page);
-extern void end_page_writeback(struct page *page);
+void folio_wait_writeback(struct folio *folio);
+int folio_wait_writeback_killable(struct folio *folio);
+void end_page_writeback(struct page *page);
+void folio_end_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
+void folio_wait_stable(struct folio *folio);
+void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
+static inline void __set_page_dirty(struct page *page,
+               struct address_space *mapping, int warn)
+{
+       __folio_mark_dirty(page_folio(page), mapping, warn);
+}
+void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
+                         struct bdi_writeback *wb);
+static inline void account_page_cleaned(struct page *page,
+               struct address_space *mapping, struct bdi_writeback *wb)
+{
+       return folio_account_cleaned(page_folio(page), mapping, wb);
+}
+void __folio_cancel_dirty(struct folio *folio);
+static inline void folio_cancel_dirty(struct folio *folio)
+{
+       /* Avoid atomic ops, locking, etc. when not actually needed. */
+       if (folio_test_dirty(folio))
+               __folio_cancel_dirty(folio);
+}
+static inline void cancel_dirty_page(struct page *page)
+{
+       folio_cancel_dirty(page_folio(page));
+}
+bool folio_clear_dirty_for_io(struct folio *folio);
+bool clear_page_dirty_for_io(struct page *page);
+int __must_check folio_write_one(struct folio *folio);
+static inline int __must_check write_one_page(struct page *page)
+{
+       return folio_write_one(page_folio(page));
+}
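To show how the renamed writeback helpers chain together, a hedged sketch of starting I/O on a locked, dirty folio; the function is hypothetical and the actual submission step is left as a comment.

#include <linux/pagemap.h>

/* Hypothetical: begin writeback for a locked, dirty folio. */
static void example_start_folio_io(struct folio *folio)
{
        if (!folio_clear_dirty_for_io(folio)) {
                /* Someone else cleaned it; nothing to write. */
                folio_unlock(folio);
                return;
        }

        folio_start_writeback(folio);   /* sets the writeback flag */
        folio_unlock(folio);
        /*
         * ... submit the I/O here; the completion path then calls
         * folio_end_writeback(folio) once the write finishes.
         */
}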
 
-void __set_page_dirty(struct page *, struct address_space *, int warn);
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 
 void page_endio(struct page *page, bool is_write, int err);
 
-/**
- * set_page_private_2 - Set PG_private_2 on a page and take a ref
- * @page: The page.
- *
- * Set the PG_private_2 flag on a page and take the reference needed for the VM
- * to handle its lifetime correctly.  This sets the flag and takes the
- * reference unconditionally, so care must be taken not to set the flag again
- * if it's already set.
- */
-static inline void set_page_private_2(struct page *page)
-{
-       page = compound_head(page);
-       get_page(page);
-       SetPagePrivate2(page);
-}
-
-void end_page_private_2(struct page *page);
-void wait_on_page_private_2(struct page *page);
-int wait_on_page_private_2_killable(struct page *page);
+void folio_end_private_2(struct folio *folio);
+void folio_wait_private_2(struct folio *folio);
+int folio_wait_private_2_killable(struct folio *folio);
 
 /*
  * Add an arbitrary waiter to a page's wait queue
  */
-extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
+void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter);
 
 /*
  * Fault everything in given userspace address range in.
@@ -790,9 +881,11 @@ static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
 }
 
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-                               pgoff_t index, gfp_t gfp_mask);
+               pgoff_t index, gfp_t gfp);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-                               pgoff_t index, gfp_t gfp_mask);
+               pgoff_t index, gfp_t gfp);
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+               pgoff_t index, gfp_t gfp);
 extern void delete_from_page_cache(struct page *page);
 extern void __delete_from_page_cache(struct page *page, void *shadow);
 void replace_page_cache_page(struct page *old, struct page *new);
@@ -817,6 +910,10 @@ static inline int add_to_page_cache(struct page *page,
        return error;
 }
 
+/* Must be non-static for BPF error injection */
+int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
+               pgoff_t index, gfp_t gfp, void **shadowp);
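/*
 * A hedged editorial sketch (not part of the patch) of how a filesystem might
 * use the new filemap_add_folio() where it previously used
 * add_to_page_cache_lru(): allocate a folio, insert it at @index, and leave
 * it locked for the caller to fill. my_grab_folio() is a hypothetical helper.
 */
static struct folio *my_grab_folio(struct address_space *mapping, pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping);
	struct folio *folio = folio_alloc(gfp, 0);	/* order-0 folio */
	int err;

	if (!folio)
		return ERR_PTR(-ENOMEM);

	/* Adds the folio to the page cache and the LRU; it stays locked on success. */
	err = filemap_add_folio(mapping, folio, index, gfp);
	if (err) {
		folio_put(folio);
		return ERR_PTR(err);
	}
	return folio;
}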
+
 /**
  * struct readahead_control - Describes a readahead request.
  *
@@ -906,33 +1003,57 @@ void page_cache_async_readahead(struct address_space *mapping,
        page_cache_async_ra(&ractl, page, req_count);
 }
 
+static inline struct folio *__readahead_folio(struct readahead_control *ractl)
+{
+       struct folio *folio;
+
+       BUG_ON(ractl->_batch_count > ractl->_nr_pages);
+       ractl->_nr_pages -= ractl->_batch_count;
+       ractl->_index += ractl->_batch_count;
+
+       if (!ractl->_nr_pages) {
+               ractl->_batch_count = 0;
+               return NULL;
+       }
+
+       folio = xa_load(&ractl->mapping->i_pages, ractl->_index);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       ractl->_batch_count = folio_nr_pages(folio);
+
+       return folio;
+}
+
 /**
  * readahead_page - Get the next page to read.
- * @rac: The current readahead request.
+ * @ractl: The current readahead request.
  *
  * Context: The page is locked and has an elevated refcount.  The caller
  * should decrease the refcount once the page has been submitted for I/O
  * and unlock the page once all I/O to that page has completed.
  * Return: A pointer to the next page, or %NULL if we are done.
  */
-static inline struct page *readahead_page(struct readahead_control *rac)
+static inline struct page *readahead_page(struct readahead_control *ractl)
 {
-       struct page *page;
-
-       BUG_ON(rac->_batch_count > rac->_nr_pages);
-       rac->_nr_pages -= rac->_batch_count;
-       rac->_index += rac->_batch_count;
+       struct folio *folio = __readahead_folio(ractl);
 
-       if (!rac->_nr_pages) {
-               rac->_batch_count = 0;
-               return NULL;
-       }
+       return &folio->page;
+}
 
-       page = xa_load(&rac->mapping->i_pages, rac->_index);
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       rac->_batch_count = thp_nr_pages(page);
+/**
+ * readahead_folio - Get the next folio to read.
+ * @ractl: The current readahead request.
+ *
+ * Context: The folio is locked.  The caller should unlock the folio once
+ * all I/O to that folio has completed.
+ * Return: A pointer to the next folio, or %NULL if we are done.
+ */
+static inline struct folio *readahead_folio(struct readahead_control *ractl)
+{
+       struct folio *folio = __readahead_folio(ractl);
 
-       return page;
+       if (folio)
+               folio_put(folio);
+       return folio;
 }
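/*
 * An editorial usage sketch (not part of the patch): the shape of a
 * filesystem's ->readahead() method built on readahead_folio() rather than
 * readahead_page(). The names myfs_readahead() and myfs_read_folio_async()
 * are hypothetical.
 */
static void myfs_readahead(struct readahead_control *ractl)
{
	struct folio *folio;

	/*
	 * readahead_folio() advances the iterator by folio_nr_pages() and
	 * drops the reference for us; each folio comes back locked and must
	 * be unlocked once its I/O has completed.
	 */
	while ((folio = readahead_folio(ractl)) != NULL)
		myfs_read_folio_async(ractl->mapping->host, folio);
}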
 
 static inline unsigned int __readahead_batch(struct readahead_control *rac,
@@ -1040,6 +1161,34 @@ static inline unsigned long dir_pages(struct inode *inode)
 }
 
 /**
+ * folio_mkwrite_check_truncate - check if folio was truncated
+ * @folio: the folio to check
+ * @inode: the inode to check the folio against
+ *
+ * Return: the number of bytes in the folio up to EOF,
+ * or -EFAULT if the folio was truncated.
+ */
+static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
+                                             struct inode *inode)
+{
+       loff_t size = i_size_read(inode);
+       pgoff_t index = size >> PAGE_SHIFT;
+       size_t offset = offset_in_folio(folio, size);
+
+       if (!folio->mapping)
+               return -EFAULT;
+
+       /* folio is wholly inside EOF */
+       if (folio_next_index(folio) - 1 < index)
+               return folio_size(folio);
+       /* folio is wholly past EOF */
+       if (folio->index > index || !offset)
+               return -EFAULT;
+       /* folio is partially inside EOF */
+       return offset;
+}
+
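/*
 * A hedged editorial sketch (not part of the patch) of one way a
 * ->page_mkwrite() handler might use folio_mkwrite_check_truncate().
 * myfs_page_mkwrite() is hypothetical and error handling is abbreviated.
 */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct inode *inode = file_inode(vmf->vma->vm_file);
	ssize_t len;

	folio_lock(folio);
	len = folio_mkwrite_check_truncate(folio, inode);
	if (len < 0) {
		/* The folio was truncated away while the fault was taken. */
		folio_unlock(folio);
		return VM_FAULT_NOPAGE;
	}

	/* ... dirty the first 'len' bytes of the folio here ... */
	folio_mark_dirty(folio);
	folio_wait_stable(folio);

	/* Keep the folio locked; the fault code unlocks it. */
	return VM_FAULT_LOCKED;
}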
+/**
  * page_mkwrite_check_truncate - check if page was truncated
  * @page: the page to check
  * @inode: the inode to check the page against
@@ -1068,19 +1217,25 @@ static inline int page_mkwrite_check_truncate(struct page *page,
 }
 
 /**
- * i_blocks_per_page - How many blocks fit in this page.
+ * i_blocks_per_folio - How many blocks fit in this folio.
  * @inode: The inode which contains the blocks.
- * @page: The page (head page if the page is a THP).
+ * @folio: The folio.
  *
- * If the block size is larger than the size of this page, return zero.
+ * If the block size is larger than the size of this folio, return zero.
  *
- * Context: The caller should hold a refcount on the page to prevent it
+ * Context: The caller should hold a refcount on the folio to prevent it
  * from being split.
- * Return: The number of filesystem blocks covered by this page.
+ * Return: The number of filesystem blocks covered by this folio.
  */
 static inline
+unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio)
+{
+       return folio_size(folio) >> inode->i_blkbits;
+}
+
+static inline
 unsigned int i_blocks_per_page(struct inode *inode, struct page *page)
 {
-       return thp_size(page) >> inode->i_blkbits;
+       return i_blocks_per_folio(inode, page_folio(page));
 }
 #endif /* _LINUX_PAGEMAP_H */
index d255812..6f7949b 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_PART_STAT_H
 
 #include <linux/genhd.h>
+#include <asm/local.h>
 
 struct disk_stats {
        u64 nsecs[NR_STAT_GROUPS];
index ae16a98..b31d3f3 100644 (file)
@@ -267,6 +267,28 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 }
 
 /**
+ * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the
+ * caller is responsible for taking RCU.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref)
+{
+       unsigned long __percpu *percpu_count;
+       bool ret = false;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (likely(__ref_is_percpu(ref, &percpu_count))) {
+               this_cpu_inc(*percpu_count);
+               ret = true;
+       } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
+               ret = atomic_long_inc_not_zero(&ref->data->count);
+       }
+       return ret;
+}
+
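/*
 * An editorial sketch (not part of the patch) of the case this helper is
 * meant for: the caller already holds rcu_read_lock() for its own lookup, so
 * the internal RCU section taken by percpu_ref_tryget_live() would be
 * redundant. 'struct my_obj' and my_lookup_and_get() are hypothetical.
 */
struct my_obj {
	struct percpu_ref ref;
};

static struct my_obj *my_lookup_and_get(struct xarray *xa, unsigned long id)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = xa_load(xa, id);
	if (obj && !percpu_ref_tryget_live_rcu(&obj->ref))
		obj = NULL;	/* ref is dying or dead: treat as not found */
	rcu_read_unlock();

	return obj;
}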
+/**
  * percpu_ref_tryget_live - try to increment a live percpu refcount
  * @ref: percpu_ref to try-get
  *
@@ -283,20 +305,11 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
  */
 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
 {
-       unsigned long __percpu *percpu_count;
        bool ret = false;
 
        rcu_read_lock();
-
-       if (__ref_is_percpu(ref, &percpu_count)) {
-               this_cpu_inc(*percpu_count);
-               ret = true;
-       } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
-               ret = atomic_long_inc_not_zero(&ref->data->count);
-       }
-
+       ret = percpu_ref_tryget_live_rcu(ref);
        rcu_read_unlock();
-
        return ret;
 }
 
index c976cc6..e704b1a 100644 (file)
@@ -235,7 +235,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
  *
  * returns the number of cleaned PTEs.
  */
-int page_mkclean(struct page *);
+int folio_mkclean(struct folio *);
 
 /*
  * called in munlock()/munmap() path to check for other vmas holding
@@ -295,12 +295,14 @@ static inline void try_to_unmap(struct page *page, enum ttu_flags flags)
 {
 }
 
-static inline int page_mkclean(struct page *page)
+static inline int folio_mkclean(struct folio *folio)
 {
        return 0;
 }
-
-
 #endif /* CONFIG_MMU */
 
+static inline int page_mkclean(struct page *page)
+{
+       return folio_mkclean(page_folio(page));
+}
 #endif /* _LINUX_RMAP_H */
index 2713e68..4a6ff27 100644 (file)
@@ -427,6 +427,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
 int __sbitmap_queue_get(struct sbitmap_queue *sbq);
 
 /**
+ * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits
+ * @sbq: Bitmap queue to allocate from.
+ * @nr_tags: number of tags requested
+ * @offset: offset to add to returned bits
+ *
+ * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is
+ * a bit in the mask returned, and the caller must add @offset to the value to
+ * get the absolute tag value.
+ */
+unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
+                                       unsigned int *offset);
+
+/**
  * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
  * sbitmap_queue, limiting the depth used from each word, with preemption
  * already disabled.
@@ -515,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu);
 
+/**
+ * sbitmap_queue_clear_batch() - Free a batch of allocated bits from a
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @offset: offset for each tag in array
+ * @tags: array of tags
+ * @nr_tags: number of tags in array
+ */
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+                               int *tags, int nr_tags);
+
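/*
 * A hedged editorial sketch (not part of the patch) pairing the two new batch
 * helpers: each set bit in the returned mask, plus @offset, is one allocated
 * tag, and the whole batch can be freed in a single call. The my_*() helpers
 * are hypothetical.
 */
static int my_get_tags(struct sbitmap_queue *sbq, int *tags, int nr_wanted)
{
	unsigned int offset;
	unsigned long mask;
	int bit, nr = 0;

	mask = __sbitmap_queue_get_batch(sbq, nr_wanted, &offset);
	for_each_set_bit(bit, &mask, BITS_PER_LONG)
		tags[nr++] = offset + bit;
	return nr;		/* may be fewer than nr_wanted, possibly 0 */
}

static void my_put_tags(struct sbitmap_queue *sbq, int *tags, int nr)
{
	/* Tags above are absolute bit numbers, so no extra offset to subtract. */
	sbitmap_queue_clear_batch(sbq, 0, tags, nr);
}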
 static inline int sbq_index_inc(int index)
 {
        return (index + 1) & (SBQ_WAIT_QUEUES - 1);
index c1a927d..e0454e6 100644 (file)
@@ -1160,10 +1160,8 @@ struct task_struct {
        /* Stacked block device info: */
        struct bio_list                 *bio_list;
 
-#ifdef CONFIG_BLOCK
        /* Stack plugging: */
        struct blk_plug                 *plug;
-#endif
 
        /* VM state: */
        struct reclaim_state            *reclaim_state;
index 14ab0c0..1ce9a9e 100644 (file)
@@ -128,6 +128,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
                             struct sk_msg *msg, u32 bytes);
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
                   int len, int flags);
+bool sk_msg_is_readable(struct sock *sk);
 
 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
 {
index ba52f3a..cdf0957 100644 (file)
@@ -320,11 +320,17 @@ struct vma_swap_readahead {
 #endif
 };
 
+static inline swp_entry_t folio_swap_entry(struct folio *folio)
+{
+       swp_entry_t entry = { .val = page_private(&folio->page) };
+       return entry;
+}
+
 /* linux/mm/workingset.c */
 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
 void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
-void workingset_refault(struct page *page, void *shadow);
-void workingset_activation(struct page *page);
+void workingset_refault(struct folio *folio, void *shadow);
+void workingset_activation(struct folio *folio);
 
 /* Only track the nodes of mappings with shadow entries */
 void workingset_update_node(struct xa_node *node);
@@ -344,9 +350,11 @@ extern unsigned long nr_free_buffer_pages(void);
 /* linux/mm/swap.c */
 extern void lru_note_cost(struct lruvec *lruvec, bool file,
                          unsigned int nr_pages);
-extern void lru_note_cost_page(struct page *);
+extern void lru_note_cost_folio(struct folio *);
+extern void folio_add_lru(struct folio *);
 extern void lru_cache_add(struct page *);
-extern void mark_page_accessed(struct page *);
+void mark_page_accessed(struct page *);
+void folio_mark_accessed(struct folio *);
 
 extern atomic_t lru_disable_count;
 
@@ -365,7 +373,6 @@ extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
 extern void lru_add_drain_all(void);
-extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
 extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
index 96305a6..c635c2e 100644 (file)
@@ -3,7 +3,7 @@
 #define _LINUX_T10_PI_H
 
 #include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 /*
  * A T10 PI-capable target device can be formatted with different
index aa11fe3..12d8277 100644 (file)
@@ -269,6 +269,7 @@ enum tpm2_cc_attrs {
 #define TPM_VID_INTEL    0x8086
 #define TPM_VID_WINBOND  0x1050
 #define TPM_VID_STM      0x104A
+#define TPM_VID_ATML     0x1114
 
 enum tpm_chip_flags {
        TPM_CHIP_FLAG_TPM2              = BIT(1),
index d6a6cf5..bfe3886 100644 (file)
@@ -415,6 +415,78 @@ static inline void drain_zonestat(struct zone *zone,
                        struct per_cpu_zonestat *pzstats) { }
 #endif         /* CONFIG_SMP */
 
+static inline void __zone_stat_mod_folio(struct folio *folio,
+               enum zone_stat_item item, long nr)
+{
+       __mod_zone_page_state(folio_zone(folio), item, nr);
+}
+
+static inline void __zone_stat_add_folio(struct folio *folio,
+               enum zone_stat_item item)
+{
+       __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio));
+}
+
+static inline void __zone_stat_sub_folio(struct folio *folio,
+               enum zone_stat_item item)
+{
+       __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void zone_stat_mod_folio(struct folio *folio,
+               enum zone_stat_item item, long nr)
+{
+       mod_zone_page_state(folio_zone(folio), item, nr);
+}
+
+static inline void zone_stat_add_folio(struct folio *folio,
+               enum zone_stat_item item)
+{
+       mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio));
+}
+
+static inline void zone_stat_sub_folio(struct folio *folio,
+               enum zone_stat_item item)
+{
+       mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void __node_stat_mod_folio(struct folio *folio,
+               enum node_stat_item item, long nr)
+{
+       __mod_node_page_state(folio_pgdat(folio), item, nr);
+}
+
+static inline void __node_stat_add_folio(struct folio *folio,
+               enum node_stat_item item)
+{
+       __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio));
+}
+
+static inline void __node_stat_sub_folio(struct folio *folio,
+               enum node_stat_item item)
+{
+       __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void node_stat_mod_folio(struct folio *folio,
+               enum node_stat_item item, long nr)
+{
+       mod_node_page_state(folio_pgdat(folio), item, nr);
+}
+
+static inline void node_stat_add_folio(struct folio *folio,
+               enum node_stat_item item)
+{
+       mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio));
+}
+
+static inline void node_stat_sub_folio(struct folio *folio,
+               enum node_stat_item item)
+{
+       mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
+}
+
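/*
 * An editorial note in code form (not part of the patch): these wrappers
 * derive the counter delta from the folio itself, so multi-page folios are
 * accounted correctly without open-coding the page count. my_account_dirty()
 * is a hypothetical example.
 */
static inline void my_account_dirty(struct folio *folio)
{
	/* Same as mod_node_page_state(folio_pgdat(folio), NR_FILE_DIRTY, folio_nr_pages(folio)) */
	node_stat_add_folio(folio, NR_FILE_DIRTY);
}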
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
                                             int migratetype)
 {
@@ -525,12 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void inc_lruvec_state(struct lruvec *lruvec,
-                                   enum node_stat_item idx)
-{
-       mod_lruvec_state(lruvec, idx, 1);
-}
-
 static inline void __inc_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx)
 {
@@ -543,6 +609,24 @@ static inline void __dec_lruvec_page_state(struct page *page,
        __mod_lruvec_page_state(page, idx, -1);
 }
 
+static inline void __lruvec_stat_mod_folio(struct folio *folio,
+                                          enum node_stat_item idx, int val)
+{
+       __mod_lruvec_page_state(&folio->page, idx, val);
+}
+
+static inline void __lruvec_stat_add_folio(struct folio *folio,
+                                          enum node_stat_item idx)
+{
+       __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
+}
+
+static inline void __lruvec_stat_sub_folio(struct folio *folio,
+                                          enum node_stat_item idx)
+{
+       __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
+}
+
 static inline void inc_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx)
 {
@@ -555,4 +639,21 @@ static inline void dec_lruvec_page_state(struct page *page,
        mod_lruvec_page_state(page, idx, -1);
 }
 
+static inline void lruvec_stat_mod_folio(struct folio *folio,
+                                        enum node_stat_item idx, int val)
+{
+       mod_lruvec_page_state(&folio->page, idx, val);
+}
+
+static inline void lruvec_stat_add_folio(struct folio *folio,
+                                        enum node_stat_item idx)
+{
+       lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
+}
+
+static inline void lruvec_stat_sub_folio(struct folio *folio,
+                                        enum node_stat_item idx)
+{
+       lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
+}
 #endif /* _LINUX_VMSTAT_H */
index d1f65ad..3bfd487 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/flex_proportions.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/blk_types.h>
-#include <linux/blk-cgroup.h>
 
 struct bio;
 
@@ -109,15 +108,12 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
        return flags;
 }
 
-static inline struct cgroup_subsys_state *
-wbc_blkcg_css(struct writeback_control *wbc)
-{
 #ifdef CONFIG_CGROUP_WRITEBACK
-       if (wbc->wb)
-               return wbc->wb->blkcg_css;
-#endif
-       return blkcg_root_css;
-}
+#define wbc_blkcg_css(wbc) \
+       ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css)
+#else
+#define wbc_blkcg_css(wbc)             (blkcg_root_css)
+#endif /* CONFIG_CGROUP_WRITEBACK */
 
 /*
  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
@@ -393,7 +389,14 @@ void writeback_set_ratelimit(void);
 void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end);
 
-void account_page_redirty(struct page *page);
+bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio);
+void folio_account_redirty(struct folio *folio);
+static inline void account_page_redirty(struct page *page)
+{
+       folio_account_redirty(page_folio(page));
+}
+bool folio_redirty_for_writepage(struct writeback_control *, struct folio *);
+bool redirty_page_for_writepage(struct writeback_control *, struct page *);
 
 void sb_mark_inode_writeback(struct inode *inode);
 void sb_clear_inode_writeback(struct inode *inode);
index 9884c84..7285ca5 100644 (file)
@@ -234,6 +234,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s);
 XZ_EXTERN void xz_dec_end(struct xz_dec *s);
 
 /*
+ * Decompressor for MicroLZMA, an LZMA variant with a very minimal header.
+ * See xz_dec_microlzma_alloc() below for details.
+ *
+ * These functions aren't used or available in preboot code and thus aren't
+ * marked with XZ_EXTERN. This avoids warnings about static functions that
+ * are never defined.
+ */
+/**
+ * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state
+ */
+struct xz_dec_microlzma;
+
+/**
+ * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder
+ * @mode        XZ_SINGLE or XZ_PREALLOC
+ * @dict_size   LZMA dictionary size. This must be at least 4 KiB and
+ *              at most 3 GiB.
+ *
+ * In contrast to xz_dec_init(), this function only allocates the memory
+ * and remembers the dictionary size. xz_dec_microlzma_reset() must be used
+ * before calling xz_dec_microlzma_run().
+ *
+ * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE.
+ * With XZ_PREALLOC, a dictionary buffer of dict_size bytes is also allocated.
+ *
+ * On success, xz_dec_microlzma_alloc() returns a pointer to
+ * struct xz_dec_microlzma. If memory allocation fails or
+ * dict_size is invalid, NULL is returned.
+ *
+ * The compressed format supported by this decoder is a raw LZMA stream
+ * whose first byte (always 0x00) has been replaced with bitwise-negation
+ * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
+ * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
+ * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
+ * marker must not be used. The unused values are reserved for future use.
+ * This MicroLZMA header format was created for use in EROFS but may be used
+ * by others too.
+ */
+extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+                                                      uint32_t dict_size);
+
+/**
+ * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state
+ * @s           Decoder state allocated using xz_dec_microlzma_alloc()
+ * @comp_size   Compressed size of the input stream
+ * @uncomp_size Uncompressed size of the input stream. A value smaller
+ *              than the real uncompressed size of the input stream can
+ *              be specified if uncomp_size_is_exact is set to false.
+ *              uncomp_size can never be set to a value larger than the
+ *              expected real uncompressed size because it would eventually
+ *              result in XZ_DATA_ERROR.
+ * @uncomp_size_is_exact  This is an int instead of bool to avoid
+ *              requiring stdbool.h. This should normally be set to true.
+ *              When this is set to false, error detection is weaker.
+ */
+extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
+                                  uint32_t comp_size, uint32_t uncomp_size,
+                                  int uncomp_size_is_exact);
+
+/**
+ * xz_dec_microlzma_run() - Run the MicroLZMA decoder
+ * @s           Decoder state initialized using xz_dec_microlzma_reset()
+ * @b:          Input and output buffers
+ *
+ * This works similarly to xz_dec_run() with a few important differences.
+ * Only the differences are documented here.
+ *
+ * The only possible return values are XZ_OK, XZ_STREAM_END, and
+ * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress
+ * is possible due to lack of input data or output space, this function will
+ * keep returning XZ_OK. Thus, the calling code must be written so that it
+ * will eventually provide input and output space matching (or exceeding)
+ * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset().
+ * If the caller cannot do this (for example, if the input file is truncated
+ * or otherwise corrupt), the caller must detect this error by itself to
+ * avoid an infinite loop.
+ *
+ * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned.
+ * This can happen also when incorrect dictionary, uncompressed, or
+ * compressed sizes have been specified.
+ *
+ * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over
+ * uncompressed data. This way the caller doesn't need to provide a temporary
+ * output buffer for the bytes that will be ignored.
+ *
+ * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK
+ * is also possible and thus XZ_SINGLE is actually a limited multi-call mode.
+ * After XZ_OK the bytes decoded so far may be read from the output buffer.
+ * It is possible to continue decoding but the variables b->out and b->out_pos
+ * MUST NOT be changed by the caller. Increasing the value of b->out_size is
+ * allowed to make more output space available; one doesn't need to provide
+ * space for the whole uncompressed data on the first call. The input buffer
+ * may be changed normally like with XZ_PREALLOC. This way input data can be
+ * provided from non-contiguous memory.
+ */
+extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s,
+                                       struct xz_buf *b);
+
+/**
+ * xz_dec_microlzma_end() - Free the memory allocated for the decoder state
+ * @s:          Decoder state allocated using xz_dec_microlzma_alloc().
+ *              If s is NULL, this function does nothing.
+ */
+extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
+
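/*
 * A hedged editorial sketch (not part of the patch) of the call sequence the
 * kernel-doc above describes: allocate once, reset per stream, run until
 * XZ_STREAM_END, then free. The function name, buffer handling, and the
 * 64 KiB dictionary size (which must match what the encoder used) are
 * illustrative only.
 */
static int my_microlzma_decompress(const u8 *in, size_t in_len,
				   u8 *out, size_t out_len)
{
	struct xz_dec_microlzma *s;
	struct xz_buf b = {
		.in = in, .in_size = in_len, .in_pos = 0,
		.out = out, .out_size = out_len, .out_pos = 0,
	};
	enum xz_ret ret;

	s = xz_dec_microlzma_alloc(XZ_SINGLE, 64 * 1024);
	if (!s)
		return -ENOMEM;

	/* Both sizes are known exactly here, so uncomp_size_is_exact is true. */
	xz_dec_microlzma_reset(s, in_len, out_len, true);
	ret = xz_dec_microlzma_run(s, &b);
	xz_dec_microlzma_end(s);

	return ret == XZ_STREAM_END ? 0 : -EINVAL;
}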
+/*
  * Standalone build (userspace build or in-kernel build for boot time use)
  * needs a CRC32 implementation. For normal in-kernel use, kernel's own
  * CRC32 module is used instead, and users of this module don't need to
index 62dd842..27336fc 100644 (file)
@@ -5376,7 +5376,6 @@ static inline void wiphy_unlock(struct wiphy *wiphy)
  *     netdev and may otherwise be used by driver read-only, will be updated
  *     by cfg80211 on change_interface
  * @mgmt_registrations: list of registrations for management frames
- * @mgmt_registrations_lock: lock for the list
  * @mgmt_registrations_need_update: mgmt registrations were updated,
  *     need to propagate the update to the driver
  * @mtx: mutex used to lock data in this struct, may be used by drivers
@@ -5423,7 +5422,6 @@ struct wireless_dev {
        u32 identifier;
 
        struct list_head mgmt_registrations;
-       spinlock_t mgmt_registrations_lock;
        u8 mgmt_registrations_need_update:1;
 
        struct mutex mtx;
index 6026bbe..3214848 100644 (file)
@@ -69,6 +69,10 @@ struct mptcp_out_options {
                struct {
                        u64 sndr_key;
                        u64 rcvr_key;
+                       u64 data_seq;
+                       u32 subflow_seq;
+                       u16 data_len;
+                       __sum16 csum;
                };
                struct {
                        struct mptcp_addr_info addr;
index ea6fbc8..463f390 100644 (file)
@@ -1208,7 +1208,7 @@ struct proto {
 #endif
 
        bool                    (*stream_memory_free)(const struct sock *sk, int wake);
-       bool                    (*stream_memory_read)(const struct sock *sk);
+       bool                    (*sock_is_readable)(struct sock *sk);
        /* Memory pressure */
        void                    (*enter_memory_pressure)(struct sock *sk);
        void                    (*leave_memory_pressure)(struct sock *sk);
@@ -2820,4 +2820,10 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs);
 
 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
 
+static inline bool sk_is_readable(struct sock *sk)
+{
+       if (sk->sk_prot->sock_is_readable)
+               return sk->sk_prot->sock_is_readable(sk);
+       return false;
+}
 #endif /* _SOCK_H */
index be4b3e1..1fffb20 100644 (file)
@@ -358,6 +358,7 @@ int tls_sk_query(struct sock *sk, int optname, char __user *optval,
                int __user *optlen);
 int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
                  unsigned int optlen);
+void tls_err_abort(struct sock *sk, int err);
 
 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
 void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
@@ -375,7 +376,7 @@ void tls_sw_release_resources_rx(struct sock *sk);
 void tls_sw_free_ctx_rx(struct tls_context *tls_ctx);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                   int nonblock, int flags, int *addr_len);
-bool tls_sw_stream_read(const struct sock *sk);
+bool tls_sw_sock_is_readable(struct sock *sk);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
                           struct pipe_inode_info *pipe,
                           size_t len, unsigned int flags);
@@ -466,12 +467,6 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk)
 #endif
 }
 
-static inline void tls_err_abort(struct sock *sk, int err)
-{
-       sk->sk_err = err;
-       sk_error_report(sk);
-}
-
 static inline bool tls_bigint_increment(unsigned char *seq, int len)
 {
        int i;
@@ -512,7 +507,7 @@ static inline void tls_advance_record_sn(struct sock *sk,
                                         struct cipher_context *ctx)
 {
        if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size))
-               tls_err_abort(sk, EBADMSG);
+               tls_err_abort(sk, -EBADMSG);
 
        if (prot->version != TLS_1_3_VERSION &&
            prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305)
index 360df45..909ecf4 100644 (file)
@@ -494,8 +494,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
         * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial
         * packets in udp_gro_complete_segment. As does UDP GSO, verified by
         * udp_send_skb. But when those packets are looped in dev_loopback_xmit
-        * their ip_summed is set to CHECKSUM_UNNECESSARY. Reset in this
-        * specific case, where PARTIAL is both correct and required.
+        * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY.
+        * Reset in this specific case, where PARTIAL is both correct and
+        * required.
         */
        if (skb->pkt_type == PACKET_LOOPBACK)
                skb->ip_summed = CHECKSUM_PARTIAL;
index eaf04c9..3107806 100644 (file)
@@ -396,4 +396,7 @@ static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd)
 extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc,
                             u8 key, u8 asc, u8 ascq);
 
+struct request *scsi_alloc_request(struct request_queue *q,
+               unsigned int op, blk_mq_req_flags_t flags);
+
 #endif /* _SCSI_SCSI_CMND_H */
index b97e142..430b73b 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <scsi/scsi.h>
 #include <linux/atomic.h>
 #include <linux/sbitmap.h>
index cc5ab96..a95daa4 100644 (file)
@@ -114,7 +114,7 @@ TRACE_EVENT(block_rq_requeue,
  */
 TRACE_EVENT(block_rq_complete,
 
-       TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),
+       TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),
 
        TP_ARGS(rq, error, nr_bytes),
 
@@ -122,7 +122,7 @@ TRACE_EVENT(block_rq_complete,
                __field(  dev_t,        dev                     )
                __field(  sector_t,     sector                  )
                __field(  unsigned int, nr_sector               )
-               __field(  int,          error                   )
+               __field(  int   ,       error                   )
                __array(  char,         rwbs,   RWBS_LEN        )
                __dynamic_array( char,  cmd,    1               )
        ),
@@ -131,7 +131,7 @@ TRACE_EVENT(block_rq_complete,
                __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_pos(rq);
                __entry->nr_sector = nr_bytes >> 9;
-               __entry->error     = error;
+               __entry->error     = blk_status_to_errno(error);
 
                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
                __get_str(cmd)[0] = '\0';
index db4f2ce..16ae7b6 100644 (file)
@@ -24,7 +24,7 @@ struct erofs_map_blocks;
 #define show_mflags(flags) __print_flags(flags, "",    \
        { EROFS_MAP_MAPPED,     "M" },                  \
        { EROFS_MAP_META,       "I" },                  \
-       { EROFS_MAP_ZIPPED,     "Z" })
+       { EROFS_MAP_ENCODED,    "E" })
 
 TRACE_EVENT(erofs_lookup,
 
index 0dd30de..7346f01 100644 (file)
@@ -6,6 +6,7 @@
 #define _TRACE_IO_URING_H
 
 #include <linux/tracepoint.h>
+#include <uapi/linux/io_uring.h>
 
 struct io_wq_work;
 
@@ -497,6 +498,66 @@ TRACE_EVENT(io_uring_task_run,
                  (unsigned long long) __entry->user_data)
 );
 
+/*
+ * io_uring_req_failed - called when an sqe is errored during submission
+ *
+ * @sqe:               pointer to the io_uring_sqe that failed
+ * @error:             error it failed with
+ *
+ * Allows easier diagnosing of malformed requests in production systems.
+ */
+TRACE_EVENT(io_uring_req_failed,
+
+       TP_PROTO(const struct io_uring_sqe *sqe, int error),
+
+       TP_ARGS(sqe, error),
+
+       TP_STRUCT__entry (
+               __field(  u8,   opcode )
+               __field(  u8,   flags )
+               __field(  u8,   ioprio )
+               __field( u64,   off )
+               __field( u64,   addr )
+               __field( u32,   len )
+               __field( u32,   op_flags )
+               __field( u64,   user_data )
+               __field( u16,   buf_index )
+               __field( u16,   personality )
+               __field( u32,   file_index )
+               __field( u64,   pad1 )
+               __field( u64,   pad2 )
+               __field( int,   error )
+       ),
+
+       TP_fast_assign(
+               __entry->opcode         = sqe->opcode;
+               __entry->flags          = sqe->flags;
+               __entry->ioprio         = sqe->ioprio;
+               __entry->off            = sqe->off;
+               __entry->addr           = sqe->addr;
+               __entry->len            = sqe->len;
+               __entry->op_flags       = sqe->rw_flags;
+               __entry->user_data      = sqe->user_data;
+               __entry->buf_index      = sqe->buf_index;
+               __entry->personality    = sqe->personality;
+               __entry->file_index     = sqe->file_index;
+               __entry->pad1           = sqe->__pad2[0];
+               __entry->pad2           = sqe->__pad2[1];
+               __entry->error          = error;
+       ),
+
+       TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, "
+                 "len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, "
+                 "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
+                 __entry->opcode, __entry->flags, __entry->ioprio,
+                 (unsigned long long)__entry->off,
+                 (unsigned long long) __entry->addr, __entry->len,
+                 __entry->op_flags, (unsigned long long) __entry->user_data,
+                 __entry->buf_index, __entry->personality, __entry->file_index,
+                 (unsigned long long) __entry->pad1,
+                 (unsigned long long) __entry->pad2, __entry->error)
+);
+
 #endif /* _TRACE_IO_URING_H */
 
 /* This part must be outside protection */
index 1d28431..171524d 100644 (file)
 #define PAGEMAP_MAPPEDDISK     0x0020u
 #define PAGEMAP_BUFFERS                0x0040u
 
-#define trace_pagemap_flags(page) ( \
-       (PageAnon(page)         ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
-       (page_mapped(page)      ? PAGEMAP_MAPPED     : 0) | \
-       (PageSwapCache(page)    ? PAGEMAP_SWAPCACHE  : 0) | \
-       (PageSwapBacked(page)   ? PAGEMAP_SWAPBACKED : 0) | \
-       (PageMappedToDisk(page) ? PAGEMAP_MAPPEDDISK : 0) | \
-       (page_has_private(page) ? PAGEMAP_BUFFERS    : 0) \
+#define trace_pagemap_flags(folio) ( \
+       (folio_test_anon(folio)         ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
+       (folio_mapped(folio)            ? PAGEMAP_MAPPED     : 0) | \
+       (folio_test_swapcache(folio)    ? PAGEMAP_SWAPCACHE  : 0) | \
+       (folio_test_swapbacked(folio)   ? PAGEMAP_SWAPBACKED : 0) | \
+       (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \
+       (folio_test_private(folio)      ? PAGEMAP_BUFFERS    : 0) \
        )
 
 TRACE_EVENT(mm_lru_insertion,
 
-       TP_PROTO(struct page *page),
+       TP_PROTO(struct folio *folio),
 
-       TP_ARGS(page),
+       TP_ARGS(folio),
 
        TP_STRUCT__entry(
-               __field(struct page *,  page    )
+               __field(struct folio *, folio   )
                __field(unsigned long,  pfn     )
                __field(enum lru_list,  lru     )
                __field(unsigned long,  flags   )
        ),
 
        TP_fast_assign(
-               __entry->page   = page;
-               __entry->pfn    = page_to_pfn(page);
-               __entry->lru    = page_lru(page);
-               __entry->flags  = trace_pagemap_flags(page);
+               __entry->folio  = folio;
+               __entry->pfn    = folio_pfn(folio);
+               __entry->lru    = folio_lru_list(folio);
+               __entry->flags  = trace_pagemap_flags(folio);
        ),
 
        /* Flag format is based on page-types.c formatting for pagemap */
-       TP_printk("page=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s",
-                       __entry->page,
+       TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s",
+                       __entry->folio,
                        __entry->pfn,
                        __entry->lru,
                        __entry->flags & PAGEMAP_MAPPED         ? "M" : " ",
@@ -60,23 +60,21 @@ TRACE_EVENT(mm_lru_insertion,
 
 TRACE_EVENT(mm_lru_activate,
 
-       TP_PROTO(struct page *page),
+       TP_PROTO(struct folio *folio),
 
-       TP_ARGS(page),
+       TP_ARGS(folio),
 
        TP_STRUCT__entry(
-               __field(struct page *,  page    )
+               __field(struct folio *, folio   )
                __field(unsigned long,  pfn     )
        ),
 
        TP_fast_assign(
-               __entry->page   = page;
-               __entry->pfn    = page_to_pfn(page);
+               __entry->folio  = folio;
+               __entry->pfn    = folio_pfn(folio);
        ),
 
-       /* Flag format is based on page-types.c formatting for pagemap */
-       TP_printk("page=%p pfn=0x%lx", __entry->page, __entry->pfn)
-
+       TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
 );
 
 #endif /* _TRACE_PAGEMAP_H */
index 840d1ba..7dccb66 100644 (file)
@@ -52,11 +52,11 @@ WB_WORK_REASON
 
 struct wb_writeback_work;
 
-DECLARE_EVENT_CLASS(writeback_page_template,
+DECLARE_EVENT_CLASS(writeback_folio_template,
 
-       TP_PROTO(struct page *page, struct address_space *mapping),
+       TP_PROTO(struct folio *folio, struct address_space *mapping),
 
-       TP_ARGS(page, mapping),
+       TP_ARGS(folio, mapping),
 
        TP_STRUCT__entry (
                __array(char, name, 32)
@@ -69,7 +69,7 @@ DECLARE_EVENT_CLASS(writeback_page_template,
                            bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
                                         NULL), 32);
                __entry->ino = mapping ? mapping->host->i_ino : 0;
-               __entry->index = page->index;
+               __entry->index = folio->index;
        ),
 
        TP_printk("bdi %s: ino=%lu index=%lu",
@@ -79,18 +79,18 @@ DECLARE_EVENT_CLASS(writeback_page_template,
        )
 );
 
-DEFINE_EVENT(writeback_page_template, writeback_dirty_page,
+DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio,
 
-       TP_PROTO(struct page *page, struct address_space *mapping),
+       TP_PROTO(struct folio *folio, struct address_space *mapping),
 
-       TP_ARGS(page, mapping)
+       TP_ARGS(folio, mapping)
 );
 
-DEFINE_EVENT(writeback_page_template, wait_on_page_writeback,
+DEFINE_EVENT(writeback_folio_template, folio_wait_writeback,
 
-       TP_PROTO(struct page *page, struct address_space *mapping),
+       TP_PROTO(struct folio *folio, struct address_space *mapping),
 
-       TP_ARGS(page, mapping)
+       TP_ARGS(folio, mapping)
 );
 
 DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
@@ -236,9 +236,9 @@ TRACE_EVENT(inode_switch_wbs,
 
 TRACE_EVENT(track_foreign_dirty,
 
-       TP_PROTO(struct page *page, struct bdi_writeback *wb),
+       TP_PROTO(struct folio *folio, struct bdi_writeback *wb),
 
-       TP_ARGS(page, wb),
+       TP_ARGS(folio, wb),
 
        TP_STRUCT__entry(
                __array(char,           name, 32)
@@ -250,7 +250,7 @@ TRACE_EVENT(track_foreign_dirty,
        ),
 
        TP_fast_assign(
-               struct address_space *mapping = page_mapping(page);
+               struct address_space *mapping = folio_mapping(folio);
                struct inode *inode = mapping ? mapping->host : NULL;
 
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
@@ -258,7 +258,7 @@ TRACE_EVENT(track_foreign_dirty,
                __entry->ino            = inode ? inode->i_ino : 0;
                __entry->memcg_id       = wb->memcg_css->id;
                __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
-               __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup);
+               __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
        ),
 
        TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
index 9dc0bf0..ecd0f5b 100644 (file)
@@ -181,6 +181,10 @@ struct f_owner_ex {
                                   blocking */
 #define LOCK_UN                8       /* remove lock */
 
+/*
+ * LOCK_MAND support has been removed from the kernel. We leave the symbols
+ * here to not break legacy builds, but these should not be used in new code.
+ */
 #define LOCK_MAND      32      /* This is a mandatory flock ... */
 #define LOCK_READ      64      /* which allows concurrent read operations */
 #define LOCK_WRITE     128     /* which allows concurrent write operations */
index 6c34f6e..804ff8d 100644 (file)
 #define CDROM_NEXT_WRITABLE    0x5394  /* get next writable block */
 #define CDROM_LAST_WRITTEN     0x5395  /* get last block written on disc */
 
+#define CDROM_TIMED_MEDIA_CHANGE   0x5396  /* get the timestamp of the last media change */
+
 /*******************************************************
  * CDROM IOCTL structures
  *******************************************************/
@@ -295,6 +297,23 @@ struct cdrom_generic_command
        };
 };
 
+/* This struct is used by CDROM_TIMED_MEDIA_CHANGE */
+struct cdrom_timed_media_change_info {
+       __s64   last_media_change;      /* Timestamp of the last detected media
+                                        * change in ms. May be set by caller,
+                                        * updated upon successful return of
+                                        * ioctl.
+                                        */
+       __u64   media_flags;            /* Flags returned by ioctl to indicate
+                                        * media status.
+                                        */
+};
+#define MEDIA_CHANGED_FLAG     0x1     /* Last detected media change was more
+                                        * recent than last_media_change set by
+                                        * caller.
+                                        */
+/* other bits of media_flags available for future use */
+
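/*
 * A hedged editorial sketch (not part of the patch) of how userspace might
 * call the new ioctl. The device path and error handling are illustrative.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cdrom.h>

int main(void)
{
	struct cdrom_timed_media_change_info info = {
		.last_media_change = 0,	/* compare against "never" */
	};
	int fd = open("/dev/sr0", O_RDONLY | O_NONBLOCK);

	if (fd < 0 || ioctl(fd, CDROM_TIMED_MEDIA_CHANGE, &info) < 0) {
		perror("CDROM_TIMED_MEDIA_CHANGE");
		return 1;
	}
	printf("last media change: %lld ms; changed since: %s\n",
	       (long long)info.last_media_change,
	       (info.media_flags & MEDIA_CHANGED_FLAG) ? "yes" : "no");
	close(fd);
	return 0;
}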
 /*
  * A CD-ROM physical sector size is 2048, 2052, 2056, 2324, 2332, 2336, 
  * 2340, or 2352 bytes long.  
index b270a07..c45b5e9 100644 (file)
@@ -158,6 +158,7 @@ enum {
 #define IORING_TIMEOUT_BOOTTIME                (1U << 2)
 #define IORING_TIMEOUT_REALTIME                (1U << 3)
 #define IORING_LINK_TIMEOUT_UPDATE     (1U << 4)
+#define IORING_TIMEOUT_ETIME_SUCCESS   (1U << 5)
 #define IORING_TIMEOUT_CLOCK_MASK      (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
 #define IORING_TIMEOUT_UPDATE_MASK     (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
 /*
index 3c4054a..4162d7f 100644 (file)
@@ -83,7 +83,6 @@
 #include <linux/ptrace.h>
 #include <linux/pti.h>
 #include <linux/blkdev.h>
-#include <linux/elevator.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
index 23a7ab8..3df53cf 100644 (file)
@@ -60,7 +60,6 @@
 #include <linux/sched/cputime.h>
 
 #include <asm/div64.h>
-#include <linux/blkdev.h> /* sector_div */
 #include <linux/pid_namespace.h>
 #include <linux/fs_pin.h>
 
index cebd4fb..447def5 100644 (file)
@@ -1072,6 +1072,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
        INIT_WORK(&aux->work, prog_array_map_clear_deferred);
        INIT_LIST_HEAD(&aux->poke_progs);
        mutex_init(&aux->poke_mutex);
+       spin_lock_init(&aux->owner.lock);
 
        map = array_map_alloc(attr);
        if (IS_ERR(map)) {
index d6b7dfd..6e3ae90 100644 (file)
@@ -524,6 +524,7 @@ int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
 int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
 int bpf_jit_harden   __read_mostly;
 long bpf_jit_limit   __read_mostly;
+long bpf_jit_limit_max __read_mostly;
 
 static void
 bpf_prog_ksym_set_addr(struct bpf_prog *prog)
@@ -817,7 +818,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
 static int __init bpf_jit_charge_init(void)
 {
        /* Only used as heuristic here to derive limit. */
-       bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+       bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+       bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
                                            PAGE_SIZE), LONG_MAX);
        return 0;
 }
@@ -1821,20 +1823,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
 bool bpf_prog_array_compatible(struct bpf_array *array,
                               const struct bpf_prog *fp)
 {
+       bool ret;
+
        if (fp->kprobe_override)
                return false;
 
-       if (!array->aux->type) {
+       spin_lock(&array->aux->owner.lock);
+
+       if (!array->aux->owner.type) {
                /* There's no owner yet where we could check for
                 * compatibility.
                 */
-               array->aux->type  = fp->type;
-               array->aux->jited = fp->jited;
-               return true;
+               array->aux->owner.type  = fp->type;
+               array->aux->owner.jited = fp->jited;
+               ret = true;
+       } else {
+               ret = array->aux->owner.type  == fp->type &&
+                     array->aux->owner.jited == fp->jited;
        }
-
-       return array->aux->type  == fp->type &&
-              array->aux->jited == fp->jited;
+       spin_unlock(&array->aux->owner.lock);
+       return ret;
 }
 
 static int bpf_check_tail_call(const struct bpf_prog *fp)
index 4e50c0b..1cad697 100644 (file)
@@ -543,8 +543,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 
        if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
                array = container_of(map, struct bpf_array, map);
-               type  = array->aux->type;
-               jited = array->aux->jited;
+               spin_lock(&array->aux->owner.lock);
+               type  = array->aux->owner.type;
+               jited = array->aux->owner.jited;
+               spin_unlock(&array->aux->owner.lock);
        }
 
        seq_printf(m,
@@ -1337,12 +1339,11 @@ int generic_map_update_batch(struct bpf_map *map,
        void __user *values = u64_to_user_ptr(attr->batch.values);
        void __user *keys = u64_to_user_ptr(attr->batch.keys);
        u32 value_size, cp, max_count;
-       int ufd = attr->map_fd;
+       int ufd = attr->batch.map_fd;
        void *key, *value;
        struct fd f;
        int err = 0;
 
-       f = fdget(ufd);
        if (attr->batch.elem_flags & ~BPF_F_LOCK)
                return -EINVAL;
 
@@ -1367,6 +1368,7 @@ int generic_map_update_batch(struct bpf_map *map,
                return -ENOMEM;
        }
 
+       f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
        for (cp = 0; cp < max_count; cp++) {
                err = -EFAULT;
                if (copy_from_user(key, keys + cp * map->key_size,
@@ -1386,6 +1388,7 @@ int generic_map_update_batch(struct bpf_map *map,
 
        kvfree(value);
        kvfree(key);
+       fdput(f);
        return err;
 }
 
index e76b559..de00655 100644 (file)
@@ -13319,7 +13319,7 @@ BTF_SET_START(btf_non_sleepable_error_inject)
 /* Three functions below can be called from sleepable and non-sleepable context.
  * Assume non-sleepable from bpf safety point of view.
  */
-BTF_ID(func, __add_to_page_cache_locked)
+BTF_ID(func, __filemap_add_folio)
 BTF_ID(func, should_fail_alloc_page)
 BTF_ID(func, should_failslab)
 BTF_SET_END(btf_non_sleepable_error_inject)
index 570b0c9..ea08f01 100644 (file)
@@ -2187,8 +2187,10 @@ static void cgroup_kill_sb(struct super_block *sb)
         * And don't kill the default root.
         */
        if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
-           !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+           !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+               cgroup_bpf_offline(&root->cgrp);
                percpu_ref_kill(&root->cgrp.self.refcnt);
+       }
        cgroup_put(&root->cgrp);
        kernfs_kill_sb(sb);
 }
index af24dc3..6357c35 100644 (file)
@@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
                                addr + PAGE_SIZE);
 
        if (new_page) {
-               err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
+               err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
+                                       GFP_KERNEL);
                if (err)
                        return err;
        }
index 91a43e5..a53863d 100644 (file)
@@ -48,7 +48,6 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
-#include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
index 38681ad..67679e3 100644 (file)
@@ -76,7 +76,6 @@
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
 #include <linux/tty.h>
-#include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
 #include <linux/perf_event.h>
index f21714e..59bea52 100644 (file)
@@ -13,7 +13,7 @@
 #include "sched.h"
 
 #include <linux/nospec.h>
-
+#include <linux/blkdev.h>
 #include <linux/kcov.h>
 #include <linux/scs.h>
 
@@ -6343,7 +6343,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
         * make sure to submit it to avoid deadlocks.
         */
        if (blk_needs_flush_plug(tsk))
-               blk_schedule_flush_plug(tsk);
+               blk_flush_plug(tsk->plug, true);
 }
 
 static void sched_update_worker(struct task_struct *tsk)
@@ -8354,7 +8354,8 @@ int io_schedule_prepare(void)
        int old_iowait = current->in_iowait;
 
        current->in_iowait = 1;
-       blk_schedule_flush_plug(current);
+       if (current->plug)
+               blk_flush_plug(current->plug, true);
 
        return old_iowait;
 }
index 3d3e579..66128df 100644 (file)
@@ -37,7 +37,6 @@
 
 #include <linux/binfmts.h>
 #include <linux/bitops.h>
-#include <linux/blkdev.h>
 #include <linux/compat.h>
 #include <linux/context_tracking.h>
 #include <linux/cpufreq.h>
index fa91f39..1183c88 100644 (file)
@@ -816,7 +816,7 @@ blk_trace_request_get_cgid(struct request *rq)
  *     Records an action against a request. Will log the bio offset + size.
  *
  **/
-static void blk_add_trace_rq(struct request *rq, int error,
+static void blk_add_trace_rq(struct request *rq, blk_status_t error,
                             unsigned int nr_bytes, u32 what, u64 cgid)
 {
        struct blk_trace *bt;
@@ -834,7 +834,8 @@ static void blk_add_trace_rq(struct request *rq, int error,
                what |= BLK_TC_ACT(BLK_TC_FS);
 
        __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
-                       rq->cmd_flags, what, error, 0, NULL, cgid);
+                       rq->cmd_flags, what, blk_status_to_errno(error), 0,
+                       NULL, cgid);
        rcu_read_unlock();
 }
 
@@ -863,7 +864,7 @@ static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
 }
 
 static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
-                       int error, unsigned int nr_bytes)
+                       blk_status_t error, unsigned int nr_bytes)
 {
        blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
                         blk_trace_request_get_cgid(rq));
index 635fbdc..feebf57 100644 (file)
@@ -2208,7 +2208,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 }
 
 /**
- * ftrace_update_record, set a record that now is tracing or not
+ * ftrace_update_record - set a record that now is tracing or not
  * @rec: the record to update
  * @enable: set to true if the record is tracing, false to force disable
  *
@@ -2221,7 +2221,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
 }
 
 /**
- * ftrace_test_record, check if the record has been enabled or not
+ * ftrace_test_record - check if the record has been enabled or not
  * @rec: the record to test
  * @enable: set to true to check if enabled, false if it is disabled
  *
@@ -2574,7 +2574,7 @@ struct ftrace_rec_iter {
 };
 
 /**
- * ftrace_rec_iter_start, start up iterating over traced functions
+ * ftrace_rec_iter_start - start up iterating over traced functions
  *
  * Returns an iterator handle that is used to iterate over all
  * the records that represent address locations where functions
@@ -2605,7 +2605,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
 }
 
 /**
- * ftrace_rec_iter_next, get the next record to process.
+ * ftrace_rec_iter_next - get the next record to process.
  * @iter: The handle to the iterator.
  *
  * Returns the next iterator after the given iterator @iter.
@@ -2630,7 +2630,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
 }
 
 /**
- * ftrace_rec_iter_record, get the record at the iterator location
+ * ftrace_rec_iter_record - get the record at the iterator location
  * @iter: The current iterator location
  *
  * Returns the record that the current @iter is at.
@@ -2733,7 +2733,7 @@ static int __ftrace_modify_code(void *data)
 }
 
 /**
- * ftrace_run_stop_machine, go back to the stop machine method
+ * ftrace_run_stop_machine - go back to the stop machine method
  * @command: The command to tell ftrace what to do
  *
  * If an arch needs to fall back to the stop machine method, the
@@ -2745,7 +2745,7 @@ void ftrace_run_stop_machine(int command)
 }
 
 /**
- * arch_ftrace_update_code, modify the code to trace or not trace
+ * arch_ftrace_update_code - modify the code to trace or not trace
  * @command: The command that needs to be done
  *
  * Archs can override this function if it does not need to
@@ -7525,7 +7525,9 @@ void ftrace_kill(void)
 }
 
 /**
- * Test if ftrace is dead or not.
+ * ftrace_is_dead - Test if ftrace is dead or not.
+ *
+ * Returns 1 if ftrace is "dead", zero otherwise.
  */
 int ftrace_is_dead(void)
 {
index c4a15ae..928867f 100644 (file)
@@ -904,8 +904,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
 
        if (IS_ERR(ep)) {
                ret = PTR_ERR(ep);
-               /* This must return -ENOMEM, else there is a bug */
-               WARN_ON_ONCE(ret != -ENOMEM);
+               /* This must return -ENOMEM or -ENODEV (missing event), else there is a bug */
+               WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
                ep = NULL;
                goto error;
        }
index a2f38e2..9f4262e 100644 (file)
@@ -20,8 +20,8 @@
  *
  * The worst case for in-place decompression is that the beginning of
  * the file is compressed extremely well, and the rest of the file is
- * uncompressible. Thus, we must look for worst-case expansion when the
- * compressor is encoding uncompressible data.
+ * incompressible. Thus, we must look for worst-case expansion when the
+ * compressor is encoding incompressible data.
  *
  * The structure of the .xz file in case of a compressed kernel is as follows.
  * Sizes (as bytes) of the fields are in parenthesis.
@@ -58,7 +58,7 @@
  * uncompressed size of the payload is in practice never less than the
  * payload size itself. The LZMA2 format would allow uncompressed size
  * to be less than the payload size, but no sane compressor creates such
- * files. LZMA2 supports storing uncompressible data in uncompressed form,
+ * files. LZMA2 supports storing incompressible data in uncompressed form,
  * so there's never a need to create payloads whose uncompressed size is
  * smaller than the compressed size.
  *
  * memeq and memzero are not used much and any remotely sane implementation
  * is fast enough. memcpy/memmove speed matters in multi-call mode, but
  * the kernel image is decompressed in single-call mode, in which only
- * memcpy speed can matter and only if there is a lot of uncompressible data
- * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the
+ * memmove speed can matter and only if there is a lot of incompressible data
+ * (LZMA2 stores incompressible chunks in uncompressed form). Thus, the
  * functions below should just be kept small; it's probably not worth
  * optimizing for speed.
  */
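
These comment fixes track the memcpy() to memmove() switch made in the LZMA2 decoder later in this series: with malformed input, in-place decompression can end up with overlapping source and destination buffers, which is undefined behaviour for memcpy() but well defined for memmove(). A minimal userspace sketch of the difference (illustrative only, not kernel code):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16] = "abcdefgh";

	/*
	 * Overlapping copy: shift the first six bytes right by two.
	 * memcpy() would be undefined behaviour here; memmove() copies
	 * as if through a temporary buffer and is always safe.
	 */
	memmove(buf + 2, buf, 6);
	printf("%s\n", buf);	/* prints "ababcdef" */
	return 0;
}
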
index 4515439..53e7eb1 100644 (file)
@@ -217,11 +217,12 @@ static void fprop_reflect_period_percpu(struct fprop_global *p,
 }
 
 /* Event of type pl happened */
-void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl,
+               long nr)
 {
        fprop_reflect_period_percpu(p, pl);
-       percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
-       percpu_counter_add(&p->events, 1);
+       percpu_counter_add_batch(&pl->events, nr, PROP_BATCH);
+       percpu_counter_add(&p->events, nr);
 }
 
 void fprop_fraction_percpu(struct fprop_global *p,
@@ -253,20 +254,29 @@ void fprop_fraction_percpu(struct fprop_global *p,
 }
 
 /*
- * Like __fprop_inc_percpu() except that event is counted only if the given
+ * Like __fprop_add_percpu() except that event is counted only if the given
  * type has fraction smaller than @max_frac/FPROP_FRAC_BASE
  */
-void __fprop_inc_percpu_max(struct fprop_global *p,
-                           struct fprop_local_percpu *pl, int max_frac)
+void __fprop_add_percpu_max(struct fprop_global *p,
+               struct fprop_local_percpu *pl, int max_frac, long nr)
 {
        if (unlikely(max_frac < FPROP_FRAC_BASE)) {
                unsigned long numerator, denominator;
+               s64 tmp;
 
                fprop_fraction_percpu(p, pl, &numerator, &denominator);
-               if (numerator >
-                   (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT)
+               /* Adding 'nr' to fraction exceeds max_frac/FPROP_FRAC_BASE? */
+               tmp = (u64)denominator * max_frac -
+                                       ((u64)numerator << FPROP_FRAC_SHIFT);
+               if (tmp < 0) {
+                       /* Maximum fraction already exceeded? */
                        return;
+               } else if (tmp < nr * (FPROP_FRAC_BASE - max_frac)) {
+                       /* Add just enough for the fraction to saturate */
+                       nr = div_u64(tmp + FPROP_FRAC_BASE - max_frac - 1,
+                                       FPROP_FRAC_BASE - max_frac);
+               }
        }
 
-       __fprop_inc_percpu(p, pl);
+       __fprop_add_percpu(p, pl, nr);
 }
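
The new clamp works out, in integer arithmetic, how far nr can go before (numerator + nr) / (denominator + nr) reaches max_frac / FPROP_FRAC_BASE: rearranging that inequality gives nr * (FPROP_FRAC_BASE - max_frac) <= denominator * max_frac - (numerator << FPROP_FRAC_SHIFT), which is exactly the tmp comparison above. A standalone sketch of the same algebra with made-up numbers (a 50% cap, FPROP_FRAC_BASE taken as 1024 purely for illustration):

#include <stdio.h>

int main(void)
{
	long long numerator = 100, denominator = 1000;	/* current counters */
	long long frac_base = 1024, max_frac = 512;	/* 50% cap */
	long long nr = 100000;				/* requested increment */
	long long tmp;

	tmp = denominator * max_frac - numerator * frac_base;
	if (tmp < 0)
		nr = 0;		/* fraction already over the cap: add nothing */
	else if (tmp < nr * (frac_base - max_frac))
		/* round up so the fraction just saturates */
		nr = (tmp + frac_base - max_frac - 1) / (frac_base - max_frac);

	/* prints 800: (100 + 800) / (1000 + 800) is exactly 50% */
	printf("clamped nr = %lld\n", nr);
	return 0;
}
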
index 4d0e05e..a57a0e1 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include <trace/events/random.h>
 
index b25db9b..2709ab8 100644 (file)
@@ -489,6 +489,57 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
 
+unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
+                                       unsigned int *offset)
+{
+       struct sbitmap *sb = &sbq->sb;
+       unsigned int hint, depth;
+       unsigned long index, nr;
+       int i;
+
+       if (unlikely(sb->round_robin))
+               return 0;
+
+       depth = READ_ONCE(sb->depth);
+       hint = update_alloc_hint_before_get(sb, depth);
+
+       index = SB_NR_TO_INDEX(sb, hint);
+
+       for (i = 0; i < sb->map_nr; i++) {
+               struct sbitmap_word *map = &sb->map[index];
+               unsigned long get_mask;
+
+               sbitmap_deferred_clear(map);
+               if (map->word == (1UL << (map->depth - 1)) - 1)
+                       continue;
+
+               nr = find_first_zero_bit(&map->word, map->depth);
+               if (nr + nr_tags <= map->depth) {
+                       atomic_long_t *ptr = (atomic_long_t *) &map->word;
+                       int map_tags = min_t(int, nr_tags, map->depth);
+                       unsigned long val, ret;
+
+                       get_mask = ((1UL << map_tags) - 1) << nr;
+                       do {
+                               val = READ_ONCE(map->word);
+                               ret = atomic_long_cmpxchg(ptr, val, get_mask | val);
+                       } while (ret != val);
+                       get_mask = (get_mask & ~ret) >> nr;
+                       if (get_mask) {
+                               *offset = nr + (index << sb->shift);
+                               update_alloc_hint_after_get(sb, depth, hint,
+                                                       *offset + map_tags - 1);
+                               return get_mask;
+                       }
+               }
+               /* Jump to next index. */
+               if (++index >= sb->map_nr)
+                       index = 0;
+       }
+
+       return 0;
+}
+
 int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
                                unsigned int shallow_depth)
 {
@@ -577,6 +628,46 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
+static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
+{
+       if (likely(!sb->round_robin && tag < sb->depth))
+               data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag);
+}
+
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+                               int *tags, int nr_tags)
+{
+       struct sbitmap *sb = &sbq->sb;
+       unsigned long *addr = NULL;
+       unsigned long mask = 0;
+       int i;
+
+       smp_mb__before_atomic();
+       for (i = 0; i < nr_tags; i++) {
+               const int tag = tags[i] - offset;
+               unsigned long *this_addr;
+
+               /* since we're clearing a batch, skip the deferred map */
+               this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
+               if (!addr) {
+                       addr = this_addr;
+               } else if (addr != this_addr) {
+                       atomic_long_andnot(mask, (atomic_long_t *) addr);
+                       mask = 0;
+                       addr = this_addr;
+               }
+               mask |= (1UL << SB_NR_TO_BIT(sb, tag));
+       }
+
+       if (mask)
+               atomic_long_andnot(mask, (atomic_long_t *) addr);
+
+       smp_mb__after_atomic();
+       sbitmap_queue_wake_up(sbq);
+       sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
+                                       tags[nr_tags - 1] - offset);
+}
+
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu)
 {
@@ -601,9 +692,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
         */
        smp_mb__after_atomic();
        sbitmap_queue_wake_up(sbq);
-
-       if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
-               *per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
+       sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
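
__sbitmap_queue_get_batch() above claims a whole run of tags with one cmpxchg on the bitmap word and then keeps only the bits that were actually free in the value it raced against. A minimal userspace sketch of that claim-and-filter pattern using C11 atomics (illustrative only, none of the sbitmap internals):

#include <stdatomic.h>
#include <stdio.h>

static unsigned long claim_bits(_Atomic unsigned long *word,
				unsigned int first, unsigned int count)
{
	unsigned long mask = ((1UL << count) - 1) << first;
	unsigned long old = atomic_load(word);

	/* Set the whole run in one go; 'old' is refreshed on failure. */
	while (!atomic_compare_exchange_weak(word, &old, old | mask))
		;

	/* Bits that were already set in 'old' belong to someone else. */
	return (mask & ~old) >> first;
}

int main(void)
{
	_Atomic unsigned long word = 0x5UL;	/* bits 0 and 2 already taken */
	unsigned long got = claim_bits(&word, 0, 4);

	/* prints 0xa: the caller owns bits 1 and 3 of the requested batch */
	printf("claimed mask within the batch: %#lx\n", got);
	return 0;
}
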
 
index 5cb5024..adce22a 100644 (file)
@@ -39,6 +39,19 @@ config XZ_DEC_SPARC
        default y
        select XZ_DEC_BCJ
 
+config XZ_DEC_MICROLZMA
+       bool "MicroLZMA decoder"
+       default n
+       help
+         MicroLZMA is a header format variant where the first byte
+         of a raw LZMA stream (without the end of stream marker) has
+         been replaced with a bitwise-negation of the lc/lp/pb
+         properties byte. MicroLZMA was created to be used in EROFS
+         but can be used elsewhere too, wherever wasting as little
+         space as possible on headers is important.
+
+         Unless you know that you need this, say N.
+
 endif
 
 config XZ_DEC_BCJ
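
Per the help text above, the only on-disk difference from a raw LZMA stream is that first byte: the classic LZMA properties byte (conventionally encoded as (pb * 5 + lp) * 9 + lc) is stored bitwise-negated. A tiny sketch assuming that classic encoding (the values are just the common defaults):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t lc = 3, lp = 0, pb = 2;		/* common LZMA defaults */
	uint8_t props = (pb * 5 + lp) * 9 + lc;	/* classic props byte: 0x5d */
	uint8_t first_byte = (uint8_t)~props;	/* what MicroLZMA stores */

	printf("props=%#x, MicroLZMA first byte=%#x\n", props, first_byte);

	/*
	 * A decoder recovers the properties by negating the byte again,
	 * which is what the ~b->in[b->in_pos] in the decoder added later
	 * in this series does.
	 */
	return 0;
}
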
index 7a6781e..27ce345 100644 (file)
@@ -248,6 +248,10 @@ struct lzma2_dec {
         * before the first LZMA chunk.
         */
        bool need_props;
+
+#ifdef XZ_DEC_MICROLZMA
+       bool pedantic_microlzma;
+#endif
 };
 
 struct xz_dec_lzma2 {
@@ -387,7 +391,14 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
 
                *left -= copy_size;
 
-               memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size);
+               /*
+                * If doing in-place decompression in single-call mode and the
+                * uncompressed size of the file is larger than the caller
+                * thought (i.e. it is invalid input!), the buffers below may
+                * overlap and cause undefined behavior with memcpy().
+                * With valid inputs memcpy() would be fine here.
+                */
+               memmove(dict->buf + dict->pos, b->in + b->in_pos, copy_size);
                dict->pos += copy_size;
 
                if (dict->full < dict->pos)
@@ -397,7 +408,11 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
                        if (dict->pos == dict->end)
                                dict->pos = 0;
 
-                       memcpy(b->out + b->out_pos, b->in + b->in_pos,
+                       /*
+                        * Like above but for multi-call mode: use memmove()
+                        * to avoid undefined behavior with invalid input.
+                        */
+                       memmove(b->out + b->out_pos, b->in + b->in_pos,
                                        copy_size);
                }
 
@@ -408,6 +423,12 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
        }
 }
 
+#ifdef XZ_DEC_MICROLZMA
+#      define DICT_FLUSH_SUPPORTS_SKIPPING true
+#else
+#      define DICT_FLUSH_SUPPORTS_SKIPPING false
+#endif
+
 /*
  * Flush pending data from dictionary to b->out. It is assumed that there is
  * enough space in b->out. This is guaranteed because caller uses dict_limit()
@@ -421,8 +442,19 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b)
                if (dict->pos == dict->end)
                        dict->pos = 0;
 
-               memcpy(b->out + b->out_pos, dict->buf + dict->start,
-                               copy_size);
+               /*
+                * These buffers cannot overlap even if doing in-place
+                * decompression because in multi-call mode dict->buf
+                * has been allocated by us in this file; it's not
+                * provided by the caller like in single-call mode.
+                *
+                * With MicroLZMA, b->out can be NULL to skip bytes that
+                * the caller doesn't need. This cannot be done with XZ
+                * because it would break BCJ filters.
+                */
+               if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL)
+                       memcpy(b->out + b->out_pos, dict->buf + dict->start,
+                                       copy_size);
        }
 
        dict->start = dict->pos;
@@ -488,7 +520,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc)
  * functions so that the compiler is supposed to be able to more easily avoid
  * an extra branch. In this particular version of the LZMA decoder, this
  * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3
- * on x86). Using a non-splitted version results in nicer looking code too.
+ * on x86). Using a non-split version results in nicer looking code too.
  *
  * NOTE: This must return an int. Do not make it return a bool or the speed
  * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care,
@@ -774,6 +806,7 @@ static void lzma_reset(struct xz_dec_lzma2 *s)
        s->lzma.rep1 = 0;
        s->lzma.rep2 = 0;
        s->lzma.rep3 = 0;
+       s->lzma.len = 0;
 
        /*
         * All probabilities are initialized to the same value. This hack
@@ -1157,8 +1190,6 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props)
                }
        }
 
-       s->lzma.len = 0;
-
        s->lzma2.sequence = SEQ_CONTROL;
        s->lzma2.need_dict_reset = true;
 
@@ -1174,3 +1205,140 @@ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s)
 
        kfree(s);
 }
+
+#ifdef XZ_DEC_MICROLZMA
+/* This is a wrapper struct to have a nice struct name in the public API. */
+struct xz_dec_microlzma {
+       struct xz_dec_lzma2 s;
+};
+
+enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr,
+                                struct xz_buf *b)
+{
+       struct xz_dec_lzma2 *s = &s_ptr->s;
+
+       /*
+        * sequence is SEQ_PROPERTIES before the first input byte,
+        * SEQ_LZMA_PREPARE until a total of five bytes have been read,
+        * and SEQ_LZMA_RUN for the rest of the input stream.
+        */
+       if (s->lzma2.sequence != SEQ_LZMA_RUN) {
+               if (s->lzma2.sequence == SEQ_PROPERTIES) {
+                       /* One byte is needed for the props. */
+                       if (b->in_pos >= b->in_size)
+                               return XZ_OK;
+
+                       /*
+                        * Don't increment b->in_pos here. The same byte is
+                        * also passed to rc_read_init() which will ignore it.
+                        */
+                       if (!lzma_props(s, ~b->in[b->in_pos]))
+                               return XZ_DATA_ERROR;
+
+                       s->lzma2.sequence = SEQ_LZMA_PREPARE;
+               }
+
+               /*
+                * xz_dec_microlzma_reset() doesn't validate the compressed
+                * size so we do it here. We have to limit the maximum size
+                * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice
+                * round number and much more than users of this code should
+                * ever need.
+                */
+               if (s->lzma2.compressed < RC_INIT_BYTES
+                               || s->lzma2.compressed > (3U << 30))
+                       return XZ_DATA_ERROR;
+
+               if (!rc_read_init(&s->rc, b))
+                       return XZ_OK;
+
+               s->lzma2.compressed -= RC_INIT_BYTES;
+               s->lzma2.sequence = SEQ_LZMA_RUN;
+
+               dict_reset(&s->dict, b);
+       }
+
+       /* This is to allow increasing b->out_size between calls. */
+       if (DEC_IS_SINGLE(s->dict.mode))
+               s->dict.end = b->out_size - b->out_pos;
+
+       while (true) {
+               dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos,
+                                          s->lzma2.uncompressed));
+
+               if (!lzma2_lzma(s, b))
+                       return XZ_DATA_ERROR;
+
+               s->lzma2.uncompressed -= dict_flush(&s->dict, b);
+
+               if (s->lzma2.uncompressed == 0) {
+                       if (s->lzma2.pedantic_microlzma) {
+                               if (s->lzma2.compressed > 0 || s->lzma.len > 0
+                                               || !rc_is_finished(&s->rc))
+                                       return XZ_DATA_ERROR;
+                       }
+
+                       return XZ_STREAM_END;
+               }
+
+               if (b->out_pos == b->out_size)
+                       return XZ_OK;
+
+               if (b->in_pos == b->in_size
+                               && s->temp.size < s->lzma2.compressed)
+                       return XZ_OK;
+       }
+}
+
+struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+                                               uint32_t dict_size)
+{
+       struct xz_dec_microlzma *s;
+
+       /* Restrict dict_size to the same range as in the LZMA2 code. */
+       if (dict_size < 4096 || dict_size > (3U << 30))
+               return NULL;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (s == NULL)
+               return NULL;
+
+       s->s.dict.mode = mode;
+       s->s.dict.size = dict_size;
+
+       if (DEC_IS_MULTI(mode)) {
+               s->s.dict.end = dict_size;
+
+               s->s.dict.buf = vmalloc(dict_size);
+               if (s->s.dict.buf == NULL) {
+                       kfree(s);
+                       return NULL;
+               }
+       }
+
+       return s;
+}
+
+void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size,
+                           uint32_t uncomp_size, int uncomp_size_is_exact)
+{
+       /*
+        * comp_size is validated in xz_dec_microlzma_run().
+        * uncomp_size can safely be anything.
+        */
+       s->s.lzma2.compressed = comp_size;
+       s->s.lzma2.uncompressed = uncomp_size;
+       s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact;
+
+       s->s.lzma2.sequence = SEQ_PROPERTIES;
+       s->s.temp.size = 0;
+}
+
+void xz_dec_microlzma_end(struct xz_dec_microlzma *s)
+{
+       if (DEC_IS_MULTI(s->s.dict.mode))
+               vfree(s->s.dict.buf);
+
+       kfree(s);
+}
+#endif
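
Putting the new entry points together, a rough multi-call usage sketch (error handling trimmed; the demo_decompress() name, buffer handling and dictionary size are hypothetical, while the xz_dec_microlzma_* calls, enum xz_mode and struct xz_buf fields come from the code above and the xz.h declarations):

/* Assumes <linux/xz.h> for the declarations and <linux/errno.h>. */
static int demo_decompress(const u8 *in, size_t in_len,
			   u8 *out, size_t out_len)
{
	struct xz_dec_microlzma *s;
	struct xz_buf b = {
		.in = in, .in_pos = 0, .in_size = in_len,
		.out = out, .out_pos = 0, .out_size = out_len,
	};
	enum xz_ret ret;

	/* Multi-call mode; pick a dictionary large enough for the data. */
	s = xz_dec_microlzma_alloc(XZ_PREALLOC, 1 << 18);
	if (!s)
		return -ENOMEM;

	/*
	 * comp_size is taken as the exact compressed size here; with the
	 * last argument false, uncomp_size may be an upper bound instead.
	 */
	xz_dec_microlzma_reset(s, in_len, out_len, true);

	do {
		ret = xz_dec_microlzma_run(s, &b);
	} while (ret == XZ_OK && b.out_pos < b.out_size);

	xz_dec_microlzma_end(s);
	return ret == XZ_STREAM_END ? 0 : -EINVAL;
}
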
index fea86de..683570b 100644 (file)
@@ -402,12 +402,12 @@ static enum xz_ret dec_stream_header(struct xz_dec *s)
         * we will accept other check types too, but then the check won't
         * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given.
         */
+       if (s->temp.buf[HEADER_MAGIC_SIZE + 1] > XZ_CHECK_MAX)
+               return XZ_OPTIONS_ERROR;
+
        s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1];
 
 #ifdef XZ_DEC_ANY_CHECK
-       if (s->check_type > XZ_CHECK_MAX)
-               return XZ_OPTIONS_ERROR;
-
        if (s->check_type > XZ_CHECK_CRC32)
                return XZ_UNSUPPORTED_CHECK;
 #else
index 32eb3c0..61098c6 100644 (file)
@@ -15,8 +15,15 @@ EXPORT_SYMBOL(xz_dec_reset);
 EXPORT_SYMBOL(xz_dec_run);
 EXPORT_SYMBOL(xz_dec_end);
 
+#ifdef CONFIG_XZ_DEC_MICROLZMA
+EXPORT_SYMBOL(xz_dec_microlzma_alloc);
+EXPORT_SYMBOL(xz_dec_microlzma_reset);
+EXPORT_SYMBOL(xz_dec_microlzma_run);
+EXPORT_SYMBOL(xz_dec_microlzma_end);
+#endif
+
 MODULE_DESCRIPTION("XZ decompressor");
-MODULE_VERSION("1.0");
+MODULE_VERSION("1.1");
 MODULE_AUTHOR("Lasse Collin <lasse.collin@tukaani.org> and Igor Pavlov");
 
 /*
index 09360eb..bf1e94e 100644 (file)
@@ -37,6 +37,9 @@
 #              ifdef CONFIG_XZ_DEC_SPARC
 #                      define XZ_DEC_SPARC
 #              endif
+#              ifdef CONFIG_XZ_DEC_MICROLZMA
+#                      define XZ_DEC_MICROLZMA
+#              endif
 #              define memeq(a, b, size) (memcmp(a, b, size) == 0)
 #              define memzero(buf, size) memset(buf, 0, size)
 #      endif
index fc60a40..d6c0042 100644 (file)
@@ -46,7 +46,7 @@ mmu-$(CONFIG_MMU)     += process_vm_access.o
 endif
 
 obj-y                  := filemap.o mempool.o oom_kill.o fadvise.o \
-                          maccess.o page-writeback.o \
+                          maccess.o page-writeback.o folio-compat.o \
                           readahead.o swap.o truncate.o vmscan.o shmem.o \
                           util.o mmzone.o vmstat.o backing-dev.o \
                           mm_init.o percpu.o slab_common.o \
index 4a9d4e2..c878d99 100644 (file)
@@ -2,8 +2,9 @@
 
 #include <linux/wait.h>
 #include <linux/rbtree.h>
-#include <linux/backing-dev.h>
 #include <linux/kthread.h>
+#include <linux/backing-dev.h>
+#include <linux/blk-cgroup.h>
 #include <linux/freezer.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -977,6 +978,22 @@ void bdi_put(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_put);
 
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+       struct super_block *sb;
+
+       if (!inode)
+               return &noop_backing_dev_info;
+
+       sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+       if (sb_is_blkdev_sb(sb))
+               return I_BDEV(inode)->bd_disk->bdi;
+#endif
+       return sb->s_bdi;
+}
+EXPORT_SYMBOL(inode_to_bdi);
+
 const char *bdi_dev_name(struct backing_dev_info *bdi)
 {
        if (!bdi || !bdi->dev)
index bfc93da..fbc60f9 100644 (file)
@@ -1022,7 +1022,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                if (!TestClearPageLRU(page))
                        goto isolate_fail_put;
 
-               lruvec = mem_cgroup_page_lruvec(page);
+               lruvec = folio_lruvec(page_folio(page));
 
                /* If we already hold the lock, we can skip some rechecking */
                if (lruvec != locked) {
@@ -1032,7 +1032,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
                        locked = lruvec;
 
-                       lruvec_memcg_debug(lruvec, page);
+                       lruvec_memcg_debug(lruvec, page_folio(page));
 
                        /* Try get exclusive access under lock */
                        if (!skip_updated) {
index c938a9c..7008c37 100644 (file)
@@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test)
        r = damon_new_region(0, 22);
        damon_add_region(r, t);
        damon_split_regions_of(c, t, 2);
-       KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u);
+       KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
        damon_free_target(t);
 
        t = damon_new_target(42);
        r = damon_new_region(0, 220);
        damon_add_region(r, t);
        damon_split_regions_of(c, t, 4);
-       KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u);
+       KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
        damon_free_target(t);
        damon_destroy_ctx(c);
 }
index dae4812..5e206a4 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
-#include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/cpuset.h>
 #include <linux/hugetlb.h>
@@ -835,6 +834,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  */
 void replace_page_cache_page(struct page *old, struct page *new)
 {
+       struct folio *fold = page_folio(old);
+       struct folio *fnew = page_folio(new);
        struct address_space *mapping = old->mapping;
        void (*freepage)(struct page *) = mapping->a_ops->freepage;
        pgoff_t offset = old->index;
@@ -848,7 +849,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
        new->mapping = mapping;
        new->index = offset;
 
-       mem_cgroup_migrate(old, new);
+       mem_cgroup_migrate(fold, fnew);
 
        xas_lock_irq(&xas);
        xas_store(&xas, new);
@@ -870,26 +871,25 @@ void replace_page_cache_page(struct page *old, struct page *new)
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-noinline int __add_to_page_cache_locked(struct page *page,
-                                       struct address_space *mapping,
-                                       pgoff_t offset, gfp_t gfp,
-                                       void **shadowp)
+noinline int __filemap_add_folio(struct address_space *mapping,
+               struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 {
-       XA_STATE(xas, &mapping->i_pages, offset);
-       int huge = PageHuge(page);
+       XA_STATE(xas, &mapping->i_pages, index);
+       int huge = folio_test_hugetlb(folio);
        int error;
        bool charged = false;
 
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
        mapping_set_update(&xas, mapping);
 
-       get_page(page);
-       page->mapping = mapping;
-       page->index = offset;
+       folio_get(folio);
+       folio->mapping = mapping;
+       folio->index = index;
 
        if (!huge) {
-               error = mem_cgroup_charge(page, NULL, gfp);
+               error = mem_cgroup_charge(folio, NULL, gfp);
+               VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
                if (error)
                        goto error;
                charged = true;
@@ -901,7 +901,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
                unsigned int order = xa_get_order(xas.xa, xas.xa_index);
                void *entry, *old = NULL;
 
-               if (order > thp_order(page))
+               if (order > folio_order(folio))
                        xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
                                        order, gfp);
                xas_lock_irq(&xas);
@@ -918,13 +918,13 @@ noinline int __add_to_page_cache_locked(struct page *page,
                                *shadowp = old;
                        /* entry may have been split before we acquired lock */
                        order = xa_get_order(xas.xa, xas.xa_index);
-                       if (order > thp_order(page)) {
+                       if (order > folio_order(folio)) {
                                xas_split(&xas, old, order);
                                xas_reset(&xas);
                        }
                }
 
-               xas_store(&xas, page);
+               xas_store(&xas, folio);
                if (xas_error(&xas))
                        goto unlock;
 
@@ -932,7 +932,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
 
                /* hugetlb pages do not participate in page cache accounting */
                if (!huge)
-                       __inc_lruvec_page_state(page, NR_FILE_PAGES);
+                       __lruvec_stat_add_folio(folio, NR_FILE_PAGES);
 unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));
@@ -940,19 +940,19 @@ unlock:
        if (xas_error(&xas)) {
                error = xas_error(&xas);
                if (charged)
-                       mem_cgroup_uncharge(page);
+                       mem_cgroup_uncharge(folio);
                goto error;
        }
 
-       trace_mm_filemap_add_to_page_cache(page);
+       trace_mm_filemap_add_to_page_cache(&folio->page);
        return 0;
 error:
-       page->mapping = NULL;
+       folio->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
-       put_page(page);
+       folio_put(folio);
        return error;
 }
-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
+ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
 
 /**
  * add_to_page_cache_locked - add a locked page to the pagecache
@@ -969,59 +969,58 @@ ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                pgoff_t offset, gfp_t gfp_mask)
 {
-       return __add_to_page_cache_locked(page, mapping, offset,
+       return __filemap_add_folio(mapping, page_folio(page), offset,
                                          gfp_mask, NULL);
 }
 EXPORT_SYMBOL(add_to_page_cache_locked);
 
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-                               pgoff_t offset, gfp_t gfp_mask)
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+                               pgoff_t index, gfp_t gfp)
 {
        void *shadow = NULL;
        int ret;
 
-       __SetPageLocked(page);
-       ret = __add_to_page_cache_locked(page, mapping, offset,
-                                        gfp_mask, &shadow);
+       __folio_set_locked(folio);
+       ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
        if (unlikely(ret))
-               __ClearPageLocked(page);
+               __folio_clear_locked(folio);
        else {
                /*
-                * The page might have been evicted from cache only
+                * The folio might have been evicted from cache only
                 * recently, in which case it should be activated like
-                * any other repeatedly accessed page.
-                * The exception is pages getting rewritten; evicting other
+                * any other repeatedly accessed folio.
+                * The exception is folios getting rewritten; evicting other
                 * data from the working set, only to cache data that will
                 * get overwritten with something else, is a waste of memory.
                 */
-               WARN_ON_ONCE(PageActive(page));
-               if (!(gfp_mask & __GFP_WRITE) && shadow)
-                       workingset_refault(page, shadow);
-               lru_cache_add(page);
+               WARN_ON_ONCE(folio_test_active(folio));
+               if (!(gfp & __GFP_WRITE) && shadow)
+                       workingset_refault(folio, shadow);
+               folio_add_lru(folio);
        }
        return ret;
 }
-EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
+EXPORT_SYMBOL_GPL(filemap_add_folio);
 
 #ifdef CONFIG_NUMA
-struct page *__page_cache_alloc(gfp_t gfp)
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
 {
        int n;
-       struct page *page;
+       struct folio *folio;
 
        if (cpuset_do_page_mem_spread()) {
                unsigned int cpuset_mems_cookie;
                do {
                        cpuset_mems_cookie = read_mems_allowed_begin();
                        n = cpuset_mem_spread_node();
-                       page = __alloc_pages_node(n, gfp, 0);
-               } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+                       folio = __folio_alloc_node(gfp, order, n);
+               } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
 
-               return page;
+               return folio;
        }
-       return alloc_pages(gfp, 0);
+       return folio_alloc(gfp, order);
 }
-EXPORT_SYMBOL(__page_cache_alloc);
+EXPORT_SYMBOL(filemap_alloc_folio);
 #endif
 
 /*
@@ -1074,11 +1073,11 @@ EXPORT_SYMBOL(filemap_invalidate_unlock_two);
  */
 #define PAGE_WAIT_TABLE_BITS 8
 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
 
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static wait_queue_head_t *folio_waitqueue(struct folio *folio)
 {
-       return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+       return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
 }
 
 void __init pagecache_init(void)
@@ -1086,7 +1085,7 @@ void __init pagecache_init(void)
        int i;
 
        for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
-               init_waitqueue_head(&page_wait_table[i]);
+               init_waitqueue_head(&folio_wait_table[i]);
 
        page_writeback_init();
 }
@@ -1141,10 +1140,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
         */
        flags = wait->flags;
        if (flags & WQ_FLAG_EXCLUSIVE) {
-               if (test_bit(key->bit_nr, &key->page->flags))
+               if (test_bit(key->bit_nr, &key->folio->flags))
                        return -1;
                if (flags & WQ_FLAG_CUSTOM) {
-                       if (test_and_set_bit(key->bit_nr, &key->page->flags))
+                       if (test_and_set_bit(key->bit_nr, &key->folio->flags))
                                return -1;
                        flags |= WQ_FLAG_DONE;
                }
@@ -1157,7 +1156,7 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
         *
         * So update the flags atomically, and wake up the waiter
         * afterwards to avoid any races. This store-release pairs
-        * with the load-acquire in wait_on_page_bit_common().
+        * with the load-acquire in folio_wait_bit_common().
         */
        smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
        wake_up_state(wait->private, mode);
@@ -1176,14 +1175,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
        return (flags & WQ_FLAG_EXCLUSIVE) != 0;
 }
 
-static void wake_up_page_bit(struct page *page, int bit_nr)
+static void folio_wake_bit(struct folio *folio, int bit_nr)
 {
-       wait_queue_head_t *q = page_waitqueue(page);
+       wait_queue_head_t *q = folio_waitqueue(folio);
        struct wait_page_key key;
        unsigned long flags;
        wait_queue_entry_t bookmark;
 
-       key.page = page;
+       key.folio = folio;
        key.bit_nr = bit_nr;
        key.page_match = 0;
 
@@ -1218,7 +1217,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
         * page waiters.
         */
        if (!waitqueue_active(q) || !key.page_match) {
-               ClearPageWaiters(page);
+               folio_clear_waiters(folio);
                /*
                 * It's possible to miss clearing Waiters here, when we woke
                 * our page waiters, but the hashed waitqueue has waiters for
@@ -1230,19 +1229,19 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
        spin_unlock_irqrestore(&q->lock, flags);
 }
 
-static void wake_up_page(struct page *page, int bit)
+static void folio_wake(struct folio *folio, int bit)
 {
-       if (!PageWaiters(page))
+       if (!folio_test_waiters(folio))
                return;
-       wake_up_page_bit(page, bit);
+       folio_wake_bit(folio, bit);
 }
 
 /*
- * A choice of three behaviors for wait_on_page_bit_common():
+ * A choice of three behaviors for folio_wait_bit_common():
  */
 enum behavior {
        EXCLUSIVE,      /* Hold ref to page and take the bit when woken, like
-                        * __lock_page() waiting on then setting PG_locked.
+                        * __folio_lock() waiting on then setting PG_locked.
                         */
        SHARED,         /* Hold ref to page and check the bit when woken, like
                         * wait_on_page_writeback() waiting on PG_writeback.
@@ -1253,16 +1252,16 @@ enum behavior {
 };
 
 /*
- * Attempt to check (or get) the page bit, and mark us done
+ * Attempt to check (or get) the folio flag, and mark us done
  * if successful.
  */
-static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
                                        struct wait_queue_entry *wait)
 {
        if (wait->flags & WQ_FLAG_EXCLUSIVE) {
-               if (test_and_set_bit(bit_nr, &page->flags))
+               if (test_and_set_bit(bit_nr, &folio->flags))
                        return false;
-       } else if (test_bit(bit_nr, &page->flags))
+       } else if (test_bit(bit_nr, &folio->flags))
                return false;
 
        wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1272,9 +1271,10 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
 /* How many times do we accept lock stealing from under a waiter? */
 int sysctl_page_lock_unfairness = 5;
 
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
-       struct page *page, int bit_nr, int state, enum behavior behavior)
+static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
+               int state, enum behavior behavior)
 {
+       wait_queue_head_t *q = folio_waitqueue(folio);
        int unfairness = sysctl_page_lock_unfairness;
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
@@ -1283,8 +1283,8 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
        unsigned long pflags;
 
        if (bit_nr == PG_locked &&
-           !PageUptodate(page) && PageWorkingset(page)) {
-               if (!PageSwapBacked(page)) {
+           !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
+               if (!folio_test_swapbacked(folio)) {
                        delayacct_thrashing_start();
                        delayacct = true;
                }
@@ -1294,7 +1294,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 
        init_wait(wait);
        wait->func = wake_page_function;
-       wait_page.page = page;
+       wait_page.folio = folio;
        wait_page.bit_nr = bit_nr;
 
 repeat:
@@ -1309,7 +1309,7 @@ repeat:
         * Do one last check whether we can get the
         * page bit synchronously.
         *
-        * Do the SetPageWaiters() marking before that
+        * Do the folio_set_waiters() marking before that
         * to let any waker we _just_ missed know they
         * need to wake us up (otherwise they'll never
         * even go to the slow case that looks at the
@@ -1320,8 +1320,8 @@ repeat:
         * lock to avoid races.
         */
        spin_lock_irq(&q->lock);
-       SetPageWaiters(page);
-       if (!trylock_page_bit_common(page, bit_nr, wait))
+       folio_set_waiters(folio);
+       if (!folio_trylock_flag(folio, bit_nr, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);
 
@@ -1331,10 +1331,10 @@ repeat:
         * see whether the page bit testing has already
         * been done by the wake function.
         *
-        * We can drop our reference to the page.
+        * We can drop our reference to the folio.
         */
        if (behavior == DROP)
-               put_page(page);
+               folio_put(folio);
 
        /*
         * Note that until the "finish_wait()", or until
@@ -1371,7 +1371,7 @@ repeat:
                 *
                 * And if that fails, we'll have to retry this all.
                 */
-               if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+               if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
                        goto repeat;
 
                wait->flags |= WQ_FLAG_DONE;
@@ -1380,7 +1380,7 @@ repeat:
 
        /*
         * If a signal happened, this 'finish_wait()' may remove the last
-        * waiter from the wait-queues, but the PageWaiters bit will remain
+        * waiter from the wait-queues, but the folio waiters bit will remain
         * set. That's ok. The next wakeup will take care of it, and trying
         * to do it here would be difficult and prone to races.
         */
@@ -1411,19 +1411,17 @@ repeat:
        return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
 }
 
-void wait_on_page_bit(struct page *page, int bit_nr)
+void folio_wait_bit(struct folio *folio, int bit_nr)
 {
-       wait_queue_head_t *q = page_waitqueue(page);
-       wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
+       folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
 }
-EXPORT_SYMBOL(wait_on_page_bit);
+EXPORT_SYMBOL(folio_wait_bit);
 
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+int folio_wait_bit_killable(struct folio *folio, int bit_nr)
 {
-       wait_queue_head_t *q = page_waitqueue(page);
-       return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
+       return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
 }
-EXPORT_SYMBOL(wait_on_page_bit_killable);
+EXPORT_SYMBOL(folio_wait_bit_killable);
 
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
@@ -1440,31 +1438,28 @@ EXPORT_SYMBOL(wait_on_page_bit_killable);
  */
 int put_and_wait_on_page_locked(struct page *page, int state)
 {
-       wait_queue_head_t *q;
-
-       page = compound_head(page);
-       q = page_waitqueue(page);
-       return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
+       return folio_wait_bit_common(page_folio(page), PG_locked, state,
+                       DROP);
 }
 
 /**
- * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page: Page defining the wait queue of interest
+ * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
+ * @folio: Folio defining the wait queue of interest
  * @waiter: Waiter to add to the queue
  *
- * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ * Add an arbitrary @waiter to the wait queue for the nominated @folio.
  */
-void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
+void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
 {
-       wait_queue_head_t *q = page_waitqueue(page);
+       wait_queue_head_t *q = folio_waitqueue(folio);
        unsigned long flags;
 
        spin_lock_irqsave(&q->lock, flags);
        __add_wait_queue_entry_tail(q, waiter);
-       SetPageWaiters(page);
+       folio_set_waiters(folio);
        spin_unlock_irqrestore(&q->lock, flags);
 }
-EXPORT_SYMBOL_GPL(add_page_wait_queue);
+EXPORT_SYMBOL_GPL(folio_add_wait_queue);
 
 #ifndef clear_bit_unlock_is_negative_byte
 
@@ -1490,124 +1485,116 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
 #endif
 
 /**
- * unlock_page - unlock a locked page
- * @page: the page
+ * folio_unlock - Unlock a locked folio.
+ * @folio: The folio.
  *
- * Unlocks the page and wakes up sleepers in wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechanism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ * Unlocks the folio and wakes up any thread sleeping on the page lock.
  *
- * Note that this depends on PG_waiters being the sign bit in the byte
- * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
- * clear the PG_locked bit and test PG_waiters at the same time fairly
- * portably (architectures that do LL/SC can test any bit, while x86 can
- * test the sign bit).
+ * Context: May be called from interrupt or process context.  May not be
+ * called from NMI context.
  */
-void unlock_page(struct page *page)
+void folio_unlock(struct folio *folio)
 {
+       /* Bit 7 allows x86 to check the byte's sign bit */
        BUILD_BUG_ON(PG_waiters != 7);
-       page = compound_head(page);
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
-               wake_up_page_bit(page, PG_locked);
+       BUILD_BUG_ON(PG_locked > 7);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
+               folio_wake_bit(folio, PG_locked);
 }
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(folio_unlock);
 
 /**
- * end_page_private_2 - Clear PG_private_2 and release any waiters
- * @page: The page
+ * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
+ * @folio: The folio.
  *
- * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
- * this.  The page ref held for PG_private_2 being set is released.
+ * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
+ * it.  The folio reference held for PG_private_2 being set is released.
  *
- * This is, for example, used when a netfs page is being written to a local
- * disk cache, thereby allowing writes to the cache for the same page to be
+ * This is, for example, used when a netfs folio is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same folio to be
  * serialised.
  */
-void end_page_private_2(struct page *page)
+void folio_end_private_2(struct folio *folio)
 {
-       page = compound_head(page);
-       VM_BUG_ON_PAGE(!PagePrivate2(page), page);
-       clear_bit_unlock(PG_private_2, &page->flags);
-       wake_up_page_bit(page, PG_private_2);
-       put_page(page);
+       VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
+       clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
+       folio_wake_bit(folio, PG_private_2);
+       folio_put(folio);
 }
-EXPORT_SYMBOL(end_page_private_2);
+EXPORT_SYMBOL(folio_end_private_2);
 
 /**
- * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
  *
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
  */
-void wait_on_page_private_2(struct page *page)
+void folio_wait_private_2(struct folio *folio)
 {
-       page = compound_head(page);
-       while (PagePrivate2(page))
-               wait_on_page_bit(page, PG_private_2);
+       while (folio_test_private_2(folio))
+               folio_wait_bit(folio, PG_private_2);
 }
-EXPORT_SYMBOL(wait_on_page_private_2);
+EXPORT_SYMBOL(folio_wait_private_2);
 
 /**
- * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
  *
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
  * fatal signal is received by the calling task.
  *
  * Return:
  * - 0 if successful.
  * - -EINTR if a fatal signal was encountered.
  */
-int wait_on_page_private_2_killable(struct page *page)
+int folio_wait_private_2_killable(struct folio *folio)
 {
        int ret = 0;
 
-       page = compound_head(page);
-       while (PagePrivate2(page)) {
-               ret = wait_on_page_bit_killable(page, PG_private_2);
+       while (folio_test_private_2(folio)) {
+               ret = folio_wait_bit_killable(folio, PG_private_2);
                if (ret < 0)
                        break;
        }
 
        return ret;
 }
-EXPORT_SYMBOL(wait_on_page_private_2_killable);
+EXPORT_SYMBOL(folio_wait_private_2_killable);
 
 /**
- * end_page_writeback - end writeback against a page
- * @page: the page
+ * folio_end_writeback - End writeback against a folio.
+ * @folio: The folio.
  */
-void end_page_writeback(struct page *page)
+void folio_end_writeback(struct folio *folio)
 {
        /*
-        * TestClearPageReclaim could be used here but it is an atomic
-        * operation and overkill in this particular case. Failing to
-        * shuffle a page marked for immediate reclaim is too mild to
-        * justify taking an atomic operation penalty at the end of
-        * ever page writeback.
+        * folio_test_clear_reclaim() could be used here but it is an
+        * atomic operation and overkill in this particular case. Failing
+        * to shuffle a folio marked for immediate reclaim is too mild
+        * a gain to justify taking an atomic operation penalty at the
+        * end of every folio writeback.
         */
-       if (PageReclaim(page)) {
-               ClearPageReclaim(page);
-               rotate_reclaimable_page(page);
+       if (folio_test_reclaim(folio)) {
+               folio_clear_reclaim(folio);
+               folio_rotate_reclaimable(folio);
        }
 
        /*
-        * Writeback does not hold a page reference of its own, relying
+        * Writeback does not hold a folio reference of its own, relying
         * on truncation to wait for the clearing of PG_writeback.
-        * But here we must make sure that the page is not freed and
-        * reused before the wake_up_page().
+        * But here we must make sure that the folio is not freed and
+        * reused before the folio_wake().
         */
-       get_page(page);
-       if (!test_clear_page_writeback(page))
+       folio_get(folio);
+       if (!__folio_end_writeback(folio))
                BUG();
 
        smp_mb__after_atomic();
-       wake_up_page(page, PG_writeback);
-       put_page(page);
+       folio_wake(folio, PG_writeback);
+       folio_put(folio);
 }
-EXPORT_SYMBOL(end_page_writeback);
+EXPORT_SYMBOL(folio_end_writeback);
 
 /*
  * After completing I/O on a page, call this routine to update the page
@@ -1638,39 +1625,35 @@ void page_endio(struct page *page, bool is_write, int err)
 EXPORT_SYMBOL_GPL(page_endio);
 
 /**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @__page: the page to lock
+ * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
+ * @folio: The folio to lock
  */
-void __lock_page(struct page *__page)
+void __folio_lock(struct folio *folio)
 {
-       struct page *page = compound_head(__page);
-       wait_queue_head_t *q = page_waitqueue(page);
-       wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+       folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
                                EXCLUSIVE);
 }
-EXPORT_SYMBOL(__lock_page);
+EXPORT_SYMBOL(__folio_lock);
 
-int __lock_page_killable(struct page *__page)
+int __folio_lock_killable(struct folio *folio)
 {
-       struct page *page = compound_head(__page);
-       wait_queue_head_t *q = page_waitqueue(page);
-       return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+       return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
                                        EXCLUSIVE);
 }
-EXPORT_SYMBOL_GPL(__lock_page_killable);
+EXPORT_SYMBOL_GPL(__folio_lock_killable);
 
-int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
 {
-       struct wait_queue_head *q = page_waitqueue(page);
+       struct wait_queue_head *q = folio_waitqueue(folio);
        int ret = 0;
 
-       wait->page = page;
+       wait->folio = folio;
        wait->bit_nr = PG_locked;
 
        spin_lock_irq(&q->lock);
        __add_wait_queue_entry_tail(q, &wait->wait);
-       SetPageWaiters(page);
-       ret = !trylock_page(page);
+       folio_set_waiters(folio);
+       ret = !folio_trylock(folio);
        /*
         * If we were successful now, we know we're still on the
         * waitqueue as we're still under the lock. This means it's
@@ -1687,16 +1670,16 @@ int __lock_page_async(struct page *page, struct wait_page_queue *wait)
 
 /*
  * Return values:
- * 1 - page is locked; mmap_lock is still held.
- * 0 - page is not locked.
+ * true - folio is locked; mmap_lock is still held.
+ * false - folio is not locked.
  *     mmap_lock has been released (mmap_read_unlock(), unless flags had both
  *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
  *     which case mmap_lock is still held.
  *
- * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
- * with the page locked and the mmap_lock unperturbed.
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
+ * with the folio locked and the mmap_lock unperturbed.
  */
-int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
                         unsigned int flags)
 {
        if (fault_flag_allow_retry_first(flags)) {
@@ -1705,28 +1688,28 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                 * even though return 0.
                 */
                if (flags & FAULT_FLAG_RETRY_NOWAIT)
-                       return 0;
+                       return false;
 
                mmap_read_unlock(mm);
                if (flags & FAULT_FLAG_KILLABLE)
-                       wait_on_page_locked_killable(page);
+                       folio_wait_locked_killable(folio);
                else
-                       wait_on_page_locked(page);
-               return 0;
+                       folio_wait_locked(folio);
+               return false;
        }
        if (flags & FAULT_FLAG_KILLABLE) {
-               int ret;
+               bool ret;
 
-               ret = __lock_page_killable(page);
+               ret = __folio_lock_killable(folio);
                if (ret) {
                        mmap_read_unlock(mm);
-                       return 0;
+                       return false;
                }
        } else {
-               __lock_page(page);
+               __folio_lock(folio);
        }
-       return 1;
 
+       return true;
 }
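
The updated comment above is the whole contract of __folio_lock_or_retry(); a hedged sketch of how a fault path is expected to honour it (the demo function and its simplified error handling are illustrative, not an existing caller):

static vm_fault_t demo_fault_lock(struct folio *folio, struct vm_fault *vmf)
{
	if (folio_trylock(folio))
		return 0;			/* fast path: got the lock */

	if (!__folio_lock_or_retry(folio, vmf->vma->vm_mm, vmf->flags)) {
		/*
		 * Not locked.  Unless FAULT_FLAG_RETRY_NOWAIT was set
		 * together with FAULT_FLAG_ALLOW_RETRY, mmap_lock has
		 * already been dropped for us, so ask the caller to
		 * retry the fault from scratch.
		 */
		return VM_FAULT_RETRY;
	}

	/* Locked, and mmap_lock is still held. */
	return 0;
}
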
 
 /**
@@ -1802,143 +1785,155 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 EXPORT_SYMBOL(page_cache_prev_miss);
 
 /*
+ * Lockless page cache protocol:
+ * On the lookup side:
+ * 1. Load the folio from i_pages
+ * 2. Increment the refcount if it's not zero
+ * 3. If the folio is not found by xas_reload(), put the refcount and retry
+ *
+ * On the removal side:
+ * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
+ * B. Remove the page from i_pages
+ * C. Return the page to the page allocator
+ *
+ * This means that any page may have its reference count temporarily
+ * increased by a speculative page cache (or fast GUP) lookup as it can
+ * be allocated by another user before the RCU grace period expires.
+ * Because the refcount temporarily acquired here may end up being the
+ * last refcount on the page, any page allocation must be freeable by
+ * folio_put().
+ */
+
+/*
  * mapping_get_entry - Get a page cache entry.
  * @mapping: the address_space to search
  * @index: The page cache index.
  *
- * Looks up the page cache slot at @mapping & @index.  If there is a
- * page cache page, the head page is returned with an increased refcount.
+ * Looks up the page cache entry at @mapping & @index.  If it is a folio,
+ * it is returned with an increased refcount.  If it is a shadow entry
+ * of a previously evicted folio, or a swap entry from shmem/tmpfs,
+ * it is returned without further action.
  *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Return: The head page or shadow entry, %NULL if nothing is found.
+ * Return: The folio, swap or shadow entry, %NULL if nothing is found.
  */
-static struct page *mapping_get_entry(struct address_space *mapping,
-               pgoff_t index)
+static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
 {
        XA_STATE(xas, &mapping->i_pages, index);
-       struct page *page;
+       struct folio *folio;
 
        rcu_read_lock();
 repeat:
        xas_reset(&xas);
-       page = xas_load(&xas);
-       if (xas_retry(&xas, page))
+       folio = xas_load(&xas);
+       if (xas_retry(&xas, folio))
                goto repeat;
        /*
         * A shadow entry of a recently evicted page, or a swap entry from
         * shmem/tmpfs.  Return it without attempting to raise page count.
         */
-       if (!page || xa_is_value(page))
+       if (!folio || xa_is_value(folio))
                goto out;
 
-       if (!page_cache_get_speculative(page))
+       if (!folio_try_get_rcu(folio))
                goto repeat;
 
-       /*
-        * Has the page moved or been split?
-        * This is part of the lockless pagecache protocol. See
-        * include/linux/pagemap.h for details.
-        */
-       if (unlikely(page != xas_reload(&xas))) {
-               put_page(page);
+       if (unlikely(folio != xas_reload(&xas))) {
+               folio_put(folio);
                goto repeat;
        }
 out:
        rcu_read_unlock();
 
-       return page;
+       return folio;
 }
 
 /**
- * pagecache_get_page - Find and get a reference to a page.
+ * __filemap_get_folio - Find and get a reference to a folio.
  * @mapping: The address_space to search.
  * @index: The page index.
- * @fgp_flags: %FGP flags modify how the page is returned.
- * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
+ * @fgp_flags: %FGP flags modify how the folio is returned.
+ * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
  *
  * Looks up the page cache entry at @mapping & @index.
  *
  * @fgp_flags can be zero or more of these flags:
  *
- * * %FGP_ACCESSED - The page will be marked accessed.
- * * %FGP_LOCK - The page is returned locked.
- * * %FGP_HEAD - If the page is present and a THP, return the head page
- *   rather than the exact page specified by the index.
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
  * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- *   instead of allocating a new page to replace it.
+ *   instead of allocating a new folio to replace it.
  * * %FGP_CREAT - If no page is present then a new page is allocated using
- *   @gfp_mask and added to the page cache and the VM's LRU list.
+ *   @gfp and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
  * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
  *   page is already in cache.  If the page was allocated, unlock it before
  *   returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
- * * %FGP_NOWAIT - Don't get blocked by page lock
+ * * %FGP_WRITE - The page will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't get blocked by page lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
  *
  * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
  * if the %GFP flags specified for %FGP_CREAT are atomic.
  *
  * If there is a page cache page, it is returned with an increased refcount.
  *
- * Return: The found page or %NULL otherwise.
+ * Return: The found folio or %NULL otherwise.
  */
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-               int fgp_flags, gfp_t gfp_mask)
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+               int fgp_flags, gfp_t gfp)
 {
-       struct page *page;
+       struct folio *folio;
 
 repeat:
-       page = mapping_get_entry(mapping, index);
-       if (xa_is_value(page)) {
+       folio = mapping_get_entry(mapping, index);
+       if (xa_is_value(folio)) {
                if (fgp_flags & FGP_ENTRY)
-                       return page;
-               page = NULL;
+                       return folio;
+               folio = NULL;
        }
-       if (!page)
+       if (!folio)
                goto no_page;
 
        if (fgp_flags & FGP_LOCK) {
                if (fgp_flags & FGP_NOWAIT) {
-                       if (!trylock_page(page)) {
-                               put_page(page);
+                       if (!folio_trylock(folio)) {
+                               folio_put(folio);
                                return NULL;
                        }
                } else {
-                       lock_page(page);
+                       folio_lock(folio);
                }
 
                /* Has the page been truncated? */
-               if (unlikely(page->mapping != mapping)) {
-                       unlock_page(page);
-                       put_page(page);
+               if (unlikely(folio->mapping != mapping)) {
+                       folio_unlock(folio);
+                       folio_put(folio);
                        goto repeat;
                }
-               VM_BUG_ON_PAGE(!thp_contains(page, index), page);
+               VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
        }
 
        if (fgp_flags & FGP_ACCESSED)
-               mark_page_accessed(page);
+               folio_mark_accessed(folio);
        else if (fgp_flags & FGP_WRITE) {
                /* Clear idle flag for buffer write */
-               if (page_is_idle(page))
-                       clear_page_idle(page);
+               if (folio_test_idle(folio))
+                       folio_clear_idle(folio);
        }
-       if (!(fgp_flags & FGP_HEAD))
-               page = find_subpage(page, index);
 
+       if (fgp_flags & FGP_STABLE)
+               folio_wait_stable(folio);
 no_page:
-       if (!page && (fgp_flags & FGP_CREAT)) {
+       if (!folio && (fgp_flags & FGP_CREAT)) {
                int err;
                if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
-                       gfp_mask |= __GFP_WRITE;
+                       gfp |= __GFP_WRITE;
                if (fgp_flags & FGP_NOFS)
-                       gfp_mask &= ~__GFP_FS;
+                       gfp &= ~__GFP_FS;
 
-               page = __page_cache_alloc(gfp_mask);
-               if (!page)
+               folio = filemap_alloc_folio(gfp, 0);
+               if (!folio)
                        return NULL;
 
                if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
@@ -1946,27 +1941,27 @@ no_page:
 
                /* Init accessed so avoid atomic mark_page_accessed later */
                if (fgp_flags & FGP_ACCESSED)
-                       __SetPageReferenced(page);
+                       __folio_set_referenced(folio);
 
-               err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+               err = filemap_add_folio(mapping, folio, index, gfp);
                if (unlikely(err)) {
-                       put_page(page);
-                       page = NULL;
+                       folio_put(folio);
+                       folio = NULL;
                        if (err == -EEXIST)
                                goto repeat;
                }
 
                /*
-                * add_to_page_cache_lru locks the page, and for mmap we expect
-                * an unlocked page.
+                * filemap_add_folio locks the page, and for mmap
+                * we expect an unlocked page.
                 */
-               if (page && (fgp_flags & FGP_FOR_MMAP))
-                       unlock_page(page);
+               if (folio && (fgp_flags & FGP_FOR_MMAP))
+                       folio_unlock(folio);
        }
 
-       return page;
+       return folio;
 }
-EXPORT_SYMBOL(pagecache_get_page);
+EXPORT_SYMBOL(__filemap_get_folio);
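
A hedged usage sketch of the new entry point (not part of this patch; the helper name and the exact flag combination are illustrative assumptions):

/*
 * Hypothetical buffered-write style caller: find or create the folio at
 * @index, locked and stable, dirty it, and release it.
 */
static int example_touch_index(struct address_space *mapping, pgoff_t index)
{
        struct folio *folio;

        folio = __filemap_get_folio(mapping, index,
                                    FGP_LOCK | FGP_CREAT | FGP_STABLE,
                                    mapping_gfp_mask(mapping));
        if (!folio)
                return -ENOMEM;

        /* ... modify the locked folio here ... */

        folio_mark_dirty(folio);
        folio_unlock(folio);
        folio_put(folio);
        return 0;
}
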
 
 static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
                xa_mark_t mark)
@@ -2421,6 +2416,7 @@ static int filemap_update_page(struct kiocb *iocb,
                struct address_space *mapping, struct iov_iter *iter,
                struct page *page)
 {
+       struct folio *folio = page_folio(page);
        int error;
 
        if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -2430,40 +2426,40 @@ static int filemap_update_page(struct kiocb *iocb,
                filemap_invalidate_lock_shared(mapping);
        }
 
-       if (!trylock_page(page)) {
+       if (!folio_trylock(folio)) {
                error = -EAGAIN;
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
                        goto unlock_mapping;
                if (!(iocb->ki_flags & IOCB_WAITQ)) {
                        filemap_invalidate_unlock_shared(mapping);
-                       put_and_wait_on_page_locked(page, TASK_KILLABLE);
+                       put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE);
                        return AOP_TRUNCATED_PAGE;
                }
-               error = __lock_page_async(page, iocb->ki_waitq);
+               error = __folio_lock_async(folio, iocb->ki_waitq);
                if (error)
                        goto unlock_mapping;
        }
 
        error = AOP_TRUNCATED_PAGE;
-       if (!page->mapping)
+       if (!folio->mapping)
                goto unlock;
 
        error = 0;
-       if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
+       if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page))
                goto unlock;
 
        error = -EAGAIN;
        if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
                goto unlock;
 
-       error = filemap_read_page(iocb->ki_filp, mapping, page);
+       error = filemap_read_page(iocb->ki_filp, mapping, &folio->page);
        goto unlock_mapping;
 unlock:
-       unlock_page(page);
+       folio_unlock(folio);
 unlock_mapping:
        filemap_invalidate_unlock_shared(mapping);
        if (error == AOP_TRUNCATED_PAGE)
-               put_page(page);
+               folio_put(folio);
        return error;
 }
 
@@ -2900,7 +2896,9 @@ unlock:
 static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
                                     struct file **fpin)
 {
-       if (trylock_page(page))
+       struct folio *folio = page_folio(page);
+
+       if (folio_trylock(folio))
                return 1;
 
        /*
@@ -2913,7 +2911,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
 
        *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
        if (vmf->flags & FAULT_FLAG_KILLABLE) {
-               if (__lock_page_killable(page)) {
+               if (__folio_lock_killable(folio)) {
                        /*
                         * We didn't have the right flags to drop the mmap_lock,
                         * but all fault_handlers only check for fatal signals
@@ -2925,11 +2923,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
                        return 0;
                }
        } else
-               __lock_page(page);
+               __folio_lock(folio);
+
        return 1;
 }
 
-
 /*
  * Synchronous readahead happens when we don't even find a page in the page
  * cache at all.  We don't want to perform IO under the mmap sem, so if we have
@@ -3708,28 +3706,6 @@ out:
 }
 EXPORT_SYMBOL(generic_file_direct_write);
 
-/*
- * Find or create a page at the given pagecache position. Return the locked
- * page. This function is specifically for buffered writes.
- */
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
-                                       pgoff_t index, unsigned flags)
-{
-       struct page *page;
-       int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
-
-       if (flags & AOP_FLAG_NOFS)
-               fgp_flags |= FGP_NOFS;
-
-       page = pagecache_get_page(mapping, index, fgp_flags,
-                       mapping_gfp_mask(mapping));
-       if (page)
-               wait_for_stable_page(page);
-
-       return page;
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
-
 ssize_t generic_perform_write(struct file *file,
                                struct iov_iter *i, loff_t pos)
 {
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
new file mode 100644 (file)
index 0000000..5b6ae1d
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ * Compatibility functions which bloat the callers too much to make inline.
+ * All of the callers of these functions should be converted to use folios
+ * eventually.
+ */
+
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+struct address_space *page_mapping(struct page *page)
+{
+       return folio_mapping(page_folio(page));
+}
+EXPORT_SYMBOL(page_mapping);
+
+void unlock_page(struct page *page)
+{
+       return folio_unlock(page_folio(page));
+}
+EXPORT_SYMBOL(unlock_page);
+
+void end_page_writeback(struct page *page)
+{
+       return folio_end_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(end_page_writeback);
+
+void wait_on_page_writeback(struct page *page)
+{
+       return folio_wait_writeback(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
+void wait_for_stable_page(struct page *page)
+{
+       return folio_wait_stable(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_for_stable_page);
+
+bool page_mapped(struct page *page)
+{
+       return folio_mapped(page_folio(page));
+}
+EXPORT_SYMBOL(page_mapped);
+
+void mark_page_accessed(struct page *page)
+{
+       folio_mark_accessed(page_folio(page));
+}
+EXPORT_SYMBOL(mark_page_accessed);
+
+#ifdef CONFIG_MIGRATION
+int migrate_page_move_mapping(struct address_space *mapping,
+               struct page *newpage, struct page *page, int extra_count)
+{
+       return folio_migrate_mapping(mapping, page_folio(newpage),
+                                       page_folio(page), extra_count);
+}
+EXPORT_SYMBOL(migrate_page_move_mapping);
+
+void migrate_page_states(struct page *newpage, struct page *page)
+{
+       folio_migrate_flags(page_folio(newpage), page_folio(page));
+}
+EXPORT_SYMBOL(migrate_page_states);
+
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+       folio_migrate_copy(page_folio(newpage), page_folio(page));
+}
+EXPORT_SYMBOL(migrate_page_copy);
+#endif
+
+bool set_page_writeback(struct page *page)
+{
+       return folio_start_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_writeback);
+
+bool set_page_dirty(struct page *page)
+{
+       return folio_mark_dirty(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+int __set_page_dirty_nobuffers(struct page *page)
+{
+       return filemap_dirty_folio(page_mapping(page), page_folio(page));
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+bool clear_page_dirty_for_io(struct page *page)
+{
+       return folio_clear_dirty_for_io(page_folio(page));
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+bool redirty_page_for_writepage(struct writeback_control *wbc,
+               struct page *page)
+{
+       return folio_redirty_for_writepage(wbc, page_folio(page));
+}
+EXPORT_SYMBOL(redirty_page_for_writepage);
+
+void lru_cache_add(struct page *page)
+{
+       folio_add_lru(page_folio(page));
+}
+EXPORT_SYMBOL(lru_cache_add);
+
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+               pgoff_t index, gfp_t gfp)
+{
+       return filemap_add_folio(mapping, page_folio(page), index, gfp);
+}
+EXPORT_SYMBOL(add_to_page_cache_lru);
+
+noinline
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+               int fgp_flags, gfp_t gfp)
+{
+       struct folio *folio;
+
+       folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
+       if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio))
+               return &folio->page;
+       return folio_file_page(folio, index);
+}
+EXPORT_SYMBOL(pagecache_get_page);
+
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+                                       pgoff_t index, unsigned flags)
+{
+       unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+
+       if (flags & AOP_FLAG_NOFS)
+               fgp_flags |= FGP_NOFS;
+       return pagecache_get_page(mapping, index, fgp_flags,
+                       mapping_gfp_mask(mapping));
+}
+EXPORT_SYMBOL(grab_cache_page_write_begin);
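
To show why these shims exist, here is a hedged sketch of a legacy struct-page caller that keeps working unchanged; every call resolves to the folio implementation above. The helper name is hypothetical and the snippet is not part of this patch.

/* Hypothetical legacy caller, still written against struct page. */
static int example_legacy_dirty(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        page = grab_cache_page_write_begin(mapping, index, 0);
        if (!page)
                return -ENOMEM;
        set_page_dirty(page);           /* -> folio_mark_dirty() */
        unlock_page(page);              /* -> folio_unlock() */
        put_page(page);
        return 0;
}
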
index 4212ad0..471d977 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
-#include <linux/blkdev.h>
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
index 92192cb..e548334 100644 (file)
@@ -603,7 +603,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+       if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                count_vm_event(THP_FAULT_FALLBACK_CHARGE);
@@ -2405,7 +2405,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
 static void __split_huge_page(struct page *page, struct list_head *list,
                pgoff_t end)
 {
-       struct page *head = compound_head(page);
+       struct folio *folio = page_folio(page);
+       struct page *head = &folio->page;
        struct lruvec *lruvec;
        struct address_space *swap_cache = NULL;
        unsigned long offset = 0;
@@ -2424,7 +2425,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        }
 
        /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
-       lruvec = lock_page_lruvec(head);
+       lruvec = folio_lruvec_lock(folio);
+
+       ClearPageHasHWPoisoned(head);
 
        for (i = nr - 1; i >= 1; i--) {
                __split_huge_page_tail(head, i, lruvec, list);
index 95dc7b8..6378c10 100644 (file)
@@ -5302,7 +5302,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                        *pagep = NULL;
                        goto out;
                }
-               copy_huge_page(page, *pagep);
+               folio_copy(page_folio(page), page_folio(*pagep));
                put_page(*pagep);
                *pagep = NULL;
        }
index cf3cb93..b1001eb 100644 (file)
 
 void page_writeback_init(void);
 
+static inline void *folio_raw_mapping(struct folio *folio)
+{
+       unsigned long mapping = (unsigned long)folio->mapping;
+
+       return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
+}
+
 vm_fault_t do_swap_page(struct vm_fault *vmf);
+void folio_rotate_reclaimable(struct folio *folio);
+bool __folio_end_writeback(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
@@ -63,17 +72,28 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
                pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
 
 /**
- * page_evictable - test whether a page is evictable
- * @page: the page to test
+ * folio_evictable - Test whether a folio is evictable.
+ * @folio: The folio to test.
  *
- * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list.
- *
- * Reasons page might not be evictable:
- * (1) page's mapping marked unevictable
- * (2) page is part of an mlocked VMA
+ * Test whether @folio is evictable -- i.e., should be placed on
+ * active/inactive lists vs unevictable list.
  *
+ * Reasons a folio might not be evictable:
+ * 1. The folio's mapping is marked unevictable.
+ * 2. One of the pages in the folio is part of an mlocked VMA.
  */
+static inline bool folio_evictable(struct folio *folio)
+{
+       bool ret;
+
+       /* Prevent address_space of inode and swap cache from being freed */
+       rcu_read_lock();
+       ret = !mapping_unevictable(folio_mapping(folio)) &&
+                       !folio_test_mlocked(folio);
+       rcu_read_unlock();
+       return ret;
+}
+
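
A hedged sketch of how a reclaim-style caller might combine the new predicate with other folio state (illustrative only, not part of this patch; the skip-writeback policy and the helper name are assumptions):

/* Hypothetical check: only evictable folios that are not under writeback. */
static inline bool example_may_reclaim(struct folio *folio)
{
        return folio_evictable(folio) && !folio_test_writeback(folio);
}
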
 static inline bool page_evictable(struct page *page)
 {
        bool ret;
index 045cc57..5f02fda 100644 (file)
@@ -445,22 +445,25 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
        if (!transhuge_vma_enabled(vma, vm_flags))
                return false;
 
+       if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
+                               vma->vm_pgoff, HPAGE_PMD_NR))
+               return false;
+
        /* Enabled via shmem mount options or sysfs settings. */
-       if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
-               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-                               HPAGE_PMD_NR);
-       }
+       if (shmem_file(vma->vm_file))
+               return shmem_huge_enabled(vma);
 
        /* THP settings require madvise. */
        if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
                return false;
 
-       /* Read-only file mappings need to be aligned for THP to work. */
+       /* Only regular files are valid */
        if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
-           !inode_is_open_for_write(vma->vm_file->f_inode) &&
            (vm_flags & VM_EXEC)) {
-               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-                               HPAGE_PMD_NR);
+               struct inode *inode = vma->vm_file->f_inode;
+
+               return !inode_is_open_for_write(inode) &&
+                       S_ISREG(inode->i_mode);
        }
 
        if (!vma->anon_vma || vma->vm_ops)
@@ -1087,7 +1090,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out_nolock;
        }
 
-       if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
+       if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
                result = SCAN_CGROUP_CHARGE_FAIL;
                goto out_nolock;
        }
@@ -1211,7 +1214,7 @@ out_up_write:
        mmap_write_unlock(mm);
 out_nolock:
        if (!IS_ERR_OR_NULL(*hpage))
-               mem_cgroup_uncharge(*hpage);
+               mem_cgroup_uncharge(page_folio(*hpage));
        trace_mm_collapse_huge_page(mm, isolated, result);
        return;
 }
@@ -1658,7 +1661,7 @@ static void collapse_file(struct mm_struct *mm,
                goto out;
        }
 
-       if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
+       if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
                result = SCAN_CGROUP_CHARGE_FAIL;
                goto out;
        }
@@ -1763,6 +1766,10 @@ static void collapse_file(struct mm_struct *mm,
                                filemap_flush(mapping);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
+                       } else if (PageWriteback(page)) {
+                               xas_unlock_irq(&xas);
+                               result = SCAN_FAIL;
+                               goto xa_unlocked;
                        } else if (trylock_page(page)) {
                                get_page(page);
                                xas_unlock_irq(&xas);
@@ -1798,7 +1805,8 @@ static void collapse_file(struct mm_struct *mm,
                        goto out_unlock;
                }
 
-               if (!is_shmem && PageDirty(page)) {
+               if (!is_shmem && (PageDirty(page) ||
+                                 PageWriteback(page))) {
                        /*
                         * khugepaged only works on read-only fd, so this
                         * page is dirty because it hasn't been flushed
@@ -1975,7 +1983,7 @@ xa_unlocked:
 out:
        VM_BUG_ON(!list_empty(&pagelist));
        if (!IS_ERR_OR_NULL(*hpage))
-               mem_cgroup_uncharge(*hpage);
+               mem_cgroup_uncharge(page_folio(*hpage));
        /* TODO: tracepoints */
 }
 
index a5716fd..0662093 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -751,7 +751,7 @@ stale:
        /*
         * We come here from above when page->mapping or !PageSwapCache
         * suggests that the node is stale; but it might be under migration.
-        * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+        * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
         * before checking whether node->kpfn has been changed.
         */
        smp_rmb();
@@ -852,9 +852,14 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
        return err;
 }
 
+static inline struct stable_node *folio_stable_node(struct folio *folio)
+{
+       return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
+}
+
 static inline struct stable_node *page_stable_node(struct page *page)
 {
-       return PageKsm(page) ? page_rmapping(page) : NULL;
+       return folio_stable_node(page_folio(page));
 }
 
 static inline void set_page_stable_node(struct page *page,
@@ -2578,7 +2583,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
                return page;            /* let do_swap_page report the error */
 
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-       if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
+       if (new_page &&
+           mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
                put_page(new_page);
                new_page = NULL;
        }
@@ -2658,26 +2664,26 @@ again:
 }
 
 #ifdef CONFIG_MIGRATION
-void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
 {
        struct stable_node *stable_node;
 
-       VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
+       VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
 
-       stable_node = page_stable_node(newpage);
+       stable_node = folio_stable_node(folio);
        if (stable_node) {
-               VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
-               stable_node->kpfn = page_to_pfn(newpage);
+               VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
+               stable_node->kpfn = folio_pfn(newfolio);
                /*
-                * newpage->mapping was set in advance; now we need smp_wmb()
+                * newfolio->mapping was set in advance; now we need smp_wmb()
                 * to make sure that the new stable_node->kpfn is visible
-                * to get_ksm_page() before it can see that oldpage->mapping
-                * has gone stale (or that PageSwapCache has been cleared).
+                * to get_ksm_page() before it can see that folio->mapping
+                * has gone stale (or that folio_test_swapcache has been cleared).
                 */
                smp_wmb();
-               set_page_stable_node(oldpage, NULL);
+               set_page_stable_node(&folio->page, NULL);
        }
 }
 #endif /* CONFIG_MIGRATION */
index 6da5020..8dab23a 100644 (file)
@@ -456,28 +456,6 @@ ino_t page_cgroup_ino(struct page *page)
        return ino;
 }
 
-static struct mem_cgroup_per_node *
-mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
-{
-       int nid = page_to_nid(page);
-
-       return memcg->nodeinfo[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_node(int nid)
-{
-       return soft_limit_tree.rb_tree_per_node[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_from_page(struct page *page)
-{
-       int nid = page_to_nid(page);
-
-       return soft_limit_tree.rb_tree_per_node[nid];
-}
-
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
                                         struct mem_cgroup_tree_per_node *mctz,
                                         unsigned long new_usage_in_excess)
@@ -548,13 +526,13 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
        return excess;
 }
 
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
 {
        unsigned long excess;
        struct mem_cgroup_per_node *mz;
        struct mem_cgroup_tree_per_node *mctz;
 
-       mctz = soft_limit_tree_from_page(page);
+       mctz = soft_limit_tree.rb_tree_per_node[nid];
        if (!mctz)
                return;
        /*
@@ -562,7 +540,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
         * because their event counter is not touched.
         */
        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-               mz = mem_cgroup_page_nodeinfo(memcg, page);
+               mz = memcg->nodeinfo[nid];
                excess = soft_limit_excess(memcg);
                /*
                 * We have to update the tree if mz is on RB-tree or
@@ -593,7 +571,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 
        for_each_node(nid) {
                mz = memcg->nodeinfo[nid];
-               mctz = soft_limit_tree_node(nid);
+               mctz = soft_limit_tree.rb_tree_per_node[nid];
                if (mctz)
                        mem_cgroup_remove_exceeded(mz, mctz);
        }
@@ -799,7 +777,6 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
-                                        struct page *page,
                                         int nr_pages)
 {
        /* pagein of a big page is an event. So, ignore page size */
@@ -842,7 +819,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  * Check events in order.
  *
  */
-static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, int nid)
 {
        /* threshold event is triggered in finer grain than soft limit */
        if (unlikely(mem_cgroup_event_ratelimit(memcg,
@@ -853,7 +830,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
                                                MEM_CGROUP_TARGET_SOFTLIMIT);
                mem_cgroup_threshold(memcg);
                if (unlikely(do_softlimit))
-                       mem_cgroup_update_tree(memcg, page);
+                       mem_cgroup_update_tree(memcg, nid);
        }
 }
 
@@ -1149,64 +1126,88 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 }
 
 #ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
        struct mem_cgroup *memcg;
 
        if (mem_cgroup_disabled())
                return;
 
-       memcg = page_memcg(page);
+       memcg = folio_memcg(folio);
 
        if (!memcg)
-               VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+               VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
        else
-               VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
+               VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
 }
 #endif
 
 /**
- * lock_page_lruvec - lock and return lruvec for a given page.
- * @page: the page
+ * folio_lruvec_lock - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
  *
  * These functions are safe to use under any of the following conditions:
- * - page locked
- * - PageLRU cleared
- * - lock_page_memcg()
- * - page->_refcount is zero
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held.
  */
-struct lruvec *lock_page_lruvec(struct page *page)
+struct lruvec *folio_lruvec_lock(struct folio *folio)
 {
-       struct lruvec *lruvec;
+       struct lruvec *lruvec = folio_lruvec(folio);
 
-       lruvec = mem_cgroup_page_lruvec(page);
        spin_lock(&lruvec->lru_lock);
-
-       lruvec_memcg_debug(lruvec, page);
+       lruvec_memcg_debug(lruvec, folio);
 
        return lruvec;
 }
 
-struct lruvec *lock_page_lruvec_irq(struct page *page)
+/**
+ * folio_lruvec_lock_irq - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
 {
-       struct lruvec *lruvec;
+       struct lruvec *lruvec = folio_lruvec(folio);
 
-       lruvec = mem_cgroup_page_lruvec(page);
        spin_lock_irq(&lruvec->lru_lock);
-
-       lruvec_memcg_debug(lruvec, page);
+       lruvec_memcg_debug(lruvec, folio);
 
        return lruvec;
 }
 
-struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+/**
+ * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ * @flags: Pointer to irqsave flags.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
+               unsigned long *flags)
 {
-       struct lruvec *lruvec;
+       struct lruvec *lruvec = folio_lruvec(folio);
 
-       lruvec = mem_cgroup_page_lruvec(page);
        spin_lock_irqsave(&lruvec->lru_lock, *flags);
-
-       lruvec_memcg_debug(lruvec, page);
+       lruvec_memcg_debug(lruvec, folio);
 
        return lruvec;
 }
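
A hedged sketch of the lock/unlock pairing for the irqsave variant (not part of this patch; the helper name is made up, and real callers would normally use the existing unlock_page_lruvec_*() helpers rather than the raw spinlock call shown here to mirror the lock taken above):

/* Hypothetical caller: peek at LRU state under the lruvec lock. */
static long example_peek_lru(struct folio *folio)
{
        unsigned long flags;
        long nr = 0;
        struct lruvec *lruvec;

        lruvec = folio_lruvec_lock_irqsave(folio, &flags);
        if (folio_test_lru(folio))
                nr = folio_nr_pages(folio);
        spin_unlock_irqrestore(&lruvec->lru_lock, flags);
        return nr;
}
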
@@ -1956,18 +1957,17 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
 }
 
 /**
- * lock_page_memcg - lock a page and memcg binding
- * @page: the page
+ * folio_memcg_lock - Bind a folio to its memcg.
+ * @folio: The folio.
  *
- * This function protects unlocked LRU pages from being moved to
+ * This function prevents unlocked LRU folios from being moved to
  * another cgroup.
  *
- * It ensures lifetime of the locked memcg. Caller is responsible
- * for the lifetime of the page.
+ * It ensures lifetime of the bound memcg.  The caller is responsible
+ * for the lifetime of the folio.
  */
-void lock_page_memcg(struct page *page)
+void folio_memcg_lock(struct folio *folio)
 {
-       struct page *head = compound_head(page); /* rmap on tail pages */
        struct mem_cgroup *memcg;
        unsigned long flags;
 
@@ -1981,7 +1981,7 @@ void lock_page_memcg(struct page *page)
        if (mem_cgroup_disabled())
                return;
 again:
-       memcg = page_memcg(head);
+       memcg = folio_memcg(folio);
        if (unlikely(!memcg))
                return;
 
@@ -1995,7 +1995,7 @@ again:
                return;
 
        spin_lock_irqsave(&memcg->move_lock, flags);
-       if (memcg != page_memcg(head)) {
+       if (memcg != folio_memcg(folio)) {
                spin_unlock_irqrestore(&memcg->move_lock, flags);
                goto again;
        }
@@ -2009,9 +2009,15 @@ again:
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
 }
+EXPORT_SYMBOL(folio_memcg_lock);
+
+void lock_page_memcg(struct page *page)
+{
+       folio_memcg_lock(page_folio(page));
+}
 EXPORT_SYMBOL(lock_page_memcg);
 
-static void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __folio_memcg_unlock(struct mem_cgroup *memcg)
 {
        if (memcg && memcg->move_lock_task == current) {
                unsigned long flags = memcg->move_lock_flags;
@@ -2026,14 +2032,22 @@ static void __unlock_page_memcg(struct mem_cgroup *memcg)
 }
 
 /**
- * unlock_page_memcg - unlock a page and memcg binding
- * @page: the page
+ * folio_memcg_unlock - Release the binding between a folio and its memcg.
+ * @folio: The folio.
+ *
+ * This releases the binding created by folio_memcg_lock().  This does
+ * not change the accounting of this folio to its memcg, but it does
+ * permit others to change it.
  */
-void unlock_page_memcg(struct page *page)
+void folio_memcg_unlock(struct folio *folio)
 {
-       struct page *head = compound_head(page);
+       __folio_memcg_unlock(folio_memcg(folio));
+}
+EXPORT_SYMBOL(folio_memcg_unlock);
 
-       __unlock_page_memcg(page_memcg(head));
+void unlock_page_memcg(struct page *page)
+{
+       folio_memcg_unlock(page_folio(page));
 }
 EXPORT_SYMBOL(unlock_page_memcg);
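
A hedged sketch of the pairing documented above (not part of this patch): the lock pins the folio<->memcg binding so that an accounting update lands in the right memcg. The helper name, the NR_FILE_DIRTY choice, and the use of the page-based __mod_lruvec_page_state() (which predates this series) are illustrative assumptions.

/* Hypothetical caller: dirty a folio and account it under the binding lock. */
static void example_account_dirty(struct folio *folio)
{
        folio_memcg_lock(folio);
        if (!folio_test_set_dirty(folio))
                __mod_lruvec_page_state(&folio->page, NR_FILE_DIRTY,
                                        folio_nr_pages(folio));
        folio_memcg_unlock(folio);
}
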
 
@@ -2734,9 +2748,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 }
 #endif
 
-static void commit_charge(struct page *page, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 {
-       VM_BUG_ON_PAGE(page_memcg(page), page);
+       VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
        /*
         * Any of the following ensures page's memcg stability:
         *
@@ -2745,7 +2759,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
         * - lock_page_memcg()
         * - exclusive reference
         */
-       page->memcg_data = (unsigned long)memcg;
+       folio->memcg_data = (unsigned long)memcg;
 }
 
 static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
@@ -3015,15 +3029,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
  */
 void __memcg_kmem_uncharge_page(struct page *page, int order)
 {
+       struct folio *folio = page_folio(page);
        struct obj_cgroup *objcg;
        unsigned int nr_pages = 1 << order;
 
-       if (!PageMemcgKmem(page))
+       if (!folio_memcg_kmem(folio))
                return;
 
-       objcg = __page_objcg(page);
+       objcg = __folio_objcg(folio);
        obj_cgroup_uncharge_pages(objcg, nr_pages);
-       page->memcg_data = 0;
+       folio->memcg_data = 0;
        obj_cgroup_put(objcg);
 }
 
@@ -3257,17 +3272,18 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
  */
 void split_page_memcg(struct page *head, unsigned int nr)
 {
-       struct mem_cgroup *memcg = page_memcg(head);
+       struct folio *folio = page_folio(head);
+       struct mem_cgroup *memcg = folio_memcg(folio);
        int i;
 
        if (mem_cgroup_disabled() || !memcg)
                return;
 
        for (i = 1; i < nr; i++)
-               head[i].memcg_data = head->memcg_data;
+               folio_page(folio, i)->memcg_data = folio->memcg_data;
 
-       if (PageMemcgKmem(head))
-               obj_cgroup_get_many(__page_objcg(head), nr - 1);
+       if (folio_memcg_kmem(folio))
+               obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
        else
                css_get_many(&memcg->css, nr - 1);
 }
@@ -3381,7 +3397,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
        if (order > 0)
                return 0;
 
-       mctz = soft_limit_tree_node(pgdat->node_id);
+       mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
 
        /*
         * Do not even bother to check the largest node if the root
@@ -4537,17 +4553,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  * As being wrong occasionally doesn't matter, updates and accesses to the
  * records are lockless and racy.
  */
-void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb)
 {
-       struct mem_cgroup *memcg = page_memcg(page);
+       struct mem_cgroup *memcg = folio_memcg(folio);
        struct memcg_cgwb_frn *frn;
        u64 now = get_jiffies_64();
        u64 oldest_at = now;
        int oldest = -1;
        int i;
 
-       trace_track_foreign_dirty(page, wb);
+       trace_track_foreign_dirty(folio, wb);
 
        /*
         * Pick the slot to use.  If there is already a slot for @wb, keep
@@ -5575,38 +5591,39 @@ static int mem_cgroup_move_account(struct page *page,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
 {
+       struct folio *folio = page_folio(page);
        struct lruvec *from_vec, *to_vec;
        struct pglist_data *pgdat;
-       unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
-       int ret;
+       unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
+       int nid, ret;
 
        VM_BUG_ON(from == to);
-       VM_BUG_ON_PAGE(PageLRU(page), page);
-       VM_BUG_ON(compound && !PageTransHuge(page));
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+       VM_BUG_ON(compound && !folio_test_multi(folio));
 
        /*
         * Prevent mem_cgroup_migrate() from looking at
         * page's memory cgroup of its source page while we change it.
         */
        ret = -EBUSY;
-       if (!trylock_page(page))
+       if (!folio_trylock(folio))
                goto out;
 
        ret = -EINVAL;
-       if (page_memcg(page) != from)
+       if (folio_memcg(folio) != from)
                goto out_unlock;
 
-       pgdat = page_pgdat(page);
+       pgdat = folio_pgdat(folio);
        from_vec = mem_cgroup_lruvec(from, pgdat);
        to_vec = mem_cgroup_lruvec(to, pgdat);
 
-       lock_page_memcg(page);
+       folio_memcg_lock(folio);
 
-       if (PageAnon(page)) {
-               if (page_mapped(page)) {
+       if (folio_test_anon(folio)) {
+               if (folio_mapped(folio)) {
                        __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
-                       if (PageTransHuge(page)) {
+                       if (folio_test_transhuge(folio)) {
                                __mod_lruvec_state(from_vec, NR_ANON_THPS,
                                                   -nr_pages);
                                __mod_lruvec_state(to_vec, NR_ANON_THPS,
@@ -5617,18 +5634,18 @@ static int mem_cgroup_move_account(struct page *page,
                __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
                __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
 
-               if (PageSwapBacked(page)) {
+               if (folio_test_swapbacked(folio)) {
                        __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
                }
 
-               if (page_mapped(page)) {
+               if (folio_mapped(folio)) {
                        __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
                }
 
-               if (PageDirty(page)) {
-                       struct address_space *mapping = page_mapping(page);
+               if (folio_test_dirty(folio)) {
+                       struct address_space *mapping = folio_mapping(folio);
 
                        if (mapping_can_writeback(mapping)) {
                                __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
@@ -5639,7 +5656,7 @@ static int mem_cgroup_move_account(struct page *page,
                }
        }
 
-       if (PageWriteback(page)) {
+       if (folio_test_writeback(folio)) {
                __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
                __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
        }
@@ -5662,20 +5679,21 @@ static int mem_cgroup_move_account(struct page *page,
        css_get(&to->css);
        css_put(&from->css);
 
-       page->memcg_data = (unsigned long)to;
+       folio->memcg_data = (unsigned long)to;
 
-       __unlock_page_memcg(from);
+       __folio_memcg_unlock(from);
 
        ret = 0;
+       nid = folio_nid(folio);
 
        local_irq_disable();
-       mem_cgroup_charge_statistics(to, page, nr_pages);
-       memcg_check_events(to, page);
-       mem_cgroup_charge_statistics(from, page, -nr_pages);
-       memcg_check_events(from, page);
+       mem_cgroup_charge_statistics(to, nr_pages);
+       memcg_check_events(to, nid);
+       mem_cgroup_charge_statistics(from, -nr_pages);
+       memcg_check_events(from, nid);
        local_irq_enable();
 out_unlock:
-       unlock_page(page);
+       folio_unlock(folio);
 out:
        return ret;
 }
@@ -6680,9 +6698,10 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                        atomic_long_read(&parent->memory.children_low_usage)));
 }
 
-static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
+static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+                       gfp_t gfp)
 {
-       unsigned int nr_pages = thp_nr_pages(page);
+       long nr_pages = folio_nr_pages(folio);
        int ret;
 
        ret = try_charge(memcg, gfp, nr_pages);
@@ -6690,38 +6709,23 @@ static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
                goto out;
 
        css_get(&memcg->css);
-       commit_charge(page, memcg);
+       commit_charge(folio, memcg);
 
        local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, page, nr_pages);
-       memcg_check_events(memcg, page);
+       mem_cgroup_charge_statistics(memcg, nr_pages);
+       memcg_check_events(memcg, folio_nid(folio));
        local_irq_enable();
 out:
        return ret;
 }
 
-/**
- * __mem_cgroup_charge - charge a newly allocated page to a cgroup
- * @page: page to charge
- * @mm: mm context of the victim
- * @gfp_mask: reclaim mode
- *
- * Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary. if @mm is NULL, try to
- * charge to the active memcg.
- *
- * Do not use this for pages allocated for swapin.
- *
- * Returns 0 on success. Otherwise, an error code is returned.
- */
-int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-                       gfp_t gfp_mask)
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 {
        struct mem_cgroup *memcg;
        int ret;
 
        memcg = get_mem_cgroup_from_mm(mm);
-       ret = charge_memcg(page, memcg, gfp_mask);
+       ret = charge_memcg(folio, memcg, gfp);
        css_put(&memcg->css);
 
        return ret;
@@ -6742,6 +6746,7 @@ int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry)
 {
+       struct folio *folio = page_folio(page);
        struct mem_cgroup *memcg;
        unsigned short id;
        int ret;
@@ -6756,7 +6761,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                memcg = get_mem_cgroup_from_mm(mm);
        rcu_read_unlock();
 
-       ret = charge_memcg(page, memcg, gfp);
+       ret = charge_memcg(folio, memcg, gfp);
 
        css_put(&memcg->css);
        return ret;
@@ -6800,7 +6805,7 @@ struct uncharge_gather {
        unsigned long nr_memory;
        unsigned long pgpgout;
        unsigned long nr_kmem;
-       struct page *dummy_page;
+       int nid;
 };
 
 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
@@ -6824,36 +6829,36 @@ static void uncharge_batch(const struct uncharge_gather *ug)
        local_irq_save(flags);
        __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
        __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-       memcg_check_events(ug->memcg, ug->dummy_page);
+       memcg_check_events(ug->memcg, ug->nid);
        local_irq_restore(flags);
 
-       /* drop reference from uncharge_page */
+       /* drop reference from uncharge_folio */
        css_put(&ug->memcg->css);
 }
 
-static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
 {
-       unsigned long nr_pages;
+       long nr_pages;
        struct mem_cgroup *memcg;
        struct obj_cgroup *objcg;
-       bool use_objcg = PageMemcgKmem(page);
+       bool use_objcg = folio_memcg_kmem(folio);
 
-       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
        /*
         * Nobody should be changing or seriously looking at
-        * page memcg or objcg at this point, we have fully
-        * exclusive access to the page.
+        * folio memcg or objcg at this point, we have fully
+        * exclusive access to the folio.
         */
        if (use_objcg) {
-               objcg = __page_objcg(page);
+               objcg = __folio_objcg(folio);
                /*
                 * This get matches the put at the end of the function and
                 * kmem pages do not hold memcg references anymore.
                 */
                memcg = get_mem_cgroup_from_objcg(objcg);
        } else {
-               memcg = __page_memcg(page);
+               memcg = __folio_memcg(folio);
        }
 
        if (!memcg)
@@ -6865,19 +6870,19 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                        uncharge_gather_clear(ug);
                }
                ug->memcg = memcg;
-               ug->dummy_page = page;
+               ug->nid = folio_nid(folio);
 
                /* pairs with css_put in uncharge_batch */
                css_get(&memcg->css);
        }
 
-       nr_pages = compound_nr(page);
+       nr_pages = folio_nr_pages(folio);
 
        if (use_objcg) {
                ug->nr_memory += nr_pages;
                ug->nr_kmem += nr_pages;
 
-               page->memcg_data = 0;
+               folio->memcg_data = 0;
                obj_cgroup_put(objcg);
        } else {
                /* LRU pages aren't accounted at the root level */
@@ -6885,28 +6890,22 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                        ug->nr_memory += nr_pages;
                ug->pgpgout++;
 
-               page->memcg_data = 0;
+               folio->memcg_data = 0;
        }
 
        css_put(&memcg->css);
 }
 
-/**
- * __mem_cgroup_uncharge - uncharge a page
- * @page: page to uncharge
- *
- * Uncharge a page previously charged with __mem_cgroup_charge().
- */
-void __mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct folio *folio)
 {
        struct uncharge_gather ug;
 
-       /* Don't touch page->lru of any random page, pre-check: */
-       if (!page_memcg(page))
+       /* Don't touch folio->lru of any random page, pre-check: */
+       if (!folio_memcg(folio))
                return;
 
        uncharge_gather_clear(&ug);
-       uncharge_page(page, &ug);
+       uncharge_folio(folio, &ug);
        uncharge_batch(&ug);
 }
 
@@ -6920,52 +6919,49 @@ void __mem_cgroup_uncharge(struct page *page)
 void __mem_cgroup_uncharge_list(struct list_head *page_list)
 {
        struct uncharge_gather ug;
-       struct page *page;
+       struct folio *folio;
 
        uncharge_gather_clear(&ug);
-       list_for_each_entry(page, page_list, lru)
-               uncharge_page(page, &ug);
+       list_for_each_entry(folio, page_list, lru)
+               uncharge_folio(folio, &ug);
        if (ug.memcg)
                uncharge_batch(&ug);
 }
 
 /**
- * mem_cgroup_migrate - charge a page's replacement
- * @oldpage: currently circulating page
- * @newpage: replacement page
+ * mem_cgroup_migrate - Charge a folio's replacement.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
  *
- * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * Charge @new as a replacement folio for @old. @old will
  * be uncharged upon free.
  *
- * Both pages must be locked, @newpage->mapping must be set up.
+ * Both folios must be locked, @new->mapping must be set up.
  */
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
        struct mem_cgroup *memcg;
-       unsigned int nr_pages;
+       long nr_pages = folio_nr_pages(new);
        unsigned long flags;
 
-       VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
-       VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
-                      newpage);
+       VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+       VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+       VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+       VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
 
        if (mem_cgroup_disabled())
                return;
 
-       /* Page cache replacement: new page already charged? */
-       if (page_memcg(newpage))
+       /* Page cache replacement: new folio already charged? */
+       if (folio_memcg(new))
                return;
 
-       memcg = page_memcg(oldpage);
-       VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
+       memcg = folio_memcg(old);
+       VM_WARN_ON_ONCE_FOLIO(!memcg, old);
        if (!memcg)
                return;
 
        /* Force-charge the new page. The old one will be freed soon */
-       nr_pages = thp_nr_pages(newpage);
-
        if (!mem_cgroup_is_root(memcg)) {
                page_counter_charge(&memcg->memory, nr_pages);
                if (do_memsw_account())
@@ -6973,11 +6969,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
        }
 
        css_get(&memcg->css);
-       commit_charge(newpage, memcg);
+       commit_charge(new, memcg);
 
        local_irq_save(flags);
-       mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
-       memcg_check_events(memcg, newpage);
+       mem_cgroup_charge_statistics(memcg, nr_pages);
+       memcg_check_events(memcg, folio_nid(new));
        local_irq_restore(flags);
 }
 
@@ -7204,8 +7200,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
-       mem_cgroup_charge_statistics(memcg, page, -nr_entries);
-       memcg_check_events(memcg, page);
+       mem_cgroup_charge_statistics(memcg, -nr_entries);
+       memcg_check_events(memcg, page_to_nid(page));
 
        css_put(&memcg->css);
 }
index 3e6449f..93078a2 100644 (file)
@@ -762,7 +762,7 @@ static int delete_from_lru_cache(struct page *p)
                 * Poisoned page might never drop its ref count to 0 so we have
                 * to uncharge it manually from its memcg.
                 */
-               mem_cgroup_uncharge(p);
+               mem_cgroup_uncharge(page_folio(p));
 
                /*
                 * drop the page count elevated by isolate_lru_page()
@@ -1147,20 +1147,6 @@ static int __get_hwpoison_page(struct page *page)
        if (!HWPoisonHandlable(head))
                return -EBUSY;
 
-       if (PageTransHuge(head)) {
-               /*
-                * Non anonymous thp exists only in allocation/free time. We
-                * can't handle such a case correctly, so let's give it up.
-                * This should be better than triggering BUG_ON when kernel
-                * tries to touch the "partially handled" page.
-                */
-               if (!PageAnon(head)) {
-                       pr_err("Memory failure: %#lx: non anonymous thp\n",
-                               page_to_pfn(page));
-                       return 0;
-               }
-       }
-
        if (get_page_unless_zero(head)) {
                if (head == compound_head(page))
                        return 1;
@@ -1708,6 +1694,20 @@ try_again:
        }
 
        if (PageTransHuge(hpage)) {
+               /*
+                * The flag must be set after the refcount is bumped,
+                * otherwise it may race with THP split.
+                * The flag can't be set in get_hwpoison_page() because that
+                * is also called by soft offline, and it only runs for the
+                * !MF_COUNT_INCREASED case, so here seems to be the best
+                * place.
+                *
+                * There is no need to care about the above error handling
+                * paths for get_hwpoison_page() since they handle either a
+                * free page or an unhandlable page.  The refcount is bumped
+                * iff the page is a valid handlable page.
+                */
+               SetPageHasHWPoisoned(hpage);
                if (try_to_split_thp_page(p, "Memory Failure") < 0) {
                        action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
                        res = -EBUSY;
index adf9b9e..4b1de80 100644 (file)
@@ -990,7 +990,7 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
        if (!new_page)
                return NULL;
 
-       if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+       if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
                put_page(new_page);
                return NULL;
        }
@@ -3019,7 +3019,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                }
        }
 
-       if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
                goto oom_free_new;
        cgroup_throttle_swaprate(new_page, GFP_KERNEL);
 
@@ -3539,7 +3539,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
-                                       workingset_refault(page, shadow);
+                                       workingset_refault(page_folio(page),
+                                                               shadow);
 
                                lru_cache_add(page);
 
@@ -3769,7 +3770,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        if (!page)
                goto oom;
 
-       if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
                goto oom_free_page;
        cgroup_throttle_swaprate(page, GFP_KERNEL);
 
@@ -3907,6 +3908,15 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
                return ret;
 
        /*
+        * Just back off if any subpage of a THP is corrupted, otherwise
+        * the corrupted page may be mapped by the PMD silently and escape
+        * the check.  This kind of THP can only be PTE mapped.  Access to
+        * the corrupted subpage should trigger SIGBUS as expected.
+        */
+       if (unlikely(PageHasHWPoisoned(page)))
+               return ret;
+
+       /*
         * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
@@ -4193,7 +4203,8 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
        if (!vmf->cow_page)
                return VM_FAULT_OOM;
 
-       if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
+       if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
+                               GFP_KERNEL)) {
                put_page(vmf->cow_page);
                return VM_FAULT_OOM;
        }
@@ -4258,7 +4269,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults).
  * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
  * If mmap_lock is released, vma may become invalid (for example
  * by other thread calling munmap()).
  */
@@ -4499,7 +4510,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
  * concurrent faults).
  *
  * The mmap_lock may have been released depending on flags and our return value.
- * See filemap_fault() and __lock_page_or_retry().
+ * See filemap_fault() and __folio_lock_or_retry().
  */
 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 {
@@ -4603,7 +4614,7 @@ unlock:
  * By the time we get here, we already hold the mm semaphore
  *
  * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
  */
 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
@@ -4759,7 +4770,7 @@ static inline void mm_account_fault(struct pt_regs *regs,
  * By the time we get here, we already hold the mm semaphore
  *
  * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
  */
 vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
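
The mm/memory.c hunks above switch the mem_cgroup_charge() call sites to the
folio-based API by wrapping the page with page_folio() at the call site.  A
minimal, hypothetical sketch of that calling convention (not part of the patch;
alloc_charged_page() is an invented name, only mem_cgroup_charge() and
page_folio() come from the series):

#include <linux/gfp.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>

static struct page *alloc_charged_page(struct mm_struct *mm, gfp_t gfp)
{
        struct page *page = alloc_page(gfp);

        if (!page)
                return NULL;

        /* The memcg charge API is folio-based now; convert at the call site. */
        if (mem_cgroup_charge(page_folio(page), mm, gfp)) {
                put_page(page);
                return NULL;
        }
        return page;
}
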
index d12e060..f4b4be7 100644 (file)
@@ -2196,6 +2196,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(alloc_pages);
 
+struct folio *folio_alloc(gfp_t gfp, unsigned order)
+{
+       struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+       if (page && order > 1)
+               prep_transhuge_page(page);
+       return (struct folio *)page;
+}
+EXPORT_SYMBOL(folio_alloc);
+
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
        struct mempolicy *pol = mpol_dup(vma_policy(src));
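
folio_alloc() above is a thin wrapper that asks for a __GFP_COMP allocation and
returns it as a folio.  A hypothetical usage sketch, assuming ordinary kernel
context (folio_alloc_demo() is invented; folio_nr_pages() and folio_put() are
existing folio helpers):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static int folio_alloc_demo(void)
{
        /* An order-2 folio is four physically contiguous pages. */
        struct folio *folio = folio_alloc(GFP_KERNEL, 2);

        if (!folio)
                return -ENOMEM;

        pr_info("allocated a folio of %ld pages\n", folio_nr_pages(folio));

        folio_put(folio);       /* drop the reference taken by folio_alloc() */
        return 0;
}
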
index 0b8afbe..b933d0f 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/kmemleak.h>
 #include <linux/export.h>
 #include <linux/mempool.h>
-#include <linux/blkdev.h>
 #include <linux/writeback.h>
 #include "slab.h"
 
index ed593bf..5a66a71 100644 (file)
@@ -505,7 +505,7 @@ void free_devmap_managed_page(struct page *page)
 
        __ClearPageWaiters(page);
 
-       mem_cgroup_uncharge(page);
+       mem_cgroup_uncharge(page_folio(page));
 
        /*
         * When a device_private page is freed, the page->mapping field
index 1852d78..efa9941 100644 (file)
@@ -364,7 +364,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
         */
        expected_count += is_device_private_page(page);
        if (mapping)
-               expected_count += thp_nr_pages(page) + page_has_private(page);
+               expected_count += compound_nr(page) + page_has_private(page);
 
        return expected_count;
 }
@@ -377,74 +377,75 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
-int migrate_page_move_mapping(struct address_space *mapping,
-               struct page *newpage, struct page *page, int extra_count)
+int folio_migrate_mapping(struct address_space *mapping,
+               struct folio *newfolio, struct folio *folio, int extra_count)
 {
-       XA_STATE(xas, &mapping->i_pages, page_index(page));
+       XA_STATE(xas, &mapping->i_pages, folio_index(folio));
        struct zone *oldzone, *newzone;
        int dirty;
-       int expected_count = expected_page_refs(mapping, page) + extra_count;
-       int nr = thp_nr_pages(page);
+       int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
+       long nr = folio_nr_pages(folio);
 
        if (!mapping) {
                /* Anonymous page without mapping */
-               if (page_count(page) != expected_count)
+               if (folio_ref_count(folio) != expected_count)
                        return -EAGAIN;
 
                /* No turning back from here */
-               newpage->index = page->index;
-               newpage->mapping = page->mapping;
-               if (PageSwapBacked(page))
-                       __SetPageSwapBacked(newpage);
+               newfolio->index = folio->index;
+               newfolio->mapping = folio->mapping;
+               if (folio_test_swapbacked(folio))
+                       __folio_set_swapbacked(newfolio);
 
                return MIGRATEPAGE_SUCCESS;
        }
 
-       oldzone = page_zone(page);
-       newzone = page_zone(newpage);
+       oldzone = folio_zone(folio);
+       newzone = folio_zone(newfolio);
 
        xas_lock_irq(&xas);
-       if (page_count(page) != expected_count || xas_load(&xas) != page) {
+       if (folio_ref_count(folio) != expected_count ||
+           xas_load(&xas) != folio) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }
 
-       if (!page_ref_freeze(page, expected_count)) {
+       if (!folio_ref_freeze(folio, expected_count)) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }
 
        /*
-        * Now we know that no one else is looking at the page:
+        * Now we know that no one else is looking at the folio:
         * no turning back from here.
         */
-       newpage->index = page->index;
-       newpage->mapping = page->mapping;
-       page_ref_add(newpage, nr); /* add cache reference */
-       if (PageSwapBacked(page)) {
-               __SetPageSwapBacked(newpage);
-               if (PageSwapCache(page)) {
-                       SetPageSwapCache(newpage);
-                       set_page_private(newpage, page_private(page));
+       newfolio->index = folio->index;
+       newfolio->mapping = folio->mapping;
+       folio_ref_add(newfolio, nr); /* add cache reference */
+       if (folio_test_swapbacked(folio)) {
+               __folio_set_swapbacked(newfolio);
+               if (folio_test_swapcache(folio)) {
+                       folio_set_swapcache(newfolio);
+                       newfolio->private = folio_get_private(folio);
                }
        } else {
-               VM_BUG_ON_PAGE(PageSwapCache(page), page);
+               VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
        }
 
        /* Move dirty while page refs frozen and newpage not yet exposed */
-       dirty = PageDirty(page);
+       dirty = folio_test_dirty(folio);
        if (dirty) {
-               ClearPageDirty(page);
-               SetPageDirty(newpage);
+               folio_clear_dirty(folio);
+               folio_set_dirty(newfolio);
        }
 
-       xas_store(&xas, newpage);
-       if (PageTransHuge(page)) {
+       xas_store(&xas, newfolio);
+       if (nr > 1) {
                int i;
 
                for (i = 1; i < nr; i++) {
                        xas_next(&xas);
-                       xas_store(&xas, newpage);
+                       xas_store(&xas, newfolio);
                }
        }
 
@@ -453,7 +454,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
         * to one less reference.
         * We know this isn't the last reference.
         */
-       page_ref_unfreeze(page, expected_count - nr);
+       folio_ref_unfreeze(folio, expected_count - nr);
 
        xas_unlock(&xas);
        /* Leave irq disabled to prevent preemption while updating stats */
@@ -472,18 +473,18 @@ int migrate_page_move_mapping(struct address_space *mapping,
                struct lruvec *old_lruvec, *new_lruvec;
                struct mem_cgroup *memcg;
 
-               memcg = page_memcg(page);
+               memcg = folio_memcg(folio);
                old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
                new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
 
                __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
                __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
-               if (PageSwapBacked(page) && !PageSwapCache(page)) {
+               if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
                        __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
                }
 #ifdef CONFIG_SWAP
-               if (PageSwapCache(page)) {
+               if (folio_test_swapcache(folio)) {
                        __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
                }
@@ -499,11 +500,11 @@ int migrate_page_move_mapping(struct address_space *mapping,
 
        return MIGRATEPAGE_SUCCESS;
 }
-EXPORT_SYMBOL(migrate_page_move_mapping);
+EXPORT_SYMBOL(folio_migrate_mapping);
 
 /*
  * The expected number of remaining references is the same as that
- * of migrate_page_move_mapping().
+ * of folio_migrate_mapping().
  */
 int migrate_huge_page_move_mapping(struct address_space *mapping,
                                   struct page *newpage, struct page *page)
@@ -538,91 +539,87 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 }
 
 /*
- * Copy the page to its new location
+ * Copy the flags and some other ancillary information
  */
-void migrate_page_states(struct page *newpage, struct page *page)
+void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
 {
        int cpupid;
 
-       if (PageError(page))
-               SetPageError(newpage);
-       if (PageReferenced(page))
-               SetPageReferenced(newpage);
-       if (PageUptodate(page))
-               SetPageUptodate(newpage);
-       if (TestClearPageActive(page)) {
-               VM_BUG_ON_PAGE(PageUnevictable(page), page);
-               SetPageActive(newpage);
-       } else if (TestClearPageUnevictable(page))
-               SetPageUnevictable(newpage);
-       if (PageWorkingset(page))
-               SetPageWorkingset(newpage);
-       if (PageChecked(page))
-               SetPageChecked(newpage);
-       if (PageMappedToDisk(page))
-               SetPageMappedToDisk(newpage);
-
-       /* Move dirty on pages not done by migrate_page_move_mapping() */
-       if (PageDirty(page))
-               SetPageDirty(newpage);
-
-       if (page_is_young(page))
-               set_page_young(newpage);
-       if (page_is_idle(page))
-               set_page_idle(newpage);
+       if (folio_test_error(folio))
+               folio_set_error(newfolio);
+       if (folio_test_referenced(folio))
+               folio_set_referenced(newfolio);
+       if (folio_test_uptodate(folio))
+               folio_mark_uptodate(newfolio);
+       if (folio_test_clear_active(folio)) {
+               VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
+               folio_set_active(newfolio);
+       } else if (folio_test_clear_unevictable(folio))
+               folio_set_unevictable(newfolio);
+       if (folio_test_workingset(folio))
+               folio_set_workingset(newfolio);
+       if (folio_test_checked(folio))
+               folio_set_checked(newfolio);
+       if (folio_test_mappedtodisk(folio))
+               folio_set_mappedtodisk(newfolio);
+
+       /* Move dirty on pages not done by folio_migrate_mapping() */
+       if (folio_test_dirty(folio))
+               folio_set_dirty(newfolio);
+
+       if (folio_test_young(folio))
+               folio_set_young(newfolio);
+       if (folio_test_idle(folio))
+               folio_set_idle(newfolio);
 
        /*
         * Copy NUMA information to the new page, to prevent over-eager
         * future migrations of this same page.
         */
-       cpupid = page_cpupid_xchg_last(page, -1);
-       page_cpupid_xchg_last(newpage, cpupid);
+       cpupid = page_cpupid_xchg_last(&folio->page, -1);
+       page_cpupid_xchg_last(&newfolio->page, cpupid);
 
-       ksm_migrate_page(newpage, page);
+       folio_migrate_ksm(newfolio, folio);
        /*
         * Please do not reorder this without considering how mm/ksm.c's
         * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
         */
-       if (PageSwapCache(page))
-               ClearPageSwapCache(page);
-       ClearPagePrivate(page);
+       if (folio_test_swapcache(folio))
+               folio_clear_swapcache(folio);
+       folio_clear_private(folio);
 
        /* page->private contains hugetlb specific flags */
-       if (!PageHuge(page))
-               set_page_private(page, 0);
+       if (!folio_test_hugetlb(folio))
+               folio->private = NULL;
 
        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
-       if (PageWriteback(newpage))
-               end_page_writeback(newpage);
+       if (folio_test_writeback(newfolio))
+               folio_end_writeback(newfolio);
 
        /*
         * PG_readahead shares the same bit with PG_reclaim.  The above
         * end_page_writeback() may clear PG_readahead mistakenly, so set the
         * bit after that.
         */
-       if (PageReadahead(page))
-               SetPageReadahead(newpage);
+       if (folio_test_readahead(folio))
+               folio_set_readahead(newfolio);
 
-       copy_page_owner(page, newpage);
+       folio_copy_owner(newfolio, folio);
 
-       if (!PageHuge(page))
-               mem_cgroup_migrate(page, newpage);
+       if (!folio_test_hugetlb(folio))
+               mem_cgroup_migrate(folio, newfolio);
 }
-EXPORT_SYMBOL(migrate_page_states);
+EXPORT_SYMBOL(folio_migrate_flags);
 
-void migrate_page_copy(struct page *newpage, struct page *page)
+void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
 {
-       if (PageHuge(page) || PageTransHuge(page))
-               copy_huge_page(newpage, page);
-       else
-               copy_highpage(newpage, page);
-
-       migrate_page_states(newpage, page);
+       folio_copy(newfolio, folio);
+       folio_migrate_flags(newfolio, folio);
 }
-EXPORT_SYMBOL(migrate_page_copy);
+EXPORT_SYMBOL(folio_migrate_copy);
 
 /************************************************************
  *                    Migration functions
@@ -638,19 +635,21 @@ int migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
 {
+       struct folio *newfolio = page_folio(newpage);
+       struct folio *folio = page_folio(page);
        int rc;
 
-       BUG_ON(PageWriteback(page));    /* Writeback must be complete */
+       BUG_ON(folio_test_writeback(folio));    /* Writeback must be complete */
 
-       rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+       rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
 
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
 
        if (mode != MIGRATE_SYNC_NO_COPY)
-               migrate_page_copy(newpage, page);
+               folio_migrate_copy(newfolio, folio);
        else
-               migrate_page_states(newpage, page);
+               folio_migrate_flags(newfolio, folio);
        return MIGRATEPAGE_SUCCESS;
 }
 EXPORT_SYMBOL(migrate_page);
@@ -2468,7 +2467,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
  * @page: struct page to check
  *
  * Pinned pages cannot be migrated. This is the same test as in
- * migrate_page_move_mapping(), except that here we allow migration of a
+ * folio_migrate_mapping(), except that here we allow migration of a
  * ZONE_DEVICE page.
  */
 static bool migrate_vma_check_page(struct page *page)
@@ -2846,7 +2845,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 
        if (unlikely(anon_vma_prepare(vma)))
                goto abort;
-       if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
                goto abort;
 
        /*
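
With the conversion above, migrate_page() becomes a thin wrapper over the new
folio helpers.  As a rough sketch of how a simple filesystem's ->migratepage
callback could be built on the same three calls (purely illustrative; "myfs"
and the function name are invented):

#include <linux/fs.h>
#include <linux/migrate.h>
#include <linux/pagemap.h>

static int myfs_migratepage(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
        struct folio *newfolio = page_folio(newpage);
        struct folio *folio = page_folio(page);
        int rc;

        /* Move the page cache entry and transfer the cache reference. */
        rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;

        /* Copy data plus flags, or flags only for MIGRATE_SYNC_NO_COPY. */
        if (mode != MIGRATE_SYNC_NO_COPY)
                folio_migrate_copy(newfolio, folio);
        else
                folio_migrate_flags(newfolio, folio);
        return MIGRATEPAGE_SUCCESS;
}
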
index 16d2ee1..e263d62 100644 (file)
@@ -271,6 +271,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
        /* Phase 1: page isolation */
        for (i = 0; i < nr; i++) {
                struct page *page = pvec->pages[i];
+               struct folio *folio = page_folio(page);
 
                if (TestClearPageMlocked(page)) {
                        /*
@@ -278,7 +279,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
                         * so we can spare the get_page() here.
                         */
                        if (TestClearPageLRU(page)) {
-                               lruvec = relock_page_lruvec_irq(page, lruvec);
+                               lruvec = folio_lruvec_relock_irq(folio, lruvec);
                                del_page_from_lru_list(page, lruvec);
                                continue;
                        } else
index 02d2427..41ef204 100644 (file)
@@ -27,7 +27,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/compiler.h>
 #include <linux/mount.h>
index 831340e..989f35a 100644 (file)
@@ -1150,7 +1150,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
        struct task_struct *task;
        struct task_struct *p;
        unsigned int f_flags;
-       bool reap = true;
+       bool reap = false;
        struct pid *pid;
        long ret = 0;
 
@@ -1177,15 +1177,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
                goto put_task;
        }
 
-       mm = p->mm;
-       mmgrab(mm);
-
-       /* If the work has been done already, just exit with success */
-       if (test_bit(MMF_OOM_SKIP, &mm->flags))
-               reap = false;
-       else if (!task_will_free_mem(p)) {
-               reap = false;
-               ret = -EINVAL;
+       if (mmget_not_zero(p->mm)) {
+               mm = p->mm;
+               if (task_will_free_mem(p))
+                       reap = true;
+               else {
+                       /* Error only if the work has not been done already */
+                       if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+                               ret = -EINVAL;
+               }
        }
        task_unlock(p);
 
@@ -1201,7 +1201,8 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
        mmap_read_unlock(mm);
 
 drop_mm:
-       mmdrop(mm);
+       if (mm)
+               mmput(mm);
 put_task:
        put_task_struct(task);
 put_pid:
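
The process_mrelease() fix above replaces the bare mmgrab() with
mmget_not_zero(), so the syscall now holds a real user reference on the target
mm (paired with mmput()) instead of only pinning the mm_struct allocation.  A
minimal sketch of that pinning pattern, with pin_task_mm() invented purely for
illustration:

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>

static struct mm_struct *pin_task_mm(struct task_struct *p)
{
        struct mm_struct *mm = NULL;

        task_lock(p);
        if (p->mm && mmget_not_zero(p->mm))
                mm = p->mm;             /* caller must mmput(mm) when done */
        task_unlock(p);

        return mm;
}
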
index 4812a17..9c64490 100644 (file)
@@ -562,12 +562,12 @@ static unsigned long wp_next_time(unsigned long cur_time)
        return cur_time;
 }
 
-static void wb_domain_writeout_inc(struct wb_domain *dom,
+static void wb_domain_writeout_add(struct wb_domain *dom,
                                   struct fprop_local_percpu *completions,
-                                  unsigned int max_prop_frac)
+                                  unsigned int max_prop_frac, long nr)
 {
-       __fprop_inc_percpu_max(&dom->completions, completions,
-                              max_prop_frac);
+       __fprop_add_percpu_max(&dom->completions, completions,
+                              max_prop_frac, nr);
        /* First event after period switching was turned off? */
        if (unlikely(!dom->period_time)) {
                /*
@@ -583,20 +583,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom,
 
 /*
  * Increment @wb's writeout completion count and the global writeout
- * completion count. Called from test_clear_page_writeback().
+ * completion count. Called from __folio_end_writeback().
  */
-static inline void __wb_writeout_inc(struct bdi_writeback *wb)
+static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
 {
        struct wb_domain *cgdom;
 
-       inc_wb_stat(wb, WB_WRITTEN);
-       wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
-                              wb->bdi->max_prop_frac);
+       wb_stat_mod(wb, WB_WRITTEN, nr);
+       wb_domain_writeout_add(&global_wb_domain, &wb->completions,
+                              wb->bdi->max_prop_frac, nr);
 
        cgdom = mem_cgroup_wb_domain(wb);
        if (cgdom)
-               wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
-                                      wb->bdi->max_prop_frac);
+               wb_domain_writeout_add(cgdom, wb_memcg_completions(wb),
+                                      wb->bdi->max_prop_frac, nr);
 }
 
 void wb_writeout_inc(struct bdi_writeback *wb)
@@ -604,7 +604,7 @@ void wb_writeout_inc(struct bdi_writeback *wb)
        unsigned long flags;
 
        local_irq_save(flags);
-       __wb_writeout_inc(wb);
+       __wb_writeout_add(wb, 1);
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(wb_writeout_inc);
@@ -1084,7 +1084,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
         * write_bandwidth = ---------------------------------------------------
         *                                          period
         *
-        * @written may have decreased due to account_page_redirty().
+        * @written may have decreased due to folio_account_redirty().
         * Avoid underflowing @bw calculation.
         */
        bw = written - min(written, wb->written_stamp);
@@ -2381,44 +2381,44 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 }
 
 /**
- * write_one_page - write out a single page and wait on I/O
- * @page: the page to write
+ * folio_write_one - write out a single folio and wait on I/O.
+ * @folio: The folio to write.
  *
- * The page must be locked by the caller and will be unlocked upon return.
+ * The folio must be locked by the caller and will be unlocked upon return.
  *
  * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
  * function returns.
  *
  * Return: %0 on success, negative error code otherwise
  */
-int write_one_page(struct page *page)
+int folio_write_one(struct folio *folio)
 {
-       struct address_space *mapping = page->mapping;
+       struct address_space *mapping = folio->mapping;
        int ret = 0;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
-               .nr_to_write = 1,
+               .nr_to_write = folio_nr_pages(folio),
        };
 
-       BUG_ON(!PageLocked(page));
+       BUG_ON(!folio_test_locked(folio));
 
-       wait_on_page_writeback(page);
+       folio_wait_writeback(folio);
 
-       if (clear_page_dirty_for_io(page)) {
-               get_page(page);
-               ret = mapping->a_ops->writepage(page, &wbc);
+       if (folio_clear_dirty_for_io(folio)) {
+               folio_get(folio);
+               ret = mapping->a_ops->writepage(&folio->page, &wbc);
                if (ret == 0)
-                       wait_on_page_writeback(page);
-               put_page(page);
+                       folio_wait_writeback(folio);
+               folio_put(folio);
        } else {
-               unlock_page(page);
+               folio_unlock(folio);
        }
 
        if (!ret)
                ret = filemap_check_errors(mapping);
        return ret;
 }
-EXPORT_SYMBOL(write_one_page);
+EXPORT_SYMBOL(folio_write_one);
 
 /*
  * For address_spaces which do not use buffers nor write back.
@@ -2438,29 +2438,30 @@ EXPORT_SYMBOL(__set_page_dirty_no_writeback);
  *
  * NOTE: This relies on being atomic wrt interrupts.
  */
-static void account_page_dirtied(struct page *page,
+static void folio_account_dirtied(struct folio *folio,
                struct address_space *mapping)
 {
        struct inode *inode = mapping->host;
 
-       trace_writeback_dirty_page(page, mapping);
+       trace_writeback_dirty_folio(folio, mapping);
 
        if (mapping_can_writeback(mapping)) {
                struct bdi_writeback *wb;
+               long nr = folio_nr_pages(folio);
 
-               inode_attach_wb(inode, page);
+               inode_attach_wb(inode, &folio->page);
                wb = inode_to_wb(inode);
 
-               __inc_lruvec_page_state(page, NR_FILE_DIRTY);
-               __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-               __inc_node_page_state(page, NR_DIRTIED);
-               inc_wb_stat(wb, WB_RECLAIMABLE);
-               inc_wb_stat(wb, WB_DIRTIED);
-               task_io_account_write(PAGE_SIZE);
-               current->nr_dirtied++;
-               __this_cpu_inc(bdp_ratelimits);
+               __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+               __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
+               __node_stat_mod_folio(folio, NR_DIRTIED, nr);
+               wb_stat_mod(wb, WB_RECLAIMABLE, nr);
+               wb_stat_mod(wb, WB_DIRTIED, nr);
+               task_io_account_write(nr * PAGE_SIZE);
+               current->nr_dirtied += nr;
+               __this_cpu_add(bdp_ratelimits, nr);
 
-               mem_cgroup_track_foreign_dirty(page, wb);
+               mem_cgroup_track_foreign_dirty(folio, wb);
        }
 }
 
@@ -2469,130 +2470,152 @@ static void account_page_dirtied(struct page *page,
  *
  * Caller must hold lock_page_memcg().
  */
-void account_page_cleaned(struct page *page, struct address_space *mapping,
+void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
                          struct bdi_writeback *wb)
 {
        if (mapping_can_writeback(mapping)) {
-               dec_lruvec_page_state(page, NR_FILE_DIRTY);
-               dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-               dec_wb_stat(wb, WB_RECLAIMABLE);
-               task_io_account_cancelled_write(PAGE_SIZE);
+               long nr = folio_nr_pages(folio);
+               lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+               zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+               wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+               task_io_account_cancelled_write(nr * PAGE_SIZE);
        }
 }
 
 /*
- * Mark the page dirty, and set it dirty in the page cache, and mark the inode
- * dirty.
+ * Mark the folio dirty, and set it dirty in the page cache, and mark
+ * the inode dirty.
  *
- * If warn is true, then emit a warning if the page is not uptodate and has
+ * If warn is true, then emit a warning if the folio is not uptodate and has
  * not been truncated.
  *
  * The caller must hold lock_page_memcg().
  */
-void __set_page_dirty(struct page *page, struct address_space *mapping,
+void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
                             int warn)
 {
        unsigned long flags;
 
        xa_lock_irqsave(&mapping->i_pages, flags);
-       if (page->mapping) {    /* Race with truncate? */
-               WARN_ON_ONCE(warn && !PageUptodate(page));
-               account_page_dirtied(page, mapping);
-               __xa_set_mark(&mapping->i_pages, page_index(page),
+       if (folio->mapping) {   /* Race with truncate? */
+               WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
+               folio_account_dirtied(folio, mapping);
+               __xa_set_mark(&mapping->i_pages, folio_index(folio),
                                PAGECACHE_TAG_DIRTY);
        }
        xa_unlock_irqrestore(&mapping->i_pages, flags);
 }
 
-/*
- * For address_spaces which do not use buffers.  Just tag the page as dirty in
- * the xarray.
+/**
+ * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
+ * @mapping: Address space this folio belongs to.
+ * @folio: Folio to be marked as dirty.
+ *
+ * Filesystems which do not use buffer heads should call this function
+ * from their set_page_dirty address space operation.  It ignores the
+ * contents of folio_get_private(), so if the filesystem marks individual
+ * blocks as dirty, the filesystem should handle that itself.
  *
- * This is also used when a single buffer is being dirtied: we want to set the
- * page dirty in that case, but not all the buffers.  This is a "bottom-up"
- * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ * This is also sometimes used by filesystems which use buffer_heads when
+ * a single buffer is being dirtied: we want to set the folio dirty in
+ * that case, but not all the buffers.  This is a "bottom-up" dirtying,
+ * whereas __set_page_dirty_buffers() is a "top-down" dirtying.
  *
- * The caller must ensure this doesn't race with truncation.  Most will simply
- * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
- * the pte lock held, which also locks out truncation.
+ * The caller must ensure this doesn't race with truncation.  Most will
+ * simply hold the folio lock, but e.g. zap_pte_range() calls with the
+ * folio mapped and the pte lock held, which also locks out truncation.
  */
-int __set_page_dirty_nobuffers(struct page *page)
+bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
 {
-       lock_page_memcg(page);
-       if (!TestSetPageDirty(page)) {
-               struct address_space *mapping = page_mapping(page);
+       folio_memcg_lock(folio);
+       if (folio_test_set_dirty(folio)) {
+               folio_memcg_unlock(folio);
+               return false;
+       }
 
-               if (!mapping) {
-                       unlock_page_memcg(page);
-                       return 1;
-               }
-               __set_page_dirty(page, mapping, !PagePrivate(page));
-               unlock_page_memcg(page);
+       __folio_mark_dirty(folio, mapping, !folio_test_private(folio));
+       folio_memcg_unlock(folio);
 
-               if (mapping->host) {
-                       /* !PageAnon && !swapper_space */
-                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-               }
-               return 1;
+       if (mapping->host) {
+               /* !PageAnon && !swapper_space */
+               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }
-       unlock_page_memcg(page);
-       return 0;
+       return true;
 }
-EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+EXPORT_SYMBOL(filemap_dirty_folio);
 
-/*
- * Call this whenever redirtying a page, to de-account the dirty counters
- * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
- * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
- * systematic errors in balanced_dirty_ratelimit and the dirty pages position
- * control.
+/**
+ * folio_account_redirty - Manually account for redirtying a page.
+ * @folio: The folio which is being redirtied.
+ *
+ * Most filesystems should call folio_redirty_for_writepage() instead
+ * of this function.  If your filesystem is doing writeback outside the
+ * context of a writeback_control(), it can call this when redirtying
+ * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
+ * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
+ * WB_WRITTEN) in the long term.  The mismatches will lead to systematic
+ * errors in balanced_dirty_ratelimit and the dirty pages position control.
  */
-void account_page_redirty(struct page *page)
+void folio_account_redirty(struct folio *folio)
 {
-       struct address_space *mapping = page->mapping;
+       struct address_space *mapping = folio->mapping;
 
        if (mapping && mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
                struct bdi_writeback *wb;
                struct wb_lock_cookie cookie = {};
+               long nr = folio_nr_pages(folio);
 
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
-               current->nr_dirtied--;
-               dec_node_page_state(page, NR_DIRTIED);
-               dec_wb_stat(wb, WB_DIRTIED);
+               current->nr_dirtied -= nr;
+               node_stat_mod_folio(folio, NR_DIRTIED, -nr);
+               wb_stat_mod(wb, WB_DIRTIED, -nr);
                unlocked_inode_to_wb_end(inode, &cookie);
        }
 }
-EXPORT_SYMBOL(account_page_redirty);
+EXPORT_SYMBOL(folio_account_redirty);
 
-/*
- * When a writepage implementation decides that it doesn't want to write this
- * page for some reason, it should redirty the locked page via
- * redirty_page_for_writepage() and it should then unlock the page and return 0
+/**
+ * folio_redirty_for_writepage - Decline to write a dirty folio.
+ * @wbc: The writeback control.
+ * @folio: The folio.
+ *
+ * When a writepage implementation decides that it doesn't want to write
+ * @folio for some reason, it should call this function, unlock @folio and
+ * return 0.
+ *
+ * Return: True if we redirtied the folio.  False if someone else dirtied
+ * it first.
  */
-int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
+bool folio_redirty_for_writepage(struct writeback_control *wbc,
+               struct folio *folio)
 {
-       int ret;
+       bool ret;
+       long nr = folio_nr_pages(folio);
+
+       wbc->pages_skipped += nr;
+       ret = filemap_dirty_folio(folio->mapping, folio);
+       folio_account_redirty(folio);
 
-       wbc->pages_skipped++;
-       ret = __set_page_dirty_nobuffers(page);
-       account_page_redirty(page);
        return ret;
 }
-EXPORT_SYMBOL(redirty_page_for_writepage);
+EXPORT_SYMBOL(folio_redirty_for_writepage);
 
-/*
- * Dirty a page.
+/**
+ * folio_mark_dirty - Mark a folio as being modified.
+ * @folio: The folio.
  *
- * For pages with a mapping this should be done under the page lock for the
- * benefit of asynchronous memory errors who prefer a consistent dirty state.
- * This rule can be broken in some special cases, but should be better not to.
+ * For folios with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory errors, which prefer a
+ * consistent dirty state.  This rule can be broken in some special
+ * cases, but it is better not to.
+ *
+ * Return: True if the folio was newly dirtied, false if it was already dirty.
  */
-int set_page_dirty(struct page *page)
+bool folio_mark_dirty(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(page);
+       struct address_space *mapping = folio_mapping(folio);
 
-       page = compound_head(page);
        if (likely(mapping)) {
                /*
                 * readahead/lru_deactivate_page could remain
@@ -2604,17 +2627,17 @@ int set_page_dirty(struct page *page)
                 * it will confuse readahead and make it restart the size rampup
                 * process. But it's a trivial problem.
                 */
-               if (PageReclaim(page))
-                       ClearPageReclaim(page);
-               return mapping->a_ops->set_page_dirty(page);
+               if (folio_test_reclaim(folio))
+                       folio_clear_reclaim(folio);
+               return mapping->a_ops->set_page_dirty(&folio->page);
        }
-       if (!PageDirty(page)) {
-               if (!TestSetPageDirty(page))
-                       return 1;
+       if (!folio_test_dirty(folio)) {
+               if (!folio_test_set_dirty(folio))
+                       return true;
        }
-       return 0;
+       return false;
 }
-EXPORT_SYMBOL(set_page_dirty);
+EXPORT_SYMBOL(folio_mark_dirty);
 
 /*
  * set_page_dirty() is racy if the caller has no reference against
@@ -2650,49 +2673,49 @@ EXPORT_SYMBOL(set_page_dirty_lock);
  * page without actually doing it through the VM. Can you say "ext3 is
  * horribly ugly"? Thought you could.
  */
-void __cancel_dirty_page(struct page *page)
+void __folio_cancel_dirty(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(page);
+       struct address_space *mapping = folio_mapping(folio);
 
        if (mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
                struct bdi_writeback *wb;
                struct wb_lock_cookie cookie = {};
 
-               lock_page_memcg(page);
+               folio_memcg_lock(folio);
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
 
-               if (TestClearPageDirty(page))
-                       account_page_cleaned(page, mapping, wb);
+               if (folio_test_clear_dirty(folio))
+                       folio_account_cleaned(folio, mapping, wb);
 
                unlocked_inode_to_wb_end(inode, &cookie);
-               unlock_page_memcg(page);
+               folio_memcg_unlock(folio);
        } else {
-               ClearPageDirty(page);
+               folio_clear_dirty(folio);
        }
 }
-EXPORT_SYMBOL(__cancel_dirty_page);
+EXPORT_SYMBOL(__folio_cancel_dirty);
 
 /*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- *
- * This is for preparing to put the page under writeout.  We leave the page
- * tagged as dirty in the xarray so that a concurrent write-for-sync
- * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
- * implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and xarray dirty tag
- * back into sync.
- *
- * This incoherency between the page's dirty flag and xarray tag is
- * unfortunate, but it only exists while the page is locked.
+ * Clear a folio's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the folio was previously dirty.
+ *
+ * This is for preparing to put the folio under writeout.  We leave
+ * the folio tagged as dirty in the xarray so that a concurrent
+ * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
+ * The ->writepage implementation will run either folio_start_writeback()
+ * or folio_mark_dirty(), at which stage we bring the folio's dirty flag
+ * and xarray dirty tag back into sync.
+ *
+ * This incoherency between the folio's dirty flag and xarray tag is
+ * unfortunate, but it only exists while the folio is locked.
  */
-int clear_page_dirty_for_io(struct page *page)
+bool folio_clear_dirty_for_io(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(page);
-       int ret = 0;
+       struct address_space *mapping = folio_mapping(folio);
+       bool ret = false;
 
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
        if (mapping && mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
@@ -2705,48 +2728,49 @@ int clear_page_dirty_for_io(struct page *page)
                 * We use this sequence to make sure that
                 *  (a) we account for dirty stats properly
                 *  (b) we tell the low-level filesystem to
-                *      mark the whole page dirty if it was
+                *      mark the whole folio dirty if it was
                 *      dirty in a pagetable. Only to then
-                *  (c) clean the page again and return 1 to
+                *  (c) clean the folio again and return 1 to
                 *      cause the writeback.
                 *
                 * This way we avoid all nasty races with the
                 * dirty bit in multiple places and clearing
                 * them concurrently from different threads.
                 *
-                * Note! Normally the "set_page_dirty(page)"
+                * Note! Normally the "folio_mark_dirty(folio)"
                 * has no effect on the actual dirty bit - since
                 * that will already usually be set. But we
                 * need the side effects, and it can help us
                 * avoid races.
                 *
-                * We basically use the page "master dirty bit"
+                * We basically use the folio "master dirty bit"
                 * as a serialization point for all the different
                 * threads doing their things.
                 */
-               if (page_mkclean(page))
-                       set_page_dirty(page);
+               if (folio_mkclean(folio))
+                       folio_mark_dirty(folio);
                /*
                 * We carefully synchronise fault handlers against
-                * installing a dirty pte and marking the page dirty
+                * installing a dirty pte and marking the folio dirty
                 * at this point.  We do this by having them hold the
-                * page lock while dirtying the page, and pages are
+                * page lock while dirtying the folio, and folios are
                 * always locked coming in here, so we get the desired
                 * exclusion.
                 */
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
-               if (TestClearPageDirty(page)) {
-                       dec_lruvec_page_state(page, NR_FILE_DIRTY);
-                       dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-                       dec_wb_stat(wb, WB_RECLAIMABLE);
-                       ret = 1;
+               if (folio_test_clear_dirty(folio)) {
+                       long nr = folio_nr_pages(folio);
+                       lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+                       zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+                       wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+                       ret = true;
                }
                unlocked_inode_to_wb_end(inode, &cookie);
                return ret;
        }
-       return TestClearPageDirty(page);
+       return folio_test_clear_dirty(folio);
 }
-EXPORT_SYMBOL(clear_page_dirty_for_io);
+EXPORT_SYMBOL(folio_clear_dirty_for_io);
 
 static void wb_inode_writeback_start(struct bdi_writeback *wb)
 {
@@ -2766,27 +2790,28 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb)
        queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
 }
 
-int test_clear_page_writeback(struct page *page)
+bool __folio_end_writeback(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(page);
-       int ret;
+       long nr = folio_nr_pages(folio);
+       struct address_space *mapping = folio_mapping(folio);
+       bool ret;
 
-       lock_page_memcg(page);
+       folio_memcg_lock(folio);
        if (mapping && mapping_use_writeback_tags(mapping)) {
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
 
                xa_lock_irqsave(&mapping->i_pages, flags);
-               ret = TestClearPageWriteback(page);
+               ret = folio_test_clear_writeback(folio);
                if (ret) {
-                       __xa_clear_mark(&mapping->i_pages, page_index(page),
+                       __xa_clear_mark(&mapping->i_pages, folio_index(folio),
                                                PAGECACHE_TAG_WRITEBACK);
                        if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                                struct bdi_writeback *wb = inode_to_wb(inode);
 
-                               dec_wb_stat(wb, WB_WRITEBACK);
-                               __wb_writeout_inc(wb);
+                               wb_stat_mod(wb, WB_WRITEBACK, -nr);
+                               __wb_writeout_add(wb, nr);
                                if (!mapping_tagged(mapping,
                                                    PAGECACHE_TAG_WRITEBACK))
                                        wb_inode_writeback_end(wb);
@@ -2799,32 +2824,34 @@ int test_clear_page_writeback(struct page *page)
 
                xa_unlock_irqrestore(&mapping->i_pages, flags);
        } else {
-               ret = TestClearPageWriteback(page);
+               ret = folio_test_clear_writeback(folio);
        }
        if (ret) {
-               dec_lruvec_page_state(page, NR_WRITEBACK);
-               dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-               inc_node_page_state(page, NR_WRITTEN);
+               lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
+               zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+               node_stat_mod_folio(folio, NR_WRITTEN, nr);
        }
-       unlock_page_memcg(page);
+       folio_memcg_unlock(folio);
        return ret;
 }
 
-int __test_set_page_writeback(struct page *page, bool keep_write)
+bool __folio_start_writeback(struct folio *folio, bool keep_write)
 {
-       struct address_space *mapping = page_mapping(page);
-       int ret, access_ret;
+       long nr = folio_nr_pages(folio);
+       struct address_space *mapping = folio_mapping(folio);
+       bool ret;
+       int access_ret;
 
-       lock_page_memcg(page);
+       folio_memcg_lock(folio);
        if (mapping && mapping_use_writeback_tags(mapping)) {
-               XA_STATE(xas, &mapping->i_pages, page_index(page));
+               XA_STATE(xas, &mapping->i_pages, folio_index(folio));
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
 
                xas_lock_irqsave(&xas, flags);
                xas_load(&xas);
-               ret = TestSetPageWriteback(page);
+               ret = folio_test_set_writeback(folio);
                if (!ret) {
                        bool on_wblist;
 
@@ -2835,84 +2862,105 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
                        if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                                struct bdi_writeback *wb = inode_to_wb(inode);
 
-                               inc_wb_stat(wb, WB_WRITEBACK);
+                               wb_stat_mod(wb, WB_WRITEBACK, nr);
                                if (!on_wblist)
                                        wb_inode_writeback_start(wb);
                        }
 
                        /*
-                        * We can come through here when swapping anonymous
-                        * pages, so we don't necessarily have an inode to track
-                        * for sync.
+                        * We can come through here when swapping
+                        * anonymous folios, so we don't necessarily
+                        * have an inode to track for sync.
                         */
                        if (mapping->host && !on_wblist)
                                sb_mark_inode_writeback(mapping->host);
                }
-               if (!PageDirty(page))
+               if (!folio_test_dirty(folio))
                        xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
                if (!keep_write)
                        xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
                xas_unlock_irqrestore(&xas, flags);
        } else {
-               ret = TestSetPageWriteback(page);
+               ret = folio_test_set_writeback(folio);
        }
        if (!ret) {
-               inc_lruvec_page_state(page, NR_WRITEBACK);
-               inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+               lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
+               zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
        }
-       unlock_page_memcg(page);
-       access_ret = arch_make_page_accessible(page);
+       folio_memcg_unlock(folio);
+       access_ret = arch_make_folio_accessible(folio);
        /*
         * If writeback has been triggered on a page that cannot be made
         * accessible, it is too late to recover here.
         */
-       VM_BUG_ON_PAGE(access_ret != 0, page);
+       VM_BUG_ON_FOLIO(access_ret != 0, folio);
 
        return ret;
-
 }
-EXPORT_SYMBOL(__test_set_page_writeback);
+EXPORT_SYMBOL(__folio_start_writeback);
 
-/*
- * Wait for a page to complete writeback
+/**
+ * folio_wait_writeback - Wait for a folio to finish writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete.
+ *
+ * Context: Sleeps.  Must be called in process context and with
+ * no spinlocks held.  Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
  */
-void wait_on_page_writeback(struct page *page)
+void folio_wait_writeback(struct folio *folio)
 {
-       while (PageWriteback(page)) {
-               trace_wait_on_page_writeback(page, page_mapping(page));
-               wait_on_page_bit(page, PG_writeback);
+       while (folio_test_writeback(folio)) {
+               trace_folio_wait_writeback(folio, folio_mapping(folio));
+               folio_wait_bit(folio, PG_writeback);
        }
 }
-EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+EXPORT_SYMBOL_GPL(folio_wait_writeback);
 
-/*
- * Wait for a page to complete writeback.  Returns -EINTR if we get a
- * fatal signal while waiting.
+/**
+ * folio_wait_writeback_killable - Wait for a folio to finish writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete or a fatal signal to arrive.
+ *
+ * Context: Sleeps.  Must be called in process context and with
+ * no spinlocks held.  Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
+ * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
  */
-int wait_on_page_writeback_killable(struct page *page)
+int folio_wait_writeback_killable(struct folio *folio)
 {
-       while (PageWriteback(page)) {
-               trace_wait_on_page_writeback(page, page_mapping(page));
-               if (wait_on_page_bit_killable(page, PG_writeback))
+       while (folio_test_writeback(folio)) {
+               trace_folio_wait_writeback(folio, folio_mapping(folio));
+               if (folio_wait_bit_killable(folio, PG_writeback))
                        return -EINTR;
        }
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
+EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
 
 /**
- * wait_for_stable_page() - wait for writeback to finish, if necessary.
- * @page:      The page to wait on.
+ * folio_wait_stable() - wait for writeback to finish, if necessary.
+ * @folio: The folio to wait on.
+ *
+ * This function determines if the given folio is related to a backing
+ * device that requires folio contents to be held stable during writeback.
+ * If so, then it will wait for any pending writeback to complete.
  *
- * This function determines if the given page is related to a backing device
- * that requires page contents to be held stable during writeback.  If so, then
- * it will wait for any pending writeback to complete.
+ * Context: Sleeps.  Must be called in process context and with
+ * no spinlocks held.  Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
  */
-void wait_for_stable_page(struct page *page)
+void folio_wait_stable(struct folio *folio)
 {
-       page = thp_head(page);
-       if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
-               wait_on_page_writeback(page);
+       if (folio->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
+               folio_wait_writeback(folio);
 }
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
+EXPORT_SYMBOL_GPL(folio_wait_stable);
index b37435c..fee18ad 100644 (file)
@@ -724,7 +724,7 @@ static inline void free_the_page(struct page *page, unsigned int order)
 
 void free_compound_page(struct page *page)
 {
-       mem_cgroup_uncharge(page);
+       mem_cgroup_uncharge(page_folio(page));
        free_the_page(page, compound_order(page));
 }
 
@@ -1312,8 +1312,10 @@ static __always_inline bool free_pages_prepare(struct page *page,
 
                VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
 
-               if (compound)
+               if (compound) {
                        ClearPageDoubleMap(page);
+                       ClearPageHasHWPoisoned(page);
+               }
                for (i = 1; i < (1 << order); i++) {
                        if (compound)
                                bad += free_tail_pages_check(page, page + i);
@@ -5223,6 +5225,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
        if (unlikely(page_array && nr_pages - nr_populated == 0))
                goto out;
 
+       /* Bulk allocator does not support memcg accounting. */
+       if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT))
+               goto failed;
+
        /* Use the single page allocator for one page. */
        if (nr_pages - nr_populated == 1)
                goto failed;
@@ -5400,6 +5406,18 @@ out:
 }
 EXPORT_SYMBOL(__alloc_pages);
 
+struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+               nodemask_t *nodemask)
+{
+       struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
+                       preferred_nid, nodemask);
+
+       if (page && order > 1)
+               prep_transhuge_page(page);
+       return (struct folio *)page;
+}
+EXPORT_SYMBOL(__folio_alloc);
+
 /*
  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
  * address cannot represent highmem pages. Use alloc_pages and then kmap if
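
The __alloc_pages_bulk() hunk above makes the bulk allocator decline
__GFP_ACCOUNT requests, since it performs no memcg accounting; such requests
fall through to the regular single-page path.  A small illustrative sketch of
the caller-visible consequence (bulk_fill() and NR_BULK are invented names):

#include <linux/gfp.h>
#include <linux/mm.h>

#define NR_BULK 16

static unsigned long bulk_fill(struct page **pages)
{
        /*
         * pages[] must start out zeroed; already-populated slots are skipped.
         * Plain GFP_KERNEL can be satisfied in one bulk call; adding
         * __GFP_ACCOUNT would instead fall back to the single page allocator.
         */
        return alloc_pages_bulk_array(GFP_KERNEL, NR_BULK, pages);
}
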
index c493ce9..9725c7e 100644 (file)
@@ -38,7 +38,7 @@ void end_swap_bio_write(struct bio *bio)
                 * Also print a dire warning that things will go BAD (tm)
                 * very quickly.
                 *
-                * Also clear PG_reclaim to avoid rotate_reclaimable_page()
+                * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
                 */
                set_page_dirty(page);
                pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
@@ -317,7 +317,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
                         * temporary failure if the system has limited
                         * memory for allocating transmit buffers.
                         * Mark the page dirty and avoid
-                        * rotate_reclaimable_page but rate-limit the
+                        * folio_rotate_reclaimable but rate-limit the
                         * messages but do not flag PageError like
                         * the normal direct-to-bio case as it could
                         * be temporary.
@@ -358,8 +358,6 @@ int swap_readpage(struct page *page, bool synchronous)
        struct bio *bio;
        int ret = 0;
        struct swap_info_struct *sis = page_swap_info(page);
-       blk_qc_t qc;
-       struct gendisk *disk;
        unsigned long pflags;
 
        VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
@@ -409,26 +407,24 @@ int swap_readpage(struct page *page, bool synchronous)
        bio->bi_iter.bi_sector = swap_page_sector(page);
        bio->bi_end_io = end_swap_bio_read;
        bio_add_page(bio, page, thp_size(page), 0);
-
-       disk = bio->bi_bdev->bd_disk;
        /*
         * Keep this task valid during swap readpage because the oom killer may
         * attempt to access it in the page fault retry time check.
         */
        if (synchronous) {
-               bio->bi_opf |= REQ_HIPRI;
+               bio->bi_opf |= REQ_POLLED;
                get_task_struct(current);
                bio->bi_private = current;
        }
        count_vm_event(PSWPIN);
        bio_get(bio);
-       qc = submit_bio(bio);
+       submit_bio(bio);
        while (synchronous) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio->bi_private))
                        break;
 
-               if (!blk_poll(disk->queue, qc, true))
+               if (!bio_poll(bio, NULL, 0))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
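
The swap_readpage() change above moves from the old (queue, cookie) polling
interface to the bio-based one: the bio is marked REQ_POLLED and its completion
is polled with bio_poll(bio, NULL, 0), so no separate cookie needs to be kept.
The waiting loop, isolated as a sketch (the helper name is invented; it
assumes, as in the hunk, that the completion handler clears bio->bi_private and
wakes the submitter):

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched.h>

static void wait_for_polled_bio(struct bio *bio)
{
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio->bi_private))
                        break;          /* completion handler has run */
                if (!bio_poll(bio, NULL, 0))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
}
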
index 62402d2..d24ed22 100644 (file)
@@ -210,10 +210,10 @@ void __split_page_owner(struct page *page, unsigned int nr)
        }
 }
 
-void __copy_page_owner(struct page *oldpage, struct page *newpage)
+void __folio_copy_owner(struct folio *newfolio, struct folio *old)
 {
-       struct page_ext *old_ext = lookup_page_ext(oldpage);
-       struct page_ext *new_ext = lookup_page_ext(newpage);
+       struct page_ext *old_ext = lookup_page_ext(&old->page);
+       struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
        struct page_owner *old_page_owner, *new_page_owner;
 
        if (unlikely(!old_ext || !new_ext))
@@ -231,11 +231,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
        new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
 
        /*
-        * We don't clear the bit on the oldpage as it's going to be freed
+        * We don't clear the bit on the old folio as it's going to be freed
         * after migration. Until then, the info can be useful in case of
         * a bug, and the overall stats will be off a bit only temporarily.
         * Also, migrate_misplaced_transhuge_page() can still fail the
-        * migration and then we want the oldpage to retain the info. But
+        * migration and then we want the old folio to retain the info. But
         * in that case we also don't need to explicitly clear the info from
         * the new page, which will be freed.
         */
index 41b75d7..e71e719 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/dax.h>
 #include <linux/gfp.h>
 #include <linux/export.h>
-#include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
index 6aebd17..3a1059c 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -34,7 +34,7 @@
  *                   mapping->private_lock (in __set_page_dirty_buffers)
  *                     lock_page_memcg move_lock (in __set_page_dirty_buffers)
  *                       i_pages lock (widely used)
- *                         lruvec->lru_lock (in lock_page_lruvec_irq)
+ *                         lruvec->lru_lock (in folio_lruvec_lock_irq)
  *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  *                     sb_lock (within inode_lock in fs/fs-writeback.c)
@@ -981,7 +981,7 @@ static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
        return true;
 }
 
-int page_mkclean(struct page *page)
+int folio_mkclean(struct folio *folio)
 {
        int cleaned = 0;
        struct address_space *mapping;
@@ -991,20 +991,20 @@ int page_mkclean(struct page *page)
                .invalid_vma = invalid_mkclean_vma,
        };
 
-       BUG_ON(!PageLocked(page));
+       BUG_ON(!folio_test_locked(folio));
 
-       if (!page_mapped(page))
+       if (!folio_mapped(folio))
                return 0;
 
-       mapping = page_mapping(page);
+       mapping = folio_mapping(folio);
        if (!mapping)
                return 0;
 
-       rmap_walk(page, &rwc);
+       rmap_walk(&folio->page, &rwc);
 
        return cleaned;
 }
-EXPORT_SYMBOL_GPL(page_mkclean);
+EXPORT_SYMBOL_GPL(folio_mkclean);
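folio_mkclean() keeps page_mkclean()'s contract: the folio must be locked, and the return value is the number of PTEs that were write-protected and cleaned. A minimal caller sketch under that assumption (example_start_writeback is a hypothetical name, not a kernel function):

/* Hypothetical sketch: write-protect all mappings of a locked folio before
 * starting writeback, so later stores re-dirty it through the fault path. */
static bool example_start_writeback(struct folio *folio)
{
        bool cleaned;

        folio_lock(folio);              /* folio_mkclean() asserts this */
        cleaned = folio_mkclean(folio) > 0;
        folio_unlock(folio);

        return cleaned;
}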
 
 /**
  * page_move_anon_rmap - move a page to our anon_vma
index c2dda40..22b310a 100644 (file)
@@ -218,8 +218,8 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
 
        file->f_flags |= O_LARGEFILE;
 
-       fd_install(fd, file);
        atomic_inc(&secretmem_users);
+       fd_install(fd, file);
        return fd;
 
 err_put_fd:
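The one-line swap above is an ordering fix: fd_install() publishes the file to userspace, so the secretmem_users count must already be raised before another thread can close the new fd and drop it. A generic publish-last sketch under that assumption (plain C11 with hypothetical names, not kernel code):

#include <stdatomic.h>

struct object { int value; };

static atomic_long users;                       /* bookkeeping others rely on */
static _Atomic(struct object *) published;      /* how other threads find it */

/* Do all accounting first, make the object reachable last. */
static void publish(struct object *obj)
{
        atomic_fetch_add(&users, 1);
        atomic_store(&published, obj);          /* the fd_install() step */
}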
index b5860f4..17e344e 100644 (file)
@@ -59,7 +59,6 @@ static struct vfsmount *shm_mnt;
 #include <linux/backing-dev.h>
 #include <linux/shmem_fs.h>
 #include <linux/writeback.h>
-#include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/percpu_counter.h>
 #include <linux/falloc.h>
@@ -710,7 +709,7 @@ static int shmem_add_to_page_cache(struct page *page,
        page->index = index;
 
        if (!PageSwapCache(page)) {
-               error = mem_cgroup_charge(page, charge_mm, gfp);
+               error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
                if (error) {
                        if (PageTransHuge(page)) {
                                count_vm_event(THP_FILE_FALLBACK);
@@ -1637,6 +1636,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
                                struct shmem_inode_info *info, pgoff_t index)
 {
        struct page *oldpage, *newpage;
+       struct folio *old, *new;
        struct address_space *swap_mapping;
        swp_entry_t entry;
        pgoff_t swap_index;
@@ -1673,7 +1673,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
        xa_lock_irq(&swap_mapping->i_pages);
        error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
        if (!error) {
-               mem_cgroup_migrate(oldpage, newpage);
+               old = page_folio(oldpage);
+               new = page_folio(newpage);
+               mem_cgroup_migrate(old, new);
                __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
                __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
        }
index af3cad4..8ff9ba7 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -80,10 +80,11 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
 static void __page_cache_release(struct page *page)
 {
        if (PageLRU(page)) {
+               struct folio *folio = page_folio(page);
                struct lruvec *lruvec;
                unsigned long flags;
 
-               lruvec = lock_page_lruvec_irqsave(page, &flags);
+               lruvec = folio_lruvec_lock_irqsave(folio, &flags);
                del_page_from_lru_list(page, lruvec);
                __clear_page_lru_flags(page);
                unlock_page_lruvec_irqrestore(lruvec, flags);
@@ -94,7 +95,7 @@ static void __page_cache_release(struct page *page)
 static void __put_single_page(struct page *page)
 {
        __page_cache_release(page);
-       mem_cgroup_uncharge(page);
+       mem_cgroup_uncharge(page_folio(page));
        free_unref_page(page, 0);
 }
 
@@ -188,12 +189,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
+               struct folio *folio = page_folio(page);
 
                /* block memcg migration during page moving between lru */
                if (!TestClearPageLRU(page))
                        continue;
 
-               lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
+               lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
                (*move_fn)(page, lruvec);
 
                SetPageLRU(page);
@@ -206,11 +208,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 
 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
 {
-       if (!PageUnevictable(page)) {
-               del_page_from_lru_list(page, lruvec);
-               ClearPageActive(page);
-               add_page_to_lru_list_tail(page, lruvec);
-               __count_vm_events(PGROTATED, thp_nr_pages(page));
+       struct folio *folio = page_folio(page);
+
+       if (!folio_test_unevictable(folio)) {
+               lruvec_del_folio(lruvec, folio);
+               folio_clear_active(folio);
+               lruvec_add_folio_tail(lruvec, folio);
+               __count_vm_events(PGROTATED, folio_nr_pages(folio));
        }
 }
 
@@ -227,23 +231,23 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
 }
 
 /*
- * Writeback is about to end against a page which has been marked for immediate
- * reclaim.  If it still appears to be reclaimable, move it to the tail of the
- * inactive list.
+ * Writeback is about to end against a folio which has been marked for
+ * immediate reclaim.  If it still appears to be reclaimable, move it
+ * to the tail of the inactive list.
  *
- * rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
+ * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
  */
-void rotate_reclaimable_page(struct page *page)
+void folio_rotate_reclaimable(struct folio *folio)
 {
-       if (!PageLocked(page) && !PageDirty(page) &&
-           !PageUnevictable(page) && PageLRU(page)) {
+       if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
+           !folio_test_unevictable(folio) && folio_test_lru(folio)) {
                struct pagevec *pvec;
                unsigned long flags;
 
-               get_page(page);
+               folio_get(folio);
                local_lock_irqsave(&lru_rotate.lock, flags);
                pvec = this_cpu_ptr(&lru_rotate.pvec);
-               if (pagevec_add_and_need_flush(pvec, page))
+               if (pagevec_add_and_need_flush(pvec, &folio->page))
                        pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
                local_unlock_irqrestore(&lru_rotate.lock, flags);
        }
@@ -289,21 +293,21 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
        } while ((lruvec = parent_lruvec(lruvec)));
 }
 
-void lru_note_cost_page(struct page *page)
+void lru_note_cost_folio(struct folio *folio)
 {
-       lru_note_cost(mem_cgroup_page_lruvec(page),
-                     page_is_file_lru(page), thp_nr_pages(page));
+       lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
+                       folio_nr_pages(folio));
 }
 
-static void __activate_page(struct page *page, struct lruvec *lruvec)
+static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
 {
-       if (!PageActive(page) && !PageUnevictable(page)) {
-               int nr_pages = thp_nr_pages(page);
+       if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
+               long nr_pages = folio_nr_pages(folio);
 
-               del_page_from_lru_list(page, lruvec);
-               SetPageActive(page);
-               add_page_to_lru_list(page, lruvec);
-               trace_mm_lru_activate(page);
+               lruvec_del_folio(lruvec, folio);
+               folio_set_active(folio);
+               lruvec_add_folio(lruvec, folio);
+               trace_mm_lru_activate(folio);
 
                __count_vm_events(PGACTIVATE, nr_pages);
                __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
@@ -312,6 +316,11 @@ static void __activate_page(struct page *page, struct lruvec *lruvec)
 }
 
 #ifdef CONFIG_SMP
+static void __activate_page(struct page *page, struct lruvec *lruvec)
+{
+       return __folio_activate(page_folio(page), lruvec);
+}
+
 static void activate_page_drain(int cpu)
 {
        struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
@@ -325,16 +334,16 @@ static bool need_activate_page_drain(int cpu)
        return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
 }
 
-static void activate_page(struct page *page)
+static void folio_activate(struct folio *folio)
 {
-       page = compound_head(page);
-       if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+       if (folio_test_lru(folio) && !folio_test_active(folio) &&
+           !folio_test_unevictable(folio)) {
                struct pagevec *pvec;
 
+               folio_get(folio);
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.activate_page);
-               get_page(page);
-               if (pagevec_add_and_need_flush(pvec, page))
+               if (pagevec_add_and_need_flush(pvec, &folio->page))
                        pagevec_lru_move_fn(pvec, __activate_page);
                local_unlock(&lru_pvecs.lock);
        }
@@ -345,21 +354,20 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
-static void activate_page(struct page *page)
+static void folio_activate(struct folio *folio)
 {
        struct lruvec *lruvec;
 
-       page = compound_head(page);
-       if (TestClearPageLRU(page)) {
-               lruvec = lock_page_lruvec_irq(page);
-               __activate_page(page, lruvec);
+       if (folio_test_clear_lru(folio)) {
+               lruvec = folio_lruvec_lock_irq(folio);
+               __folio_activate(folio, lruvec);
                unlock_page_lruvec_irq(lruvec);
-               SetPageLRU(page);
+               folio_set_lru(folio);
        }
 }
 #endif
 
-static void __lru_cache_activate_page(struct page *page)
+static void __lru_cache_activate_folio(struct folio *folio)
 {
        struct pagevec *pvec;
        int i;
@@ -380,8 +388,8 @@ static void __lru_cache_activate_page(struct page *page)
        for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
                struct page *pagevec_page = pvec->pages[i];
 
-               if (pagevec_page == page) {
-                       SetPageActive(page);
+               if (pagevec_page == &folio->page) {
+                       folio_set_active(folio);
                        break;
                }
        }
@@ -399,61 +407,59 @@ static void __lru_cache_activate_page(struct page *page)
  * When a newly allocated page is not yet visible, so safe for non-atomic ops,
  * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
  */
-void mark_page_accessed(struct page *page)
+void folio_mark_accessed(struct folio *folio)
 {
-       page = compound_head(page);
-
-       if (!PageReferenced(page)) {
-               SetPageReferenced(page);
-       } else if (PageUnevictable(page)) {
+       if (!folio_test_referenced(folio)) {
+               folio_set_referenced(folio);
+       } else if (folio_test_unevictable(folio)) {
                /*
                 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
                 * this list is never rotated or maintained, so marking an
                 * evictable page accessed has no effect.
                 */
-       } else if (!PageActive(page)) {
+       } else if (!folio_test_active(folio)) {
                /*
                 * If the page is on the LRU, queue it for activation via
                 * lru_pvecs.activate_page. Otherwise, assume the page is on a
                 * pagevec, mark it active and it'll be moved to the active
                 * LRU on the next drain.
                 */
-               if (PageLRU(page))
-                       activate_page(page);
+               if (folio_test_lru(folio))
+                       folio_activate(folio);
                else
-                       __lru_cache_activate_page(page);
-               ClearPageReferenced(page);
-               workingset_activation(page);
+                       __lru_cache_activate_folio(folio);
+               folio_clear_referenced(folio);
+               workingset_activation(folio);
        }
-       if (page_is_idle(page))
-               clear_page_idle(page);
+       if (folio_test_idle(folio))
+               folio_clear_idle(folio);
 }
-EXPORT_SYMBOL(mark_page_accessed);
+EXPORT_SYMBOL(folio_mark_accessed);
 
 /**
- * lru_cache_add - add a page to a page list
- * @page: the page to be added to the LRU.
+ * folio_add_lru - Add a folio to an LRU list.
+ * @folio: The folio to be added to the LRU.
  *
- * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * Queue the folio for addition to the LRU. The decision on whether
  * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * pagevec is drained. This gives a chance for the caller of folio_add_lru()
+ * to have the folio added to the active list using folio_mark_accessed().
  */
-void lru_cache_add(struct page *page)
+void folio_add_lru(struct folio *folio)
 {
        struct pagevec *pvec;
 
-       VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
-       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
-       get_page(page);
+       folio_get(folio);
        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_add);
-       if (pagevec_add_and_need_flush(pvec, page))
+       if (pagevec_add_and_need_flush(pvec, &folio->page))
                __pagevec_lru_add(pvec);
        local_unlock(&lru_pvecs.lock);
 }
-EXPORT_SYMBOL(lru_cache_add);
+EXPORT_SYMBOL(folio_add_lru);
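Taken together, the conversions above preserve the old lru_cache_add()/mark_page_accessed() flow, just folio-based: folio_add_lru() takes its own reference and defers the LRU insertion through the per-CPU pagevec, and folio_mark_accessed() handles the referenced-then-active promotion. A hedged sketch of how a new, not-yet-visible folio would typically use the pair (illustrative only):

/* Illustrative only: queue a freshly allocated folio for the LRU and mark
 * it accessed so a second touch promotes it to the active list. */
static void example_insert_new_folio(struct folio *folio)
{
        folio_add_lru(folio);           /* takes its own reference, deferred
                                         * via the per-CPU pagevec */
        folio_mark_accessed(folio);     /* sets the referenced flag; another
                                         * call would activate the folio */
}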
 
 /**
  * lru_cache_add_inactive_or_unevictable
@@ -888,11 +894,12 @@ void release_pages(struct page **pages, int nr)
        int i;
        LIST_HEAD(pages_to_free);
        struct lruvec *lruvec = NULL;
-       unsigned long flags;
+       unsigned long flags = 0;
        unsigned int lock_batch;
 
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];
+               struct folio *folio = page_folio(page);
 
                /*
                 * Make sure the IRQ-safe lock-holding time does not get
@@ -904,7 +911,7 @@ void release_pages(struct page **pages, int nr)
                        lruvec = NULL;
                }
 
-               page = compound_head(page);
+               page = &folio->page;
                if (is_huge_zero_page(page))
                        continue;
 
@@ -943,7 +950,7 @@ void release_pages(struct page **pages, int nr)
                if (PageLRU(page)) {
                        struct lruvec *prev_lruvec = lruvec;
 
-                       lruvec = relock_page_lruvec_irqsave(page, lruvec,
+                       lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
                                                                        &flags);
                        if (prev_lruvec != lruvec)
                                lock_batch = 0;
@@ -985,17 +992,18 @@ void __pagevec_release(struct pagevec *pvec)
 }
 EXPORT_SYMBOL(__pagevec_release);
 
-static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
+static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
 {
-       int was_unevictable = TestClearPageUnevictable(page);
-       int nr_pages = thp_nr_pages(page);
+       int was_unevictable = folio_test_clear_unevictable(folio);
+       long nr_pages = folio_nr_pages(folio);
 
-       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
        /*
-        * Page becomes evictable in two ways:
+        * A folio becomes evictable in two ways:
         * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
-        * 2) Before acquiring LRU lock to put the page to correct LRU and then
+        * 2) Before acquiring LRU lock to put the folio on the correct LRU
+        *    and then
         *   a) do PageLRU check with lock [check_move_unevictable_pages]
         *   b) do PageLRU check before lock [clear_page_mlock]
         *
@@ -1004,35 +1012,36 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
         *
         * #0: __pagevec_lru_add_fn             #1: clear_page_mlock
         *
-        * SetPageLRU()                         TestClearPageMlocked()
+        * folio_set_lru()                      folio_test_clear_mlocked()
         * smp_mb() // explicit ordering        // above provides strict
         *                                      // ordering
-        * PageMlocked()                        PageLRU()
+        * folio_test_mlocked()                 folio_test_lru()
         *
         *
-        * if '#1' does not observe setting of PG_lru by '#0' and fails
-        * isolation, the explicit barrier will make sure that page_evictable
-        * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
-        * can be reordered after PageMlocked check and can make '#1' to fail
-        * the isolation of the page whose Mlocked bit is cleared (#0 is also
-        * looking at the same page) and the evictable page will be stranded
-        * in an unevictable LRU.
+        * if '#1' does not observe setting of PG_lru by '#0' and
+        * fails isolation, the explicit barrier will make sure that
+        * folio_evictable check will put the folio on the correct
+        * LRU. Without smp_mb(), folio_set_lru() can be reordered
+        * after folio_test_mlocked() check and can make '#1' fail the
+        * isolation of the folio whose mlocked bit is cleared (#0 is
+        * also looking at the same folio) and the evictable folio will
+        * be stranded on an unevictable LRU.
         */
-       SetPageLRU(page);
+       folio_set_lru(folio);
        smp_mb__after_atomic();
 
-       if (page_evictable(page)) {
+       if (folio_evictable(folio)) {
                if (was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        } else {
-               ClearPageActive(page);
-               SetPageUnevictable(page);
+               folio_clear_active(folio);
+               folio_set_unevictable(folio);
                if (!was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
        }
 
-       add_page_to_lru_list(page, lruvec);
-       trace_mm_lru_insertion(page);
+       lruvec_add_folio(lruvec, folio);
+       trace_mm_lru_insertion(folio);
 }
 
 /*
@@ -1046,10 +1055,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
        unsigned long flags = 0;
 
        for (i = 0; i < pagevec_count(pvec); i++) {
-               struct page *page = pvec->pages[i];
+               struct folio *folio = page_folio(pvec->pages[i]);
 
-               lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
-               __pagevec_lru_add_fn(page, lruvec);
+               lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
+               __pagevec_lru_add_fn(folio, lruvec);
        }
        if (lruvec)
                unlock_page_lruvec_irqrestore(lruvec, flags);
index bc7cee6..8d41042 100644 (file)
@@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
        mem_cgroup_swapin_uncharge_swap(entry);
 
        if (shadow)
-               workingset_refault(page, shadow);
+               workingset_refault(page_folio(page), shadow);
 
        /* Caller will initiate read into locked page */
        lru_cache_add(page);
index 22d10f7..41c9e92 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 #include <linux/shmem_fs.h>
-#include <linux/blkdev.h>
+#include <linux/blk-cgroup.h>
 #include <linux/random.h>
 #include <linux/writeback.h>
 #include <linux/proc_fs.h>
@@ -3534,13 +3534,13 @@ struct swap_info_struct *page_swap_info(struct page *page)
 }
 
 /*
- * out-of-line __page_file_ methods to avoid include hell.
+ * out-of-line methods to avoid include hell.
  */
-struct address_space *__page_file_mapping(struct page *page)
+struct address_space *swapcache_mapping(struct folio *folio)
 {
-       return page_swap_info(page)->swap_file->f_mapping;
+       return page_swap_info(&folio->page)->swap_file->f_mapping;
 }
-EXPORT_SYMBOL_GPL(__page_file_mapping);
+EXPORT_SYMBOL_GPL(swapcache_mapping);
 
 pgoff_t __page_file_index(struct page *page)
 {
index 7a90084..36e5f6a 100644 (file)
@@ -164,7 +164,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
        __SetPageUptodate(page);
 
        ret = -ENOMEM;
-       if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
                goto out_release;
 
        ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
index bacabe4..e58151a 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -654,81 +654,78 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 }
 EXPORT_SYMBOL(kvrealloc);
 
-static inline void *__page_rmapping(struct page *page)
-{
-       unsigned long mapping;
-
-       mapping = (unsigned long)page->mapping;
-       mapping &= ~PAGE_MAPPING_FLAGS;
-
-       return (void *)mapping;
-}
-
 /* Neutral page->mapping pointer to address_space or anon_vma or other */
 void *page_rmapping(struct page *page)
 {
-       page = compound_head(page);
-       return __page_rmapping(page);
+       return folio_raw_mapping(page_folio(page));
 }
 
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
+/**
+ * folio_mapped - Is this folio mapped into userspace?
+ * @folio: The folio.
+ *
+ * Return: True if any page in this folio is referenced by user page tables.
  */
-bool page_mapped(struct page *page)
+bool folio_mapped(struct folio *folio)
 {
-       int i;
+       long i, nr;
 
-       if (likely(!PageCompound(page)))
-               return atomic_read(&page->_mapcount) >= 0;
-       page = compound_head(page);
-       if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+       if (folio_test_single(folio))
+               return atomic_read(&folio->_mapcount) >= 0;
+       if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
                return true;
-       if (PageHuge(page))
+       if (folio_test_hugetlb(folio))
                return false;
-       for (i = 0; i < compound_nr(page); i++) {
-               if (atomic_read(&page[i]._mapcount) >= 0)
+
+       nr = folio_nr_pages(folio);
+       for (i = 0; i < nr; i++) {
+               if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0)
                        return true;
        }
        return false;
 }
-EXPORT_SYMBOL(page_mapped);
+EXPORT_SYMBOL(folio_mapped);
 
 struct anon_vma *page_anon_vma(struct page *page)
 {
-       unsigned long mapping;
+       struct folio *folio = page_folio(page);
+       unsigned long mapping = (unsigned long)folio->mapping;
 
-       page = compound_head(page);
-       mapping = (unsigned long)page->mapping;
        if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                return NULL;
-       return __page_rmapping(page);
+       return (void *)(mapping - PAGE_MAPPING_ANON);
 }
 
-struct address_space *page_mapping(struct page *page)
+/**
+ * folio_mapping - Find the mapping where this folio is stored.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to.  Folios in the swap cache return the swap mapping
+ * this page is stored in (which is different from the mapping for the
+ * swap file or swap device where the data is stored).
+ *
+ * You can call this for folios which aren't in the swap cache or page
+ * cache and it will return NULL.
+ */
+struct address_space *folio_mapping(struct folio *folio)
 {
        struct address_space *mapping;
 
-       page = compound_head(page);
-
        /* This happens if someone calls flush_dcache_page on slab page */
-       if (unlikely(PageSlab(page)))
+       if (unlikely(folio_test_slab(folio)))
                return NULL;
 
-       if (unlikely(PageSwapCache(page))) {
-               swp_entry_t entry;
+       if (unlikely(folio_test_swapcache(folio)))
+               return swap_address_space(folio_swap_entry(folio));
 
-               entry.val = page_private(page);
-               return swap_address_space(entry);
-       }
-
-       mapping = page->mapping;
+       mapping = folio->mapping;
        if ((unsigned long)mapping & PAGE_MAPPING_ANON)
                return NULL;
 
        return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
 }
-EXPORT_SYMBOL(page_mapping);
+EXPORT_SYMBOL(folio_mapping);
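As the new kernel-doc above spells out, folio_mapping() distinguishes page-cache, swap-cache, anonymous and slab folios. A small sketch of a caller using it exactly per that contract (the helper name is hypothetical):

/* Sketch: true only for folios that belong to a regular page-cache mapping;
 * anon and slab folios give NULL, swap-cache folios give the swap
 * address_space rather than a file mapping. */
static bool example_is_pagecache_folio(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        return mapping && !folio_test_swapcache(folio);
}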
 
 /* Slow path of page_mapcount() for compound pages */
 int __page_mapcount(struct page *page)
@@ -750,13 +747,26 @@ int __page_mapcount(struct page *page)
 }
 EXPORT_SYMBOL_GPL(__page_mapcount);
 
-void copy_huge_page(struct page *dst, struct page *src)
+/**
+ * folio_copy - Copy the contents of one folio to another.
+ * @dst: Folio to copy to.
+ * @src: Folio to copy from.
+ *
+ * The bytes in the folio represented by @src are copied to @dst.
+ * Assumes the caller has validated that @dst is at least as large as @src.
+ * Can be called in atomic context for order-0 folios, but if the folio is
+ * larger, it may sleep.
+ */
+void folio_copy(struct folio *dst, struct folio *src)
 {
-       unsigned i, nr = compound_nr(src);
+       long i = 0;
+       long nr = folio_nr_pages(src);
 
-       for (i = 0; i < nr; i++) {
+       for (;;) {
+               copy_highpage(folio_page(dst, i), folio_page(src, i));
+               if (++i == nr)
+                       break;
                cond_resched();
-               copy_highpage(nth_page(dst, i), nth_page(src, i));
        }
 }
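folio_copy() above copies page by page and calls cond_resched() between pages (but not after the last one), so it can sleep for anything larger than order 0; per the kernel-doc, the size check is the caller's job. A minimal, hedged sketch of a migration-style caller:

/* Sketch only: copy the contents of src into an equally sized dst folio. */
static void example_copy_folio(struct folio *dst, struct folio *src)
{
        VM_BUG_ON_FOLIO(folio_nr_pages(dst) < folio_nr_pages(src), dst);

        folio_copy(dst, src);           /* may sleep if src spans many pages */
}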
 
@@ -1079,3 +1089,14 @@ void page_offline_end(void)
        up_write(&page_offline_rwsem);
 }
 EXPORT_SYMBOL(page_offline_end);
+
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+void flush_dcache_folio(struct folio *folio)
+{
+       long i, nr = folio_nr_pages(folio);
+
+       for (i = 0; i < nr; i++)
+               flush_dcache_page(folio_page(folio, i));
+}
+EXPORT_SYMBOL(flush_dcache_folio);
+#endif
index d77830f..e8a807c 100644 (file)
@@ -2816,6 +2816,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                unsigned int order, unsigned int nr_pages, struct page **pages)
 {
        unsigned int nr_allocated = 0;
+       struct page *page;
+       int i;
 
        /*
         * For order-0 pages we make use of bulk allocator, if
@@ -2823,7 +2825,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
         * to fails, fallback to a single page allocator that is
         * more permissive.
         */
-       if (!order) {
+       if (!order && nid != NUMA_NO_NODE) {
                while (nr_allocated < nr_pages) {
                        unsigned int nr, nr_pages_request;
 
@@ -2848,7 +2850,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                        if (nr != nr_pages_request)
                                break;
                }
-       } else
+       } else if (order)
                /*
                 * Compound pages required for remap_vmalloc_page if
                 * high-order pages.
@@ -2856,11 +2858,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                gfp |= __GFP_COMP;
 
        /* High-order pages or fallback path if "bulk" fails. */
-       while (nr_allocated < nr_pages) {
-               struct page *page;
-               int i;
 
-               page = alloc_pages_node(nid, gfp, order);
+       while (nr_allocated < nr_pages) {
+               if (nid == NUMA_NO_NODE)
+                       page = alloc_pages(gfp, order);
+               else
+                       page = alloc_pages_node(nid, gfp, order);
                if (unlikely(!page))
                        break;
 
index 74296c2..306229c 100644 (file)
@@ -2090,6 +2090,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  */
 int isolate_lru_page(struct page *page)
 {
+       struct folio *folio = page_folio(page);
        int ret = -EBUSY;
 
        VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2099,7 +2100,7 @@ int isolate_lru_page(struct page *page)
                struct lruvec *lruvec;
 
                get_page(page);
-               lruvec = lock_page_lruvec_irq(page);
+               lruvec = folio_lruvec_lock_irq(folio);
                del_page_from_lru_list(page, lruvec);
                unlock_page_lruvec_irq(lruvec);
                ret = 0;
@@ -2199,7 +2200,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
                 * All pages were isolated from the same lruvec (and isolation
                 * inhibits memcg migration).
                 */
-               VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
+               VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page);
                add_page_to_lru_list(page, lruvec);
                nr_pages = thp_nr_pages(page);
                nr_moved += nr_pages;
@@ -4665,6 +4666,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 
        for (i = 0; i < pvec->nr; i++) {
                struct page *page = pvec->pages[i];
+               struct folio *folio = page_folio(page);
                int nr_pages;
 
                if (PageTransTail(page))
@@ -4677,7 +4679,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
                if (!TestClearPageLRU(page))
                        continue;
 
-               lruvec = relock_page_lruvec_irq(page, lruvec);
+               lruvec = folio_lruvec_relock_irq(folio, lruvec);
                if (page_evictable(page) && PageUnevictable(page)) {
                        del_page_from_lru_list(page, lruvec);
                        ClearPageUnevictable(page);
index d5b81e4..109ab97 100644 (file)
@@ -273,17 +273,17 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
 }
 
 /**
- * workingset_refault - evaluate the refault of a previously evicted page
- * @page: the freshly allocated replacement page
- * @shadow: shadow entry of the evicted page
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
  *
  * Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the node and the memcg whose memory
+ * evicted folio in the context of the node and the memcg whose memory
  * pressure caused the eviction.
  */
-void workingset_refault(struct page *page, void *shadow)
+void workingset_refault(struct folio *folio, void *shadow)
 {
-       bool file = page_is_file_lru(page);
+       bool file = folio_is_file_lru(folio);
        struct mem_cgroup *eviction_memcg;
        struct lruvec *eviction_lruvec;
        unsigned long refault_distance;
@@ -295,16 +295,17 @@ void workingset_refault(struct page *page, void *shadow)
        unsigned long refault;
        bool workingset;
        int memcgid;
+       long nr;
 
        unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
 
        rcu_read_lock();
        /*
         * Look up the memcg associated with the stored ID. It might
-        * have been deleted since the page's eviction.
+        * have been deleted since the folio's eviction.
         *
         * Note that in rare events the ID could have been recycled
-        * for a new cgroup that refaults a shared page. This is
+        * for a new cgroup that refaults a shared folio. This is
         * impossible to tell from the available data. However, this
         * should be a rare and limited disturbance, and activations
         * are always speculative anyway. Ultimately, it's the aging
@@ -340,17 +341,18 @@ void workingset_refault(struct page *page, void *shadow)
        refault_distance = (refault - eviction) & EVICTION_MASK;
 
        /*
-        * The activation decision for this page is made at the level
+        * The activation decision for this folio is made at the level
         * where the eviction occurred, as that is where the LRU order
-        * during page reclaim is being determined.
+        * during folio reclaim is being determined.
         *
-        * However, the cgroup that will own the page is the one that
+        * However, the cgroup that will own the folio is the one that
         * is actually experiencing the refault event.
         */
-       memcg = page_memcg(page);
+       nr = folio_nr_pages(folio);
+       memcg = folio_memcg(folio);
        lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-       inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
+       mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
 
        mem_cgroup_flush_stats();
        /*
@@ -376,16 +378,16 @@ void workingset_refault(struct page *page, void *shadow)
        if (refault_distance > workingset_size)
                goto out;
 
-       SetPageActive(page);
-       workingset_age_nonresident(lruvec, thp_nr_pages(page));
-       inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
+       folio_set_active(folio);
+       workingset_age_nonresident(lruvec, nr);
+       mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
 
-       /* Page was active prior to eviction */
+       /* Folio was active prior to eviction */
        if (workingset) {
-               SetPageWorkingset(page);
+               folio_set_workingset(folio);
                /* XXX: Move to lru_cache_add() when it supports new vs putback */
-               lru_note_cost_page(page);
-               inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
+               lru_note_cost_folio(folio);
+               mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
        }
 out:
        rcu_read_unlock();
@@ -393,12 +395,11 @@ out:
 
 /**
  * workingset_activation - note a page activation
- * @page: page that is being activated
+ * @folio: Folio that is being activated.
  */
-void workingset_activation(struct page *page)
+void workingset_activation(struct folio *folio)
 {
        struct mem_cgroup *memcg;
-       struct lruvec *lruvec;
 
        rcu_read_lock();
        /*
@@ -408,11 +409,10 @@ void workingset_activation(struct page *page)
         * XXX: See workingset_refault() - this should return
         * root_mem_cgroup even for !CONFIG_MEMCG.
         */
-       memcg = page_memcg_rcu(page);
+       memcg = folio_memcg_rcu(folio);
        if (!mem_cgroup_disabled() && !memcg)
                goto out;
-       lruvec = mem_cgroup_page_lruvec(page);
-       workingset_age_nonresident(lruvec, thp_nr_pages(page));
+       workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
 out:
        rcu_read_unlock();
 }
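The activation test in workingset_refault() is unchanged by the folio conversion: the shadow entry encodes an eviction "timestamp", and the folio is re-activated only if fewer evictions have happened since then than the workingset can hold; what changes is that the lruvec counters are now bumped by folio_nr_pages() instead of assuming a single page. The arithmetic, isolated as an illustrative sketch (EVICTION_MASK and the names are those used in the function above):

/* Illustrative only: the refault-distance test done by workingset_refault(). */
static bool example_should_activate(unsigned long refault,
                                    unsigned long eviction,
                                    unsigned long workingset_size)
{
        unsigned long refault_distance = (refault - eviction) & EVICTION_MASK;

        return refault_distance <= workingset_size;
}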
index 1669744..1768784 100644 (file)
@@ -1560,10 +1560,14 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
                return 0;
 
        bat_priv->bla.claim_hash = batadv_hash_new(128);
-       bat_priv->bla.backbone_hash = batadv_hash_new(32);
+       if (!bat_priv->bla.claim_hash)
+               return -ENOMEM;
 
-       if (!bat_priv->bla.claim_hash || !bat_priv->bla.backbone_hash)
+       bat_priv->bla.backbone_hash = batadv_hash_new(32);
+       if (!bat_priv->bla.backbone_hash) {
+               batadv_hash_destroy(bat_priv->bla.claim_hash);
                return -ENOMEM;
+       }
 
        batadv_hash_set_lock_class(bat_priv->bla.claim_hash,
                                   &batadv_claim_hash_lock_class_key);
index 3ddd66e..5207cd8 100644 (file)
@@ -190,29 +190,41 @@ int batadv_mesh_init(struct net_device *soft_iface)
 
        bat_priv->gw.generation = 0;
 
-       ret = batadv_v_mesh_init(bat_priv);
-       if (ret < 0)
-               goto err;
-
        ret = batadv_originator_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_orig;
+       }
 
        ret = batadv_tt_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_tt;
+       }
+
+       ret = batadv_v_mesh_init(bat_priv);
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_v;
+       }
 
        ret = batadv_bla_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_bla;
+       }
 
        ret = batadv_dat_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_dat;
+       }
 
        ret = batadv_nc_mesh_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_nc;
+       }
 
        batadv_gw_init(bat_priv);
        batadv_mcast_init(bat_priv);
@@ -222,8 +234,20 @@ int batadv_mesh_init(struct net_device *soft_iface)
 
        return 0;
 
-err:
-       batadv_mesh_free(soft_iface);
+err_nc:
+       batadv_dat_free(bat_priv);
+err_dat:
+       batadv_bla_free(bat_priv);
+err_bla:
+       batadv_v_mesh_free(bat_priv);
+err_v:
+       batadv_tt_free(bat_priv);
+err_tt:
+       batadv_originator_free(bat_priv);
+err_orig:
+       batadv_purge_outstanding_packets(bat_priv, NULL);
+       atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
+
        return ret;
 }
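batadv_mesh_init() now unwinds with one label per subsystem instead of calling the whole teardown path on any failure, which could touch structures that were never initialised. The general shape of that ladder, as a self-contained sketch with hypothetical init_a/init_b/free_a helpers (stubs included so it compiles; these are not batman-adv symbols):

static int init_a(void) { return 0; }   /* stand-in subsystem initialisers */
static int init_b(void) { return 0; }
static void free_a(void) { }

static int example_init(void)
{
        int ret;

        ret = init_a();
        if (ret)
                goto err_a;

        ret = init_b();
        if (ret)
                goto err_b;

        return 0;

err_b:
        free_a();       /* undo only the steps that already succeeded */
err_a:
        return ret;
}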
 
index 9f06132..0a7f1d3 100644 (file)
@@ -152,8 +152,10 @@ int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
                                   &batadv_nc_coding_hash_lock_class_key);
 
        bat_priv->nc.decoding_hash = batadv_hash_new(128);
-       if (!bat_priv->nc.decoding_hash)
+       if (!bat_priv->nc.decoding_hash) {
+               batadv_hash_destroy(bat_priv->nc.coding_hash);
                goto err;
+       }
 
        batadv_hash_set_lock_class(bat_priv->nc.decoding_hash,
                                   &batadv_nc_decoding_hash_lock_class_key);
index e0b3dac..4b7ad66 100644 (file)
@@ -4162,8 +4162,10 @@ int batadv_tt_init(struct batadv_priv *bat_priv)
                return ret;
 
        ret = batadv_tt_global_init(bat_priv);
-       if (ret < 0)
+       if (ret < 0) {
+               batadv_tt_local_table_free(bat_priv);
                return ret;
+       }
 
        batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1,
                                     batadv_tt_tvlv_unicast_handler_v1,
index 7ee9fec..eb3a366 100644 (file)
@@ -3163,6 +3163,12 @@ static u16 skb_tx_hash(const struct net_device *dev,
 
                qoffset = sb_dev->tc_to_txq[tc].offset;
                qcount = sb_dev->tc_to_txq[tc].count;
+               if (unlikely(!qcount)) {
+                       net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
+                                            sb_dev->name, qoffset, tc);
+                       qoffset = 0;
+                       qcount = dev->real_num_tx_queues;
+               }
        }
 
        if (skb_rx_queue_recorded(skb)) {
@@ -3906,7 +3912,8 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
        skb_reset_mac_header(skb);
        __skb_pull(skb, skb_network_offset(skb));
        skb->pkt_type = PACKET_LOOPBACK;
-       skb->ip_summed = CHECKSUM_UNNECESSARY;
+       if (skb->ip_summed == CHECKSUM_NONE)
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(skb));
        skb_dst_force(skb);
        netif_rx_ni(skb);
index f619777..b2e49eb 100644 (file)
@@ -1973,9 +1973,9 @@ int netdev_register_kobject(struct net_device *ndev)
 int netdev_change_owner(struct net_device *ndev, const struct net *net_old,
                        const struct net *net_new)
 {
+       kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID;
+       kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID;
        struct device *dev = &ndev->dev;
-       kuid_t old_uid, new_uid;
-       kgid_t old_gid, new_gid;
        int error;
 
        net_ns_get_ownership(net_old, &old_uid, &old_gid);
index 2170bea..fe93584 100644 (file)
@@ -80,6 +80,7 @@
 #include <linux/indirect_call_wrapper.h>
 
 #include "datagram.h"
+#include "sock_destructor.h"
 
 struct kmem_cache *skbuff_head_cache __ro_after_init;
 static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
@@ -1804,30 +1805,39 @@ EXPORT_SYMBOL(skb_realloc_headroom);
 struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
 {
        int delta = headroom - skb_headroom(skb);
+       int osize = skb_end_offset(skb);
+       struct sock *sk = skb->sk;
 
        if (WARN_ONCE(delta <= 0,
                      "%s is expecting an increase in the headroom", __func__))
                return skb;
 
-       /* pskb_expand_head() might crash, if skb is shared */
-       if (skb_shared(skb)) {
+       delta = SKB_DATA_ALIGN(delta);
+       /* pskb_expand_head() might crash, if skb is shared. */
+       if (skb_shared(skb) || !is_skb_wmem(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 
-               if (likely(nskb)) {
-                       if (skb->sk)
-                               skb_set_owner_w(nskb, skb->sk);
-                       consume_skb(skb);
-               } else {
-                       kfree_skb(skb);
-               }
+               if (unlikely(!nskb))
+                       goto fail;
+
+               if (sk)
+                       skb_set_owner_w(nskb, sk);
+               consume_skb(skb);
                skb = nskb;
        }
-       if (skb &&
-           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
-               kfree_skb(skb);
-               skb = NULL;
+       if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
+               goto fail;
+
+       if (sk && is_skb_wmem(skb)) {
+               delta = skb_end_offset(skb) - osize;
+               refcount_add(delta, &sk->sk_wmem_alloc);
+               skb->truesize += delta;
        }
        return skb;
+
+fail:
+       kfree_skb(skb);
+       return NULL;
 }
 EXPORT_SYMBOL(skb_expand_head);
 
index 2d6249b..a86ef7e 100644 (file)
@@ -474,6 +474,20 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 }
 EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
 
+bool sk_msg_is_readable(struct sock *sk)
+{
+       struct sk_psock *psock;
+       bool empty = true;
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (likely(psock))
+               empty = list_empty(&psock->ingress_msg);
+       rcu_read_unlock();
+       return !empty;
+}
+EXPORT_SYMBOL_GPL(sk_msg_is_readable);
+
 static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
                                                  struct sk_buff *skb)
 {
diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h
new file mode 100644 (file)
index 0000000..2f396e6
--- /dev/null
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_SOCK_DESTRUCTOR_H
+#define _NET_CORE_SOCK_DESTRUCTOR_H
+#include <net/tcp.h>
+
+static inline bool is_skb_wmem(const struct sk_buff *skb)
+{
+       return skb->destructor == sock_wfree ||
+              skb->destructor == __sock_wfree ||
+              (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);
+}
+#endif
index c8496c1..5f88526 100644 (file)
@@ -419,7 +419,7 @@ static struct ctl_table net_core_table[] = {
                .mode           = 0600,
                .proc_handler   = proc_dolongvec_minmax_bpf_restricted,
                .extra1         = &long_one,
-               .extra2         = &long_max,
+               .extra2         = &bpf_jit_limit_max,
        },
 #endif
        {
index e8b48df..f5c336f 100644 (file)
@@ -486,10 +486,7 @@ static bool tcp_stream_is_readable(struct sock *sk, int target)
 {
        if (tcp_epollin_ready(sk, target))
                return true;
-
-       if (sk->sk_prot->stream_memory_read)
-               return sk->sk_prot->stream_memory_read(sk);
-       return false;
+       return sk_is_readable(sk);
 }
 
 /*
index d3e9386..5f4d6f4 100644 (file)
@@ -150,19 +150,6 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
 EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
 
 #ifdef CONFIG_BPF_SYSCALL
-static bool tcp_bpf_stream_read(const struct sock *sk)
-{
-       struct sk_psock *psock;
-       bool empty = true;
-
-       rcu_read_lock();
-       psock = sk_psock(sk);
-       if (likely(psock))
-               empty = list_empty(&psock->ingress_msg);
-       rcu_read_unlock();
-       return !empty;
-}
-
 static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
                             long timeo)
 {
@@ -232,6 +219,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
        bool cork = false, enospc = sk_msg_full(msg);
        struct sock *sk_redir;
        u32 tosend, delta = 0;
+       u32 eval = __SK_NONE;
        int ret;
 
 more_data:
@@ -275,13 +263,24 @@ more_data:
        case __SK_REDIRECT:
                sk_redir = psock->sk_redir;
                sk_msg_apply_bytes(psock, tosend);
+               if (!psock->apply_bytes) {
+                       /* Clean up before releasing the sock lock. */
+                       eval = psock->eval;
+                       psock->eval = __SK_NONE;
+                       psock->sk_redir = NULL;
+               }
                if (psock->cork) {
                        cork = true;
                        psock->cork = NULL;
                }
                sk_msg_return(sk, msg, tosend);
                release_sock(sk);
+
                ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+
+               if (eval == __SK_REDIRECT)
+                       sock_put(sk_redir);
+
                lock_sock(sk);
                if (unlikely(ret < 0)) {
                        int free = sk_msg_free_nocharge(sk, msg);
@@ -479,7 +478,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
        prot[TCP_BPF_BASE].unhash               = sock_map_unhash;
        prot[TCP_BPF_BASE].close                = sock_map_close;
        prot[TCP_BPF_BASE].recvmsg              = tcp_bpf_recvmsg;
-       prot[TCP_BPF_BASE].stream_memory_read   = tcp_bpf_stream_read;
+       prot[TCP_BPF_BASE].sock_is_readable     = sk_msg_is_readable;
 
        prot[TCP_BPF_TX]                        = prot[TCP_BPF_BASE];
        prot[TCP_BPF_TX].sendmsg                = tcp_bpf_sendmsg;
index 8536b2a..2fffcf2 100644 (file)
@@ -2867,6 +2867,9 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
            !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
                mask &= ~(EPOLLIN | EPOLLRDNORM);
 
+       /* psock ingress_msg queue should not contain any bad checksum frames */
+       if (sk_is_readable(sk))
+               mask |= EPOLLIN | EPOLLRDNORM;
        return mask;
 
 }
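The udp_poll() hunk above relies on the new ->sock_is_readable hook: when a BPF verdict program has parsed data into psock->ingress_msg, sk_is_readable() reports it even though the normal receive queue is empty, so poll()/epoll wake up. Wiring the hook looks the same for every protocol; the udp_bpf hunk below does exactly this, sketched here in isolation:

/* Sketch: a sockmap proto template points ->sock_is_readable at the generic
 * psock helper (sk_msg_is_readable() was added earlier in this series). */
static void example_rebuild_proto(struct proto *prot, const struct proto *base)
{
        *prot = *base;
        prot->sock_is_readable = sk_msg_is_readable;
}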
index 7a1d5f4..bbe6569 100644 (file)
@@ -114,6 +114,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
        *prot        = *base;
        prot->close  = sock_map_close;
        prot->recvmsg = udp_bpf_recvmsg;
+       prot->sock_is_readable = sk_msg_is_readable;
 }
 
 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
index 97095b7..5dcfd53 100644 (file)
@@ -672,7 +672,7 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata,
                                 u8 *ie, u8 ie_len)
 {
        struct ieee80211_supported_band *sband;
-       const u8 *cap;
+       const struct element *cap;
        const struct ieee80211_he_operation *he_oper = NULL;
 
        sband = ieee80211_get_sband(sdata);
@@ -687,9 +687,10 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata,
 
        sdata->vif.bss_conf.he_support = true;
 
-       cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ie, ie_len);
-       if (cap && cap[1] >= ieee80211_he_oper_size(&cap[3]))
-               he_oper = (void *)(cap + 3);
+       cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ie_len);
+       if (cap && cap->datalen >= 1 + sizeof(*he_oper) &&
+           cap->datalen >= 1 + ieee80211_he_oper_size(cap->data + 1))
+               he_oper = (void *)(cap->data + 1);
 
        if (he_oper)
                sdata->vif.bss_conf.he_oper.params =
index c41273c..f0f22eb 100644 (file)
@@ -485,11 +485,11 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                mpext = mptcp_get_ext(skb);
                data_len = mpext ? mpext->data_len : 0;
 
-               /* we will check ext_copy.data_len in mptcp_write_options() to
+               /* we will check opts->data_len in mptcp_write_options() to
                 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
                 * TCPOLEN_MPTCP_MPC_ACK
                 */
-               opts->ext_copy.data_len = data_len;
+               opts->data_len = data_len;
                opts->suboptions = OPTION_MPTCP_MPC_ACK;
                opts->sndr_key = subflow->local_key;
                opts->rcvr_key = subflow->remote_key;
@@ -505,9 +505,9 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                        len = TCPOLEN_MPTCP_MPC_ACK_DATA;
                        if (opts->csum_reqd) {
                                /* we need to propagate more info to csum the pseudo hdr */
-                               opts->ext_copy.data_seq = mpext->data_seq;
-                               opts->ext_copy.subflow_seq = mpext->subflow_seq;
-                               opts->ext_copy.csum = mpext->csum;
+                               opts->data_seq = mpext->data_seq;
+                               opts->subflow_seq = mpext->subflow_seq;
+                               opts->csum = mpext->csum;
                                len += TCPOLEN_MPTCP_DSS_CHECKSUM;
                        }
                        *size = ALIGN(len, 4);
@@ -1227,7 +1227,7 @@ static void mptcp_set_rwin(const struct tcp_sock *tp)
                WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
 }
 
-static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
+static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum16 sum)
 {
        struct csum_pseudo_header header;
        __wsum csum;
@@ -1237,15 +1237,21 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
         * always the 64-bit value, irrespective of what length is used in the
         * DSS option itself.
         */
-       header.data_seq = cpu_to_be64(mpext->data_seq);
-       header.subflow_seq = htonl(mpext->subflow_seq);
-       header.data_len = htons(mpext->data_len);
+       header.data_seq = cpu_to_be64(data_seq);
+       header.subflow_seq = htonl(subflow_seq);
+       header.data_len = htons(data_len);
        header.csum = 0;
 
-       csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum));
+       csum = csum_partial(&header, sizeof(header), ~csum_unfold(sum));
        return (__force u16)csum_fold(csum);
 }
 
+static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
+{
+       return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len,
+                                mpext->csum);
+}
+
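__mptcp_make_csum() above builds the DSS pseudo-header from plain scalars, now that data_seq/subflow_seq/data_len/csum are carried as fields of mptcp_out_options rather than inside an embedded mptcp_ext, and then folds the 32-bit partial sum into a 16-bit one's-complement checksum via csum_fold(). The fold step itself, shown as a standalone illustration (plain C, not the kernel helpers):

/* Illustration only: collapse a 32-bit end-around-carry sum into the 16-bit
 * one's-complement checksum, which is what csum_fold() performs. */
static unsigned short example_csum_fold(unsigned int sum)
{
        sum = (sum & 0xffff) + (sum >> 16);     /* fold the high half in */
        sum = (sum & 0xffff) + (sum >> 16);     /* absorb a possible carry */
        return (unsigned short)~sum;
}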
 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
                         struct mptcp_out_options *opts)
 {
@@ -1337,7 +1343,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
                        len = TCPOLEN_MPTCP_MPC_SYN;
                } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) {
                        len = TCPOLEN_MPTCP_MPC_SYNACK;
-               } else if (opts->ext_copy.data_len) {
+               } else if (opts->data_len) {
                        len = TCPOLEN_MPTCP_MPC_ACK_DATA;
                        if (opts->csum_reqd)
                                len += TCPOLEN_MPTCP_DSS_CHECKSUM;
@@ -1366,14 +1372,17 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
 
                put_unaligned_be64(opts->rcvr_key, ptr);
                ptr += 2;
-               if (!opts->ext_copy.data_len)
+               if (!opts->data_len)
                        goto mp_capable_done;
 
                if (opts->csum_reqd) {
-                       put_unaligned_be32(opts->ext_copy.data_len << 16 |
-                                          mptcp_make_csum(&opts->ext_copy), ptr);
+                       put_unaligned_be32(opts->data_len << 16 |
+                                          __mptcp_make_csum(opts->data_seq,
+                                                            opts->subflow_seq,
+                                                            opts->data_len,
+                                                            opts->csum), ptr);
                } else {
-                       put_unaligned_be32(opts->ext_copy.data_len << 16 |
+                       put_unaligned_be32(opts->data_len << 16 |
                                           TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
                }
                ptr += 1;
index 32df65f..fb3da4d 100644 (file)
@@ -156,6 +156,12 @@ static enum sctp_disposition __sctp_sf_do_9_1_abort(
                                        void *arg,
                                        struct sctp_cmd_seq *commands);
 
+static enum sctp_disposition
+__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
+                          const struct sctp_association *asoc,
+                          const union sctp_subtype type, void *arg,
+                          struct sctp_cmd_seq *commands);
+
 /* Small helper function that checks if the chunk length
  * is of the appropriate length.  The 'required_length' argument
  * is set to be the size of a specific chunk we are testing.
@@ -337,6 +343,14 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net,
        if (!chunk->singleton)
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
+       /* Make sure that the INIT chunk has a valid length.
+        * Normally, this would cause an ABORT with a Protocol Violation
+        * error, but since we don't have an association, we'll
+        * just discard the packet.
+        */
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* If the packet is an OOTB packet which is temporarily on the
         * control endpoint, respond with an ABORT.
         */
@@ -351,14 +365,6 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net,
        if (chunk->sctp_hdr->vtag != 0)
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
 
-       /* Make sure that the INIT chunk has a valid length.
-        * Normally, this would cause an ABORT with a Protocol Violation
-        * error, but since we don't have an association, we'll
-        * just discard the packet.
-        */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
-               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-
        /* If the INIT is coming toward a closing socket, we'll send back
         * an ABORT.  Essentially, this catches the race of INIT being
         * backlogged to the socket at the same time as the user issues close().
@@ -704,6 +710,9 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
        struct sock *sk;
        int error = 0;
 
+       if (asoc && !sctp_vtag_verify(chunk, asoc))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* If the packet is an OOTB packet which is temporarily on the
         * control endpoint, respond with an ABORT.
         */
@@ -718,7 +727,8 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
         * in sctp_unpack_cookie().
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
-               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+                                                 commands);
 
        /* If the endpoint is not listening or if the number of associations
         * on the TCP-style socket exceeds the max backlog, respond with an
@@ -1524,20 +1534,16 @@ static enum sctp_disposition sctp_sf_do_unexpected_init(
        if (!chunk->singleton)
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
+       /* Make sure that the INIT chunk has a valid length. */
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* 3.1 A packet containing an INIT chunk MUST have a zero Verification
         * Tag.
         */
        if (chunk->sctp_hdr->vtag != 0)
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
 
-       /* Make sure that the INIT chunk has a valid length.
-        * In this case, we generate a protocol violation since we have
-        * an association established.
-        */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
-               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-                                                 commands);
-
        if (SCTP_INPUT_CB(chunk->skb)->encap_port != chunk->transport->encap_port)
                return sctp_sf_new_encap_port(net, ep, asoc, type, arg, commands);
 
@@ -1882,9 +1888,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
         * its peer.
        */
        if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) {
-               disposition = sctp_sf_do_9_2_reshutack(net, ep, asoc,
-                               SCTP_ST_CHUNK(chunk->chunk_hdr->type),
-                               chunk, commands);
+               disposition = __sctp_sf_do_9_2_reshutack(net, ep, asoc,
+                                                        SCTP_ST_CHUNK(chunk->chunk_hdr->type),
+                                                        chunk, commands);
                if (SCTP_DISPOSITION_NOMEM == disposition)
                        goto nomem;
 
@@ -2202,9 +2208,11 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook(
         * enough for the chunk header.  Cookie length verification is
         * done later.
         */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
-               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-                                                 commands);
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) {
+               if (!sctp_vtag_verify(chunk, asoc))
+                       asoc = NULL;
+               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands);
+       }
 
        /* "Decode" the chunk.  We have no optional parameters so we
         * are in good shape.
@@ -2341,7 +2349,7 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort(
         */
        if (SCTP_ADDR_DEL ==
                    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        if (!sctp_err_chunk_valid(chunk))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
@@ -2387,7 +2395,7 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort(
         */
        if (SCTP_ADDR_DEL ==
                    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        if (!sctp_err_chunk_valid(chunk))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
@@ -2657,7 +2665,7 @@ enum sctp_disposition sctp_sf_do_9_1_abort(
         */
        if (SCTP_ADDR_DEL ==
                    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        if (!sctp_err_chunk_valid(chunk))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
@@ -2970,13 +2978,11 @@ enum sctp_disposition sctp_sf_do_9_2_shut_ctsn(
  * that belong to this association, it should discard the INIT chunk and
  * retransmit the SHUTDOWN ACK chunk.
  */
-enum sctp_disposition sctp_sf_do_9_2_reshutack(
-                                       struct net *net,
-                                       const struct sctp_endpoint *ep,
-                                       const struct sctp_association *asoc,
-                                       const union sctp_subtype type,
-                                       void *arg,
-                                       struct sctp_cmd_seq *commands)
+static enum sctp_disposition
+__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
+                          const struct sctp_association *asoc,
+                          const union sctp_subtype type, void *arg,
+                          struct sctp_cmd_seq *commands)
 {
        struct sctp_chunk *chunk = arg;
        struct sctp_chunk *reply;
@@ -3010,6 +3016,26 @@ nomem:
        return SCTP_DISPOSITION_NOMEM;
 }
 
+enum sctp_disposition
+sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
+                        const struct sctp_association *asoc,
+                        const union sctp_subtype type, void *arg,
+                        struct sctp_cmd_seq *commands)
+{
+       struct sctp_chunk *chunk = arg;
+
+       if (!chunk->singleton)
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+       if (chunk->sctp_hdr->vtag != 0)
+               return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+
+       return __sctp_sf_do_9_2_reshutack(net, ep, asoc, type, arg, commands);
+}
+
 /*
  * sctp_sf_do_ecn_cwr
  *
@@ -3662,6 +3688,9 @@ enum sctp_disposition sctp_sf_ootb(struct net *net,
 
        SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
 
+       if (asoc && !sctp_vtag_verify(chunk, asoc))
+               asoc = NULL;
+
        ch = (struct sctp_chunkhdr *)chunk->chunk_hdr;
        do {
                /* Report violation if the chunk is less then minimal */
@@ -3777,12 +3806,6 @@ static enum sctp_disposition sctp_sf_shut_8_4_5(
 
        SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
 
-       /* If the chunk length is invalid, we don't want to process
-        * the reset of the packet.
-        */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
-               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-
        /* We need to discard the rest of the packet to prevent
         * potential bombing attacks from additional bundled chunks.
         * This is documented in SCTP Threats ID.
@@ -3810,6 +3833,9 @@ enum sctp_disposition sctp_sf_do_8_5_1_E_sa(struct net *net,
 {
        struct sctp_chunk *chunk = arg;
 
+       if (!sctp_vtag_verify(chunk, asoc))
+               asoc = NULL;
+
        /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3845,6 +3871,11 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net,
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }
 
+       /* Make sure that the ASCONF ADDIP chunk has a valid length.  */
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk)))
+               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+                                                 commands);
+
        /* ADD-IP: Section 4.1.1
         * This chunk MUST be sent in an authenticated way by using
         * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
@@ -3853,13 +3884,7 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net,
         */
        if (!asoc->peer.asconf_capable ||
            (!net->sctp.addip_noauth && !chunk->auth))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg,
-                                            commands);
-
-       /* Make sure that the ASCONF ADDIP chunk has a valid length.  */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk)))
-               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-                                                 commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        hdr = (struct sctp_addiphdr *)chunk->skb->data;
        serial = ntohl(hdr->serial);
@@ -3988,6 +4013,12 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net,
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }
 
+       /* Make sure that the ADDIP chunk has a valid length.  */
+       if (!sctp_chunk_length_valid(asconf_ack,
+                                    sizeof(struct sctp_addip_chunk)))
+               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+                                                 commands);
+
        /* ADD-IP, Section 4.1.2:
         * This chunk MUST be sent in an authenticated way by using
         * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
@@ -3996,14 +4027,7 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net,
         */
        if (!asoc->peer.asconf_capable ||
            (!net->sctp.addip_noauth && !asconf_ack->auth))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg,
-                                            commands);
-
-       /* Make sure that the ADDIP chunk has a valid length.  */
-       if (!sctp_chunk_length_valid(asconf_ack,
-                                    sizeof(struct sctp_addip_chunk)))
-               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-                                                 commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        addip_hdr = (struct sctp_addiphdr *)asconf_ack->skb->data;
        rcvd_serial = ntohl(addip_hdr->serial);
@@ -4575,6 +4599,9 @@ enum sctp_disposition sctp_sf_discard_chunk(struct net *net,
 {
        struct sctp_chunk *chunk = arg;
 
+       if (asoc && !sctp_vtag_verify(chunk, asoc))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* Make sure that the chunk has a valid length.
         * Since we don't know the chunk type, we use a general
         * chunkhdr structure to make a comparison.
@@ -4642,6 +4669,9 @@ enum sctp_disposition sctp_sf_violation(struct net *net,
 {
        struct sctp_chunk *chunk = arg;
 
+       if (!sctp_vtag_verify(chunk, asoc))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* Make sure that the chunk has a valid length. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -6348,6 +6378,7 @@ static struct sctp_packet *sctp_ootb_pkt_new(
                 * yet.
                 */
                switch (chunk->chunk_hdr->type) {
+               case SCTP_CID_INIT:
                case SCTP_CID_INIT_ACK:
                {
                        struct sctp_initack_chunk *initack;
index c038efc..78b663d 100644 (file)
@@ -1057,7 +1057,7 @@ static void smc_connect_work(struct work_struct *work)
        if (smc->clcsock->sk->sk_err) {
                smc->sk.sk_err = smc->clcsock->sk->sk_err;
        } else if ((1 << smc->clcsock->sk->sk_state) &
-                                       (TCPF_SYN_SENT | TCP_SYN_RECV)) {
+                                       (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
                if ((rc == -EPIPE) &&
                    ((1 << smc->clcsock->sk->sk_state) &
index 72f4b72..f1d3234 100644 (file)
@@ -1822,7 +1822,7 @@ void smc_llc_link_active(struct smc_link *link)
                            link->smcibdev->ibdev->name, link->ibport);
        link->state = SMC_LNK_ACTIVE;
        if (link->lgr->llc_testlink_time) {
-               link->llc_testlink_time = link->lgr->llc_testlink_time * HZ;
+               link->llc_testlink_time = link->lgr->llc_testlink_time;
                schedule_delayed_work(&link->llc_testlink_wrk,
                                      link->llc_testlink_time);
        }
index c9391d3..dc60c32 100644 (file)
@@ -2285,43 +2285,53 @@ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr)
        u16 key_gen = msg_key_gen(hdr);
        u16 size = msg_data_sz(hdr);
        u8 *data = msg_data(hdr);
+       unsigned int keylen;
+
+       /* Verify whether the size can exist in the packet */
+       if (unlikely(size < sizeof(struct tipc_aead_key) + TIPC_AEAD_KEYLEN_MIN)) {
+               pr_debug("%s: message data size is too small\n", rx->name);
+               goto exit;
+       }
+
+       keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME)));
+
+       /* Verify the supplied size values */
+       if (unlikely(size != keylen + sizeof(struct tipc_aead_key) ||
+                    keylen > TIPC_AEAD_KEY_SIZE_MAX)) {
+               pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name);
+               goto exit;
+       }
 
        spin_lock(&rx->lock);
        if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) {
                pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name,
                       rx->skey, key_gen, rx->key_gen);
-               goto exit;
+               goto exit_unlock;
        }
 
        /* Allocate memory for the key */
        skey = kmalloc(size, GFP_ATOMIC);
        if (unlikely(!skey)) {
                pr_err("%s: unable to allocate memory for skey\n", rx->name);
-               goto exit;
+               goto exit_unlock;
        }
 
        /* Copy key from msg data */
-       skey->keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME)));
+       skey->keylen = keylen;
        memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME);
        memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32),
               skey->keylen);
 
-       /* Sanity check */
-       if (unlikely(size != tipc_aead_key_size(skey))) {
-               kfree(skey);
-               skey = NULL;
-               goto exit;
-       }
-
        rx->key_gen = key_gen;
        rx->skey_mode = msg_key_mode(hdr);
        rx->skey = skey;
        rx->nokey = 0;
        mb(); /* for nokey flag */
 
-exit:
+exit_unlock:
        spin_unlock(&rx->lock);
 
+exit:
        /* Schedule the key attaching on this crypto */
        if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0)))
                return true;
index fde56ff..9ab81db 100644 (file)
@@ -681,12 +681,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 
        prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
        prot[TLS_BASE][TLS_SW].recvmsg            = tls_sw_recvmsg;
-       prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
+       prot[TLS_BASE][TLS_SW].sock_is_readable   = tls_sw_sock_is_readable;
        prot[TLS_BASE][TLS_SW].close              = tls_sk_proto_close;
 
        prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
        prot[TLS_SW][TLS_SW].recvmsg            = tls_sw_recvmsg;
-       prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read;
+       prot[TLS_SW][TLS_SW].sock_is_readable   = tls_sw_sock_is_readable;
        prot[TLS_SW][TLS_SW].close              = tls_sk_proto_close;
 
 #ifdef CONFIG_TLS_DEVICE
index 4feb95e..1b08b87 100644 (file)
@@ -35,6 +35,7 @@
  * SOFTWARE.
  */
 
+#include <linux/bug.h>
 #include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <linux/splice.h>
 #include <net/strparser.h>
 #include <net/tls.h>
 
+noinline void tls_err_abort(struct sock *sk, int err)
+{
+       WARN_ON_ONCE(err >= 0);
+       /* sk->sk_err should contain a positive error code. */
+       sk->sk_err = -err;
+       sk_error_report(sk);
+}
+
 static int __skb_nsg(struct sk_buff *skb, int offset, int len,
                      unsigned int recursion_level)
 {
@@ -419,7 +428,7 @@ int tls_tx_records(struct sock *sk, int flags)
 
 tx_err:
        if (rc < 0 && rc != -EAGAIN)
-               tls_err_abort(sk, EBADMSG);
+               tls_err_abort(sk, -EBADMSG);
 
        return rc;
 }
@@ -450,7 +459,7 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
 
                /* If err is already set on socket, return the same code */
                if (sk->sk_err) {
-                       ctx->async_wait.err = sk->sk_err;
+                       ctx->async_wait.err = -sk->sk_err;
                } else {
                        ctx->async_wait.err = err;
                        tls_err_abort(sk, err);
@@ -763,7 +772,7 @@ static int tls_push_record(struct sock *sk, int flags,
                               msg_pl->sg.size + prot->tail_size, i);
        if (rc < 0) {
                if (rc != -EINPROGRESS) {
-                       tls_err_abort(sk, EBADMSG);
+                       tls_err_abort(sk, -EBADMSG);
                        if (split) {
                                tls_ctx->pending_open_record_frags = true;
                                tls_merge_open_record(sk, rec, tmp, orig_end);
@@ -1827,7 +1836,7 @@ int tls_sw_recvmsg(struct sock *sk,
                err = decrypt_skb_update(sk, skb, &msg->msg_iter,
                                         &chunk, &zc, async_capable);
                if (err < 0 && err != -EINPROGRESS) {
-                       tls_err_abort(sk, EBADMSG);
+                       tls_err_abort(sk, -EBADMSG);
                        goto recv_end;
                }
 
@@ -2007,7 +2016,7 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
                }
 
                if (err < 0) {
-                       tls_err_abort(sk, EBADMSG);
+                       tls_err_abort(sk, -EBADMSG);
                        goto splice_read_end;
                }
                ctx->decrypted = 1;
@@ -2026,7 +2035,7 @@ splice_read_end:
        return copied ? : err;
 }
 
-bool tls_sw_stream_read(const struct sock *sk)
+bool tls_sw_sock_is_readable(struct sock *sk)
 {
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
index 89f9e85..78e08e8 100644 (file)
@@ -3052,6 +3052,8 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
        /* readable? */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                mask |= EPOLLIN | EPOLLRDNORM;
+       if (sk_is_readable(sk))
+               mask |= EPOLLIN | EPOLLRDNORM;
 
        /* Connection-based need to check for termination and startup */
        if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
@@ -3091,6 +3093,8 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
        /* readable? */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                mask |= EPOLLIN | EPOLLRDNORM;
+       if (sk_is_readable(sk))
+               mask |= EPOLLIN | EPOLLRDNORM;
 
        /* Connection-based need to check for termination and startup */
        if (sk->sk_type == SOCK_SEQPACKET) {
index b927e2b..452376c 100644 (file)
@@ -102,6 +102,7 @@ static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto
        *prot        = *base;
        prot->close  = sock_map_close;
        prot->recvmsg = unix_bpf_recvmsg;
+       prot->sock_is_readable = sk_msg_is_readable;
 }
 
 static void unix_stream_bpf_rebuild_protos(struct proto *prot,
@@ -110,6 +111,7 @@ static void unix_stream_bpf_rebuild_protos(struct proto *prot,
        *prot        = *base;
        prot->close  = sock_map_close;
        prot->recvmsg = unix_bpf_recvmsg;
+       prot->sock_is_readable = sk_msg_is_readable;
        prot->unhash  = sock_map_unhash;
 }
 
index 0332312..aaba847 100644 (file)
@@ -524,6 +524,7 @@ use_default_name:
        INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk);
        INIT_WORK(&rdev->mgmt_registrations_update_wk,
                  cfg80211_mgmt_registrations_update_wk);
+       spin_lock_init(&rdev->mgmt_registrations_lock);
 
 #ifdef CONFIG_CFG80211_DEFAULT_PS
        rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
@@ -1279,7 +1280,6 @@ void cfg80211_init_wdev(struct wireless_dev *wdev)
        INIT_LIST_HEAD(&wdev->event_list);
        spin_lock_init(&wdev->event_lock);
        INIT_LIST_HEAD(&wdev->mgmt_registrations);
-       spin_lock_init(&wdev->mgmt_registrations_lock);
        INIT_LIST_HEAD(&wdev->pmsr_list);
        spin_lock_init(&wdev->pmsr_lock);
        INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk);
index b35d0db..1720abf 100644 (file)
@@ -100,6 +100,8 @@ struct cfg80211_registered_device {
        struct work_struct propagate_cac_done_wk;
 
        struct work_struct mgmt_registrations_update_wk;
+       /* lock for all wdev lists */
+       spinlock_t mgmt_registrations_lock;
 
        /* must be last because of the way we do wiphy_priv(),
         * and it should at least be aligned to NETDEV_ALIGN */
index 3aa69b3..783acd2 100644 (file)
@@ -452,9 +452,9 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev)
 
        lockdep_assert_held(&rdev->wiphy.mtx);
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
        if (!wdev->mgmt_registrations_need_update) {
-               spin_unlock_bh(&wdev->mgmt_registrations_lock);
+               spin_unlock_bh(&rdev->mgmt_registrations_lock);
                return;
        }
 
@@ -479,7 +479,7 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev)
        rcu_read_unlock();
 
        wdev->mgmt_registrations_need_update = 0;
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        rdev_update_mgmt_frame_registrations(rdev, wdev, &upd);
 }
@@ -503,6 +503,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
                                int match_len, bool multicast_rx,
                                struct netlink_ext_ack *extack)
 {
+       struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_mgmt_registration *reg, *nreg;
        int err = 0;
        u16 mgmt_type;
@@ -548,7 +549,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
        if (!nreg)
                return -ENOMEM;
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
 
        list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
                int mlen = min(match_len, reg->match_len);
@@ -583,7 +584,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
                list_add(&nreg->list, &wdev->mgmt_registrations);
        }
        wdev->mgmt_registrations_need_update = 1;
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        cfg80211_mgmt_registrations_update(wdev);
 
@@ -591,7 +592,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
 
  out:
        kfree(nreg);
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        return err;
 }
@@ -602,7 +603,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_mgmt_registration *reg, *tmp;
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
 
        list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
                if (reg->nlportid != nlportid)
@@ -615,7 +616,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
                schedule_work(&rdev->mgmt_registrations_update_wk);
        }
 
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        if (nlportid && rdev->crit_proto_nlportid == nlportid) {
                rdev->crit_proto_nlportid = 0;
@@ -628,15 +629,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
 
 void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
 {
+       struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_mgmt_registration *reg, *tmp;
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
        list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
                list_del(&reg->list);
                kfree(reg);
        }
        wdev->mgmt_registrations_need_update = 1;
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        cfg80211_mgmt_registrations_update(wdev);
 }
@@ -784,7 +786,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
        data = buf + ieee80211_hdrlen(mgmt->frame_control);
        data_len = len - ieee80211_hdrlen(mgmt->frame_control);
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
 
        list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
                if (reg->frame_type != ftype)
@@ -808,7 +810,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
                break;
        }
 
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        trace_cfg80211_return_bool(result);
        return result;
index 11c68b1..adc0d14 100644 (file)
@@ -418,14 +418,17 @@ cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss,
        }
        ssid_len = ssid[1];
        ssid = ssid + 2;
-       rcu_read_unlock();
 
        /* check if nontrans_bss is in the list */
        list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) {
-               if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len))
+               if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) {
+                       rcu_read_unlock();
                        return 0;
+               }
        }
 
+       rcu_read_unlock();
+
        /* add to the list */
        list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list);
        return 0;
index 18dba3d..a1a99a5 100644 (file)
@@ -1028,14 +1028,14 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
            !(rdev->wiphy.interface_modes & (1 << ntype)))
                return -EOPNOTSUPP;
 
-       /* if it's part of a bridge, reject changing type to station/ibss */
-       if (netif_is_bridge_port(dev) &&
-           (ntype == NL80211_IFTYPE_ADHOC ||
-            ntype == NL80211_IFTYPE_STATION ||
-            ntype == NL80211_IFTYPE_P2P_CLIENT))
-               return -EBUSY;
-
        if (ntype != otype) {
+               /* if it's part of a bridge, reject changing type to station/ibss */
+               if (netif_is_bridge_port(dev) &&
+                   (ntype == NL80211_IFTYPE_ADHOC ||
+                    ntype == NL80211_IFTYPE_STATION ||
+                    ntype == NL80211_IFTYPE_P2P_CLIENT))
+                       return -EBUSY;
+
                dev->ieee80211_ptr->use_4addr = false;
                dev->ieee80211_ptr->mesh_id_up_len = 0;
                wdev_lock(dev->ieee80211_ptr);
index 5cd7020..b856afa 100644 (file)
@@ -787,6 +787,8 @@ $(OUTPUT)dlfilters/%.o: dlfilters/%.c include/perf/perf_dlfilter.h
        $(Q)$(MKDIR) -p $(OUTPUT)dlfilters
        $(QUIET_CC)$(CC) -c -Iinclude $(EXTRA_CFLAGS) -o $@ -fpic $<
 
+.SECONDARY: $(DLFILTERS:.so=.o)
+
 $(OUTPUT)dlfilters/%.so: $(OUTPUT)dlfilters/%.o
        $(QUIET_LINK)$(CC) $(EXTRA_CFLAGS) -shared -o $@ $<
 
index 3018a05..20cd624 100644 (file)
@@ -45,7 +45,7 @@ static const Dwfl_Callbacks offline_callbacks = {
  */
 static int check_return_reg(int ra_regno, Dwarf_Frame *frame)
 {
-       Dwarf_Op ops_mem[2];
+       Dwarf_Op ops_mem[3];
        Dwarf_Op dummy;
        Dwarf_Op *ops = &dummy;
        size_t nops;
index 6211d0b..c32c2eb 100644 (file)
@@ -459,7 +459,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session)
                return -EINVAL;
 
        if (PRINT_FIELD(WEIGHT) &&
-           evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT", PERF_OUTPUT_WEIGHT))
+           evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT_TYPE, "WEIGHT", PERF_OUTPUT_WEIGHT))
                return -EINVAL;
 
        if (PRINT_FIELD(SYM) &&
@@ -4039,11 +4039,15 @@ script_found:
                goto out_delete;
 
        uname(&uts);
-       if (data.is_pipe ||  /* assume pipe_mode indicates native_arch */
-           !strcmp(uts.machine, session->header.env.arch) ||
-           (!strcmp(uts.machine, "x86_64") &&
-            !strcmp(session->header.env.arch, "i386")))
+       if (data.is_pipe) { /* Assume pipe_mode indicates native_arch */
                native_arch = true;
+       } else if (session->header.env.arch) {
+               if (!strcmp(uts.machine, session->header.env.arch))
+                       native_arch = true;
+               else if (!strcmp(uts.machine, "x86_64") &&
+                        !strcmp(session->header.env.arch, "i386"))
+                       native_arch = true;
+       }
 
        script.session = session;
        script__setup_sample_type(&script);
index 5c59790..d88bb65 100644 (file)
@@ -949,7 +949,6 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
        int err, n;
        u32 key;
        char b;
-       int retries = 100;
 
        zero_verdict_count(verd_mapfd);
 
@@ -1002,17 +1001,11 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
                goto close_peer1;
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
-again:
-       n = read(c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close_peer1:
        xclose(p1);
@@ -1571,7 +1564,6 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd,
        const char *log_prefix = redir_mode_str(mode);
        int c0, c1, p0, p1;
        unsigned int pass;
-       int retries = 100;
        int err, n;
        int sfd[2];
        u32 key;
@@ -1606,17 +1598,11 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-       n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close:
        xclose(c1);
@@ -1748,7 +1734,6 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
        const char *log_prefix = redir_mode_str(mode);
        int c0, c1, p0, p1;
        unsigned int pass;
-       int retries = 100;
        int err, n;
        u32 key;
        char b;
@@ -1781,17 +1766,11 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-       n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close_cli1:
        xclose(c1);
@@ -1841,7 +1820,6 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd,
        const char *log_prefix = redir_mode_str(mode);
        int c0, c1, p0, p1;
        unsigned int pass;
-       int retries = 100;
        int err, n;
        int sfd[2];
        u32 key;
@@ -1876,17 +1854,11 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-       n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close_cli1:
        xclose(c1);
@@ -1932,7 +1904,6 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd,
        int sfd[2];
        u32 key;
        char b;
-       int retries = 100;
 
        zero_verdict_count(verd_mapfd);
 
@@ -1963,17 +1934,11 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-       n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close:
        xclose(c1);
index 8e67a25..3313566 100755 (executable)
@@ -445,10 +445,13 @@ cleanup()
                ip -netns ${NSA} link set dev ${NSA_DEV} down
                ip -netns ${NSA} link del dev ${NSA_DEV}
 
+               ip netns pids ${NSA} | xargs kill 2>/dev/null
                ip netns del ${NSA}
        fi
 
+       ip netns pids ${NSB} | xargs kill 2>/dev/null
        ip netns del ${NSB}
+       ip netns pids ${NSC} | xargs kill 2>/dev/null
        ip netns del ${NSC} >/dev/null 2>&1
 }
 
index 1af16d2..52497b7 100644 (file)
@@ -341,7 +341,7 @@ void split_file_backed_thp(void)
        }
 
        /* write something to the file, so a file-backed THP can be allocated */
-       num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc));
+       num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1);
        close(fd);
 
        if (num_written < 1) {