Merge tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
Pull btrfs updates from David Sterba:
 "The updates this time are more under the hood and enhancing existing
  features (subpage with compression and zoned namespaces).

  Performance related:

   - misc small inode logging improvements (+3% throughput, -11% latency
     on sample dbench workload)

   - more efficient directory logging: bulk item insertion, fewer tree
     searches and less locking

   - speed up bulk insertion of items into a b-tree, which is used when
     logging directories, when running delayed items for directories
     (fsync and transaction commits) and when running the slow path
     (full sync) of an fsync (bulk creation run time -4%, deletion -12%)

  Core:

   - continued subpage support
      - make defragmentation work
      - make compression write work

   - zoned mode
      - support ZNS (zoned namespaces); zone capacity is the number of
        usable blocks in each zone
      - add dedicated block group (zoned) for relocation, to prevent
        out of order writes in some cases
      - greedy block group reclaim, pick the ones with least usable
        space first

   - preparatory work for send protocol updates

   - error handling improvements

   - cleanups and refactoring

  Fixes:

   - lockdep warnings
      - in show_devname callback, on seeding device
      - device delete on loop device due to conversions to workqueues

   - fix deadlock between chunk allocation and chunk btree modifications

   - fix tracking of missing device count and status"

* tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (140 commits)
  btrfs: remove root argument from check_item_in_log()
  btrfs: remove root argument from add_link()
  btrfs: remove root argument from btrfs_unlink_inode()
  btrfs: remove root argument from drop_one_dir_item()
  btrfs: clear MISSING device status bit in btrfs_close_one_device
  btrfs: call btrfs_check_rw_degradable only if there is a missing device
  btrfs: send: prepare for v2 protocol
  btrfs: fix comment about sector sizes supported in 64K systems
  btrfs: update device path inode time instead of bd_inode
  fs: export an inode_update_time helper
  btrfs: fix deadlock when defragging transparent huge pages
  btrfs: sysfs: convert scnprintf and snprintf to sysfs_emit
  btrfs: make btrfs_super_block size match BTRFS_SUPER_INFO_SIZE
  btrfs: update comments for chunk allocation -ENOSPC cases
  btrfs: fix deadlock between chunk allocation and chunk btree modifications
  btrfs: zoned: use greedy gc for auto reclaim
  btrfs: check-integrity: stop storing the block device name in btrfsic_dev_state
  btrfs: use btrfs_get_dev_args_from_path in dev removal ioctls
  btrfs: add a btrfs_get_dev_args_from_path helper
  btrfs: handle device lookup with btrfs_dev_lookup_args
  ...

607 files changed:
Documentation/block/inline-encryption.rst
Documentation/block/queue-sysfs.rst
Documentation/cdrom/cdrom-standard.rst
Documentation/core-api/cachetlb.rst
Documentation/core-api/mm-api.rst
Documentation/filesystems/erofs.rst
Documentation/filesystems/fscrypt.rst
Documentation/filesystems/index.rst
Documentation/filesystems/locks.rst
Documentation/filesystems/netfs_library.rst
Documentation/userspace-api/ioctl/cdrom.rst
Documentation/userspace-api/ioctl/ioctl-number.rst
MAINTAINERS
Makefile
arch/arc/include/asm/cacheflush.h
arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts
arch/arm/include/asm/cacheflush.h
arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts
arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-s.dts
arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi
arch/arm64/boot/dts/qcom/sm8250.dtsi
arch/arm64/net/bpf_jit_comp.c
arch/m68k/emu/nfblock.c
arch/m68k/include/asm/cacheflush_mm.h
arch/mips/include/asm/cacheflush.h
arch/mips/rb532/prom.c
arch/mips/sibyte/common/cfe.c
arch/mips/sibyte/swarm/setup.c
arch/nds32/include/asm/cacheflush.h
arch/nds32/kernel/ftrace.c
arch/nios2/include/asm/cacheflush.h
arch/nios2/platform/Kconfig.platform
arch/openrisc/mm/init.c
arch/parisc/include/asm/cacheflush.h
arch/powerpc/platforms/cell/spufs/inode.c
arch/powerpc/platforms/pseries/iommu.c
arch/riscv/Kconfig
arch/riscv/include/asm/kasan.h
arch/riscv/kernel/head.S
arch/riscv/mm/kasan_init.c
arch/riscv/net/bpf_jit_core.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/sh/include/asm/cacheflush.h
arch/um/drivers/ubd_kern.c
arch/x86/crypto/sm4-aesni-avx-asm_64.S
arch/x86/crypto/sm4-aesni-avx2-asm_64.S
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/svm/sev.c
arch/x86/kvm/x86.c
arch/x86/kvm/xen.c
arch/xtensa/include/asm/cacheflush.h
arch/xtensa/platforms/iss/simdisk.c
block/Kconfig
block/Kconfig.iosched
block/Makefile
block/bdev.c
block/bfq-cgroup.c
block/bfq-iosched.c
block/bio-integrity.c
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-crypto-fallback.c
block/blk-crypto-internal.h
block/blk-crypto-profile.c [new file with mode: 0644]
block/blk-crypto.c
block/blk-exec.c
block/blk-flush.c
block/blk-ia-ranges.c [new file with mode: 0644]
block/blk-integrity.c
block/blk-iocost.c
block/blk-iolatency.c
block/blk-merge.c
block/blk-mq-debugfs.c
block/blk-mq-sched.c
block/blk-mq-sched.h
block/blk-mq-tag.c
block/blk-mq-tag.h
block/blk-mq.c
block/blk-mq.h
block/blk-rq-qos.h
block/blk-settings.c
block/blk-sysfs.c
block/blk-throttle.c
block/blk-throttle.h [new file with mode: 0644]
block/blk-wbt.c
block/blk.h
block/bounce.c
block/bsg-lib.c
block/elevator.c
block/elevator.h [moved from include/linux/elevator.h with 92% similarity]
block/fops.c
block/genhd.c
block/holder.c
block/ioctl.c
block/keyslot-manager.c [deleted file]
block/kyber-iosched.c
block/mq-deadline.c
block/partitions/Kconfig
block/partitions/core.c
block/partitions/efi.c
block/partitions/ibm.c
block/t10-pi.c
crypto/af_alg.c
drivers/ata/libata-core.c
drivers/ata/libata-scsi.c
drivers/base/regmap/regcache-rbtree.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/amiflop.c
drivers/block/aoe/aoeblk.c
drivers/block/ataflop.c
drivers/block/brd.c
drivers/block/cryptoloop.c [deleted file]
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_req.c
drivers/block/floppy.c
drivers/block/loop.c
drivers/block/loop.h
drivers/block/mtip32xx/mtip32xx.c
drivers/block/n64cart.c
drivers/block/nbd.c
drivers/block/null_blk/main.c
drivers/block/null_blk/null_blk.h
drivers/block/paride/pcd.c
drivers/block/paride/pd.c
drivers/block/paride/pf.c
drivers/block/pktcdvd.c
drivers/block/ps3vram.c
drivers/block/rbd.c
drivers/block/rnbd/rnbd-clt.c
drivers/block/rnbd/rnbd-proto.h
drivers/block/rsxx/core.c
drivers/block/rsxx/dev.c
drivers/block/swim.c
drivers/block/swim3.c
drivers/block/sx8.c
drivers/block/virtio_blk.c
drivers/block/xen-blkback/xenbus.c
drivers/block/xen-blkfront.c
drivers/block/zram/zram_drv.c
drivers/cdrom/cdrom.c
drivers/cdrom/gdrom.c
drivers/char/tpm/Kconfig
drivers/char/tpm/tpm2-space.c
drivers/char/tpm/tpm_tis_core.c
drivers/char/tpm/tpm_tis_core.h
drivers/char/tpm/tpm_tis_spi_main.c
drivers/clk/clk-composite.c
drivers/gpio/gpio-mlxbf2.c
drivers/gpio/gpio-xgs-iproc.c
drivers/gpu/drm/amd/amdgpu/nv.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c
drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c
drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c
drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c
drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
drivers/gpu/drm/amd/display/include/dal_asic_id.h
drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
drivers/gpu/drm/drm_panel_orientation_quirks.c
drivers/gpu/drm/i915/display/intel_dp.c
drivers/gpu/drm/i915/gt/intel_timeline.c
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/i915_trace.h
drivers/gpu/drm/i915/i915_utils.h
drivers/gpu/drm/i915/intel_dram.c
drivers/gpu/drm/selftests/test-drm_damage_helper.c
drivers/gpu/drm/ttm/ttm_bo_util.c
drivers/infiniband/core/sa_query.c
drivers/infiniband/hw/hfi1/pio.c
drivers/infiniband/hw/irdma/uk.c
drivers/infiniband/hw/irdma/verbs.c
drivers/infiniband/hw/irdma/ws.c
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/qedr/qedr.h
drivers/infiniband/hw/qedr/qedr_iw_cm.c
drivers/infiniband/hw/qedr/verbs.c
drivers/infiniband/hw/qib/qib_user_sdma.c
drivers/infiniband/sw/rdmavt/qp.c
drivers/md/bcache/bcache.h
drivers/md/bcache/bcache_ondisk.h [moved from include/uapi/linux/bcache.h with 99% similarity]
drivers/md/bcache/bset.h
drivers/md/bcache/btree.c
drivers/md/bcache/debug.c
drivers/md/bcache/features.c
drivers/md/bcache/features.h
drivers/md/bcache/io.c
drivers/md/bcache/request.c
drivers/md/bcache/request.h
drivers/md/bcache/super.c
drivers/md/bcache/sysfs.c
drivers/md/bcache/sysfs.h
drivers/md/bcache/util.h
drivers/md/bcache/writeback.c
drivers/md/dm-bio-record.h
drivers/md/dm-bufio.c
drivers/md/dm-cache-metadata.c
drivers/md/dm-cache-target.c
drivers/md/dm-clone-target.c
drivers/md/dm-core.h
drivers/md/dm-crypt.c
drivers/md/dm-dust.c
drivers/md/dm-ebs-target.c
drivers/md/dm-era-target.c
drivers/md/dm-exception-store.h
drivers/md/dm-flakey.c
drivers/md/dm-ima.c
drivers/md/dm-integrity.c
drivers/md/dm-linear.c
drivers/md/dm-log-writes.c
drivers/md/dm-log.c
drivers/md/dm-mpath.c
drivers/md/dm-ps-historical-service-time.c
drivers/md/dm-raid.c
drivers/md/dm-rq.c
drivers/md/dm-switch.c
drivers/md/dm-table.c
drivers/md/dm-thin-metadata.c
drivers/md/dm-thin.c
drivers/md/dm-verity-target.c
drivers/md/dm-writecache.c
drivers/md/dm-zoned-target.c
drivers/md/dm.c
drivers/md/md.c
drivers/md/md.h
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/mmc/core/block.c
drivers/mmc/core/crypto.c
drivers/mmc/core/sd.c
drivers/mmc/host/Kconfig
drivers/mmc/host/cqhci-core.c
drivers/mmc/host/cqhci-crypto.c
drivers/mmc/host/dw_mmc-exynos.c
drivers/mmc/host/mtk-sd.c
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/mmc/host/sdhci-pci-core.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/tmio_mmc_core.c
drivers/mmc/host/vub300.c
drivers/mtd/mtd_blkdevs.c
drivers/mtd/mtdsuper.c
drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
drivers/net/ethernet/intel/ice/ice_lag.c
drivers/net/ethernet/intel/ice/ice_ptp.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
drivers/net/ethernet/mellanox/mlxsw/pci.c
drivers/net/ethernet/microchip/lan743x_main.c
drivers/net/ethernet/netronome/nfp/bpf/main.c
drivers/net/ethernet/netronome/nfp/bpf/main.h
drivers/net/ethernet/netronome/nfp/bpf/offload.c
drivers/net/ethernet/nxp/lpc_eth.c
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/phy/phy.c
drivers/net/usb/lan78xx.c
drivers/net/usb/usbnet.c
drivers/net/vmxnet3/vmxnet3_drv.c
drivers/net/xen-netfront.c
drivers/nfc/port100.c
drivers/nvdimm/blk.c
drivers/nvdimm/btt.c
drivers/nvdimm/core.c
drivers/nvdimm/pmem.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fabrics.h
drivers/nvme/host/fc.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/nvme/host/zns.c
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/configfs.c
drivers/nvme/target/core.c
drivers/nvme/target/discovery.c
drivers/nvme/target/fabrics-cmd.c
drivers/nvme/target/io-cmd-bdev.c
drivers/nvme/target/io-cmd-file.c
drivers/nvme/target/loop.c
drivers/nvme/target/nvmet.h
drivers/nvme/target/rdma.c
drivers/nvme/target/tcp.c
drivers/reset/Kconfig
drivers/reset/reset-brcmstb-rescal.c
drivers/reset/reset-socfpga.c
drivers/reset/tegra/reset-bpmp.c
drivers/s390/block/dasd.c
drivers/s390/block/dasd_3990_erp.c
drivers/s390/block/dasd_eckd.c
drivers/s390/block/dasd_eckd.h
drivers/s390/block/dasd_erp.c
drivers/s390/block/dasd_genhd.c
drivers/s390/block/dasd_int.h
drivers/s390/block/dasd_ioctl.c
drivers/s390/block/dcssblk.c
drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
drivers/scsi/ibmvscsi/ibmvfc.c
drivers/scsi/lpfc/lpfc.h
drivers/scsi/mpt3sas/mpt3sas_scsih.c
drivers/scsi/qla2xxx/qla_nvme.c
drivers/scsi/scsi_bsg.c
drivers/scsi/scsi_debug.c
drivers/scsi/scsi_error.c
drivers/scsi/scsi_ioctl.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_scan.c
drivers/scsi/sd.c
drivers/scsi/sd.h
drivers/scsi/sd_dif.c
drivers/scsi/sg.c
drivers/scsi/sr.c
drivers/scsi/st.c
drivers/scsi/ufs/ufs-exynos.c
drivers/scsi/ufs/ufshcd-crypto.c
drivers/scsi/ufs/ufshcd-crypto.h
drivers/scsi/ufs/ufshcd.c
drivers/scsi/ufs/ufshcd.h
drivers/scsi/ufs/ufshpb.c
drivers/scsi/ufs/ufshpb.h
drivers/scsi/virtio_scsi.c
drivers/spi/spi-altera-dfl.c
drivers/spi/spi-altera-platform.c
drivers/spi/spi-pl022.c
drivers/target/target_core_file.c
drivers/target/target_core_iblock.c
drivers/target/target_core_pscsi.c
drivers/usb/gadget/function/f_fs.c
drivers/usb/gadget/legacy/inode.c
drivers/vdpa/vdpa_user/vduse_dev.c
drivers/virtio/virtio_ring.c
drivers/watchdog/iTCO_wdt.c
drivers/watchdog/ixp4xx_wdt.c
drivers/watchdog/omap_wdt.c
drivers/watchdog/sbsa_gwdt.c
fs/affs/super.c
fs/afs/write.c
fs/aio.c
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/lzo.c
fs/btrfs/volumes.c
fs/btrfs/zlib.c
fs/btrfs/zstd.c
fs/buffer.c
fs/cachefiles/io.c
fs/cachefiles/rdwr.c
fs/ceph/file.c
fs/ceph/locks.c
fs/cifs/file.c
fs/cramfs/inode.c
fs/crypto/bio.c
fs/crypto/fname.c
fs/crypto/fscrypt_private.h
fs/crypto/hkdf.c
fs/crypto/keysetup.c
fs/direct-io.c
fs/erofs/Kconfig
fs/erofs/Makefile
fs/erofs/compress.h
fs/erofs/data.c
fs/erofs/decompressor.c
fs/erofs/decompressor_lzma.c [new file with mode: 0644]
fs/erofs/erofs_fs.h
fs/erofs/inode.c
fs/erofs/internal.h
fs/erofs/pcpubuf.c
fs/erofs/super.c
fs/erofs/utils.c
fs/erofs/xattr.c
fs/erofs/zdata.c
fs/erofs/zdata.h
fs/erofs/zmap.c
fs/exfat/inode.c
fs/ext4/file.c
fs/ext4/super.c
fs/f2fs/compress.c
fs/f2fs/super.c
fs/fat/inode.c
fs/fs-writeback.c
fs/fuse/file.c
fs/gfs2/file.c
fs/hfs/mdb.c
fs/hfsplus/wrapper.c
fs/internal.h
fs/io-wq.c
fs/io-wq.h
fs/io_uring.c
fs/iomap/direct-io.c
fs/jfs/jfs_metapage.c
fs/jfs/resize.c
fs/jfs/super.c
fs/locks.c
fs/namei.c
fs/nfs/blocklayout/dev.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfsd/Kconfig
fs/nfsd/blocklayout.c
fs/nfsd/nfs4layouts.c
fs/nilfs2/ioctl.c
fs/nilfs2/super.c
fs/nilfs2/the_nilfs.c
fs/ntfs/file.c
fs/ntfs/super.c
fs/ntfs3/file.c
fs/ntfs3/inode.c
fs/ntfs3/super.c
fs/ocfs2/suballoc.c
fs/orangefs/inode.c
fs/orangefs/super.c
fs/overlayfs/file.c
fs/pstore/blk.c
fs/quota/quota.c
fs/ramfs/inode.c
fs/read_write.c
fs/reiserfs/super.c
fs/squashfs/super.c
fs/sync.c
fs/ubifs/crypto.c
fs/udf/lowlevel.c
fs/udf/super.c
fs/xfs/xfs_file.c
fs/zonefs/super.c
include/asm-generic/cacheflush.h
include/linux/ata.h
include/linux/backing-dev.h
include/linux/bio.h
include/linux/blk-crypto-profile.h [new file with mode: 0644]
include/linux/blk-integrity.h [new file with mode: 0644]
include/linux/blk-mq.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/blktrace_api.h
include/linux/bpf.h
include/linux/bpf_types.h
include/linux/bvec.h
include/linux/cdrom.h
include/linux/device-mapper.h
include/linux/filter.h
include/linux/flex_proportions.h
include/linux/fs.h
include/linux/fscrypt.h
include/linux/genhd.h
include/linux/gfp.h
include/linux/highmem-internal.h
include/linux/highmem.h
include/linux/huge_mm.h
include/linux/iomap.h
include/linux/keyslot-manager.h [deleted file]
include/linux/ksm.h
include/linux/libata.h
include/linux/memcontrol.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/mm_types.h
include/linux/mmc/host.h
include/linux/mmdebug.h
include/linux/netfs.h
include/linux/nvme-fc-driver.h
include/linux/nvme-rdma.h
include/linux/nvme.h
include/linux/page-flags.h
include/linux/page_idle.h
include/linux/page_owner.h
include/linux/page_ref.h
include/linux/pagemap.h
include/linux/part_stat.h
include/linux/percpu-refcount.h
include/linux/rmap.h
include/linux/sbitmap.h
include/linux/sched.h
include/linux/skmsg.h
include/linux/swap.h
include/linux/t10-pi.h
include/linux/tpm.h
include/linux/vmstat.h
include/linux/writeback.h
include/linux/xz.h
include/net/cfg80211.h
include/net/mptcp.h
include/net/sock.h
include/net/tls.h
include/net/udp.h
include/scsi/scsi_cmnd.h
include/scsi/scsi_device.h
include/trace/events/block.h
include/trace/events/erofs.h
include/trace/events/io_uring.h
include/trace/events/pagemap.h
include/trace/events/writeback.h
include/uapi/asm-generic/fcntl.h
include/uapi/linux/cdrom.h
include/uapi/linux/io_uring.h
init/main.c
kernel/acct.c
kernel/bpf/arraymap.c
kernel/bpf/core.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/cgroup/cgroup.c
kernel/events/uprobes.c
kernel/exit.c
kernel/fork.c
kernel/sched/core.c
kernel/sched/sched.h
kernel/trace/blktrace.c
kernel/trace/ftrace.c
kernel/trace/trace_eprobe.c
lib/decompress_unxz.c
lib/flex_proportions.c
lib/random32.c
lib/sbitmap.c
lib/xz/Kconfig
lib/xz/xz_dec_lzma2.c
lib/xz/xz_dec_stream.c
lib/xz/xz_dec_syms.c
lib/xz/xz_private.h
mm/Makefile
mm/backing-dev.c
mm/compaction.c
mm/damon/core-test.h
mm/filemap.c
mm/folio-compat.c [new file with mode: 0644]
mm/highmem.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/khugepaged.c
mm/ksm.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/mempolicy.c
mm/mempool.c
mm/memremap.c
mm/migrate.c
mm/mlock.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_io.c
mm/page_owner.c
mm/readahead.c
mm/rmap.c
mm/secretmem.c
mm/shmem.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/userfaultfd.c
mm/util.c
mm/vmalloc.c
mm/vmscan.c
mm/workingset.c
net/batman-adv/bridge_loop_avoidance.c
net/batman-adv/main.c
net/batman-adv/network-coding.c
net/batman-adv/translation-table.c
net/core/dev.c
net/core/net-sysfs.c
net/core/skbuff.c
net/core/skmsg.c
net/core/sock_destructor.h [new file with mode: 0644]
net/core/sysctl_net_core.c
net/ipv4/tcp.c
net/ipv4/tcp_bpf.c
net/ipv4/udp.c
net/ipv4/udp_bpf.c
net/mac80211/mesh.c
net/mptcp/options.c
net/sctp/sm_statefuns.c
net/smc/af_smc.c
net/smc/smc_llc.c
net/tipc/crypto.c
net/tls/tls_main.c
net/tls/tls_sw.c
net/unix/af_unix.c
net/unix/unix_bpf.c
net/wireless/core.c
net/wireless/core.h
net/wireless/mlme.c
net/wireless/scan.c
net/wireless/util.c
tools/perf/Makefile.perf
tools/perf/arch/powerpc/util/skip-callchain-idx.c
tools/perf/builtin-script.c
tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
tools/testing/selftests/net/fcnal-test.sh
tools/testing/selftests/vm/split_huge_page_test.c

index 7f9b40d..4d151fb 100644 (file)
@@ -1,5 +1,7 @@
 .. SPDX-License-Identifier: GPL-2.0
 
+.. _inline_encryption:
+
 =================
 Inline Encryption
 =================
@@ -7,230 +9,269 @@ Inline Encryption
 Background
 ==========
 
-Inline encryption hardware sits logically between memory and the disk, and can
-en/decrypt data as it goes in/out of the disk. Inline encryption hardware has a
-fixed number of "keyslots" - slots into which encryption contexts (i.e. the
-encryption key, encryption algorithm, data unit size) can be programmed by the
-kernel at any time. Each request sent to the disk can be tagged with the index
-of a keyslot (and also a data unit number to act as an encryption tweak), and
-the inline encryption hardware will en/decrypt the data in the request with the
-encryption context programmed into that keyslot. This is very different from
-full disk encryption solutions like self encrypting drives/TCG OPAL/ATA
-Security standards, since with inline encryption, any block on disk could be
-encrypted with any encryption context the kernel chooses.
-
+Inline encryption hardware sits logically between memory and disk, and can
+en/decrypt data as it goes in/out of the disk.  For each I/O request, software
+can control exactly how the inline encryption hardware will en/decrypt the data
+in terms of key, algorithm, data unit size (the granularity of en/decryption),
+and data unit number (a value that determines the initialization vector(s)).
+
+Some inline encryption hardware accepts all encryption parameters including raw
+keys directly in low-level I/O requests.  However, most inline encryption
+hardware instead has a fixed number of "keyslots" and requires that the key,
+algorithm, and data unit size first be programmed into a keyslot.  Each
+low-level I/O request then just contains a keyslot index and data unit number.
+
+Note that inline encryption hardware is very different from traditional crypto
+accelerators, which are supported through the kernel crypto API.  Traditional
+crypto accelerators operate on memory regions, whereas inline encryption
+hardware operates on I/O requests.  Thus, inline encryption hardware needs to be
+managed by the block layer, not the kernel crypto API.
+
+Inline encryption hardware is also very different from "self-encrypting drives",
+such as those based on the TCG Opal or ATA Security standards.  Self-encrypting
+drives don't provide fine-grained control of encryption and provide no way to
+verify the correctness of the resulting ciphertext.  Inline encryption hardware
+provides fine-grained control of encryption, including the choice of key and
+initialization vector for each sector, and can be tested for correctness.
 
 Objective
 =========
 
-We want to support inline encryption (IE) in the kernel.
-To allow for testing, we also want a crypto API fallback when actual
-IE hardware is absent. We also want IE to work with layered devices
-like dm and loopback (i.e. we want to be able to use the IE hardware
-of the underlying devices if present, or else fall back to crypto API
-en/decryption).
-
+We want to support inline encryption in the kernel.  To make testing easier, we
+also want support for falling back to the kernel crypto API when actual inline
+encryption hardware is absent.  We also want inline encryption to work with
+layered devices like device-mapper and loopback (i.e. we want to be able to use
+the inline encryption hardware of the underlying devices if present, or else
+fall back to crypto API en/decryption).
 
 Constraints and notes
 =====================
 
-- IE hardware has a limited number of "keyslots" that can be programmed
-  with an encryption context (key, algorithm, data unit size, etc.) at any time.
-  One can specify a keyslot in a data request made to the device, and the
-  device will en/decrypt the data using the encryption context programmed into
-  that specified keyslot. When possible, we want to make multiple requests with
-  the same encryption context share the same keyslot.
-
-- We need a way for upper layers like filesystems to specify an encryption
-  context to use for en/decrypting a struct bio, and a device driver (like UFS)
-  needs to be able to use that encryption context when it processes the bio.
-
-- We need a way for device drivers to expose their inline encryption
-  capabilities in a unified way to the upper layers.
-
-
-Design
-======
-
-We add a struct bio_crypt_ctx to struct bio that can
-represent an encryption context, because we need to be able to pass this
-encryption context from the upper layers (like the fs layer) to the
-device driver to act upon.
-
-While IE hardware works on the notion of keyslots, the FS layer has no
-knowledge of keyslots - it simply wants to specify an encryption context to
-use while en/decrypting a bio.
-
-We introduce a keyslot manager (KSM) that handles the translation from
-encryption contexts specified by the FS to keyslots on the IE hardware.
-This KSM also serves as the way IE hardware can expose its capabilities to
-upper layers. The generic mode of operation is: each device driver that wants
-to support IE will construct a KSM and set it up in its struct request_queue.
-Upper layers that want to use IE on this device can then use this KSM in
-the device's struct request_queue to translate an encryption context into
-a keyslot. The presence of the KSM in the request queue shall be used to mean
-that the device supports IE.
-
-The KSM uses refcounts to track which keyslots are idle (either they have no
-encryption context programmed, or there are no in-flight struct bios
-referencing that keyslot). When a new encryption context needs a keyslot, it
-tries to find a keyslot that has already been programmed with the same
-encryption context, and if there is no such keyslot, it evicts the least
-recently used idle keyslot and programs the new encryption context into that
-one. If no idle keyslots are available, then the caller will sleep until there
-is at least one.
-
-
-blk-mq changes, other block layer changes and blk-crypto-fallback
-=================================================================
-
-We add a pointer to a ``bi_crypt_context`` and ``keyslot`` to
-struct request. These will be referred to as the ``crypto fields``
-for the request. This ``keyslot`` is the keyslot into which the
-``bi_crypt_context`` has been programmed in the KSM of the ``request_queue``
-that this request is being sent to.
-
-We introduce ``block/blk-crypto-fallback.c``, which allows upper layers to remain
-blissfully unaware of whether or not real inline encryption hardware is present
-underneath. When a bio is submitted with a target ``request_queue`` that doesn't
-support the encryption context specified with the bio, the block layer will
-en/decrypt the bio with the blk-crypto-fallback.
-
-If the bio is a ``WRITE`` bio, a bounce bio is allocated, and the data in the bio
-is encrypted stored in the bounce bio - blk-mq will then proceed to process the
-bounce bio as if it were not encrypted at all (except when blk-integrity is
-concerned). ``blk-crypto-fallback`` sets the bounce bio's ``bi_end_io`` to an
-internal function that cleans up the bounce bio and ends the original bio.
-
-If the bio is a ``READ`` bio, the bio's ``bi_end_io`` (and also ``bi_private``)
-is saved and overwritten by ``blk-crypto-fallback`` to
-``bio_crypto_fallback_decrypt_bio``.  The bio's ``bi_crypt_context`` is also
-overwritten with ``NULL``, so that to the rest of the stack, the bio looks
-as if it was a regular bio that never had an encryption context specified.
-``bio_crypto_fallback_decrypt_bio`` will decrypt the bio, restore the original
-``bi_end_io`` (and also ``bi_private``) and end the bio again.
-
-Regardless of whether real inline encryption hardware is used or the
+- We need a way for upper layers (e.g. filesystems) to specify an encryption
+  context to use for en/decrypting a bio, and device drivers (e.g. UFSHCD) need
+  to be able to use that encryption context when they process the request.
+  Encryption contexts also introduce constraints on bio merging; the block layer
+  needs to be aware of these constraints.
+
+- Different inline encryption hardware has different supported algorithms,
+  supported data unit sizes, maximum data unit numbers, etc.  We call these
+  properties the "crypto capabilities".  We need a way for device drivers to
+  advertise crypto capabilities to upper layers in a generic way.
+
+- Inline encryption hardware usually (but not always) requires that keys be
+  programmed into keyslots before being used.  Since programming keyslots may be
+  slow and there may not be very many keyslots, we shouldn't just program the
+  key for every I/O request, but rather keep track of which keys are in the
+  keyslots and reuse an already-programmed keyslot when possible.
+
+- Upper layers typically define a specific end-of-life for crypto keys, e.g.
+  when an encrypted directory is locked or when a crypto mapping is torn down.
+  At these times, keys are wiped from memory.  We must provide a way for upper
+  layers to also evict keys from any keyslots they are present in.
+
+- When possible, device-mapper devices must be able to pass through the inline
+  encryption support of their underlying devices.  However, it doesn't make
+  sense for device-mapper devices to have keyslots themselves.
+
+Basic design
+============
+
+We introduce ``struct blk_crypto_key`` to represent an inline encryption key and
+how it will be used.  This includes the actual bytes of the key; the size of the
+key; the algorithm and data unit size the key will be used with; and the number
+of bytes needed to represent the maximum data unit number the key will be used
+with.
+
+We introduce ``struct bio_crypt_ctx`` to represent an encryption context.  It
+contains a data unit number and a pointer to a blk_crypto_key.  We add pointers
+to a bio_crypt_ctx to ``struct bio`` and ``struct request``; this allows users
+of the block layer (e.g. filesystems) to provide an encryption context when
+creating a bio and have it be passed down the stack for processing by the block
+layer and device drivers.  Note that the encryption context doesn't explicitly
+say whether to encrypt or decrypt, as that is implicit from the direction of the
+bio; WRITE means encrypt, and READ means decrypt.
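
As a rough illustration of the two structures just described, the sketch below
approximates their shape.  It is based on the 5.16-era ``<linux/blk-crypto.h>``;
the field names here are assumptions made for illustration, so consult that
header for the authoritative definitions::

    /* Approximate shape only -- see <linux/blk-crypto.h> for the real thing. */
    struct blk_crypto_config {
        enum blk_crypto_mode_num crypto_mode;   /* encryption algorithm */
        unsigned int data_unit_size;            /* en/decryption granularity */
        unsigned int dun_bytes;                 /* bytes needed for the max DUN */
    };

    struct blk_crypto_key {
        struct blk_crypto_config crypto_cfg;
        unsigned int data_unit_size_bits;
        unsigned int size;                      /* key size in bytes */
        u8 raw[BLK_CRYPTO_MAX_KEY_SIZE];        /* the actual key bytes */
    };

    struct bio_crypt_ctx {
        const struct blk_crypto_key *bc_key;
        u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];  /* data unit number */
    };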
+
+We also introduce ``struct blk_crypto_profile`` to contain all generic inline
+encryption-related state for a particular inline encryption device.  The
+blk_crypto_profile serves as the way that drivers for inline encryption hardware
+advertise their crypto capabilities and provide certain functions (e.g.,
+functions to program and evict keys) to upper layers.  Each device driver that
+wants to support inline encryption will construct a blk_crypto_profile, then
+associate it with the disk's request_queue.
+
+The blk_crypto_profile also manages the hardware's keyslots, when applicable.
+This happens in the block layer, so that users of the block layer can just
+specify encryption contexts and don't need to know about keyslots at all, nor do
+device drivers need to care about most details of keyslot management.
+
+Specifically, for each keyslot, the block layer (via the blk_crypto_profile)
+keeps track of which blk_crypto_key that keyslot contains (if any), and how many
+in-flight I/O requests are using it.  When the block layer creates a
+``struct request`` for a bio that has an encryption context, it grabs a keyslot
+that already contains the key if possible.  Otherwise it waits for an idle
+keyslot (a keyslot that isn't in-use by any I/O), then programs the key into the
+least-recently-used idle keyslot using the function the device driver provided.
+In both cases, the resulting keyslot is stored in the ``crypt_keyslot`` field of
+the request, where it is then accessible to device drivers and is released after
+the request completes.
+
+``struct request`` also contains a pointer to the original bio_crypt_ctx.
+Requests can be built from multiple bios, and the block layer must take the
+encryption context into account when trying to merge bios and requests.  For two
+bios/requests to be merged, they must have compatible encryption contexts: both
+unencrypted, or both encrypted with the same key and contiguous data unit
+numbers.  Only the encryption context for the first bio in a request is
+retained, since the remaining bios have been verified to be merge-compatible
+with the first bio.
+
+To make it possible for inline encryption to work with request_queue based
+layered devices, when a request is cloned, its encryption context is cloned as
+well.  When the cloned request is submitted, it is then processed as usual; this
+includes getting a keyslot from the clone's target device if needed.
+
+blk-crypto-fallback
+===================
+
+It is desirable for the inline encryption support of upper layers (e.g.
+filesystems) to be testable without real inline encryption hardware, and
+likewise for the block layer's keyslot management logic.  It is also desirable
+to allow upper layers to just always use inline encryption rather than have to
+implement encryption in multiple ways.
+
+Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
+of inline encryption using the kernel crypto API.  blk-crypto-fallback is built
+into the block layer, so it works on any block device without any special setup.
+Essentially, when a bio with an encryption context is submitted to a
+request_queue that doesn't support that encryption context, the block layer will
+handle en/decryption of the bio using blk-crypto-fallback.
+
+For encryption, the data cannot be encrypted in-place, as callers usually rely
+on it being unmodified.  Instead, blk-crypto-fallback allocates bounce pages,
+fills a new bio with those bounce pages, encrypts the data into those bounce
+pages, and submits that "bounce" bio.  When the bounce bio completes,
+blk-crypto-fallback completes the original bio.  If the original bio is too
+large, multiple bounce bios may be required; see the code for details.
+
+For decryption, blk-crypto-fallback "wraps" the bio's completion callback
+(``bi_end_io``) and private data (``bi_private``) with its own, unsets the
+bio's encryption context, then submits the bio.  If the read completes
+successfully, blk-crypto-fallback restores the bio's original completion
+callback and private data, then decrypts the bio's data in-place using the
+kernel crypto API.  Decryption happens from a workqueue, as it may sleep.
+Afterwards, blk-crypto-fallback completes the bio.
+
+In both cases, the bios that blk-crypto-fallback submits no longer have an
+encryption context.  Therefore, lower layers only see standard unencrypted I/O.
+
+blk-crypto-fallback also defines its own blk_crypto_profile and has its own
+"keyslots"; its keyslots contain ``struct crypto_skcipher`` objects.  The reason
+for this is twofold.  First, it allows the keyslot management logic to be tested
+without actual inline encryption hardware.  Second, similar to actual inline
+encryption hardware, the crypto API doesn't accept keys directly in requests but
+rather requires that keys be set ahead of time, and setting keys can be
+expensive; moreover, allocating a crypto_skcipher can't happen on the I/O path
+at all due to the locks it takes.  Therefore, the concept of keyslots still
+makes sense for blk-crypto-fallback.
+
+Note that regardless of whether real inline encryption hardware or
 blk-crypto-fallback is used, the ciphertext written to disk (and hence the
-on-disk format of data) will be the same (assuming the hardware's implementation
-of the algorithm being used adheres to spec and functions correctly).
-
-If a ``request queue``'s inline encryption hardware claimed to support the
-encryption context specified with a bio, then it will not be handled by the
-``blk-crypto-fallback``. We will eventually reach a point in blk-mq when a
-struct request needs to be allocated for that bio. At that point,
-blk-mq tries to program the encryption context into the ``request_queue``'s
-keyslot_manager, and obtain a keyslot, which it stores in its newly added
-``keyslot`` field. This keyslot is released when the request is completed.
-
-When the first bio is added to a request, ``blk_crypto_rq_bio_prep`` is called,
-which sets the request's ``crypt_ctx`` to a copy of the bio's
-``bi_crypt_context``. bio_crypt_do_front_merge is called whenever a subsequent
-bio is merged to the front of the request, which updates the ``crypt_ctx`` of
-the request so that it matches the newly merged bio's ``bi_crypt_context``. In particular, the request keeps a copy of the ``bi_crypt_context`` of the first
-bio in its bio-list (blk-mq needs to be careful to maintain this invariant
-during bio and request merges).
-
-To make it possible for inline encryption to work with request queue based
-layered devices, when a request is cloned, its ``crypto fields`` are cloned as
-well. When the cloned request is submitted, blk-mq programs the
-``bi_crypt_context`` of the request into the clone's request_queue's keyslot
-manager, and stores the returned keyslot in the clone's ``keyslot``.
+on-disk format of data) will be the same (assuming that both the inline
+encryption hardware's implementation and the kernel crypto API's implementation
+of the algorithm being used adhere to spec and function correctly).
 
+blk-crypto-fallback is optional and is controlled by the
+``CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK`` kernel configuration option.
 
 API presented to users of the block layer
 =========================================
 
-``struct blk_crypto_key`` represents a crypto key (the raw key, size of the
-key, the crypto algorithm to use, the data unit size to use, and the number of
-bytes required to represent data unit numbers that will be specified with the
-``bi_crypt_context``).
-
-``blk_crypto_init_key`` allows upper layers to initialize such a
-``blk_crypto_key``.
-
-``bio_crypt_set_ctx`` should be called on any bio that a user of
-the block layer wants en/decrypted via inline encryption (or the
-blk-crypto-fallback, if hardware support isn't available for the desired
-crypto configuration). This function takes the ``blk_crypto_key`` and the
-data unit number (DUN) to use when en/decrypting the bio.
-
-``blk_crypto_config_supported`` allows upper layers to query whether or not the
-an encryption context passed to request queue can be handled by blk-crypto
-(either by real inline encryption hardware, or by the blk-crypto-fallback).
-This is useful e.g. when blk-crypto-fallback is disabled, and the upper layer
-wants to use an algorithm that may not supported by hardware - this function
-lets the upper layer know ahead of time that the algorithm isn't supported,
-and the upper layer can fallback to something else if appropriate.
-
-``blk_crypto_start_using_key`` - Upper layers must call this function on
-``blk_crypto_key`` and a ``request_queue`` before using the key with any bio
-headed for that ``request_queue``. This function ensures that either the
-hardware supports the key's crypto settings, or the crypto API fallback has
-transforms for the needed mode allocated and ready to go. Note that this
-function may allocate an ``skcipher``, and must not be called from the data
-path, since allocating ``skciphers`` from the data path can deadlock.
-
-``blk_crypto_evict_key`` *must* be called by upper layers before a
-``blk_crypto_key`` is freed. Further, it *must* only be called only once
-there are no more in-flight requests that use that ``blk_crypto_key``.
-``blk_crypto_evict_key`` will ensure that a key is removed from any keyslots in
-inline encryption hardware that the key might have been programmed into (or the blk-crypto-fallback).
+``blk_crypto_config_supported()`` allows users to check ahead of time whether
+inline encryption with particular crypto settings will work on a particular
+request_queue -- either via hardware or via blk-crypto-fallback.  This function
+takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
+the actual bytes of the key and instead just contains the algorithm, data unit
+size, etc.  This function can be useful if blk-crypto-fallback is disabled.
+
+``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
+
+Users must call ``blk_crypto_start_using_key()`` before actually starting to use
+a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()``
+was called earlier).  This is needed to initialize blk-crypto-fallback if it
+will be needed.  This must not be called from the data path, as this may have to
+allocate resources, which may deadlock in that case.
+
+Next, to attach an encryption context to a bio, users should call
+``bio_crypt_set_ctx()``.  This function allocates a bio_crypt_ctx and attaches
+it to a bio, given the blk_crypto_key and the data unit number that will be used
+for en/decryption.  Users don't need to worry about freeing the bio_crypt_ctx
+later, as that happens automatically when the bio is freed or reset.
+
+Finally, when done using inline encryption with a blk_crypto_key on a
+request_queue, users must call ``blk_crypto_evict_key()``.  This ensures that
+the key is evicted from all keyslots it may be programmed into and unlinked from
+any kernel data structures it may be linked into.
+
+In summary, for users of the block layer, the lifecycle of a blk_crypto_key is
+as follows:
+
+1. ``blk_crypto_config_supported()`` (optional)
+2. ``blk_crypto_init_key()``
+3. ``blk_crypto_start_using_key()``
+4. ``bio_crypt_set_ctx()`` (potentially many times)
+5. ``blk_crypto_evict_key()`` (after all I/O has completed)
+6. Zeroize the blk_crypto_key (this has no dedicated function)
+
+If a blk_crypto_key is being used on multiple request_queues, then
+``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
+and ``blk_crypto_evict_key()`` must be called on each request_queue.
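
To make that lifecycle concrete, here is a minimal sketch of a hypothetical
block-layer user driving a blk_crypto_key through the steps above.  It assumes
the 5.16-era C API declared in ``<linux/blk-crypto.h>``; the exact signatures
are assumptions, ``example_write_encrypted()`` and its parameters are invented
for illustration, and error/cleanup handling is abbreviated::

    #include <linux/bio.h>
    #include <linux/blk-crypto.h>
    #include <linux/blkdev.h>
    #include <linux/string.h>

    /* Sketch only, not upstream code: encrypt one bio with AES-256-XTS.
     * raw_key is 64 bytes, as required for AES-256-XTS. */
    static int example_write_encrypted(struct request_queue *q, struct bio *bio,
                                       const u8 raw_key[64], u64 dun)
    {
        const struct blk_crypto_config cfg = {
            .crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
            .data_unit_size = 4096,
            .dun_bytes = 4,
        };
        struct blk_crypto_key key;
        u64 dun_array[BLK_CRYPTO_DUN_ARRAY_SIZE] = { dun };
        int err;

        /* 1. (Optional) check ahead of time that the settings can be handled. */
        if (!blk_crypto_config_supported(q, &cfg))
            return -EOPNOTSUPP;

        /* 2. Initialize the key: AES-256-XTS, 4096-byte data units. */
        err = blk_crypto_init_key(&key, raw_key, BLK_ENCRYPTION_MODE_AES_256_XTS,
                                  4 /* dun_bytes */, 4096);
        if (err)
            return err;

        /* 3. Prepare the hardware keyslots or blk-crypto-fallback for this key. */
        err = blk_crypto_start_using_key(&key, q);
        if (err)
            return err;

        /* 4. Attach the encryption context and submit; WRITE implies encrypt. */
        bio_crypt_set_ctx(bio, &key, dun_array, GFP_NOIO);
        err = submit_bio_wait(bio);

        /* 5. Once no I/O is using the key any more, evict it everywhere... */
        blk_crypto_evict_key(q, &key);
        /* 6. ...and zeroize the key material (no dedicated helper for this). */
        memzero_explicit(&key, sizeof(key));
        return err;
    }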
 
 API presented to device drivers
 ===============================
 
-A :c:type:``struct blk_keyslot_manager`` should be set up by device drivers in
-the ``request_queue`` of the device. The device driver needs to call
-``blk_ksm_init`` (or its resource-managed variant ``devm_blk_ksm_init``) on the
-``blk_keyslot_manager``, while specifying the number of keyslots supported by
-the hardware.
-
-The device driver also needs to tell the KSM how to actually manipulate the
-IE hardware in the device to do things like programming the crypto key into
-the IE hardware into a particular keyslot. All this is achieved through the
-struct blk_ksm_ll_ops field in the KSM that the device driver
-must fill up after initing the ``blk_keyslot_manager``.
-
-The KSM also handles runtime power management for the device when applicable
-(e.g. when it wants to program a crypto key into the IE hardware, the device
-must be runtime powered on) - so the device driver must also set the ``dev``
-field in the ksm to point to the `struct device` for the KSM to use for runtime
-power management.
-
-``blk_ksm_reprogram_all_keys`` can be called by device drivers if the device
-needs each and every of its keyslots to be reprogrammed with the key it
-"should have" at the point in time when the function is called. This is useful
-e.g. if a device loses all its keys on runtime power down/up.
-
-If the driver used ``blk_ksm_init`` instead of ``devm_blk_ksm_init``, then
-``blk_ksm_destroy`` should be called to free up all resources used by a
-``blk_keyslot_manager`` once it is no longer needed.
+A device driver that wants to support inline encryption must set up a
+blk_crypto_profile in the request_queue of its device.  To do this, it first
+must call ``blk_crypto_profile_init()`` (or its resource-managed variant
+``devm_blk_crypto_profile_init()``), providing the number of keyslots.
+
+Next, it must advertise its crypto capabilities by setting fields in the
+blk_crypto_profile, e.g. ``modes_supported`` and ``max_dun_bytes_supported``.
+
+It then must set function pointers in the ``ll_ops`` field of the
+blk_crypto_profile to tell upper layers how to control the inline encryption
+hardware, e.g. how to program and evict keyslots.  Most drivers will need to
+implement ``keyslot_program`` and ``keyslot_evict``.  For details, see the
+comments for ``struct blk_crypto_ll_ops``.
+
+Once the driver registers a blk_crypto_profile with a request_queue, I/O
+requests the driver receives via that queue may have an encryption context.  All
+encryption contexts will be compatible with the crypto capabilities declared in
+the blk_crypto_profile, so drivers don't need to worry about handling
+unsupported requests.  Also, if a nonzero number of keyslots was declared in the
+blk_crypto_profile, then all I/O requests that have an encryption context will
+also have a keyslot which was already programmed with the appropriate key.
+
+If the driver implements runtime suspend and its blk_crypto_ll_ops don't work
+while the device is runtime-suspended, then the driver must also set the ``dev``
+field of the blk_crypto_profile to point to the ``struct device`` that will be
+resumed before any of the low-level operations are called.
+
+If there are situations where the inline encryption hardware loses the contents
+of its keyslots, e.g. device resets, the driver must handle reprogramming the
+keyslots.  To do this, the driver may call ``blk_crypto_reprogram_all_keys()``.
+
+Finally, if the driver used ``blk_crypto_profile_init()`` instead of
+``devm_blk_crypto_profile_init()``, then it is responsible for calling
+``blk_crypto_profile_destroy()`` when the crypto profile is no longer needed.
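
As a rough sketch of the driver-side setup just described (assuming the
5.16-era API: the ``my_*`` names, the 32-keyslot count and the capability
values are invented, ``blk_crypto_register()`` is assumed to be the helper
that attaches the profile to the request_queue, and a real driver would do
this from its probe path with full error handling)::

    #include <linux/blk-crypto-profile.h>
    #include <linux/blkdev.h>

    struct my_host {
        struct device *dev;
        struct request_queue *q;
        struct blk_crypto_profile profile;
    };

    static int my_keyslot_program(struct blk_crypto_profile *profile,
                                  const struct blk_crypto_key *key,
                                  unsigned int slot)
    {
        /* Program key->raw (key->size bytes) into hardware keyslot 'slot'. */
        return 0;       /* hardware-specific work elided */
    }

    static int my_keyslot_evict(struct blk_crypto_profile *profile,
                                const struct blk_crypto_key *key,
                                unsigned int slot)
    {
        /* Clear hardware keyslot 'slot'. */
        return 0;       /* hardware-specific work elided */
    }

    static const struct blk_crypto_ll_ops my_crypto_ll_ops = {
        .keyslot_program = my_keyslot_program,
        .keyslot_evict   = my_keyslot_evict,
    };

    static int my_init_crypto(struct my_host *host)
    {
        int err;

        /* Resource-managed init; 32 keyslots is an arbitrary example. */
        err = devm_blk_crypto_profile_init(host->dev, &host->profile, 32);
        if (err)
            return err;

        host->profile.ll_ops = my_crypto_ll_ops;
        host->profile.dev = host->dev;  /* resumed before ll_ops calls */

        /* Advertise capabilities: AES-256-XTS with 4096-byte data units,
         * data unit numbers up to 8 bytes wide. */
        host->profile.max_dun_bytes_supported = 8;
        host->profile.modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] |= 4096;

        /* Expose the profile via the disk's request_queue. */
        if (!blk_crypto_register(&host->profile, host->q))
            return -EINVAL;
        return 0;
    }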
 
 Layered Devices
 ===============
 
-Request queue based layered devices like dm-rq that wish to support IE need to
-create their own keyslot manager for their request queue, and expose whatever
-functionality they choose. When a layered device wants to pass a clone of that
-request to another ``request_queue``, blk-crypto will initialize and prepare the
-clone as necessary - see ``blk_crypto_insert_cloned_request`` in
-``blk-crypto.c``.
-
-
-Future Optimizations for layered devices
-========================================
-
-Creating a keyslot manager for a layered device uses up memory for each
-keyslot, and in general, a layered device merely passes the request on to a
-"child" device, so the keyslots in the layered device itself are completely
-unused, and don't need any refcounting or keyslot programming. We can instead
-define a new type of KSM; the "passthrough KSM", that layered devices can use
-to advertise an unlimited number of keyslots, and support for any encryption
-algorithms they choose, while not actually using any memory for each keyslot.
-Another use case for the "passthrough KSM" is for IE devices that do not have a
-limited number of keyslots.
-
+Request queue based layered devices like dm-rq that wish to support inline
+encryption need to create their own blk_crypto_profile for their request_queue,
+and expose whatever functionality they choose. When a layered device wants to
+pass a clone of that request to another request_queue, blk-crypto will
+initialize and prepare the clone as necessary; see
+``blk_crypto_insert_cloned_request()``.
 
 Interaction between inline encryption and blk integrity
 =======================================================
@@ -257,7 +298,7 @@ Because there isn't any real hardware yet, it seems prudent to assume that
 hardware implementations might not implement both features together correctly,
 and disallow the combination for now. Whenever a device supports integrity, the
 kernel will pretend that the device does not support hardware inline encryption
-(by essentially setting the keyslot manager in the request_queue of the device
-to NULL). When the crypto API fallback is enabled, this means that all bios with
-and encryption context will use the fallback, and IO will complete as usual.
-When the fallback is disabled, a bio with an encryption context will be failed.
+(by setting the blk_crypto_profile in the request_queue of the device to NULL).
+When the crypto API fallback is enabled, this means that all bios with an
+encryption context will use the fallback, and IO will complete as usual.  When
+the fallback is disabled, a bio with an encryption context will be failed.
index 4dc7f0d..e8c7430 100644 (file)
@@ -4,7 +4,7 @@ Queue sysfs files
 
 This text file will detail the queue files that are located in the sysfs tree
 for each block device. Note that stacked devices typically do not export
-any settings, since their queue merely functions are a remapping target.
+any settings, since their queue merely functions as a remapping target.
 These files are the ones found in the /sys/block/xxx/queue/ directory.
 
 Files denoted with a RO postfix are readonly and the RW postfix means
@@ -286,4 +286,35 @@ sequential zones of zoned block devices (devices with a zoned attributed
 that reports "host-managed" or "host-aware"). This value is always 0 for
 regular block devices.
 
+independent_access_ranges (RO)
+------------------------------
+
+The presence of this sub-directory of the /sys/block/xxx/queue/ directory
+indicates that the device is capable of executing requests targeting
+different sector ranges in parallel. For instance, single LUN multi-actuator
+hard-disks will have an independent_access_ranges directory if the device
+correctly advertises the sector ranges of its actuators.
+
+The independent_access_ranges directory contains one directory per access
+range, with each range described using the sector (RO) attribute file to
+indicate the first sector of the range and the nr_sectors (RO) attribute file
+to indicate the total number of sectors in the range starting from the first
+sector of the range.  For example, a dual-actuator hard-disk will have the
+following independent_access_ranges entries::
+
+        $ tree /sys/block/<device>/queue/independent_access_ranges/
+        /sys/block/<device>/queue/independent_access_ranges/
+        |-- 0
+        |   |-- nr_sectors
+        |   `-- sector
+        `-- 1
+            |-- nr_sectors
+            `-- sector
+
+The sector and nr_sectors attributes use 512B sector units, regardless of
+the actual block size of the device. Independent access ranges do not
+overlap and include all sectors within the device capacity. The access
+ranges are numbered in increasing order of the range start sector,
+that is, the sector attribute of range 0 always has the value 0.
+
 Jens Axboe <jens.axboe@oracle.com>, February 2009
index 5845960..52ea7b6 100644 (file)
@@ -907,6 +907,17 @@ commands can be identified by the underscores in their names.
        specifies the slot for which the information is given. The special
        value *CDSL_CURRENT* requests that information about the currently
        selected slot be returned.
+`CDROM_TIMED_MEDIA_CHANGE`
+       Checks whether the disc has been changed since a user supplied time
+       and returns the time of the last disc change.
+
+       *arg* is a pointer to a *cdrom_timed_media_change_info* struct.
+       *arg->last_media_change* may be set by calling code to signal
+       the timestamp of the last known media change (by the caller).
+       Upon successful return, this ioctl call will set
+       *arg->last_media_change* to the latest media change timestamp (in ms)
+       known by the kernel/driver and set *arg->has_changed* to 1 if
+       that timestamp is more recent than the timestamp set by the caller.
 `CDROM_DRIVE_STATUS`
        Returns the status of the drive by a call to
        *drive_status()*. Return values are defined in cdrom_drive_status_.
index 8aed910..5c0552e 100644 (file)
@@ -326,6 +326,12 @@ maps this page at its virtual address.
                        dirty.  Again, see sparc64 for examples of how
                        to deal with this.
 
+  ``void flush_dcache_folio(struct folio *folio)``
+       This function is called under the same circumstances as
+       flush_dcache_page().  It allows the architecture to
+       optimise for flushing the entire folio of pages instead
+       of flushing one page at a time.
+
   ``void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
   unsigned long user_vaddr, void *dst, void *src, int len)``
   ``void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
index a42f9ba..395835f 100644 (file)
@@ -95,6 +95,11 @@ More Memory Management Functions
 .. kernel-doc:: mm/mempolicy.c
 .. kernel-doc:: include/linux/mm_types.h
    :internal:
+.. kernel-doc:: include/linux/mm_inline.h
+.. kernel-doc:: include/linux/page-flags.h
 .. kernel-doc:: include/linux/mm.h
    :internal:
+.. kernel-doc:: include/linux/page_ref.h
 .. kernel-doc:: include/linux/mmzone.h
+.. kernel-doc:: mm/util.c
+   :functions: folio_mapping
index b97579b..01df283 100644 (file)
@@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios:
    immutable and bit-for-bit identical to the official golden image for
    their releases due to security and other considerations and
 
- - hope to save some extra storage space with guaranteed end-to-end performance
-   by using reduced metadata and transparent file compression, especially
-   for those embedded devices with limited memory (ex, smartphone);
+ - hope to minimize extra storage space with guaranteed end-to-end performance
+   by using compact layout, transparent file compression and direct access,
+   especially for those embedded devices with limited memory and high-density
+   hosts with numerous containers;
 
 Here is the main features of EROFS:
 
@@ -51,7 +52,9 @@ Here is the main features of EROFS:
  - Support POSIX.1e ACLs by using xattrs;
 
  - Support transparent data compression as an option:
-   LZ4 algorithm with the fixed-sized output compression for high performance.
+   LZ4 algorithm with the fixed-sized output compression for high performance;
+
+ - Multiple device support for multi-layer container images.
 
 The following git tree provides the file system user-space tools under
 development (ex, formatting tool mkfs.erofs):
@@ -87,6 +90,7 @@ cache_strategy=%s      Select a strategy for cached decompression from now on:
 dax={always,never}     Use direct access (no page cache).  See
                        Documentation/filesystems/dax.rst.
 dax                    A legacy option which is an alias for ``dax=always``.
+device=%s              Specify a path to an extra device to be used together with the primary device.
 ===================    =========================================================
 
 On-disk details
index 0eb799d..4d5d50d 100644 (file)
@@ -77,11 +77,11 @@ Side-channel attacks
 
 fscrypt is only resistant to side-channel attacks, such as timing or
 electromagnetic attacks, to the extent that the underlying Linux
-Cryptographic API algorithms are.  If a vulnerable algorithm is used,
-such as a table-based implementation of AES, it may be possible for an
-attacker to mount a side channel attack against the online system.
-Side channel attacks may also be mounted against applications
-consuming decrypted data.
+Cryptographic API algorithms or inline encryption hardware are.  If a
+vulnerable algorithm is used, such as a table-based implementation of
+AES, it may be possible for an attacker to mount a side channel attack
+against the online system.  Side channel attacks may also be mounted
+against applications consuming decrypted data.
 
 Unauthorized file access
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -176,11 +176,11 @@ Master Keys
 
 Each encrypted directory tree is protected by a *master key*.  Master
 keys can be up to 64 bytes long, and must be at least as long as the
-greater of the key length needed by the contents and filenames
-encryption modes being used.  For example, if AES-256-XTS is used for
-contents encryption, the master key must be 64 bytes (512 bits).  Note
-that the XTS mode is defined to require a key twice as long as that
-required by the underlying block cipher.
+greater of the security strength of the contents and filenames
+encryption modes being used.  For example, if any AES-256 mode is
+used, the master key must be at least 256 bits, i.e. 32 bytes.  A
+stricter requirement applies if the key is used by a v1 encryption
+policy and AES-256-XTS is used; such keys must be 64 bytes.
 
 To "unlock" an encrypted directory tree, userspace must provide the
 appropriate master key.  There can be any number of master keys, each
@@ -1135,6 +1135,50 @@ where applications may later write sensitive data.  It is recommended
 that systems implementing a form of "verified boot" take advantage of
 this by validating all top-level encryption policies prior to access.
 
+Inline encryption support
+=========================
+
+By default, fscrypt uses the kernel crypto API for all cryptographic
+operations (other than HKDF, which fscrypt partially implements
+itself).  The kernel crypto API supports hardware crypto accelerators,
+but only ones that work in the traditional way where all inputs and
+outputs (e.g. plaintexts and ciphertexts) are in memory.  fscrypt can
+take advantage of such hardware, but the traditional acceleration
+model isn't particularly efficient and fscrypt hasn't been optimized
+for it.
+
+Instead, many newer systems (especially mobile SoCs) have *inline
+encryption hardware* that can encrypt/decrypt data while it is on its
+way to/from the storage device.  Linux supports inline encryption
+through a set of extensions to the block layer called *blk-crypto*.
+blk-crypto allows filesystems to attach encryption contexts to bios
+(I/O requests) to specify how the data will be encrypted or decrypted
+in-line.  For more information about blk-crypto, see
+:ref:`Documentation/block/inline-encryption.rst <inline_encryption>`.
+
+On supported filesystems (currently ext4 and f2fs), fscrypt can use
+blk-crypto instead of the kernel crypto API to encrypt/decrypt file
+contents.  To enable this, set CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y in
+the kernel configuration, and specify the "inlinecrypt" mount option
+when mounting the filesystem.
+
+Note that the "inlinecrypt" mount option just specifies to use inline
+encryption when possible; it doesn't force its use.  fscrypt will
+still fall back to using the kernel crypto API on files where the
+inline encryption hardware doesn't have the needed crypto capabilities
+(e.g. support for the needed encryption algorithm and data unit size)
+and where blk-crypto-fallback is unusable.  (For blk-crypto-fallback
+to be usable, it must be enabled in the kernel configuration with
+CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y.)
+
+Currently fscrypt always uses the filesystem block size (which is
+usually 4096 bytes) as the data unit size.  Therefore, it can only use
+inline encryption hardware that supports that data unit size.
+
+Inline encryption doesn't affect the ciphertext or other aspects of
+the on-disk format, so users may freely switch back and forth between
+using "inlinecrypt" and not using "inlinecrypt".
+
 Implementation details
 ======================
 
@@ -1184,6 +1228,13 @@ keys`_ and `DIRECT_KEY policies`_.
 Data path changes
 -----------------
 
+When inline encryption is used, filesystems just need to associate
+encryption contexts with bios to specify how the block layer or the
+inline encryption hardware will encrypt/decrypt the file contents.
+
+When inline encryption isn't used, filesystems must encrypt/decrypt
+the file contents themselves, as described below:
+
 For the read path (->readpage()) of regular files, filesystems can
 read the ciphertext into the page cache and decrypt it in-place.  The
 page lock must be held until decryption has finished, to prevent the
@@ -1197,18 +1248,6 @@ buffer.  Some filesystems, such as UBIFS, already use temporary
 buffers regardless of encryption.  Other filesystems, such as ext4 and
 F2FS, have to allocate bounce pages specially for encryption.
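
Roughly, those per-page steps reduce to the fs/crypto helpers; a simplified
sketch, in which the wrapper names are illustrative and real filesystems add
locking and error handling::

  #include <linux/fscrypt.h>
  #include <linux/pagemap.h>

  /* Read path: ciphertext has been read into @page; decrypt it in place. */
  static int example_post_read(struct page *page)
  {
          return fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);
  }

  /*
   * Write path: encrypt into a bounce page, which is what actually gets
   * written out; the return value may be an ERR_PTR on failure.
   */
  static struct page *example_prepare_write(struct page *page, u64 lblk_num)
  {
          return fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0,
                                                  lblk_num, GFP_NOFS);
  }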
 
-Fscrypt is also able to use inline encryption hardware instead of the
-kernel crypto API for en/decryption of file contents.  When possible,
-and if directed to do so (by specifying the 'inlinecrypt' mount option
-for an ext4/F2FS filesystem), it adds encryption contexts to bios and
-uses blk-crypto to perform the en/decryption instead of making use of
-the above read/write path changes.  Of course, even if directed to
-make use of inline encryption, fscrypt will only be able to do so if
-either hardware inline encryption support is available for the
-selected encryption algorithm or CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK
-is selected.  If neither is the case, fscrypt will fall back to using
-the above mentioned read/write path changes for en/decryption.
-
 Filename hashing and encoding
 -----------------------------
 
index c0ad233..bee63d4 100644 (file)
@@ -29,7 +29,6 @@ algorithms work.
    fiemap
    files
    locks
-   mandatory-locking
    mount_api
    quota
    seq_file
index c5ae858..2642931 100644 (file)
@@ -57,16 +57,9 @@ fcntl(), with all the problems that implies.
 1.3 Mandatory Locking As A Mount Option
 ---------------------------------------
 
-Mandatory locking, as described in
-'Documentation/filesystems/mandatory-locking.rst' was prior to this release a
-general configuration option that was valid for all mounted filesystems.  This
-had a number of inherent dangers, not the least of which was the ability to
-freeze an NFS server by asking it to read a file for which a mandatory lock
-existed.
-
-From this release of the kernel, mandatory locking can be turned on and off
-on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
-The default is to disallow mandatory locking. The intention is that
-mandatory locking only be enabled on a local filesystem as the specific need
-arises.
+Mandatory locking used to be a general configuration option that applied to
+all mounted filesystems.  This had a number of inherent dangers, not the
+least of which was the ability to freeze an NFS server by asking it to read
+a file for which a mandatory lock existed.
 
+Support for mandatory locking was dropped in kernel v5.14.
index 57a6418..bb68d39 100644 (file)
@@ -524,3 +524,5 @@ Note that these methods are passed a pointer to the cache resource structure,
 not the read request structure as they could be used in other situations where
 there isn't a read request structure as well, such as writing dirty data to the
 cache.
+
+.. kernel-doc:: include/linux/netfs.h
index 3b4c050..682948f 100644 (file)
@@ -13,61 +13,64 @@ in drivers/cdrom/cdrom.c and drivers/block/scsi_ioctl.c
 ioctl values are listed in <linux/cdrom.h>.  As of this writing, they
 are as follows:
 
-       ======================  ===============================================
-       CDROMPAUSE              Pause Audio Operation
-       CDROMRESUME             Resume paused Audio Operation
-       CDROMPLAYMSF            Play Audio MSF (struct cdrom_msf)
-       CDROMPLAYTRKIND         Play Audio Track/index (struct cdrom_ti)
-       CDROMREADTOCHDR         Read TOC header (struct cdrom_tochdr)
-       CDROMREADTOCENTRY       Read TOC entry (struct cdrom_tocentry)
-       CDROMSTOP               Stop the cdrom drive
-       CDROMSTART              Start the cdrom drive
-       CDROMEJECT              Ejects the cdrom media
-       CDROMVOLCTRL            Control output volume (struct cdrom_volctrl)
-       CDROMSUBCHNL            Read subchannel data (struct cdrom_subchnl)
-       CDROMREADMODE2          Read CDROM mode 2 data (2336 Bytes)
-                               (struct cdrom_read)
-       CDROMREADMODE1          Read CDROM mode 1 data (2048 Bytes)
-                               (struct cdrom_read)
-       CDROMREADAUDIO          (struct cdrom_read_audio)
-       CDROMEJECT_SW           enable(1)/disable(0) auto-ejecting
-       CDROMMULTISESSION       Obtain the start-of-last-session
-                               address of multi session disks
-                               (struct cdrom_multisession)
-       CDROM_GET_MCN           Obtain the "Universal Product Code"
-                               if available (struct cdrom_mcn)
-       CDROM_GET_UPC           Deprecated, use CDROM_GET_MCN instead.
-       CDROMRESET              hard-reset the drive
-       CDROMVOLREAD            Get the drive's volume setting
-                               (struct cdrom_volctrl)
-       CDROMREADRAW            read data in raw mode (2352 Bytes)
-                               (struct cdrom_read)
-       CDROMREADCOOKED         read data in cooked mode
-       CDROMSEEK               seek msf address
-       CDROMPLAYBLK            scsi-cd only, (struct cdrom_blk)
-       CDROMREADALL            read all 2646 bytes
-       CDROMGETSPINDOWN        return 4-bit spindown value
-       CDROMSETSPINDOWN        set 4-bit spindown value
-       CDROMCLOSETRAY          pendant of CDROMEJECT
-       CDROM_SET_OPTIONS       Set behavior options
-       CDROM_CLEAR_OPTIONS     Clear behavior options
-       CDROM_SELECT_SPEED      Set the CD-ROM speed
-       CDROM_SELECT_DISC       Select disc (for juke-boxes)
-       CDROM_MEDIA_CHANGED     Check is media changed
-       CDROM_DRIVE_STATUS      Get tray position, etc.
-       CDROM_DISC_STATUS       Get disc type, etc.
-       CDROM_CHANGER_NSLOTS    Get number of slots
-       CDROM_LOCKDOOR          lock or unlock door
-       CDROM_DEBUG             Turn debug messages on/off
-       CDROM_GET_CAPABILITY    get capabilities
-       CDROMAUDIOBUFSIZ        set the audio buffer size
-       DVD_READ_STRUCT         Read structure
-       DVD_WRITE_STRUCT        Write structure
-       DVD_AUTH                Authentication
-       CDROM_SEND_PACKET       send a packet to the drive
-       CDROM_NEXT_WRITABLE     get next writable block
-       CDROM_LAST_WRITTEN      get last block written on disc
-       ======================  ===============================================
+       ========================  ===============================================
+       CDROMPAUSE                Pause Audio Operation
+       CDROMRESUME               Resume paused Audio Operation
+       CDROMPLAYMSF              Play Audio MSF (struct cdrom_msf)
+       CDROMPLAYTRKIND           Play Audio Track/index (struct cdrom_ti)
+       CDROMREADTOCHDR           Read TOC header (struct cdrom_tochdr)
+       CDROMREADTOCENTRY         Read TOC entry (struct cdrom_tocentry)
+       CDROMSTOP                 Stop the cdrom drive
+       CDROMSTART                Start the cdrom drive
+       CDROMEJECT                Ejects the cdrom media
+       CDROMVOLCTRL              Control output volume (struct cdrom_volctrl)
+       CDROMSUBCHNL              Read subchannel data (struct cdrom_subchnl)
+       CDROMREADMODE2            Read CDROM mode 2 data (2336 Bytes)
+                                 (struct cdrom_read)
+       CDROMREADMODE1            Read CDROM mode 1 data (2048 Bytes)
+                                 (struct cdrom_read)
+       CDROMREADAUDIO            (struct cdrom_read_audio)
+       CDROMEJECT_SW             enable(1)/disable(0) auto-ejecting
+       CDROMMULTISESSION         Obtain the start-of-last-session
+                                 address of multi session disks
+                                 (struct cdrom_multisession)
+       CDROM_GET_MCN             Obtain the "Universal Product Code"
+                                 if available (struct cdrom_mcn)
+       CDROM_GET_UPC             Deprecated, use CDROM_GET_MCN instead.
+       CDROMRESET                hard-reset the drive
+       CDROMVOLREAD              Get the drive's volume setting
+                                 (struct cdrom_volctrl)
+       CDROMREADRAW              read data in raw mode (2352 Bytes)
+                                 (struct cdrom_read)
+       CDROMREADCOOKED           read data in cooked mode
+       CDROMSEEK                 seek msf address
+       CDROMPLAYBLK              scsi-cd only, (struct cdrom_blk)
+       CDROMREADALL              read all 2646 bytes
+       CDROMGETSPINDOWN          return 4-bit spindown value
+       CDROMSETSPINDOWN          set 4-bit spindown value
+       CDROMCLOSETRAY            counterpart of CDROMEJECT (close the tray)
+       CDROM_SET_OPTIONS         Set behavior options
+       CDROM_CLEAR_OPTIONS       Clear behavior options
+       CDROM_SELECT_SPEED        Set the CD-ROM speed
+       CDROM_SELECT_DISC         Select disc (for juke-boxes)
+       CDROM_MEDIA_CHANGED       Check if media changed
+       CDROM_TIMED_MEDIA_CHANGE  Check if media changed
+                                 since given time
+                                 (struct cdrom_timed_media_change_info)
+       CDROM_DRIVE_STATUS        Get tray position, etc.
+       CDROM_DISC_STATUS         Get disc type, etc.
+       CDROM_CHANGER_NSLOTS      Get number of slots
+       CDROM_LOCKDOOR            lock or unlock door
+       CDROM_DEBUG               Turn debug messages on/off
+       CDROM_GET_CAPABILITY      get capabilities
+       CDROMAUDIOBUFSIZ          set the audio buffer size
+       DVD_READ_STRUCT           Read structure
+       DVD_WRITE_STRUCT          Write structure
+       DVD_AUTH                  Authentication
+       CDROM_SEND_PACKET         send a packet to the drive
+       CDROM_NEXT_WRITABLE       get next writable block
+       CDROM_LAST_WRITTEN        get last block written on disc
+       ========================  ===============================================
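
For orientation, a minimal userspace example of issuing two of the ioctls
listed above; the device path is a placeholder and error handling is kept
to a minimum::

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <unistd.h>
  #include <linux/cdrom.h>

  int main(void)
  {
          /* O_NONBLOCK allows opening the device even without media present. */
          int fd = open("/dev/cdrom", O_RDONLY | O_NONBLOCK);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          /* Tray/disc state: returns one of the CDS_* constants. */
          printf("drive status: %d\n",
                 ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT));

          /* Returns 1 if the media changed since the last check, 0 otherwise. */
          printf("media changed: %d\n",
                 ioctl(fd, CDROM_MEDIA_CHANGED, CDSL_CURRENT));

          close(fd);
          return 0;
  }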
 
 
 The information that follows was determined from reading kernel source
index 2e81340..6655d92 100644 (file)
@@ -104,6 +104,7 @@ Code  Seq#    Include File                                           Comments
 '8'   all                                                            SNP8023 advanced NIC card
                                                                      <mailto:mcr@solidum.com>
 ';'   64-7F  linux/vfio.h
+'='   00-3f  uapi/linux/ptp_clock.h                                  <mailto:richardcochran@gmail.com>
 '@'   00-0F  linux/radeonfb.h                                        conflict!
 '@'   00-0F  drivers/video/aty/aty128fb.c                            conflict!
 'A'   00-1F  linux/apm_bios.h                                        conflict!
index f26920f..3b79fd4 100644 (file)
@@ -5458,6 +5458,19 @@ F:       include/net/devlink.h
 F:     include/uapi/linux/devlink.h
 F:     net/core/devlink.c
 
+DH ELECTRONICS IMX6 DHCOM BOARD SUPPORT
+M:     Christoph Niedermaier <cniedermaier@dh-electronics.com>
+L:     kernel@dh-electronics.com
+S:     Maintained
+F:     arch/arm/boot/dts/imx6*-dhcom-*
+
+DH ELECTRONICS STM32MP1 DHCOM/DHCOR BOARD SUPPORT
+M:     Marek Vasut <marex@denx.de>
+L:     kernel@dh-electronics.com
+S:     Maintained
+F:     arch/arm/boot/dts/stm32mp1*-dhcom-*
+F:     arch/arm/boot/dts/stm32mp1*-dhcor-*
+
 DIALOG SEMICONDUCTOR DRIVERS
 M:     Support Opensource <support.opensource@diasemi.com>
 S:     Supported
@@ -6147,8 +6160,7 @@ T:        git git://anongit.freedesktop.org/drm/drm
 F:     Documentation/devicetree/bindings/display/
 F:     Documentation/devicetree/bindings/gpu/
 F:     Documentation/gpu/
-F:     drivers/gpu/drm/
-F:     drivers/gpu/vga/
+F:     drivers/gpu/
 F:     include/drm/
 F:     include/linux/vga*
 F:     include/uapi/drm/
@@ -11278,7 +11290,6 @@ F:      Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst
 F:     drivers/net/ethernet/marvell/octeontx2/af/
 
 MARVELL PRESTERA ETHERNET SWITCH DRIVER
-M:     Vadym Kochan <vkochan@marvell.com>
 M:     Taras Chornyi <tchornyi@marvell.com>
 S:     Supported
 W:     https://github.com/Marvell-switching/switchdev-prestera
index 30c7c81..a523163 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@
 VERSION = 5
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
-NAME = Opossums on Parade
+EXTRAVERSION =
+NAME = Trick or Treat
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
@@ -1115,7 +1115,8 @@ export MODORDER := $(extmod_prefix)modules.order
 export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y         += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y                 += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
+core-$(CONFIG_BLOCK)   += block/
 
 vmlinux-dirs   := $(patsubst %/,%,$(filter %/, \
                     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
index e201b4b..e8c2c74 100644 (file)
@@ -36,6 +36,7 @@ void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr);
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 
 void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 
 void dma_cache_wback_inv(phys_addr_t start, unsigned long sz);
 void dma_cache_inv(phys_addr_t start, unsigned long sz);
index 8077f17..ecb91fb 100644 (file)
        pinctrl-names = "default";
        pinctrl-0 = <&gmac_rgmii_pins>;
        phy-handle = <&phy1>;
-       phy-mode = "rgmii";
+       phy-mode = "rgmii-id";
        status = "okay";
 };
 
index 5e56288..e68fb87 100644 (file)
@@ -290,6 +290,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr
  */
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *);
+void flush_dcache_folio(struct folio *folio);
 
 #define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
 static inline void flush_kernel_vmap_range(void *addr, int size)
index 02f8e72..05486cc 100644 (file)
@@ -75,7 +75,7 @@
        pinctrl-0 = <&emac_rgmii_pins>;
        phy-supply = <&reg_gmac_3v3>;
        phy-handle = <&ext_rgmii_phy>;
-       phy-mode = "rgmii";
+       phy-mode = "rgmii-id";
        status = "okay";
 };
 
index d17abb5..e99e764 100644 (file)
@@ -70,7 +70,9 @@
                regulator-name = "rst-usb-eth2";
                pinctrl-names = "default";
                pinctrl-0 = <&pinctrl_usb_eth2>;
-               gpio = <&gpio3 2 GPIO_ACTIVE_LOW>;
+               gpio = <&gpio3 2 GPIO_ACTIVE_HIGH>;
+               enable-active-high;
+               regulator-always-on;
        };
 
        reg_vdd_5v: regulator-5v {
@@ -95,7 +97,7 @@
                clocks = <&osc_can>;
                interrupt-parent = <&gpio4>;
                interrupts = <28 IRQ_TYPE_EDGE_FALLING>;
-               spi-max-frequency = <100000>;
+               spi-max-frequency = <10000000>;
                vdd-supply = <&reg_vdd_3v3>;
                xceiver-supply = <&reg_vdd_5v>;
        };
 &fec1 {
        pinctrl-names = "default";
        pinctrl-0 = <&pinctrl_enet>;
-       phy-connection-type = "rgmii";
+       phy-connection-type = "rgmii-rxid";
        phy-handle = <&ethphy>;
        status = "okay";
 
index 9db9b90..42bbbb3 100644 (file)
                        reg_vdd_soc: BUCK1 {
                                regulator-name = "buck1";
                                regulator-min-microvolt = <800000>;
-                               regulator-max-microvolt = <900000>;
+                               regulator-max-microvolt = <850000>;
                                regulator-boot-on;
                                regulator-always-on;
                                regulator-ramp-delay = <3125>;
+                               nxp,dvs-run-voltage = <850000>;
+                               nxp,dvs-standby-voltage = <800000>;
                        };
 
                        reg_vdd_arm: BUCK2 {
                        reg_vdd_dram: BUCK3 {
                                regulator-name = "buck3";
                                regulator-min-microvolt = <850000>;
-                               regulator-max-microvolt = <900000>;
+                               regulator-max-microvolt = <950000>;
                                regulator-boot-on;
                                regulator-always-on;
                        };
 
                        reg_vdd_snvs: LDO2 {
                                regulator-name = "ldo2";
-                               regulator-min-microvolt = <850000>;
+                               regulator-min-microvolt = <800000>;
                                regulator-max-microvolt = <900000>;
                                regulator-boot-on;
                                regulator-always-on;
index 8c15d9f..d12e4cb 100644 (file)
                        power-domains = <&dispcc MDSS_GDSC>;
 
                        clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+                                <&gcc GCC_DISP_HF_AXI_CLK>,
                                 <&gcc GCC_DISP_SF_AXI_CLK>,
                                 <&dispcc DISP_CC_MDSS_MDP_CLK>;
-                       clock-names = "iface", "nrt_bus", "core";
+                       clock-names = "iface", "bus", "nrt_bus", "core";
 
                        assigned-clocks = <&dispcc DISP_CC_MDSS_MDP_CLK>;
                        assigned-clock-rates = <460000000>;
index 41c23f4..803e777 100644 (file)
@@ -1136,6 +1136,11 @@ out:
        return prog;
 }
 
+u64 bpf_jit_alloc_exec_limit(void)
+{
+       return BPF_JIT_REGION_SIZE;
+}
+
 void *bpf_jit_alloc_exec(unsigned long size)
 {
        return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
index 9a8394e..9c57b24 100644 (file)
@@ -58,7 +58,7 @@ struct nfhd_device {
        struct gendisk *disk;
 };
 
-static blk_qc_t nfhd_submit_bio(struct bio *bio)
+static void nfhd_submit_bio(struct bio *bio)
 {
        struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data;
        struct bio_vec bvec;
@@ -76,7 +76,6 @@ static blk_qc_t nfhd_submit_bio(struct bio *bio)
                sec += len;
        }
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -100,6 +99,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
 {
        struct nfhd_device *dev;
        int dev_id = id - NFHD_DEV_OFFSET;
+       int err = -ENOMEM;
 
        pr_info("nfhd%u: found device with %u blocks (%u bytes)\n", dev_id,
                blocks, bsize);
@@ -130,16 +130,20 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
        sprintf(dev->disk->disk_name, "nfhd%u", dev_id);
        set_capacity(dev->disk, (sector_t)blocks * (bsize / 512));
        blk_queue_logical_block_size(dev->disk->queue, bsize);
-       add_disk(dev->disk);
+       err = add_disk(dev->disk);
+       if (err)
+               goto out_cleanup_disk;
 
        list_add_tail(&dev->list, &nfhd_list);
 
        return 0;
 
+out_cleanup_disk:
+       blk_cleanup_disk(dev->disk);
 free_dev:
        kfree(dev);
 out:
-       return -ENOMEM;
+       return err;
 }
 
 static int __init nfhd_init(void)
index 1ac55e7..8ab4662 100644 (file)
@@ -250,6 +250,7 @@ static inline void __flush_page_to_ram(void *vaddr)
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)                __flush_page_to_ram(page_address(page))
+void flush_dcache_folio(struct folio *folio);
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
 #define flush_icache_page(vma, page)   __flush_page_to_ram(page_address(page))
index b3dc9c5..f207388 100644 (file)
@@ -61,6 +61,8 @@ static inline void flush_dcache_page(struct page *page)
                SetPageDcacheDirty(page);
 }
 
+void flush_dcache_folio(struct folio *folio);
+
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
 
index 23ad8dd..b116937 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/console.h>
 #include <linux/memblock.h>
 #include <linux/ioport.h>
-#include <linux/blkdev.h>
 
 #include <asm/bootinfo.h>
 #include <asm/mach-rc32434/ddr.h>
index a3323f8..1a50429 100644 (file)
@@ -7,7 +7,6 @@
 #include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/mm.h>
-#include <linux/blkdev.h>
 #include <linux/memblock.h>
 #include <linux/pm.h>
 #include <linux/smp.h>
index 538a279..f07b15d 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/memblock.h>
-#include <linux/blkdev.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/screen_info.h>
index c2a222e..3fc0bb7 100644 (file)
@@ -27,6 +27,7 @@ void flush_cache_vunmap(unsigned long start, unsigned long end);
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
                       unsigned long vaddr, void *dst, void *src, int len);
 void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
index 0e23e3a..d55b73b 100644 (file)
@@ -6,7 +6,7 @@
 
 #ifndef CONFIG_DYNAMIC_FTRACE
 extern void (*ftrace_trace_function)(unsigned long, unsigned long,
-                                    struct ftrace_ops*, struct pt_regs*);
+                                    struct ftrace_ops*, struct ftrace_regs*);
 extern void ftrace_graph_caller(void);
 
 noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip,
index 18eb9f6..1999561 100644 (file)
@@ -28,7 +28,8 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
 extern void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr,
        unsigned long pfn);
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page *page);
+void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 
 extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
index 9e32fb7..e849daf 100644 (file)
@@ -37,6 +37,7 @@ config NIOS2_DTB_PHYS_ADDR
 
 config NIOS2_DTB_SOURCE_BOOL
        bool "Compile and link device tree into kernel image"
+       depends on !COMPILE_TEST
        help
          This allows you to specify a dts (device tree source) file
          which will be compiled and linked into the kernel image.
index cfef61a..97305bd 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/memblock.h>
 #include <linux/init.h>
 #include <linux/delay.h>
-#include <linux/blkdev.h>      /* for initrd_* */
 #include <linux/pagemap.h>
 
 #include <asm/pgalloc.h>
index eef0096..da0cd4b 100644 (file)
@@ -49,7 +49,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size);
 #define flush_cache_vunmap(start, end)         flush_cache_all()
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page *page);
+void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 
 #define flush_dcache_mmap_lock(mapping)                xa_lock_irq(&mapping->i_pages)
 #define flush_dcache_mmap_unlock(mapping)      xa_unlock_irq(&mapping->i_pages)
index bed05b6..cb25acc 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/poll.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 
 #include <asm/prom.h>
index dab5c56..a52af8f 100644 (file)
@@ -1302,6 +1302,12 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
                struct property *default_win;
                int reset_win_ext;
 
+               /* DDW + IOMMU on single window may fail if there is any allocation */
+               if (iommu_table_in_use(tbl)) {
+                       dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
+                       goto out_failed;
+               }
+
                default_win = of_find_property(pdn, "ibm,dma-window", NULL);
                if (!default_win)
                        goto out_failed;
@@ -1356,12 +1362,6 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
                        query.largest_available_block,
                        1ULL << page_shift);
 
-               /* DDW + IOMMU on single window may fail if there is any allocation */
-               if (default_win_removed && iommu_table_in_use(tbl)) {
-                       dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
-                       goto out_failed;
-               }
-
                len = order_base_2(query.largest_available_block << page_shift);
                win_name = DMA64_PROPNAME;
        } else {
@@ -1411,18 +1411,19 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
        } else {
                struct iommu_table *newtbl;
                int i;
+               unsigned long start = 0, end = 0;
 
                for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
                        const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;
 
                        /* Look for MMIO32 */
-                       if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM)
+                       if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
+                               start = pci->phb->mem_resources[i].start;
+                               end = pci->phb->mem_resources[i].end;
                                break;
+                       }
                }
 
-               if (i == ARRAY_SIZE(pci->phb->mem_resources))
-                       goto out_del_list;
-
                /* New table for using DDW instead of the default DMA window */
                newtbl = iommu_pseries_alloc_table(pci->phb->node);
                if (!newtbl) {
@@ -1432,15 +1433,15 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
                iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
                                            1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
-               iommu_init_table(newtbl, pci->phb->node, pci->phb->mem_resources[i].start,
-                                pci->phb->mem_resources[i].end);
+               iommu_init_table(newtbl, pci->phb->node, start, end);
 
                pci->table_group->tables[1] = newtbl;
 
                /* Keep default DMA window stuct if removed */
                if (default_win_removed) {
                        tbl->it_size = 0;
-                       kfree(tbl->it_map);
+                       vfree(tbl->it_map);
+                       tbl->it_map = NULL;
                }
 
                set_iommu_table_base(&dev->dev, newtbl);
index 6a6fa9e..f076cee 100644 (file)
@@ -163,6 +163,12 @@ config PAGE_OFFSET
        default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
        default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB
 
+config KASAN_SHADOW_OFFSET
+       hex
+       depends on KASAN_GENERIC
+       default 0xdfffffc800000000 if 64BIT
+       default 0xffffffff if 32BIT
+
 config ARCH_FLATMEM_ENABLE
        def_bool !NUMA
 
index a2b3d9c..b00f503 100644 (file)
@@ -30,8 +30,7 @@
 #define KASAN_SHADOW_SIZE      (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
 #define KASAN_SHADOW_START     KERN_VIRT_START
 #define KASAN_SHADOW_END       (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
-#define KASAN_SHADOW_OFFSET    (KASAN_SHADOW_END - (1ULL << \
-                                       (64 - KASAN_SHADOW_SCALE_SHIFT)))
+#define KASAN_SHADOW_OFFSET    _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 
 void kasan_init(void);
 asmlinkage void kasan_early_init(void);
index fce5184..52c5ff9 100644 (file)
@@ -193,6 +193,7 @@ setup_trap_vector:
        csrw CSR_SCRATCH, zero
        ret
 
+.align 2
 .Lsecondary_park:
        /* We lack SMP support or have too many harts, so park this hart */
        wfi
index d7189c8..54294f8 100644 (file)
@@ -17,6 +17,9 @@ asmlinkage void __init kasan_early_init(void)
        uintptr_t i;
        pgd_t *pgd = early_pg_dir + pgd_index(KASAN_SHADOW_START);
 
+       BUILD_BUG_ON(KASAN_SHADOW_OFFSET !=
+               KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT)));
+
        for (i = 0; i < PTRS_PER_PTE; ++i)
                set_pte(kasan_early_shadow_pte + i,
                        mk_pte(virt_to_page(kasan_early_shadow_page),
@@ -172,21 +175,10 @@ void __init kasan_init(void)
        phys_addr_t p_start, p_end;
        u64 i;
 
-       /*
-        * Populate all kernel virtual address space with kasan_early_shadow_page
-        * except for the linear mapping and the modules/kernel/BPF mapping.
-        */
-       kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
-                                   (void *)kasan_mem_to_shadow((void *)
-                                                               VMEMMAP_END));
        if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
                kasan_shallow_populate(
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_START),
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_END));
-       else
-               kasan_populate_early_shadow(
-                       (void *)kasan_mem_to_shadow((void *)VMALLOC_START),
-                       (void *)kasan_mem_to_shadow((void *)VMALLOC_END));
 
        /* Populate the linear mapping */
        for_each_mem_range(i, &p_start, &p_end) {
index fed86f4..753d85b 100644 (file)
@@ -125,7 +125,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 
        if (i == NR_JIT_ITERATIONS) {
                pr_err("bpf-jit: image did not converge in <%d passes!\n", i);
-               bpf_jit_binary_free(jit_data->header);
+               if (jit_data->header)
+                       bpf_jit_binary_free(jit_data->header);
                prog = orig_prog;
                goto out_offset;
        }
@@ -166,6 +167,11 @@ out:
        return prog;
 }
 
+u64 bpf_jit_alloc_exec_limit(void)
+{
+       return BPF_JIT_REGION_SIZE;
+}
+
 void *bpf_jit_alloc_exec(unsigned long size)
 {
        return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
index 1072245..2245f4b 100644 (file)
@@ -3053,13 +3053,14 @@ static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
        int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
        struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
        struct kvm_vcpu *vcpu;
+       u8 vcpu_isc_mask;
 
        for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
                vcpu = kvm_get_vcpu(kvm, vcpu_idx);
                if (psw_ioint_disabled(vcpu))
                        continue;
-               deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
-               if (deliverable_mask) {
+               vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
+               if (deliverable_mask & vcpu_isc_mask) {
                        /* lately kicked but not yet running */
                        if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
                                return;
index 6a6dd5e..1c97493 100644 (file)
@@ -3363,6 +3363,7 @@ out_free_sie_block:
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
+       clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
        return kvm_s390_vcpu_has_irq(vcpu, 0);
 }
 
index 372afa8..c7a97f3 100644 (file)
@@ -42,7 +42,8 @@ extern void flush_cache_page(struct vm_area_struct *vma,
 extern void flush_cache_range(struct vm_area_struct *vma,
                                 unsigned long start, unsigned long end);
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page *page);
+void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
 extern void flush_icache_range(unsigned long start, unsigned long end);
 #define flush_icache_user_range flush_icache_range
 extern void flush_icache_page(struct vm_area_struct *vma,
index cd9dc05..69d2d00 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/blk-mq.h>
 #include <linux/ata.h>
 #include <linux/hdreg.h>
+#include <linux/major.h>
 #include <linux/cdrom.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -854,8 +855,8 @@ static const struct attribute_group *ubd_attr_groups[] = {
        NULL,
 };
 
-static void ubd_disk_register(int major, u64 size, int unit,
-                             struct gendisk *disk)
+static int ubd_disk_register(int major, u64 size, int unit,
+                            struct gendisk *disk)
 {
        disk->major = major;
        disk->first_minor = unit << UBD_SHIFT;
@@ -872,7 +873,7 @@ static void ubd_disk_register(int major, u64 size, int unit,
 
        disk->private_data = &ubd_devs[unit];
        disk->queue = ubd_devs[unit].queue;
-       device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
+       return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
 }
 
 #define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
@@ -919,10 +920,15 @@ static int ubd_add(int n, char **error_out)
        blk_queue_write_cache(ubd_dev->queue, true, false);
        blk_queue_max_segments(ubd_dev->queue, MAX_SG);
        blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1);
-       ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk);
+       err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk);
+       if (err)
+               goto out_cleanup_disk;
+
        ubd_gendisk[n] = disk;
        return 0;
 
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
 out_cleanup_tags:
        blk_mq_free_tag_set(&ubd_dev->tag_set);
 out:
index 18d2f51..1cc72b4 100644 (file)
@@ -78,7 +78,7 @@
        vpxor tmp0, x, x;
 
 
-.section       .rodata.cst164, "aM", @progbits, 164
+.section       .rodata.cst16, "aM", @progbits, 16
 .align 16
 
 /*
 .L0f0f0f0f:
        .long 0x0f0f0f0f
 
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+       .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
 
 .text
 .align 16
index d2ffd7f..9c5d3f3 100644 (file)
@@ -93,7 +93,7 @@
        vpxor tmp0, x, x;
 
 
-.section       .rodata.cst164, "aM", @progbits, 164
+.section       .rodata.cst16, "aM", @progbits, 16
 .align 16
 
 /*
 .L0f0f0f0f:
        .long 0x0f0f0f0f
 
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+       .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
 .text
 .align 16
 
index 5a0298a..13f6465 100644 (file)
@@ -1098,7 +1098,7 @@ struct kvm_arch {
        u64 cur_tsc_generation;
        int nr_vcpus_matched_tsc;
 
-       spinlock_t pvclock_gtod_sync_lock;
+       raw_spinlock_t pvclock_gtod_sync_lock;
        bool use_master_clock;
        u64 master_kernel_ns;
        u64 master_cycle_now;
index 2e4916b..7e34d71 100644 (file)
@@ -2591,11 +2591,20 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 
 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
 {
-       if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2))
+       int count;
+       int bytes;
+
+       if (svm->vmcb->control.exit_info_2 > INT_MAX)
+               return -EINVAL;
+
+       count = svm->vmcb->control.exit_info_2;
+       if (unlikely(check_mul_overflow(count, size, &bytes)))
+               return -EINVAL;
+
+       if (!setup_vmgexit_scratch(svm, in, bytes))
                return -EINVAL;
 
-       return kvm_sev_es_string_io(&svm->vcpu, size, port,
-                                   svm->ghcb_sa, svm->ghcb_sa_len / size, in);
+       return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->ghcb_sa, count, in);
 }
 
 void sev_es_init_vmcb(struct vcpu_svm *svm)
index b26647a..bfe0de3 100644 (file)
@@ -2542,7 +2542,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-       spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
        if (!matched) {
                kvm->arch.nr_vcpus_matched_tsc = 0;
        } else if (!already_matched) {
@@ -2550,7 +2550,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        }
 
        kvm_track_tsc_matching(vcpu);
-       spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
 }
 
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2780,9 +2780,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
        kvm_make_mclock_inprogress_request(kvm);
 
        /* no guest entries from this point */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        pvclock_update_vm_gtod_copy(kvm);
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2800,15 +2800,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
        unsigned long flags;
        u64 ret;
 
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
                return get_kvmclock_base_ns() + ka->kvmclock_offset;
        }
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* both __this_cpu_read() and rdtsc() should be on the same cpu */
        get_cpu();
@@ -2902,13 +2902,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         * If the host uses TSC clock, then passthrough TSC as stable
         * to the guest.
         */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        use_master_clock = ka->use_master_clock;
        if (use_master_clock) {
                host_tsc = ka->master_cycle_now;
                kernel_ns = ka->master_kernel_ns;
        }
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
@@ -6100,13 +6100,13 @@ set_pit2_out:
                 * is slightly ahead) here we risk going negative on unsigned
                 * 'system_time' when 'user_ns.clock' is very small.
                 */
-               spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock);
                if (kvm->arch.use_master_clock)
                        now_ns = ka->master_kernel_ns;
                else
                        now_ns = get_kvmclock_base_ns();
                ka->kvmclock_offset = user_ns.clock - now_ns;
-               spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
 
                kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
                break;
@@ -8156,9 +8156,9 @@ static void kvm_hyperv_tsc_notifier(void)
        list_for_each_entry(kvm, &vm_list, vm_list) {
                struct kvm_arch *ka = &kvm->arch;
 
-               spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
                pvclock_update_vm_gtod_copy(kvm);
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
                kvm_for_each_vcpu(cpu, vcpu, kvm)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -8800,9 +8800,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 
        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
+       /*
+        * The call to kvm_ready_for_interrupt_injection() may end up in
+        * kvm_xen_has_interrupt() which may require the srcu lock to be
+        * held, to protect against changes in the vcpu_info address.
+        */
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        kvm_run->ready_for_interrupt_injection =
                pic_in_kernel(vcpu->kvm) ||
                kvm_vcpu_ready_for_interrupt_injection(vcpu);
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
        if (is_smm(vcpu))
                kvm_run->flags |= KVM_RUN_X86_SMM;
@@ -11199,7 +11207,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
        mutex_init(&kvm->arch.apic_map_lock);
-       spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+       raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
        kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
        pvclock_update_vm_gtod_copy(kvm);
index 9ea9c3d..8f62bae 100644 (file)
@@ -190,6 +190,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
+       int err;
        u8 rc = 0;
 
        /*
@@ -216,13 +217,29 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
        if (likely(slots->generation == ghc->generation &&
                   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
                /* Fast path */
-               __get_user(rc, (u8 __user *)ghc->hva + offset);
-       } else {
-               /* Slow path */
-               kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
-                                            sizeof(rc));
+               pagefault_disable();
+               err = __get_user(rc, (u8 __user *)ghc->hva + offset);
+               pagefault_enable();
+               if (!err)
+                       return rc;
        }
 
+       /* Slow path */
+
+       /*
+        * This function gets called from kvm_vcpu_block() after setting the
+        * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+        * from a HLT. So we really mustn't sleep. If the page ended up absent
+        * at that point, just return 1 in order to trigger an immediate wake,
+        * and we'll end up getting called again from a context where we *can*
+        * fault in the page and wait for it.
+        */
+       if (in_atomic() || !task_is_running(current))
+               return 1;
+
+       kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
+                                    sizeof(rc));
+
        return rc;
 }
 
index cf907e5..a8a0416 100644 (file)
@@ -120,7 +120,8 @@ void flush_cache_page(struct vm_area_struct*,
 #define flush_cache_vunmap(start,end)  flush_cache_all()
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page*);
+void flush_dcache_page(struct page *);
+void flush_dcache_folio(struct folio *);
 
 void local_flush_cache_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end);
@@ -137,7 +138,9 @@ void local_flush_cache_page(struct vm_area_struct *vma,
 #define flush_cache_vunmap(start,end)                  do { } while (0)
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
 #define flush_dcache_page(page)                                do { } while (0)
+static inline void flush_dcache_folio(struct folio *folio) { }
 
 #define flush_icache_range local_flush_icache_range
 #define flush_cache_page(vma, addr, pfn)               do { } while (0)
index 3cdfa00..07b642c 100644 (file)
@@ -100,7 +100,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
        spin_unlock(&dev->lock);
 }
 
-static blk_qc_t simdisk_submit_bio(struct bio *bio)
+static void simdisk_submit_bio(struct bio *bio)
 {
        struct simdisk *dev = bio->bi_bdev->bd_disk->private_data;
        struct bio_vec bvec;
@@ -118,7 +118,6 @@ static blk_qc_t simdisk_submit_bio(struct bio *bio)
        }
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int simdisk_open(struct block_device *bdev, fmode_t mode)
@@ -259,6 +258,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
                struct proc_dir_entry *procdir)
 {
        char tmp[2] = { '0' + which, 0 };
+       int err = -ENOMEM;
 
        dev->fd = -1;
        dev->filename = NULL;
@@ -267,7 +267,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
 
        dev->gd = blk_alloc_disk(NUMA_NO_NODE);
        if (!dev->gd)
-               return -ENOMEM;
+               goto out;
        dev->gd->major = simdisk_major;
        dev->gd->first_minor = which;
        dev->gd->minors = SIMDISK_MINORS;
@@ -275,10 +275,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
        dev->gd->private_data = dev;
        snprintf(dev->gd->disk_name, 32, "simdisk%d", which);
        set_capacity(dev->gd, 0);
-       add_disk(dev->gd);
+       err = add_disk(dev->gd);
+       if (err)
+               goto out_cleanup_disk;
 
        dev->procfile = proc_create_data(tmp, 0644, procdir, &simdisk_proc_ops, dev);
+
        return 0;
+
+out_cleanup_disk:
+       blk_cleanup_disk(dev->gd);
+out:
+       return err;
 }
 
 static int __init simdisk_init(void)
index 8e28ae7..c6ce41a 100644 (file)
@@ -73,7 +73,7 @@ config BLK_DEV_ZONED
 
 config BLK_DEV_THROTTLING
        bool "Block layer bio throttling support"
-       depends on BLK_CGROUP=y
+       depends on BLK_CGROUP
        select BLK_CGROUP_RWSTAT
        help
        Block layer bio throttling support. It can be used to limit
@@ -112,7 +112,7 @@ config BLK_WBT_MQ
 
 config BLK_CGROUP_IOLATENCY
        bool "Enable support for latency based cgroup IO protection"
-       depends on BLK_CGROUP=y
+       depends on BLK_CGROUP
        help
        Enabling this option enables the .latency interface for IO throttling.
        The IO controller will attempt to maintain average IO latencies below
@@ -132,7 +132,7 @@ config BLK_CGROUP_FC_APPID
 
 config BLK_CGROUP_IOCOST
        bool "Enable support for cost model based cgroup IO controller"
-       depends on BLK_CGROUP=y
+       depends on BLK_CGROUP
        select BLK_RQ_IO_DATA_LEN
        select BLK_RQ_ALLOC_TIME
        help
@@ -190,39 +190,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
          by falling back to the kernel crypto API when inline
          encryption hardware is not present.
 
-menu "Partition Types"
-
 source "block/partitions/Kconfig"
 
-endmenu
-
-endif # BLOCK
-
 config BLOCK_COMPAT
-       bool
-       depends on BLOCK && COMPAT
-       default y
+       def_bool COMPAT
 
 config BLK_MQ_PCI
-       bool
-       depends on BLOCK && PCI
-       default y
+       def_bool PCI
 
 config BLK_MQ_VIRTIO
        bool
-       depends on BLOCK && VIRTIO
+       depends on VIRTIO
        default y
 
 config BLK_MQ_RDMA
        bool
-       depends on BLOCK && INFINIBAND
+       depends on INFINIBAND
        default y
 
 config BLK_PM
-       def_bool BLOCK && PM
+       def_bool PM
 
 # do not use in new code
 config BLOCK_HOLDER_DEPRECATED
        bool
 
 source "block/Kconfig.iosched"
+
+endif # BLOCK
index 2f2158e..885fee8 100644 (file)
@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-if BLOCK
-
 menu "IO Schedulers"
 
 config MQ_IOSCHED_DEADLINE
@@ -45,5 +43,3 @@ config BFQ_CGROUP_DEBUG
        files in a cgroup which can be useful for debugging.
 
 endmenu
-
-endif
index 41aa1ba..44df57e 100644 (file)
@@ -3,13 +3,13 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
+obj-y          := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-timeout.o \
                        blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
                        blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
                        genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
-                       disk-events.o
+                       disk-events.o blk-ia-ranges.o
 
 obj-$(CONFIG_BOUNCE)           += bounce.o
 obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
@@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS)    += blk-mq-debugfs.o
 obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)     += sed-opal.o
 obj-$(CONFIG_BLK_PM)           += blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION)    += keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION)    += blk-crypto.o blk-crypto-profile.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)   += blk-crypto-fallback.o
 obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)  += holder.o
index 485a258..b4dab2f 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/major.h>
 #include <linux/device_cgroup.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/blkpg.h>
@@ -184,14 +185,13 @@ int sb_min_blocksize(struct super_block *sb, int size)
 
 EXPORT_SYMBOL(sb_min_blocksize);
 
-int __sync_blockdev(struct block_device *bdev, int wait)
+int sync_blockdev_nowait(struct block_device *bdev)
 {
        if (!bdev)
                return 0;
-       if (!wait)
-               return filemap_flush(bdev->bd_inode->i_mapping);
-       return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+       return filemap_flush(bdev->bd_inode->i_mapping);
 }
+EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
 
 /*
  * Write out and wait upon all the dirty data associated with a block
@@ -199,7 +199,9 @@ int __sync_blockdev(struct block_device *bdev, int wait)
  */
 int sync_blockdev(struct block_device *bdev)
 {
-       return __sync_blockdev(bdev, 1);
+       if (!bdev)
+               return 0;
+       return filemap_write_and_wait(bdev->bd_inode->i_mapping);
 }
 EXPORT_SYMBOL(sync_blockdev);
 
@@ -326,12 +328,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return result;
 
-       result = blk_queue_enter(bdev->bd_disk->queue, 0);
+       result = blk_queue_enter(bdev_get_queue(bdev), 0);
        if (result)
                return result;
        result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
                              REQ_OP_READ);
-       blk_queue_exit(bdev->bd_disk->queue);
+       blk_queue_exit(bdev_get_queue(bdev));
        return result;
 }
 
@@ -362,7 +364,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return -EOPNOTSUPP;
-       result = blk_queue_enter(bdev->bd_disk->queue, 0);
+       result = blk_queue_enter(bdev_get_queue(bdev), 0);
        if (result)
                return result;
 
@@ -375,7 +377,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
                clean_page_buffers(page);
                unlock_page(page);
        }
-       blk_queue_exit(bdev->bd_disk->queue);
+       blk_queue_exit(bdev_get_queue(bdev));
        return result;
 }
 
@@ -492,6 +494,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
        spin_lock_init(&bdev->bd_size_lock);
        bdev->bd_partno = partno;
        bdev->bd_inode = inode;
+       bdev->bd_queue = disk->queue;
        bdev->bd_stats = alloc_percpu(struct disk_stats);
        if (!bdev->bd_stats) {
                iput(inode);
@@ -962,9 +965,11 @@ EXPORT_SYMBOL(blkdev_put);
  * @pathname:  special file representing the block device
  * @dev:       return value of the block device's dev_t
  *
- * Get a reference to the blockdevice at @pathname in the current
- * namespace if possible and return it.  Return ERR_PTR(error)
- * otherwise.
+ * Lookup the block device's dev_t at @pathname in the current
+ * namespace if possible and return it by @dev.
+ *
+ * RETURNS:
+ * 0 if succeeded, errno otherwise.
  */
 int lookup_bdev(const char *pathname, dev_t *dev)
 {
@@ -1016,7 +1021,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
 }
 EXPORT_SYMBOL(__invalidate_device);
 
-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+void sync_bdevs(bool wait)
 {
        struct inode *inode, *old_inode = NULL;
 
@@ -1047,8 +1052,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
                bdev = I_BDEV(inode);
 
                mutex_lock(&bdev->bd_disk->open_mutex);
-               if (bdev->bd_openers)
-                       func(bdev, arg);
+               if (!bdev->bd_openers) {
+                       ; /* skip */
+               } else if (wait) {
+                       /*
+                        * We keep the error status of individual mapping so
+                        * that applications can catch the writeback error using
+                        * fsync(2). See filemap_fdatawait_keep_errors() for
+                        * details.
+                        */
+                       filemap_fdatawait_keep_errors(inode->i_mapping);
+               } else {
+                       filemap_fdatawrite(inode->i_mapping);
+               }
                mutex_unlock(&bdev->bd_disk->open_mutex);
 
                spin_lock(&blockdev_superblock->s_inode_list_lock);
index 85b8e1c..24a5c53 100644 (file)
@@ -6,13 +6,13 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/cgroup.h>
-#include <linux/elevator.h>
 #include <linux/ktime.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/sbitmap.h>
 #include <linux/delay.h>
 
+#include "elevator.h"
 #include "bfq-iosched.h"
 
 #ifdef CONFIG_BFQ_CGROUP_DEBUG
@@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 {
        if (blkg_rwstat_init(&stats->bytes, gfp) ||
            blkg_rwstat_init(&stats->ios, gfp))
-               return -ENOMEM;
+               goto error;
 
 #ifdef CONFIG_BFQ_CGROUP_DEBUG
        if (blkg_rwstat_init(&stats->merged, gfp) ||
@@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
            bfq_stat_init(&stats->dequeue, gfp) ||
            bfq_stat_init(&stats->group_wait_time, gfp) ||
            bfq_stat_init(&stats->idle_time, gfp) ||
-           bfq_stat_init(&stats->empty_time, gfp)) {
-               bfqg_stats_exit(stats);
-               return -ENOMEM;
-       }
+           bfq_stat_init(&stats->empty_time, gfp))
+               goto error;
 #endif
 
        return 0;
+
+error:
+       bfqg_stats_exit(stats);
+       return -ENOMEM;
 }
 
 static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
index 480e1a1..fec1811 100644 (file)
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/cgroup.h>
-#include <linux/elevator.h>
 #include <linux/ktime.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 
 #include <trace/events/block.h>
 
+#include "elevator.h"
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
@@ -6884,8 +6884,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
        struct blk_mq_tags *tags = hctx->sched_tags;
        unsigned int min_shallow;
 
-       min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
-       sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
+       min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
+       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
 }
 
 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
index 6b47cdd..d251147 100644 (file)
@@ -6,7 +6,7 @@
  * Written by: Martin K. Petersen <martin.petersen@oracle.com>
  */
 
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/mempool.h>
 #include <linux/export.h>
 #include <linux/bio.h>
@@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
        iv = bip->bip_vec + bip->bip_vcnt;
 
        if (bip->bip_vcnt &&
-           bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue,
+           bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
                             &bip->bip_vec[bip->bip_vcnt - 1], offset))
                return 0;
 
index a6fb6a0..15ab0d6 100644 (file)
@@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size)
 
        snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
        bslab->slab = kmem_cache_create(bslab->name, size,
-                       ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
+                       ARCH_KMALLOC_MINALIGN,
+                       SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
        if (!bslab->slab)
                goto fail_alloc_slab;
 
@@ -156,7 +157,7 @@ out:
 
 void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
 {
-       BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);
+       BUG_ON(nr_vecs > BIO_MAX_VECS);
 
        if (nr_vecs == BIO_MAX_VECS)
                mempool_free(bv, pool);
@@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 
        atomic_set(&bio->__bi_remaining, 1);
        atomic_set(&bio->__bi_cnt, 1);
+       bio->bi_cookie = BLK_QC_T_NONE;
 
        bio->bi_max_vecs = max_vecs;
        bio->bi_io_vec = table;
@@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio);
  *   REQ_OP_READ, zero the truncated part. This function should only
  *   be used for handling corner cases, such as bio eod.
  */
-void bio_truncate(struct bio *bio, unsigned new_size)
+static void bio_truncate(struct bio *bio, unsigned new_size)
 {
        struct bio_vec bv;
        struct bvec_iter iter;
@@ -677,7 +679,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
 void bio_put(struct bio *bio)
 {
        if (unlikely(bio_flagged(bio, BIO_REFFED))) {
-               BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+               BUG_ON(!atomic_read(&bio->__bi_cnt));
                if (!atomic_dec_and_test(&bio->__bi_cnt))
                        return;
        }
@@ -772,6 +774,23 @@ const char *bio_devname(struct bio *bio, char *buf)
 }
 EXPORT_SYMBOL(bio_devname);
 
+/**
+ * bio_full - check if the bio is full
+ * @bio:       bio to check
+ * @len:       length of one segment to be added
+ *
+ * Return true if @bio is full and one segment with @len bytes can't be
+ * added to the bio, otherwise return false
+ */
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+       if (bio->bi_vcnt >= bio->bi_max_vecs)
+               return true;
+       if (bio->bi_iter.bi_size > UINT_MAX - len)
+               return true;
+       return false;
+}
+
 static inline bool page_is_mergeable(const struct bio_vec *bv,
                struct page *page, unsigned int len, unsigned int off,
                bool *same_page)
@@ -791,6 +810,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
        return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
 }
 
+/**
+ * __bio_try_merge_page - try appending data to an existing bvec.
+ * @bio: destination bio
+ * @page: start page to add
+ * @len: length of the data to add
+ * @off: offset of the data relative to @page
+ * @same_page: return if the segment has been merged inside the same page
+ *
+ * Try to add the data at @page + @off to the last bvec of @bio.  This is a
+ * useful optimisation for file systems with a block size smaller than the
+ * page size.
+ *
+ * Warn if (@len, @off) crosses pages in case that @same_page is true.
+ *
+ * Return %true on success or %false on failure.
+ */
+static bool __bio_try_merge_page(struct bio *bio, struct page *page,
+               unsigned int len, unsigned int off, bool *same_page)
+{
+       if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+               return false;
+
+       if (bio->bi_vcnt > 0) {
+               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+               if (page_is_mergeable(bv, page, len, off, same_page)) {
+                       if (bio->bi_iter.bi_size > UINT_MAX - len) {
+                               *same_page = false;
+                               return false;
+                       }
+                       bv->bv_len += len;
+                       bio->bi_iter.bi_size += len;
+                       return true;
+               }
+       }
+       return false;
+}
+
 /*
  * Try to merge a page into a segment, while obeying the hardware segment
  * size limit.  This is not for normal read/write bios, but for passthrough
@@ -908,7 +965,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
 int bio_add_zone_append_page(struct bio *bio, struct page *page,
                             unsigned int len, unsigned int offset)
 {
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        bool same_page = false;
 
        if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
@@ -923,45 +980,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
 EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
 
 /**
- * __bio_try_merge_page - try appending data to an existing bvec.
- * @bio: destination bio
- * @page: start page to add
- * @len: length of the data to add
- * @off: offset of the data relative to @page
- * @same_page: return if the segment has been merged inside the same page
- *
- * Try to add the data at @page + @off to the last bvec of @bio.  This is a
- * useful optimisation for file systems with a block size smaller than the
- * page size.
- *
- * Warn if (@len, @off) crosses pages in case that @same_page is true.
- *
- * Return %true on success or %false on failure.
- */
-bool __bio_try_merge_page(struct bio *bio, struct page *page,
-               unsigned int len, unsigned int off, bool *same_page)
-{
-       if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
-               return false;
-
-       if (bio->bi_vcnt > 0) {
-               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
-               if (page_is_mergeable(bv, page, len, off, same_page)) {
-                       if (bio->bi_iter.bi_size > UINT_MAX - len) {
-                               *same_page = false;
-                               return false;
-                       }
-                       bv->bv_len += len;
-                       bio->bi_iter.bi_size += len;
-                       return true;
-               }
-       }
-       return false;
-}
-EXPORT_SYMBOL_GPL(__bio_try_merge_page);
-
-/**
  * __bio_add_page - add page(s) to a bio in a new segment
  * @bio: destination bio
  * @page: start page to add
@@ -1015,52 +1033,40 @@ int bio_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_add_page);
 
-void bio_release_pages(struct bio *bio, bool mark_dirty)
+void __bio_release_pages(struct bio *bio, bool mark_dirty)
 {
        struct bvec_iter_all iter_all;
        struct bio_vec *bvec;
 
-       if (bio_flagged(bio, BIO_NO_PAGE_REF))
-               return;
-
        bio_for_each_segment_all(bvec, bio, iter_all) {
                if (mark_dirty && !PageCompound(bvec->bv_page))
                        set_page_dirty_lock(bvec->bv_page);
                put_page(bvec->bv_page);
        }
 }
-EXPORT_SYMBOL_GPL(bio_release_pages);
+EXPORT_SYMBOL_GPL(__bio_release_pages);
 
-static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
+       size_t size = iov_iter_count(iter);
+
        WARN_ON_ONCE(bio->bi_max_vecs);
 
+       if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+               struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+               size_t max_sectors = queue_max_zone_append_sectors(q);
+
+               size = min(size, max_sectors << SECTOR_SHIFT);
+       }
+
        bio->bi_vcnt = iter->nr_segs;
        bio->bi_io_vec = (struct bio_vec *)iter->bvec;
        bio->bi_iter.bi_bvec_done = iter->iov_offset;
-       bio->bi_iter.bi_size = iter->count;
+       bio->bi_iter.bi_size = size;
        bio_set_flag(bio, BIO_NO_PAGE_REF);
        bio_set_flag(bio, BIO_CLONED);
 }
 
-static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
-{
-       __bio_iov_bvec_set(bio, iter);
-       iov_iter_advance(iter, iter->count);
-       return 0;
-}
-
-static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
-{
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
-       struct iov_iter i = *iter;
-
-       iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
-       __bio_iov_bvec_set(bio, &i);
-       iov_iter_advance(iter, i.count);
-       return 0;
-}
-
 static void bio_put_pages(struct page **pages, size_t size, size_t off)
 {
        size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
@@ -1130,7 +1136,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
 {
        unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        struct page **pages = (struct page **)bv;
@@ -1202,9 +1208,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
        int ret = 0;
 
        if (iov_iter_is_bvec(iter)) {
-               if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-                       return bio_iov_bvec_set_append(bio, iter);
-               return bio_iov_bvec_set(bio, iter);
+               bio_iov_bvec_set(bio, iter);
+               iov_iter_advance(iter, bio->bi_iter.bi_size);
+               return 0;
        }
 
        do {
@@ -1260,18 +1266,7 @@ int submit_bio_wait(struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio_wait);
 
-/**
- * bio_advance - increment/complete a bio by some number of bytes
- * @bio:       bio to advance
- * @bytes:     number of bytes to complete
- *
- * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
- * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
- * be updated on the last bvec as well.
- *
- * @bio will then represent the remaining, uncompleted portion of the io.
- */
-void bio_advance(struct bio *bio, unsigned bytes)
+void __bio_advance(struct bio *bio, unsigned bytes)
 {
        if (bio_integrity(bio))
                bio_integrity_advance(bio, bytes);
@@ -1279,7 +1274,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
        bio_crypt_advance(bio, bytes);
        bio_advance_iter(bio, &bio->bi_iter, bytes);
 }
-EXPORT_SYMBOL(bio_advance);
+EXPORT_SYMBOL(__bio_advance);
 
 void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                        struct bio *src, struct bvec_iter *src_iter)
@@ -1467,10 +1462,10 @@ again:
                return;
 
        if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
-               rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
+               rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
 
        if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
-               trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
+               trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
        }
 
index 9a1c583..88b1fce 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/psi.h>
 #include "blk.h"
 #include "blk-ioprio.h"
+#include "blk-throttle.h"
 
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -620,7 +621,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp)
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   char *input, struct blkg_conf_ctx *ctx)
-       __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
+       __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
 {
        struct block_device *bdev;
        struct request_queue *q;
@@ -631,7 +632,15 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
 
-       q = bdev->bd_disk->queue;
+       q = bdev_get_queue(bdev);
+
+       /*
+        * blkcg_deactivate_policy() requires the queue to be frozen; grab
+        * q_usage_counter to prevent racing with blkcg_deactivate_policy().
+        */
+       ret = blk_queue_enter(q, 0);
+       if (ret)
+               return ret;
 
        rcu_read_lock();
        spin_lock_irq(&q->queue_lock);
@@ -702,6 +711,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                        goto success;
        }
 success:
+       blk_queue_exit(q);
        ctx->bdev = bdev;
        ctx->blkg = blkg;
        ctx->body = input;
@@ -714,6 +724,7 @@ fail_unlock:
        rcu_read_unlock();
 fail:
        blkdev_put_no_open(bdev);
+       blk_queue_exit(q);
        /*
         * If queue was bypassing, we should retry.  Do so after a
         * short msleep().  It isn't strictly necessary but queue
@@ -736,9 +747,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
  * with blkg_conf_prep().
  */
 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
-       __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
+       __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
 {
-       spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
+       spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
        rcu_read_unlock();
        blkdev_put_no_open(ctx->bdev);
 }
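
For reference, a minimal sketch of the prep/finish pairing as a blkcg policy
handler would typically use it (error handling trimmed; blkcg, pol and input
come from the policy's cftype write handler):

	struct blkg_conf_ctx ctx;
	int ret;

	ret = blkg_conf_prep(blkcg, pol, input, &ctx);
	if (ret)
		return ret;
	/* ... parse ctx.body and update ctx.blkg under the queue lock ... */
	blkg_conf_finish(&ctx);
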
@@ -841,7 +852,7 @@ static void blkcg_fill_root_iostats(void)
        while ((dev = class_dev_iter_next(&iter))) {
                struct block_device *bdev = dev_to_bdev(dev);
                struct blkcg_gq *blkg =
-                       blk_queue_root_blkg(bdev->bd_disk->queue);
+                       blk_queue_root_blkg(bdev_get_queue(bdev));
                struct blkg_iostat tmp;
                int cpu;
 
@@ -1800,7 +1811,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
 
        rcu_read_lock();
        blkg = blkg_lookup_create(css_to_blkcg(css),
-                                 bio->bi_bdev->bd_disk->queue);
+                                 bdev_get_queue(bio->bi_bdev));
        while (blkg) {
                if (blkg_tryget(blkg)) {
                        ret_blkg = blkg;
@@ -1836,8 +1847,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
        if (css && css->parent) {
                bio->bi_blkg = blkg_tryget_closest(bio, css);
        } else {
-               blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
-               bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
+               blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
+               bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
        }
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
index 4d8f5fe..ac1de7d 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-pm.h>
+#include <linux/blk-integrity.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -49,6 +50,7 @@
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
 #include "blk-pm.h"
+#include "blk-throttle.h"
 
 struct dentry *blk_debugfs_root;
 
@@ -214,8 +216,7 @@ int blk_status_to_errno(blk_status_t status)
 }
 EXPORT_SYMBOL_GPL(blk_status_to_errno);
 
-static void print_req_error(struct request *req, blk_status_t status,
-               const char *caller)
+void blk_print_req_error(struct request *req, blk_status_t status)
 {
        int idx = (__force int)status;
 
@@ -223,9 +224,9 @@ static void print_req_error(struct request *req, blk_status_t status,
                return;
 
        printk_ratelimited(KERN_ERR
-               "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+               "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
                "phys_seg %u prio class %u\n",
-               caller, blk_errors[idx].name,
+               blk_errors[idx].name,
                req->rq_disk ? req->rq_disk->disk_name : "?",
                blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
                req->cmd_flags & ~REQ_OP_MASK,
@@ -233,33 +234,6 @@ static void print_req_error(struct request *req, blk_status_t status,
                IOPRIO_PRIO_CLASS(req->ioprio));
 }
 
-static void req_bio_endio(struct request *rq, struct bio *bio,
-                         unsigned int nbytes, blk_status_t error)
-{
-       if (error)
-               bio->bi_status = error;
-
-       if (unlikely(rq->rq_flags & RQF_QUIET))
-               bio_set_flag(bio, BIO_QUIET);
-
-       bio_advance(bio, nbytes);
-
-       if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
-               /*
-                * Partial zone append completions cannot be supported as the
-                * BIO fragments may end up not being written sequentially.
-                */
-               if (bio->bi_iter.bi_size)
-                       bio->bi_status = BLK_STS_IOERR;
-               else
-                       bio->bi_iter.bi_sector = rq->__sector;
-       }
-
-       /* don't actually finish bio if it's part of flush sequence */
-       if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
-               bio_endio(bio);
-}
-
 void blk_dump_rq_flags(struct request *rq, char *msg)
 {
        printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
@@ -402,7 +376,7 @@ void blk_cleanup_queue(struct request_queue *q)
         */
        mutex_lock(&q->sysfs_lock);
        if (q->elevator)
-               blk_mq_sched_free_requests(q);
+               blk_mq_sched_free_rqs(q);
        mutex_unlock(&q->sysfs_lock);
 
        percpu_ref_exit(&q->q_usage_counter);
@@ -415,7 +389,7 @@ EXPORT_SYMBOL(blk_cleanup_queue);
 static bool blk_try_enter_queue(struct request_queue *q, bool pm)
 {
        rcu_read_lock();
-       if (!percpu_ref_tryget_live(&q->q_usage_counter))
+       if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
                goto fail;
 
        /*
@@ -430,7 +404,7 @@ static bool blk_try_enter_queue(struct request_queue *q, bool pm)
        return true;
 
 fail_put:
-       percpu_ref_put(&q->q_usage_counter);
+       blk_queue_exit(q);
 fail:
        rcu_read_unlock();
        return false;
@@ -470,10 +444,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 
 static inline int bio_queue_enter(struct bio *bio)
 {
-       struct gendisk *disk = bio->bi_bdev->bd_disk;
-       struct request_queue *q = disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
        while (!blk_try_enter_queue(q, false)) {
+               struct gendisk *disk = bio->bi_bdev->bd_disk;
+
                if (bio->bi_opf & REQ_NOWAIT) {
                        if (test_bit(GD_DEAD, &disk->state))
                                goto dead;
@@ -553,7 +528,7 @@ struct request_queue *blk_alloc_queue(int node_id)
 
        q->node = node_id;
 
-       atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
+       atomic_set(&q->nr_active_requests_shared_tags, 0);
 
        timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
        INIT_WORK(&q->timeout_work, blk_timeout_work);
@@ -586,7 +561,7 @@ struct request_queue *blk_alloc_queue(int node_id)
 
        blk_queue_dma_alignment(q, 511);
        blk_set_default_limits(&q->limits);
-       q->nr_requests = BLKDEV_MAX_RQ;
+       q->nr_requests = BLKDEV_DEFAULT_RQ;
 
        return q;
 
@@ -622,40 +597,13 @@ bool blk_get_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_get_queue);
 
-/**
- * blk_get_request - allocate a request
- * @q: request queue to allocate a request for
- * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
- * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
- */
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
-                               blk_mq_req_flags_t flags)
-{
-       struct request *req;
-
-       WARN_ON_ONCE(op & REQ_NOWAIT);
-       WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM));
-
-       req = blk_mq_alloc_request(q, op, flags);
-       if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
-               q->mq_ops->initialize_rq_fn(req);
-
-       return req;
-}
-EXPORT_SYMBOL(blk_get_request);
-
-void blk_put_request(struct request *req)
-{
-       blk_mq_free_request(req);
-}
-EXPORT_SYMBOL(blk_put_request);
-
 static void handle_bad_sector(struct bio *bio, sector_t maxsector)
 {
        char b[BDEVNAME_SIZE];
 
-       pr_info_ratelimited("attempt to access beyond end of device\n"
+       pr_info_ratelimited("%s: attempt to access beyond end of device\n"
                            "%s: rw=%d, want=%llu, limit=%llu\n",
+                           current->comm,
                            bio_devname(bio, b), bio->bi_opf,
                            bio_end_sector(bio), maxsector);
 }
@@ -797,7 +745,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 {
        struct block_device *bdev = bio->bi_bdev;
-       struct request_queue *q = bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bdev);
        blk_status_t status = BLK_STS_IOERR;
        struct blk_plug *plug;
 
@@ -839,7 +787,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
        }
 
        if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-               bio_clear_hipri(bio);
+               bio_clear_polled(bio);
 
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
@@ -912,25 +860,22 @@ end_io:
        return false;
 }
 
-static blk_qc_t __submit_bio(struct bio *bio)
+static void __submit_bio(struct bio *bio)
 {
        struct gendisk *disk = bio->bi_bdev->bd_disk;
-       blk_qc_t ret = BLK_QC_T_NONE;
 
        if (unlikely(bio_queue_enter(bio) != 0))
-               return BLK_QC_T_NONE;
+               return;
 
        if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio))
                goto queue_exit;
-       if (disk->fops->submit_bio) {
-               ret = disk->fops->submit_bio(bio);
-               goto queue_exit;
+       if (!disk->fops->submit_bio) {
+               blk_mq_submit_bio(bio);
+               return;
        }
-       return blk_mq_submit_bio(bio);
-
+       disk->fops->submit_bio(bio);
 queue_exit:
        blk_queue_exit(disk->queue);
-       return ret;
 }
 
 /*
@@ -952,10 +897,9 @@ queue_exit:
  * bio_list_on_stack[1] contains bios that were submitted before the current
 *     ->submit_bio, but that haven't been processed yet.
  */
-static blk_qc_t __submit_bio_noacct(struct bio *bio)
+static void __submit_bio_noacct(struct bio *bio)
 {
        struct bio_list bio_list_on_stack[2];
-       blk_qc_t ret = BLK_QC_T_NONE;
 
        BUG_ON(bio->bi_next);
 
@@ -963,7 +907,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
        current->bio_list = bio_list_on_stack;
 
        do {
-               struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+               struct request_queue *q = bdev_get_queue(bio->bi_bdev);
                struct bio_list lower, same;
 
                /*
@@ -972,7 +916,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
                bio_list_on_stack[1] = bio_list_on_stack[0];
                bio_list_init(&bio_list_on_stack[0]);
 
-               ret = __submit_bio(bio);
+               __submit_bio(bio);
 
                /*
                 * Sort new bios into those for a lower level and those for the
@@ -981,7 +925,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
                bio_list_init(&lower);
                bio_list_init(&same);
                while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
-                       if (q == bio->bi_bdev->bd_disk->queue)
+                       if (q == bdev_get_queue(bio->bi_bdev))
                                bio_list_add(&same, bio);
                        else
                                bio_list_add(&lower, bio);
@@ -995,22 +939,19 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
        } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
 
        current->bio_list = NULL;
-       return ret;
 }
 
-static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
+static void __submit_bio_noacct_mq(struct bio *bio)
 {
        struct bio_list bio_list[2] = { };
-       blk_qc_t ret;
 
        current->bio_list = bio_list;
 
        do {
-               ret = __submit_bio(bio);
+               __submit_bio(bio);
        } while ((bio = bio_list_pop(&bio_list[0])));
 
        current->bio_list = NULL;
-       return ret;
 }
 
 /**
@@ -1022,7 +963,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
  * systems and other upper level users of the block layer should use
  * submit_bio() instead.
  */
-blk_qc_t submit_bio_noacct(struct bio *bio)
+void submit_bio_noacct(struct bio *bio)
 {
        /*
         * We only want one ->submit_bio to be active at a time, else stack
@@ -1030,14 +971,12 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
         * to collect a list of requests submitted by a ->submit_bio method while
         * it is active, and then process them after it returned.
         */
-       if (current->bio_list) {
+       if (current->bio_list)
                bio_list_add(&current->bio_list[0], bio);
-               return BLK_QC_T_NONE;
-       }
-
-       if (!bio->bi_bdev->bd_disk->fops->submit_bio)
-               return __submit_bio_noacct_mq(bio);
-       return __submit_bio_noacct(bio);
+       else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
+               __submit_bio_noacct_mq(bio);
+       else
+               __submit_bio_noacct(bio);
 }
 EXPORT_SYMBOL(submit_bio_noacct);
 
@@ -1054,10 +993,10 @@ EXPORT_SYMBOL(submit_bio_noacct);
 * in @bio.  The bio must NOT be touched by the caller until ->bi_end_io() has
  * been called.
  */
-blk_qc_t submit_bio(struct bio *bio)
+void submit_bio(struct bio *bio)
 {
        if (blkcg_punt_bio_submit(bio))
-               return BLK_QC_T_NONE;
+               return;
 
        /*
         * If it's a regular read/write or a barrier with data attached,
@@ -1068,7 +1007,7 @@ blk_qc_t submit_bio(struct bio *bio)
 
                if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
                        count = queue_logical_block_size(
-                                       bio->bi_bdev->bd_disk->queue) >> 9;
+                                       bdev_get_queue(bio->bi_bdev)) >> 9;
                else
                        count = bio_sectors(bio);
 
@@ -1089,20 +1028,93 @@ blk_qc_t submit_bio(struct bio *bio)
        if (unlikely(bio_op(bio) == REQ_OP_READ &&
            bio_flagged(bio, BIO_WORKINGSET))) {
                unsigned long pflags;
-               blk_qc_t ret;
 
                psi_memstall_enter(&pflags);
-               ret = submit_bio_noacct(bio);
+               submit_bio_noacct(bio);
                psi_memstall_leave(&pflags);
-
-               return ret;
+               return;
        }
 
-       return submit_bio_noacct(bio);
+       submit_bio_noacct(bio);
 }
 EXPORT_SYMBOL(submit_bio);
 
 /**
+ * bio_poll - poll for BIO completions
+ * @bio: bio to poll for
+ * @flags: BLK_POLL_* flags that control the behavior
+ *
+ * Poll for completions on the queue associated with the bio. Returns the
+ * number of completed entries found.
+ *
+ * Note: the caller must either be the context that submitted @bio, or
+ * be in an RCU critical section to prevent freeing of @bio.
+ */
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
+{
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+       blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
+       int ret;
+
+       if (cookie == BLK_QC_T_NONE ||
+           !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+               return 0;
+
+       if (current->plug)
+               blk_flush_plug(current->plug, false);
+
+       if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT))
+               return 0;
+       if (WARN_ON_ONCE(!queue_is_mq(q)))
+               ret = 0;        /* not yet implemented, should not happen */
+       else
+               ret = blk_mq_poll(q, cookie, iob, flags);
+       blk_queue_exit(q);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(bio_poll);
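
A hedged sketch of a synchronous polling loop over the new interface; "done"
is a hypothetical flag set by the caller's bi_end_io handler, and the bio is
assumed to have been submitted with REQ_POLLED set:

	bio->bi_opf |= REQ_POLLED;
	submit_bio(bio);
	while (!READ_ONCE(done))
		bio_poll(bio, NULL, 0);	/* no completion batch, no flags */
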
+
+/*
+ * Helper to implement file_operations.iopoll.  Requires the bio to be stored
+ * in iocb->private, and cleared before freeing the bio.
+ */
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+                   unsigned int flags)
+{
+       struct bio *bio;
+       int ret = 0;
+
+       /*
+        * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
+        * point to a freshly allocated bio at this point.  If that happens
+        * we have a few cases to consider:
+        *
+        *  1) the bio is being initialized and bi_bdev is NULL.  We can
+        *     simply do nothing in this case
+        *  2) the bio points to a device that is not poll enabled.  bio_poll
+        *     will catch this and return 0
+        *  3) the bio points to a poll capable device, including but not
+        *     limited to the one that the original bio pointed to.  In this
+        *     case we will call into the actual poll method and poll for I/O,
+        *     even if we don't need to, but it won't cause harm either.
+        *
+        * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
+        * is still allocated. Because partitions hold a reference to the whole
+        * device bdev and thus disk, the disk is also still valid.  Grabbing
+        * a reference to the queue in bio_poll() ensures the hctxs and requests
+        * are still valid as well.
+        */
+       rcu_read_lock();
+       bio = READ_ONCE(kiocb->private);
+       if (bio && bio->bi_bdev)
+               ret = bio_poll(bio, iob, flags);
+       rcu_read_unlock();
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
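
A minimal sketch of the wiring this helper enables; example_fops is
hypothetical, and its read/write paths are assumed to store their bio in
kiocb->private before submission:

static const struct file_operations example_fops = {
	/* ... read_iter/write_iter that stash the bio in iocb->private ... */
	.iopoll		= iocb_bio_iopoll,
};
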
+
+/**
  * blk_cloned_rq_check_limits - Helper function to check a cloned request
  *                              for the new queue limits
  * @q:  the queue
@@ -1177,8 +1189,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
        if (blk_crypto_insert_cloned_request(rq))
                return BLK_STS_IOERR;
 
-       if (blk_queue_io_stat(q))
-               blk_account_io_start(rq);
+       blk_account_io_start(rq);
 
        /*
         * Since we have a scheduler attached on the top device,
@@ -1246,41 +1257,19 @@ again:
        }
 }
 
-static void blk_account_io_completion(struct request *req, unsigned int bytes)
+void __blk_account_io_done(struct request *req, u64 now)
 {
-       if (req->part && blk_do_io_stat(req)) {
-               const int sgrp = op_stat_group(req_op(req));
+       const int sgrp = op_stat_group(req_op(req));
 
-               part_stat_lock();
-               part_stat_add(req->part, sectors[sgrp], bytes >> 9);
-               part_stat_unlock();
-       }
-}
-
-void blk_account_io_done(struct request *req, u64 now)
-{
-       /*
-        * Account IO completion.  flush_rq isn't accounted as a
-        * normal IO on queueing nor completion.  Accounting the
-        * containing request is enough.
-        */
-       if (req->part && blk_do_io_stat(req) &&
-           !(req->rq_flags & RQF_FLUSH_SEQ)) {
-               const int sgrp = op_stat_group(req_op(req));
-
-               part_stat_lock();
-               update_io_ticks(req->part, jiffies, true);
-               part_stat_inc(req->part, ios[sgrp]);
-               part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
-               part_stat_unlock();
-       }
+       part_stat_lock();
+       update_io_ticks(req->part, jiffies, true);
+       part_stat_inc(req->part, ios[sgrp]);
+       part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+       part_stat_unlock();
 }
 
-void blk_account_io_start(struct request *rq)
+void __blk_account_io_start(struct request *rq)
 {
-       if (!blk_do_io_stat(rq))
-               return;
-
        /* passthrough requests can hold bios that do not have ->bi_bdev set */
        if (rq->bio && rq->bio->bi_bdev)
                rq->part = rq->bio->bi_bdev;
@@ -1376,112 +1365,6 @@ void blk_steal_bios(struct bio_list *list, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_steal_bios);
 
-/**
- * blk_update_request - Complete multiple bytes without completing the request
- * @req:      the request being processed
- * @error:    block status code
- * @nr_bytes: number of bytes to complete for @req
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @req, but doesn't complete
- *     the request structure even if @req doesn't have leftover.
- *     If @req has leftover, sets it up for the next range of segments.
- *
- *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
- *     %false return from this function.
- *
- * Note:
- *     The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
- *      except in the consistency check at the end of this function.
- *
- * Return:
- *     %false - this request doesn't have any more data
- *     %true  - this request has more data
- **/
-bool blk_update_request(struct request *req, blk_status_t error,
-               unsigned int nr_bytes)
-{
-       int total_bytes;
-
-       trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
-
-       if (!req->bio)
-               return false;
-
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-       if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
-           error == BLK_STS_OK)
-               req->q->integrity.profile->complete_fn(req, nr_bytes);
-#endif
-
-       if (unlikely(error && !blk_rq_is_passthrough(req) &&
-                    !(req->rq_flags & RQF_QUIET)))
-               print_req_error(req, error, __func__);
-
-       blk_account_io_completion(req, nr_bytes);
-
-       total_bytes = 0;
-       while (req->bio) {
-               struct bio *bio = req->bio;
-               unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
-
-               if (bio_bytes == bio->bi_iter.bi_size)
-                       req->bio = bio->bi_next;
-
-               /* Completion has already been traced */
-               bio_clear_flag(bio, BIO_TRACE_COMPLETION);
-               req_bio_endio(req, bio, bio_bytes, error);
-
-               total_bytes += bio_bytes;
-               nr_bytes -= bio_bytes;
-
-               if (!nr_bytes)
-                       break;
-       }
-
-       /*
-        * completely done
-        */
-       if (!req->bio) {
-               /*
-                * Reset counters so that the request stacking driver
-                * can find how many bytes remain in the request
-                * later.
-                */
-               req->__data_len = 0;
-               return false;
-       }
-
-       req->__data_len -= total_bytes;
-
-       /* update sector only for requests with clear definition of sector */
-       if (!blk_rq_is_passthrough(req))
-               req->__sector += total_bytes >> 9;
-
-       /* mixed attributes always follow the first bio */
-       if (req->rq_flags & RQF_MIXED_MERGE) {
-               req->cmd_flags &= ~REQ_FAILFAST_MASK;
-               req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
-       }
-
-       if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
-               /*
-                * If total number of sectors is less than the first segment
-                * size, something has gone terribly wrong.
-                */
-               if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
-                       blk_dump_rq_flags(req, "request botched");
-                       req->__data_len = blk_rq_cur_bytes(req);
-               }
-
-               /* recalculate the number of segments */
-               req->nr_phys_segments = blk_recalc_rq_segments(req);
-       }
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(blk_update_request);
-
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 /**
  * rq_flush_dcache_pages - Helper function to flush all pages in a request
@@ -1629,6 +1512,32 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
 
+void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
+{
+       struct task_struct *tsk = current;
+
+       /*
+        * If this is a nested plug, don't actually assign it.
+        */
+       if (tsk->plug)
+               return;
+
+       plug->mq_list = NULL;
+       plug->cached_rq = NULL;
+       plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
+       plug->rq_count = 0;
+       plug->multiple_queues = false;
+       plug->has_elevator = false;
+       plug->nowait = false;
+       INIT_LIST_HEAD(&plug->cb_list);
+
+       /*
+        * Store ordering should not be needed here, since a potential
+        * preempt will imply a full memory barrier
+        */
+       tsk->plug = plug;
+}
+
 /**
  * blk_start_plug - initialize blk_plug and track it inside the task_struct
  * @plug:      The &struct blk_plug that needs to be initialized
@@ -1654,25 +1563,7 @@ EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
  */
 void blk_start_plug(struct blk_plug *plug)
 {
-       struct task_struct *tsk = current;
-
-       /*
-        * If this is a nested plug, don't actually assign it.
-        */
-       if (tsk->plug)
-               return;
-
-       INIT_LIST_HEAD(&plug->mq_list);
-       INIT_LIST_HEAD(&plug->cb_list);
-       plug->rq_count = 0;
-       plug->multiple_queues = false;
-       plug->nowait = false;
-
-       /*
-        * Store ordering should not be needed here, since a potential
-        * preempt will imply a full memory barrier
-        */
-       tsk->plug = plug;
+       blk_start_plug_nr_ios(plug, 1);
 }
 EXPORT_SYMBOL(blk_start_plug);
 
@@ -1718,12 +1609,14 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
 }
 EXPORT_SYMBOL(blk_check_plugged);
 
-void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
 {
-       flush_plug_callbacks(plug, from_schedule);
-
-       if (!list_empty(&plug->mq_list))
+       if (!list_empty(&plug->cb_list))
+               flush_plug_callbacks(plug, from_schedule);
+       if (!rq_list_empty(plug->mq_list))
                blk_mq_flush_plug_list(plug, from_schedule);
+       if (unlikely(!from_schedule && plug->cached_rq))
+               blk_mq_free_plug_rqs(plug);
 }
 
 /**
@@ -1738,11 +1631,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
  */
 void blk_finish_plug(struct blk_plug *plug)
 {
-       if (plug != current->plug)
-               return;
-       blk_flush_plug_list(plug, false);
-
-       current->plug = NULL;
+       if (plug == current->plug) {
+               blk_flush_plug(plug, false);
+               current->plug = NULL;
+       }
 }
 EXPORT_SYMBOL(blk_finish_plug);
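
A hedged sketch of the plugging pattern the new nr_ios variant serves; nr and
bios[] are hypothetical caller state:

	struct blk_plug plug;
	int i;

	blk_start_plug_nr_ios(&plug, nr);	/* or blk_start_plug(&plug) */
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	blk_finish_plug(&plug);			/* flush the batched requests */
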
 
index c322176..c87aba8 100644 (file)
 #include <crypto/skcipher.h>
 #include <linux/blk-cgroup.h>
 #include <linux/blk-crypto.h>
+#include <linux/blk-crypto-profile.h>
 #include <linux/blkdev.h>
 #include <linux/crypto.h>
-#include <linux/keyslot-manager.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/scatterlist.h>
 
 #include "blk-crypto-internal.h"
 
@@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool;
 static DEFINE_MUTEX(tfms_init_lock);
 static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
 
-static struct blk_crypto_keyslot {
+static struct blk_crypto_fallback_keyslot {
        enum blk_crypto_mode_num crypto_mode;
        struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
 } *blk_crypto_keyslots;
 
-static struct blk_keyslot_manager blk_crypto_ksm;
+static struct blk_crypto_profile blk_crypto_fallback_profile;
 static struct workqueue_struct *blk_crypto_wq;
 static mempool_t *blk_crypto_bounce_page_pool;
 static struct bio_set crypto_bio_split;
@@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split;
  */
 static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
 
-static void blk_crypto_evict_keyslot(unsigned int slot)
+static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
 {
-       struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+       struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
        enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
        int err;
 
@@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot)
        slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
 }
 
-static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
-                                     const struct blk_crypto_key *key,
-                                     unsigned int slot)
+static int
+blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
+                                   const struct blk_crypto_key *key,
+                                   unsigned int slot)
 {
-       struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+       struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
        const enum blk_crypto_mode_num crypto_mode =
                                                key->crypto_cfg.crypto_mode;
        int err;
 
        if (crypto_mode != slotp->crypto_mode &&
            slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
-               blk_crypto_evict_keyslot(slot);
+               blk_crypto_fallback_evict_keyslot(slot);
 
        slotp->crypto_mode = crypto_mode;
        err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
                                     key->size);
        if (err) {
-               blk_crypto_evict_keyslot(slot);
+               blk_crypto_fallback_evict_keyslot(slot);
                return err;
        }
        return 0;
 }
 
-static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
-                                   const struct blk_crypto_key *key,
-                                   unsigned int slot)
+static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile,
+                                            const struct blk_crypto_key *key,
+                                            unsigned int slot)
 {
-       blk_crypto_evict_keyslot(slot);
+       blk_crypto_fallback_evict_keyslot(slot);
        return 0;
 }
 
-/*
- * The crypto API fallback KSM ops - only used for a bio when it specifies a
- * blk_crypto_key that was not supported by the device's inline encryption
- * hardware.
- */
-static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
-       .keyslot_program        = blk_crypto_keyslot_program,
-       .keyslot_evict          = blk_crypto_keyslot_evict,
+static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
+       .keyslot_program        = blk_crypto_fallback_keyslot_program,
+       .keyslot_evict          = blk_crypto_fallback_keyslot_evict,
 };
 
 static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
@@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
        bio_endio(src_bio);
 }
 
-static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
+static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
 {
        struct bvec_iter iter;
        struct bio_vec bv;
@@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
        return bio;
 }
 
-static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
-                                       struct skcipher_request **ciph_req_ret,
-                                       struct crypto_wait *wait)
+static bool
+blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
+                                    struct skcipher_request **ciph_req_ret,
+                                    struct crypto_wait *wait)
 {
        struct skcipher_request *ciph_req;
-       const struct blk_crypto_keyslot *slotp;
-       int keyslot_idx = blk_ksm_get_slot_idx(slot);
+       const struct blk_crypto_fallback_keyslot *slotp;
+       int keyslot_idx = blk_crypto_keyslot_index(slot);
 
        slotp = &blk_crypto_keyslots[keyslot_idx];
        ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
@@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
        return true;
 }
 
-static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
+static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
 {
        struct bio *bio = *bio_ptr;
        unsigned int i = 0;
@@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
 {
        struct bio *src_bio, *enc_bio;
        struct bio_crypt_ctx *bc;
-       struct blk_ksm_keyslot *slot;
+       struct blk_crypto_keyslot *slot;
        int data_unit_size;
        struct skcipher_request *ciph_req = NULL;
        DECLARE_CRYPTO_WAIT(wait);
@@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
        blk_status_t blk_st;
 
        /* Split the bio if it's too big for single page bvec */
-       if (!blk_crypto_split_bio_if_needed(bio_ptr))
+       if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
                return false;
 
        src_bio = *bio_ptr;
@@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
        data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
 
        /* Allocate bounce bio for encryption */
-       enc_bio = blk_crypto_clone_bio(src_bio);
+       enc_bio = blk_crypto_fallback_clone_bio(src_bio);
        if (!enc_bio) {
                src_bio->bi_status = BLK_STS_RESOURCE;
                return false;
        }
 
        /*
-        * Use the crypto API fallback keyslot manager to get a crypto_skcipher
-        * for the algorithm and key specified for this bio.
+        * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
+        * this bio's algorithm and key.
         */
-       blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+       blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
+                                       bc->bc_key, &slot);
        if (blk_st != BLK_STS_OK) {
                src_bio->bi_status = blk_st;
                goto out_put_enc_bio;
        }
 
        /* and then allocate an skcipher_request for it */
-       if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+       if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
                src_bio->bi_status = BLK_STS_RESOURCE;
                goto out_release_keyslot;
        }
@@ -362,7 +361,7 @@ out_free_bounce_pages:
 out_free_ciph_req:
        skcipher_request_free(ciph_req);
 out_release_keyslot:
-       blk_ksm_put_slot(slot);
+       blk_crypto_put_keyslot(slot);
 out_put_enc_bio:
        if (enc_bio)
                bio_put(enc_bio);
@@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
                container_of(work, struct bio_fallback_crypt_ctx, work);
        struct bio *bio = f_ctx->bio;
        struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
-       struct blk_ksm_keyslot *slot;
+       struct blk_crypto_keyslot *slot;
        struct skcipher_request *ciph_req = NULL;
        DECLARE_CRYPTO_WAIT(wait);
        u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
@@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
        blk_status_t blk_st;
 
        /*
-        * Use the crypto API fallback keyslot manager to get a crypto_skcipher
-        * for the algorithm and key specified for this bio.
+        * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
+        * this bio's algorithm and key.
         */
-       blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+       blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
+                                       bc->bc_key, &slot);
        if (blk_st != BLK_STS_OK) {
                bio->bi_status = blk_st;
                goto out_no_keyslot;
        }
 
        /* and then allocate an skcipher_request for it */
-       if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+       if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
                bio->bi_status = BLK_STS_RESOURCE;
                goto out;
        }
@@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
 
 out:
        skcipher_request_free(ciph_req);
-       blk_ksm_put_slot(slot);
+       blk_crypto_put_keyslot(slot);
 out_no_keyslot:
        mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
        bio_endio(bio);
@@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
  * @bio_ptr: pointer to the bio to prepare
  *
  * If bio is doing a WRITE operation, this splits the bio into two parts if it's
- * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio
- * for the first part, encrypts it, and update bio_ptr to point to the bounce
- * bio.
+ * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
+ * bounce bio for the first part, encrypts it, and updates bio_ptr to point to
+ * the bounce bio.
  *
  * For a READ operation, we mark the bio for decryption by using bi_private and
  * bi_end_io.
@@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
                return false;
        }
 
-       if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm,
-                                         &bc->bc_key->crypto_cfg)) {
+       if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile,
+                                       &bc->bc_key->crypto_cfg)) {
                bio->bi_status = BLK_STS_NOTSUPP;
                return false;
        }
@@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
 
 int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
 {
-       return blk_ksm_evict_key(&blk_crypto_ksm, key);
+       return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key);
 }
 
 static bool blk_crypto_fallback_inited;
@@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void)
 {
        int i;
        int err;
+       struct blk_crypto_profile *profile = &blk_crypto_fallback_profile;
 
        if (blk_crypto_fallback_inited)
                return 0;
@@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void)
        if (err)
                goto out;
 
-       err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
+       err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots);
        if (err)
                goto fail_free_bioset;
        err = -ENOMEM;
 
-       blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
-       blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+       profile->ll_ops = blk_crypto_fallback_ll_ops;
+       profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
 
        /* All blk-crypto modes have a crypto API fallback. */
        for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
-               blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF;
-       blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
+               profile->modes_supported[i] = 0xFFFFFFFF;
+       profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
 
        blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
                                        WQ_UNBOUND | WQ_HIGHPRI |
                                        WQ_MEM_RECLAIM, num_online_cpus());
        if (!blk_crypto_wq)
-               goto fail_free_ksm;
+               goto fail_destroy_profile;
 
        blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
                                      sizeof(blk_crypto_keyslots[0]),
@@ -595,8 +596,8 @@ fail_free_keyslots:
        kfree(blk_crypto_keyslots);
 fail_free_wq:
        destroy_workqueue(blk_crypto_wq);
-fail_free_ksm:
-       blk_ksm_destroy(&blk_crypto_ksm);
+fail_destroy_profile:
+       blk_crypto_profile_destroy(profile);
 fail_free_bioset:
        bioset_exit(&crypto_bio_split);
 out:
@@ -610,7 +611,7 @@ out:
 int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
 {
        const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
-       struct blk_crypto_keyslot *slotp;
+       struct blk_crypto_fallback_keyslot *slotp;
        unsigned int i;
        int err = 0;
 
index 0d36aae..2fb0d65 100644 (file)
@@ -7,7 +7,7 @@
 #define __LINUX_BLK_CRYPTO_INTERNAL_H
 
 #include <linux/bio.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 /* Represents a crypto mode supported by blk-crypto  */
 struct blk_crypto_mode {
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
new file mode 100644 (file)
index 0000000..605ba06
--- /dev/null
@@ -0,0 +1,565 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/**
+ * DOC: blk-crypto profiles
+ *
+ * 'struct blk_crypto_profile' contains all generic inline encryption-related
+ * state for a particular inline encryption device.  blk_crypto_profile serves
+ * as the way that drivers for inline encryption hardware expose their crypto
+ * capabilities and certain functions (e.g., functions to program and evict
+ * keys) to upper layers.  Device drivers that want to support inline encryption
+ * construct a crypto profile, then associate it with the disk's request_queue.
+ *
+ * If the device has keyslots, then its blk_crypto_profile also handles managing
+ * these keyslots in a device-independent way, using the driver-provided
+ * functions to program and evict keys as needed.  This includes keeping track
+ * of which key and how many I/O requests are using each keyslot, getting
+ * keyslots for I/O requests, and handling key eviction requests.
+ *
+ * For more information, see Documentation/block/inline-encryption.rst.
+ */
+
+#define pr_fmt(fmt) "blk-crypto: " fmt
+
+#include <linux/blk-crypto-profile.h>
+#include <linux/device.h>
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/pm_runtime.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
+
+struct blk_crypto_keyslot {
+       atomic_t slot_refs;
+       struct list_head idle_slot_node;
+       struct hlist_node hash_node;
+       const struct blk_crypto_key *key;
+       struct blk_crypto_profile *profile;
+};
+
+static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
+{
+       /*
+        * Calling into the driver requires profile->lock held and the device
+        * resumed.  But we must resume the device first, since that can acquire
+        * and release profile->lock via blk_crypto_reprogram_all_keys().
+        */
+       if (profile->dev)
+               pm_runtime_get_sync(profile->dev);
+       down_write(&profile->lock);
+}
+
+static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile)
+{
+       up_write(&profile->lock);
+       if (profile->dev)
+               pm_runtime_put_sync(profile->dev);
+}
+
+/**
+ * blk_crypto_profile_init() - Initialize a blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Storage drivers must call this when starting to set up a blk_crypto_profile,
+ * before filling in additional fields.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int blk_crypto_profile_init(struct blk_crypto_profile *profile,
+                           unsigned int num_slots)
+{
+       unsigned int slot;
+       unsigned int i;
+       unsigned int slot_hashtable_size;
+
+       memset(profile, 0, sizeof(*profile));
+       init_rwsem(&profile->lock);
+
+       if (num_slots == 0)
+               return 0;
+
+       /* Initialize keyslot management data. */
+
+       profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]),
+                                 GFP_KERNEL);
+       if (!profile->slots)
+               return -ENOMEM;
+
+       profile->num_slots = num_slots;
+
+       init_waitqueue_head(&profile->idle_slots_wait_queue);
+       INIT_LIST_HEAD(&profile->idle_slots);
+
+       for (slot = 0; slot < num_slots; slot++) {
+               profile->slots[slot].profile = profile;
+               list_add_tail(&profile->slots[slot].idle_slot_node,
+                             &profile->idle_slots);
+       }
+
+       spin_lock_init(&profile->idle_slots_lock);
+
+       slot_hashtable_size = roundup_pow_of_two(num_slots);
+       /*
+        * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2
+        * buckets.  This only makes a difference when there is only 1 keyslot.
+        */
+       if (slot_hashtable_size < 2)
+               slot_hashtable_size = 2;
+
+       profile->log_slot_ht_size = ilog2(slot_hashtable_size);
+       profile->slot_hashtable =
+               kvmalloc_array(slot_hashtable_size,
+                              sizeof(profile->slot_hashtable[0]), GFP_KERNEL);
+       if (!profile->slot_hashtable)
+               goto err_destroy;
+       for (i = 0; i < slot_hashtable_size; i++)
+               INIT_HLIST_HEAD(&profile->slot_hashtable[i]);
+
+       return 0;
+
+err_destroy:
+       blk_crypto_profile_destroy(profile);
+       return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_profile_init);
+
+static void blk_crypto_profile_destroy_callback(void *profile)
+{
+       blk_crypto_profile_destroy(profile);
+}
+
+/**
+ * devm_blk_crypto_profile_init() - Resource-managed blk_crypto_profile_init()
+ * @dev: the device which owns the blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Like blk_crypto_profile_init(), but causes blk_crypto_profile_destroy() to be
+ * called automatically on driver detach.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int devm_blk_crypto_profile_init(struct device *dev,
+                                struct blk_crypto_profile *profile,
+                                unsigned int num_slots)
+{
+       int err = blk_crypto_profile_init(profile, num_slots);
+
+       if (err)
+               return err;
+
+       return devm_add_action_or_reset(dev,
+                                       blk_crypto_profile_destroy_callback,
+                                       profile);
+}
+EXPORT_SYMBOL_GPL(devm_blk_crypto_profile_init);
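For orientation, here is a minimal driver-side sketch of the setup flow the DOC comment and the two init helpers above describe. All of the "foo_*" names, the keyslot count and the capability values are illustrative assumptions, not taken from any in-tree driver.

/*
 * Sketch only: a hypothetical driver constructs a crypto profile, fills in
 * its low-level ops and capabilities, and registers it with the queue.
 */
#include <linux/blk-crypto-profile.h>

static int foo_keyslot_program(struct blk_crypto_profile *profile,
                               const struct blk_crypto_key *key,
                               unsigned int slot)
{
        /* Program @key into hardware keyslot @slot (hypothetical hardware). */
        return 0;
}

static int foo_keyslot_evict(struct blk_crypto_profile *profile,
                             const struct blk_crypto_key *key,
                             unsigned int slot)
{
        /* Clear hardware keyslot @slot (hypothetical hardware). */
        return 0;
}

static const struct blk_crypto_ll_ops foo_crypto_ll_ops = {
        .keyslot_program        = foo_keyslot_program,
        .keyslot_evict          = foo_keyslot_evict,
};

static int foo_init_crypto(struct device *dev,
                           struct blk_crypto_profile *profile,
                           struct request_queue *q)
{
        /* 32 keyslots is an arbitrary example value. */
        int err = devm_blk_crypto_profile_init(dev, profile, 32);

        if (err)
                return err;

        profile->ll_ops = foo_crypto_ll_ops;
        profile->dev = dev;
        profile->max_dun_bytes_supported = 8;
        /* modes_supported[] holds a bitmask of supported data unit sizes. */
        profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] |= 4096;

        /* Hand the profile to the block layer once the queue exists. */
        if (!blk_crypto_register(profile, q))
                return -EINVAL;
        return 0;
}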
+
+static inline struct hlist_head *
+blk_crypto_hash_bucket_for_key(struct blk_crypto_profile *profile,
+                              const struct blk_crypto_key *key)
+{
+       return &profile->slot_hashtable[
+                       hash_ptr(key, profile->log_slot_ht_size)];
+}
+
+static void
+blk_crypto_remove_slot_from_lru_list(struct blk_crypto_keyslot *slot)
+{
+       struct blk_crypto_profile *profile = slot->profile;
+       unsigned long flags;
+
+       spin_lock_irqsave(&profile->idle_slots_lock, flags);
+       list_del(&slot->idle_slot_node);
+       spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
+}
+
+static struct blk_crypto_keyslot *
+blk_crypto_find_keyslot(struct blk_crypto_profile *profile,
+                       const struct blk_crypto_key *key)
+{
+       const struct hlist_head *head =
+               blk_crypto_hash_bucket_for_key(profile, key);
+       struct blk_crypto_keyslot *slotp;
+
+       hlist_for_each_entry(slotp, head, hash_node) {
+               if (slotp->key == key)
+                       return slotp;
+       }
+       return NULL;
+}
+
+static struct blk_crypto_keyslot *
+blk_crypto_find_and_grab_keyslot(struct blk_crypto_profile *profile,
+                                const struct blk_crypto_key *key)
+{
+       struct blk_crypto_keyslot *slot;
+
+       slot = blk_crypto_find_keyslot(profile, key);
+       if (!slot)
+               return NULL;
+       if (atomic_inc_return(&slot->slot_refs) == 1) {
+               /* Took first reference to this slot; remove it from LRU list */
+               blk_crypto_remove_slot_from_lru_list(slot);
+       }
+       return slot;
+}
+
+/**
+ * blk_crypto_keyslot_index() - Get the index of a keyslot
+ * @slot: a keyslot that blk_crypto_get_keyslot() returned
+ *
+ * Return: the 0-based index of the keyslot within the device's keyslots.
+ */
+unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot)
+{
+       return slot - slot->profile->slots;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index);
+
+/**
+ * blk_crypto_get_keyslot() - Get a keyslot for a key, if needed.
+ * @profile: the crypto profile of the device the key will be used on
+ * @key: the key that will be used
+ * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct
+ *           will be stored here; otherwise NULL will be stored here.
+ *
+ * If the device has keyslots, this gets a keyslot that's been programmed with
+ * the specified key.  If the key is already in a slot, this reuses it;
+ * otherwise this waits for a slot to become idle and programs the key into it.
+ *
+ * This must be paired with a call to blk_crypto_put_keyslot().
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or
+ *        one wasn't needed; or a blk_status_t error on failure.
+ */
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+                                   const struct blk_crypto_key *key,
+                                   struct blk_crypto_keyslot **slot_ptr)
+{
+       struct blk_crypto_keyslot *slot;
+       int slot_idx;
+       int err;
+
+       *slot_ptr = NULL;
+
+       /*
+        * If the device has no concept of "keyslots", then there is no need to
+        * get one.
+        */
+       if (profile->num_slots == 0)
+               return BLK_STS_OK;
+
+       down_read(&profile->lock);
+       slot = blk_crypto_find_and_grab_keyslot(profile, key);
+       up_read(&profile->lock);
+       if (slot)
+               goto success;
+
+       for (;;) {
+               blk_crypto_hw_enter(profile);
+               slot = blk_crypto_find_and_grab_keyslot(profile, key);
+               if (slot) {
+                       blk_crypto_hw_exit(profile);
+                       goto success;
+               }
+
+               /*
+                * If we're here, that means there wasn't a slot that was
+                * already programmed with the key. So try to program it.
+                */
+               if (!list_empty(&profile->idle_slots))
+                       break;
+
+               blk_crypto_hw_exit(profile);
+               wait_event(profile->idle_slots_wait_queue,
+                          !list_empty(&profile->idle_slots));
+       }
+
+       slot = list_first_entry(&profile->idle_slots, struct blk_crypto_keyslot,
+                               idle_slot_node);
+       slot_idx = blk_crypto_keyslot_index(slot);
+
+       err = profile->ll_ops.keyslot_program(profile, key, slot_idx);
+       if (err) {
+               wake_up(&profile->idle_slots_wait_queue);
+               blk_crypto_hw_exit(profile);
+               return errno_to_blk_status(err);
+       }
+
+       /* Move this slot to the hash list for the new key. */
+       if (slot->key)
+               hlist_del(&slot->hash_node);
+       slot->key = key;
+       hlist_add_head(&slot->hash_node,
+                      blk_crypto_hash_bucket_for_key(profile, key));
+
+       atomic_set(&slot->slot_refs, 1);
+
+       blk_crypto_remove_slot_from_lru_list(slot);
+
+       blk_crypto_hw_exit(profile);
+success:
+       *slot_ptr = slot;
+       return BLK_STS_OK;
+}
+
+/**
+ * blk_crypto_put_keyslot() - Release a reference to a keyslot
+ * @slot: The keyslot to release the reference of (may be NULL).
+ *
+ * Context: Any context.
+ */
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot)
+{
+       struct blk_crypto_profile *profile;
+       unsigned long flags;
+
+       if (!slot)
+               return;
+
+       profile = slot->profile;
+
+       if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
+                                       &profile->idle_slots_lock, flags)) {
+               list_add_tail(&slot->idle_slot_node, &profile->idle_slots);
+               spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
+               wake_up(&profile->idle_slots_wait_queue);
+       }
+}
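A minimal sketch of the get/put pairing required above; the blk-crypto.c hunk later in this diff (__blk_crypto_init_request() and __blk_crypto_free_request()) is the real in-tree call site, so the caller below is purely hypothetical.

static blk_status_t foo_prep_crypto(struct blk_crypto_profile *profile,
                                    const struct blk_crypto_key *key)
{
        struct blk_crypto_keyslot *slot;
        blk_status_t status;

        status = blk_crypto_get_keyslot(profile, key, &slot);
        if (status != BLK_STS_OK)
                return status;

        /* slot is NULL when the device has no keyslots. */
        if (slot)
                pr_debug("using keyslot %u\n", blk_crypto_keyslot_index(slot));

        /* ... issue the I/O that uses @key ... */

        blk_crypto_put_keyslot(slot);   /* NULL-safe */
        return BLK_STS_OK;
}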
+
+/**
+ * __blk_crypto_cfg_supported() - Check whether the given crypto profile
+ *                               supports the given crypto configuration.
+ * @profile: the crypto profile to check
+ * @cfg: the crypto configuration to check for
+ *
+ * Return: %true if @profile supports the given @cfg.
+ */
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+                               const struct blk_crypto_config *cfg)
+{
+       if (!profile)
+               return false;
+       if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size))
+               return false;
+       if (profile->max_dun_bytes_supported < cfg->dun_bytes)
+               return false;
+       return true;
+}
+
+/**
+ * __blk_crypto_evict_key() - Evict a key from a device.
+ * @profile: the crypto profile of the device
+ * @key: the key to evict.  It must not still be used in any I/O.
+ *
+ * If the device has keyslots, this finds the keyslot (if any) that contains the
+ * specified key and calls the driver's keyslot_evict function to evict it.
+ *
+ * Otherwise, this just calls the driver's keyslot_evict function if it is
+ * implemented, passing just the key (without any particular keyslot).  This
+ * allows layered devices to evict the key from their underlying devices.
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY
+ *        if the keyslot is still in use, or another -errno value on other
+ *        error.
+ */
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+                          const struct blk_crypto_key *key)
+{
+       struct blk_crypto_keyslot *slot;
+       int err = 0;
+
+       if (profile->num_slots == 0) {
+               if (profile->ll_ops.keyslot_evict) {
+                       blk_crypto_hw_enter(profile);
+                       err = profile->ll_ops.keyslot_evict(profile, key, -1);
+                       blk_crypto_hw_exit(profile);
+                       return err;
+               }
+               return 0;
+       }
+
+       blk_crypto_hw_enter(profile);
+       slot = blk_crypto_find_keyslot(profile, key);
+       if (!slot)
+               goto out_unlock;
+
+       if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
+               err = -EBUSY;
+               goto out_unlock;
+       }
+       err = profile->ll_ops.keyslot_evict(profile, key,
+                                           blk_crypto_keyslot_index(slot));
+       if (err)
+               goto out_unlock;
+
+       hlist_del(&slot->hash_node);
+       slot->key = NULL;
+       err = 0;
+out_unlock:
+       blk_crypto_hw_exit(profile);
+       return err;
+}
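As a sketch of the keyslot-less path above: for a profile created with zero keyslots, the driver's keyslot_evict hook is invoked with a slot index of -1 and only the key, which lets a stacked driver forward the eviction to the device(s) below it. The container struct and single underlying queue here are assumptions for illustration.

struct foo_stacked_dev {
        struct blk_crypto_profile profile;      /* initialized with 0 keyslots */
        struct request_queue *lower_q;
};

static int foo_stacked_keyslot_evict(struct blk_crypto_profile *profile,
                                     const struct blk_crypto_key *key,
                                     unsigned int slot)
{
        struct foo_stacked_dev *d =
                container_of(profile, struct foo_stacked_dev, profile);

        /* slot is -1 here because this profile has no keyslots of its own. */
        return blk_crypto_evict_key(d->lower_q, key);
}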
+
+/**
+ * blk_crypto_reprogram_all_keys() - Re-program all keyslots.
+ * @profile: The crypto profile
+ *
+ * Re-program all keyslots that are supposed to have a key programmed.  This is
+ * intended only for use by drivers for hardware that loses its keys on reset.
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ */
+void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile)
+{
+       unsigned int slot;
+
+       if (profile->num_slots == 0)
+               return;
+
+       /* This is for device initialization, so don't resume the device */
+       down_write(&profile->lock);
+       for (slot = 0; slot < profile->num_slots; slot++) {
+               const struct blk_crypto_key *key = profile->slots[slot].key;
+               int err;
+
+               if (!key)
+                       continue;
+
+               err = profile->ll_ops.keyslot_program(profile, key, slot);
+               WARN_ON(err);
+       }
+       up_write(&profile->lock);
+}
+EXPORT_SYMBOL_GPL(blk_crypto_reprogram_all_keys);
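The kernel-doc above limits this to hardware that loses its keys on reset, so a plausible call site is a driver's reset-recovery or resume path, sketched below with hypothetical helpers.

static int foo_host_reset_done(struct foo_host *host)
{
        int err = foo_reinit_hardware(host);    /* hypothetical */

        if (err)
                return err;

        /* The controller just lost its keyslots; restore every programmed key. */
        blk_crypto_reprogram_all_keys(&host->crypto_profile);
        return 0;
}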
+
+void blk_crypto_profile_destroy(struct blk_crypto_profile *profile)
+{
+       if (!profile)
+               return;
+       kvfree(profile->slot_hashtable);
+       kvfree_sensitive(profile->slots,
+                        sizeof(profile->slots[0]) * profile->num_slots);
+       memzero_explicit(profile, sizeof(*profile));
+}
+EXPORT_SYMBOL_GPL(blk_crypto_profile_destroy);
+
+bool blk_crypto_register(struct blk_crypto_profile *profile,
+                        struct request_queue *q)
+{
+       if (blk_integrity_queue_supports_integrity(q)) {
+               pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
+               return false;
+       }
+       q->crypto_profile = profile;
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_register);
+
+void blk_crypto_unregister(struct request_queue *q)
+{
+       q->crypto_profile = NULL;
+}
+
+/**
+ * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
+ *                                      by child device
+ * @parent: the crypto profile for the parent device
+ * @child: the crypto profile for the child device, or NULL
+ *
+ * This clears all crypto capabilities in @parent that aren't set in @child.  If
+ * @child is NULL, then this clears all parent capabilities.
+ *
+ * Only use this when setting up the crypto profile for a layered device,
+ * before it has been exposed.
+ */
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+                                      const struct blk_crypto_profile *child)
+{
+       if (child) {
+               unsigned int i;
+
+               parent->max_dun_bytes_supported =
+                       min(parent->max_dun_bytes_supported,
+                           child->max_dun_bytes_supported);
+               for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++)
+                       parent->modes_supported[i] &= child->modes_supported[i];
+       } else {
+               parent->max_dun_bytes_supported = 0;
+               memset(parent->modes_supported, 0,
+                      sizeof(parent->modes_supported));
+       }
+}
+EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities);
+
+/**
+ * blk_crypto_has_capabilities() - Check whether @target supports at least all
+ *                                the crypto capabilities that @reference does.
+ * @target: the target profile
+ * @reference: the reference profile
+ *
+ * Return: %true if @target supports all the crypto capabilities of @reference.
+ */
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+                                const struct blk_crypto_profile *reference)
+{
+       int i;
+
+       if (!reference)
+               return true;
+
+       if (!target)
+               return false;
+
+       for (i = 0; i < ARRAY_SIZE(target->modes_supported); i++) {
+               if (reference->modes_supported[i] & ~target->modes_supported[i])
+                       return false;
+       }
+
+       if (reference->max_dun_bytes_supported >
+           target->max_dun_bytes_supported)
+               return false;
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities);
+
+/**
+ * blk_crypto_update_capabilities() - Update the capabilities of a crypto
+ *                                   profile to match those of another crypto
+ *                                   profile.
+ * @dst: The crypto profile whose capabilities to update.
+ * @src: The crypto profile whose capabilities this function will update @dst's
+ *      capabilities to.
+ *
+ * Blk-crypto requires that crypto capabilities that were
+ * advertised when a bio was created continue to be supported by the
+ * device until that bio is ended. This in turn means that a device cannot
+ * shrink its advertised crypto capabilities without any explicit
+ * synchronization with upper layers. So if there's no such explicit
+ * synchronization, @src must support all the crypto capabilities that
+ * @dst does (i.e. we need blk_crypto_has_capabilities(@src, @dst)).
+ *
+ * Note also that as long as the crypto capabilities are being expanded, the
+ * order of updates becoming visible is not important because it's alright
+ * for blk-crypto to see stale values - they only cause blk-crypto to
+ * believe that a crypto capability isn't supported when it actually is (which
+ * might result in blk-crypto-fallback being used if available, or the bio being
+ * failed).
+ */
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+                                   const struct blk_crypto_profile *src)
+{
+       memcpy(dst->modes_supported, src->modes_supported,
+              sizeof(dst->modes_supported));
+
+       dst->max_dun_bytes_supported = src->max_dun_bytes_supported;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities);
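Taken together, blk_crypto_intersect_capabilities(), blk_crypto_has_capabilities() and blk_crypto_update_capabilities() give stacked drivers a pattern along these lines. Device-mapper is the obvious consumer, but the helper below is only a sketch under that assumption, not the dm code.

/* Sketch: recompute a stacked device's capabilities from its members. */
static void foo_rebuild_stacked_caps(struct blk_crypto_profile *parent,
                                     struct request_queue *member_qs[],
                                     int nr_members)
{
        struct blk_crypto_profile tmp;
        int i;

        /* Start from "everything supported", then intersect with each member. */
        blk_crypto_profile_init(&tmp, 0);
        memset(tmp.modes_supported, 0xFF, sizeof(tmp.modes_supported));
        tmp.max_dun_bytes_supported = UINT_MAX;

        for (i = 0; i < nr_members; i++)
                blk_crypto_intersect_capabilities(&tmp,
                                                  member_qs[i]->crypto_profile);

        /*
         * Only publish the result if it does not shrink what upper layers may
         * already rely on (see blk_crypto_update_capabilities() above).
         */
        if (blk_crypto_has_capabilities(&tmp, parent))
                blk_crypto_update_capabilities(parent, &tmp);
}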
index 103c2e2..ec9efee 100644 (file)
@@ -11,7 +11,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 
@@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio)
 
 blk_status_t __blk_crypto_init_request(struct request *rq)
 {
-       return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
-                                       &rq->crypt_keyslot);
+       return blk_crypto_get_keyslot(rq->q->crypto_profile,
+                                     rq->crypt_ctx->bc_key,
+                                     &rq->crypt_keyslot);
 }
 
 /**
@@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq)
  */
 void __blk_crypto_free_request(struct request *rq)
 {
-       blk_ksm_put_slot(rq->crypt_keyslot);
+       blk_crypto_put_keyslot(rq->crypt_keyslot);
        mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
        blk_crypto_rq_set_defaults(rq);
 }
@@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
 {
        struct bio *bio = *bio_ptr;
        const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
+       struct blk_crypto_profile *profile;
 
        /* Error if bio has no data. */
        if (WARN_ON_ONCE(!bio_has_data(bio))) {
@@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
         * Success if device supports the encryption context, or if we succeeded
         * in falling back to the crypto API.
         */
-       if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm,
-                                        &bc_key->crypto_cfg))
+       profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
+       if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
                return true;
 
        if (blk_crypto_fallback_bio_prep(bio_ptr))
@@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
                                 const struct blk_crypto_config *cfg)
 {
        return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
-              blk_ksm_crypto_cfg_supported(q->ksm, cfg);
+              __blk_crypto_cfg_supported(q->crypto_profile, cfg);
 }
 
 /**
@@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
 int blk_crypto_start_using_key(const struct blk_crypto_key *key,
                               struct request_queue *q)
 {
-       if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+       if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
                return 0;
        return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
 }
@@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
  * evicted from any hardware that it might have been programmed into.  The key
  * must not be in use by any in-flight IO when this function is called.
  *
- * Return: 0 on success or if key is not present in the q's ksm, -err on error.
+ * Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
  */
 int blk_crypto_evict_key(struct request_queue *q,
                         const struct blk_crypto_key *key)
 {
-       if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
-               return blk_ksm_evict_key(q->ksm, key);
+       if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
+               return __blk_crypto_evict_key(q->crypto_profile, key);
 
        /*
-        * If the request queue's associated inline encryption hardware didn't
-        * have support for the key, then the key might have been programmed
-        * into the fallback keyslot manager, so try to evict from there.
+        * If the request_queue didn't support the key, then blk-crypto-fallback
+        * may have been used, so try to evict the key from blk-crypto-fallback.
         */
        return blk_crypto_fallback_evict_key(key);
 }
index d6cd501..1b8b47f 100644 (file)
@@ -65,13 +65,19 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
 
 static bool blk_rq_is_poll(struct request *rq)
 {
-       return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL;
+       if (!rq->mq_hctx)
+               return false;
+       if (rq->mq_hctx->type != HCTX_TYPE_POLL)
+               return false;
+       if (WARN_ON_ONCE(!rq->bio))
+               return false;
+       return true;
 }
 
 static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
 {
        do {
-               blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), true);
+               bio_poll(rq->bio, NULL, 0);
                cond_resched();
        } while (!completion_done(wait));
 }
index 4201728..8e364bd 100644 (file)
@@ -379,7 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
  * @rq is being submitted.  Analyze what needs to be done and put it on the
  * right queue.
  */
-void blk_insert_flush(struct request *rq)
+bool blk_insert_flush(struct request *rq)
 {
        struct request_queue *q = rq->q;
        unsigned long fflags = q->queue_flags;  /* may change, cache */
@@ -409,7 +409,7 @@ void blk_insert_flush(struct request *rq)
         */
        if (!policy) {
                blk_mq_end_request(rq, 0);
-               return;
+               return true;
        }
 
        BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
@@ -420,10 +420,8 @@ void blk_insert_flush(struct request *rq)
         * for normal execution.
         */
        if ((policy & REQ_FSEQ_DATA) &&
-           !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-               blk_mq_request_bypass_insert(rq, false, false);
-               return;
-       }
+           !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))
+               return false;
 
        /*
         * @rq should go through flush machinery.  Mark it part of flush
@@ -439,6 +437,8 @@ void blk_insert_flush(struct request *rq)
        spin_lock_irq(&fq->mq_flush_lock);
        blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
        spin_unlock_irq(&fq->mq_flush_lock);
+
+       return true;
 }
 
 /**
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
new file mode 100644 (file)
index 0000000..c246c42
--- /dev/null
@@ -0,0 +1,348 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Block device concurrent positioning ranges.
+ *
+ *  Copyright (C) 2021 Western Digital Corporation or its Affiliates.
+ */
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#include "blk.h"
+
+static ssize_t
+blk_ia_range_sector_show(struct blk_independent_access_range *iar,
+                        char *buf)
+{
+       return sprintf(buf, "%llu\n", iar->sector);
+}
+
+static ssize_t
+blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar,
+                            char *buf)
+{
+       return sprintf(buf, "%llu\n", iar->nr_sectors);
+}
+
+struct blk_ia_range_sysfs_entry {
+       struct attribute attr;
+       ssize_t (*show)(struct blk_independent_access_range *iar, char *buf);
+};
+
+static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
+       .attr = { .name = "sector", .mode = 0444 },
+       .show = blk_ia_range_sector_show,
+};
+
+static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
+       .attr = { .name = "nr_sectors", .mode = 0444 },
+       .show = blk_ia_range_nr_sectors_show,
+};
+
+static struct attribute *blk_ia_range_attrs[] = {
+       &blk_ia_range_sector_entry.attr,
+       &blk_ia_range_nr_sectors_entry.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(blk_ia_range);
+
+static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj,
+                                     struct attribute *attr, char *buf)
+{
+       struct blk_ia_range_sysfs_entry *entry =
+               container_of(attr, struct blk_ia_range_sysfs_entry, attr);
+       struct blk_independent_access_range *iar =
+               container_of(kobj, struct blk_independent_access_range, kobj);
+       ssize_t ret;
+
+       mutex_lock(&iar->queue->sysfs_lock);
+       ret = entry->show(iar, buf);
+       mutex_unlock(&iar->queue->sysfs_lock);
+
+       return ret;
+}
+
+static const struct sysfs_ops blk_ia_range_sysfs_ops = {
+       .show   = blk_ia_range_sysfs_show,
+};
+
+/*
+ * Independent access range entries are not freed individually, but altogether
+ * with struct blk_independent_access_ranges and its array of ranges. Since
+ * kobject_add() takes a reference on the parent kobject contained in
+ * struct blk_independent_access_ranges, the array of independent access range
+ * entries cannot be freed until kobject_del() is called for all entries.
+ * So we do not need to do anything here, but still need this no-op release
+ * operation to avoid complaints from the kobject code.
+ */
+static void blk_ia_range_sysfs_nop_release(struct kobject *kobj)
+{
+}
+
+static struct kobj_type blk_ia_range_ktype = {
+       .sysfs_ops      = &blk_ia_range_sysfs_ops,
+       .default_groups = blk_ia_range_groups,
+       .release        = blk_ia_range_sysfs_nop_release,
+};
+
+/*
+ * This will be executed only after all independent access range entries are
+ * removed with kobject_del(), at which point, it is safe to free everything,
+ * including the array of ranges.
+ */
+static void blk_ia_ranges_sysfs_release(struct kobject *kobj)
+{
+       struct blk_independent_access_ranges *iars =
+               container_of(kobj, struct blk_independent_access_ranges, kobj);
+
+       kfree(iars);
+}
+
+static struct kobj_type blk_ia_ranges_ktype = {
+       .release        = blk_ia_ranges_sysfs_release,
+};
+
+/**
+ * disk_register_independent_access_ranges - register with sysfs a set of
+ *                                            independent access ranges
+ * @disk:      Target disk
+ * @new_iars:  New set of independent access ranges
+ *
+ * Register with sysfs a set of independent access ranges for @disk.
+ * If @new_iars is not NULL, this set of ranges is registered and the old set
+ * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is
+ * registered if it is not already.
+ */
+int disk_register_independent_access_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *new_iars)
+{
+       struct request_queue *q = disk->queue;
+       struct blk_independent_access_ranges *iars;
+       int i, ret;
+
+       lockdep_assert_held(&q->sysfs_dir_lock);
+       lockdep_assert_held(&q->sysfs_lock);
+
+       /* If a new range set is specified, unregister the old one */
+       if (new_iars) {
+               if (q->ia_ranges)
+                       disk_unregister_independent_access_ranges(disk);
+               q->ia_ranges = new_iars;
+       }
+
+       iars = q->ia_ranges;
+       if (!iars)
+               return 0;
+
+       /*
+        * At this point, iars is the new set of sector access ranges that needs
+        * to be registered with sysfs.
+        */
+       WARN_ON(iars->sysfs_registered);
+       ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
+                                  &q->kobj, "%s", "independent_access_ranges");
+       if (ret) {
+               q->ia_ranges = NULL;
+               kfree(iars);
+               return ret;
+       }
+
+       for (i = 0; i < iars->nr_ia_ranges; i++) {
+               iars->ia_range[i].queue = q;
+               ret = kobject_init_and_add(&iars->ia_range[i].kobj,
+                                          &blk_ia_range_ktype, &iars->kobj,
+                                          "%d", i);
+               if (ret) {
+                       while (--i >= 0)
+                               kobject_del(&iars->ia_range[i].kobj);
+                       kobject_del(&iars->kobj);
+                       kobject_put(&iars->kobj);
+                       return ret;
+               }
+       }
+
+       iars->sysfs_registered = true;
+
+       return 0;
+}
+
+void disk_unregister_independent_access_ranges(struct gendisk *disk)
+{
+       struct request_queue *q = disk->queue;
+       struct blk_independent_access_ranges *iars = q->ia_ranges;
+       int i;
+
+       lockdep_assert_held(&q->sysfs_dir_lock);
+       lockdep_assert_held(&q->sysfs_lock);
+
+       if (!iars)
+               return;
+
+       if (iars->sysfs_registered) {
+               for (i = 0; i < iars->nr_ia_ranges; i++)
+                       kobject_del(&iars->ia_range[i].kobj);
+               kobject_del(&iars->kobj);
+               kobject_put(&iars->kobj);
+       } else {
+               kfree(iars);
+       }
+
+       q->ia_ranges = NULL;
+}
+
+static struct blk_independent_access_range *
+disk_find_ia_range(struct blk_independent_access_ranges *iars,
+                 sector_t sector)
+{
+       struct blk_independent_access_range *iar;
+       int i;
+
+       for (i = 0; i < iars->nr_ia_ranges; i++) {
+               iar = &iars->ia_range[i];
+               if (sector >= iar->sector &&
+                   sector < iar->sector + iar->nr_sectors)
+                       return iar;
+       }
+
+       return NULL;
+}
+
+static bool disk_check_ia_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *iars)
+{
+       struct blk_independent_access_range *iar, *tmp;
+       sector_t capacity = get_capacity(disk);
+       sector_t sector = 0;
+       int i;
+
+       /*
+        * While sorting the ranges in increasing LBA order, check that the
+        * ranges do not overlap, that there are no sector holes and that all
+        * sectors belong to one range.
+        */
+       for (i = 0; i < iars->nr_ia_ranges; i++) {
+               tmp = disk_find_ia_range(iars, sector);
+               if (!tmp || tmp->sector != sector) {
+                       pr_warn("Invalid non-contiguous independent access ranges\n");
+                       return false;
+               }
+
+               iar = &iars->ia_range[i];
+               if (tmp != iar) {
+                       swap(iar->sector, tmp->sector);
+                       swap(iar->nr_sectors, tmp->nr_sectors);
+               }
+
+               sector += iar->nr_sectors;
+       }
+
+       if (sector != capacity) {
+               pr_warn("Independent access ranges do not match disk capacity\n");
+               return false;
+       }
+
+       return true;
+}
+
+static bool disk_ia_ranges_changed(struct gendisk *disk,
+                                  struct blk_independent_access_ranges *new)
+{
+       struct blk_independent_access_ranges *old = disk->queue->ia_ranges;
+       int i;
+
+       if (!old)
+               return true;
+
+       if (old->nr_ia_ranges != new->nr_ia_ranges)
+               return true;
+
+       for (i = 0; i < old->nr_ia_ranges; i++) {
+               if (new->ia_range[i].sector != old->ia_range[i].sector ||
+                   new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors)
+                       return true;
+       }
+
+       return false;
+}
+
+/**
+ * disk_alloc_independent_access_ranges - Allocate an independent access ranges
+ *                                        data structure
+ * @disk:              target disk
+ * @nr_ia_ranges:      Number of independent access ranges
+ *
+ * Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges
+ * access range descriptors.
+ */
+struct blk_independent_access_ranges *
+disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges)
+{
+       struct blk_independent_access_ranges *iars;
+
+       iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges),
+                           GFP_KERNEL, disk->queue->node);
+       if (iars)
+               iars->nr_ia_ranges = nr_ia_ranges;
+       return iars;
+}
+EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges);
+
+/**
+ * disk_set_independent_access_ranges - Set a disk independent access ranges
+ * @disk:      target disk
+ * @iars:      independent access ranges structure
+ *
+ * Set the independent access ranges information of the request queue
+ * of @disk to @iars. If @iars is NULL, the independent access ranges
+ * structure already set is cleared. If there are no differences between
+ * @iars and the independent access ranges structure already set, @iars
+ * is freed.
+ */
+void disk_set_independent_access_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *iars)
+{
+       struct request_queue *q = disk->queue;
+
+       if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) {
+               kfree(iars);
+               iars = NULL;
+       }
+
+       mutex_lock(&q->sysfs_dir_lock);
+       mutex_lock(&q->sysfs_lock);
+
+       if (iars) {
+               if (!disk_check_ia_ranges(disk, iars)) {
+                       kfree(iars);
+                       iars = NULL;
+                       goto reg;
+               }
+
+               if (!disk_ia_ranges_changed(disk, iars)) {
+                       kfree(iars);
+                       goto unlock;
+               }
+       }
+
+       /*
+        * This may be called for a registered queue. E.g. during a device
+        * revalidation. If that is the case, we need to unregister the old
+        * set of independent access ranges and register the new set. If the
+        * queue is not registered, registration of the device request queue
+        * will register the independent access ranges, so only swap in the
+        * new set and free the old one.
+        */
+reg:
+       if (blk_queue_registered(q)) {
+               disk_register_independent_access_ranges(disk, iars);
+       } else {
+               swap(q->ia_ranges, iars);
+               kfree(iars);
+       }
+
+unlock:
+       mutex_unlock(&q->sysfs_lock);
+       mutex_unlock(&q->sysfs_dir_lock);
+}
+EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges);
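For reference, a driver-side sketch of the allocate-then-set flow described by the kernel-doc above. The even two-range split is an arbitrary example; a real driver would derive the layout from what the device reports, and the ranges must cover the whole capacity without holes to pass disk_check_ia_ranges().

/* Sketch: advertise two independent access ranges covering the whole disk. */
static void foo_set_ia_ranges(struct gendisk *disk)
{
        sector_t half = get_capacity(disk) / 2;
        struct blk_independent_access_ranges *iars;

        iars = disk_alloc_independent_access_ranges(disk, 2);
        if (!iars)
                return;

        iars->ia_range[0].sector = 0;
        iars->ia_range[0].nr_sectors = half;
        iars->ia_range[1].sector = half;
        iars->ia_range[1].nr_sectors = get_capacity(disk) - half;

        /* Ownership of iars passes to the block layer from here on. */
        disk_set_independent_access_ranges(disk, iars);
}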
index 16d5d53..d670d54 100644 (file)
@@ -6,7 +6,7 @@
  * Written by: Martin K. Petersen <martin.petersen@oracle.com>
  */
 
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/backing-dev.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
@@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
        blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-       if (disk->queue->ksm) {
+       if (disk->queue->crypto_profile) {
                pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
-               blk_ksm_unregister(disk->queue);
+               blk_crypto_unregister(disk->queue);
        }
 #endif
 }
index b3880e4..a5b37cc 100644 (file)
@@ -3165,12 +3165,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
 
-       ioc = q_to_ioc(bdev->bd_disk->queue);
+       ioc = q_to_ioc(bdev_get_queue(bdev));
        if (!ioc) {
-               ret = blk_iocost_init(bdev->bd_disk->queue);
+               ret = blk_iocost_init(bdev_get_queue(bdev));
                if (ret)
                        goto err;
-               ioc = q_to_ioc(bdev->bd_disk->queue);
+               ioc = q_to_ioc(bdev_get_queue(bdev));
        }
 
        spin_lock_irq(&ioc->lock);
@@ -3332,12 +3332,12 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
 
-       ioc = q_to_ioc(bdev->bd_disk->queue);
+       ioc = q_to_ioc(bdev_get_queue(bdev));
        if (!ioc) {
-               ret = blk_iocost_init(bdev->bd_disk->queue);
+               ret = blk_iocost_init(bdev_get_queue(bdev));
                if (ret)
                        goto err;
-               ioc = q_to_ioc(bdev->bd_disk->queue);
+               ioc = q_to_ioc(bdev_get_queue(bdev));
        }
 
        spin_lock_irq(&ioc->lock);
index c0545f9..6593c71 100644 (file)
@@ -74,6 +74,7 @@
 #include <linux/sched/signal.h>
 #include <trace/events/block.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
 #include "blk-rq-qos.h"
 #include "blk-stat.h"
 #include "blk.h"
index 7a5c81c..df69f4b 100644 (file)
@@ -6,12 +6,45 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/scatterlist.h>
 
 #include <trace/events/block.h>
 
 #include "blk.h"
 #include "blk-rq-qos.h"
+#include "blk-throttle.h"
+
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+       *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+       struct bvec_iter iter = bio->bi_iter;
+       int idx;
+
+       bio_get_first_bvec(bio, bv);
+       if (bv->bv_len == bio->bi_iter.bi_size)
+               return;         /* this bio only has a single bvec */
+
+       bio_advance_iter(bio, &iter, iter.bi_size);
+
+       if (!iter.bi_bvec_done)
+               idx = iter.bi_idx - 1;
+       else    /* in the middle of bvec */
+               idx = iter.bi_idx;
+
+       *bv = bio->bi_io_vec[idx];
+
+       /*
+        * iter.bi_bvec_done records actual length of the last bvec
+        * if this bio ends in the middle of one io vector
+        */
+       if (iter.bi_bvec_done)
+               bv->bv_len = iter.bi_bvec_done;
+}
 
 static inline bool bio_will_gap(struct request_queue *q,
                struct request *prev_rq, struct bio *prev, struct bio *next)
@@ -285,13 +318,13 @@ split:
         * iopoll in direct IO routine. Given performance gain of iopoll for
 * big IO can be trivial, disable iopoll when split needed.
         */
-       bio_clear_hipri(bio);
-
+       bio_clear_polled(bio);
        return bio_split(bio, sectors, GFP_NOIO, bs);
 }
 
 /**
  * __blk_queue_split - split a bio and submit the second half
+ * @q:       [in] request_queue new bio is being queued at
  * @bio:     [in, out] bio to be split
  * @nr_segs: [out] number of segments in the first bio
  *
@@ -302,9 +335,9 @@ split:
  * of the caller to ensure that q->bio_split is only released after processing
  * of the split bio has finished.
  */
-void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+                      unsigned int *nr_segs)
 {
-       struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
        struct bio *split = NULL;
 
        switch (bio_op(*bio)) {
@@ -321,21 +354,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
                                nr_segs);
                break;
        default:
-               /*
-                * All drivers must accept single-segments bios that are <=
-                * PAGE_SIZE.  This is a quick and dirty check that relies on
-                * the fact that bi_io_vec[0] is always valid if a bio has data.
-                * The check might lead to occasional false negatives when bios
-                * are cloned, but compared to the performance impact of cloned
-                * bios themselves the loop below doesn't matter anyway.
-                */
-               if (!q->limits.chunk_sectors &&
-                   (*bio)->bi_vcnt == 1 &&
-                   ((*bio)->bi_io_vec[0].bv_len +
-                    (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
-                       *nr_segs = 1;
-                       break;
-               }
                split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
                break;
        }
@@ -365,9 +383,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
  */
 void blk_queue_split(struct bio **bio)
 {
+       struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
        unsigned int nr_segs;
 
-       __blk_queue_split(bio, &nr_segs);
+       if (blk_may_split(q, *bio))
+               __blk_queue_split(q, bio, &nr_segs);
 }
 EXPORT_SYMBOL(blk_queue_split);
 
@@ -558,6 +578,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
        return queue_max_segments(rq->q);
 }
 
+static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
+                                                 sector_t offset)
+{
+       struct request_queue *q = rq->q;
+
+       if (blk_rq_is_passthrough(rq))
+               return q->limits.max_hw_sectors;
+
+       if (!q->limits.chunk_sectors ||
+           req_op(rq) == REQ_OP_DISCARD ||
+           req_op(rq) == REQ_OP_SECURE_ERASE)
+               return blk_queue_get_max_sectors(q, req_op(rq));
+
+       return min(blk_max_size_offset(q, offset, 0),
+                       blk_queue_get_max_sectors(q, req_op(rq)));
+}
+
 static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
                unsigned int nr_phys_segs)
 {
@@ -718,6 +755,13 @@ static enum elv_merge blk_try_req_merge(struct request *req,
        return ELEVATOR_NO_MERGE;
 }
 
+static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
+{
+       if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
+               return true;
+       return false;
+}
+
 /*
  * For non-mq, this has to be called with the request spinlock acquired.
  * For mq with scheduling, the appropriate queue wide lock should be held.
@@ -1023,12 +1067,11 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
  * @q: request_queue new bio is being queued at
  * @bio: new bio being queued
  * @nr_segs: number of segments in @bio
- * @same_queue_rq: pointer to &struct request that gets filled in when
- * another request associated with @q is found on the plug list
- * (optional, may be %NULL)
+ * @same_queue_rq: output value, will be true if there's an existing request
+ * from the passed in @q already in the plug list
  *
- * Determine whether @bio being queued on @q can be merged with a request
- * on %current's plugged list.  Returns %true if merge was successful,
+ * Determine whether @bio being queued on @q can be merged with the previous
+ * request on %current's plugged list.  Returns %true if merge was successful,
  * otherwise %false.
  *
  * Plugging coalesces IOs from the same issuer for the same purpose without
@@ -1041,36 +1084,26 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
  * Caller must ensure !blk_queue_nomerges(q) beforehand.
  */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-               unsigned int nr_segs, struct request **same_queue_rq)
+               unsigned int nr_segs, bool *same_queue_rq)
 {
        struct blk_plug *plug;
        struct request *rq;
-       struct list_head *plug_list;
 
        plug = blk_mq_plug(q, bio);
-       if (!plug)
+       if (!plug || rq_list_empty(plug->mq_list))
                return false;
 
-       plug_list = &plug->mq_list;
-
-       list_for_each_entry_reverse(rq, plug_list, queuelist) {
-               if (rq->q == q && same_queue_rq) {
-                       /*
-                        * Only blk-mq multiple hardware queues case checks the
-                        * rq in the same queue, there should be only one such
-                        * rq in a queue
-                        **/
-                       *same_queue_rq = rq;
-               }
-
-               if (rq->q != q)
-                       continue;
-
-               if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
-                   BIO_MERGE_OK)
-                       return true;
+       /* check the previously added entry for a quick merge attempt */
+       rq = rq_list_peek(&plug->mq_list);
+       if (rq->q == q) {
+               /*
+                * Only the blk-mq multiple hardware queues case checks the
+                * rq in the same queue; there should be only one such rq in a queue.
+                */
+               *same_queue_rq = true;
        }
-
+       if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK)
+               return true;
        return false;
 }
 
index 3b38d15..f5076c1 100644 (file)
@@ -124,7 +124,6 @@ static const char *const blk_queue_flag_name[] = {
        QUEUE_FLAG_NAME(STATS),
        QUEUE_FLAG_NAME(POLL_STATS),
        QUEUE_FLAG_NAME(REGISTERED),
-       QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
        QUEUE_FLAG_NAME(QUIESCED),
        QUEUE_FLAG_NAME(PCI_P2PDMA),
        QUEUE_FLAG_NAME(ZONE_RESETALL),
@@ -287,7 +286,7 @@ static const char *const cmd_flag_name[] = {
        CMD_FLAG_NAME(BACKGROUND),
        CMD_FLAG_NAME(NOWAIT),
        CMD_FLAG_NAME(NOUNMAP),
-       CMD_FLAG_NAME(HIPRI),
+       CMD_FLAG_NAME(POLLED),
 };
 #undef CMD_FLAG_NAME
 
@@ -453,11 +452,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
                   atomic_read(&tags->active_queues));
 
        seq_puts(m, "\nbitmap_tags:\n");
-       sbitmap_queue_show(tags->bitmap_tags, m);
+       sbitmap_queue_show(&tags->bitmap_tags, m);
 
        if (tags->nr_reserved_tags) {
                seq_puts(m, "\nbreserved_tags:\n");
-               sbitmap_queue_show(tags->breserved_tags, m);
+               sbitmap_queue_show(&tags->breserved_tags, m);
        }
 }
 
@@ -488,7 +487,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
        if (res)
                goto out;
        if (hctx->tags)
-               sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
+               sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
        mutex_unlock(&q->sysfs_lock);
 
 out:
@@ -522,77 +521,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
        if (res)
                goto out;
        if (hctx->sched_tags)
-               sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
+               sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
        mutex_unlock(&q->sysfs_lock);
 
 out:
        return res;
 }
 
-static int hctx_io_poll_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-
-       seq_printf(m, "considered=%lu\n", hctx->poll_considered);
-       seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
-       seq_printf(m, "success=%lu\n", hctx->poll_success);
-       return 0;
-}
-
-static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-
-       hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
-       return count;
-}
-
-static int hctx_dispatched_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-       int i;
-
-       seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
-
-       for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
-               unsigned int d = 1U << (i - 1);
-
-               seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
-       }
-
-       seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
-       return 0;
-}
-
-static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
-                                    size_t count, loff_t *ppos)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-       int i;
-
-       for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
-               hctx->dispatched[i] = 0;
-       return count;
-}
-
-static int hctx_queued_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-
-       seq_printf(m, "%lu\n", hctx->queued);
-       return 0;
-}
-
-static ssize_t hctx_queued_write(void *data, const char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       struct blk_mq_hw_ctx *hctx = data;
-
-       hctx->queued = 0;
-       return count;
-}
-
 static int hctx_run_show(void *data, struct seq_file *m)
 {
        struct blk_mq_hw_ctx *hctx = data;
@@ -614,7 +549,7 @@ static int hctx_active_show(void *data, struct seq_file *m)
 {
        struct blk_mq_hw_ctx *hctx = data;
 
-       seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
+       seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
        return 0;
 }
 
@@ -663,57 +598,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
 CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
 CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
 
-static int ctx_dispatched_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
-       return 0;
-}
-
-static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
-                                   size_t count, loff_t *ppos)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
-       return count;
-}
-
-static int ctx_merged_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       seq_printf(m, "%lu\n", ctx->rq_merged);
-       return 0;
-}
-
-static ssize_t ctx_merged_write(void *data, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       ctx->rq_merged = 0;
-       return count;
-}
-
-static int ctx_completed_show(void *data, struct seq_file *m)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
-       return 0;
-}
-
-static ssize_t ctx_completed_write(void *data, const char __user *buf,
-                                  size_t count, loff_t *ppos)
-{
-       struct blk_mq_ctx *ctx = data;
-
-       ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
-       return count;
-}
-
 static int blk_mq_debugfs_show(struct seq_file *m, void *v)
 {
        const struct blk_mq_debugfs_attr *attr = m->private;
@@ -789,9 +673,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
        {"tags_bitmap", 0400, hctx_tags_bitmap_show},
        {"sched_tags", 0400, hctx_sched_tags_show},
        {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
-       {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
-       {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
-       {"queued", 0600, hctx_queued_show, hctx_queued_write},
        {"run", 0600, hctx_run_show, hctx_run_write},
        {"active", 0400, hctx_active_show},
        {"dispatch_busy", 0400, hctx_dispatch_busy_show},
@@ -803,9 +684,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
        {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
        {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
        {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
-       {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
-       {"merged", 0600, ctx_merged_show, ctx_merged_write},
-       {"completed", 0600, ctx_completed_show, ctx_completed_write},
        {},
 };
 
index 0f006ca..c62b966 100644 (file)
@@ -57,10 +57,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
 
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
-       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-               return;
        clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 
        /*
@@ -363,7 +361,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
        }
 }
 
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
 {
        struct elevator_queue *e = q->elevator;
@@ -389,13 +387,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
         * potentially merge with. Currently includes a hand-wavy stop
         * count of 8, to not spend too much time checking for merges.
         */
-       if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
-               ctx->rq_merged++;
+       if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
                ret = true;
-       }
 
        spin_unlock(&ctx->lock);
-
        return ret;
 }
 
@@ -515,83 +510,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
        percpu_ref_put(&q->q_usage_counter);
 }
 
-static int blk_mq_sched_alloc_tags(struct request_queue *q,
-                                  struct blk_mq_hw_ctx *hctx,
-                                  unsigned int hctx_idx)
+static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
+                                         struct blk_mq_hw_ctx *hctx,
+                                         unsigned int hctx_idx)
 {
-       struct blk_mq_tag_set *set = q->tag_set;
-       int ret;
+       if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+               hctx->sched_tags = q->sched_shared_tags;
+               return 0;
+       }
+
+       hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
+                                                   q->nr_requests);
 
-       hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
-                                              set->reserved_tags, set->flags);
        if (!hctx->sched_tags)
                return -ENOMEM;
+       return 0;
+}
 
-       ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
-       if (ret) {
-               blk_mq_free_rq_map(hctx->sched_tags, set->flags);
-               hctx->sched_tags = NULL;
-       }
-
-       return ret;
+static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
+{
+       blk_mq_free_rq_map(queue->sched_shared_tags);
+       queue->sched_shared_tags = NULL;
 }
 
 /* called in queue's release handler, tagset has gone away */
-static void blk_mq_sched_tags_teardown(struct request_queue *q)
+static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
 {
        struct blk_mq_hw_ctx *hctx;
        int i;
 
        queue_for_each_hw_ctx(q, hctx, i) {
                if (hctx->sched_tags) {
-                       blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
+                       if (!blk_mq_is_shared_tags(flags))
+                               blk_mq_free_rq_map(hctx->sched_tags);
                        hctx->sched_tags = NULL;
                }
        }
+
+       if (blk_mq_is_shared_tags(flags))
+               blk_mq_exit_sched_shared_tags(q);
 }
 
-static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
+static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
 {
        struct blk_mq_tag_set *set = queue->tag_set;
-       int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
-       struct blk_mq_hw_ctx *hctx;
-       int ret, i;
 
        /*
         * Set initial depth at max so that we don't need to reallocate for
         * updating nr_requests.
         */
-       ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
-                                 &queue->sched_breserved_tags,
-                                 MAX_SCHED_RQ, set->reserved_tags,
-                                 set->numa_node, alloc_policy);
-       if (ret)
-               return ret;
-
-       queue_for_each_hw_ctx(queue, hctx, i) {
-               hctx->sched_tags->bitmap_tags =
-                                       &queue->sched_bitmap_tags;
-               hctx->sched_tags->breserved_tags =
-                                       &queue->sched_breserved_tags;
-       }
+       queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
+                                               BLK_MQ_NO_HCTX_IDX,
+                                               MAX_SCHED_RQ);
+       if (!queue->sched_shared_tags)
+               return -ENOMEM;
 
-       sbitmap_queue_resize(&queue->sched_bitmap_tags,
-                            queue->nr_requests - set->reserved_tags);
+       blk_mq_tag_update_sched_shared_tags(queue);
 
        return 0;
 }
 
-static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
-{
-       sbitmap_queue_free(&queue->sched_bitmap_tags);
-       sbitmap_queue_free(&queue->sched_breserved_tags);
-}
-
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 {
+       unsigned int i, flags = q->tag_set->flags;
        struct blk_mq_hw_ctx *hctx;
        struct elevator_queue *eq;
-       unsigned int i;
        int ret;
 
        if (!e) {
@@ -606,23 +589,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
         * Additionally, this is a per-hw queue depth.
         */
        q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
-                                  BLKDEV_MAX_RQ);
+                                  BLKDEV_DEFAULT_RQ);
 
-       queue_for_each_hw_ctx(q, hctx, i) {
-               ret = blk_mq_sched_alloc_tags(q, hctx, i);
+       if (blk_mq_is_shared_tags(flags)) {
+               ret = blk_mq_init_sched_shared_tags(q);
                if (ret)
-                       goto err_free_tags;
+                       return ret;
        }
 
-       if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
-               ret = blk_mq_init_sched_shared_sbitmap(q);
+       queue_for_each_hw_ctx(q, hctx, i) {
+               ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
                if (ret)
-                       goto err_free_tags;
+                       goto err_free_map_and_rqs;
        }
 
        ret = e->ops.init_sched(q, e);
        if (ret)
-               goto err_free_sbitmap;
+               goto err_free_map_and_rqs;
 
        blk_mq_debugfs_register_sched(q);
 
@@ -631,7 +614,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
                        ret = e->ops.init_hctx(hctx, i);
                        if (ret) {
                                eq = q->elevator;
-                               blk_mq_sched_free_requests(q);
+                               blk_mq_sched_free_rqs(q);
                                blk_mq_exit_sched(q, eq);
                                kobject_put(&eq->kobj);
                                return ret;
@@ -642,12 +625,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 
        return 0;
 
-err_free_sbitmap:
-       if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
-               blk_mq_exit_sched_shared_sbitmap(q);
-err_free_tags:
-       blk_mq_sched_free_requests(q);
-       blk_mq_sched_tags_teardown(q);
+err_free_map_and_rqs:
+       blk_mq_sched_free_rqs(q);
+       blk_mq_sched_tags_teardown(q, flags);
+
        q->elevator = NULL;
        return ret;
 }
@@ -656,14 +637,20 @@ err_free_tags:
  * called in either blk_queue_cleanup or elevator_switch, tagset
  * is required for freeing requests
  */
-void blk_mq_sched_free_requests(struct request_queue *q)
+void blk_mq_sched_free_rqs(struct request_queue *q)
 {
        struct blk_mq_hw_ctx *hctx;
        int i;
 
-       queue_for_each_hw_ctx(q, hctx, i) {
-               if (hctx->sched_tags)
-                       blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+       if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+               blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
+                               BLK_MQ_NO_HCTX_IDX);
+       } else {
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (hctx->sched_tags)
+                               blk_mq_free_rqs(q->tag_set,
+                                               hctx->sched_tags, i);
+               }
        }
 }
 
@@ -684,8 +671,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
        blk_mq_debugfs_unregister_sched(q);
        if (e->type->ops.exit_sched)
                e->type->ops.exit_sched(e);
-       blk_mq_sched_tags_teardown(q);
-       if (blk_mq_is_sbitmap_shared(flags))
-               blk_mq_exit_sched_shared_sbitmap(q);
+       blk_mq_sched_tags_teardown(q, flags);
        q->elevator = NULL;
 }
index 5246ae0..25d1034 100644 (file)
@@ -2,21 +2,22 @@
 #ifndef BLK_MQ_SCHED_H
 #define BLK_MQ_SCHED_H
 
+#include "elevator.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
+#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
 
 void blk_mq_sched_assign_ioc(struct request *rq);
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs, struct request **merged_request);
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
                                   struct list_head *free);
 void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async);
@@ -28,45 +29,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
-void blk_mq_sched_free_requests(struct request_queue *q);
+void blk_mq_sched_free_rqs(struct request_queue *q);
 
-static inline bool
-blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
-               unsigned int nr_segs)
+static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_queue_nomerges(q) || !bio_mergeable(bio))
-               return false;
+       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+               __blk_mq_sched_restart(hctx);
+}
 
-       return __blk_mq_sched_bio_merge(q, bio, nr_segs);
+static inline bool bio_mergeable(struct bio *bio)
+{
+       return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
 }
 
 static inline bool
 blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
                         struct bio *bio)
 {
-       struct elevator_queue *e = q->elevator;
-
-       if (e && e->type->ops.allow_merge)
-               return e->type->ops.allow_merge(q, rq, bio);
+       if (rq->rq_flags & RQF_ELV) {
+               struct elevator_queue *e = q->elevator;
 
+               if (e->type->ops.allow_merge)
+                       return e->type->ops.allow_merge(q, rq, bio);
+       }
        return true;
 }
 
 static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
 {
-       struct elevator_queue *e = rq->q->elevator;
+       if (rq->rq_flags & RQF_ELV) {
+               struct elevator_queue *e = rq->q->elevator;
 
-       if (e && e->type->ops.completed_request)
-               e->type->ops.completed_request(rq, now);
+               if (e->type->ops.completed_request)
+                       e->type->ops.completed_request(rq, now);
+       }
 }
 
 static inline void blk_mq_sched_requeue_request(struct request *rq)
 {
-       struct request_queue *q = rq->q;
-       struct elevator_queue *e = q->elevator;
+       if (rq->rq_flags & RQF_ELV) {
+               struct request_queue *q = rq->q;
+               struct elevator_queue *e = q->elevator;
 
-       if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request)
-               e->type->ops.requeue_request(rq);
+               if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
+                       e->type->ops.requeue_request(rq);
+       }
 }
 
 static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
index ff5caeb..995336a 100644 (file)
  */
 bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+       if (blk_mq_is_shared_tags(hctx->flags)) {
                struct request_queue *q = hctx->queue;
-               struct blk_mq_tag_set *set = q->tag_set;
 
                if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
                    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
-                       atomic_inc(&set->active_queues_shared_sbitmap);
+                       atomic_inc(&hctx->tags->active_queues);
        } else {
                if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
                    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
@@ -45,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
  */
 void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 {
-       sbitmap_queue_wake_all(tags->bitmap_tags);
+       sbitmap_queue_wake_all(&tags->bitmap_tags);
        if (include_reserve)
-               sbitmap_queue_wake_all(tags->breserved_tags);
+               sbitmap_queue_wake_all(&tags->breserved_tags);
 }
 
 /*
@@ -57,20 +56,20 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
        struct blk_mq_tags *tags = hctx->tags;
-       struct request_queue *q = hctx->queue;
-       struct blk_mq_tag_set *set = q->tag_set;
 
-       if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+       if (blk_mq_is_shared_tags(hctx->flags)) {
+               struct request_queue *q = hctx->queue;
+
                if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
                                        &q->queue_flags))
                        return;
-               atomic_dec(&set->active_queues_shared_sbitmap);
        } else {
                if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        return;
-               atomic_dec(&tags->active_queues);
        }
 
+       atomic_dec(&tags->active_queues);
+
        blk_mq_tag_wakeup_all(tags, false);
 }
 
@@ -87,6 +86,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                return __sbitmap_queue_get(bt);
 }
 
+unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
+                             unsigned int *offset)
+{
+       struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+       struct sbitmap_queue *bt = &tags->bitmap_tags;
+       unsigned long ret;
+
+       if (data->shallow_depth || data->flags & BLK_MQ_REQ_RESERVED ||
+           data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+               return 0;
+       ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
+       *offset += tags->nr_reserved_tags;
+       return ret;
+}
+
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -101,10 +115,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                        WARN_ON_ONCE(1);
                        return BLK_MQ_NO_TAG;
                }
-               bt = tags->breserved_tags;
+               bt = &tags->breserved_tags;
                tag_offset = 0;
        } else {
-               bt = tags->bitmap_tags;
+               bt = &tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
        }
 
@@ -150,9 +164,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                                                data->ctx);
                tags = blk_mq_tags_from_data(data);
                if (data->flags & BLK_MQ_REQ_RESERVED)
-                       bt = tags->breserved_tags;
+                       bt = &tags->breserved_tags;
                else
-                       bt = tags->bitmap_tags;
+                       bt = &tags->bitmap_tags;
 
                /*
                 * If destination hw queue is changed, fake wake up on
@@ -186,13 +200,19 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                const int real_tag = tag - tags->nr_reserved_tags;
 
                BUG_ON(real_tag >= tags->nr_tags);
-               sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
+               sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
        } else {
                BUG_ON(tag >= tags->nr_reserved_tags);
-               sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
+               sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
        }
 }
 
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
+{
+       sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
+                                       tag_array, nr_tags);
+}
+
 struct bt_iter_data {
        struct blk_mq_hw_ctx *hctx;
        busy_iter_fn *fn;
@@ -340,9 +360,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
        WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
 
        if (tags->nr_reserved_tags)
-               bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
+               bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
                                 flags | BT_TAG_ITER_RESERVED);
-       bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
+       bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
 }
 
 /**
@@ -379,9 +399,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv)
 {
-       int i;
+       unsigned int flags = tagset->flags;
+       int i, nr_tags;
+
+       nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
 
-       for (i = 0; i < tagset->nr_hw_queues; i++) {
+       for (i = 0; i < nr_tags; i++) {
                if (tagset->tags && tagset->tags[i])
                        __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
                                              BT_TAG_ITER_STARTED);
@@ -459,8 +482,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                        continue;
 
                if (tags->nr_reserved_tags)
-                       bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
-               bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
+                       bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
+               bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
        }
        blk_queue_exit(q);
 }
@@ -492,56 +515,10 @@ free_bitmap_tags:
        return -ENOMEM;
 }
 
-static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
-                                  int node, int alloc_policy)
-{
-       int ret;
-
-       ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
-                                 &tags->__breserved_tags,
-                                 tags->nr_tags, tags->nr_reserved_tags,
-                                 node, alloc_policy);
-       if (ret)
-               return ret;
-
-       tags->bitmap_tags = &tags->__bitmap_tags;
-       tags->breserved_tags = &tags->__breserved_tags;
-
-       return 0;
-}
-
-int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
-{
-       int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
-       int i, ret;
-
-       ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
-                                 set->queue_depth, set->reserved_tags,
-                                 set->numa_node, alloc_policy);
-       if (ret)
-               return ret;
-
-       for (i = 0; i < set->nr_hw_queues; i++) {
-               struct blk_mq_tags *tags = set->tags[i];
-
-               tags->bitmap_tags = &set->__bitmap_tags;
-               tags->breserved_tags = &set->__breserved_tags;
-       }
-
-       return 0;
-}
-
-void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
-{
-       sbitmap_queue_free(&set->__bitmap_tags);
-       sbitmap_queue_free(&set->__breserved_tags);
-}
-
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags,
-                                    int node, unsigned int flags)
+                                    int node, int alloc_policy)
 {
-       int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
        struct blk_mq_tags *tags;
 
        if (total_tags > BLK_MQ_TAG_MAX) {
@@ -557,22 +534,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
        tags->nr_reserved_tags = reserved_tags;
        spin_lock_init(&tags->lock);
 
-       if (blk_mq_is_sbitmap_shared(flags))
-               return tags;
-
-       if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
+       if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
+                               total_tags, reserved_tags, node,
+                               alloc_policy) < 0) {
                kfree(tags);
                return NULL;
        }
        return tags;
 }
 
-void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
-       if (!blk_mq_is_sbitmap_shared(flags)) {
-               sbitmap_queue_free(tags->bitmap_tags);
-               sbitmap_queue_free(tags->breserved_tags);
-       }
+       sbitmap_queue_free(&tags->bitmap_tags);
+       sbitmap_queue_free(&tags->breserved_tags);
        kfree(tags);
 }
 
@@ -592,7 +566,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
        if (tdepth > tags->nr_tags) {
                struct blk_mq_tag_set *set = hctx->queue->tag_set;
                struct blk_mq_tags *new;
-               bool ret;
 
                if (!can_grow)
                        return -EINVAL;
@@ -604,34 +577,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                if (tdepth > MAX_SCHED_RQ)
                        return -EINVAL;
 
-               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
-                               tags->nr_reserved_tags, set->flags);
+               /*
+                * Only the sbitmap needs resizing since we allocated the max
+                * initially.
+                */
+               if (blk_mq_is_shared_tags(set->flags))
+                       return 0;
+
+               new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
                if (!new)
                        return -ENOMEM;
-               ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
-               if (ret) {
-                       blk_mq_free_rq_map(new, set->flags);
-                       return -ENOMEM;
-               }
 
-               blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
-               blk_mq_free_rq_map(*tagsptr, set->flags);
+               blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
                *tagsptr = new;
        } else {
                /*
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
-               sbitmap_queue_resize(tags->bitmap_tags,
+               sbitmap_queue_resize(&tags->bitmap_tags,
                                tdepth - tags->nr_reserved_tags);
        }
 
        return 0;
 }
 
-void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
+void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
+{
+       struct blk_mq_tags *tags = set->shared_tags;
+
+       sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
+}
+
+void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
 {
-       sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
+       sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
+                            q->nr_requests - q->tag_set->reserved_tags);
 }
 
 /**
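For illustration: blk_mq_get_tags() above hands back a bitmask of allocated tags relative to an offset rather than a single tag. A minimal standalone sketch (plain userspace C, mask and offset values made up) of how a caller turns that mask into tag numbers:

#include <stdio.h>

int main(void)
{
        unsigned long tag_mask = 0x29;  /* bits 0, 3 and 5 set: three tags */
        unsigned int tag_offset = 4;    /* first non-reserved tag */
        int i;

        /* Walk the set bits, clearing each one as it is consumed. */
        for (i = 0; tag_mask; i++) {
                if (!(tag_mask & (1UL << i)))
                        continue;
                tag_mask &= ~(1UL << i);
                printf("allocated tag %u\n", tag_offset + i);
        }
        return 0;
}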
index 8ed55af..df787b5 100644 (file)
@@ -2,52 +2,30 @@
 #ifndef INT_BLK_MQ_TAG_H
 #define INT_BLK_MQ_TAG_H
 
-/*
- * Tag address space map.
- */
-struct blk_mq_tags {
-       unsigned int nr_tags;
-       unsigned int nr_reserved_tags;
-
-       atomic_t active_queues;
-
-       struct sbitmap_queue *bitmap_tags;
-       struct sbitmap_queue *breserved_tags;
-
-       struct sbitmap_queue __bitmap_tags;
-       struct sbitmap_queue __breserved_tags;
-
-       struct request **rqs;
-       struct request **static_rqs;
-       struct list_head page_list;
-
-       /*
-        * used to clear request reference in rqs[] before freeing one
-        * request pool
-        */
-       spinlock_t lock;
-};
+struct blk_mq_alloc_data;
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
                                        unsigned int reserved_tags,
-                                       int node, unsigned int flags);
-extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
+                                       int node, int alloc_policy);
+extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
                               struct sbitmap_queue *breserved_tags,
                               unsigned int queue_depth,
                               unsigned int reserved,
                               int node, int alloc_policy);
 
-extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
-extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
+unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
+                             unsigned int *offset);
 extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                           unsigned int tag);
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
 extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_tags **tags,
                                        unsigned int depth, bool can_grow);
-extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
+extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
                                             unsigned int size);
+extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
 
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
index bc02637..07eb141 100644 (file)
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/kmemleak.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/smp.h>
+#include <linux/interrupt.h>
 #include <linux/llist.h>
-#include <linux/list_sort.h>
 #include <linux/cpu.h>
 #include <linux/cache.h>
 #include <linux/sched/sysctl.h>
@@ -63,6 +64,32 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
        return bucket;
 }
 
+#define BLK_QC_T_SHIFT         16
+#define BLK_QC_T_INTERNAL      (1U << 31)
+
+static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
+               blk_qc_t qc)
+{
+       return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
+}
+
+static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
+               blk_qc_t qc)
+{
+       unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
+
+       if (qc & BLK_QC_T_INTERNAL)
+               return blk_mq_tag_to_rq(hctx->sched_tags, tag);
+       return blk_mq_tag_to_rq(hctx->tags, tag);
+}
+
+static inline blk_qc_t blk_rq_to_qc(struct request *rq)
+{
+       return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
+               (rq->tag != -1 ?
+                rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+}
+
 /*
  * Check if any of the ctx, dispatch list or elevator
  * have pending work in this hardware queue.
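The helpers above pack the polling cookie from the hardware queue number and the tag, with bit 31 marking a scheduler (internal) tag. A standalone sketch of the encode/decode round trip (userspace C, values made up):

#include <stdio.h>

#define BLK_QC_T_SHIFT          16
#define BLK_QC_T_INTERNAL       (1U << 31)

int main(void)
{
        unsigned int queue_num = 3, tag = 42;           /* made-up values */
        unsigned int qc = (queue_num << BLK_QC_T_SHIFT) |
                          tag | BLK_QC_T_INTERNAL;      /* scheduler tag */

        printf("cookie    = 0x%08x\n", qc);
        printf("hw queue  = %u\n", (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
        printf("tag       = %u\n", qc & ((1U << BLK_QC_T_SHIFT) - 1));
        printf("sched tag = %s\n", (qc & BLK_QC_T_INTERNAL) ? "yes" : "no");
        return 0;
}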
@@ -214,7 +241,12 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
  */
 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 {
-       blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->queue_lock, flags);
+       if (!q->quiesce_depth++)
+               blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+       spin_unlock_irqrestore(&q->queue_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
@@ -255,10 +287,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
  */
 void blk_mq_unquiesce_queue(struct request_queue *q)
 {
-       blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+       unsigned long flags;
+       bool run_queue = false;
+
+       spin_lock_irqsave(&q->queue_lock, flags);
+       if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
+               ;
+       } else if (!--q->quiesce_depth) {
+               blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+               run_queue = true;
+       }
+       spin_unlock_irqrestore(&q->queue_lock, flags);
 
        /* dispatch requests which are inserted during quiescing */
-       blk_mq_run_hw_queues(q, true);
+       if (run_queue)
+               blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
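With the quiesce_depth counter above, quiesce and unquiesce calls now nest: the queue is only marked unquiesced when the last caller drops the depth back to zero. A standalone sketch of that counting scheme (userspace C, locking omitted):

#include <assert.h>
#include <stdio.h>

static int quiesce_depth;
static int quiesced;

static void quiesce(void)
{
        if (!quiesce_depth++)
                quiesced = 1;           /* first caller quiesces */
}

static void unquiesce(void)
{
        assert(quiesce_depth > 0);
        if (!--quiesce_depth)
                quiesced = 0;           /* last caller resumes the queue */
}

int main(void)
{
        quiesce();
        quiesce();                      /* nested caller */
        unquiesce();
        printf("after first unquiesce:  quiesced=%d\n", quiesced);
        unquiesce();
        printf("after second unquiesce: quiesced=%d\n", quiesced);
        return 0;
}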
 
@@ -272,74 +315,67 @@ void blk_mq_wake_waiters(struct request_queue *q)
                        blk_mq_tag_wakeup_all(hctx->tags, true);
 }
 
-/*
- * Only need start/end time stamping if we have iostat or
- * blk stats enabled, or using an IO scheduler.
- */
-static inline bool blk_mq_need_time_stamp(struct request *rq)
-{
-       return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
-}
-
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-               unsigned int tag, u64 alloc_time_ns)
+               struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
 {
-       struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+       struct blk_mq_ctx *ctx = data->ctx;
+       struct blk_mq_hw_ctx *hctx = data->hctx;
+       struct request_queue *q = data->q;
        struct request *rq = tags->static_rqs[tag];
 
-       if (data->q->elevator) {
-               rq->tag = BLK_MQ_NO_TAG;
-               rq->internal_tag = tag;
-       } else {
+       rq->q = q;
+       rq->mq_ctx = ctx;
+       rq->mq_hctx = hctx;
+       rq->cmd_flags = data->cmd_flags;
+
+       if (data->flags & BLK_MQ_REQ_PM)
+               data->rq_flags |= RQF_PM;
+       if (blk_queue_io_stat(q))
+               data->rq_flags |= RQF_IO_STAT;
+       rq->rq_flags = data->rq_flags;
+
+       if (!(data->rq_flags & RQF_ELV)) {
                rq->tag = tag;
                rq->internal_tag = BLK_MQ_NO_TAG;
+       } else {
+               rq->tag = BLK_MQ_NO_TAG;
+               rq->internal_tag = tag;
        }
+       rq->timeout = 0;
 
-       /* csd/requeue_work/fifo_time is initialized before use */
-       rq->q = data->q;
-       rq->mq_ctx = data->ctx;
-       rq->mq_hctx = data->hctx;
-       rq->rq_flags = 0;
-       rq->cmd_flags = data->cmd_flags;
-       if (data->flags & BLK_MQ_REQ_PM)
-               rq->rq_flags |= RQF_PM;
-       if (blk_queue_io_stat(data->q))
-               rq->rq_flags |= RQF_IO_STAT;
-       INIT_LIST_HEAD(&rq->queuelist);
-       INIT_HLIST_NODE(&rq->hash);
-       RB_CLEAR_NODE(&rq->rb_node);
+       if (blk_mq_need_time_stamp(rq))
+               rq->start_time_ns = ktime_get_ns();
+       else
+               rq->start_time_ns = 0;
        rq->rq_disk = NULL;
        rq->part = NULL;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
        rq->alloc_time_ns = alloc_time_ns;
 #endif
-       if (blk_mq_need_time_stamp(rq))
-               rq->start_time_ns = ktime_get_ns();
-       else
-               rq->start_time_ns = 0;
        rq->io_start_time_ns = 0;
        rq->stats_sectors = 0;
        rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
 #endif
-       blk_crypto_rq_set_defaults(rq);
-       /* tag was already set */
-       WRITE_ONCE(rq->deadline, 0);
-
-       rq->timeout = 0;
-
        rq->end_io = NULL;
        rq->end_io_data = NULL;
 
-       data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
+       blk_crypto_rq_set_defaults(rq);
+       INIT_LIST_HEAD(&rq->queuelist);
+       /* tag was already set */
+       WRITE_ONCE(rq->deadline, 0);
        refcount_set(&rq->ref, 1);
 
-       if (!op_is_flush(data->cmd_flags)) {
+       if (rq->rq_flags & RQF_ELV) {
                struct elevator_queue *e = data->q->elevator;
 
                rq->elv.icq = NULL;
-               if (e && e->type->ops.prepare_request) {
+               INIT_HLIST_NODE(&rq->hash);
+               RB_CLEAR_NODE(&rq->rb_node);
+
+               if (!op_is_flush(data->cmd_flags) &&
+                   e->type->ops.prepare_request) {
                        if (e->type->icq_cache)
                                blk_mq_sched_assign_ioc(rq);
 
@@ -348,15 +384,44 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                }
        }
 
-       data->hctx->queued++;
        return rq;
 }
 
-static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
+static inline struct request *
+__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
+               u64 alloc_time_ns)
+{
+       unsigned int tag, tag_offset;
+       struct blk_mq_tags *tags;
+       struct request *rq;
+       unsigned long tag_mask;
+       int i, nr = 0;
+
+       tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
+       if (unlikely(!tag_mask))
+               return NULL;
+
+       tags = blk_mq_tags_from_data(data);
+       for (i = 0; tag_mask; i++) {
+               if (!(tag_mask & (1UL << i)))
+                       continue;
+               tag = tag_offset + i;
+               prefetch(tags->static_rqs[tag]);
+               tag_mask &= ~(1UL << i);
+               rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
+               rq_list_add(data->cached_rq, rq);
+               nr++;
+       }
+       data->nr_tags -= nr;
+
+       return rq_list_pop(data->cached_rq);
+}
+
+static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 {
        struct request_queue *q = data->q;
        struct elevator_queue *e = q->elevator;
        u64 alloc_time_ns = 0;
+       struct request *rq;
        unsigned int tag;
 
        /* alloc_time includes depth and tag waits */
@@ -386,6 +451,16 @@ retry:
                blk_mq_tag_busy(data->hctx);
 
        /*
+        * Try batched alloc if we want more than 1 tag.
+        */
+       if (data->nr_tags > 1) {
+               rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
+               if (rq)
+                       return rq;
+               data->nr_tags = 1;
+       }
+
+       /*
         * Waiting allocations only fail because of an inactive hctx.  In that
         * case just retry the hctx assignment and tag allocation as CPU hotplug
         * should have migrated us to an online CPU by now.
@@ -394,16 +469,18 @@ retry:
        if (tag == BLK_MQ_NO_TAG) {
                if (data->flags & BLK_MQ_REQ_NOWAIT)
                        return NULL;
-
                /*
-                * Give up the CPU and sleep for a random short time to ensure
-                * that thread using a realtime scheduling class are migrated
-                * off the CPU, and thus off the hctx that is going away.
+                * Give up the CPU and sleep for a random short time to
+                * ensure that threads using a realtime scheduling class
+                * are migrated off the CPU, and thus off the hctx that
+                * is going away.
                 */
                msleep(3);
                goto retry;
        }
-       return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
+
+       return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
+                                       alloc_time_ns);
 }
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
@@ -413,6 +490,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                .q              = q,
                .flags          = flags,
                .cmd_flags      = op,
+               .rq_flags       = q->elevator ? RQF_ELV : 0,
+               .nr_tags        = 1,
        };
        struct request *rq;
        int ret;
@@ -421,7 +500,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
        if (ret)
                return ERR_PTR(ret);
 
-       rq = __blk_mq_alloc_request(&data);
+       rq = __blk_mq_alloc_requests(&data);
        if (!rq)
                goto out_queue_exit;
        rq->__data_len = 0;
@@ -441,6 +520,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                .q              = q,
                .flags          = flags,
                .cmd_flags      = op,
+               .rq_flags       = q->elevator ? RQF_ELV : 0,
+               .nr_tags        = 1,
        };
        u64 alloc_time_ns = 0;
        unsigned int cpu;
@@ -485,7 +566,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        tag = blk_mq_get_tag(&data);
        if (tag == BLK_MQ_NO_TAG)
                goto out_queue_exit;
-       return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
+       return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
+                                       alloc_time_ns);
 
 out_queue_exit:
        blk_queue_exit(q);
@@ -514,12 +596,12 @@ static void __blk_mq_free_request(struct request *rq)
 void blk_mq_free_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
-       struct elevator_queue *e = q->elevator;
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
        if (rq->rq_flags & RQF_ELVPRIV) {
-               if (e && e->type->ops.finish_request)
+               struct elevator_queue *e = q->elevator;
+
+               if (e->type->ops.finish_request)
                        e->type->ops.finish_request(rq);
                if (rq->elv.icq) {
                        put_io_context(rq->elv.icq->ioc);
@@ -527,7 +609,6 @@ void blk_mq_free_request(struct request *rq)
                }
        }
 
-       ctx->rq_completed[rq_is_sync(rq)]++;
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
                __blk_mq_dec_active_requests(hctx);
 
@@ -542,21 +623,173 @@ void blk_mq_free_request(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
-inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+void blk_mq_free_plug_rqs(struct blk_plug *plug)
 {
-       u64 now = 0;
+       struct request *rq;
 
-       if (blk_mq_need_time_stamp(rq))
-               now = ktime_get_ns();
+       while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) {
+               percpu_ref_get(&rq->q->q_usage_counter);
+               blk_mq_free_request(rq);
+       }
+}
 
+static void req_bio_endio(struct request *rq, struct bio *bio,
+                         unsigned int nbytes, blk_status_t error)
+{
+       if (unlikely(error)) {
+               bio->bi_status = error;
+       } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
+               /*
+                * Partial zone append completions cannot be supported as the
+                * BIO fragments may end up not being written sequentially.
+                */
+               if (bio->bi_iter.bi_size != nbytes)
+                       bio->bi_status = BLK_STS_IOERR;
+               else
+                       bio->bi_iter.bi_sector = rq->__sector;
+       }
+
+       bio_advance(bio, nbytes);
+
+       if (unlikely(rq->rq_flags & RQF_QUIET))
+               bio_set_flag(bio, BIO_QUIET);
+       /* don't actually finish bio if it's part of flush sequence */
+       if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
+               bio_endio(bio);
+}
+
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
+{
+       if (req->part && blk_do_io_stat(req)) {
+               const int sgrp = op_stat_group(req_op(req));
+
+               part_stat_lock();
+               part_stat_add(req->part, sectors[sgrp], bytes >> 9);
+               part_stat_unlock();
+       }
+}
+
+/**
+ * blk_update_request - Complete multiple bytes without completing the request
+ * @req:      the request being processed
+ * @error:    block status code
+ * @nr_bytes: number of bytes to complete for @req
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @req, but doesn't complete
+ *     the request structure even if @req doesn't have leftover.
+ *     If @req has leftover, sets it up for the next range of segments.
+ *
+ *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
+ *     %false return from this function.
+ *
+ * Note:
+ *     The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
+ *      except in the consistency check at the end of this function.
+ *
+ * Return:
+ *     %false - this request doesn't have any more data
+ *     %true  - this request has more data
+ **/
+bool blk_update_request(struct request *req, blk_status_t error,
+               unsigned int nr_bytes)
+{
+       int total_bytes;
+
+       trace_block_rq_complete(req, error, nr_bytes);
+
+       if (!req->bio)
+               return false;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+       if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+           error == BLK_STS_OK)
+               req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
+       if (unlikely(error && !blk_rq_is_passthrough(req) &&
+                    !(req->rq_flags & RQF_QUIET)))
+               blk_print_req_error(req, error);
+
+       blk_account_io_completion(req, nr_bytes);
+
+       total_bytes = 0;
+       while (req->bio) {
+               struct bio *bio = req->bio;
+               unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
+
+               if (bio_bytes == bio->bi_iter.bi_size)
+                       req->bio = bio->bi_next;
+
+               /* Completion has already been traced */
+               bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+               req_bio_endio(req, bio, bio_bytes, error);
+
+               total_bytes += bio_bytes;
+               nr_bytes -= bio_bytes;
+
+               if (!nr_bytes)
+                       break;
+       }
+
+       /*
+        * completely done
+        */
+       if (!req->bio) {
+               /*
+                * Reset counters so that the request stacking driver
+                * can find how many bytes remain in the request
+                * later.
+                */
+               req->__data_len = 0;
+               return false;
+       }
+
+       req->__data_len -= total_bytes;
+
+       /* update sector only for requests with clear definition of sector */
+       if (!blk_rq_is_passthrough(req))
+               req->__sector += total_bytes >> 9;
+
+       /* mixed attributes always follow the first bio */
+       if (req->rq_flags & RQF_MIXED_MERGE) {
+               req->cmd_flags &= ~REQ_FAILFAST_MASK;
+               req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
+       }
+
+       if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
+               /*
+                * If total number of sectors is less than the first segment
+                * size, something has gone terribly wrong.
+                */
+               if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
+                       blk_dump_rq_flags(req, "request botched");
+                       req->__data_len = blk_rq_cur_bytes(req);
+               }
+
+               /* recalculate the number of segments */
+               req->nr_phys_segments = blk_recalc_rq_segments(req);
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_update_request);
+
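As the kernel-doc above states, blk_update_request() returns %false once no bytes remain. A hedged sketch of the usual caller pattern, using only helpers that appear elsewhere in this diff; mydrv_complete_bytes() is a hypothetical driver helper, not part of the patch:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static void mydrv_complete_bytes(struct request *rq, blk_status_t error,
                                 unsigned int bytes)
{
        if (!blk_update_request(rq, error, bytes))
                __blk_mq_end_request(rq, error);        /* nothing left */
        else
                blk_mq_requeue_request(rq, true);       /* resubmit leftover */
}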
+static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
+{
        if (rq->rq_flags & RQF_STATS) {
                blk_mq_poll_stats_start(rq->q);
                blk_stat_add(rq, now);
        }
 
        blk_mq_sched_completed_request(rq, now);
-
        blk_account_io_done(rq, now);
+}
+
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+{
+       if (blk_mq_need_time_stamp(rq))
+               __blk_mq_end_request_acct(rq, ktime_get_ns());
 
        if (rq->end_io) {
                rq_qos_done(rq->q, rq);
@@ -575,6 +808,57 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
 }
 EXPORT_SYMBOL(blk_mq_end_request);
 
+#define TAG_COMP_BATCH         32
+
+static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
+                                         int *tag_array, int nr_tags)
+{
+       struct request_queue *q = hctx->queue;
+
+       blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
+       percpu_ref_put_many(&q->q_usage_counter, nr_tags);
+}
+
+void blk_mq_end_request_batch(struct io_comp_batch *iob)
+{
+       int tags[TAG_COMP_BATCH], nr_tags = 0;
+       struct blk_mq_hw_ctx *cur_hctx = NULL;
+       struct request *rq;
+       u64 now = 0;
+
+       if (iob->need_ts)
+               now = ktime_get_ns();
+
+       while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
+               prefetch(rq->bio);
+               prefetch(rq->rq_next);
+
+               blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq));
+               if (iob->need_ts)
+                       __blk_mq_end_request_acct(rq, now);
+
+               WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+               if (!refcount_dec_and_test(&rq->ref))
+                       continue;
+
+               blk_crypto_free_request(rq);
+               blk_pm_mark_last_busy(rq);
+               rq_qos_done(rq->q, rq);
+
+               if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
+                       if (cur_hctx)
+                               blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
+                       nr_tags = 0;
+                       cur_hctx = rq->mq_hctx;
+               }
+               tags[nr_tags++] = rq->tag;
+       }
+
+       if (nr_tags)
+               blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
+
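A hedged sketch of how a driver might feed blk_mq_end_request_batch(): completed requests are collected on an on-stack io_comp_batch and ended in one call. DEFINE_IO_COMP_BATCH() and the io_comp_batch/rq_list helpers come from the header side of this series and are assumed here; the mydrv_* names are made up:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

struct mydrv_queue;                                     /* hypothetical */
struct request *mydrv_pop_completed(struct mydrv_queue *mq);

static void mydrv_complete_all(struct mydrv_queue *mq)
{
        DEFINE_IO_COMP_BATCH(iob);      /* assumed: on-stack, zeroed batch */
        struct request *rq;

        while ((rq = mydrv_pop_completed(mq)) != NULL)
                rq_list_add(&iob.req_list, rq);

        if (!rq_list_empty(iob.req_list))
                blk_mq_end_request_batch(&iob); /* success path, BLK_STS_OK */
}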
 static void blk_complete_reqs(struct llist_head *list)
 {
        struct llist_node *entry = llist_reverse_order(llist_del_all(list));
@@ -658,7 +942,7 @@ bool blk_mq_complete_request_remote(struct request *rq)
         * For a polled request, always complete locally, it's pointless
         * to redirect the completion.
         */
-       if (rq->cmd_flags & REQ_HIPRI)
+       if (rq->cmd_flags & REQ_POLLED)
                return false;
 
        if (blk_mq_complete_need_ipi(rq)) {
@@ -723,7 +1007,14 @@ void blk_mq_start_request(struct request *rq)
        trace_block_rq_issue(rq);
 
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-               rq->io_start_time_ns = ktime_get_ns();
+               u64 start_time;
+#ifdef CONFIG_BLK_CGROUP
+               if (rq->bio)
+                       start_time = bio_issue_time(&rq->bio->bi_issue);
+               else
+#endif
+                       start_time = ktime_get_ns();
+               rq->io_start_time_ns = start_time;
                rq->stats_sectors = blk_rq_sectors(rq);
                rq->rq_flags |= RQF_STATS;
                rq_qos_issue(q, rq);
@@ -738,6 +1029,8 @@ void blk_mq_start_request(struct request *rq)
        if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
                q->integrity.profile->prepare_fn(rq);
 #endif
+       if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
+               WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq));
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
@@ -763,7 +1056,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
        /* this request will be re-inserted to io scheduler queue */
        blk_mq_sched_requeue_request(rq);
 
-       BUG_ON(!list_empty(&rq->queuelist));
        blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -844,17 +1136,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
-{
-       if (tag < tags->nr_tags) {
-               prefetch(tags->rqs[tag]);
-               return tags->rqs[tag];
-       }
-
-       return NULL;
-}
-EXPORT_SYMBOL(blk_mq_tag_to_rq);
-
 static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
                               void *priv, bool reserved)
 {
@@ -1059,24 +1340,16 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
        return data.rq;
 }
 
-static inline unsigned int queued_to_index(unsigned int queued)
-{
-       if (!queued)
-               return 0;
-
-       return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
-}
-
-static bool __blk_mq_get_driver_tag(struct request *rq)
+static bool __blk_mq_alloc_driver_tag(struct request *rq)
 {
-       struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
+       struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
        unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
        int tag;
 
        blk_mq_tag_busy(rq->mq_hctx);
 
        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
-               bt = rq->mq_hctx->tags->breserved_tags;
+               bt = &rq->mq_hctx->tags->breserved_tags;
                tag_offset = 0;
        } else {
                if (!hctx_may_queue(rq->mq_hctx, bt))
@@ -1091,11 +1364,9 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
        return true;
 }
 
-bool blk_mq_get_driver_tag(struct request *rq)
+bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
-       if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+       if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
                return false;
 
        if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
@@ -1119,7 +1390,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                struct sbitmap_queue *sbq;
 
                list_del_init(&wait->entry);
-               sbq = hctx->tags->bitmap_tags;
+               sbq = &hctx->tags->bitmap_tags;
                atomic_dec(&sbq->ws_active);
        }
        spin_unlock(&hctx->dispatch_wait_lock);
@@ -1137,7 +1408,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                 struct request *rq)
 {
-       struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
+       struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
        struct wait_queue_head *wq;
        wait_queue_entry_t *wait;
        bool ret;
@@ -1325,6 +1596,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
        int errors, queued;
        blk_status_t ret = BLK_STS_OK;
        LIST_HEAD(zone_list);
+       bool needs_resource = false;
 
        if (list_empty(list))
                return false;
@@ -1370,6 +1642,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                        queued++;
                        break;
                case BLK_STS_RESOURCE:
+                       needs_resource = true;
+                       fallthrough;
                case BLK_STS_DEV_RESOURCE:
                        blk_mq_handle_dev_resource(rq, list);
                        goto out;
@@ -1380,6 +1654,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                         * accept.
                         */
                        blk_mq_handle_zone_resource(rq, &zone_list);
+                       needs_resource = true;
                        break;
                default:
                        errors++;
@@ -1390,8 +1665,6 @@ out:
        if (!list_empty(&zone_list))
                list_splice_tail_init(&zone_list, list);
 
-       hctx->dispatched[queued_to_index(queued)]++;
-
        /* If we didn't flush the entire list, we could have told the driver
         * there was more coming, but that turned out to be a lie.
         */
@@ -1406,7 +1679,6 @@ out:
                /* For non-shared tags, the RESTART check will suffice */
                bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
                        (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
-               bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
 
                if (nr_budgets)
                        blk_mq_release_budgets(q, list);
@@ -1447,14 +1719,16 @@ out:
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
                 * that could otherwise occur if the queue is idle.  We'll do
-                * similar if we couldn't get budget and SCHED_RESTART is set.
+                * similar if we couldn't get budget or couldn't lock a zone
+                * and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
+               if (prep == PREP_DISPATCH_NO_BUDGET)
+                       needs_resource = true;
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
-               else if (needs_restart && (ret == BLK_STS_RESOURCE ||
-                                          no_budget_avail))
+               else if (needs_restart && needs_resource)
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 
                blk_mq_update_dispatch_busy(hctx, true);
@@ -1894,54 +2168,106 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
        spin_unlock(&ctx->lock);
 }
 
-static int plug_rq_cmp(void *priv, const struct list_head *a,
-                      const struct list_head *b)
+static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued,
+                             bool from_schedule)
 {
-       struct request *rqa = container_of(a, struct request, queuelist);
-       struct request *rqb = container_of(b, struct request, queuelist);
+       if (hctx->queue->mq_ops->commit_rqs) {
+               trace_block_unplug(hctx->queue, *queued, !from_schedule);
+               hctx->queue->mq_ops->commit_rqs(hctx);
+       }
+       *queued = 0;
+}
 
-       if (rqa->mq_ctx != rqb->mq_ctx)
-               return rqa->mq_ctx > rqb->mq_ctx;
-       if (rqa->mq_hctx != rqb->mq_hctx)
-               return rqa->mq_hctx > rqb->mq_hctx;
+static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
+{
+       struct blk_mq_hw_ctx *hctx = NULL;
+       struct request *rq;
+       int queued = 0;
+       int errors = 0;
 
-       return blk_rq_pos(rqa) > blk_rq_pos(rqb);
+       while ((rq = rq_list_pop(&plug->mq_list))) {
+               bool last = rq_list_empty(plug->mq_list);
+               blk_status_t ret;
+
+               if (hctx != rq->mq_hctx) {
+                       if (hctx)
+                               blk_mq_commit_rqs(hctx, &queued, from_schedule);
+                       hctx = rq->mq_hctx;
+               }
+
+               ret = blk_mq_request_issue_directly(rq, last);
+               switch (ret) {
+               case BLK_STS_OK:
+                       queued++;
+                       break;
+               case BLK_STS_RESOURCE:
+               case BLK_STS_DEV_RESOURCE:
+                       blk_mq_request_bypass_insert(rq, false, last);
+                       blk_mq_commit_rqs(hctx, &queued, from_schedule);
+                       return;
+               default:
+                       blk_mq_end_request(rq, ret);
+                       errors++;
+                       break;
+               }
+       }
+
+       /*
+        * If we didn't flush the entire list, we could have told the driver
+        * there was more coming, but that turned out to be a lie.
+        */
+       if (errors)
+               blk_mq_commit_rqs(hctx, &queued, from_schedule);
 }
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
+       struct blk_mq_hw_ctx *this_hctx;
+       struct blk_mq_ctx *this_ctx;
+       unsigned int depth;
        LIST_HEAD(list);
 
-       if (list_empty(&plug->mq_list))
+       if (rq_list_empty(plug->mq_list))
                return;
-       list_splice_init(&plug->mq_list, &list);
-
-       if (plug->rq_count > 2 && plug->multiple_queues)
-               list_sort(NULL, &list, plug_rq_cmp);
-
        plug->rq_count = 0;
 
+       if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
+               blk_mq_plug_issue_direct(plug, from_schedule);
+               if (rq_list_empty(plug->mq_list))
+                       return;
+       }
+
+       this_hctx = NULL;
+       this_ctx = NULL;
+       depth = 0;
        do {
-               struct list_head rq_list;
-               struct request *rq, *head_rq = list_entry_rq(list.next);
-               struct list_head *pos = &head_rq->queuelist; /* skip first */
-               struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
-               struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
-               unsigned int depth = 1;
-
-               list_for_each_continue(pos, &list) {
-                       rq = list_entry_rq(pos);
-                       BUG_ON(!rq->q);
-                       if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
-                               break;
-                       depth++;
+               struct request *rq;
+
+               rq = rq_list_pop(&plug->mq_list);
+
+               if (!this_hctx) {
+                       this_hctx = rq->mq_hctx;
+                       this_ctx = rq->mq_ctx;
+               } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+                       trace_block_unplug(this_hctx->queue, depth,
+                                               !from_schedule);
+                       blk_mq_sched_insert_requests(this_hctx, this_ctx,
+                                               &list, from_schedule);
+                       depth = 0;
+                       this_hctx = rq->mq_hctx;
+                       this_ctx = rq->mq_ctx;
+
                }
 
-               list_cut_before(&rq_list, &list, pos);
-               trace_block_unplug(head_rq->q, depth, !from_schedule);
-               blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
+               list_add(&rq->queuelist, &list);
+               depth++;
+       } while (!rq_list_empty(plug->mq_list));
+
+       if (!list_empty(&list)) {
+               trace_block_unplug(this_hctx->queue, depth, !from_schedule);
+               blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
                                                from_schedule);
-       } while(!list_empty(&list));
+       }
 }
 
 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
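The plug now carries requests on an intrusive singly linked list threaded through rq->rq_next rather than a list_head. A standalone sketch mirroring the rq_list_add()/rq_list_pop() semantics used above (userspace C, not the kernel macros themselves):

#include <stdio.h>

struct request {
        int tag;
        struct request *rq_next;
};

static void rq_list_add(struct request **list, struct request *rq)
{
        rq->rq_next = *list;            /* push onto the head */
        *list = rq;
}

static struct request *rq_list_pop(struct request **list)
{
        struct request *rq = *list;

        if (rq)
                *list = rq->rq_next;
        return rq;
}

int main(void)
{
        struct request a = { .tag = 1 }, b = { .tag = 2 };
        struct request *mq_list = NULL; /* rq_list_empty(): head == NULL */
        struct request *rq;

        rq_list_add(&mq_list, &a);
        rq_list_add(&mq_list, &b);
        while ((rq = rq_list_pop(&mq_list)))
                printf("popped tag %d\n", rq->tag);
        return 0;
}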
@@ -1964,19 +2290,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 }
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
-                                           struct request *rq,
-                                           blk_qc_t *cookie, bool last)
+                                           struct request *rq, bool last)
 {
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .last = last,
        };
-       blk_qc_t new_cookie;
        blk_status_t ret;
 
-       new_cookie = request_to_qc_t(hctx, rq);
-
        /*
         * For OK queue, we are done. For error, caller may kill it.
         * Any other error (busy), just add it to our list as we
@@ -1986,7 +2308,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
        switch (ret) {
        case BLK_STS_OK:
                blk_mq_update_dispatch_busy(hctx, false);
-               *cookie = new_cookie;
                break;
        case BLK_STS_RESOURCE:
        case BLK_STS_DEV_RESOURCE:
@@ -1995,7 +2316,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                break;
        default:
                blk_mq_update_dispatch_busy(hctx, false);
-               *cookie = BLK_QC_T_NONE;
                break;
        }
 
@@ -2004,7 +2324,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                                                struct request *rq,
-                                               blk_qc_t *cookie,
                                                bool bypass_insert, bool last)
 {
        struct request_queue *q = rq->q;
@@ -2024,7 +2343,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                goto insert;
        }
 
-       if (q->elevator && !bypass_insert)
+       if ((rq->rq_flags & RQF_ELV) && !bypass_insert)
                goto insert;
 
        budget_token = blk_mq_get_dispatch_budget(q);
@@ -2038,7 +2357,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                goto insert;
        }
 
-       return __blk_mq_issue_directly(hctx, rq, cookie, last);
+       return __blk_mq_issue_directly(hctx, rq, last);
 insert:
        if (bypass_insert)
                return BLK_STS_RESOURCE;
@@ -2052,7 +2371,6 @@ insert:
  * blk_mq_try_issue_directly - Try to send a request directly to device driver.
  * @hctx: Pointer of the associated hardware queue.
  * @rq: Pointer to request to be sent.
- * @cookie: Request queue cookie.
  *
  * If the device has enough resources to accept a new request now, send the
  * request directly to device driver. Else, insert at hctx->dispatch queue, so
@@ -2060,7 +2378,7 @@ insert:
  * queue have higher priority.
  */
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-               struct request *rq, blk_qc_t *cookie)
+               struct request *rq)
 {
        blk_status_t ret;
        int srcu_idx;
@@ -2069,7 +2387,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 
        hctx_lock(hctx, &srcu_idx);
 
-       ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
+       ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
        if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
                blk_mq_request_bypass_insert(rq, false, true);
        else if (ret != BLK_STS_OK)
@@ -2082,11 +2400,10 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 {
        blk_status_t ret;
        int srcu_idx;
-       blk_qc_t unused_cookie;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
        hctx_lock(hctx, &srcu_idx);
-       ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
+       ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
        hctx_unlock(hctx, srcu_idx);
 
        return ret;
@@ -2130,27 +2447,28 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 
 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
 {
-       list_add_tail(&rq->queuelist, &plug->mq_list);
-       plug->rq_count++;
-       if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
-               struct request *tmp;
+       if (!plug->multiple_queues) {
+               struct request *nxt = rq_list_peek(&plug->mq_list);
 
-               tmp = list_first_entry(&plug->mq_list, struct request,
-                                               queuelist);
-               if (tmp->q != rq->q)
+               if (nxt && nxt->q != rq->q)
                        plug->multiple_queues = true;
        }
+       if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
+               plug->has_elevator = true;
+       rq->rq_next = NULL;
+       rq_list_add(&plug->mq_list, rq);
+       plug->rq_count++;
 }
 
 /*
- * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
  * queues. This is important for md arrays to benefit from merging
  * requests.
  */
 static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
 {
        if (plug->multiple_queues)
-               return BLK_MAX_REQUEST_COUNT * 4;
+               return BLK_MAX_REQUEST_COUNT * 2;
        return BLK_MAX_REQUEST_COUNT;
 }
 
@@ -2166,57 +2484,63 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
  *
  * It will not queue the request if there is an error with the bio, or at the
  * request creation.
- *
- * Returns: Request queue cookie.
  */
-blk_qc_t blk_mq_submit_bio(struct bio *bio)
+void blk_mq_submit_bio(struct bio *bio)
 {
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        const int is_sync = op_is_sync(bio->bi_opf);
-       const int is_flush_fua = op_is_flush(bio->bi_opf);
-       struct blk_mq_alloc_data data = {
-               .q              = q,
-       };
        struct request *rq;
        struct blk_plug *plug;
-       struct request *same_queue_rq = NULL;
-       unsigned int nr_segs;
-       blk_qc_t cookie;
+       bool same_queue_rq = false;
+       unsigned int nr_segs = 1;
        blk_status_t ret;
-       bool hipri;
 
        blk_queue_bounce(q, &bio);
-       __blk_queue_split(&bio, &nr_segs);
+       if (blk_may_split(q, bio))
+               __blk_queue_split(q, &bio, &nr_segs);
 
        if (!bio_integrity_prep(bio))
                goto queue_exit;
 
-       if (!is_flush_fua && !blk_queue_nomerges(q) &&
-           blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
-               goto queue_exit;
-
-       if (blk_mq_sched_bio_merge(q, bio, nr_segs))
-               goto queue_exit;
+       if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
+               if (blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
+                       goto queue_exit;
+               if (blk_mq_sched_bio_merge(q, bio, nr_segs))
+                       goto queue_exit;
+       }
 
        rq_qos_throttle(q, bio);
 
-       hipri = bio->bi_opf & REQ_HIPRI;
-
-       data.cmd_flags = bio->bi_opf;
-       rq = __blk_mq_alloc_request(&data);
-       if (unlikely(!rq)) {
-               rq_qos_cleanup(q, bio);
-               if (bio->bi_opf & REQ_NOWAIT)
-                       bio_wouldblock_error(bio);
-               goto queue_exit;
+       plug = blk_mq_plug(q, bio);
+       if (plug && plug->cached_rq) {
+               rq = rq_list_pop(&plug->cached_rq);
+               INIT_LIST_HEAD(&rq->queuelist);
+       } else {
+               struct blk_mq_alloc_data data = {
+                       .q              = q,
+                       .nr_tags        = 1,
+                       .cmd_flags      = bio->bi_opf,
+                       .rq_flags       = q->elevator ? RQF_ELV : 0,
+               };
+
+               if (plug) {
+                       data.nr_tags = plug->nr_ios;
+                       plug->nr_ios = 1;
+                       data.cached_rq = &plug->cached_rq;
+               }
+               rq = __blk_mq_alloc_requests(&data);
+               if (unlikely(!rq)) {
+                       rq_qos_cleanup(q, bio);
+                       if (bio->bi_opf & REQ_NOWAIT)
+                               bio_wouldblock_error(bio);
+                       goto queue_exit;
+               }
        }
 
        trace_block_getrq(bio);
 
        rq_qos_track(q, rq, bio);
 
-       cookie = request_to_qc_t(data.hctx, rq);
-
        blk_mq_bio_to_request(rq, bio, nr_segs);
 
        ret = blk_crypto_init_request(rq);
@@ -2224,17 +2548,15 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                bio->bi_status = ret;
                bio_endio(bio);
                blk_mq_free_request(rq);
-               return BLK_QC_T_NONE;
+               return;
        }
 
-       plug = blk_mq_plug(q, bio);
-       if (unlikely(is_flush_fua)) {
-               /* Bypass scheduler for flush requests */
-               blk_insert_flush(rq);
-               blk_mq_run_hw_queue(data.hctx, true);
-       } else if (plug && (q->nr_hw_queues == 1 ||
-                  blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
-                  q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
+       if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
+               return;
+
+       if (plug && (q->nr_hw_queues == 1 ||
+           blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
+           q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
                /*
                 * Use plugging if we have a ->commit_rqs() hook as well, as
                 * we know the driver uses bd->last in a smart fashion.
@@ -2245,22 +2567,26 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                unsigned int request_count = plug->rq_count;
                struct request *last = NULL;
 
-               if (!request_count)
+               if (!request_count) {
                        trace_block_plug(q);
-               else
-                       last = list_entry_rq(plug->mq_list.prev);
+               } else if (!blk_queue_nomerges(q)) {
+                       last = rq_list_peek(&plug->mq_list);
+                       if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE)
+                               last = NULL;
+               }
 
-               if (request_count >= blk_plug_max_rq_count(plug) || (last &&
-                   blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
-                       blk_flush_plug_list(plug, false);
+               if (request_count >= blk_plug_max_rq_count(plug) || last) {
+                       blk_mq_flush_plug_list(plug, false);
                        trace_block_plug(q);
                }
 
                blk_add_rq_to_plug(plug, rq);
-       } else if (q->elevator) {
+       } else if (rq->rq_flags & RQF_ELV) {
                /* Insert the request at the IO scheduler queue */
                blk_mq_sched_insert_request(rq, false, true, true);
        } else if (plug && !blk_queue_nomerges(q)) {
+               struct request *next_rq = NULL;
+
                /*
                 * We do limited plugging. If the bio can be merged, do that.
                 * Otherwise the existing request in the plug list will be
@@ -2268,39 +2594,32 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                 * The plug list might get flushed before this. If that happens,
                 * the plug list is empty, and same_queue_rq is invalid.
                 */
-               if (list_empty(&plug->mq_list))
-                       same_queue_rq = NULL;
                if (same_queue_rq) {
-                       list_del_init(&same_queue_rq->queuelist);
+                       next_rq = rq_list_pop(&plug->mq_list);
                        plug->rq_count--;
                }
                blk_add_rq_to_plug(plug, rq);
                trace_block_plug(q);
 
-               if (same_queue_rq) {
-                       data.hctx = same_queue_rq->mq_hctx;
+               if (next_rq) {
                        trace_block_unplug(q, 1, true);
-                       blk_mq_try_issue_directly(data.hctx, same_queue_rq,
-                                       &cookie);
+                       blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq);
                }
        } else if ((q->nr_hw_queues > 1 && is_sync) ||
-                       !data.hctx->dispatch_busy) {
+                  !rq->mq_hctx->dispatch_busy) {
                /*
                 * There is no scheduler and we can try to send directly
                 * to the hardware.
                 */
-               blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+               blk_mq_try_issue_directly(rq->mq_hctx, rq);
        } else {
                /* Default case. */
                blk_mq_sched_insert_request(rq, false, true, true);
        }
 
-       if (!hipri)
-               return BLK_QC_T_NONE;
-       return cookie;
+       return;
 queue_exit:
        blk_queue_exit(q);
-       return BLK_QC_T_NONE;
 }
 
 static size_t order_to_size(unsigned int order)
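blk_mq_submit_bio() above first tries to take a request that was pre-allocated into plug->cached_rq; only when the cache is empty does it fall into the allocation path, and with an active plug it then asks for plug->nr_ios tags at once so the spares refill the cache. A rough standalone sketch of that one-from-cache-or-batch-allocate pattern follows; the req type, get_request() and alloc_batch() names are made up for illustration.

#include <stdio.h>
#include <stdlib.h>

struct req {
	struct req *next;
	int id;
};

static struct req *cache;       /* stand-in for plug->cached_rq */
static int next_id;

/* allocate nr requests in one go: hand one back, park the rest in the cache */
static struct req *alloc_batch(unsigned int nr)
{
	struct req *first = NULL;

	for (unsigned int i = 0; i < nr; i++) {
		struct req *r = malloc(sizeof(*r));

		if (!r)
			break;
		r->id = next_id++;
		r->next = NULL;
		if (!first) {
			first = r;
		} else {
			r->next = cache;
			cache = r;
		}
	}
	return first;
}

static struct req *get_request(unsigned int batch)
{
	if (cache) {                    /* fast path: reuse a cached request */
		struct req *r = cache;

		cache = r->next;
		return r;
	}
	return alloc_batch(batch);      /* slow path: batched allocation */
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		struct req *r = get_request(4);

		if (!r)
			return 1;
		printf("request %d came %s\n", r->id,
		       i % 4 ? "from the cache" : "from a fresh batch");
		free(r);
	}
	while (cache) {                 /* drop whatever is still cached */
		struct req *r = cache;

		cache = r->next;
		free(r);
	}
	return 0;
}
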
@@ -2309,19 +2628,22 @@ static size_t order_to_size(unsigned int order)
 }
 
 /* called before freeing request pool in @tags */
-static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
-               struct blk_mq_tags *tags, unsigned int hctx_idx)
+static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
+                                   struct blk_mq_tags *tags)
 {
-       struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
        struct page *page;
        unsigned long flags;
 
+       /* There is no need to clear a driver tags' own mapping */
+       if (drv_tags == tags)
+               return;
+
        list_for_each_entry(page, &tags->page_list, lru) {
                unsigned long start = (unsigned long)page_address(page);
                unsigned long end = start + order_to_size(page->private);
                int i;
 
-               for (i = 0; i < set->queue_depth; i++) {
+               for (i = 0; i < drv_tags->nr_tags; i++) {
                        struct request *rq = drv_tags->rqs[i];
                        unsigned long rq_addr = (unsigned long)rq;
 
@@ -2345,9 +2667,15 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
 {
+       struct blk_mq_tags *drv_tags;
        struct page *page;
 
-       if (tags->rqs && set->ops->exit_request) {
+       if (blk_mq_is_shared_tags(set->flags))
+               drv_tags = set->shared_tags;
+       else
+               drv_tags = set->tags[hctx_idx];
+
+       if (tags->static_rqs && set->ops->exit_request) {
                int i;
 
                for (i = 0; i < tags->nr_tags; i++) {
@@ -2360,7 +2688,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                }
        }
 
-       blk_mq_clear_rq_mapping(set, tags, hctx_idx);
+       blk_mq_clear_rq_mapping(drv_tags, tags);
 
        while (!list_empty(&tags->page_list)) {
                page = list_first_entry(&tags->page_list, struct page, lru);
@@ -2374,21 +2702,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
        }
 }
 
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 {
        kfree(tags->rqs);
        tags->rqs = NULL;
        kfree(tags->static_rqs);
        tags->static_rqs = NULL;
 
-       blk_mq_free_tags(tags, flags);
+       blk_mq_free_tags(tags);
 }
 
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-                                       unsigned int hctx_idx,
-                                       unsigned int nr_tags,
-                                       unsigned int reserved_tags,
-                                       unsigned int flags)
+static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+                                              unsigned int hctx_idx,
+                                              unsigned int nr_tags,
+                                              unsigned int reserved_tags)
 {
        struct blk_mq_tags *tags;
        int node;
@@ -2397,7 +2724,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
 
-       tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
+       tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
+                               BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
        if (!tags)
                return NULL;
 
@@ -2405,7 +2733,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 node);
        if (!tags->rqs) {
-               blk_mq_free_tags(tags, flags);
+               blk_mq_free_tags(tags);
                return NULL;
        }
 
@@ -2414,7 +2742,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                        node);
        if (!tags->static_rqs) {
                kfree(tags->rqs);
-               blk_mq_free_tags(tags, flags);
+               blk_mq_free_tags(tags);
                return NULL;
        }
 
@@ -2436,8 +2764,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
        return 0;
 }
 
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-                    unsigned int hctx_idx, unsigned int depth)
+static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
+                           struct blk_mq_tags *tags,
+                           unsigned int hctx_idx, unsigned int depth)
 {
        unsigned int i, j, entries_per_page, max_order = 4;
        size_t rq_size, left;
@@ -2848,37 +3177,58 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
        }
 }
 
-static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
-                                       int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+                                            unsigned int hctx_idx,
+                                            unsigned int depth)
 {
-       unsigned int flags = set->flags;
-       int ret = 0;
+       struct blk_mq_tags *tags;
+       int ret;
 
-       set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
-                                       set->queue_depth, set->reserved_tags, flags);
-       if (!set->tags[hctx_idx])
-               return false;
+       tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
+       if (!tags)
+               return NULL;
 
-       ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
-                               set->queue_depth);
-       if (!ret)
-               return true;
+       ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
+       if (ret) {
+               blk_mq_free_rq_map(tags);
+               return NULL;
+       }
 
-       blk_mq_free_rq_map(set->tags[hctx_idx], flags);
-       set->tags[hctx_idx] = NULL;
-       return false;
+       return tags;
 }
 
-static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
-                                        unsigned int hctx_idx)
+static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+                                      int hctx_idx)
 {
-       unsigned int flags = set->flags;
+       if (blk_mq_is_shared_tags(set->flags)) {
+               set->tags[hctx_idx] = set->shared_tags;
 
-       if (set->tags && set->tags[hctx_idx]) {
-               blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-               blk_mq_free_rq_map(set->tags[hctx_idx], flags);
-               set->tags[hctx_idx] = NULL;
+               return true;
        }
+
+       set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
+                                                      set->queue_depth);
+
+       return set->tags[hctx_idx];
+}
+
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+                            struct blk_mq_tags *tags,
+                            unsigned int hctx_idx)
+{
+       if (tags) {
+               blk_mq_free_rqs(set, tags, hctx_idx);
+               blk_mq_free_rq_map(tags);
+       }
+}
+
+static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+                                     unsigned int hctx_idx)
+{
+       if (!blk_mq_is_shared_tags(set->flags))
+               blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
+
+       set->tags[hctx_idx] = NULL;
 }
 
 static void blk_mq_map_swqueue(struct request_queue *q)
@@ -2911,7 +3261,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                        hctx_idx = set->map[j].mq_map[i];
                        /* unmapped hw queue can be remapped after CPU topo changed */
                        if (!set->tags[hctx_idx] &&
-                           !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+                           !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
                                /*
                                 * If tags initialization fail for some hctx,
                                 * that hctx won't be brought online.  In this
@@ -2958,8 +3308,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                         * fallback in case of a new remap fails
                         * allocation
                         */
-                       if (i && set->tags[i])
-                               blk_mq_free_map_and_requests(set, i);
+                       if (i)
+                               __blk_mq_free_map_and_rqs(set, i);
 
                        hctx->tags = NULL;
                        continue;
@@ -3255,8 +3605,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx = hctxs[j];
 
                if (hctx) {
-                       if (hctx->tags)
-                               blk_mq_free_map_and_requests(set, j);
+                       __blk_mq_free_map_and_rqs(set, j);
                        blk_mq_exit_hctx(q, set, hctx, j);
                        hctxs[j] = NULL;
                }
@@ -3343,8 +3692,16 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
        int i;
 
+       if (blk_mq_is_shared_tags(set->flags)) {
+               set->shared_tags = blk_mq_alloc_map_and_rqs(set,
+                                               BLK_MQ_NO_HCTX_IDX,
+                                               set->queue_depth);
+               if (!set->shared_tags)
+                       return -ENOMEM;
+       }
+
        for (i = 0; i < set->nr_hw_queues; i++) {
-               if (!__blk_mq_alloc_map_and_request(set, i))
+               if (!__blk_mq_alloc_map_and_rqs(set, i))
                        goto out_unwind;
                cond_resched();
        }
@@ -3353,7 +3710,12 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 
 out_unwind:
        while (--i >= 0)
-               blk_mq_free_map_and_requests(set, i);
+               __blk_mq_free_map_and_rqs(set, i);
+
+       if (blk_mq_is_shared_tags(set->flags)) {
+               blk_mq_free_map_and_rqs(set, set->shared_tags,
+                                       BLK_MQ_NO_HCTX_IDX);
+       }
 
        return -ENOMEM;
 }
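With BLK_MQ_F_TAG_HCTX_SHARED, the code above allocates the tag map and its requests once (set->shared_tags) and later points every hardware queue's set->tags[] slot at that single map, instead of allocating one map per queue. A minimal sketch of that sharing decision, with tag_map and alloc_map() as invented placeholders:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct tag_map {
	unsigned int depth;
};

static struct tag_map *alloc_map(unsigned int depth)
{
	struct tag_map *m = malloc(sizeof(*m));

	if (m)
		m->depth = depth;
	return m;
}

int main(void)
{
	const bool shared = true;               /* BLK_MQ_F_TAG_HCTX_SHARED analogue */
	enum { NR_HW_QUEUES = 4 };
	struct tag_map *shared_map = NULL;
	struct tag_map *per_hctx[NR_HW_QUEUES] = { NULL };

	if (shared)
		shared_map = alloc_map(64);     /* allocated exactly once */

	for (int i = 0; i < NR_HW_QUEUES; i++)
		per_hctx[i] = shared ? shared_map : alloc_map(64);

	printf("hctx 0 and hctx 1 share a tag map: %s\n",
	       per_hctx[0] == per_hctx[1] ? "yes" : "no");

	if (shared) {
		free(shared_map);               /* freed exactly once, too */
	} else {
		for (int i = 0; i < NR_HW_QUEUES; i++)
			free(per_hctx[i]);
	}
	return 0;
}
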
@@ -3363,7 +3725,7 @@ out_unwind:
  * may reduce the depth asked for, if memory is tight. set->queue_depth
  * will be updated to reflect the allocated depth.
  */
-static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
 {
        unsigned int depth;
        int err;
@@ -3529,27 +3891,15 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        if (ret)
                goto out_free_mq_map;
 
-       ret = blk_mq_alloc_map_and_requests(set);
+       ret = blk_mq_alloc_set_map_and_rqs(set);
        if (ret)
                goto out_free_mq_map;
 
-       if (blk_mq_is_sbitmap_shared(set->flags)) {
-               atomic_set(&set->active_queues_shared_sbitmap, 0);
-
-               if (blk_mq_init_shared_sbitmap(set)) {
-                       ret = -ENOMEM;
-                       goto out_free_mq_rq_maps;
-               }
-       }
-
        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);
 
        return 0;
 
-out_free_mq_rq_maps:
-       for (i = 0; i < set->nr_hw_queues; i++)
-               blk_mq_free_map_and_requests(set, i);
 out_free_mq_map:
        for (i = 0; i < set->nr_maps; i++) {
                kfree(set->map[i].mq_map);
@@ -3582,10 +3932,12 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
        int i, j;
 
        for (i = 0; i < set->nr_hw_queues; i++)
-               blk_mq_free_map_and_requests(set, i);
+               __blk_mq_free_map_and_rqs(set, i);
 
-       if (blk_mq_is_sbitmap_shared(set->flags))
-               blk_mq_exit_shared_sbitmap(set);
+       if (blk_mq_is_shared_tags(set->flags)) {
+               blk_mq_free_map_and_rqs(set, set->shared_tags,
+                                       BLK_MQ_NO_HCTX_IDX);
+       }
 
        for (j = 0; j < set->nr_maps; j++) {
                kfree(set->map[j].mq_map);
@@ -3620,20 +3972,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
                 * If we're using an MQ scheduler, just update the scheduler
                 * queue depth. This is similar to what the old code would do.
                 */
-               if (!hctx->sched_tags) {
-                       ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
-                                                       false);
-                       if (!ret && blk_mq_is_sbitmap_shared(set->flags))
-                               blk_mq_tag_resize_shared_sbitmap(set, nr);
-               } else {
+               if (hctx->sched_tags) {
                        ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
-                                                       nr, true);
-                       if (blk_mq_is_sbitmap_shared(set->flags)) {
-                               hctx->sched_tags->bitmap_tags =
-                                       &q->sched_bitmap_tags;
-                               hctx->sched_tags->breserved_tags =
-                                       &q->sched_breserved_tags;
-                       }
+                                                     nr, true);
+               } else {
+                       ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
+                                                     false);
                }
                if (ret)
                        break;
@@ -3642,9 +3986,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        }
        if (!ret) {
                q->nr_requests = nr;
-               if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
-                       sbitmap_queue_resize(&q->sched_bitmap_tags,
-                                            nr - set->reserved_tags);
+               if (blk_mq_is_shared_tags(set->flags)) {
+                       if (q->elevator)
+                               blk_mq_tag_update_sched_shared_tags(q);
+                       else
+                               blk_mq_tag_resize_shared_tags(set, nr);
+               }
        }
 
        blk_mq_unquiesce_queue(q);
@@ -3863,15 +4210,20 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
        return ret;
 }
 
-static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
-                                    struct request *rq)
+static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
 {
+       struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
+       struct request *rq = blk_qc_to_rq(hctx, qc);
        struct hrtimer_sleeper hs;
        enum hrtimer_mode mode;
        unsigned int nsecs;
        ktime_t kt;
 
-       if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
+       /*
+        * If a request has completed on a queue that uses an I/O scheduler, we
+        * won't get back a request from blk_qc_to_rq.
+        */
+       if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
                return false;
 
        /*
@@ -3913,92 +4265,37 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 
        __set_current_state(TASK_RUNNING);
        destroy_hrtimer_on_stack(&hs.timer);
-       return true;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q,
-                              struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
-{
-       struct request *rq;
-
-       if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
-               return false;
-
-       if (!blk_qc_t_is_internal(cookie))
-               rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
-       else {
-               rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
-               /*
-                * With scheduling, if the request has completed, we'll
-                * get a NULL return here, as we clear the sched tag when
-                * that happens. The request still remains valid, like always,
-                * so we should be safe with just the NULL check.
-                */
-               if (!rq)
-                       return false;
-       }
-
-       return blk_mq_poll_hybrid_sleep(q, rq);
-}
-
-/**
- * blk_poll - poll for IO completions
- * @q:  the queue
- * @cookie: cookie passed back at IO submission time
- * @spin: whether to spin for completions
- *
- * Description:
- *    Poll for completions on the passed in queue. Returns number of
- *    completed entries found. If @spin is true, then blk_poll will continue
- *    looping until at least one completion is found, unless the task is
- *    otherwise marked running (or we need to reschedule).
- */
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
-{
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int state;
-
-       if (!blk_qc_t_valid(cookie) ||
-           !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-               return 0;
-
-       if (current->plug)
-               blk_flush_plug_list(current->plug, false);
-
-       hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
 
        /*
-        * If we sleep, have the caller restart the poll loop to reset
-        * the state. Like for the other success return cases, the
-        * caller is responsible for checking if the IO completed. If
-        * the IO isn't complete, we'll get called again and will go
-        * straight to the busy poll loop. If specified not to spin,
-        * we also should not sleep.
+        * If we sleep, have the caller restart the poll loop to reset the
+        * state.  Like for the other success return cases, the caller is
+        * responsible for checking if the IO completed.  If the IO isn't
+        * complete, we'll get called again and will go straight to the busy
+        * poll loop.
         */
-       if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
-               return 1;
+       return true;
+}
 
-       hctx->poll_considered++;
+static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
+                              struct io_comp_batch *iob, unsigned int flags)
+{
+       struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
+       long state = get_current_state();
+       int ret;
 
-       state = get_current_state();
        do {
-               int ret;
-
-               hctx->poll_invoked++;
-
-               ret = q->mq_ops->poll(hctx);
+               ret = q->mq_ops->poll(hctx, iob);
                if (ret > 0) {
-                       hctx->poll_success++;
                        __set_current_state(TASK_RUNNING);
                        return ret;
                }
 
                if (signal_pending_state(state, current))
                        __set_current_state(TASK_RUNNING);
-
                if (task_is_running(current))
                        return 1;
-               if (ret < 0 || !spin)
+
+               if (ret < 0 || (flags & BLK_POLL_ONESHOT))
                        break;
                cpu_relax();
        } while (!need_resched());
@@ -4006,7 +4303,17 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
        __set_current_state(TASK_RUNNING);
        return 0;
 }
-EXPORT_SYMBOL_GPL(blk_poll);
+
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+               unsigned int flags)
+{
+       if (!(flags & BLK_POLL_NOSLEEP) &&
+           q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
+               if (blk_mq_poll_hybrid(q, cookie))
+                       return 1;
+       }
+       return blk_mq_poll_classic(q, cookie, iob, flags);
+}
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
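blk_mq_poll() above now decodes the cookie itself and picks between hybrid polling (sleep for the estimated completion time first) and the classic busy loop in blk_mq_poll_classic(), which keeps calling the driver's ->poll() hook until it reaps completions, the task needs to run, or the caller asked for a single pass. A standalone sketch of that classic loop; POLL_ONESHOT and fake_driver_poll() are illustrative stand-ins, not the real flags or driver interface.

#include <stdio.h>

#define POLL_ONESHOT 0x1        /* stand-in for BLK_POLL_ONESHOT */

/* pretend the completion shows up after a few empty polls */
static int fake_driver_poll(int *budget)
{
	return --(*budget) <= 0 ? 1 : 0;
}

static int poll_classic(unsigned int flags)
{
	int budget = 3;

	for (;;) {
		int found = fake_driver_poll(&budget);

		if (found > 0)
			return found;           /* completions were reaped */
		if (flags & POLL_ONESHOT)
			break;                  /* caller wanted one pass only */
		/*
		 * The real loop also bails out on pending signals and on
		 * need_resched() before spinning again.
		 */
	}
	return 0;
}

int main(void)
{
	printf("spinning poll found %d completion(s)\n", poll_classic(0));
	printf("oneshot poll found %d completion(s)\n", poll_classic(POLL_ONESHOT));
	return 0;
}
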
index d08779f..28859fc 100644
@@ -25,18 +25,14 @@ struct blk_mq_ctx {
        unsigned short          index_hw[HCTX_MAX_TYPES];
        struct blk_mq_hw_ctx    *hctxs[HCTX_MAX_TYPES];
 
-       /* incremented at dispatch time */
-       unsigned long           rq_dispatched[2];
-       unsigned long           rq_merged;
-
-       /* incremented at completion time */
-       unsigned long           ____cacheline_aligned_in_smp rq_completed[2];
-
        struct request_queue    *queue;
        struct blk_mq_ctxs      *ctxs;
        struct kobject          kobj;
 } ____cacheline_aligned_in_smp;
 
+void blk_mq_submit_bio(struct bio *bio);
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+               unsigned int flags);
 void blk_mq_exit_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
@@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq);
  */
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx);
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-                                       unsigned int hctx_idx,
-                                       unsigned int nr_tags,
-                                       unsigned int reserved_tags,
-                                       unsigned int flags);
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-                    unsigned int hctx_idx, unsigned int depth);
-
+void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+                               unsigned int hctx_idx, unsigned int depth);
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+                            struct blk_mq_tags *tags,
+                            unsigned int hctx_idx);
 /*
  * Internal helpers for request insertion into sw queues
  */
@@ -109,9 +102,9 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
        enum hctx_type type = HCTX_TYPE_DEFAULT;
 
        /*
-        * The caller ensure that if REQ_HIPRI, poll must be enabled.
+        * The caller ensures that if REQ_POLLED is set, poll must be enabled.
         */
-       if (flags & REQ_HIPRI)
+       if (flags & REQ_POLLED)
                type = HCTX_TYPE_POLL;
        else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
                type = HCTX_TYPE_READ;
@@ -128,6 +121,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
+void blk_mq_free_plug_rqs(struct blk_plug *plug);
+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_release(struct request_queue *q);
 
@@ -154,23 +149,27 @@ struct blk_mq_alloc_data {
        blk_mq_req_flags_t flags;
        unsigned int shallow_depth;
        unsigned int cmd_flags;
+       unsigned int rq_flags;
+
+       /* allocate multiple requests/tags in one go */
+       unsigned int nr_tags;
+       struct request **cached_rq;
 
        /* input & output parameter */
        struct blk_mq_ctx *ctx;
        struct blk_mq_hw_ctx *hctx;
 };
 
-static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
+static inline bool blk_mq_is_shared_tags(unsigned int flags)
 {
        return flags & BLK_MQ_F_TAG_HCTX_SHARED;
 }
 
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
 {
-       if (data->q->elevator)
-               return data->hctx->sched_tags;
-
-       return data->hctx->tags;
+       if (!(data->rq_flags & RQF_ELV))
+               return data->hctx->tags;
+       return data->hctx->sched_tags;
 }
 
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
@@ -220,24 +219,24 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
 
 static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_mq_is_sbitmap_shared(hctx->flags))
-               atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
+       if (blk_mq_is_shared_tags(hctx->flags))
+               atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
        else
                atomic_inc(&hctx->nr_active);
 }
 
 static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_mq_is_sbitmap_shared(hctx->flags))
-               atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
+       if (blk_mq_is_shared_tags(hctx->flags))
+               atomic_dec(&hctx->queue->nr_active_requests_shared_tags);
        else
                atomic_dec(&hctx->nr_active);
 }
 
 static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
 {
-       if (blk_mq_is_sbitmap_shared(hctx->flags))
-               return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
+       if (blk_mq_is_shared_tags(hctx->flags))
+               return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
        return atomic_read(&hctx->nr_active);
 }
 static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@@ -260,7 +259,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
        __blk_mq_put_driver_tag(rq->mq_hctx, rq);
 }
 
-bool blk_mq_get_driver_tag(struct request *rq);
+bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
+
+static inline bool blk_mq_get_driver_tag(struct request *rq)
+{
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+       if (rq->tag != BLK_MQ_NO_TAG &&
+           !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+               hctx->tags->rqs[rq->tag] = rq;
+               return true;
+       }
+
+       return __blk_mq_get_driver_tag(hctx, rq);
+}
 
 static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
 {
@@ -331,19 +343,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
        if (bt->sb.depth == 1)
                return true;
 
-       if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+       if (blk_mq_is_shared_tags(hctx->flags)) {
                struct request_queue *q = hctx->queue;
-               struct blk_mq_tag_set *set = q->tag_set;
 
                if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
                        return true;
-               users = atomic_read(&set->active_queues_shared_sbitmap);
        } else {
                if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        return true;
-               users = atomic_read(&hctx->tags->active_queues);
        }
 
+       users = atomic_read(&hctx->tags->active_queues);
+
        if (!users)
                return true;
 
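After this change hctx_may_queue() reads the number of active users from hctx->tags->active_queues in both the shared and the per-hctx case; the part of the function not shown here then limits each active queue to roughly its fair share of the tag depth. A standalone sketch of that kind of fairness check follows; the ceiling division and the minimum share of 4 are assumptions made for the example, not values taken from this diff.

#include <stdbool.h>
#include <stdio.h>

static bool may_queue(unsigned int depth, unsigned int users,
		      unsigned int active_requests)
{
	unsigned int share;

	if (!users)
		return true;                    /* nobody else is using the tags */

	share = (depth + users - 1) / users;    /* assumed: round the share up */
	if (share < 4)
		share = 4;                      /* assumed minimum share */

	return active_requests < share;
}

int main(void)
{
	/* 256 tags shared by 8 active queues: each may hold roughly 32 */
	printf("%d\n", may_queue(256, 8, 10));  /* 1: below its share */
	printf("%d\n", may_queue(256, 8, 40));  /* 0: over its share */
	return 0;
}
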
index f000f83..3cfbc86 100644
@@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
         * BIO_TRACKED lets controllers know that a bio went through the
         * normal rq_qos path.
         */
-       bio_set_flag(bio, BIO_TRACKED);
-       if (q->rq_qos)
+       if (q->rq_qos) {
+               bio_set_flag(bio, BIO_TRACKED);
                __rq_qos_throttle(q->rq_qos, bio);
+       }
 }
 
 static inline void rq_qos_track(struct request_queue *q, struct request *rq,
index a7c857a..b880c70 100644
@@ -842,6 +842,24 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
 
+static bool disk_has_partitions(struct gendisk *disk)
+{
+       unsigned long idx;
+       struct block_device *part;
+       bool ret = false;
+
+       rcu_read_lock();
+       xa_for_each(&disk->part_tbl, idx, part) {
+               if (bdev_is_partition(part)) {
+                       ret = true;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       return ret;
+}
+
 /**
  * blk_queue_set_zoned - configure a disk queue zoned model.
  * @disk:      the gendisk of the queue to configure
@@ -876,7 +894,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
                 * we do nothing special as far as the block layer is concerned.
                 */
                if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
-                   !xa_empty(&disk->part_tbl))
+                   disk_has_partitions(disk))
                        model = BLK_ZONED_NONE;
                break;
        case BLK_ZONED_NONE:
index 614d9d4..cef1f71 100644
@@ -17,6 +17,7 @@
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-wbt.h"
+#include "blk-throttle.h"
 
 struct queue_sysfs_entry {
        struct attribute attr;
@@ -432,26 +433,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
 static ssize_t queue_poll_store(struct request_queue *q, const char *page,
                                size_t count)
 {
-       unsigned long poll_on;
-       ssize_t ret;
-
-       if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
-           !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
+       if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                return -EINVAL;
-
-       ret = queue_var_store(&poll_on, page, count);
-       if (ret < 0)
-               return ret;
-
-       if (poll_on) {
-               blk_queue_flag_set(QUEUE_FLAG_POLL, q);
-       } else {
-               blk_mq_freeze_queue(q);
-               blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
-               blk_mq_unfreeze_queue(q);
-       }
-
-       return ret;
+       pr_info_ratelimited("writes to the poll attribute are ignored.\n");
+       pr_info_ratelimited("please use driver specific parameters instead.\n");
+       return count;
 }
 
 static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
@@ -887,16 +873,15 @@ int blk_register_queue(struct gendisk *disk)
        }
 
        mutex_lock(&q->sysfs_lock);
+
+       ret = disk_register_independent_access_ranges(disk, NULL);
+       if (ret)
+               goto put_dev;
+
        if (q->elevator) {
                ret = elv_register_queue(q, false);
-               if (ret) {
-                       mutex_unlock(&q->sysfs_lock);
-                       mutex_unlock(&q->sysfs_dir_lock);
-                       kobject_del(&q->kobj);
-                       blk_trace_remove_sysfs(dev);
-                       kobject_put(&dev->kobj);
-                       return ret;
-               }
+               if (ret)
+                       goto put_dev;
        }
 
        blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@@ -928,6 +913,16 @@ unlock:
        }
 
        return ret;
+
+put_dev:
+       disk_unregister_independent_access_ranges(disk);
+       mutex_unlock(&q->sysfs_lock);
+       mutex_unlock(&q->sysfs_dir_lock);
+       kobject_del(&q->kobj);
+       blk_trace_remove_sysfs(dev);
+       kobject_put(&dev->kobj);
+
+       return ret;
 }
 
 /**
@@ -972,6 +967,7 @@ void blk_unregister_queue(struct gendisk *disk)
        mutex_lock(&q->sysfs_lock);
        if (q->elevator)
                elv_unregister_queue(q);
+       disk_unregister_independent_access_ranges(disk);
        mutex_unlock(&q->sysfs_lock);
        mutex_unlock(&q->sysfs_dir_lock);
 
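The blk_register_queue() change above replaces the open-coded error handling after elv_register_queue() with a single put_dev label that unwinds the independent-access-ranges registration, the sysfs locks and the kobject in one place. A tiny sketch of that single-exit unwind idiom, with all names invented:

#include <stdbool.h>
#include <stdio.h>

static bool register_a(void) { return true; }
static bool register_b(void) { return false; }  /* pretend this step fails */
static void unregister_a(void) { printf("rolled back step a\n"); }

static int do_register(void)
{
	if (!register_a())
		return -1;
	if (!register_b())
		goto undo_a;            /* one label undoes the earlier steps */
	return 0;

undo_a:
	unregister_a();
	return -1;
}

int main(void)
{
	printf("register returned %d\n", do_register());
	return 0;
}
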
index 7c4e799..39bb6e6 100644
@@ -13,6 +13,7 @@
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 #include "blk-cgroup-rwstat.h"
+#include "blk-throttle.h"
 
 /* Max dispatch from a group in 1 round */
 #define THROTL_GRP_QUANTUM 8
  */
 #define LATENCY_FILTERED_HD (1000L) /* 1ms */
 
-static struct blkcg_policy blkcg_policy_throtl;
-
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
-/*
- * To implement hierarchical throttling, throtl_grps form a tree and bios
- * are dispatched upwards level by level until they reach the top and get
- * issued.  When dispatching bios from the children and local group at each
- * level, if the bios are dispatched into a single bio_list, there's a risk
- * of a local or child group which can queue many bios at once filling up
- * the list starving others.
- *
- * To avoid such starvation, dispatched bios are queued separately
- * according to where they came from.  When they are again dispatched to
- * the parent, they're popped in round-robin order so that no single source
- * hogs the dispatch window.
- *
- * throtl_qnode is used to keep the queued bios separated by their sources.
- * Bios are queued to throtl_qnode which in turn is queued to
- * throtl_service_queue and then dispatched in round-robin order.
- *
- * It's also used to track the reference counts on blkg's.  A qnode always
- * belongs to a throtl_grp and gets queued on itself or the parent, so
- * incrementing the reference of the associated throtl_grp when a qnode is
- * queued and decrementing when dequeued is enough to keep the whole blkg
- * tree pinned while bios are in flight.
- */
-struct throtl_qnode {
-       struct list_head        node;           /* service_queue->queued[] */
-       struct bio_list         bios;           /* queued bios */
-       struct throtl_grp       *tg;            /* tg this qnode belongs to */
-};
-
-struct throtl_service_queue {
-       struct throtl_service_queue *parent_sq; /* the parent service_queue */
-
-       /*
-        * Bios queued directly to this service_queue or dispatched from
-        * children throtl_grp's.
-        */
-       struct list_head        queued[2];      /* throtl_qnode [READ/WRITE] */
-       unsigned int            nr_queued[2];   /* number of queued bios */
-
-       /*
-        * RB tree of active children throtl_grp's, which are sorted by
-        * their ->disptime.
-        */
-       struct rb_root_cached   pending_tree;   /* RB tree of active tgs */
-       unsigned int            nr_pending;     /* # queued in the tree */
-       unsigned long           first_pending_disptime; /* disptime of the first tg */
-       struct timer_list       pending_timer;  /* fires on first_pending_disptime */
-};
-
 enum tg_state_flags {
        THROTL_TG_PENDING       = 1 << 0,       /* on parent's pending tree */
        THROTL_TG_WAS_EMPTY     = 1 << 1,       /* bio_lists[] became non-empty */
@@ -98,93 +48,6 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
 
-enum {
-       LIMIT_LOW,
-       LIMIT_MAX,
-       LIMIT_CNT,
-};
-
-struct throtl_grp {
-       /* must be the first member */
-       struct blkg_policy_data pd;
-
-       /* active throtl group service_queue member */
-       struct rb_node rb_node;
-
-       /* throtl_data this group belongs to */
-       struct throtl_data *td;
-
-       /* this group's service queue */
-       struct throtl_service_queue service_queue;
-
-       /*
-        * qnode_on_self is used when bios are directly queued to this
-        * throtl_grp so that local bios compete fairly with bios
-        * dispatched from children.  qnode_on_parent is used when bios are
-        * dispatched from this throtl_grp into its parent and will compete
-        * with the sibling qnode_on_parents and the parent's
-        * qnode_on_self.
-        */
-       struct throtl_qnode qnode_on_self[2];
-       struct throtl_qnode qnode_on_parent[2];
-
-       /*
-        * Dispatch time in jiffies. This is the estimated time when group
-        * will unthrottle and is ready to dispatch more bio. It is used as
-        * key to sort active groups in service tree.
-        */
-       unsigned long disptime;
-
-       unsigned int flags;
-
-       /* are there any throtl rules between this group and td? */
-       bool has_rules[2];
-
-       /* internally used bytes per second rate limits */
-       uint64_t bps[2][LIMIT_CNT];
-       /* user configured bps limits */
-       uint64_t bps_conf[2][LIMIT_CNT];
-
-       /* internally used IOPS limits */
-       unsigned int iops[2][LIMIT_CNT];
-       /* user configured IOPS limits */
-       unsigned int iops_conf[2][LIMIT_CNT];
-
-       /* Number of bytes dispatched in current slice */
-       uint64_t bytes_disp[2];
-       /* Number of bio's dispatched in current slice */
-       unsigned int io_disp[2];
-
-       unsigned long last_low_overflow_time[2];
-
-       uint64_t last_bytes_disp[2];
-       unsigned int last_io_disp[2];
-
-       unsigned long last_check_time;
-
-       unsigned long latency_target; /* us */
-       unsigned long latency_target_conf; /* us */
-       /* When did we start a new slice */
-       unsigned long slice_start[2];
-       unsigned long slice_end[2];
-
-       unsigned long last_finish_time; /* ns / 1024 */
-       unsigned long checked_last_finish_time; /* ns / 1024 */
-       unsigned long avg_idletime; /* ns / 1024 */
-       unsigned long idletime_threshold; /* us */
-       unsigned long idletime_threshold_conf; /* us */
-
-       unsigned int bio_cnt; /* total bios */
-       unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
-       unsigned long bio_cnt_reset_time;
-
-       atomic_t io_split_cnt[2];
-       atomic_t last_io_split_cnt[2];
-
-       struct blkg_rwstat stat_bytes;
-       struct blkg_rwstat stat_ios;
-};
-
 /* We measure latency for request size from <= 4k to >= 1M */
 #define LATENCY_BUCKET_SIZE 9
 
@@ -231,16 +94,6 @@ struct throtl_data
 
 static void throtl_pending_timer_fn(struct timer_list *t);
 
-static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
-{
-       return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
-}
-
-static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
-{
-       return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
-}
-
 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
 {
        return pd_to_blkg(&tg->pd);
@@ -1794,7 +1647,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
        cancel_work_sync(&td->dispatch_work);
 }
 
-static struct blkcg_policy blkcg_policy_throtl = {
+struct blkcg_policy blkcg_policy_throtl = {
        .dfl_cftypes            = throtl_files,
        .legacy_cftypes         = throtl_legacy_files,
 
@@ -2208,9 +2061,9 @@ void blk_throtl_charge_bio_split(struct bio *bio)
        } while (parent);
 }
 
-bool blk_throtl_bio(struct bio *bio)
+bool __blk_throtl_bio(struct bio *bio)
 {
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        struct blkcg_gq *blkg = bio->bi_blkg;
        struct throtl_qnode *qn = NULL;
        struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -2221,19 +2074,12 @@ bool blk_throtl_bio(struct bio *bio)
 
        rcu_read_lock();
 
-       /* see throtl_charge_bio() */
-       if (bio_flagged(bio, BIO_THROTTLED))
-               goto out;
-
        if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
                blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
                                bio->bi_iter.bi_size);
                blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
        }
 
-       if (!tg->has_rules[rw])
-               goto out;
-
        spin_lock_irq(&q->queue_lock);
 
        throtl_update_latency_buckets(td);
@@ -2317,7 +2163,6 @@ again:
 
 out_unlock:
        spin_unlock_irq(&q->queue_lock);
-out:
        bio_set_flag(bio, BIO_THROTTLED);
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
new file mode 100644
index 0000000..175f03a
--- /dev/null
@@ -0,0 +1,182 @@
+#ifndef BLK_THROTTLE_H
+#define BLK_THROTTLE_H
+
+#include "blk-cgroup-rwstat.h"
+
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued.  When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from.  When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's.  A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+       struct list_head        node;           /* service_queue->queued[] */
+       struct bio_list         bios;           /* queued bios */
+       struct throtl_grp       *tg;            /* tg this qnode belongs to */
+};
+
+struct throtl_service_queue {
+       struct throtl_service_queue *parent_sq; /* the parent service_queue */
+
+       /*
+        * Bios queued directly to this service_queue or dispatched from
+        * children throtl_grp's.
+        */
+       struct list_head        queued[2];      /* throtl_qnode [READ/WRITE] */
+       unsigned int            nr_queued[2];   /* number of queued bios */
+
+       /*
+        * RB tree of active children throtl_grp's, which are sorted by
+        * their ->disptime.
+        */
+       struct rb_root_cached   pending_tree;   /* RB tree of active tgs */
+       unsigned int            nr_pending;     /* # queued in the tree */
+       unsigned long           first_pending_disptime; /* disptime of the first tg */
+       struct timer_list       pending_timer;  /* fires on first_pending_disptime */
+};
+
+enum {
+       LIMIT_LOW,
+       LIMIT_MAX,
+       LIMIT_CNT,
+};
+
+struct throtl_grp {
+       /* must be the first member */
+       struct blkg_policy_data pd;
+
+       /* active throtl group service_queue member */
+       struct rb_node rb_node;
+
+       /* throtl_data this group belongs to */
+       struct throtl_data *td;
+
+       /* this group's service queue */
+       struct throtl_service_queue service_queue;
+
+       /*
+        * qnode_on_self is used when bios are directly queued to this
+        * throtl_grp so that local bios compete fairly with bios
+        * dispatched from children.  qnode_on_parent is used when bios are
+        * dispatched from this throtl_grp into its parent and will compete
+        * with the sibling qnode_on_parents and the parent's
+        * qnode_on_self.
+        */
+       struct throtl_qnode qnode_on_self[2];
+       struct throtl_qnode qnode_on_parent[2];
+
+       /*
+        * Dispatch time in jiffies. This is the estimated time when group
+        * will unthrottle and is ready to dispatch more bio. It is used as
+        * key to sort active groups in service tree.
+        */
+       unsigned long disptime;
+
+       unsigned int flags;
+
+       /* are there any throtl rules between this group and td? */
+       bool has_rules[2];
+
+       /* internally used bytes per second rate limits */
+       uint64_t bps[2][LIMIT_CNT];
+       /* user configured bps limits */
+       uint64_t bps_conf[2][LIMIT_CNT];
+
+       /* internally used IOPS limits */
+       unsigned int iops[2][LIMIT_CNT];
+       /* user configured IOPS limits */
+       unsigned int iops_conf[2][LIMIT_CNT];
+
+       /* Number of bytes dispatched in current slice */
+       uint64_t bytes_disp[2];
+       /* Number of bio's dispatched in current slice */
+       unsigned int io_disp[2];
+
+       unsigned long last_low_overflow_time[2];
+
+       uint64_t last_bytes_disp[2];
+       unsigned int last_io_disp[2];
+
+       unsigned long last_check_time;
+
+       unsigned long latency_target; /* us */
+       unsigned long latency_target_conf; /* us */
+       /* When did we start a new slice */
+       unsigned long slice_start[2];
+       unsigned long slice_end[2];
+
+       unsigned long last_finish_time; /* ns / 1024 */
+       unsigned long checked_last_finish_time; /* ns / 1024 */
+       unsigned long avg_idletime; /* ns / 1024 */
+       unsigned long idletime_threshold; /* us */
+       unsigned long idletime_threshold_conf; /* us */
+
+       unsigned int bio_cnt; /* total bios */
+       unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+       unsigned long bio_cnt_reset_time;
+
+       atomic_t io_split_cnt[2];
+       atomic_t last_io_split_cnt[2];
+
+       struct blkg_rwstat stat_bytes;
+       struct blkg_rwstat stat_ios;
+};
+
+extern struct blkcg_policy blkcg_policy_throtl;
+
+static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
+{
+       return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
+}
+
+static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
+{
+       return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
+}
+
+/*
+ * Internal throttling interface
+ */
+#ifndef CONFIG_BLK_DEV_THROTTLING
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
+static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+#else /* CONFIG_BLK_DEV_THROTTLING */
+int blk_throtl_init(struct request_queue *q);
+void blk_throtl_exit(struct request_queue *q);
+void blk_throtl_register_queue(struct request_queue *q);
+void blk_throtl_charge_bio_split(struct bio *bio);
+bool __blk_throtl_bio(struct bio *bio);
+static inline bool blk_throtl_bio(struct bio *bio)
+{
+       struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
+
+       if (bio_flagged(bio, BIO_THROTTLED))
+               return false;
+       if (!tg->has_rules[bio_data_dir(bio)])
+               return false;
+
+       return __blk_throtl_bio(bio);
+}
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
+#endif
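The pd_to_tg()/blkg_to_tg() helpers above are plain container_of() lookups: given a pointer to the embedded blkg_policy_data, they recover the enclosing throtl_grp. A minimal user-space sketch of the same pattern, with illustrative stand-in types (not kernel API):

#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-ins for blkg_policy_data / throtl_grp. */
struct policy_data { int id; };

struct group {
        struct policy_data pd;   /* must be the first member, as in throtl_grp */
        unsigned long disptime;
};

/* Same idea as the kernel's container_of(): subtract the member offset. */
#define container_of_demo(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static struct group *pd_to_group(struct policy_data *pd)
{
        return pd ? container_of_demo(pd, struct group, pd) : NULL;
}

int main(void)
{
        struct group g = { .pd = { .id = 42 }, .disptime = 100 };
        struct policy_data *pd = &g.pd;

        /* Recover the enclosing group from the embedded member. */
        printf("disptime=%lu id=%d\n",
               pd_to_group(pd)->disptime, pd_to_group(pd)->id);
        return 0;
}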
index 874c1c3..0c119be 100644 (file)
@@ -357,6 +357,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
        unsigned int inflight = wbt_inflight(rwb);
        int status;
 
+       if (!rwb->rqos.q->disk)
+               return;
+
        status = latency_exceeded(rwb, cb->stat);
 
        trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step,
index 6c3c00a..7afffd5 100644 (file)
@@ -12,6 +12,8 @@
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
 
+struct elevator_type;
+
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT                (5 * HZ)
 
@@ -94,6 +96,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
        return __bvec_gap_to_prev(q, bprv, offset);
 }
 
+static inline bool rq_mergeable(struct request *rq)
+{
+       if (blk_rq_is_passthrough(rq))
+               return false;
+
+       if (req_op(rq) == REQ_OP_FLUSH)
+               return false;
+
+       if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+               return false;
+
+       if (req_op(rq) == REQ_OP_ZONE_APPEND)
+               return false;
+
+       if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
+               return false;
+       if (rq->rq_flags & RQF_NOMERGE_FLAGS)
+               return false;
+
+       return true;
+}
+
+/*
+ * There are two different ways to handle DISCARD merges:
+ *  1) If max_discard_segments > 1, the driver treats every bio as a range and
+ *     sends the bios to the controller together. The ranges don't need to be
+ *     contiguous.
+ *  2) Otherwise, the request is handled as a normal read/write request, so the
+ *     ranges need to be contiguous.
+ */
+static inline bool blk_discard_mergable(struct request *req)
+{
+       if (req_op(req) == REQ_OP_DISCARD &&
+           queue_max_discard_segments(req->q) > 1)
+               return true;
+       return false;
+}
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 void blk_flush_integrity(void);
 bool __bio_integrity_endio(struct bio *);
@@ -175,21 +215,28 @@ static inline void blk_integrity_del(struct gendisk *disk)
 
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
+void blk_print_req_error(struct request *req, blk_status_t status);
 
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-               unsigned int nr_segs, struct request **same_queue_rq);
+               unsigned int nr_segs, bool *same_queue_rq);
 bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
                        struct bio *bio, unsigned int nr_segs);
 
-void blk_account_io_start(struct request *req);
-void blk_account_io_done(struct request *req, u64 now);
+void __blk_account_io_start(struct request *req);
+void __blk_account_io_done(struct request *req, u64 now);
+
+/*
+ * Plug flush limits
+ */
+#define BLK_MAX_REQUEST_COUNT  32
+#define BLK_PLUG_FLUSH_SIZE    (128 * 1024)
 
 /*
  * Internal elevator interface
  */
 #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
 
-void blk_insert_flush(struct request *rq);
+bool blk_insert_flush(struct request *rq);
 
 int elevator_switch_mq(struct request_queue *q,
                              struct elevator_type *new_e);
@@ -202,7 +249,7 @@ static inline void elevator_exit(struct request_queue *q,
 {
        lockdep_assert_held(&q->sysfs_lock);
 
-       blk_mq_sched_free_requests(q);
+       blk_mq_sched_free_rqs(q);
        __elevator_exit(q, e);
 }
 
@@ -220,7 +267,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
 ssize_t part_timeout_store(struct device *, struct device_attribute *,
                                const char *, size_t);
 
-void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
+static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+{
+       switch (bio_op(bio)) {
+       case REQ_OP_DISCARD:
+       case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_ZEROES:
+       case REQ_OP_WRITE_SAME:
+               return true; /* non-trivial splitting decisions */
+       default:
+               break;
+       }
+
+       /*
+        * All drivers must accept single-segment bios that are <= PAGE_SIZE.
+        * This is a quick and dirty check that relies on the fact that
+        * bi_io_vec[0] is always valid if a bio has data.  The check might
+        * lead to occasional false negatives when bios are cloned, but compared
+        * to the performance impact of cloned bios themselves this extra check
+        * doesn't matter anyway.
+        */
+       return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
+               bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
+}
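The fast path of blk_may_split() boils down to one arithmetic test: on a queue without chunk_sectors, a bio with a single vector whose data does not cross the PAGE_SIZE boundary never needs splitting. A small user-space sketch of that check, with PAGE_SIZE and the vector fields stood in by plain parameters (illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096u   /* stand-in for PAGE_SIZE */

/*
 * Mirror of the cheap check: only one segment, and offset + length
 * still fits inside a single page, so no split is needed.
 */
static bool needs_split(unsigned int nr_vecs, unsigned int bv_len,
                        unsigned int bv_offset, bool has_chunk_sectors)
{
        return has_chunk_sectors || nr_vecs != 1 ||
               bv_len + bv_offset > DEMO_PAGE_SIZE;
}

int main(void)
{
        printf("%d\n", needs_split(1, 4096, 0, false));   /* 0: fits in a page */
        printf("%d\n", needs_split(1, 4096, 512, false)); /* 1: crosses a page */
        printf("%d\n", needs_split(2, 512, 0, false));    /* 1: multiple segments */
        return 0;
}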
+
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+                       unsigned int *nr_segs);
 int ll_back_merge_fn(struct request *req, struct bio *bio,
                unsigned int nr_segs);
 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -240,7 +312,25 @@ int blk_dev_init(void);
  */
 static inline bool blk_do_io_stat(struct request *rq)
 {
-       return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT);
+       return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk;
+}
+
+static inline void blk_account_io_done(struct request *req, u64 now)
+{
+       /*
+        * Account IO completion.  flush_rq isn't accounted as a
+        * normal IO on either queueing or completion.  Accounting the
+        * containing request is enough.
+        */
+       if (blk_do_io_stat(req) && req->part &&
+           !(req->rq_flags & RQF_FLUSH_SEQ))
+               __blk_account_io_done(req, now);
+}
+
+static inline void blk_account_io_start(struct request *req)
+{
+       if (blk_do_io_stat(req))
+               __blk_account_io_start(req);
 }
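blk_account_io_start()/blk_account_io_done() become inline wrappers that test the cheap blk_do_io_stat() condition before calling the out-of-line __blk_account_io_*() slow paths. A user-space sketch of the same "inline gate, out-of-line body" split; the names and flag value here are illustrative, not kernel API:

#include <stdio.h>

#define RQF_IO_STAT_DEMO 0x1u   /* stand-in for the RQF_IO_STAT flag */

struct demo_request {
        unsigned int rq_flags;
        unsigned long bytes;
};

/* Out-of-line slow path: only reached when accounting is enabled. */
static void __account_io_done(struct demo_request *rq)
{
        printf("accounted %lu bytes\n", rq->bytes);
}

/* Inline gate: the common "stats disabled" case costs one flag test. */
static inline void account_io_done(struct demo_request *rq)
{
        if (rq->rq_flags & RQF_IO_STAT_DEMO)
                __account_io_done(rq);
}

int main(void)
{
        struct demo_request on  = { .rq_flags = RQF_IO_STAT_DEMO, .bytes = 4096 };
        struct demo_request off = { .rq_flags = 0, .bytes = 4096 };

        account_io_done(&on);   /* prints */
        account_io_done(&off);  /* no-op */
        return 0;
}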
 
 static inline void req_set_nomerge(struct request_queue *q, struct request *req)
@@ -285,22 +375,6 @@ void ioc_clear_queue(struct request_queue *q);
 
 int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
 
-/*
- * Internal throttling interface
- */
-#ifdef CONFIG_BLK_DEV_THROTTLING
-extern int blk_throtl_init(struct request_queue *q);
-extern void blk_throtl_exit(struct request_queue *q);
-extern void blk_throtl_register_queue(struct request_queue *q);
-extern void blk_throtl_charge_bio_split(struct bio *bio);
-bool blk_throtl_bio(struct bio *bio);
-#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline int blk_throtl_init(struct request_queue *q) { return 0; }
-static inline void blk_throtl_exit(struct request_queue *q) { }
-static inline void blk_throtl_register_queue(struct request_queue *q) { }
-static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
-static inline bool blk_throtl_bio(struct bio *bio) { return false; }
-#endif /* CONFIG_BLK_DEV_THROTTLING */
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
 extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@@ -368,13 +442,20 @@ extern struct device_attribute dev_attr_events;
 extern struct device_attribute dev_attr_events_async;
 extern struct device_attribute dev_attr_events_poll_msecs;
 
-static inline void bio_clear_hipri(struct bio *bio)
+static inline void bio_clear_polled(struct bio *bio)
 {
        /* can't support alloc cache if we turn off polling */
        bio_clear_flag(bio, BIO_PERCPU_CACHE);
-       bio->bi_opf &= ~REQ_HIPRI;
+       bio->bi_opf &= ~REQ_POLLED;
 }
 
+long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+
 extern const struct address_space_operations def_blk_aops;
 
+int disk_register_independent_access_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *new_iars);
+void disk_unregister_independent_access_ranges(struct gendisk *disk);
+
 #endif /* BLK_INTERNAL_H */
index 05fc714..7af1a72 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
 #include <linux/blkdev.h>
+#include <linux/blk-cgroup.h>
 #include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/hash.h>
index ccb9827..10aa378 100644 (file)
@@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
        struct bsg_job *job;
        struct request *rq;
        struct bio *bio;
+       void *reply;
        int ret;
 
        if (hdr->protocol != BSG_PROTOCOL_SCSI  ||
@@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;
 
-       rq = blk_get_request(q, hdr->dout_xfer_len ?
+       rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ?
                             REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        rq->timeout = timeout;
 
        job = blk_mq_rq_to_pdu(rq);
+       reply = job->reply;
+       memset(job, 0, sizeof(*job));
+       job->reply = reply;
+       job->reply_len = SCSI_SENSE_BUFFERSIZE;
+       job->dd_data = job + 1;
+
        job->request_len = hdr->request_len;
        job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
        if (IS_ERR(job->request)) {
                ret = PTR_ERR(job->request);
-               goto out_put_request;
+               goto out_free_rq;
        }
 
        if (hdr->dout_xfer_len && hdr->din_xfer_len) {
-               job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0);
+               job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0);
                if (IS_ERR(job->bidi_rq)) {
                        ret = PTR_ERR(job->bidi_rq);
                        goto out_free_job_request;
@@ -134,11 +141,11 @@ out_unmap_bidi_rq:
                blk_rq_unmap_user(job->bidi_bio);
 out_free_bidi_rq:
        if (job->bidi_rq)
-               blk_put_request(job->bidi_rq);
+               blk_mq_free_request(job->bidi_rq);
 out_free_job_request:
        kfree(job->request);
-out_put_request:
-       blk_put_request(rq);
+out_free_rq:
+       blk_mq_free_request(rq);
        return ret;
 }
 
@@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
        return 0;
 }
 
-/* called right before the request is given to the request_queue user */
-static void bsg_initialize_rq(struct request *req)
-{
-       struct bsg_job *job = blk_mq_rq_to_pdu(req);
-       void *reply = job->reply;
-
-       memset(job, 0, sizeof(*job));
-       job->reply = reply;
-       job->reply_len = SCSI_SENSE_BUFFERSIZE;
-       job->dd_data = job + 1;
-}
-
 static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
                       unsigned int hctx_idx)
 {
@@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = {
        .queue_rq               = bsg_queue_rq,
        .init_request           = bsg_init_rq,
        .exit_request           = bsg_exit_rq,
-       .initialize_rq_fn       = bsg_initialize_rq,
        .complete               = bsg_complete,
        .timeout                = bsg_timeout,
 };
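With .initialize_rq_fn gone, bsg_transport_sg_io_fn() now clears the per-request pdu itself while keeping the preallocated reply buffer: save the pointer, memset() the structure, restore it. A tiny user-space sketch of that save/clear/restore pattern; the types, field names and sizes here are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_job {
        void *reply;            /* preallocated elsewhere; must survive reset */
        unsigned int reply_len;
        unsigned int request_len;
};

/* Reset the job for reuse without losing the preallocated reply buffer. */
static void reset_job(struct demo_job *job)
{
        void *reply = job->reply;       /* save */

        memset(job, 0, sizeof(*job));   /* clear everything */
        job->reply = reply;             /* restore */
        job->reply_len = 96;            /* arbitrary demo size */
}

int main(void)
{
        struct demo_job job = { .reply = malloc(96), .request_len = 123 };

        reset_job(&job);
        printf("request_len=%u reply=%p\n", job.request_len, job.reply);
        free(job.reply);
        return 0;
}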
index ff45d83..1f39f6e 100644 (file)
@@ -26,7 +26,6 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
-#include <linux/elevator.h>
 #include <linux/bio.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -40,6 +39,7 @@
 
 #include <trace/events/block.h>
 
+#include "elevator.h"
 #include "blk.h"
 #include "blk-mq-sched.h"
 #include "blk-pm.h"
@@ -637,7 +637,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
                return NULL;
 
        if (q->nr_hw_queues != 1 &&
-                       !blk_mq_is_sbitmap_shared(q->tag_set->flags))
+           !blk_mq_is_shared_tags(q->tag_set->flags))
                return NULL;
 
        return elevator_get(q, "mq-deadline", false);
similarity index 92%
rename from include/linux/elevator.h
rename to block/elevator.h
index ef9ceea..16cd8bd 100644 (file)
@@ -1,17 +1,13 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_ELEVATOR_H
-#define _LINUX_ELEVATOR_H
+#ifndef _ELEVATOR_H
+#define _ELEVATOR_H
 
 #include <linux/percpu.h>
 #include <linux/hashtable.h>
 
-#ifdef CONFIG_BLOCK
-
 struct io_cq;
 struct elevator_type;
-#ifdef CONFIG_BLK_DEBUG_FS
 struct blk_mq_debugfs_attr;
-#endif
 
 /*
  * Return values from elevator merger
@@ -162,20 +158,9 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_FLUSH  5
 #define ELEVATOR_INSERT_SORT_MERGE     6
 
-#define rq_end_sector(rq)      (blk_rq_pos(rq) + blk_rq_sectors(rq))
 #define rb_entry_rq(node)      rb_entry((node), struct request, rb_node)
 
 #define rq_entry_fifo(ptr)     list_entry((ptr), struct request, queuelist)
 #define rq_fifo_clear(rq)      list_del_init(&(rq)->queuelist)
 
-/*
- * Elevator features.
- */
-
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE       (1U << 0)
-/* Supports scheduling on multiple hardware queues */
-#define ELEVATOR_F_MQ_AWARE            (1U << 1)
-
-#endif /* CONFIG_BLOCK */
-#endif
+#endif /* _ELEVATOR_H */
index 1e970c2..4e22b07 100644 (file)
@@ -17,7 +17,7 @@
 #include <linux/fs.h>
 #include "blk.h"
 
-static struct inode *bdev_file_inode(struct file *file)
+static inline struct inode *bdev_file_inode(struct file *file)
 {
        return file->f_mapping->host;
 }
@@ -54,14 +54,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
                struct iov_iter *iter, unsigned int nr_pages)
 {
-       struct file *file = iocb->ki_filp;
-       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+       struct block_device *bdev = iocb->ki_filp->private_data;
        struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
        loff_t pos = iocb->ki_pos;
        bool should_dirty = false;
        struct bio bio;
        ssize_t ret;
-       blk_qc_t qc;
 
        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
@@ -78,7 +76,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 
        bio_init(&bio, vecs, nr_pages);
        bio_set_dev(&bio, bdev);
-       bio.bi_iter.bi_sector = pos >> 9;
+       bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
        bio.bi_write_hint = iocb->ki_hint;
        bio.bi_private = current;
        bio.bi_end_io = blkdev_bio_end_io_simple;
@@ -102,13 +100,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
        if (iocb->ki_flags & IOCB_HIPRI)
                bio_set_polled(&bio, iocb);
 
-       qc = submit_bio(&bio);
+       submit_bio(&bio);
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio.bi_private))
                        break;
-               if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_poll(bdev_get_queue(bdev), qc, true))
+               if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
@@ -126,6 +123,11 @@ out:
        return ret;
 }
 
+enum {
+       DIO_SHOULD_DIRTY        = 1,
+       DIO_IS_SYNC             = 2,
+};
+
 struct blkdev_dio {
        union {
                struct kiocb            *iocb;
@@ -133,35 +135,27 @@ struct blkdev_dio {
        };
        size_t                  size;
        atomic_t                ref;
-       bool                    multi_bio : 1;
-       bool                    should_dirty : 1;
-       bool                    is_sync : 1;
-       struct bio              bio;
+       unsigned int            flags;
+       struct bio              bio ____cacheline_aligned_in_smp;
 };
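struct blkdev_dio drops its three bool bitfields in favour of a single flags word tested against DIO_IS_SYNC and DIO_SHOULD_DIRTY. A user-space sketch of the flag manipulation this enables; the constants mirror the enum above, everything else is illustrative:

#include <stddef.h>
#include <stdio.h>

enum {
        DIO_SHOULD_DIRTY = 1,
        DIO_IS_SYNC      = 2,
};

struct demo_dio {
        unsigned int flags;
        size_t size;
};

int main(void)
{
        struct demo_dio dio = { .flags = 0 };

        /* Equivalent of the old dio->is_sync = true / should_dirty = true. */
        dio.flags |= DIO_IS_SYNC;
        dio.flags |= DIO_SHOULD_DIRTY;

        if (dio.flags & DIO_IS_SYNC)
                printf("synchronous dio\n");
        if (dio.flags & DIO_SHOULD_DIRTY)
                printf("pages will be dirtied on completion\n");
        return 0;
}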
 
 static struct bio_set blkdev_dio_pool;
 
-static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
-{
-       struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
-       struct request_queue *q = bdev_get_queue(bdev);
-
-       return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
-}
-
 static void blkdev_bio_end_io(struct bio *bio)
 {
        struct blkdev_dio *dio = bio->bi_private;
-       bool should_dirty = dio->should_dirty;
+       bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
 
        if (bio->bi_status && !dio->bio.bi_status)
                dio->bio.bi_status = bio->bi_status;
 
-       if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
-               if (!dio->is_sync) {
+       if (atomic_dec_and_test(&dio->ref)) {
+               if (!(dio->flags & DIO_IS_SYNC)) {
                        struct kiocb *iocb = dio->iocb;
                        ssize_t ret;
 
+                       WRITE_ONCE(iocb->private, NULL);
+
                        if (likely(!dio->bio.bi_status)) {
                                ret = dio->size;
                                iocb->ki_pos += ret;
@@ -169,9 +163,8 @@ static void blkdev_bio_end_io(struct bio *bio)
                                ret = blk_status_to_errno(dio->bio.bi_status);
                        }
 
-                       dio->iocb->ki_complete(iocb, ret, 0);
-                       if (dio->multi_bio)
-                               bio_put(&dio->bio);
+                       dio->iocb->ki_complete(iocb, ret);
+                       bio_put(&dio->bio);
                } else {
                        struct task_struct *waiter = dio->waiter;
 
@@ -191,16 +184,12 @@ static void blkdev_bio_end_io(struct bio *bio)
 static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                unsigned int nr_pages)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = bdev_file_inode(file);
-       struct block_device *bdev = I_BDEV(inode);
+       struct block_device *bdev = iocb->ki_filp->private_data;
        struct blk_plug plug;
        struct blkdev_dio *dio;
        struct bio *bio;
-       bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
        bool is_read = (iov_iter_rw(iter) == READ), is_sync;
        loff_t pos = iocb->ki_pos;
-       blk_qc_t qc = BLK_QC_T_NONE;
        int ret = 0;
 
        if ((pos | iov_iter_alignment(iter)) &
@@ -210,28 +199,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
        bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
 
        dio = container_of(bio, struct blkdev_dio, bio);
-       dio->is_sync = is_sync = is_sync_kiocb(iocb);
-       if (dio->is_sync) {
+       atomic_set(&dio->ref, 1);
+       /*
+        * Grab an extra reference to ensure the dio structure, which is embedded
+        * into the first bio, stays around.
+        */
+       bio_get(bio);
+
+       is_sync = is_sync_kiocb(iocb);
+       if (is_sync) {
+               dio->flags = DIO_IS_SYNC;
                dio->waiter = current;
-               bio_get(bio);
        } else {
+               dio->flags = 0;
                dio->iocb = iocb;
        }
 
        dio->size = 0;
-       dio->multi_bio = false;
-       dio->should_dirty = is_read && iter_is_iovec(iter);
+       if (is_read && iter_is_iovec(iter))
+               dio->flags |= DIO_SHOULD_DIRTY;
 
-       /*
-        * Don't plug for HIPRI/polled IO, as those should go straight
-        * to issue
-        */
-       if (!is_poll)
-               blk_start_plug(&plug);
+       blk_start_plug(&plug);
 
        for (;;) {
                bio_set_dev(bio, bdev);
-               bio->bi_iter.bi_sector = pos >> 9;
+               bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
                bio->bi_write_hint = iocb->ki_hint;
                bio->bi_private = dio;
                bio->bi_end_io = blkdev_bio_end_io;
@@ -246,7 +238,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
                if (is_read) {
                        bio->bi_opf = REQ_OP_READ;
-                       if (dio->should_dirty)
+                       if (dio->flags & DIO_SHOULD_DIRTY)
                                bio_set_pages_dirty(bio);
                } else {
                        bio->bi_opf = dio_bio_write_op(iocb);
@@ -260,40 +252,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
                nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
                if (!nr_pages) {
-                       bool polled = false;
-
-                       if (iocb->ki_flags & IOCB_HIPRI) {
-                               bio_set_polled(bio, iocb);
-                               polled = true;
-                       }
-
-                       qc = submit_bio(bio);
-
-                       if (polled)
-                               WRITE_ONCE(iocb->ki_cookie, qc);
+                       submit_bio(bio);
                        break;
                }
-
-               if (!dio->multi_bio) {
-                       /*
-                        * AIO needs an extra reference to ensure the dio
-                        * structure which is embedded into the first bio
-                        * stays around.
-                        */
-                       if (!is_sync)
-                               bio_get(bio);
-                       dio->multi_bio = true;
-                       atomic_set(&dio->ref, 2);
-               } else {
-                       atomic_inc(&dio->ref);
-               }
-
+               atomic_inc(&dio->ref);
                submit_bio(bio);
                bio = bio_alloc(GFP_KERNEL, nr_pages);
        }
 
-       if (!is_poll)
-               blk_finish_plug(&plug);
+       blk_finish_plug(&plug);
 
        if (!is_sync)
                return -EIOCBQUEUED;
@@ -302,10 +269,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(dio->waiter))
                        break;
-
-               if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_poll(bdev_get_queue(bdev), qc, true))
-                       blk_io_schedule();
+               blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
 
@@ -318,6 +282,94 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
        return ret;
 }
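__blkdev_direct_IO() now always starts dio->ref at 1, takes one extra bio_get() so the dio embedded in the first bio outlives the submitter, and bumps the count once per additional bio; the end_io handler finishes the I/O only when atomic_dec_and_test() reaches zero. A user-space sketch of that reference-counting shape using C11 atomics, purely illustrative:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ref;

/* Completion side: the last dropped reference performs the final work. */
static void complete_one(void)
{
        if (atomic_fetch_sub(&ref, 1) == 1)
                printf("last reference dropped, run completion\n");
}

int main(void)
{
        int nbios = 3;

        atomic_store(&ref, 1);              /* reference held by the first bio */
        for (int i = 1; i < nbios; i++)
                atomic_fetch_add(&ref, 1);  /* one more per extra bio */

        for (int i = 0; i < nbios; i++)
                complete_one();             /* only the third call finishes */
        return 0;
}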
 
+static void blkdev_bio_end_io_async(struct bio *bio)
+{
+       struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
+       struct kiocb *iocb = dio->iocb;
+       ssize_t ret;
+
+       if (likely(!bio->bi_status)) {
+               ret = dio->size;
+               iocb->ki_pos += ret;
+       } else {
+               ret = blk_status_to_errno(bio->bi_status);
+       }
+
+       iocb->ki_complete(iocb, ret);
+
+       if (dio->flags & DIO_SHOULD_DIRTY) {
+               bio_check_pages_dirty(bio);
+       } else {
+               bio_release_pages(bio, false);
+               bio_put(bio);
+       }
+}
+
+static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
+                                       struct iov_iter *iter,
+                                       unsigned int nr_pages)
+{
+       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct blkdev_dio *dio;
+       struct bio *bio;
+       loff_t pos = iocb->ki_pos;
+       int ret = 0;
+
+       if ((pos | iov_iter_alignment(iter)) &
+           (bdev_logical_block_size(bdev) - 1))
+               return -EINVAL;
+
+       bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+       dio = container_of(bio, struct blkdev_dio, bio);
+       dio->flags = 0;
+       dio->iocb = iocb;
+       bio_set_dev(bio, bdev);
+       bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+       bio->bi_write_hint = iocb->ki_hint;
+       bio->bi_end_io = blkdev_bio_end_io_async;
+       bio->bi_ioprio = iocb->ki_ioprio;
+
+       if (iov_iter_is_bvec(iter)) {
+               /*
+                * Users don't rely on the iterator being in any particular
+                * state for async I/O returning -EIOCBQUEUED, hence we can
+                * avoid expensive iov_iter_advance(). Bypass
+                * bio_iov_iter_get_pages() and set the bvec directly.
+                */
+               bio_iov_bvec_set(bio, iter);
+       } else {
+               ret = bio_iov_iter_get_pages(bio, iter);
+               if (unlikely(ret)) {
+                       bio->bi_status = BLK_STS_IOERR;
+                       bio_endio(bio);
+                       return ret;
+               }
+       }
+       dio->size = bio->bi_iter.bi_size;
+
+       if (iov_iter_rw(iter) == READ) {
+               bio->bi_opf = REQ_OP_READ;
+               if (iter_is_iovec(iter)) {
+                       dio->flags |= DIO_SHOULD_DIRTY;
+                       bio_set_pages_dirty(bio);
+               }
+       } else {
+               bio->bi_opf = dio_bio_write_op(iocb);
+               task_io_account_write(bio->bi_iter.bi_size);
+       }
+
+       if (iocb->ki_flags & IOCB_HIPRI) {
+               bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
+               submit_bio(bio);
+               WRITE_ONCE(iocb->private, bio);
+       } else {
+               if (iocb->ki_flags & IOCB_NOWAIT)
+                       bio->bi_opf |= REQ_NOWAIT;
+               submit_bio(bio);
+       }
+       return -EIOCBQUEUED;
+}
+
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
        unsigned int nr_pages;
@@ -326,9 +378,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                return 0;
 
        nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
-       if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
-               return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-
+       if (likely(nr_pages <= BIO_MAX_VECS)) {
+               if (is_sync_kiocb(iocb))
+                       return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+               return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+       }
        return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
 }
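All three direct-IO paths above reject a request whose position or buffer alignment is not a multiple of the device's logical block size. A user-space reminder of what that means for callers opening a block device with O_DIRECT; the device path and the 4096-byte alignment are illustrative choices:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        const char *dev = "/dev/sdX";   /* illustrative device node */
        int fd = open(dev, O_RDONLY | O_DIRECT);
        void *buf;

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Buffer, offset and length all aligned to the logical block size
         * (4096 covers most devices). */
        if (posix_memalign(&buf, 4096, 4096)) {
                close(fd);
                return 1;
        }
        if (pread(fd, buf, 4096, 0) < 0)
                perror("pread");

        free(buf);
        close(fd);
        return 0;
}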
 
@@ -405,8 +459,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
 static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                int datasync)
 {
-       struct inode *bd_inode = bdev_file_inode(filp);
-       struct block_device *bdev = I_BDEV(bd_inode);
+       struct block_device *bdev = filp->private_data;
        int error;
 
        error = file_write_and_wait_range(filp, start, end);
@@ -448,6 +501,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
        bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
+
+       filp->private_data = bdev;
        filp->f_mapping = bdev->bd_inode->i_mapping;
        filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
        return 0;
@@ -455,29 +510,12 @@ static int blkdev_open(struct inode *inode, struct file *filp)
 
 static int blkdev_close(struct inode *inode, struct file *filp)
 {
-       struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
+       struct block_device *bdev = filp->private_data;
 
        blkdev_put(bdev, filp->f_mode);
        return 0;
 }
 
-static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
-       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-       fmode_t mode = file->f_mode;
-
-       /*
-        * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
-        * to updated it before every ioctl.
-        */
-       if (file->f_flags & O_NDELAY)
-               mode |= FMODE_NDELAY;
-       else
-               mode &= ~FMODE_NDELAY;
-
-       return blkdev_ioctl(bdev, mode, cmd, arg);
-}
-
 /*
  * Write data to the block device.  Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -487,14 +525,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
  */
 static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *bd_inode = bdev_file_inode(file);
+       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct inode *bd_inode = bdev->bd_inode;
        loff_t size = i_size_read(bd_inode);
        struct blk_plug plug;
        size_t shorted = 0;
        ssize_t ret;
 
-       if (bdev_read_only(I_BDEV(bd_inode)))
+       if (bdev_read_only(bdev))
                return -EPERM;
 
        if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
@@ -526,24 +564,26 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *bd_inode = bdev_file_inode(file);
-       loff_t size = i_size_read(bd_inode);
+       struct block_device *bdev = iocb->ki_filp->private_data;
+       loff_t size = i_size_read(bdev->bd_inode);
        loff_t pos = iocb->ki_pos;
        size_t shorted = 0;
        ssize_t ret;
 
-       if (pos >= size)
-               return 0;
-
-       size -= pos;
-       if (iov_iter_count(to) > size) {
-               shorted = iov_iter_count(to) - size;
-               iov_iter_truncate(to, size);
+       if (unlikely(pos + iov_iter_count(to) > size)) {
+               if (pos >= size)
+                       return 0;
+               size -= pos;
+               if (iov_iter_count(to) > size) {
+                       shorted = iov_iter_count(to) - size;
+                       iov_iter_truncate(to, size);
+               }
        }
 
        ret = generic_file_read_iter(iocb, to);
-       iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+
+       if (unlikely(shorted))
+               iov_iter_reexpand(to, iov_iter_count(to) + shorted);
        return ret;
 }
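The reworked blkdev_read_iter() clamps a read that would run past the end of the device: if the start is already at or beyond the device size it returns 0, otherwise it trims the count and remembers how much was cut so the iterator can be re-expanded afterwards. The arithmetic in isolation, as a user-space sketch rather than kernel API:

#include <stdio.h>

/* Returns the number of bytes that may actually be read; *shorted gets the
 * amount trimmed off the caller's request, mirroring blkdev_read_iter(). */
static unsigned long long clamp_read(unsigned long long pos,
                                     unsigned long long count,
                                     unsigned long long dev_size,
                                     unsigned long long *shorted)
{
        *shorted = 0;
        if (pos + count > dev_size) {
                if (pos >= dev_size)
                        return 0;
                if (count > dev_size - pos) {
                        *shorted = count - (dev_size - pos);
                        count = dev_size - pos;
                }
        }
        return count;
}

int main(void)
{
        unsigned long long shorted;

        printf("%llu\n", clamp_read(4096, 8192, 8192, &shorted)); /* 4096 */
        printf("trimmed %llu\n", shorted);                        /* 4096 */
        printf("%llu\n", clamp_read(8192, 512, 8192, &shorted));  /* 0 */
        return 0;
}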
 
@@ -565,7 +605,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
                return -EOPNOTSUPP;
 
        /* Don't go off the end of the device. */
-       isize = i_size_read(bdev->bd_inode);
+       isize = bdev_nr_bytes(bdev);
        if (start >= isize)
                return -EINVAL;
        if (end >= isize) {
@@ -592,16 +632,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
        switch (mode) {
        case FALLOC_FL_ZERO_RANGE:
        case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
-               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-                                           GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
+               error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+                                            len >> SECTOR_SHIFT, GFP_KERNEL,
+                                            BLKDEV_ZERO_NOUNMAP);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
-               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-                                            GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
+               error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+                                            len >> SECTOR_SHIFT, GFP_KERNEL,
+                                            BLKDEV_ZERO_NOFALLBACK);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
-               error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
-                                            GFP_KERNEL, 0);
+               error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
+                                            len >> SECTOR_SHIFT, GFP_KERNEL, 0);
                break;
        default:
                error = -EOPNOTSUPP;
@@ -618,10 +660,10 @@ const struct file_operations def_blk_fops = {
        .llseek         = blkdev_llseek,
        .read_iter      = blkdev_read_iter,
        .write_iter     = blkdev_write_iter,
-       .iopoll         = blkdev_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .mmap           = generic_file_mmap,
        .fsync          = blkdev_fsync,
-       .unlocked_ioctl = block_ioctl,
+       .unlocked_ioctl = blkdev_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_blkdev_ioctl,
 #endif
index b498585..febaaa5 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/log2.h>
@@ -57,6 +58,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors)
 
        spin_lock(&bdev->bd_size_lock);
        i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+       bdev->bd_nr_sectors = sectors;
        spin_unlock(&bdev->bd_size_lock);
 }
 EXPORT_SYMBOL(set_capacity);
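set_capacity() now caches the size in bd_nr_sectors alongside the inode size, and later hunks in this series convert i_size_read(bdev->bd_inode) users to bdev_nr_bytes()/bdev_nr_sectors(). The two views are related by the fixed 512-byte sector, i.e. bytes = sectors << SECTOR_SHIFT with SECTOR_SHIFT = 9. A quick user-space check of that arithmetic (the sector count is an illustrative ~2 TB disk):

#include <stdio.h>

#define SECTOR_SHIFT 9   /* 512-byte sectors, as in the block layer */

int main(void)
{
        unsigned long long sectors = 3907029168ULL;  /* illustrative disk size */
        unsigned long long bytes = sectors << SECTOR_SHIFT;

        printf("%llu sectors = %llu bytes\n", sectors, bytes);
        printf("round trip: %llu sectors\n", bytes >> SECTOR_SHIFT);
        return 0;
}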
@@ -588,16 +590,6 @@ void del_gendisk(struct gendisk *disk)
         * Prevent new I/O from crossing bio_queue_enter().
         */
        blk_queue_start_drain(q);
-       blk_mq_freeze_queue_wait(q);
-
-       rq_qos_exit(q);
-       blk_sync_queue(q);
-       blk_flush_integrity();
-       /*
-        * Allow using passthrough request again after the queue is torn down.
-        */
-       blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
-       __blk_mq_unfreeze_queue(q, true);
 
        if (!(disk->flags & GENHD_FL_HIDDEN)) {
                sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
@@ -620,9 +612,41 @@ void del_gendisk(struct gendisk *disk)
                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
        pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
        device_del(disk_to_dev(disk));
+
+       blk_mq_freeze_queue_wait(q);
+
+       rq_qos_exit(q);
+       blk_sync_queue(q);
+       blk_flush_integrity();
+       /*
+        * Allow using passthrough request again after the queue is torn down.
+        */
+       blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
+       __blk_mq_unfreeze_queue(q, true);
+
 }
 EXPORT_SYMBOL(del_gendisk);
 
+/**
+ * invalidate_disk - invalidate the disk
+ * @disk: the struct gendisk to invalidate
+ *
+ * A helper to invalidate the disk. It will clean the disk's associated
+ * buffer/page caches and reset its internal state so that the disk
+ * can be reused by the drivers.
+ *
+ * Context: can sleep
+ */
+void invalidate_disk(struct gendisk *disk)
+{
+       struct block_device *bdev = disk->part0;
+
+       invalidate_bdev(bdev);
+       bdev->bd_inode->i_mapping->wb_err = 0;
+       set_capacity(disk, 0);
+}
+EXPORT_SYMBOL(invalidate_disk);
+
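A minimal sketch of how a hypothetical driver might use the new helper from a media-change path; the surrounding function is invented for illustration and only builds inside a kernel tree, while invalidate_disk() and set_capacity() are the APIs shown above:

/* Hypothetical driver fragment: drop cached data and shrink the disk to
 * zero while the backing media is swapped, then restore the new size. */
static void demo_swap_media(struct gendisk *disk, sector_t new_sectors)
{
        invalidate_disk(disk);          /* flush caches, capacity -> 0 */
        /* ... reprogram the hardware for the new media (driver specific) ... */
        set_capacity(disk, new_sectors);
}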
 /* sysfs access to bad-blocks list. */
 static ssize_t disk_badblocks_show(struct device *dev,
                                        struct device_attribute *attr,
@@ -882,7 +906,7 @@ ssize_t part_stat_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
 {
        struct block_device *bdev = dev_to_bdev(dev);
-       struct request_queue *q = bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bdev);
        struct disk_stats stat;
        unsigned int inflight;
 
@@ -926,7 +950,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
 {
        struct block_device *bdev = dev_to_bdev(dev);
-       struct request_queue *q = bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bdev);
        unsigned int inflight[2];
 
        if (queue_is_mq(q))
@@ -1266,6 +1290,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
        if (!disk->bdi)
                goto out_free_disk;
 
+       /* bdev_alloc() might need the queue, set before the first call */
+       disk->queue = q;
+
        disk->part0 = bdev_alloc(disk, 0);
        if (!disk->part0)
                goto out_free_bdi;
@@ -1281,7 +1308,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
        disk_to_dev(disk)->type = &disk_type;
        device_initialize(disk_to_dev(disk));
        inc_diskseq(disk);
-       disk->queue = q;
        q->disk = disk;
        lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
 #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
@@ -1386,12 +1412,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only)
 }
 EXPORT_SYMBOL(set_disk_ro);
 
-int bdev_read_only(struct block_device *bdev)
-{
-       return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
-}
-EXPORT_SYMBOL(bdev_read_only);
-
 void inc_diskseq(struct gendisk *disk)
 {
        disk->diskseq = atomic64_inc_return(&diskseq);
index 9dc0841..27cddce 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/genhd.h>
+#include <linux/slab.h>
 
 struct bd_holder_disk {
        struct list_head        list;
index eb0491e..d6af0ac 100644 (file)
@@ -132,7 +132,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
        if (len & 511)
                return -EINVAL;
 
-       if (start + len > i_size_read(bdev->bd_inode))
+       if (start + len > bdev_nr_bytes(bdev))
                return -EINVAL;
 
        err = truncate_bdev_range(bdev, mode, start, start + len - 1);
@@ -164,7 +164,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
                return -EINVAL;
        if (len & 511)
                return -EINVAL;
-       if (end >= (uint64_t)i_size_read(bdev->bd_inode))
+       if (end >= (uint64_t)bdev_nr_bytes(bdev))
                return -EINVAL;
        if (end < start)
                return -EINVAL;
@@ -538,12 +538,21 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
  *
  * New commands must be compatible and go into blkdev_common_ioctl
  */
-int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
-                       unsigned long arg)
+long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-       int ret;
-       loff_t size;
+       struct block_device *bdev = I_BDEV(file->f_mapping->host);
        void __user *argp = (void __user *)arg;
+       fmode_t mode = file->f_mode;
+       int ret;
+
+       /*
+        * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+        * to update it before every ioctl.
+        */
+       if (file->f_flags & O_NDELAY)
+               mode |= FMODE_NDELAY;
+       else
+               mode &= ~FMODE_NDELAY;
 
        switch (cmd) {
        /* These need separate implementations for the data structure */
@@ -560,10 +569,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
                return put_long(argp,
                        (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
        case BLKGETSIZE:
-               size = i_size_read(bdev->bd_inode);
-               if ((size >> 9) > ~0UL)
+               if (bdev_nr_sectors(bdev) > ~0UL)
                        return -EFBIG;
-               return put_ulong(argp, size >> 9);
+               return put_ulong(argp, bdev_nr_sectors(bdev));
 
        /* The data is compatible, but the command number is different */
        case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@@ -571,7 +579,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
        case BLKBSZSET:
                return blkdev_bszset(bdev, mode, argp);
        case BLKGETSIZE64:
-               return put_u64(argp, i_size_read(bdev->bd_inode));
+               return put_u64(argp, bdev_nr_bytes(bdev));
 
        /* Incompatible alignment on i386 */
        case BLKTRACESETUP:
@@ -588,7 +596,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
                return -ENOTTY;
        return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
 }
-EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */
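From user space, the behaviours referenced above look like this: O_NDELAY is toggled with fcntl(F_SETFL) and is re-derived by the kernel on the next ioctl, and BLKGETSIZE64 returns the device size in bytes as a u64. A small example; the device path is illustrative:

#include <fcntl.h>
#include <linux/fs.h>    /* BLKGETSIZE64 */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/sdX", O_RDONLY);    /* illustrative device node */
        uint64_t bytes;
        int flags;

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Toggle O_NDELAY; the kernel recomputes FMODE_NDELAY per ioctl. */
        flags = fcntl(fd, F_GETFL);
        fcntl(fd, F_SETFL, flags | O_NDELAY);

        if (ioctl(fd, BLKGETSIZE64, &bytes) == 0)
                printf("device size: %llu bytes\n",
                       (unsigned long long)bytes);

        close(fd);
        return 0;
}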
 
 #ifdef CONFIG_COMPAT
 
@@ -606,7 +613,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        struct block_device *bdev = I_BDEV(file->f_mapping->host);
        struct gendisk *disk = bdev->bd_disk;
        fmode_t mode = file->f_mode;
-       loff_t size;
 
        /*
         * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@@ -632,10 +638,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                return compat_put_long(argp,
                        (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
        case BLKGETSIZE:
-               size = i_size_read(bdev->bd_inode);
-               if ((size >> 9) > ~0UL)
+               if (bdev_nr_sectors(bdev) > ~0UL)
                        return -EFBIG;
-               return compat_put_ulong(argp, size >> 9);
+               return compat_put_ulong(argp, bdev_nr_sectors(bdev));
 
        /* The data is compatible, but the command number is different */
        case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@@ -643,7 +648,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case BLKBSZSET_32:
                return blkdev_bszset(bdev, mode, argp);
        case BLKGETSIZE64_32:
-               return put_u64(argp, i_size_read(bdev->bd_inode));
+               return put_u64(argp, bdev_nr_bytes(bdev));
 
        /* Incompatible alignment on i386 */
        case BLKTRACESETUP32:
diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
deleted file mode 100644 (file)
index 2c4a55b..0000000
+++ /dev/null
@@ -1,578 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2019 Google LLC
- */
-
-/**
- * DOC: The Keyslot Manager
- *
- * Many devices with inline encryption support have a limited number of "slots"
- * into which encryption contexts may be programmed, and requests can be tagged
- * with a slot number to specify the key to use for en/decryption.
- *
- * As the number of slots is limited, and programming keys is expensive on
- * many inline encryption hardware, we don't want to program the same key into
- * multiple slots - if multiple requests are using the same key, we want to
- * program just one slot with that key and use that slot for all requests.
- *
- * The keyslot manager manages these keyslots appropriately, and also acts as
- * an abstraction between the inline encryption hardware and the upper layers.
- *
- * Lower layer devices will set up a keyslot manager in their request queue
- * and tell it how to perform device specific operations like programming/
- * evicting keys from keyslots.
- *
- * Upper layers will call blk_ksm_get_slot_for_key() to program a
- * key into some slot in the inline encryption hardware.
- */
-
-#define pr_fmt(fmt) "blk-crypto: " fmt
-
-#include <linux/keyslot-manager.h>
-#include <linux/device.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
-#include <linux/pm_runtime.h>
-#include <linux/wait.h>
-#include <linux/blkdev.h>
-
-struct blk_ksm_keyslot {
-       atomic_t slot_refs;
-       struct list_head idle_slot_node;
-       struct hlist_node hash_node;
-       const struct blk_crypto_key *key;
-       struct blk_keyslot_manager *ksm;
-};
-
-static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm)
-{
-       /*
-        * Calling into the driver requires ksm->lock held and the device
-        * resumed.  But we must resume the device first, since that can acquire
-        * and release ksm->lock via blk_ksm_reprogram_all_keys().
-        */
-       if (ksm->dev)
-               pm_runtime_get_sync(ksm->dev);
-       down_write(&ksm->lock);
-}
-
-static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm)
-{
-       up_write(&ksm->lock);
-       if (ksm->dev)
-               pm_runtime_put_sync(ksm->dev);
-}
-
-static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm)
-{
-       return ksm->num_slots == 0;
-}
-
-/**
- * blk_ksm_init() - Initialize a keyslot manager
- * @ksm: The keyslot_manager to initialize.
- * @num_slots: The number of key slots to manage.
- *
- * Allocate memory for keyslots and initialize a keyslot manager. Called by
- * e.g. storage drivers to set up a keyslot manager in their request_queue.
- *
- * Return: 0 on success, or else a negative error code.
- */
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots)
-{
-       unsigned int slot;
-       unsigned int i;
-       unsigned int slot_hashtable_size;
-
-       memset(ksm, 0, sizeof(*ksm));
-
-       if (num_slots == 0)
-               return -EINVAL;
-
-       ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL);
-       if (!ksm->slots)
-               return -ENOMEM;
-
-       ksm->num_slots = num_slots;
-
-       init_rwsem(&ksm->lock);
-
-       init_waitqueue_head(&ksm->idle_slots_wait_queue);
-       INIT_LIST_HEAD(&ksm->idle_slots);
-
-       for (slot = 0; slot < num_slots; slot++) {
-               ksm->slots[slot].ksm = ksm;
-               list_add_tail(&ksm->slots[slot].idle_slot_node,
-                             &ksm->idle_slots);
-       }
-
-       spin_lock_init(&ksm->idle_slots_lock);
-
-       slot_hashtable_size = roundup_pow_of_two(num_slots);
-       /*
-        * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2
-        * buckets.  This only makes a difference when there is only 1 keyslot.
-        */
-       if (slot_hashtable_size < 2)
-               slot_hashtable_size = 2;
-
-       ksm->log_slot_ht_size = ilog2(slot_hashtable_size);
-       ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size,
-                                            sizeof(ksm->slot_hashtable[0]),
-                                            GFP_KERNEL);
-       if (!ksm->slot_hashtable)
-               goto err_destroy_ksm;
-       for (i = 0; i < slot_hashtable_size; i++)
-               INIT_HLIST_HEAD(&ksm->slot_hashtable[i]);
-
-       return 0;
-
-err_destroy_ksm:
-       blk_ksm_destroy(ksm);
-       return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_init);
-
-static void blk_ksm_destroy_callback(void *ksm)
-{
-       blk_ksm_destroy(ksm);
-}
-
-/**
- * devm_blk_ksm_init() - Resource-managed blk_ksm_init()
- * @dev: The device which owns the blk_keyslot_manager.
- * @ksm: The blk_keyslot_manager to initialize.
- * @num_slots: The number of key slots to manage.
- *
- * Like blk_ksm_init(), but causes blk_ksm_destroy() to be called automatically
- * on driver detach.
- *
- * Return: 0 on success, or else a negative error code.
- */
-int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
-                     unsigned int num_slots)
-{
-       int err = blk_ksm_init(ksm, num_slots);
-
-       if (err)
-               return err;
-
-       return devm_add_action_or_reset(dev, blk_ksm_destroy_callback, ksm);
-}
-EXPORT_SYMBOL_GPL(devm_blk_ksm_init);
-
-static inline struct hlist_head *
-blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm,
-                           const struct blk_crypto_key *key)
-{
-       return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)];
-}
-
-static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot)
-{
-       struct blk_keyslot_manager *ksm = slot->ksm;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ksm->idle_slots_lock, flags);
-       list_del(&slot->idle_slot_node);
-       spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
-}
-
-static struct blk_ksm_keyslot *blk_ksm_find_keyslot(
-                                       struct blk_keyslot_manager *ksm,
-                                       const struct blk_crypto_key *key)
-{
-       const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key);
-       struct blk_ksm_keyslot *slotp;
-
-       hlist_for_each_entry(slotp, head, hash_node) {
-               if (slotp->key == key)
-                       return slotp;
-       }
-       return NULL;
-}
-
-static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot(
-                                       struct blk_keyslot_manager *ksm,
-                                       const struct blk_crypto_key *key)
-{
-       struct blk_ksm_keyslot *slot;
-
-       slot = blk_ksm_find_keyslot(ksm, key);
-       if (!slot)
-               return NULL;
-       if (atomic_inc_return(&slot->slot_refs) == 1) {
-               /* Took first reference to this slot; remove it from LRU list */
-               blk_ksm_remove_slot_from_lru_list(slot);
-       }
-       return slot;
-}
-
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot)
-{
-       return slot - slot->ksm->slots;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx);
-
-/**
- * blk_ksm_get_slot_for_key() - Program a key into a keyslot.
- * @ksm: The keyslot manager to program the key into.
- * @key: Pointer to the key object to program, including the raw key, crypto
- *      mode, and data unit size.
- * @slot_ptr: A pointer to return the pointer of the allocated keyslot.
- *
- * Get a keyslot that's been programmed with the specified key.  If one already
- * exists, return it with incremented refcount.  Otherwise, wait for a keyslot
- * to become idle and program it.
- *
- * Context: Process context. Takes and releases ksm->lock.
- * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the
- *        allocated keyslot), or some other blk_status_t otherwise (and
- *        keyslot is set to NULL).
- */
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
-                                     const struct blk_crypto_key *key,
-                                     struct blk_ksm_keyslot **slot_ptr)
-{
-       struct blk_ksm_keyslot *slot;
-       int slot_idx;
-       int err;
-
-       *slot_ptr = NULL;
-
-       if (blk_ksm_is_passthrough(ksm))
-               return BLK_STS_OK;
-
-       down_read(&ksm->lock);
-       slot = blk_ksm_find_and_grab_keyslot(ksm, key);
-       up_read(&ksm->lock);
-       if (slot)
-               goto success;
-
-       for (;;) {
-               blk_ksm_hw_enter(ksm);
-               slot = blk_ksm_find_and_grab_keyslot(ksm, key);
-               if (slot) {
-                       blk_ksm_hw_exit(ksm);
-                       goto success;
-               }
-
-               /*
-                * If we're here, that means there wasn't a slot that was
-                * already programmed with the key. So try to program it.
-                */
-               if (!list_empty(&ksm->idle_slots))
-                       break;
-
-               blk_ksm_hw_exit(ksm);
-               wait_event(ksm->idle_slots_wait_queue,
-                          !list_empty(&ksm->idle_slots));
-       }
-
-       slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot,
-                               idle_slot_node);
-       slot_idx = blk_ksm_get_slot_idx(slot);
-
-       err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx);
-       if (err) {
-               wake_up(&ksm->idle_slots_wait_queue);
-               blk_ksm_hw_exit(ksm);
-               return errno_to_blk_status(err);
-       }
-
-       /* Move this slot to the hash list for the new key. */
-       if (slot->key)
-               hlist_del(&slot->hash_node);
-       slot->key = key;
-       hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key));
-
-       atomic_set(&slot->slot_refs, 1);
-
-       blk_ksm_remove_slot_from_lru_list(slot);
-
-       blk_ksm_hw_exit(ksm);
-success:
-       *slot_ptr = slot;
-       return BLK_STS_OK;
-}
-
-/**
- * blk_ksm_put_slot() - Release a reference to a slot
- * @slot: The keyslot to release the reference of.
- *
- * Context: Any context.
- */
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot)
-{
-       struct blk_keyslot_manager *ksm;
-       unsigned long flags;
-
-       if (!slot)
-               return;
-
-       ksm = slot->ksm;
-
-       if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
-                                       &ksm->idle_slots_lock, flags)) {
-               list_add_tail(&slot->idle_slot_node, &ksm->idle_slots);
-               spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
-               wake_up(&ksm->idle_slots_wait_queue);
-       }
-}
-
-/**
- * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is
- *                                 supported by a ksm.
- * @ksm: The keyslot manager to check
- * @cfg: The crypto configuration to check for.
- *
- * Checks for crypto_mode/data unit size/dun bytes support.
- *
- * Return: Whether or not this ksm supports the specified crypto config.
- */
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
-                                 const struct blk_crypto_config *cfg)
-{
-       if (!ksm)
-               return false;
-       if (!(ksm->crypto_modes_supported[cfg->crypto_mode] &
-             cfg->data_unit_size))
-               return false;
-       if (ksm->max_dun_bytes_supported < cfg->dun_bytes)
-               return false;
-       return true;
-}
-
-/**
- * blk_ksm_evict_key() - Evict a key from the lower layer device.
- * @ksm: The keyslot manager to evict from
- * @key: The key to evict
- *
- * Find the keyslot that the specified key was programmed into, and evict that
- * slot from the lower layer device. The slot must not be in use by any
- * in-flight IO when this function is called.
- *
- * Context: Process context. Takes and releases ksm->lock.
- * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY
- *        if the keyslot is still in use, or another -errno value on other
- *        error.
- */
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
-                     const struct blk_crypto_key *key)
-{
-       struct blk_ksm_keyslot *slot;
-       int err = 0;
-
-       if (blk_ksm_is_passthrough(ksm)) {
-               if (ksm->ksm_ll_ops.keyslot_evict) {
-                       blk_ksm_hw_enter(ksm);
-                       err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1);
-                       blk_ksm_hw_exit(ksm);
-                       return err;
-               }
-               return 0;
-       }
-
-       blk_ksm_hw_enter(ksm);
-       slot = blk_ksm_find_keyslot(ksm, key);
-       if (!slot)
-               goto out_unlock;
-
-       if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
-               err = -EBUSY;
-               goto out_unlock;
-       }
-       err = ksm->ksm_ll_ops.keyslot_evict(ksm, key,
-                                           blk_ksm_get_slot_idx(slot));
-       if (err)
-               goto out_unlock;
-
-       hlist_del(&slot->hash_node);
-       slot->key = NULL;
-       err = 0;
-out_unlock:
-       blk_ksm_hw_exit(ksm);
-       return err;
-}
-
-/**
- * blk_ksm_reprogram_all_keys() - Re-program all keyslots.
- * @ksm: The keyslot manager
- *
- * Re-program all keyslots that are supposed to have a key programmed.  This is
- * intended only for use by drivers for hardware that loses its keys on reset.
- *
- * Context: Process context. Takes and releases ksm->lock.
- */
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm)
-{
-       unsigned int slot;
-
-       if (blk_ksm_is_passthrough(ksm))
-               return;
-
-       /* This is for device initialization, so don't resume the device */
-       down_write(&ksm->lock);
-       for (slot = 0; slot < ksm->num_slots; slot++) {
-               const struct blk_crypto_key *key = ksm->slots[slot].key;
-               int err;
-
-               if (!key)
-                       continue;
-
-               err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot);
-               WARN_ON(err);
-       }
-       up_write(&ksm->lock);
-}
-EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys);
-
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm)
-{
-       if (!ksm)
-               return;
-       kvfree(ksm->slot_hashtable);
-       kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots);
-       memzero_explicit(ksm, sizeof(*ksm));
-}
-EXPORT_SYMBOL_GPL(blk_ksm_destroy);
-
-bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q)
-{
-       if (blk_integrity_queue_supports_integrity(q)) {
-               pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
-               return false;
-       }
-       q->ksm = ksm;
-       return true;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_register);
-
-void blk_ksm_unregister(struct request_queue *q)
-{
-       q->ksm = NULL;
-}
-
-/**
- * blk_ksm_intersect_modes() - restrict supported modes by child device
- * @parent: The keyslot manager for parent device
- * @child: The keyslot manager for child device, or NULL
- *
- * Clear any crypto mode support bits in @parent that aren't set in @child.
- * If @child is NULL, then all parent bits are cleared.
- *
- * Only use this when setting up the keyslot manager for a layered device,
- * before it's been exposed yet.
- */
-void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
-                            const struct blk_keyslot_manager *child)
-{
-       if (child) {
-               unsigned int i;
-
-               parent->max_dun_bytes_supported =
-                       min(parent->max_dun_bytes_supported,
-                           child->max_dun_bytes_supported);
-               for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported);
-                    i++) {
-                       parent->crypto_modes_supported[i] &=
-                               child->crypto_modes_supported[i];
-               }
-       } else {
-               parent->max_dun_bytes_supported = 0;
-               memset(parent->crypto_modes_supported, 0,
-                      sizeof(parent->crypto_modes_supported));
-       }
-}
-EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes);
-
-/**
- * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes
- *                        and DUN bytes that another KSM supports. Here,
- *                        "superset" refers to the mathematical meaning of the
- *                        word - i.e. if two KSMs have the *same* capabilities,
- *                        they *are* considered supersets of each other.
- * @ksm_superset: The KSM that we want to verify is a superset
- * @ksm_subset: The KSM that we want to verify is a subset
- *
- * Return: True if @ksm_superset supports a superset of the crypto modes and DUN
- *        bytes that @ksm_subset supports.
- */
-bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
-                        struct blk_keyslot_manager *ksm_subset)
-{
-       int i;
-
-       if (!ksm_subset)
-               return true;
-
-       if (!ksm_superset)
-               return false;
-
-       for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) {
-               if (ksm_subset->crypto_modes_supported[i] &
-                   (~ksm_superset->crypto_modes_supported[i])) {
-                       return false;
-               }
-       }
-
-       if (ksm_subset->max_dun_bytes_supported >
-           ksm_superset->max_dun_bytes_supported) {
-               return false;
-       }
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_is_superset);
-
-/**
- * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of
- *                                another KSM
- * @target_ksm: The KSM whose restrictions to update.
- * @reference_ksm: The KSM to whose restrictions this function will update
- *                @target_ksm's restrictions to.
- *
- * Blk-crypto requires that crypto capabilities that were
- * advertised when a bio was created continue to be supported by the
- * device until that bio is ended. This is turn means that a device cannot
- * shrink its advertised crypto capabilities without any explicit
- * synchronization with upper layers. So if there's no such explicit
- * synchronization, @reference_ksm must support all the crypto capabilities that
- * @target_ksm does
- * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true).
- *
- * Note also that as long as the crypto capabilities are being expanded, the
- * order of updates becoming visible is not important because it's alright
- * for blk-crypto to see stale values - they only cause blk-crypto to
- * believe that a crypto capability isn't supported when it actually is (which
- * might result in blk-crypto-fallback being used if available, or the bio being
- * failed).
- */
-void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
-                                struct blk_keyslot_manager *reference_ksm)
-{
-       memcpy(target_ksm->crypto_modes_supported,
-              reference_ksm->crypto_modes_supported,
-              sizeof(target_ksm->crypto_modes_supported));
-
-       target_ksm->max_dun_bytes_supported =
-                               reference_ksm->max_dun_bytes_supported;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities);
-
-/**
- * blk_ksm_init_passthrough() - Init a passthrough keyslot manager
- * @ksm: The keyslot manager to init
- *
- * Initialize a passthrough keyslot manager.
- * Called by e.g. storage drivers to set up a keyslot manager in their
- * request_queue, when the storage driver wants to manage its keys by itself.
- * This is useful for inline encryption hardware that doesn't have the concept
- * of keyslots, and for layered devices.
- */
-void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm)
-{
-       memset(ksm, 0, sizeof(*ksm));
-       init_rwsem(&ksm->lock);
-}
-EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough);
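
The capability checks removed above (blk_ksm_crypto_cfg_supported(), blk_ksm_is_superset()) reduce to plain bitmask arithmetic over the supported-modes table plus a DUN-byte comparison. A minimal user-space sketch of the same superset test, with illustrative names rather than the kernel API:

    #include <stdbool.h>
    #include <stdint.h>

    #define NUM_MODES 4   /* stand-in for the number of crypto modes */

    /* True if every (mode, data-unit-size) bit set in sub is also set in sup
     * and sup accepts at least as many DUN bytes. Equal capabilities count
     * as a superset, matching the blk_ksm_is_superset() comment above. */
    static bool caps_superset(const uint32_t sup_modes[NUM_MODES],
                              unsigned int sup_dun_bytes,
                              const uint32_t sub_modes[NUM_MODES],
                              unsigned int sub_dun_bytes)
    {
            for (int i = 0; i < NUM_MODES; i++)
                    if (sub_modes[i] & ~sup_modes[i])
                            return false;
            return sub_dun_bytes <= sup_dun_bytes;
    }
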
index a0ffbab..fdd74a4 100644 (file)
@@ -9,12 +9,12 @@
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
-#include <linux/elevator.h>
 #include <linux/module.h>
 #include <linux/sbitmap.h>
 
 #include <trace/events/block.h>
 
+#include "elevator.h"
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
@@ -453,11 +453,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
 {
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct blk_mq_tags *tags = hctx->sched_tags;
-       unsigned int shift = tags->bitmap_tags->sb.shift;
+       unsigned int shift = tags->bitmap_tags.sb.shift;
 
        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
-       sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth);
+       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
 }
 
 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
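
For reference, the async depth recomputed in kyber_depth_updated() above is just a percentage of the full tag depth implied by the sbitmap shift; a stand-alone sketch of the arithmetic (the percentage constant lives elsewhere in kyber-iosched.c):

    /* 2^shift tags exist in total; only a fraction is left for async I/O so
     * synchronous requests cannot be starved of tags. */
    static unsigned int async_depth(unsigned int sb_shift, unsigned int percent)
    {
            return (1U << sb_shift) * percent / 100U;
    }
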
index 7f3c393..85d919b 100644 (file)
@@ -9,7 +9,6 @@
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
-#include <linux/elevator.h>
 #include <linux/bio.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -20,6 +19,7 @@
 
 #include <trace/events/block.h>
 
+#include "elevator.h"
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
  */
 static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
 static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+/*
+ * Time after which to dispatch lower priority requests even if higher
+ * priority requests are pending.
+ */
+static const int prio_aging_expire = 10 * HZ;
 static const int writes_starved = 2;    /* max times reads can starve a write */
 static const int fifo_batch = 16;       /* # of sequential requests treated as one
                                     by the above parameters. For throughput. */
@@ -51,17 +56,16 @@ enum dd_prio {
 
 enum { DD_PRIO_COUNT = 3 };
 
-/* I/O statistics per I/O priority. */
+/*
+ * I/O statistics per I/O priority. It is fine if these counters overflow.
+ * What matters is that these counters are at least as wide as
+ * log2(max_outstanding_requests).
+ */
 struct io_stats_per_prio {
-       local_t inserted;
-       local_t merged;
-       local_t dispatched;
-       local_t completed;
-};
-
-/* I/O statistics for all I/O priorities (enum dd_prio). */
-struct io_stats {
-       struct io_stats_per_prio stats[DD_PRIO_COUNT];
+       uint32_t inserted;
+       uint32_t merged;
+       uint32_t dispatched;
+       atomic_t completed;
 };
 
 /*
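
The new comment relies on modular arithmetic: even after either counter wraps, the unsigned difference still equals the number of requests in flight, as long as that number fits in 32 bits. A tiny stand-alone illustration:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t inserted  = 2;            /* has wrapped past UINT32_MAX */
            uint32_t completed = 0xfffffffdu;  /* has not wrapped yet */

            /* 5 requests were inserted but not completed; the subtraction is
             * exact despite the wrap because it is done modulo 2^32. */
            assert(inserted - completed == 5);
            return 0;
    }
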
@@ -74,6 +78,7 @@ struct dd_per_prio {
        struct list_head fifo_list[DD_DIR_COUNT];
        /* Next request in FIFO order. Read, write or both are NULL. */
        struct request *next_rq[DD_DIR_COUNT];
+       struct io_stats_per_prio stats;
 };
 
 struct deadline_data {
@@ -88,8 +93,6 @@ struct deadline_data {
        unsigned int batching;          /* number of sequential requests made */
        unsigned int starved;           /* times reads have starved writes */
 
-       struct io_stats __percpu *stats;
-
        /*
         * settings that change how the i/o scheduler behaves
         */
@@ -98,38 +101,12 @@ struct deadline_data {
        int writes_starved;
        int front_merges;
        u32 async_depth;
+       int prio_aging_expire;
 
        spinlock_t lock;
        spinlock_t zone_lock;
 };
 
-/* Count one event of type 'event_type' and with I/O priority 'prio' */
-#define dd_count(dd, event_type, prio) do {                            \
-       struct io_stats *io_stats = get_cpu_ptr((dd)->stats);           \
-                                                                       \
-       BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));       \
-       BUILD_BUG_ON(!__same_type((prio), enum dd_prio));               \
-       local_inc(&io_stats->stats[(prio)].event_type);                 \
-       put_cpu_ptr(io_stats);                                          \
-} while (0)
-
-/*
- * Returns the total number of dd_count(dd, event_type, prio) calls across all
- * CPUs. No locking or barriers since it is fine if the returned sum is slightly
- * outdated.
- */
-#define dd_sum(dd, event_type, prio) ({                                        \
-       unsigned int cpu;                                               \
-       u32 sum = 0;                                                    \
-                                                                       \
-       BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));       \
-       BUILD_BUG_ON(!__same_type((prio), enum dd_prio));               \
-       for_each_present_cpu(cpu)                                       \
-               sum += local_read(&per_cpu_ptr((dd)->stats, cpu)->      \
-                                 stats[(prio)].event_type);            \
-       sum;                                                            \
-})
-
 /* Maps an I/O priority class to a deadline scheduler priority. */
 static const enum dd_prio ioprio_class_to_prio[] = {
        [IOPRIO_CLASS_NONE]     = DD_BE_PRIO,
@@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
        const u8 ioprio_class = dd_rq_ioclass(next);
        const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 
-       dd_count(dd, merged, prio);
+       lockdep_assert_held(&dd->lock);
+
+       dd->per_prio[prio].stats.merged++;
 
        /*
         * if next expires before rq, assign its expire time to rq
@@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
        deadline_remove_request(rq->q, per_prio, rq);
 }
 
+/* Number of requests queued for a given priority level. */
+static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+{
+       const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+       lockdep_assert_held(&dd->lock);
+
+       return stats->inserted - atomic_read(&stats->completed);
+}
+
 /*
  * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
  * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
@@ -356,11 +345,26 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 }
 
 /*
+ * Returns true if and only if @rq started after @latest_start where
+ * @latest_start is in jiffies.
+ */
+static bool started_after(struct deadline_data *dd, struct request *rq,
+                         unsigned long latest_start)
+{
+       unsigned long start_time = (unsigned long)rq->fifo_time;
+
+       start_time -= dd->fifo_expire[rq_data_dir(rq)];
+
+       return time_after(start_time, latest_start);
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc
+ * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
  */
 static struct request *__dd_dispatch_request(struct deadline_data *dd,
-                                            struct dd_per_prio *per_prio)
+                                            struct dd_per_prio *per_prio,
+                                            unsigned long latest_start)
 {
        struct request *rq, *next_rq;
        enum dd_data_dir data_dir;
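
started_after() recovers the insertion time by subtracting the per-direction FIFO expiry from the stored deadline, then compares it with time_after(), which stays correct across jiffies wrap-around. Stripped of the kernel's type checks, that comparison boils down to a signed difference:

    /* Simplified form of the kernel's time_after(a, b): true when a is later
     * than b, even if the unsigned tick counter wrapped in between. */
    static int ticks_after(unsigned long a, unsigned long b)
    {
            return (long)(b - a) < 0;
    }
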
@@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
        if (!list_empty(&per_prio->dispatch)) {
                rq = list_first_entry(&per_prio->dispatch, struct request,
                                      queuelist);
+               if (started_after(dd, rq, latest_start))
+                       return NULL;
                list_del_init(&rq->queuelist);
                goto done;
        }
@@ -449,6 +455,9 @@ dispatch_find_request:
        dd->batching = 0;
 
 dispatch_request:
+       if (started_after(dd, rq, latest_start))
+               return NULL;
+
        /*
         * rq is the selected appropriate request.
         */
@@ -457,7 +466,7 @@ dispatch_request:
 done:
        ioprio_class = dd_rq_ioclass(rq);
        prio = ioprio_class_to_prio[ioprio_class];
-       dd_count(dd, dispatched, prio);
+       dd->per_prio[prio].stats.dispatched++;
        /*
         * If the request needs its target zone locked, do it.
         */
@@ -467,6 +476,34 @@ done:
 }
 
 /*
+ * Check whether there are any requests with priority other than DD_RT_PRIO
+ * that were inserted more than prio_aging_expire jiffies ago.
+ */
+static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
+                                                     unsigned long now)
+{
+       struct request *rq;
+       enum dd_prio prio;
+       int prio_cnt;
+
+       lockdep_assert_held(&dd->lock);
+
+       prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
+                  !!dd_queued(dd, DD_IDLE_PRIO);
+       if (prio_cnt < 2)
+               return NULL;
+
+       for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
+               rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
+                                          now - dd->prio_aging_expire);
+               if (rq)
+                       return rq;
+       }
+
+       return NULL;
+}
+
+/*
  * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
  *
  * One confusing aspect here is that we get called for a specific
@@ -477,15 +514,26 @@ done:
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+       const unsigned long now = jiffies;
        struct request *rq;
        enum dd_prio prio;
 
        spin_lock(&dd->lock);
+       rq = dd_dispatch_prio_aged_requests(dd, now);
+       if (rq)
+               goto unlock;
+
+       /*
+        * Next, dispatch requests in priority order. Ignore lower priority
+        * requests if any higher priority requests are pending.
+        */
        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
-               rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
-               if (rq)
+               rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
+               if (rq || dd_queued(dd, prio))
                        break;
        }
+
+unlock:
        spin_unlock(&dd->lock);
 
        return rq;
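
Taken together, the hunks above give dispatch two phases: first serve any non-realtime request that has aged past prio_aging_expire, otherwise walk the priority levels from high to low and stop at the first level that either yields a request or still has requests queued, so lower levels are ignored while a higher one is busy. A compilable toy model of that policy, with made-up names throughout:

    #include <stdio.h>

    enum { RT, BE, IDLE, NPRIO };

    struct toy {
            int queued[NPRIO];             /* requests pending per priority */
            unsigned long oldest_non_rt;   /* insertion time of oldest BE/IDLE request */
            unsigned long aging_expire;    /* plays the role of prio_aging_expire */
    };

    static int pick_priority(const struct toy *t, unsigned long now)
    {
            int busy = 0, prio;

            for (prio = RT; prio < NPRIO; prio++)
                    busy += !!t->queued[prio];

            /* Aging only kicks in when more than one priority level is active. */
            if (busy >= 2 && (long)(now - t->oldest_non_rt) > (long)t->aging_expire)
                    return BE;

            for (prio = RT; prio < NPRIO; prio++)
                    if (t->queued[prio])
                            return prio;   /* higher levels always win otherwise */
            return -1;
    }

    int main(void)
    {
            struct toy t = { .queued = { 3, 1, 0 }, .oldest_non_rt = 100,
                             .aging_expire = 50 };

            printf("t=120 -> prio %d\n", pick_priority(&t, 120));  /* 0: RT wins */
            printf("t=200 -> prio %d\n", pick_priority(&t, 200));  /* 1: BE aged */
            return 0;
    }
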
@@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
 
        dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
 
-       sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth);
+       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
 }
 
 /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e)
 
        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                struct dd_per_prio *per_prio = &dd->per_prio[prio];
+               const struct io_stats_per_prio *stats = &per_prio->stats;
+               uint32_t queued;
 
                WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
                WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
-       }
 
-       free_percpu(dd->stats);
+               spin_lock(&dd->lock);
+               queued = dd_queued(dd, prio);
+               spin_unlock(&dd->lock);
+
+               WARN_ONCE(queued != 0,
+                         "statistics for priority %d: i %u m %u d %u c %u\n",
+                         prio, stats->inserted, stats->merged,
+                         stats->dispatched, atomic_read(&stats->completed));
+       }
 
        kfree(dd);
 }
@@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 
        eq->elevator_data = dd;
 
-       dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
-                                    GFP_KERNEL | __GFP_ZERO);
-       if (!dd->stats)
-               goto free_dd;
-
        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
@@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
        dd->front_merges = 1;
        dd->last_dir = DD_WRITE;
        dd->fifo_batch = fifo_batch;
+       dd->prio_aging_expire = prio_aging_expire;
        spin_lock_init(&dd->lock);
        spin_lock_init(&dd->zone_lock);
 
        q->elevator = eq;
        return 0;
 
-free_dd:
-       kfree(dd);
-
 put_eq:
        kobject_put(&eq->kobj);
        return ret;
@@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
        blk_req_zone_write_unlock(rq);
 
        prio = ioprio_class_to_prio[ioprio_class];
-       dd_count(dd, inserted, prio);
-       rq->elv.priv[0] = (void *)(uintptr_t)1;
+       per_prio = &dd->per_prio[prio];
+       if (!rq->elv.priv[0]) {
+               per_prio->stats.inserted++;
+               rq->elv.priv[0] = (void *)(uintptr_t)1;
+       }
 
        if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
                blk_mq_free_requests(&free);
@@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
        trace_block_rq_insert(rq);
 
-       per_prio = &dd->per_prio[prio];
        if (at_head) {
                list_add(&rq->queuelist, &per_prio->dispatch);
        } else {
@@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq)
 
        /*
         * The block layer core may call dd_finish_request() without having
-        * called dd_insert_requests(). Hence only update statistics for
-        * requests for which dd_insert_requests() has been called. See also
-        * blk_mq_request_bypass_insert().
+        * called dd_insert_requests(). Skip requests that bypassed I/O
+        * scheduling. See also blk_mq_request_bypass_insert().
         */
-       if (rq->elv.priv[0])
-               dd_count(dd, completed, prio);
+       if (!rq->elv.priv[0])
+               return;
+
+       atomic_inc(&per_prio->stats.completed);
 
        if (blk_queue_is_zoned(q)) {
                unsigned long flags;
@@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page)         \
 #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
 SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
 SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
+SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
 SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
 SHOW_INT(deadline_front_merges_show, dd->front_merges);
 SHOW_INT(deadline_async_depth_show, dd->front_merges);
@@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
        STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
 STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
 STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
+STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
 STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
 STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
 STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX);
@@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = {
        DD_ATTR(front_merges),
        DD_ATTR(async_depth),
        DD_ATTR(fifo_batch),
+       DD_ATTR(prio_aging_expire),
        __ATTR_NULL
 };
 
@@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m)
        return 0;
 }
 
-/* Number of requests queued for a given priority level. */
-static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
-{
-       return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
-}
-
 static int dd_queued_show(void *data, struct seq_file *m)
 {
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
+       u32 rt, be, idle;
+
+       spin_lock(&dd->lock);
+       rt = dd_queued(dd, DD_RT_PRIO);
+       be = dd_queued(dd, DD_BE_PRIO);
+       idle = dd_queued(dd, DD_IDLE_PRIO);
+       spin_unlock(&dd->lock);
+
+       seq_printf(m, "%u %u %u\n", rt, be, idle);
 
-       seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
-                  dd_queued(dd, DD_BE_PRIO),
-                  dd_queued(dd, DD_IDLE_PRIO));
        return 0;
 }
 
 /* Number of requests owned by the block driver for a given priority. */
 static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
 {
-       return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
-               - dd_sum(dd, completed, prio);
+       const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+       lockdep_assert_held(&dd->lock);
+
+       return stats->dispatched + stats->merged -
+               atomic_read(&stats->completed);
 }
 
 static int dd_owned_by_driver_show(void *data, struct seq_file *m)
 {
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
+       u32 rt, be, idle;
+
+       spin_lock(&dd->lock);
+       rt = dd_owned_by_driver(dd, DD_RT_PRIO);
+       be = dd_owned_by_driver(dd, DD_BE_PRIO);
+       idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
+       spin_unlock(&dd->lock);
+
+       seq_printf(m, "%u %u %u\n", rt, be, idle);
 
-       seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
-                  dd_owned_by_driver(dd, DD_BE_PRIO),
-                  dd_owned_by_driver(dd, DD_IDLE_PRIO));
        return 0;
 }
 
index 278593b..7aff4eb 100644 (file)
@@ -2,6 +2,8 @@
 #
 # Partition configuration
 #
+menu "Partition Types"
+
 config PARTITION_ADVANCED
        bool "Advanced partition selection"
        help
@@ -267,3 +269,5 @@ config CMDLINE_PARTITION
        help
          Say Y here if you want to read the partition table from bootargs.
          The format for the command line is just like mtdparts.
+
+endmenu
index 7bea19d..334b72e 100644 (file)
@@ -5,6 +5,7 @@
  * Copyright (C) 2020 Christoph Hellwig
  */
 #include <linux/fs.h>
+#include <linux/major.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
@@ -90,6 +91,7 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
 {
        spin_lock(&bdev->bd_size_lock);
        i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+       bdev->bd_nr_sectors = sectors;
        spin_unlock(&bdev->bd_size_lock);
 }
 
@@ -203,7 +205,7 @@ static ssize_t part_alignment_offset_show(struct device *dev,
        struct block_device *bdev = dev_to_bdev(dev);
 
        return sprintf(buf, "%u\n",
-               queue_limit_alignment_offset(&bdev->bd_disk->queue->limits,
+               queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits,
                                bdev->bd_start_sect));
 }
 
@@ -213,7 +215,7 @@ static ssize_t part_discard_alignment_show(struct device *dev,
        struct block_device *bdev = dev_to_bdev(dev);
 
        return sprintf(buf, "%u\n",
-               queue_limit_discard_alignment(&bdev->bd_disk->queue->limits,
+               queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits,
                                bdev->bd_start_sect));
 }
 
index 7ca5c4c..5e9be13 100644 (file)
@@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len)
  */
 static u64 last_lba(struct gendisk *disk)
 {
-       return div_u64(disk->part0->bd_inode->i_size,
+       return div_u64(bdev_nr_bytes(disk->part0),
                       queue_logical_block_size(disk->queue)) - 1ULL;
 }
 
index 9bca396..403756d 100644 (file)
@@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
                                char name[],
                                union label_t *label,
                                sector_t labelsect,
-                               loff_t i_size,
+                               sector_t nr_sectors,
                                dasd_information2_t *info)
 {
        loff_t offset, geo_size, size;
@@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
        } else {
                /*
                 * Formated w/o large volume support. If the sanity check
-                * 'size based on geo == size based on i_size' is true, then
+                * 'size based on geo == size based on nr_sectors' is true, then
                 * we can safely assume that we know the formatted size of
                 * the disk, otherwise we need additional information
                 * that we can only get from a real DASD device.
                 */
                geo_size = geo->cylinders * geo->heads
                        * geo->sectors * secperblk;
-               size = i_size >> 9;
+               size = nr_sectors;
                if (size != geo_size) {
                        if (!info) {
                                strlcat(state->pp_buf, "\n", PAGE_SIZE);
@@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
                        if (!strcmp(info->type, "ECKD"))
                                if (geo_size < size)
                                        size = geo_size;
-                       /* else keep size based on i_size */
+                       /* else keep size based on nr_sectors */
                }
        }
        /* first and only partition starts in the first block after the label */
@@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state)
        struct gendisk *disk = state->disk;
        struct block_device *bdev = disk->part0;
        int blocksize, res;
-       loff_t i_size, offset, size;
+       loff_t offset, size;
+       sector_t nr_sectors;
        dasd_information2_t *info;
        struct hd_geometry *geo;
        char type[5] = {0,};
@@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state)
        blocksize = bdev_logical_block_size(bdev);
        if (blocksize <= 0)
                goto out_symbol;
-       i_size = i_size_read(bdev->bd_inode);
-       if (i_size == 0)
+       nr_sectors = bdev_nr_sectors(bdev);
+       if (nr_sectors == 0)
                goto out_symbol;
        info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
        if (info == NULL)
@@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state)
                                                   label);
                } else if (!strncmp(type, "LNX1", 4)) {
                        res = find_lnx1_partitions(state, geo, blocksize, name,
-                                                  label, labelsect, i_size,
+                                                  label, labelsect, nr_sectors,
                                                   info);
                } else if (!strncmp(type, "CMS1", 4)) {
                        res = find_cms1_partitions(state, geo, blocksize, name,
@@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state)
                res = 1;
                if (info->format == DASD_FORMAT_LDL) {
                        strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
-                       size = i_size >> 9;
+                       size = nr_sectors;
                        offset = (info->label_block + 1) * (blocksize >> 9);
                        put_partition(state, 1, offset, size-offset);
                        strlcat(state->pp_buf, "\n", PAGE_SIZE);
index 00c203b..25a52a2 100644 (file)
@@ -5,7 +5,7 @@
  */
 
 #include <linux/t10-pi.h>
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/crc-t10dif.h>
 #include <linux/module.h>
 #include <net/checksum.h>
index 8bd288d..3dd5a77 100644 (file)
@@ -1076,7 +1076,7 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err)
        af_alg_free_resources(areq);
        sock_put(sk);
 
-       iocb->ki_complete(iocb, err ? err : (int)resultlen, 0);
+       iocb->ki_complete(iocb, err ? err : (int)resultlen);
 }
 EXPORT_SYMBOL_GPL(af_alg_async_cb);
 
index eed6531..75f1a6c 100644 (file)
@@ -2459,18 +2459,70 @@ static void ata_dev_config_devslp(struct ata_device *dev)
        }
 }
 
+static void ata_dev_config_cpr(struct ata_device *dev)
+{
+       unsigned int err_mask;
+       size_t buf_len;
+       int i, nr_cpr = 0;
+       struct ata_cpr_log *cpr_log = NULL;
+       u8 *desc, *buf = NULL;
+
+       if (!ata_identify_page_supported(dev,
+                                ATA_LOG_CONCURRENT_POSITIONING_RANGES))
+               goto out;
+
+       /*
+        * Read IDENTIFY DEVICE data log, page 0x47
+        * (concurrent positioning ranges). We can have at most 255 32B range
+        * descriptors plus a 64B header.
+        */
+       buf_len = (64 + 255 * 32 + 511) & ~511;
+       buf = kzalloc(buf_len, GFP_KERNEL);
+       if (!buf)
+               goto out;
+
+       err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE,
+                                    ATA_LOG_CONCURRENT_POSITIONING_RANGES,
+                                    buf, buf_len >> 9);
+       if (err_mask)
+               goto out;
+
+       nr_cpr = buf[0];
+       if (!nr_cpr)
+               goto out;
+
+       cpr_log = kzalloc(struct_size(cpr_log, cpr, nr_cpr), GFP_KERNEL);
+       if (!cpr_log)
+               goto out;
+
+       cpr_log->nr_cpr = nr_cpr;
+       desc = &buf[64];
+       for (i = 0; i < nr_cpr; i++, desc += 32) {
+               cpr_log->cpr[i].num = desc[0];
+               cpr_log->cpr[i].num_storage_elements = desc[1];
+               cpr_log->cpr[i].start_lba = get_unaligned_le64(&desc[8]);
+               cpr_log->cpr[i].num_lbas = get_unaligned_le64(&desc[16]);
+       }
+
+out:
+       swap(dev->cpr_log, cpr_log);
+       kfree(cpr_log);
+       kfree(buf);
+}
+
 static void ata_dev_print_features(struct ata_device *dev)
 {
        if (!(dev->flags & ATA_DFLAG_FEATURES_MASK))
                return;
 
        ata_dev_info(dev,
-                    "Features:%s%s%s%s%s\n",
+                    "Features:%s%s%s%s%s%s\n",
                     dev->flags & ATA_DFLAG_TRUSTED ? " Trust" : "",
                     dev->flags & ATA_DFLAG_DA ? " Dev-Attention" : "",
                     dev->flags & ATA_DFLAG_DEVSLP ? " Dev-Sleep" : "",
                     dev->flags & ATA_DFLAG_NCQ_SEND_RECV ? " NCQ-sndrcv" : "",
-                    dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : "");
+                    dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : "",
+                    dev->cpr_log ? " CPR" : "");
 }
 
 /**
@@ -2634,6 +2686,7 @@ int ata_dev_configure(struct ata_device *dev)
                ata_dev_config_sense_reporting(dev);
                ata_dev_config_zac(dev);
                ata_dev_config_trusted(dev);
+               ata_dev_config_cpr(dev);
                dev->cdb_len = 32;
 
                if (ata_msg_drv(ap) && print_info)
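
Each concurrent positioning range descriptor read by ata_dev_config_cpr() above is 32 bytes, following a 64-byte log header, with little-endian 64-bit LBA fields at offsets 8 and 16. A stand-alone parser for one descriptor (the struct and helper names here are illustrative, not the kernel's):

    #include <stdint.h>

    struct cpr_range {
            uint8_t  num;
            uint8_t  num_storage_elements;
            uint64_t start_lba;
            uint64_t num_lbas;
    };

    /* Portable little-endian decode, standing in for get_unaligned_le64(). */
    static uint64_t rd_le64(const uint8_t *p)
    {
            uint64_t v = 0;
            for (int i = 7; i >= 0; i--)
                    v = (v << 8) | p[i];
            return v;
    }

    static void parse_cpr_desc(const uint8_t desc[32], struct cpr_range *r)
    {
            r->num = desc[0];
            r->num_storage_elements = desc[1];
            r->start_lba = rd_le64(&desc[8]);
            r->num_lbas = rd_le64(&desc[16]);
    }
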
index 1fb4611..15a279f 100644 (file)
@@ -1895,7 +1895,7 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
  */
 static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
 {
-       int num_pages;
+       int i, num_pages = 0;
        static const u8 pages[] = {
                0x00,   /* page 0x00, this page */
                0x80,   /* page 0x80, unit serial no page */
@@ -1905,13 +1905,17 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
                0xb1,   /* page 0xb1, block device characteristics page */
                0xb2,   /* page 0xb2, thin provisioning page */
                0xb6,   /* page 0xb6, zoned block device characteristics */
+               0xb9,   /* page 0xb9, concurrent positioning ranges */
        };
 
-       num_pages = sizeof(pages);
-       if (!(args->dev->flags & ATA_DFLAG_ZAC))
-               num_pages--;
+       for (i = 0; i < sizeof(pages); i++) {
+               if (pages[i] == 0xb6 &&
+                   !(args->dev->flags & ATA_DFLAG_ZAC))
+                       continue;
+               rbuf[num_pages + 4] = pages[i];
+               num_pages++;
+       }
        rbuf[3] = num_pages;    /* number of supported VPD pages */
-       memcpy(rbuf + 4, pages, num_pages);
        return 0;
 }
 
@@ -2121,6 +2125,26 @@ static unsigned int ata_scsiop_inq_b6(struct ata_scsi_args *args, u8 *rbuf)
        return 0;
 }
 
+static unsigned int ata_scsiop_inq_b9(struct ata_scsi_args *args, u8 *rbuf)
+{
+       struct ata_cpr_log *cpr_log = args->dev->cpr_log;
+       u8 *desc = &rbuf[64];
+       int i;
+
+       /* SCSI Concurrent Positioning Ranges VPD page: SBC-5 rev 1 or later */
+       rbuf[1] = 0xb9;
+       put_unaligned_be16(64 + (int)cpr_log->nr_cpr * 32 - 4, &rbuf[3]);
+
+       for (i = 0; i < cpr_log->nr_cpr; i++, desc += 32) {
+               desc[0] = cpr_log->cpr[i].num;
+               desc[1] = cpr_log->cpr[i].num_storage_elements;
+               put_unaligned_be64(cpr_log->cpr[i].start_lba, &desc[8]);
+               put_unaligned_be64(cpr_log->cpr[i].num_lbas, &desc[16]);
+       }
+
+       return 0;
+}
+
 /**
  *     modecpy - Prepare response for MODE SENSE
  *     @dest: output buffer
@@ -4120,11 +4144,17 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2);
                        break;
                case 0xb6:
-                       if (dev->flags & ATA_DFLAG_ZAC) {
+                       if (dev->flags & ATA_DFLAG_ZAC)
                                ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b6);
-                               break;
-                       }
-                       fallthrough;
+                       else
+                               ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
+                       break;
+               case 0xb9:
+                       if (dev->cpr_log)
+                               ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b9);
+                       else
+                               ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
+                       break;
                default:
                        ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
                        break;
index cfa29dc..fabf870 100644 (file)
@@ -281,14 +281,14 @@ static int regcache_rbtree_insert_to_block(struct regmap *map,
        if (!blk)
                return -ENOMEM;
 
+       rbnode->block = blk;
+
        if (BITS_TO_LONGS(blklen) > BITS_TO_LONGS(rbnode->blklen)) {
                present = krealloc(rbnode->cache_present,
                                   BITS_TO_LONGS(blklen) * sizeof(*present),
                                   GFP_KERNEL);
-               if (!present) {
-                       kfree(blk);
+               if (!present)
                        return -ENOMEM;
-               }
 
                memset(present + BITS_TO_LONGS(rbnode->blklen), 0,
                       (BITS_TO_LONGS(blklen) - BITS_TO_LONGS(rbnode->blklen))
@@ -305,7 +305,6 @@ static int regcache_rbtree_insert_to_block(struct regmap *map,
        }
 
        /* update the rbnode block, its size and the base register */
-       rbnode->block = blk;
        rbnode->blklen = blklen;
        rbnode->base_reg = base_reg;
        rbnode->cache_present = present;
index ab3e37a..d97eaf6 100644 (file)
@@ -180,14 +180,6 @@ config BLK_DEV_LOOP
          bits of, say, a sound file). This is also safe if the file resides
          on a remote file server.
 
-         There are several ways of encrypting disks. Some of these require
-         kernel patches. The vanilla kernel offers the cryptoloop option
-         and a Device Mapper target (which is superior, as it supports all
-         file systems). If you want to use the cryptoloop, say Y to both
-         LOOP and CRYPTOLOOP, and make sure you have a recent (version 2.12
-         or later) version of util-linux. Additionally, be aware that
-         the cryptoloop is not safe for storing journaled filesystems.
-
          Note that this loop device has nothing to do with the loopback
          device used for network connections from the machine to itself.
 
@@ -211,21 +203,6 @@ config BLK_DEV_LOOP_MIN_COUNT
          is used, it can be set to 0, since needed loop devices can be
          dynamically allocated with the /dev/loop-control interface.
 
-config BLK_DEV_CRYPTOLOOP
-       tristate "Cryptoloop Support (DEPRECATED)"
-       select CRYPTO
-       select CRYPTO_CBC
-       depends on BLK_DEV_LOOP
-       help
-         Say Y here if you want to be able to use the ciphers that are 
-         provided by the CryptoAPI as loop transformation. This might be
-         used as hard disk encryption.
-
-         WARNING: This device is not safe for journaled file systems like
-         ext3 or Reiserfs. Please use the Device Mapper crypto module
-         instead, which can be configured to be on-disk compatible with the
-         cryptoloop device.  cryptoloop support will be removed in Linux 5.16.
-
 source "drivers/block/drbd/Kconfig"
 
 config BLK_DEV_NBD
@@ -304,8 +281,8 @@ config BLK_DEV_RAM_SIZE
 config CDROM_PKTCDVD
        tristate "Packet writing on CD/DVD media (DEPRECATED)"
        depends on !UML
+       depends on SCSI
        select CDROM
-       select SCSI_COMMON
        help
          Note: This driver is deprecated and will be removed from the
          kernel in the near future!
index bc68817..11a74f1 100644 (file)
@@ -24,7 +24,6 @@ obj-$(CONFIG_CDROM_PKTCDVD)   += pktcdvd.o
 obj-$(CONFIG_SUNVDC)           += sunvdc.o
 
 obj-$(CONFIG_BLK_DEV_NBD)      += nbd.o
-obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
 obj-$(CONFIG_VIRTIO_BLK)       += virtio_blk.o
 
 obj-$(CONFIG_BLK_DEV_SX8)      += sx8.o
index 8b17140..bf5c124 100644 (file)
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/blk-mq.h>
-#include <linux/elevator.h>
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
 
@@ -1780,6 +1780,7 @@ static const struct blk_mq_ops amiflop_mq_ops = {
 static int fd_alloc_disk(int drive, int system)
 {
        struct gendisk *disk;
+       int err;
 
        disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL);
        if (IS_ERR(disk))
@@ -1798,8 +1799,10 @@ static int fd_alloc_disk(int drive, int system)
        set_capacity(disk, 880 * 2);
 
        unit[drive].gendisk[system] = disk;
-       add_disk(disk);
-       return 0;
+       err = add_disk(disk);
+       if (err)
+               blk_cleanup_disk(disk);
+       return err;
 }
 
 static int fd_alloc_drive(int drive)
index 06b360f..52484bc 100644 (file)
@@ -37,8 +37,7 @@ static ssize_t aoedisk_show_state(struct device *dev,
        struct gendisk *disk = dev_to_disk(dev);
        struct aoedev *d = disk->private_data;
 
-       return snprintf(page, PAGE_SIZE,
-                       "%s%s\n",
+       return sysfs_emit(page, "%s%s\n",
                        (d->flags & DEVFL_UP) ? "up" : "down",
                        (d->flags & DEVFL_KICKME) ? ",kickme" :
                        (d->nopen && !(d->flags & DEVFL_UP)) ? ",closewait" : "");
@@ -52,8 +51,8 @@ static ssize_t aoedisk_show_mac(struct device *dev,
        struct aoetgt *t = d->targets[0];
 
        if (t == NULL)
-               return snprintf(page, PAGE_SIZE, "none\n");
-       return snprintf(page, PAGE_SIZE, "%pm\n", t->addr);
+               return sysfs_emit(page, "none\n");
+       return sysfs_emit(page, "%pm\n", t->addr);
 }
 static ssize_t aoedisk_show_netif(struct device *dev,
                                  struct device_attribute *attr, char *page)
@@ -85,7 +84,7 @@ static ssize_t aoedisk_show_netif(struct device *dev,
        ne = nd;
        nd = nds;
        if (*nd == NULL)
-               return snprintf(page, PAGE_SIZE, "none\n");
+               return sysfs_emit(page, "none\n");
        for (p = page; nd < ne; nd++)
                p += scnprintf(p, PAGE_SIZE - (p-page), "%s%s",
                        p == page ? "" : ",", (*nd)->name);
@@ -99,7 +98,7 @@ static ssize_t aoedisk_show_fwver(struct device *dev,
        struct gendisk *disk = dev_to_disk(dev);
        struct aoedev *d = disk->private_data;
 
-       return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver);
+       return sysfs_emit(page, "0x%04x\n", (unsigned int) d->fw_ver);
 }
 static ssize_t aoedisk_show_payload(struct device *dev,
                                    struct device_attribute *attr, char *page)
@@ -107,7 +106,7 @@ static ssize_t aoedisk_show_payload(struct device *dev,
        struct gendisk *disk = dev_to_disk(dev);
        struct aoedev *d = disk->private_data;
 
-       return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt);
+       return sysfs_emit(page, "%lu\n", d->maxbcnt);
 }
 
 static int aoedisk_debugfs_show(struct seq_file *s, void *ignored)
@@ -417,7 +416,9 @@ aoeblk_gdalloc(void *vp)
 
        spin_unlock_irqrestore(&d->lock, flags);
 
-       device_add_disk(NULL, gd, aoe_attr_groups);
+       err = device_add_disk(NULL, gd, aoe_attr_groups);
+       if (err)
+               goto out_disk_cleanup;
        aoedisk_add_debugfs(d);
 
        spin_lock_irqsave(&d->lock, flags);
@@ -426,6 +427,8 @@ aoeblk_gdalloc(void *vp)
        spin_unlock_irqrestore(&d->lock, flags);
        return;
 
+out_disk_cleanup:
+       blk_cleanup_disk(gd);
 err_tagset:
        blk_mq_free_tag_set(set);
 err_mempool:
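
The snprintf()-to-sysfs_emit() conversions above follow the general sysfs rule that show() callbacks receive a full PAGE_SIZE buffer; sysfs_emit() encodes that assumption itself, so callers no longer carry PAGE_SIZE bookkeeping. A minimal sketch of the resulting callback shape, assuming a made-up driver structure:

    #include <linux/device.h>
    #include <linux/sysfs.h>

    struct mydev { const char *state; };   /* illustrative driver state only */

    static ssize_t state_show(struct device *dev, struct device_attribute *attr,
                              char *page)
    {
            struct mydev *d = dev_get_drvdata(dev);

            return sysfs_emit(page, "%s\n", d->state);
    }
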
index a093644..d14bdc3 100644 (file)
@@ -68,6 +68,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/blk-mq.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/completion.h>
 #include <linux/wait.h>
@@ -298,6 +299,7 @@ static struct atari_floppy_struct {
                                   disk change detection) */
        int flags;              /* flags */
        struct gendisk *disk[NUM_DISK_MINORS];
+       bool registered[NUM_DISK_MINORS];
        int ref;
        int type;
        struct blk_mq_tag_set tag_set;
@@ -456,10 +458,20 @@ static DEFINE_TIMER(fd_timer, check_change);
        
 static void fd_end_request_cur(blk_status_t err)
 {
+       DPRINT(("fd_end_request_cur(), bytes %d of %d\n",
+               blk_rq_cur_bytes(fd_request),
+               blk_rq_bytes(fd_request)));
+
        if (!blk_update_request(fd_request, err,
                                blk_rq_cur_bytes(fd_request))) {
+               DPRINT(("calling __blk_mq_end_request()\n"));
                __blk_mq_end_request(fd_request, err);
                fd_request = NULL;
+       } else {
+               /* requeue rest of request */
+               DPRINT(("calling blk_mq_requeue_request()\n"));
+               blk_mq_requeue_request(fd_request, true);
+               fd_request = NULL;
        }
 }
 
@@ -653,9 +665,6 @@ static inline void copy_buffer(void *from, void *to)
                *p2++ = *p1++;
 }
 
-  
-  
-
 /* General Interrupt Handling */
 
 static void (*FloppyIRQHandler)( int status ) = NULL;
@@ -700,12 +709,21 @@ static void fd_error( void )
        if (fd_request->error_count >= MAX_ERRORS) {
                printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
                fd_end_request_cur(BLK_STS_IOERR);
+               finish_fdc();
+               return;
        }
        else if (fd_request->error_count == RECALIBRATE_ERRORS) {
                printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
                if (SelectedDrive != -1)
                        SUD.track = -1;
        }
+       /* need to re-run request to recalibrate */
+       atari_disable_irq( IRQ_MFP_FDC );
+
+       setup_req_params( SelectedDrive );
+       do_fd_action( SelectedDrive );
+
+       atari_enable_irq( IRQ_MFP_FDC );
 }
 
 
@@ -732,8 +750,10 @@ static int do_format(int drive, int type, struct atari_format_descr *desc)
        if (type) {
                type--;
                if (type >= NUM_DISK_MINORS ||
-                   minor2disktype[type].drive_types > DriveType)
+                   minor2disktype[type].drive_types > DriveType) {
+                       finish_fdc();
                        return -EINVAL;
+               }
        }
 
        q = unit[drive].disk[type]->queue;
@@ -751,6 +771,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc)
        }
 
        if (!UDT || desc->track >= UDT->blocks/UDT->spt/2 || desc->head >= 2) {
+               finish_fdc();
                ret = -EINVAL;
                goto out;
        }
@@ -791,6 +812,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc)
 
        wait_for_completion(&format_wait);
 
+       finish_fdc();
        ret = FormatError ? -EIO : 0;
 out:
        blk_mq_unquiesce_queue(q);
@@ -825,6 +847,7 @@ static void do_fd_action( int drive )
                    else {
                        /* all sectors finished */
                        fd_end_request_cur(BLK_STS_OK);
+                       finish_fdc();
                        return;
                    }
                }
@@ -1229,6 +1252,7 @@ static void fd_rwsec_done1(int status)
        else {
                /* all sectors finished */
                fd_end_request_cur(BLK_STS_OK);
+               finish_fdc();
        }
        return;
   
@@ -1350,7 +1374,7 @@ static void fd_times_out(struct timer_list *unused)
 
 static void finish_fdc( void )
 {
-       if (!NeedSeek) {
+       if (!NeedSeek || !stdma_is_locked_by(floppy_irq)) {
                finish_fdc_done( 0 );
        }
        else {
@@ -1385,7 +1409,8 @@ static void finish_fdc_done( int dummy )
        start_motor_off_timer();
 
        local_irq_save(flags);
-       stdma_release();
+       if (stdma_is_locked_by(floppy_irq))
+               stdma_release();
        local_irq_restore(flags);
 
        DPRINT(("finish_fdc() finished\n"));
@@ -1435,8 +1460,7 @@ static int floppy_revalidate(struct gendisk *disk)
        unsigned int drive = p - unit;
 
        if (test_bit(drive, &changed_floppies) ||
-           test_bit(drive, &fake_change) ||
-           p->disktype == 0) {
+           test_bit(drive, &fake_change) || !p->disktype) {
                if (UD.flags & FTD_MSG)
                        printk(KERN_ERR "floppy: clear format %p!\n", UDT);
                BufferDrive = -1;
@@ -1475,15 +1499,6 @@ static void setup_req_params( int drive )
                        ReqTrack, ReqSector, (unsigned long)ReqData ));
 }
 
-static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx)
-{
-       spin_lock_irq(&ataflop_lock);
-       atari_disable_irq(IRQ_MFP_FDC);
-       finish_fdc();
-       atari_enable_irq(IRQ_MFP_FDC);
-       spin_unlock_irq(&ataflop_lock);
-}
-
 static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
                                     const struct blk_mq_queue_data *bd)
 {
@@ -1491,6 +1506,10 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
        int drive = floppy - unit;
        int type = floppy->type;
 
+       DPRINT(("Queue request: drive %d type %d sectors %d of %d last %d\n",
+               drive, type, blk_rq_cur_sectors(bd->rq),
+               blk_rq_sectors(bd->rq), bd->last));
+
        spin_lock_irq(&ataflop_lock);
        if (fd_request) {
                spin_unlock_irq(&ataflop_lock);
@@ -1511,6 +1530,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
                /* drive not connected */
                printk(KERN_ERR "Unknown Device: fd%d\n", drive );
                fd_end_request_cur(BLK_STS_IOERR);
+               stdma_release();
                goto out;
        }
                
@@ -1527,11 +1547,13 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
                if (--type >= NUM_DISK_MINORS) {
                        printk(KERN_WARNING "fd%d: invalid disk format", drive );
                        fd_end_request_cur(BLK_STS_IOERR);
+                       stdma_release();
                        goto out;
                }
                if (minor2disktype[type].drive_types > DriveType)  {
                        printk(KERN_WARNING "fd%d: unsupported disk format", drive );
                        fd_end_request_cur(BLK_STS_IOERR);
+                       stdma_release();
                        goto out;
                }
                type = minor2disktype[type].index;
@@ -1550,8 +1572,6 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
        setup_req_params( drive );
        do_fd_action( drive );
 
-       if (bd->last)
-               finish_fdc();
        atari_enable_irq( IRQ_MFP_FDC );
 
 out:
@@ -1634,6 +1654,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
                /* what if type > 0 here? Overwrite specified entry ? */
                if (type) {
                        /* refuse to re-set a predefined type for now */
+                       finish_fdc();
                        return -EINVAL;
                }
 
@@ -1701,8 +1722,10 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 
                /* sanity check */
                if (setprm.track != dtp->blocks/dtp->spt/2 ||
-                   setprm.head != 2)
+                   setprm.head != 2) {
+                       finish_fdc();
                        return -EINVAL;
+               }
 
                UDT = dtp;
                set_capacity(disk, UDT->blocks);
@@ -1962,7 +1985,6 @@ static const struct block_device_operations floppy_fops = {
 
 static const struct blk_mq_ops ataflop_mq_ops = {
        .queue_rq = ataflop_queue_rq,
-       .commit_rqs = ataflop_commit_rqs,
 };
 
 static int ataflop_alloc_disk(unsigned int drive, unsigned int type)
@@ -2000,12 +2022,28 @@ static void ataflop_probe(dev_t dev)
                return;
        mutex_lock(&ataflop_probe_lock);
        if (!unit[drive].disk[type]) {
-               if (ataflop_alloc_disk(drive, type) == 0)
+               if (ataflop_alloc_disk(drive, type) == 0) {
                        add_disk(unit[drive].disk[type]);
+                       unit[drive].registered[type] = true;
+               }
        }
        mutex_unlock(&ataflop_probe_lock);
 }
 
+static void atari_cleanup_floppy_disk(struct atari_floppy_struct *fs)
+{
+       int type;
+
+       for (type = 0; type < NUM_DISK_MINORS; type++) {
+               if (!fs->disk[type])
+                       continue;
+               if (fs->registered[type])
+                       del_gendisk(fs->disk[type]);
+               blk_cleanup_disk(fs->disk[type]);
+       }
+       blk_mq_free_tag_set(&fs->tag_set);
+}
+
 static int __init atari_floppy_init (void)
 {
        int i;
@@ -2064,7 +2102,10 @@ static int __init atari_floppy_init (void)
        for (i = 0; i < FD_MAX_UNITS; i++) {
                unit[i].track = -1;
                unit[i].flags = 0;
-               add_disk(unit[i].disk[0]);
+               ret = add_disk(unit[i].disk[0]);
+               if (ret)
+                       goto err_out_dma;
+               unit[i].registered[0] = true;
        }
 
        printk(KERN_INFO "Atari floppy driver: max. %cD, %strack buffering\n",
@@ -2074,12 +2115,11 @@ static int __init atari_floppy_init (void)
 
        return 0;
 
+err_out_dma:
+       atari_stram_free(DMABuffer);
 err:
-       while (--i >= 0) {
-               blk_cleanup_queue(unit[i].disk[0]->queue);
-               put_disk(unit[i].disk[0]);
-               blk_mq_free_tag_set(&unit[i].tag_set);
-       }
+       while (--i >= 0)
+               atari_cleanup_floppy_disk(&unit[i]);
 
        unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_unlock:
@@ -2128,18 +2168,10 @@ __setup("floppy=", atari_floppy_setup);
 
 static void __exit atari_floppy_exit(void)
 {
-       int i, type;
+       int i;
 
-       for (i = 0; i < FD_MAX_UNITS; i++) {
-               for (type = 0; type < NUM_DISK_MINORS; type++) {
-                       if (!unit[i].disk[type])
-                               continue;
-                       del_gendisk(unit[i].disk[type]);
-                       blk_cleanup_queue(unit[i].disk[type]->queue);
-                       put_disk(unit[i].disk[type]);
-               }
-               blk_mq_free_tag_set(&unit[i].tag_set);
-       }
+       for (i = 0; i < FD_MAX_UNITS; i++)
+               atari_cleanup_floppy_disk(&unit[i]);
        unregister_blkdev(FLOPPY_MAJOR, "fd");
 
        del_timer_sync(&fd_timer);
index 530b312..aa04727 100644 (file)
@@ -282,7 +282,7 @@ out:
        return err;
 }
 
-static blk_qc_t brd_submit_bio(struct bio *bio)
+static void brd_submit_bio(struct bio *bio)
 {
        struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
        sector_t sector = bio->bi_iter.bi_sector;
@@ -299,16 +299,14 @@ static blk_qc_t brd_submit_bio(struct bio *bio)
 
                err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
                                  bio_op(bio), sector);
-               if (err)
-                       goto io_error;
+               if (err) {
+                       bio_io_error(bio);
+                       return;
+               }
                sector += len >> SECTOR_SHIFT;
        }
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
-io_error:
-       bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int brd_rw_page(struct block_device *bdev, sector_t sector,
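
The brd change above reflects a block-layer interface change in this cycle: the ->submit_bio() method of struct block_device_operations now returns void, and success or failure is reported solely through bio_endio() or bio_io_error(). A sketch of the resulting callback shape for a simple bio-based driver; the mydev names and the mydev_handle_bvec() helper are placeholders invented here:

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    static void mydev_submit_bio(struct bio *bio)
    {
            struct mydev *d = bio->bi_bdev->bd_disk->private_data;
            struct bio_vec bvec;
            struct bvec_iter iter;

            bio_for_each_segment(bvec, bio, iter) {
                    /* mydev_handle_bvec() stands in for the per-segment work. */
                    if (mydev_handle_bvec(d, &bvec, bio_op(bio), iter.bi_sector)) {
                            bio_io_error(bio);   /* no status value to return anymore */
                            return;
                    }
            }
            bio_endio(bio);                      /* success is signalled the same way */
    }
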
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
deleted file mode 100644 (file)
index f0a91fa..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
-   Linux loop encryption enabling module
-
-   Copyright (C)  2002 Herbert Valerio Riedel <hvr@gnu.org>
-   Copyright (C)  2003 Fruhwirth Clemens <clemens@endorphin.org>
-
- */
-
-#include <linux/module.h>
-
-#include <crypto/skcipher.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/blkdev.h>
-#include <linux/scatterlist.h>
-#include <linux/uaccess.h>
-#include "loop.h"
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("loop blockdevice transferfunction adaptor / CryptoAPI");
-MODULE_AUTHOR("Herbert Valerio Riedel <hvr@gnu.org>");
-
-#define LOOP_IV_SECTOR_BITS 9
-#define LOOP_IV_SECTOR_SIZE (1 << LOOP_IV_SECTOR_BITS)
-
-static int
-cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info)
-{
-       int err = -EINVAL;
-       int cipher_len;
-       int mode_len;
-       char cms[LO_NAME_SIZE];                 /* cipher-mode string */
-       char *mode;
-       char *cmsp = cms;                       /* c-m string pointer */
-       struct crypto_sync_skcipher *tfm;
-
-       /* encryption breaks for non sector aligned offsets */
-
-       if (info->lo_offset % LOOP_IV_SECTOR_SIZE)
-               goto out;
-
-       strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE);
-       cms[LO_NAME_SIZE - 1] = 0;
-
-       cipher_len = strcspn(cmsp, "-");
-
-       mode = cmsp + cipher_len;
-       mode_len = 0;
-       if (*mode) {
-               mode++;
-               mode_len = strcspn(mode, "-");
-       }
-
-       if (!mode_len) {
-               mode = "cbc";
-               mode_len = 3;
-       }
-
-       if (cipher_len + mode_len + 3 > LO_NAME_SIZE)
-               return -EINVAL;
-
-       memmove(cms, mode, mode_len);
-       cmsp = cms + mode_len;
-       *cmsp++ = '(';
-       memcpy(cmsp, info->lo_crypt_name, cipher_len);
-       cmsp += cipher_len;
-       *cmsp++ = ')';
-       *cmsp = 0;
-
-       tfm = crypto_alloc_sync_skcipher(cms, 0, 0);
-       if (IS_ERR(tfm))
-               return PTR_ERR(tfm);
-
-       err = crypto_sync_skcipher_setkey(tfm, info->lo_encrypt_key,
-                                         info->lo_encrypt_key_size);
-
-       if (err != 0)
-               goto out_free_tfm;
-
-       lo->key_data = tfm;
-       return 0;
-
- out_free_tfm:
-       crypto_free_sync_skcipher(tfm);
-
- out:
-       return err;
-}
-
-
-typedef int (*encdec_cbc_t)(struct skcipher_request *req);
-
-static int
-cryptoloop_transfer(struct loop_device *lo, int cmd,
-                   struct page *raw_page, unsigned raw_off,
-                   struct page *loop_page, unsigned loop_off,
-                   int size, sector_t IV)
-{
-       struct crypto_sync_skcipher *tfm = lo->key_data;
-       SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-       struct scatterlist sg_out;
-       struct scatterlist sg_in;
-
-       encdec_cbc_t encdecfunc;
-       struct page *in_page, *out_page;
-       unsigned in_offs, out_offs;
-       int err;
-
-       skcipher_request_set_sync_tfm(req, tfm);
-       skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
-                                     NULL, NULL);
-
-       sg_init_table(&sg_out, 1);
-       sg_init_table(&sg_in, 1);
-
-       if (cmd == READ) {
-               in_page = raw_page;
-               in_offs = raw_off;
-               out_page = loop_page;
-               out_offs = loop_off;
-               encdecfunc = crypto_skcipher_decrypt;
-       } else {
-               in_page = loop_page;
-               in_offs = loop_off;
-               out_page = raw_page;
-               out_offs = raw_off;
-               encdecfunc = crypto_skcipher_encrypt;
-       }
-
-       while (size > 0) {
-               const int sz = min(size, LOOP_IV_SECTOR_SIZE);
-               u32 iv[4] = { 0, };
-               iv[0] = cpu_to_le32(IV & 0xffffffff);
-
-               sg_set_page(&sg_in, in_page, sz, in_offs);
-               sg_set_page(&sg_out, out_page, sz, out_offs);
-
-               skcipher_request_set_crypt(req, &sg_in, &sg_out, sz, iv);
-               err = encdecfunc(req);
-               if (err)
-                       goto out;
-
-               IV++;
-               size -= sz;
-               in_offs += sz;
-               out_offs += sz;
-       }
-
-       err = 0;
-
-out:
-       skcipher_request_zero(req);
-       return err;
-}
-
-static int
-cryptoloop_ioctl(struct loop_device *lo, int cmd, unsigned long arg)
-{
-       return -EINVAL;
-}
-
-static int
-cryptoloop_release(struct loop_device *lo)
-{
-       struct crypto_sync_skcipher *tfm = lo->key_data;
-       if (tfm != NULL) {
-               crypto_free_sync_skcipher(tfm);
-               lo->key_data = NULL;
-               return 0;
-       }
-       printk(KERN_ERR "cryptoloop_release(): tfm == NULL?\n");
-       return -EINVAL;
-}
-
-static struct loop_func_table cryptoloop_funcs = {
-       .number = LO_CRYPT_CRYPTOAPI,
-       .init = cryptoloop_init,
-       .ioctl = cryptoloop_ioctl,
-       .transfer = cryptoloop_transfer,
-       .release = cryptoloop_release,
-       .owner = THIS_MODULE
-};
-
-static int __init
-init_cryptoloop(void)
-{
-       int rc = loop_register_transfer(&cryptoloop_funcs);
-
-       if (rc)
-               printk(KERN_ERR "cryptoloop: loop_register_transfer failed\n");
-       else
-               pr_warn("the cryptoloop driver has been deprecated and will be removed in in Linux 5.16\n");
-       return rc;
-}
-
-static void __exit
-cleanup_cryptoloop(void)
-{
-       if (loop_unregister_transfer(LO_CRYPT_CRYPTOAPI))
-               printk(KERN_ERR
-                       "cryptoloop: loop_unregister_transfer failed\n");
-}
-
-module_init(init_cryptoloop);
-module_exit(cleanup_cryptoloop);
index 5d91813..f27d5b0 100644
@@ -1448,7 +1448,7 @@ extern void conn_free_crypto(struct drbd_connection *connection);
 /* drbd_req */
 extern void do_submit(struct work_struct *ws);
 extern void __drbd_make_request(struct drbd_device *, struct bio *);
-extern blk_qc_t drbd_submit_bio(struct bio *bio);
+void drbd_submit_bio(struct bio *bio);
 extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
 extern int is_valid_ar_handle(struct drbd_request *, sector_t);
 
@@ -1826,8 +1826,7 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
 /* Returns the number of 512 byte sectors of the device */
 static inline sector_t drbd_get_capacity(struct block_device *bdev)
 {
-       /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
-       return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;
+       return bdev ? bdev_nr_sectors(bdev) : 0;
 }
 
 /**
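
drbd_get_capacity() now calls bdev_nr_sectors(); as the deleted line indicates, that helper is expected to yield the same value as reading the backing inode size and shifting down to 512-byte sectors. A small sketch of the identity being relied on (my_capacity() is illustrative):

#include <linux/blkdev.h>
#include <linux/fs.h>

static inline sector_t my_capacity(struct block_device *bdev)
{
        /* same value the deleted open-coded expression produced */
        return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;     /* == bdev_nr_sectors(bdev) */
}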
index 55234a5..19db80a 100644
@@ -2794,7 +2794,9 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
                goto out_idr_remove_vol;
        }
 
-       add_disk(disk);
+       err = add_disk(disk);
+       if (err)
+               goto out_cleanup_disk;
 
        /* inherit the connection state */
        device->state.conn = first_connection(resource)->cstate;
@@ -2808,6 +2810,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
        drbd_debugfs_device_add(device);
        return NO_ERROR;
 
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
 out_idr_remove_vol:
        idr_remove(&connection->peer_devices, vnr);
 out_idr_remove_from_resource:
index 5ca2336..3235532 100644
@@ -1596,7 +1596,7 @@ void do_submit(struct work_struct *ws)
        }
 }
 
-blk_qc_t drbd_submit_bio(struct bio *bio)
+void drbd_submit_bio(struct bio *bio)
 {
        struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
 
@@ -1609,7 +1609,6 @@ blk_qc_t drbd_submit_bio(struct bio *bio)
 
        inc_ap_bio(device);
        __drbd_make_request(device, bio);
-       return BLK_QC_T_NONE;
 }
 
 static bool net_timeout_reached(struct drbd_request *net_req,
index fef79ea..3873e78 100644
@@ -184,6 +184,7 @@ static int print_unex = 1;
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/major.h>
 #include <linux/platform_device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/mutex.h>
@@ -4478,6 +4479,7 @@ static const struct blk_mq_ops floppy_mq_ops = {
 };
 
 static struct platform_device floppy_device[N_DRIVE];
+static bool registered[N_DRIVE];
 
 static bool floppy_available(int drive)
 {
@@ -4693,8 +4695,12 @@ static int __init do_floppy_init(void)
                if (err)
                        goto out_remove_drives;
 
-               device_add_disk(&floppy_device[drive].dev, disks[drive][0],
-                               NULL);
+               registered[drive] = true;
+
+               err = device_add_disk(&floppy_device[drive].dev,
+                                     disks[drive][0], NULL);
+               if (err)
+                       goto out_remove_drives;
        }
 
        return 0;
@@ -4703,7 +4709,8 @@ out_remove_drives:
        while (drive--) {
                if (floppy_available(drive)) {
                        del_gendisk(disks[drive][0]);
-                       platform_device_unregister(&floppy_device[drive]);
+                       if (registered[drive])
+                               platform_device_unregister(&floppy_device[drive]);
                }
        }
 out_release_dma:
@@ -4946,30 +4953,14 @@ static void __exit floppy_module_exit(void)
                                if (disks[drive][i])
                                        del_gendisk(disks[drive][i]);
                        }
-                       platform_device_unregister(&floppy_device[drive]);
+                       if (registered[drive])
+                               platform_device_unregister(&floppy_device[drive]);
                }
                for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
                        if (disks[drive][i])
-                               blk_cleanup_queue(disks[drive][i]->queue);
+                               blk_cleanup_disk(disks[drive][i]);
                }
                blk_mq_free_tag_set(&tag_sets[drive]);
-
-               /*
-                * These disks have not called add_disk().  Don't put down
-                * queue reference in put_disk().
-                */
-               if (!(allowed_drive_mask & (1 << drive)) ||
-                   fdc_state[FDC(drive)].version == FDC_NONE) {
-                       for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
-                               if (disks[drive][i])
-                                       disks[drive][i]->queue = NULL;
-                       }
-               }
-
-               for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
-                       if (disks[drive][i])
-                               put_disk(disks[drive][i]);
-               }
        }
 
        cancel_delayed_work_sync(&fd_timeout);
index 7bf4686..3c09a33 100644
@@ -133,58 +133,6 @@ static void loop_global_unlock(struct loop_device *lo, bool global)
 static int max_part;
 static int part_shift;
 
-static int transfer_xor(struct loop_device *lo, int cmd,
-                       struct page *raw_page, unsigned raw_off,
-                       struct page *loop_page, unsigned loop_off,
-                       int size, sector_t real_block)
-{
-       char *raw_buf = kmap_atomic(raw_page) + raw_off;
-       char *loop_buf = kmap_atomic(loop_page) + loop_off;
-       char *in, *out, *key;
-       int i, keysize;
-
-       if (cmd == READ) {
-               in = raw_buf;
-               out = loop_buf;
-       } else {
-               in = loop_buf;
-               out = raw_buf;
-       }
-
-       key = lo->lo_encrypt_key;
-       keysize = lo->lo_encrypt_key_size;
-       for (i = 0; i < size; i++)
-               *out++ = *in++ ^ key[(i & 511) % keysize];
-
-       kunmap_atomic(loop_buf);
-       kunmap_atomic(raw_buf);
-       cond_resched();
-       return 0;
-}
-
-static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
-{
-       if (unlikely(info->lo_encrypt_key_size <= 0))
-               return -EINVAL;
-       return 0;
-}
-
-static struct loop_func_table none_funcs = {
-       .number = LO_CRYPT_NONE,
-}; 
-
-static struct loop_func_table xor_funcs = {
-       .number = LO_CRYPT_XOR,
-       .transfer = transfer_xor,
-       .init = xor_init
-}; 
-
-/* xfer_funcs[0] is special - its release function is never called */
-static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
-       &none_funcs,
-       &xor_funcs
-};
-
 static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
 {
        loff_t loopsize;
@@ -228,8 +176,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
        /*
         * We support direct I/O only if lo_offset is aligned with the
         * logical I/O size of backing device, and the logical block
-        * size of loop is bigger than the backing device's and the loop
-        * needn't transform transfer.
+        * size of loop is bigger than the backing device's.
         *
         * TODO: the above condition may be loosed in the future, and
         * direct I/O may be switched runtime at that time because most
@@ -238,8 +185,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
        if (dio) {
                if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
                                !(lo->lo_offset & dio_align) &&
-                               mapping->a_ops->direct_IO &&
-                               !lo->transfer)
+                               mapping->a_ops->direct_IO)
                        use_dio = true;
                else
                        use_dio = false;
@@ -273,19 +219,6 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
 }
 
 /**
- * loop_validate_block_size() - validates the passed in block size
- * @bsize: size to validate
- */
-static int
-loop_validate_block_size(unsigned short bsize)
-{
-       if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
-               return -EINVAL;
-
-       return 0;
-}
-
-/**
  * loop_set_size() - sets device size and notifies userspace
  * @lo: struct loop_device to set the size for
  * @size: new size of the loop device
@@ -299,24 +232,6 @@ static void loop_set_size(struct loop_device *lo, loff_t size)
                kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
 }
 
-static inline int
-lo_do_transfer(struct loop_device *lo, int cmd,
-              struct page *rpage, unsigned roffs,
-              struct page *lpage, unsigned loffs,
-              int size, sector_t rblock)
-{
-       int ret;
-
-       ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
-       if (likely(!ret))
-               return 0;
-
-       printk_ratelimited(KERN_ERR
-               "loop: Transfer error at byte offset %llu, length %i.\n",
-               (unsigned long long)rblock << 9, size);
-       return ret;
-}
-
 static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
 {
        struct iov_iter i;
@@ -356,41 +271,6 @@ static int lo_write_simple(struct loop_device *lo, struct request *rq,
        return ret;
 }
 
-/*
- * This is the slow, transforming version that needs to double buffer the
- * data as it cannot do the transformations in place without having direct
- * access to the destination pages of the backing file.
- */
-static int lo_write_transfer(struct loop_device *lo, struct request *rq,
-               loff_t pos)
-{
-       struct bio_vec bvec, b;
-       struct req_iterator iter;
-       struct page *page;
-       int ret = 0;
-
-       page = alloc_page(GFP_NOIO);
-       if (unlikely(!page))
-               return -ENOMEM;
-
-       rq_for_each_segment(bvec, rq, iter) {
-               ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page,
-                       bvec.bv_offset, bvec.bv_len, pos >> 9);
-               if (unlikely(ret))
-                       break;
-
-               b.bv_page = page;
-               b.bv_offset = 0;
-               b.bv_len = bvec.bv_len;
-               ret = lo_write_bvec(lo->lo_backing_file, &b, &pos);
-               if (ret < 0)
-                       break;
-       }
-
-       __free_page(page);
-       return ret;
-}
-
 static int lo_read_simple(struct loop_device *lo, struct request *rq,
                loff_t pos)
 {
@@ -420,64 +300,12 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq,
        return 0;
 }
 
-static int lo_read_transfer(struct loop_device *lo, struct request *rq,
-               loff_t pos)
-{
-       struct bio_vec bvec, b;
-       struct req_iterator iter;
-       struct iov_iter i;
-       struct page *page;
-       ssize_t len;
-       int ret = 0;
-
-       page = alloc_page(GFP_NOIO);
-       if (unlikely(!page))
-               return -ENOMEM;
-
-       rq_for_each_segment(bvec, rq, iter) {
-               loff_t offset = pos;
-
-               b.bv_page = page;
-               b.bv_offset = 0;
-               b.bv_len = bvec.bv_len;
-
-               iov_iter_bvec(&i, READ, &b, 1, b.bv_len);
-               len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
-               if (len < 0) {
-                       ret = len;
-                       goto out_free_page;
-               }
-
-               ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page,
-                       bvec.bv_offset, len, offset >> 9);
-               if (ret)
-                       goto out_free_page;
-
-               flush_dcache_page(bvec.bv_page);
-
-               if (len != bvec.bv_len) {
-                       struct bio *bio;
-
-                       __rq_for_each_bio(bio, rq)
-                               zero_fill_bio(bio);
-                       break;
-               }
-       }
-
-       ret = 0;
-out_free_page:
-       __free_page(page);
-       return ret;
-}
-
 static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
                        int mode)
 {
        /*
         * We use fallocate to manipulate the space mappings used by the image
-        * a.k.a. discard/zerorange. However we do not support this if
-        * encryption is enabled, because it may give an attacker useful
-        * information.
+        * a.k.a. discard/zerorange.
         */
        struct file *file = lo->lo_backing_file;
        struct request_queue *q = lo->lo_queue;
@@ -554,7 +382,7 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
                blk_mq_complete_request(rq);
 }
 
-static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
 {
        struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
@@ -627,7 +455,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
        lo_rw_aio_do_completion(cmd);
 
        if (ret != -EIOCBQUEUED)
-               cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+               lo_rw_aio_complete(&cmd->iocb, ret);
        return 0;
 }
 
@@ -660,16 +488,12 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
        case REQ_OP_DISCARD:
                return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
        case REQ_OP_WRITE:
-               if (lo->transfer)
-                       return lo_write_transfer(lo, rq, pos);
-               else if (cmd->use_aio)
+               if (cmd->use_aio)
                        return lo_rw_aio(lo, cmd, pos, WRITE);
                else
                        return lo_write_simple(lo, rq, pos);
        case REQ_OP_READ:
-               if (lo->transfer)
-                       return lo_read_transfer(lo, rq, pos);
-               else if (cmd->use_aio)
+               if (cmd->use_aio)
                        return lo_rw_aio(lo, cmd, pos, READ);
                else
                        return lo_read_simple(lo, rq, pos);
@@ -934,7 +758,7 @@ static void loop_config_discard(struct loop_device *lo)
         * not blkdev_issue_discard(). This maintains consistent behavior with
         * file-backed loop devices: discarded regions read back as zero.
         */
-       if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) {
+       if (S_ISBLK(inode->i_mode)) {
                struct request_queue *backingq = bdev_get_queue(I_BDEV(inode));
 
                max_discard_sectors = backingq->limits.max_write_zeroes_sectors;
@@ -943,11 +767,9 @@ static void loop_config_discard(struct loop_device *lo)
 
        /*
         * We use punch hole to reclaim the free space used by the
-        * image a.k.a. discard. However we do not support discard if
-        * encryption is enabled, because it may give an attacker
-        * useful information.
+        * image a.k.a. discard.
         */
-       } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) {
+       } else if (!file->f_op->fallocate) {
                max_discard_sectors = 0;
                granularity = 0;
 
@@ -1084,43 +906,6 @@ static void loop_update_rotational(struct loop_device *lo)
                blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
 }
 
-static int
-loop_release_xfer(struct loop_device *lo)
-{
-       int err = 0;
-       struct loop_func_table *xfer = lo->lo_encryption;
-
-       if (xfer) {
-               if (xfer->release)
-                       err = xfer->release(lo);
-               lo->transfer = NULL;
-               lo->lo_encryption = NULL;
-               module_put(xfer->owner);
-       }
-       return err;
-}
-
-static int
-loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
-              const struct loop_info64 *i)
-{
-       int err = 0;
-
-       if (xfer) {
-               struct module *owner = xfer->owner;
-
-               if (!try_module_get(owner))
-                       return -EINVAL;
-               if (xfer->init)
-                       err = xfer->init(lo, i);
-               if (err)
-                       module_put(owner);
-               else
-                       lo->lo_encryption = xfer;
-       }
-       return err;
-}
-
 /**
  * loop_set_status_from_info - configure device from loop_info
  * @lo: struct loop_device to configure
@@ -1133,55 +918,27 @@ static int
 loop_set_status_from_info(struct loop_device *lo,
                          const struct loop_info64 *info)
 {
-       int err;
-       struct loop_func_table *xfer;
-       kuid_t uid = current_uid();
-
        if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
                return -EINVAL;
 
-       err = loop_release_xfer(lo);
-       if (err)
-               return err;
-
-       if (info->lo_encrypt_type) {
-               unsigned int type = info->lo_encrypt_type;
-
-               if (type >= MAX_LO_CRYPT)
-                       return -EINVAL;
-               xfer = xfer_funcs[type];
-               if (xfer == NULL)
-                       return -EINVAL;
-       } else
-               xfer = NULL;
-
-       err = loop_init_xfer(lo, xfer, info);
-       if (err)
-               return err;
+       switch (info->lo_encrypt_type) {
+       case LO_CRYPT_NONE:
+               break;
+       case LO_CRYPT_XOR:
+               pr_warn("support for the xor transformation has been removed.\n");
+               return -EINVAL;
+       case LO_CRYPT_CRYPTOAPI:
+               pr_warn("support for cryptoloop has been removed.  Use dm-crypt instead.\n");
+               return -EINVAL;
+       default:
+               return -EINVAL;
+       }
 
        lo->lo_offset = info->lo_offset;
        lo->lo_sizelimit = info->lo_sizelimit;
        memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
-       memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
        lo->lo_file_name[LO_NAME_SIZE-1] = 0;
-       lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;
-
-       if (!xfer)
-               xfer = &none_funcs;
-       lo->transfer = xfer->transfer;
-       lo->ioctl = xfer->ioctl;
-
        lo->lo_flags = info->lo_flags;
-
-       lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
-       lo->lo_init[0] = info->lo_init[0];
-       lo->lo_init[1] = info->lo_init[1];
-       if (info->lo_encrypt_key_size) {
-               memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
-                      info->lo_encrypt_key_size);
-               lo->lo_key_owner = uid;
-       }
-
        return 0;
 }
 
@@ -1236,7 +993,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
        }
 
        if (config->block_size) {
-               error = loop_validate_block_size(config->block_size);
+               error = blk_validate_block_size(config->block_size);
                if (error)
                        goto out_unlock;
        }
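
loop_validate_block_size() is replaced by the generic blk_validate_block_size(); judging by the helper deleted earlier in this file, the accepted values are powers of two between 512 bytes and PAGE_SIZE. A sketch of the check the driver now delegates to the block core:

#include <linux/log2.h>
#include <linux/mm.h>

static int my_validate_block_size(unsigned int bsize)
{
        /* reject anything that is not a power of two in [512, PAGE_SIZE] */
        if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
                return -EINVAL;
        return 0;
}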
@@ -1329,7 +1086,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 {
        struct file *filp = NULL;
        gfp_t gfp = lo->old_gfp_mask;
-       struct block_device *bdev = lo->lo_device;
        int err = 0;
        bool partscan = false;
        int lo_number;
@@ -1381,36 +1137,23 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
        lo->lo_backing_file = NULL;
        spin_unlock_irq(&lo->lo_lock);
 
-       loop_release_xfer(lo);
-       lo->transfer = NULL;
-       lo->ioctl = NULL;
        lo->lo_device = NULL;
-       lo->lo_encryption = NULL;
        lo->lo_offset = 0;
        lo->lo_sizelimit = 0;
-       lo->lo_encrypt_key_size = 0;
-       memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
-       memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
        memset(lo->lo_file_name, 0, LO_NAME_SIZE);
        blk_queue_logical_block_size(lo->lo_queue, 512);
        blk_queue_physical_block_size(lo->lo_queue, 512);
        blk_queue_io_min(lo->lo_queue, 512);
-       if (bdev) {
-               invalidate_bdev(bdev);
-               bdev->bd_inode->i_mapping->wb_err = 0;
-       }
-       set_capacity(lo->lo_disk, 0);
+       invalidate_disk(lo->lo_disk);
        loop_sysfs_exit(lo);
-       if (bdev) {
-               /* let user-space know about this change */
-               kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
-       }
+       /* let user-space know about this change */
+       kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
        mapping_set_gfp_mask(filp->f_mapping, gfp);
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        blk_mq_unfreeze_queue(lo->lo_queue);
 
-       partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
+       partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
        lo_number = lo->lo_number;
        disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
 out_unlock:
@@ -1498,7 +1241,6 @@ static int
 loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 {
        int err;
-       kuid_t uid = current_uid();
        int prev_lo_flags;
        bool partscan = false;
        bool size_changed = false;
@@ -1506,12 +1248,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
        err = mutex_lock_killable(&lo->lo_mutex);
        if (err)
                return err;
-       if (lo->lo_encrypt_key_size &&
-           !uid_eq(lo->lo_key_owner, uid) &&
-           !capable(CAP_SYS_ADMIN)) {
-               err = -EPERM;
-               goto out_unlock;
-       }
        if (lo->lo_state != Lo_bound) {
                err = -ENXIO;
                goto out_unlock;
@@ -1597,14 +1333,6 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
        info->lo_sizelimit = lo->lo_sizelimit;
        info->lo_flags = lo->lo_flags;
        memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
-       memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
-       info->lo_encrypt_type =
-               lo->lo_encryption ? lo->lo_encryption->number : 0;
-       if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
-               info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
-               memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
-                      lo->lo_encrypt_key_size);
-       }
 
        /* Drop lo_mutex while we call into the filesystem. */
        path = lo->lo_backing_file->f_path;
@@ -1630,16 +1358,8 @@ loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
        info64->lo_rdevice = info->lo_rdevice;
        info64->lo_offset = info->lo_offset;
        info64->lo_sizelimit = 0;
-       info64->lo_encrypt_type = info->lo_encrypt_type;
-       info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
        info64->lo_flags = info->lo_flags;
-       info64->lo_init[0] = info->lo_init[0];
-       info64->lo_init[1] = info->lo_init[1];
-       if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-               memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
-       else
-               memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
-       memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
+       memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
 }
 
 static int
@@ -1651,16 +1371,8 @@ loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
        info->lo_inode = info64->lo_inode;
        info->lo_rdevice = info64->lo_rdevice;
        info->lo_offset = info64->lo_offset;
-       info->lo_encrypt_type = info64->lo_encrypt_type;
-       info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
        info->lo_flags = info64->lo_flags;
-       info->lo_init[0] = info64->lo_init[0];
-       info->lo_init[1] = info64->lo_init[1];
-       if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-               memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
-       else
-               memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
-       memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
+       memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
 
        /* error in case values were truncated */
        if (info->lo_device != info64->lo_device ||
@@ -1759,7 +1471,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
        if (lo->lo_state != Lo_bound)
                return -ENXIO;
 
-       err = loop_validate_block_size(arg);
+       err = blk_validate_block_size(arg);
        if (err)
                return err;
 
@@ -1809,7 +1521,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
                err = loop_set_block_size(lo, arg);
                break;
        default:
-               err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
+               err = -EINVAL;
        }
        mutex_unlock(&lo->lo_mutex);
        return err;
@@ -1885,7 +1597,6 @@ struct compat_loop_info {
        compat_ulong_t  lo_inode;       /* ioctl r/o */
        compat_dev_t    lo_rdevice;     /* ioctl r/o */
        compat_int_t    lo_offset;
-       compat_int_t    lo_encrypt_type;
        compat_int_t    lo_encrypt_key_size;    /* ioctl w/o */
        compat_int_t    lo_flags;       /* ioctl r/o */
        char            lo_name[LO_NAME_SIZE];
@@ -1914,16 +1625,8 @@ loop_info64_from_compat(const struct compat_loop_info __user *arg,
        info64->lo_rdevice = info.lo_rdevice;
        info64->lo_offset = info.lo_offset;
        info64->lo_sizelimit = 0;
-       info64->lo_encrypt_type = info.lo_encrypt_type;
-       info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
        info64->lo_flags = info.lo_flags;
-       info64->lo_init[0] = info.lo_init[0];
-       info64->lo_init[1] = info.lo_init[1];
-       if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-               memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
-       else
-               memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
-       memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
+       memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
        return 0;
 }
 
@@ -1943,24 +1646,14 @@ loop_info64_to_compat(const struct loop_info64 *info64,
        info.lo_inode = info64->lo_inode;
        info.lo_rdevice = info64->lo_rdevice;
        info.lo_offset = info64->lo_offset;
-       info.lo_encrypt_type = info64->lo_encrypt_type;
-       info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
        info.lo_flags = info64->lo_flags;
-       info.lo_init[0] = info64->lo_init[0];
-       info.lo_init[1] = info64->lo_init[1];
-       if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-               memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
-       else
-               memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
-       memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
+       memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
 
        /* error in case values were truncated */
        if (info.lo_device != info64->lo_device ||
            info.lo_rdevice != info64->lo_rdevice ||
            info.lo_inode != info64->lo_inode ||
-           info.lo_offset != info64->lo_offset ||
-           info.lo_init[0] != info64->lo_init[0] ||
-           info.lo_init[1] != info64->lo_init[1])
+           info.lo_offset != info64->lo_offset)
                return -EOVERFLOW;
 
        if (copy_to_user(arg, &info, sizeof(info)))
@@ -2101,43 +1794,6 @@ MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
 
-int loop_register_transfer(struct loop_func_table *funcs)
-{
-       unsigned int n = funcs->number;
-
-       if (n >= MAX_LO_CRYPT || xfer_funcs[n])
-               return -EINVAL;
-       xfer_funcs[n] = funcs;
-       return 0;
-}
-
-int loop_unregister_transfer(int number)
-{
-       unsigned int n = number;
-       struct loop_func_table *xfer;
-
-       if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
-               return -EINVAL;
-       /*
-        * This function is called from only cleanup_cryptoloop().
-        * Given that each loop device that has a transfer enabled holds a
-        * reference to the module implementing it we should never get here
-        * with a transfer that is set (unless forced module unloading is
-        * requested). Thus, check module's refcount and warn if this is
-        * not a clean unloading.
-        */
-#ifdef CONFIG_MODULE_UNLOAD
-       if (xfer->owner && module_refcount(xfer->owner) != -1)
-               pr_err("Danger! Unregistering an in use transfer function.\n");
-#endif
-
-       xfer_funcs[n] = NULL;
-       return 0;
-}
-
-EXPORT_SYMBOL(loop_register_transfer);
-EXPORT_SYMBOL(loop_unregister_transfer);
-
 static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
                const struct blk_mq_queue_data *bd)
 {
@@ -2394,13 +2050,19 @@ static int loop_add(int i)
        disk->event_flags       = DISK_EVENT_FLAG_UEVENT;
        sprintf(disk->disk_name, "loop%d", i);
        /* Make this loop device reachable from pathname. */
-       add_disk(disk);
+       err = add_disk(disk);
+       if (err)
+               goto out_cleanup_disk;
+
        /* Show this loop device. */
        mutex_lock(&loop_ctl_mutex);
        lo->idr_visible = true;
        mutex_unlock(&loop_ctl_mutex);
+
        return i;
 
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
 out_cleanup_tags:
        blk_mq_free_tag_set(&lo->tag_set);
 out_free_idr:
index 04c88dd..082d4b6 100644
@@ -32,23 +32,10 @@ struct loop_device {
        loff_t          lo_offset;
        loff_t          lo_sizelimit;
        int             lo_flags;
-       int             (*transfer)(struct loop_device *, int cmd,
-                                   struct page *raw_page, unsigned raw_off,
-                                   struct page *loop_page, unsigned loop_off,
-                                   int size, sector_t real_block);
        char            lo_file_name[LO_NAME_SIZE];
-       char            lo_crypt_name[LO_NAME_SIZE];
-       char            lo_encrypt_key[LO_KEY_SIZE];
-       int             lo_encrypt_key_size;
-       struct loop_func_table *lo_encryption;
-       __u32           lo_init[2];
-       kuid_t          lo_key_owner;   /* Who set the key */
-       int             (*ioctl)(struct loop_device *, int cmd, 
-                                unsigned long arg); 
 
        struct file *   lo_backing_file;
        struct block_device *lo_device;
-       void            *key_data; 
 
        gfp_t           old_gfp_mask;
 
@@ -82,21 +69,4 @@ struct loop_cmd {
        struct cgroup_subsys_state *memcg_css;
 };
 
-/* Support for loadable transfer modules */
-struct loop_func_table {
-       int number;     /* filter type */ 
-       int (*transfer)(struct loop_device *lo, int cmd,
-                       struct page *raw_page, unsigned raw_off,
-                       struct page *loop_page, unsigned loop_off,
-                       int size, sector_t real_block);
-       int (*init)(struct loop_device *, const struct loop_info64 *); 
-       /* release is called from loop_unregister_transfer or clr_fd */
-       int (*release)(struct loop_device *); 
-       int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
-       struct module *owner;
-}; 
-
-int loop_register_transfer(struct loop_func_table *funcs);
-int loop_unregister_transfer(int number); 
-
 #endif
index 9018557..c91b901 100644
@@ -3633,7 +3633,9 @@ skip_create_disk:
        set_capacity(dd->disk, capacity);
 
        /* Enable the block device and add it to /dev */
-       device_add_disk(&dd->pdev->dev, dd->disk, mtip_disk_attr_groups);
+       rv = device_add_disk(&dd->pdev->dev, dd->disk, mtip_disk_attr_groups);
+       if (rv)
+               goto read_capacity_error;
 
        if (dd->mtip_svc_handler) {
                set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
@@ -4061,7 +4063,6 @@ block_initialize_err:
 
 msi_initialize_err:
        if (dd->isr_workq) {
-               flush_workqueue(dd->isr_workq);
                destroy_workqueue(dd->isr_workq);
                drop_cpu(dd->work[0].cpu_binding);
                drop_cpu(dd->work[1].cpu_binding);
@@ -4119,7 +4120,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
        mtip_block_remove(dd);
 
        if (dd->isr_workq) {
-               flush_workqueue(dd->isr_workq);
                destroy_workqueue(dd->isr_workq);
                drop_cpu(dd->work[0].cpu_binding);
                drop_cpu(dd->work[1].cpu_binding);
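
Both mtip32xx hunks drop the flush_workqueue() that preceded destroy_workqueue(): destroy_workqueue() already drains all pending work before tearing the workqueue down, so the teardown reduces to a sketch like:

        if (dd->isr_workq)
                destroy_workqueue(dd->isr_workq);       /* implicitly drains queued work */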
index 26798da..78282f0 100644
@@ -84,7 +84,7 @@ static bool n64cart_do_bvec(struct device *dev, struct bio_vec *bv, u32 pos)
        return true;
 }
 
-static blk_qc_t n64cart_submit_bio(struct bio *bio)
+static void n64cart_submit_bio(struct bio *bio)
 {
        struct bio_vec bvec;
        struct bvec_iter iter;
@@ -92,16 +92,14 @@ static blk_qc_t n64cart_submit_bio(struct bio *bio)
        u32 pos = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 
        bio_for_each_segment(bvec, bio, iter) {
-               if (!n64cart_do_bvec(dev, &bvec, pos))
-                       goto io_error;
+               if (!n64cart_do_bvec(dev, &bvec, pos)) {
+                       bio_io_error(bio);
+                       return;
+               }
                pos += bvec.bv_len;
        }
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
-io_error:
-       bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static const struct block_device_operations n64cart_fops = {
@@ -117,6 +115,7 @@ static const struct block_device_operations n64cart_fops = {
 static int __init n64cart_probe(struct platform_device *pdev)
 {
        struct gendisk *disk;
+       int err = -ENOMEM;
 
        if (!start || !size) {
                pr_err("start or size not specified\n");
@@ -134,7 +133,7 @@ static int __init n64cart_probe(struct platform_device *pdev)
 
        disk = blk_alloc_disk(NUMA_NO_NODE);
        if (!disk)
-               return -ENOMEM;
+               goto out;
 
        disk->first_minor = 0;
        disk->flags = GENHD_FL_NO_PART_SCAN;
@@ -149,11 +148,18 @@ static int __init n64cart_probe(struct platform_device *pdev)
        blk_queue_physical_block_size(disk->queue, 4096);
        blk_queue_logical_block_size(disk->queue, 4096);
 
-       add_disk(disk);
+       err = add_disk(disk);
+       if (err)
+               goto out_cleanup_disk;
 
        pr_info("n64cart: %u kb disk\n", size / 1024);
 
        return 0;
+
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
+out:
+       return err;
 }
 
 static struct platform_driver n64cart_driver = {
index 1183f78..b47b2a8 100644
@@ -122,15 +122,21 @@ struct nbd_device {
        struct work_struct remove_work;
 
        struct list_head list;
-       struct task_struct *task_recv;
        struct task_struct *task_setup;
 
        unsigned long flags;
+       pid_t pid; /* pid of nbd-client, if attached */
 
        char *backend;
 };
 
 #define NBD_CMD_REQUEUED       1
+/*
+ * This flag will be set if nbd_queue_rq() succeeds, and will be checked and
+ * cleared on completion. Both setting and clearing of the flag are protected
+ * by cmd->lock.
+ */
+#define NBD_CMD_INFLIGHT       2
 
 struct nbd_cmd {
        struct nbd_device *nbd;
@@ -217,7 +223,7 @@ static ssize_t pid_show(struct device *dev,
        struct gendisk *disk = dev_to_disk(dev);
        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 
-       return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
+       return sprintf(buf, "%d\n", nbd->pid);
 }
 
 static const struct device_attribute pid_attr = {
@@ -310,26 +316,19 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
        nsock->sent = 0;
 }
 
-static void nbd_size_clear(struct nbd_device *nbd)
-{
-       if (nbd->config->bytesize) {
-               set_capacity(nbd->disk, 0);
-               kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
-       }
-}
-
 static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
                loff_t blksize)
 {
        if (!blksize)
                blksize = 1u << NBD_DEF_BLKSIZE_BITS;
-       if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize))
+
+       if (blk_validate_block_size(blksize))
                return -EINVAL;
 
        nbd->config->bytesize = bytesize;
        nbd->config->blksize_bits = __ffs(blksize);
 
-       if (!nbd->task_recv)
+       if (!nbd->pid)
                return 0;
 
        if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
@@ -405,6 +404,11 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
        if (!mutex_trylock(&cmd->lock))
                return BLK_EH_RESET_TIMER;
 
+       if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+               mutex_unlock(&cmd->lock);
+               return BLK_EH_DONE;
+       }
+
        if (!refcount_inc_not_zero(&nbd->config_refs)) {
                cmd->status = BLK_STS_TIMEOUT;
                mutex_unlock(&cmd->lock);
@@ -484,7 +488,8 @@ done:
 }
 
 /*
- *  Send or receive packet.
+ *  Send or receive packet. Return a positive value on success and
+ *  a negative value on failure, and never return 0.
  */
 static int sock_xmit(struct nbd_device *nbd, int index, int send,
                     struct iov_iter *iter, int msg_flags, int *sent)
@@ -610,7 +615,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
        result = sock_xmit(nbd, index, 1, &from,
                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
        trace_nbd_header_sent(req, handle);
-       if (result <= 0) {
+       if (result < 0) {
                if (was_interrupted(result)) {
                        /* If we haven't sent anything we can just return BUSY,
                         * however if we have sent something we need to make
@@ -654,7 +659,7 @@ send_pages:
                                skip = 0;
                        }
                        result = sock_xmit(nbd, index, 1, &from, flags, &sent);
-                       if (result <= 0) {
+                       if (result < 0) {
                                if (was_interrupted(result)) {
                                        /* We've already sent the header, we
                                         * have no choice but to set pending and
@@ -688,38 +693,45 @@ out:
        return 0;
 }
 
-/* NULL returned = something went wrong, inform userspace */
-static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
+static int nbd_read_reply(struct nbd_device *nbd, int index,
+                         struct nbd_reply *reply)
 {
-       struct nbd_config *config = nbd->config;
-       int result;
-       struct nbd_reply reply;
-       struct nbd_cmd *cmd;
-       struct request *req = NULL;
-       u64 handle;
-       u16 hwq;
-       u32 tag;
-       struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
+       struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)};
        struct iov_iter to;
-       int ret = 0;
+       int result;
 
-       reply.magic = 0;
-       iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
+       reply->magic = 0;
+       iov_iter_kvec(&to, READ, &iov, 1, sizeof(*reply));
        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
-       if (result <= 0) {
-               if (!nbd_disconnected(config))
+       if (result < 0) {
+               if (!nbd_disconnected(nbd->config))
                        dev_err(disk_to_dev(nbd->disk),
                                "Receive control failed (result %d)\n", result);
-               return ERR_PTR(result);
+               return result;
        }
 
-       if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
+       if (ntohl(reply->magic) != NBD_REPLY_MAGIC) {
                dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
-                               (unsigned long)ntohl(reply.magic));
-               return ERR_PTR(-EPROTO);
+                               (unsigned long)ntohl(reply->magic));
+               return -EPROTO;
        }
 
-       memcpy(&handle, reply.handle, sizeof(handle));
+       return 0;
+}
+
+/* NULL returned = something went wrong, inform userspace */
+static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
+                                       struct nbd_reply *reply)
+{
+       int result;
+       struct nbd_cmd *cmd;
+       struct request *req = NULL;
+       u64 handle;
+       u16 hwq;
+       u32 tag;
+       int ret = 0;
+
+       memcpy(&handle, reply->handle, sizeof(handle));
        tag = nbd_handle_to_tag(handle);
        hwq = blk_mq_unique_tag_to_hwq(tag);
        if (hwq < nbd->tag_set.nr_hw_queues)
@@ -734,6 +746,16 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
        cmd = blk_mq_rq_to_pdu(req);
 
        mutex_lock(&cmd->lock);
+       if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+               dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
+                       tag, cmd->status, cmd->flags);
+               ret = -ENOENT;
+               goto out;
+       }
+       if (cmd->index != index) {
+               dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)",
+                       tag, index, cmd->index);
+       }
        if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
                dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
                        req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
@@ -752,9 +774,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
                ret = -ENOENT;
                goto out;
        }
-       if (ntohl(reply.error)) {
+       if (ntohl(reply->error)) {
                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
-                       ntohl(reply.error));
+                       ntohl(reply->error));
                cmd->status = BLK_STS_IOERR;
                goto out;
        }
@@ -763,11 +785,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
        if (rq_data_dir(req) != WRITE) {
                struct req_iterator iter;
                struct bio_vec bvec;
+               struct iov_iter to;
 
                rq_for_each_segment(bvec, req, iter) {
                        iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
                        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
-                       if (result <= 0) {
+                       if (result < 0) {
                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
                                        result);
                                /*
@@ -776,7 +799,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
                                 * and let the timeout stuff handle resubmitting
                                 * this request onto another connection.
                                 */
-                               if (nbd_disconnected(config)) {
+                               if (nbd_disconnected(nbd->config)) {
                                        cmd->status = BLK_STS_IOERR;
                                        goto out;
                                }
@@ -800,24 +823,46 @@ static void recv_work(struct work_struct *work)
                                                     work);
        struct nbd_device *nbd = args->nbd;
        struct nbd_config *config = nbd->config;
+       struct request_queue *q = nbd->disk->queue;
+       struct nbd_sock *nsock;
        struct nbd_cmd *cmd;
        struct request *rq;
 
        while (1) {
-               cmd = nbd_read_stat(nbd, args->index);
-               if (IS_ERR(cmd)) {
-                       struct nbd_sock *nsock = config->socks[args->index];
+               struct nbd_reply reply;
 
-                       mutex_lock(&nsock->tx_lock);
-                       nbd_mark_nsock_dead(nbd, nsock, 1);
-                       mutex_unlock(&nsock->tx_lock);
+               if (nbd_read_reply(nbd, args->index, &reply))
+                       break;
+
+               /*
+                * Grab .q_usage_counter so request pool won't go away, then no
+                * request use-after-free is possible during nbd_handle_reply().
+                * If the queue is frozen, there won't be any inflight requests, so
+                * we need not handle the incoming garbage message.
+                */
+               if (!percpu_ref_tryget(&q->q_usage_counter)) {
+                       dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n",
+                               __func__);
+                       break;
+               }
+
+               cmd = nbd_handle_reply(nbd, args->index, &reply);
+               if (IS_ERR(cmd)) {
+                       percpu_ref_put(&q->q_usage_counter);
                        break;
                }
 
                rq = blk_mq_rq_from_pdu(cmd);
                if (likely(!blk_should_fake_timeout(rq->q)))
                        blk_mq_complete_request(rq);
+               percpu_ref_put(&q->q_usage_counter);
        }
+
+       nsock = config->socks[args->index];
+       mutex_lock(&nsock->tx_lock);
+       nbd_mark_nsock_dead(nbd, nsock, 1);
+       mutex_unlock(&nsock->tx_lock);
+
        nbd_config_put(nbd);
        atomic_dec(&config->recv_threads);
        wake_up(&config->recv_wq);
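
The recv_work() rework pins the queue's q_usage_counter around each reply so the request pool cannot go away while nbd_handle_reply() is dereferencing a request. A condensed sketch of that guard; my_handle_one_reply() stands in for the reply processing shown above:

static void my_recv_one(struct nbd_device *nbd, struct request_queue *q, int index)
{
        /* if the queue is frozen there are no inflight requests to match */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        my_handle_one_reply(nbd, index);        /* may look up and complete a request */

        percpu_ref_put(&q->q_usage_counter);
}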
@@ -833,6 +878,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved)
                return true;
 
        mutex_lock(&cmd->lock);
+       if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+               mutex_unlock(&cmd->lock);
+               return true;
+       }
        cmd->status = BLK_STS_IOERR;
        mutex_unlock(&cmd->lock);
 
@@ -914,7 +963,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
        if (!refcount_inc_not_zero(&nbd->config_refs)) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Socks array is empty\n");
-               blk_mq_start_request(req);
                return -EINVAL;
        }
        config = nbd->config;
@@ -923,7 +971,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Attempted send on invalid socket\n");
                nbd_config_put(nbd);
-               blk_mq_start_request(req);
                return -EINVAL;
        }
        cmd->status = BLK_STS_OK;
@@ -947,7 +994,6 @@ again:
                         */
                        sock_shutdown(nbd);
                        nbd_config_put(nbd);
-                       blk_mq_start_request(req);
                        return -EIO;
                }
                goto again;
@@ -969,7 +1015,13 @@ again:
         * returns EAGAIN can be retried on a different socket.
         */
        ret = nbd_send_cmd(nbd, cmd, index);
-       if (ret == -EAGAIN) {
+       /*
+        * Access to this flag is protected by cmd->lock, so it is safe to set
+        * the flag after nbd_send_cmd() has successfully sent the request.
+        */
+       if (!ret)
+               __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+       else if (ret == -EAGAIN) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Request send failed, requeueing\n");
                nbd_mark_nsock_dead(nbd, nsock, 1);
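
Taken together, the NBD_CMD_INFLIGHT changes form a small ownership hand-off: the submission path sets the bit under cmd->lock once the request has actually been sent, and each completion-like path (reply, timeout, queue clearing) may only finish the command if it can atomically clear the bit again. A condensed sketch against the driver's own struct nbd_cmd; my_mark_inflight() and my_try_complete() are illustrative:

/* submission side, called with cmd->lock held after nbd_send_cmd() succeeded */
static void my_mark_inflight(struct nbd_cmd *cmd)
{
        __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
}

/* any completion path: reply handling, timeout, nbd_clear_req() */
static bool my_try_complete(struct nbd_cmd *cmd, blk_status_t status)
{
        bool owned;

        mutex_lock(&cmd->lock);
        owned = __test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
        if (owned)
                cmd->status = status;   /* only the winner touches the command */
        mutex_unlock(&cmd->lock);
        return owned;
}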
@@ -1206,7 +1258,7 @@ static void send_disconnects(struct nbd_device *nbd)
                iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
                mutex_lock(&nsock->tx_lock);
                ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
-               if (ret <= 0)
+               if (ret < 0)
                        dev_err(disk_to_dev(nbd->disk),
                                "Send disconnect failed %d\n", ret);
                mutex_unlock(&nsock->tx_lock);
@@ -1237,11 +1289,13 @@ static void nbd_config_put(struct nbd_device *nbd)
                                        &nbd->config_lock)) {
                struct nbd_config *config = nbd->config;
                nbd_dev_dbg_close(nbd);
-               nbd_size_clear(nbd);
+               invalidate_disk(nbd->disk);
+               if (nbd->config->bytesize)
+                       kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
                if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
                                       &config->runtime_flags))
                        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-               nbd->task_recv = NULL;
+               nbd->pid = 0;
                if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE,
                                       &config->runtime_flags)) {
                        device_remove_file(disk_to_dev(nbd->disk), &backend_attr);
@@ -1282,7 +1336,7 @@ static int nbd_start_device(struct nbd_device *nbd)
        int num_connections = config->num_connections;
        int error = 0, i;
 
-       if (nbd->task_recv)
+       if (nbd->pid)
                return -EBUSY;
        if (!config->socks)
                return -EINVAL;
@@ -1301,7 +1355,7 @@ static int nbd_start_device(struct nbd_device *nbd)
        }
 
        blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
-       nbd->task_recv = current;
+       nbd->pid = task_pid_nr(current);
 
        nbd_parse_flags(nbd);
 
@@ -1557,8 +1611,8 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
 {
        struct nbd_device *nbd = s->private;
 
-       if (nbd->task_recv)
-               seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
+       if (nbd->pid)
+               seq_printf(s, "recv: %d\n", nbd->pid);
 
        return 0;
 }
@@ -1762,7 +1816,9 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
        disk->fops = &nbd_fops;
        disk->private_data = nbd;
        sprintf(disk->disk_name, "nbd%d", index);
-       add_disk(disk);
+       err = add_disk(disk);
+       if (err)
+               goto out_err_disk;
 
        /*
         * Now publish the device.
@@ -1771,6 +1827,8 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
        nbd_total_devices++;
        return nbd;
 
+out_err_disk:
+       blk_cleanup_disk(disk);
 out_free_idr:
        mutex_lock(&nbd_index_mutex);
        idr_remove(&nbd_index_idr, index);
@@ -2135,7 +2193,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
        mutex_lock(&nbd->config_lock);
        config = nbd->config;
        if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
-           !nbd->task_recv) {
+           !nbd->pid) {
                dev_err(nbd_to_dev(nbd),
                        "not configured, cannot reconfigure\n");
                ret = -EINVAL;
index 187d779..323af5c 100644
@@ -92,6 +92,10 @@ static int g_submit_queues = 1;
 module_param_named(submit_queues, g_submit_queues, int, 0444);
 MODULE_PARM_DESC(submit_queues, "Number of submission queues");
 
+static int g_poll_queues = 1;
+module_param_named(poll_queues, g_poll_queues, int, 0444);
+MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues");
+
 static int g_home_node = NUMA_NO_NODE;
 module_param_named(home_node, g_home_node, int, 0444);
 MODULE_PARM_DESC(home_node, "Home node for the device");
@@ -324,29 +328,69 @@ nullb_device_##NAME##_store(struct config_item *item, const char *page,   \
 }                                                                      \
 CONFIGFS_ATTR(nullb_device_, NAME);
 
-static int nullb_apply_submit_queues(struct nullb_device *dev,
-                                    unsigned int submit_queues)
+static int nullb_update_nr_hw_queues(struct nullb_device *dev,
+                                    unsigned int submit_queues,
+                                    unsigned int poll_queues)
+
 {
-       struct nullb *nullb = dev->nullb;
        struct blk_mq_tag_set *set;
+       int ret, nr_hw_queues;
 
-       if (!nullb)
+       if (!dev->nullb)
                return 0;
 
        /*
+        * Make sure at least one queue exists for each of submit and poll.
+        */
+       if (!submit_queues || !poll_queues)
+               return -EINVAL;
+
+       /*
         * Make sure that null_init_hctx() does not access nullb->queues[] past
         * the end of that array.
         */
-       if (submit_queues > nr_cpu_ids)
+       if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues)
                return -EINVAL;
-       set = nullb->tag_set;
-       blk_mq_update_nr_hw_queues(set, submit_queues);
-       return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM;
+
+       /*
+        * Keep previous and new queue numbers in nullb_device for reference in
+        * the call back function null_map_queues().
+        */
+       dev->prev_submit_queues = dev->submit_queues;
+       dev->prev_poll_queues = dev->poll_queues;
+       dev->submit_queues = submit_queues;
+       dev->poll_queues = poll_queues;
+
+       set = dev->nullb->tag_set;
+       nr_hw_queues = submit_queues + poll_queues;
+       blk_mq_update_nr_hw_queues(set, nr_hw_queues);
+       ret = set->nr_hw_queues == nr_hw_queues ? 0 : -ENOMEM;
+
+       if (ret) {
+               /* on error, revert the queue numbers */
+               dev->submit_queues = dev->prev_submit_queues;
+               dev->poll_queues = dev->prev_poll_queues;
+       }
+
+       return ret;
+}
+
+static int nullb_apply_submit_queues(struct nullb_device *dev,
+                                    unsigned int submit_queues)
+{
+       return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
+}
+
+static int nullb_apply_poll_queues(struct nullb_device *dev,
+                                  unsigned int poll_queues)
+{
+       return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
 }
 
 NULLB_DEVICE_ATTR(size, ulong, NULL);
 NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
 NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
+NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues);
 NULLB_DEVICE_ATTR(home_node, uint, NULL);
 NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
 NULLB_DEVICE_ATTR(blocksize, uint, NULL);
@@ -466,6 +510,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
        &nullb_device_attr_size,
        &nullb_device_attr_completion_nsec,
        &nullb_device_attr_submit_queues,
+       &nullb_device_attr_poll_queues,
        &nullb_device_attr_home_node,
        &nullb_device_attr_queue_mode,
        &nullb_device_attr_blocksize,
@@ -593,6 +638,9 @@ static struct nullb_device *null_alloc_dev(void)
        dev->size = g_gb * 1024;
        dev->completion_nsec = g_completion_nsec;
        dev->submit_queues = g_submit_queues;
+       dev->prev_submit_queues = g_submit_queues;
+       dev->poll_queues = g_poll_queues;
+       dev->prev_poll_queues = g_poll_queues;
        dev->home_node = g_home_node;
        dev->queue_mode = g_queue_mode;
        dev->blocksize = g_bs;
@@ -1422,7 +1470,7 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
        return &nullb->queues[index];
 }
 
-static blk_qc_t null_submit_bio(struct bio *bio)
+static void null_submit_bio(struct bio *bio)
 {
        sector_t sector = bio->bi_iter.bi_sector;
        sector_t nr_sectors = bio_sectors(bio);
@@ -1434,7 +1482,6 @@ static blk_qc_t null_submit_bio(struct bio *bio)
        cmd->bio = bio;
 
        null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio));
-       return BLK_QC_T_NONE;
 }
 
 static bool should_timeout_request(struct request *rq)
@@ -1455,12 +1502,100 @@ static bool should_requeue_request(struct request *rq)
        return false;
 }
 
+static int null_map_queues(struct blk_mq_tag_set *set)
+{
+       struct nullb *nullb = set->driver_data;
+       int i, qoff;
+       unsigned int submit_queues = g_submit_queues;
+       unsigned int poll_queues = g_poll_queues;
+
+       if (nullb) {
+               struct nullb_device *dev = nullb->dev;
+
+               /*
+                * Refer nr_hw_queues of the tag set to check if the expected
+                * number of hardware queues are prepared. If block layer failed
+                * to prepare them, use previous numbers of submit queues and
+                * poll queues to map queues.
+                */
+               if (set->nr_hw_queues ==
+                   dev->submit_queues + dev->poll_queues) {
+                       submit_queues = dev->submit_queues;
+                       poll_queues = dev->poll_queues;
+               } else if (set->nr_hw_queues ==
+                          dev->prev_submit_queues + dev->prev_poll_queues) {
+                       submit_queues = dev->prev_submit_queues;
+                       poll_queues = dev->prev_poll_queues;
+               } else {
+                       pr_warn("tag set has unexpected nr_hw_queues: %d\n",
+                               set->nr_hw_queues);
+                       return -EINVAL;
+               }
+       }
+
+       for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+               struct blk_mq_queue_map *map = &set->map[i];
+
+               switch (i) {
+               case HCTX_TYPE_DEFAULT:
+                       map->nr_queues = submit_queues;
+                       break;
+               case HCTX_TYPE_READ:
+                       map->nr_queues = 0;
+                       continue;
+               case HCTX_TYPE_POLL:
+                       map->nr_queues = poll_queues;
+                       break;
+               }
+               map->queue_offset = qoff;
+               qoff += map->nr_queues;
+               blk_mq_map_queues(map);
+       }
+
+       return 0;
+}
+
+static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+{
+       struct nullb_queue *nq = hctx->driver_data;
+       LIST_HEAD(list);
+       int nr = 0;
+
+       spin_lock(&nq->poll_lock);
+       list_splice_init(&nq->poll_list, &list);
+       spin_unlock(&nq->poll_lock);
+
+       while (!list_empty(&list)) {
+               struct nullb_cmd *cmd;
+               struct request *req;
+
+               req = list_first_entry(&list, struct request, queuelist);
+               list_del_init(&req->queuelist);
+               cmd = blk_mq_rq_to_pdu(req);
+               cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req),
+                                               blk_rq_sectors(req));
+               end_cmd(cmd);
+               nr++;
+       }
+
+       return nr;
+}
+
 static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
 {
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
        pr_info("rq %p timed out\n", rq);
 
+       if (hctx->type == HCTX_TYPE_POLL) {
+               struct nullb_queue *nq = hctx->driver_data;
+
+               spin_lock(&nq->poll_lock);
+               list_del_init(&rq->queuelist);
+               spin_unlock(&nq->poll_lock);
+       }
+
        /*
         * If the device is marked as blocking (i.e. memory backed or zoned
         * device), the submission path may be blocked waiting for resources
@@ -1481,10 +1616,11 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
        struct nullb_queue *nq = hctx->driver_data;
        sector_t nr_sectors = blk_rq_sectors(bd->rq);
        sector_t sector = blk_rq_pos(bd->rq);
+       const bool is_poll = hctx->type == HCTX_TYPE_POLL;
 
        might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-       if (nq->dev->irqmode == NULL_IRQ_TIMER) {
+       if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) {
                hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                cmd->timer.function = null_cmd_timer_expired;
        }
@@ -1508,6 +1644,13 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
                        return BLK_STS_OK;
                }
        }
+
+       if (is_poll) {
+               spin_lock(&nq->poll_lock);
+               list_add_tail(&bd->rq->queuelist, &nq->poll_list);
+               spin_unlock(&nq->poll_lock);
+               return BLK_STS_OK;
+       }
        if (cmd->fake_timeout)
                return BLK_STS_OK;
 
@@ -1543,6 +1686,8 @@ static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
        init_waitqueue_head(&nq->wait);
        nq->queue_depth = nullb->queue_depth;
        nq->dev = nullb->dev;
+       INIT_LIST_HEAD(&nq->poll_list);
+       spin_lock_init(&nq->poll_lock);
 }
 
 static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
@@ -1568,6 +1713,8 @@ static const struct blk_mq_ops null_mq_ops = {
        .queue_rq       = null_queue_rq,
        .complete       = null_complete_rq,
        .timeout        = null_timeout_rq,
+       .poll           = null_poll,
+       .map_queues     = null_map_queues,
        .init_hctx      = null_init_hctx,
        .exit_hctx      = null_exit_hctx,
 };
@@ -1664,13 +1811,17 @@ static int setup_commands(struct nullb_queue *nq)
 
 static int setup_queues(struct nullb *nullb)
 {
-       nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue),
+       int nqueues = nr_cpu_ids;
+
+       if (g_poll_queues)
+               nqueues += g_poll_queues;
+
+       nullb->queues = kcalloc(nqueues, sizeof(struct nullb_queue),
                                GFP_KERNEL);
        if (!nullb->queues)
                return -ENOMEM;
 
        nullb->queue_depth = nullb->dev->hw_queue_depth;
-
        return 0;
 }
 
@@ -1722,9 +1873,14 @@ static int null_gendisk_register(struct nullb *nullb)
 
 static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
 {
+       int poll_queues;
+
        set->ops = &null_mq_ops;
        set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
                                                g_submit_queues;
+       poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues;
+       if (poll_queues)
+               set->nr_hw_queues += poll_queues;
        set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
                                                g_hw_queue_depth;
        set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
@@ -1734,7 +1890,11 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
                set->flags |= BLK_MQ_F_NO_SCHED;
        if (g_shared_tag_bitmap)
                set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
-       set->driver_data = NULL;
+       set->driver_data = nullb;
+       if (g_poll_queues)
+               set->nr_maps = 3;
+       else
+               set->nr_maps = 1;
 
        if ((nullb && nullb->dev->blocking) || g_blocking)
                set->flags |= BLK_MQ_F_BLOCKING;
@@ -1754,6 +1914,13 @@ static int null_validate_conf(struct nullb_device *dev)
                dev->submit_queues = nr_cpu_ids;
        else if (dev->submit_queues == 0)
                dev->submit_queues = 1;
+       dev->prev_submit_queues = dev->submit_queues;
+
+       if (dev->poll_queues > g_poll_queues)
+               dev->poll_queues = g_poll_queues;
+       else if (dev->poll_queues == 0)
+               dev->poll_queues = 1;
+       dev->prev_poll_queues = dev->poll_queues;
 
        dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
        dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
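
The new null_map_queues() above lays poll queues out after the submit queues in the tag set's hardware-queue space: HCTX_TYPE_DEFAULT covers the first submit_queues entries, HCTX_TYPE_READ gets no dedicated queues, and HCTX_TYPE_POLL covers the remaining poll_queues entries, which is why null_init_tag_set() sizes nr_hw_queues to the sum and sets nr_maps to 3. A small standalone model of the offset arithmetic follows; the enum and struct are local stand-ins for the blk-mq types, and the example counts are arbitrary.

    #include <stdio.h>

    enum { TYPE_DEFAULT, TYPE_READ, TYPE_POLL, NR_TYPES };

    struct queue_map {
        unsigned int nr_queues;
        unsigned int queue_offset;
    };

    /* Mirrors the loop in null_map_queues(): poll queues follow submit queues. */
    static void map_queues(struct queue_map map[NR_TYPES],
                           unsigned int submit_queues, unsigned int poll_queues)
    {
        unsigned int qoff = 0;

        for (int i = 0; i < NR_TYPES; i++) {
            switch (i) {
            case TYPE_DEFAULT:
                map[i].nr_queues = submit_queues;
                break;
            case TYPE_READ:
                map[i].nr_queues = 0;
                continue;       /* no dedicated read queues */
            case TYPE_POLL:
                map[i].nr_queues = poll_queues;
                break;
            }
            map[i].queue_offset = qoff;
            qoff += map[i].nr_queues;
        }
    }

    int main(void)
    {
        struct queue_map map[NR_TYPES] = { { 0, 0 } };

        map_queues(map, 4, 2);  /* e.g. submit_queues=4, poll_queues=2 */
        for (int i = 0; i < NR_TYPES; i++)
            printf("map %d: nr_queues=%u queue_offset=%u\n",
                   i, map[i].nr_queues, map[i].queue_offset);
        return 0;
    }

With submit_queues=4 and poll_queues=2 the poll map starts at hardware queue 4, matching the six hardware queues the tag set would be sized for.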
index 64bef12..78eb56b 100644 (file)
@@ -32,6 +32,9 @@ struct nullb_queue {
        struct nullb_device *dev;
        unsigned int requeue_selection;
 
+       struct list_head poll_list;
+       spinlock_t poll_lock;
+
        struct nullb_cmd *cmds;
 };
 
@@ -83,6 +86,9 @@ struct nullb_device {
        unsigned int zone_max_open; /* max number of open zones */
        unsigned int zone_max_active; /* max number of active zones */
        unsigned int submit_queues; /* number of submission queues */
+       unsigned int prev_submit_queues; /* number of submission queues before change */
+       unsigned int poll_queues; /* number of IOPOLL submission queues */
+       unsigned int prev_poll_queues; /* number of IOPOLL submission queues before change */
        unsigned int home_node; /* home node for the device */
        unsigned int queue_mode; /* block interface */
        unsigned int blocksize; /* block size */
index f9cdd11..f6b1d63 100644 (file)
@@ -183,8 +183,6 @@ static int pcd_audio_ioctl(struct cdrom_device_info *cdi,
 static int pcd_packet(struct cdrom_device_info *cdi,
                      struct packet_command *cgc);
 
-static int pcd_detect(void);
-static void pcd_probe_capabilities(void);
 static void do_pcd_read_drq(void);
 static blk_status_t pcd_queue_rq(struct blk_mq_hw_ctx *hctx,
                                 const struct blk_mq_queue_data *bd);
@@ -302,53 +300,6 @@ static const struct blk_mq_ops pcd_mq_ops = {
        .queue_rq       = pcd_queue_rq,
 };
 
-static void pcd_init_units(void)
-{
-       struct pcd_unit *cd;
-       int unit;
-
-       pcd_drive_count = 0;
-       for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               struct gendisk *disk;
-
-               if (blk_mq_alloc_sq_tag_set(&cd->tag_set, &pcd_mq_ops, 1,
-                               BLK_MQ_F_SHOULD_MERGE))
-                       continue;
-
-               disk = blk_mq_alloc_disk(&cd->tag_set, cd);
-               if (IS_ERR(disk)) {
-                       blk_mq_free_tag_set(&cd->tag_set);
-                       continue;
-               }
-
-               INIT_LIST_HEAD(&cd->rq_list);
-               blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
-               cd->disk = disk;
-               cd->pi = &cd->pia;
-               cd->present = 0;
-               cd->last_sense = 0;
-               cd->changed = 1;
-               cd->drive = (*drives[unit])[D_SLV];
-               if ((*drives[unit])[D_PRT])
-                       pcd_drive_count++;
-
-               cd->name = &cd->info.name[0];
-               snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit);
-               cd->info.ops = &pcd_dops;
-               cd->info.handle = cd;
-               cd->info.speed = 0;
-               cd->info.capacity = 1;
-               cd->info.mask = 0;
-               disk->major = major;
-               disk->first_minor = unit;
-               disk->minors = 1;
-               strcpy(disk->disk_name, cd->name);      /* umm... */
-               disk->fops = &pcd_bdops;
-               disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
-               disk->events = DISK_EVENT_MEDIA_CHANGE;
-       }
-}
-
 static int pcd_open(struct cdrom_device_info *cdi, int purpose)
 {
        struct pcd_unit *cd = cdi->handle;
@@ -630,10 +581,11 @@ static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr)
        return CDS_DISC_OK;
 }
 
-static int pcd_identify(struct pcd_unit *cd, char *id)
+static int pcd_identify(struct pcd_unit *cd)
 {
-       int k, s;
        char id_cmd[12] = { 0x12, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+       char id[18];
+       int k, s;
 
        pcd_bufblk = -1;
 
@@ -661,108 +613,47 @@ static int pcd_identify(struct pcd_unit *cd, char *id)
 }
 
 /*
- * returns  0, with id set if drive is detected
- *         -1, if drive detection failed
+ * returns 0, with id set if drive is detected, otherwise an error code.
  */
-static int pcd_probe(struct pcd_unit *cd, int ms, char *id)
+static int pcd_probe(struct pcd_unit *cd, int ms)
 {
        if (ms == -1) {
                for (cd->drive = 0; cd->drive <= 1; cd->drive++)
-                       if (!pcd_reset(cd) && !pcd_identify(cd, id))
+                       if (!pcd_reset(cd) && !pcd_identify(cd))
                                return 0;
        } else {
                cd->drive = ms;
-               if (!pcd_reset(cd) && !pcd_identify(cd, id))
+               if (!pcd_reset(cd) && !pcd_identify(cd))
                        return 0;
        }
-       return -1;
+       return -ENODEV;
 }
 
-static void pcd_probe_capabilities(void)
+static int pcd_probe_capabilities(struct pcd_unit *cd)
 {
-       int unit, r;
-       char buffer[32];
        char cmd[12] = { 0x5a, 1 << 3, 0x2a, 0, 0, 0, 0, 18, 0, 0, 0, 0 };
-       struct pcd_unit *cd;
-
-       for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               if (!cd->present)
-                       continue;
-               r = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities");
-               if (r)
-                       continue;
-               /* we should now have the cap page */
-               if ((buffer[11] & 1) == 0)
-                       cd->info.mask |= CDC_CD_R;
-               if ((buffer[11] & 2) == 0)
-                       cd->info.mask |= CDC_CD_RW;
-               if ((buffer[12] & 1) == 0)
-                       cd->info.mask |= CDC_PLAY_AUDIO;
-               if ((buffer[14] & 1) == 0)
-                       cd->info.mask |= CDC_LOCK;
-               if ((buffer[14] & 8) == 0)
-                       cd->info.mask |= CDC_OPEN_TRAY;
-               if ((buffer[14] >> 6) == 0)
-                       cd->info.mask |= CDC_CLOSE_TRAY;
-       }
-}
-
-static int pcd_detect(void)
-{
-       char id[18];
-       int k, unit;
-       struct pcd_unit *cd;
+       char buffer[32];
+       int ret;
 
-       printk("%s: %s version %s, major %d, nice %d\n",
-              name, name, PCD_VERSION, major, nice);
+       ret = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities");
+       if (ret)
+               return ret;
+
+       /* we should now have the cap page */
+       if ((buffer[11] & 1) == 0)
+               cd->info.mask |= CDC_CD_R;
+       if ((buffer[11] & 2) == 0)
+               cd->info.mask |= CDC_CD_RW;
+       if ((buffer[12] & 1) == 0)
+               cd->info.mask |= CDC_PLAY_AUDIO;
+       if ((buffer[14] & 1) == 0)
+               cd->info.mask |= CDC_LOCK;
+       if ((buffer[14] & 8) == 0)
+               cd->info.mask |= CDC_OPEN_TRAY;
+       if ((buffer[14] >> 6) == 0)
+               cd->info.mask |= CDC_CLOSE_TRAY;
 
-       par_drv = pi_register_driver(name);
-       if (!par_drv) {
-               pr_err("failed to register %s driver\n", name);
-               return -1;
-       }
-
-       k = 0;
-       if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
-               cd = pcd;
-               if (cd->disk && pi_init(cd->pi, 1, -1, -1, -1, -1, -1,
-                           pcd_buffer, PI_PCD, verbose, cd->name)) {
-                       if (!pcd_probe(cd, -1, id)) {
-                               cd->present = 1;
-                               k++;
-                       } else
-                               pi_release(cd->pi);
-               }
-       } else {
-               for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-                       int *conf = *drives[unit];
-                       if (!conf[D_PRT])
-                               continue;
-                       if (!cd->disk)
-                               continue;
-                       if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD],
-                                    conf[D_UNI], conf[D_PRO], conf[D_DLY],
-                                    pcd_buffer, PI_PCD, verbose, cd->name)) 
-                               continue;
-                       if (!pcd_probe(cd, conf[D_SLV], id)) {
-                               cd->present = 1;
-                               k++;
-                       } else
-                               pi_release(cd->pi);
-               }
-       }
-       if (k)
-               return 0;
-
-       printk("%s: No CD-ROM drive found\n", name);
-       for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               if (!cd->disk)
-                       continue;
-               blk_cleanup_disk(cd->disk);
-               blk_mq_free_tag_set(&cd->tag_set);
-       }
-       pi_unregister_driver(par_drv);
-       return -1;
+       return 0;
 }
 
 /* I/O request processing */
@@ -999,43 +890,130 @@ static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn)
        return 0;
 }
 
+static int pcd_init_unit(struct pcd_unit *cd, bool autoprobe, int port,
+               int mode, int unit, int protocol, int delay, int ms)
+{
+       struct gendisk *disk;
+       int ret;
+
+       ret = blk_mq_alloc_sq_tag_set(&cd->tag_set, &pcd_mq_ops, 1,
+                                     BLK_MQ_F_SHOULD_MERGE);
+       if (ret)
+               return ret;
+
+       disk = blk_mq_alloc_disk(&cd->tag_set, cd);
+       if (IS_ERR(disk)) {
+               ret = PTR_ERR(disk);
+               goto out_free_tag_set;
+       }
+
+       INIT_LIST_HEAD(&cd->rq_list);
+       blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
+       cd->disk = disk;
+       cd->pi = &cd->pia;
+       cd->present = 0;
+       cd->last_sense = 0;
+       cd->changed = 1;
+       cd->drive = (*drives[cd - pcd])[D_SLV];
+
+       cd->name = &cd->info.name[0];
+       snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit);
+       cd->info.ops = &pcd_dops;
+       cd->info.handle = cd;
+       cd->info.speed = 0;
+       cd->info.capacity = 1;
+       cd->info.mask = 0;
+       disk->major = major;
+       disk->first_minor = unit;
+       disk->minors = 1;
+       strcpy(disk->disk_name, cd->name);      /* umm... */
+       disk->fops = &pcd_bdops;
+       disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+       disk->events = DISK_EVENT_MEDIA_CHANGE;
+
+       if (!pi_init(cd->pi, autoprobe, port, mode, unit, protocol, delay,
+                       pcd_buffer, PI_PCD, verbose, cd->name)) {
+               ret = -ENODEV;
+               goto out_free_disk;
+       }
+       ret = pcd_probe(cd, ms);
+       if (ret)
+               goto out_pi_release;
+
+       cd->present = 1;
+       pcd_probe_capabilities(cd);
+       ret = register_cdrom(cd->disk, &cd->info);
+       if (ret)
+               goto out_pi_release;
+       ret = add_disk(cd->disk);
+       if (ret)
+               goto out_unreg_cdrom;
+       return 0;
+
+out_unreg_cdrom:
+       unregister_cdrom(&cd->info);
+out_pi_release:
+       pi_release(cd->pi);
+out_free_disk:
+       blk_cleanup_disk(cd->disk);
+out_free_tag_set:
+       blk_mq_free_tag_set(&cd->tag_set);
+       return ret;
+}
+
 static int __init pcd_init(void)
 {
-       struct pcd_unit *cd;
-       int unit;
+       int found = 0, unit;
 
        if (disable)
                return -EINVAL;
 
-       pcd_init_units();
+       if (register_blkdev(major, name))
+               return -EBUSY;
 
-       if (pcd_detect())
-               return -ENODEV;
+       pr_info("%s: %s version %s, major %d, nice %d\n",
+               name, name, PCD_VERSION, major, nice);
 
-       /* get the atapi capabilities page */
-       pcd_probe_capabilities();
+       par_drv = pi_register_driver(name);
+       if (!par_drv) {
+               pr_err("failed to register %s driver\n", name);
+               goto out_unregister_blkdev;
+       }
 
-       if (register_blkdev(major, name)) {
-               for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-                       if (!cd->disk)
-                               continue;
+       for (unit = 0; unit < PCD_UNITS; unit++) {
+               if ((*drives[unit])[D_PRT])
+                       pcd_drive_count++;
+       }
+
+       if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
+               if (!pcd_init_unit(pcd, 1, -1, -1, -1, -1, -1, -1))
+                       found++;
+       } else {
+               for (unit = 0; unit < PCD_UNITS; unit++) {
+                       struct pcd_unit *cd = &pcd[unit];
+                       int *conf = *drives[unit];
 
-                       blk_cleanup_queue(cd->disk->queue);
-                       blk_mq_free_tag_set(&cd->tag_set);
-                       put_disk(cd->disk);
+                       if (!conf[D_PRT])
+                               continue;
+                       if (!pcd_init_unit(cd, 0, conf[D_PRT], conf[D_MOD],
+                                       conf[D_UNI], conf[D_PRO], conf[D_DLY],
+                                       conf[D_SLV]))
+                               found++;
                }
-               return -EBUSY;
        }
 
-       for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               if (cd->present) {
-                       register_cdrom(cd->disk, &cd->info);
-                       cd->disk->private_data = cd;
-                       add_disk(cd->disk);
-               }
+       if (!found) {
+               pr_info("%s: No CD-ROM drive found\n", name);
+               goto out_unregister_pi_driver;
        }
 
        return 0;
+
+out_unregister_pi_driver:
+       pi_unregister_driver(par_drv);
+out_unregister_blkdev:
+       unregister_blkdev(major, name);
+       return -ENODEV;
 }
 
 static void __exit pcd_exit(void)
@@ -1044,20 +1022,18 @@ static void __exit pcd_exit(void)
        int unit;
 
        for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
-               if (!cd->disk)
+               if (!cd->present)
                        continue;
 
-               if (cd->present) {
-                       del_gendisk(cd->disk);
-                       pi_release(cd->pi);
-                       unregister_cdrom(&cd->info);
-               }
-               blk_cleanup_queue(cd->disk->queue);
+               unregister_cdrom(&cd->info);
+               del_gendisk(cd->disk);
+               pi_release(cd->pi);
+               blk_cleanup_disk(cd->disk);
+
                blk_mq_free_tag_set(&cd->tag_set);
-               put_disk(cd->disk);
        }
-       unregister_blkdev(major, name);
        pi_unregister_driver(par_drv);
+       unregister_blkdev(major, name);
 }
 
 MODULE_LICENSE("GPL");
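
The reworked pcd_probe_capabilities() above reads the ATAPI mode-sense capabilities page into a buffer and then marks every feature the page reports as absent by setting the corresponding bit in cd->info.mask (byte 11 covers CD-R/CD-RW, byte 12 audio playback, byte 14 locking and tray control). A tiny standalone model of that bit-to-flag translation follows; the CAP_* values are illustrative local flags, not the kernel's CDC_* constants, and the sample buffer contents are made up.

    #include <stdio.h>

    /* Illustrative stand-ins for the cdrom capability flags. */
    #define CAP_CD_R        (1u << 0)
    #define CAP_CD_RW       (1u << 1)
    #define CAP_PLAY_AUDIO  (1u << 2)
    #define CAP_LOCK        (1u << 3)
    #define CAP_OPEN_TRAY   (1u << 4)
    #define CAP_CLOSE_TRAY  (1u << 5)

    /*
     * Build a "not supported" mask from a mode-sense capabilities page,
     * using the same byte and bit positions the driver checks.
     */
    static unsigned int unsupported_mask(const unsigned char *buf)
    {
        unsigned int mask = 0;

        if ((buf[11] & 1) == 0)
            mask |= CAP_CD_R;
        if ((buf[11] & 2) == 0)
            mask |= CAP_CD_RW;
        if ((buf[12] & 1) == 0)
            mask |= CAP_PLAY_AUDIO;
        if ((buf[14] & 1) == 0)
            mask |= CAP_LOCK;
        if ((buf[14] & 8) == 0)
            mask |= CAP_OPEN_TRAY;
        if ((buf[14] >> 6) == 0)
            mask |= CAP_CLOSE_TRAY;
        return mask;
    }

    int main(void)
    {
        /* Fake page: reads CD-R/CD-RW, plays audio, can lock, has no tray motor. */
        unsigned char page[18] = { 0 };

        page[11] = 0x03;
        page[12] = 0x01;
        page[14] = 0x01;
        printf("unsupported feature mask: 0x%x\n", unsupported_mask(page));
        return 0;
    }

The refactor keeps this logic per unit so pcd_init_unit() can run it right after a successful probe instead of looping over all units afterwards.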
index 675327d..fba8650 100644 (file)
@@ -775,14 +775,14 @@ static int pd_special_command(struct pd_unit *disk,
        struct request *rq;
        struct pd_req *req;
 
-       rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
+       rq = blk_mq_alloc_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        req = blk_mq_rq_to_pdu(rq);
 
        req->func = func;
        blk_execute_rq(disk->gd, rq, 0);
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return 0;
 }
 
@@ -875,9 +875,27 @@ static const struct blk_mq_ops pd_mq_ops = {
        .queue_rq       = pd_queue_rq,
 };
 
-static void pd_probe_drive(struct pd_unit *disk)
+static int pd_probe_drive(struct pd_unit *disk, int autoprobe, int port,
+               int mode, int unit, int protocol, int delay)
 {
+       int index = disk - pd;
+       int *parm = *drives[index];
        struct gendisk *p;
+       int ret;
+
+       disk->pi = &disk->pia;
+       disk->access = 0;
+       disk->changed = 1;
+       disk->capacity = 0;
+       disk->drive = parm[D_SLV];
+       snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a' + index);
+       disk->alt_geom = parm[D_GEO];
+       disk->standby = parm[D_SBY];
+       INIT_LIST_HEAD(&disk->rq_list);
+
+       if (!pi_init(disk->pi, autoprobe, port, mode, unit, protocol, delay,
+                       pd_scratch, PI_PD, verbose, disk->name))
+               return -ENXIO;
 
        memset(&disk->tag_set, 0, sizeof(disk->tag_set));
        disk->tag_set.ops = &pd_mq_ops;
@@ -887,14 +905,14 @@ static void pd_probe_drive(struct pd_unit *disk)
        disk->tag_set.queue_depth = 2;
        disk->tag_set.numa_node = NUMA_NO_NODE;
        disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
-
-       if (blk_mq_alloc_tag_set(&disk->tag_set))
-               return;
+       ret = blk_mq_alloc_tag_set(&disk->tag_set);
+       if (ret)
+               goto pi_release;
 
        p = blk_mq_alloc_disk(&disk->tag_set, disk);
        if (IS_ERR(p)) {
-               blk_mq_free_tag_set(&disk->tag_set);
-               return;
+               ret = PTR_ERR(p);
+               goto free_tag_set;
        }
        disk->gd = p;
 
@@ -905,102 +923,88 @@ static void pd_probe_drive(struct pd_unit *disk)
        p->minors = 1 << PD_BITS;
        p->events = DISK_EVENT_MEDIA_CHANGE;
        p->private_data = disk;
-
        blk_queue_max_hw_sectors(p->queue, cluster);
        blk_queue_bounce_limit(p->queue, BLK_BOUNCE_HIGH);
 
        if (disk->drive == -1) {
-               for (disk->drive = 0; disk->drive <= 1; disk->drive++)
-                       if (pd_special_command(disk, pd_identify) == 0)
-                               return;
-       } else if (pd_special_command(disk, pd_identify) == 0)
-               return;
-       disk->gd = NULL;
+               for (disk->drive = 0; disk->drive <= 1; disk->drive++) {
+                       ret = pd_special_command(disk, pd_identify);
+                       if (ret == 0)
+                               break;
+               }
+       } else {
+               ret = pd_special_command(disk, pd_identify);
+       }
+       if (ret)
+               goto put_disk;
+       set_capacity(disk->gd, disk->capacity);
+       ret = add_disk(disk->gd);
+       if (ret)
+               goto cleanup_disk;
+       return 0;
+cleanup_disk:
+       blk_cleanup_disk(disk->gd);
+put_disk:
        put_disk(p);
+       disk->gd = NULL;
+free_tag_set:
+       blk_mq_free_tag_set(&disk->tag_set);
+pi_release:
+       pi_release(disk->pi);
+       return ret;
 }
 
-static int pd_detect(void)
+static int __init pd_init(void)
 {
        int found = 0, unit, pd_drive_count = 0;
        struct pd_unit *disk;
 
-       for (unit = 0; unit < PD_UNITS; unit++) {
-               int *parm = *drives[unit];
-               struct pd_unit *disk = pd + unit;
-               disk->pi = &disk->pia;
-               disk->access = 0;
-               disk->changed = 1;
-               disk->capacity = 0;
-               disk->drive = parm[D_SLV];
-               snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a'+unit);
-               disk->alt_geom = parm[D_GEO];
-               disk->standby = parm[D_SBY];
-               if (parm[D_PRT])
-                       pd_drive_count++;
-               INIT_LIST_HEAD(&disk->rq_list);
-       }
+       if (disable)
+               return -ENODEV;
+
+       if (register_blkdev(major, name))
+               return -ENODEV;
+
+       printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
+              name, name, PD_VERSION, major, cluster, nice);
 
        par_drv = pi_register_driver(name);
        if (!par_drv) {
                pr_err("failed to register %s driver\n", name);
-               return -1;
+               goto out_unregister_blkdev;
        }
 
-       if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
-               disk = pd;
-               if (pi_init(disk->pi, 1, -1, -1, -1, -1, -1, pd_scratch,
-                           PI_PD, verbose, disk->name)) {
-                       pd_probe_drive(disk);
-                       if (!disk->gd)
-                               pi_release(disk->pi);
-               }
+       for (unit = 0; unit < PD_UNITS; unit++) {
+               int *parm = *drives[unit];
 
+               if (parm[D_PRT])
+                       pd_drive_count++;
+       }
+
+       if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
+               if (!pd_probe_drive(pd, 1, -1, -1, -1, -1, -1))
+                       found++;
        } else {
                for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
                        int *parm = *drives[unit];
                        if (!parm[D_PRT])
                                continue;
-                       if (pi_init(disk->pi, 0, parm[D_PRT], parm[D_MOD],
-                                    parm[D_UNI], parm[D_PRO], parm[D_DLY],
-                                    pd_scratch, PI_PD, verbose, disk->name)) {
-                               pd_probe_drive(disk);
-                               if (!disk->gd)
-                                       pi_release(disk->pi);
-                       }
-               }
-       }
-       for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
-               if (disk->gd) {
-                       set_capacity(disk->gd, disk->capacity);
-                       add_disk(disk->gd);
-                       found = 1;
+                       if (!pd_probe_drive(disk, 0, parm[D_PRT], parm[D_MOD],
+                                       parm[D_UNI], parm[D_PRO], parm[D_DLY]))
+                               found++;
                }
        }
        if (!found) {
                printk("%s: no valid drive found\n", name);
-               pi_unregister_driver(par_drv);
+               goto out_pi_unregister_driver;
        }
-       return found;
-}
-
-static int __init pd_init(void)
-{
-       if (disable)
-               goto out1;
-
-       if (register_blkdev(major, name))
-               goto out1;
-
-       printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
-              name, name, PD_VERSION, major, cluster, nice);
-       if (!pd_detect())
-               goto out2;
 
        return 0;
 
-out2:
+out_pi_unregister_driver:
+       pi_unregister_driver(par_drv);
+out_unregister_blkdev:
        unregister_blkdev(major, name);
-out1:
        return -ENODEV;
 }
 
index d5b9c88..bf8d0ef 100644 (file)
@@ -214,7 +214,6 @@ static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static void pf_release(struct gendisk *disk, fmode_t mode);
 
-static int pf_detect(void);
 static void do_pf_read(void);
 static void do_pf_read_start(void);
 static void do_pf_write(void);
@@ -285,45 +284,6 @@ static const struct blk_mq_ops pf_mq_ops = {
        .queue_rq       = pf_queue_rq,
 };
 
-static void __init pf_init_units(void)
-{
-       struct pf_unit *pf;
-       int unit;
-
-       pf_drive_count = 0;
-       for (unit = 0, pf = units; unit < PF_UNITS; unit++, pf++) {
-               struct gendisk *disk;
-
-               if (blk_mq_alloc_sq_tag_set(&pf->tag_set, &pf_mq_ops, 1,
-                               BLK_MQ_F_SHOULD_MERGE))
-                       continue;
-
-               disk = blk_mq_alloc_disk(&pf->tag_set, pf);
-               if (IS_ERR(disk)) {
-                       blk_mq_free_tag_set(&pf->tag_set);
-                       continue;
-               }
-
-               INIT_LIST_HEAD(&pf->rq_list);
-               blk_queue_max_segments(disk->queue, cluster);
-               blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
-               pf->disk = disk;
-               pf->pi = &pf->pia;
-               pf->media_status = PF_NM;
-               pf->drive = (*drives[unit])[D_SLV];
-               pf->lun = (*drives[unit])[D_LUN];
-               snprintf(pf->name, PF_NAMELEN, "%s%d", name, unit);
-               disk->major = major;
-               disk->first_minor = unit;
-               disk->minors = 1;
-               strcpy(disk->disk_name, pf->name);
-               disk->fops = &pf_fops;
-               disk->events = DISK_EVENT_MEDIA_CHANGE;
-               if (!(*drives[unit])[D_PRT])
-                       pf_drive_count++;
-       }
-}
-
 static int pf_open(struct block_device *bdev, fmode_t mode)
 {
        struct pf_unit *pf = bdev->bd_disk->private_data;
@@ -691,9 +651,9 @@ static int pf_identify(struct pf_unit *pf)
        return 0;
 }
 
-/*     returns  0, with id set if drive is detected
-               -1, if drive detection failed
-*/
+/*
+ * returns 0, with id set if drive is detected, otherwise an error code.
+ */
 static int pf_probe(struct pf_unit *pf)
 {
        if (pf->drive == -1) {
@@ -715,60 +675,7 @@ static int pf_probe(struct pf_unit *pf)
                        if (!pf_identify(pf))
                                return 0;
        }
-       return -1;
-}
-
-static int pf_detect(void)
-{
-       struct pf_unit *pf = units;
-       int k, unit;
-
-       printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
-              name, name, PF_VERSION, major, cluster, nice);
-
-       par_drv = pi_register_driver(name);
-       if (!par_drv) {
-               pr_err("failed to register %s driver\n", name);
-               return -1;
-       }
-       k = 0;
-       if (pf_drive_count == 0) {
-               if (pi_init(pf->pi, 1, -1, -1, -1, -1, -1, pf_scratch, PI_PF,
-                           verbose, pf->name)) {
-                       if (!pf_probe(pf) && pf->disk) {
-                               pf->present = 1;
-                               k++;
-                       } else
-                               pi_release(pf->pi);
-               }
-
-       } else
-               for (unit = 0; unit < PF_UNITS; unit++, pf++) {
-                       int *conf = *drives[unit];
-                       if (!conf[D_PRT])
-                               continue;
-                       if (pi_init(pf->pi, 0, conf[D_PRT], conf[D_MOD],
-                                   conf[D_UNI], conf[D_PRO], conf[D_DLY],
-                                   pf_scratch, PI_PF, verbose, pf->name)) {
-                               if (pf->disk && !pf_probe(pf)) {
-                                       pf->present = 1;
-                                       k++;
-                               } else
-                                       pi_release(pf->pi);
-                       }
-               }
-       if (k)
-               return 0;
-
-       printk("%s: No ATAPI disk detected\n", name);
-       for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-               if (!pf->disk)
-                       continue;
-               blk_cleanup_disk(pf->disk);
-               blk_mq_free_tag_set(&pf->tag_set);
-       }
-       pi_unregister_driver(par_drv);
-       return -1;
+       return -ENODEV;
 }
 
 /* The i/o request engine */
@@ -1014,61 +921,134 @@ static void do_pf_write_done(void)
        next_request(0);
 }
 
+static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port,
+               int mode, int unit, int protocol, int delay, int ms)
+{
+       struct gendisk *disk;
+       int ret;
+
+       ret = blk_mq_alloc_sq_tag_set(&pf->tag_set, &pf_mq_ops, 1,
+                                     BLK_MQ_F_SHOULD_MERGE);
+       if (ret)
+               return ret;
+
+       disk = blk_mq_alloc_disk(&pf->tag_set, pf);
+       if (IS_ERR(disk)) {
+               ret = PTR_ERR(disk);
+               goto out_free_tag_set;
+       }
+       disk->major = major;
+       disk->first_minor = pf - units;
+       disk->minors = 1;
+       strcpy(disk->disk_name, pf->name);
+       disk->fops = &pf_fops;
+       disk->events = DISK_EVENT_MEDIA_CHANGE;
+       disk->private_data = pf;
+
+       blk_queue_max_segments(disk->queue, cluster);
+       blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
+
+       INIT_LIST_HEAD(&pf->rq_list);
+       pf->disk = disk;
+       pf->pi = &pf->pia;
+       pf->media_status = PF_NM;
+       pf->drive = (*drives[disk->first_minor])[D_SLV];
+       pf->lun = (*drives[disk->first_minor])[D_LUN];
+       snprintf(pf->name, PF_NAMELEN, "%s%d", name, disk->first_minor);
+
+       if (!pi_init(pf->pi, autoprobe, port, mode, unit, protocol, delay,
+                       pf_scratch, PI_PF, verbose, pf->name)) {
+               ret = -ENODEV;
+               goto out_free_disk;
+       }
+       ret = pf_probe(pf);
+       if (ret)
+               goto out_pi_release;
+
+       ret = add_disk(disk);
+       if (ret)
+               goto out_pi_release;
+       pf->present = 1;
+       return 0;
+
+out_pi_release:
+       pi_release(pf->pi);
+out_free_disk:
+       blk_cleanup_disk(pf->disk);
+out_free_tag_set:
+       blk_mq_free_tag_set(&pf->tag_set);
+       return ret;
+}
+
 static int __init pf_init(void)
 {                              /* preliminary initialisation */
        struct pf_unit *pf;
-       int unit;
+       int found = 0, unit;
 
        if (disable)
                return -EINVAL;
 
-       pf_init_units();
+       if (register_blkdev(major, name))
+               return -EBUSY;
 
-       if (pf_detect())
-               return -ENODEV;
-       pf_busy = 0;
+       printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
+              name, name, PF_VERSION, major, cluster, nice);
 
-       if (register_blkdev(major, name)) {
-               for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-                       if (!pf->disk)
-                               continue;
-                       blk_cleanup_queue(pf->disk->queue);
-                       blk_mq_free_tag_set(&pf->tag_set);
-                       put_disk(pf->disk);
-               }
-               return -EBUSY;
+       par_drv = pi_register_driver(name);
+       if (!par_drv) {
+               pr_err("failed to register %s driver\n", name);
+               goto out_unregister_blkdev;
        }
 
-       for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-               struct gendisk *disk = pf->disk;
+       for (unit = 0; unit < PF_UNITS; unit++) {
+               if (!(*drives[unit])[D_PRT])
+                       pf_drive_count++;
+       }
 
-               if (!pf->present)
-                       continue;
-               disk->private_data = pf;
-               add_disk(disk);
+       pf = units;
+       if (pf_drive_count == 0) {
+               if (pf_init_unit(pf, 1, -1, -1, -1, -1, -1, verbose))
+                       found++;
+       } else {
+               for (unit = 0; unit < PF_UNITS; unit++, pf++) {
+                       int *conf = *drives[unit];
+                       if (!conf[D_PRT])
+                               continue;
+                       if (pf_init_unit(pf, 0, conf[D_PRT], conf[D_MOD],
+                                   conf[D_UNI], conf[D_PRO], conf[D_DLY],
+                                   verbose))
+                               found++;
+               }
+       }
+       if (!found) {
+               printk("%s: No ATAPI disk detected\n", name);
+               goto out_unregister_pi_driver;
        }
+       pf_busy = 0;
        return 0;
+
+out_unregister_pi_driver:
+       pi_unregister_driver(par_drv);
+out_unregister_blkdev:
+       unregister_blkdev(major, name);
+       return -ENODEV;
 }
 
 static void __exit pf_exit(void)
 {
        struct pf_unit *pf;
        int unit;
-       unregister_blkdev(major, name);
+
        for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-               if (!pf->disk)
+               if (!pf->present)
                        continue;
-
-               if (pf->present)
-                       del_gendisk(pf->disk);
-
-               blk_cleanup_queue(pf->disk->queue);
+               del_gendisk(pf->disk);
+               blk_cleanup_disk(pf->disk);
                blk_mq_free_tag_set(&pf->tag_set);
-               put_disk(pf->disk);
-
-               if (pf->present)
-                       pi_release(pf->pi);
+               pi_release(pf->pi);
        }
+
+       unregister_blkdev(major, name);
 }
 
 MODULE_LICENSE("GPL");
index 0f26b25..b53f648 100644 (file)
@@ -703,7 +703,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
        struct request *rq;
        int ret = 0;
 
-       rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
+       rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
                             REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
@@ -726,7 +726,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
        if (scsi_req(rq)->result)
                ret = -EIO;
 out:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return ret;
 }
 
@@ -2400,7 +2400,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
        }
 }
 
-static blk_qc_t pkt_submit_bio(struct bio *bio)
+static void pkt_submit_bio(struct bio *bio)
 {
        struct pktcdvd_device *pd;
        char b[BDEVNAME_SIZE];
@@ -2423,7 +2423,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
         */
        if (bio_data_dir(bio) == READ) {
                pkt_make_request_read(pd, bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
@@ -2455,10 +2455,9 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
                pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
        } while (split != bio);
 
-       return BLK_QC_T_NONE;
+       return;
 end_io:
        bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static void pkt_init_queue(struct pktcdvd_device *pd)
@@ -2537,6 +2536,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
        int i;
        char b[BDEVNAME_SIZE];
        struct block_device *bdev;
+       struct scsi_device *sdev;
 
        if (pd->pkt_dev == dev) {
                pkt_err(pd, "recursive setup not allowed\n");
@@ -2560,10 +2560,12 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
        bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
-       if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
+       sdev = scsi_device_from_queue(bdev->bd_disk->queue);
+       if (!sdev) {
                blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
                return -EINVAL;
        }
+       put_device(&sdev->sdev_gendev);
 
        /* This is safe, since we have a reference from open(). */
        __module_get(THIS_MODULE);
@@ -2729,7 +2731,9 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
        /* inherit events of the host device */
        disk->events = pd->bdev->bd_disk->events;
 
-       add_disk(disk);
+       ret = add_disk(disk);
+       if (ret)
+               goto out_mem2;
 
        pkt_sysfs_dev_new(pd);
        pkt_debugfs_dev_new(pd);
index c7b19e1..d1ebf19 100644 (file)
@@ -578,7 +578,7 @@ out:
        return next;
 }
 
-static blk_qc_t ps3vram_submit_bio(struct bio *bio)
+static void ps3vram_submit_bio(struct bio *bio)
 {
        struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data;
        struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
@@ -594,13 +594,11 @@ static blk_qc_t ps3vram_submit_bio(struct bio *bio)
        spin_unlock_irq(&priv->lock);
 
        if (busy)
-               return BLK_QC_T_NONE;
+               return;
 
        do {
                bio = ps3vram_do_bio(dev, bio);
        } while (bio);
-
-       return BLK_QC_T_NONE;
 }
 
 static const struct block_device_operations ps3vram_fops = {
index e65c9d7..953fa13 100644 (file)
@@ -836,7 +836,7 @@ struct rbd_options {
        u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
 };
 
-#define RBD_QUEUE_DEPTH_DEFAULT        BLKDEV_MAX_RQ
+#define RBD_QUEUE_DEPTH_DEFAULT        BLKDEV_DEFAULT_RQ
 #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024)
 #define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
 #define RBD_READ_ONLY_DEFAULT  false
@@ -7054,7 +7054,9 @@ static ssize_t do_rbd_add(struct bus_type *bus,
        if (rc)
                goto err_out_image_lock;
 
-       device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
+       rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
+       if (rc)
+               goto err_out_cleanup_disk;
 
        spin_lock(&rbd_dev_list_lock);
        list_add_tail(&rbd_dev->node, &rbd_dev_list);
@@ -7068,6 +7070,8 @@ out:
        module_put(THIS_MODULE);
        return rc;
 
+err_out_cleanup_disk:
+       rbd_free_disk(rbd_dev);
 err_out_image_lock:
        rbd_dev_image_unlock(rbd_dev);
        rbd_dev_device_release(rbd_dev);
index bd4a41a..2df0657 100644 (file)
@@ -1176,7 +1176,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        return ret;
 }
 
-static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct rnbd_queue *q = hctx->driver_data;
        struct rnbd_clt_dev *dev = q->dev;
@@ -1384,8 +1384,10 @@ static void setup_request_queue(struct rnbd_clt_dev *dev)
        blk_queue_write_cache(dev->queue, dev->wc, dev->fua);
 }
 
-static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
+static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
 {
+       int err;
+
        dev->gd->major          = rnbd_client_major;
        dev->gd->first_minor    = idx << RNBD_PART_BITS;
        dev->gd->minors         = 1 << RNBD_PART_BITS;
@@ -1410,7 +1412,11 @@ static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
 
        if (!dev->rotational)
                blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
-       add_disk(dev->gd);
+       err = add_disk(dev->gd);
+       if (err)
+               blk_cleanup_disk(dev->gd);
+
+       return err;
 }
 
 static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
@@ -1426,8 +1432,7 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
        rnbd_init_mq_hw_queues(dev);
 
        setup_request_queue(dev);
-       rnbd_clt_setup_gen_disk(dev, idx);
-       return 0;
+       return rnbd_clt_setup_gen_disk(dev, idx);
 }
 
 static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
index c1bc5c0..de5d5a8 100644 (file)
@@ -10,7 +10,7 @@
 #define RNBD_PROTO_H
 
 #include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/limits.h>
 #include <linux/inet.h>
 #include <linux/in.h>
index 8363671..8d9d69f 100644 (file)
@@ -935,7 +935,9 @@ static int rsxx_pci_probe(struct pci_dev *dev,
                        card->size8 = 0;
        }
 
-       rsxx_attach_dev(card);
+       st = rsxx_attach_dev(card);
+       if (st)
+               goto failed_create_dev;
 
        /************* Setup Debugfs *************/
        rsxx_debugfs_dev_new(card);
index 1cc40b0..dd33f1b 100644 (file)
@@ -50,7 +50,7 @@ struct rsxx_bio_meta {
 
 static struct kmem_cache *bio_meta_pool;
 
-static blk_qc_t rsxx_submit_bio(struct bio *bio);
+static void rsxx_submit_bio(struct bio *bio);
 
 /*----------------- Block Device Operations -----------------*/
 static int rsxx_blkdev_ioctl(struct block_device *bdev,
@@ -120,7 +120,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
        }
 }
 
-static blk_qc_t rsxx_submit_bio(struct bio *bio)
+static void rsxx_submit_bio(struct bio *bio)
 {
        struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data;
        struct rsxx_bio_meta *bio_meta;
@@ -169,7 +169,7 @@ static blk_qc_t rsxx_submit_bio(struct bio *bio)
        if (st)
                goto queue_err;
 
-       return BLK_QC_T_NONE;
+       return;
 
 queue_err:
        kmem_cache_free(bio_meta_pool, bio_meta);
@@ -177,7 +177,6 @@ req_err:
        if (st)
                bio->bi_status = st;
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 /*----------------- Device Setup -------------------*/
@@ -192,6 +191,8 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
 
 int rsxx_attach_dev(struct rsxx_cardinfo *card)
 {
+       int err = 0;
+
        mutex_lock(&card->dev_lock);
 
        /* The block device requires the stripe size from the config. */
@@ -200,13 +201,17 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card)
                        set_capacity(card->gendisk, card->size8 >> 9);
                else
                        set_capacity(card->gendisk, 0);
-               device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL);
-               card->bdev_attached = 1;
+               err = device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL);
+               if (err == 0)
+                       card->bdev_attached = 1;
        }
 
        mutex_unlock(&card->dev_lock);
 
-       return 0;
+       if (err)
+               blk_cleanup_disk(card->gendisk);
+
+       return err;
 }
 
 void rsxx_detach_dev(struct rsxx_cardinfo *card)
index 7ccc8d2..821594c 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/fd.h>
 #include <linux/slab.h>
 #include <linux/blk-mq.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/hdreg.h>
 #include <linux/kernel.h>
@@ -184,6 +185,7 @@ struct floppy_state {
 
        int             track;
        int             ref_count;
+       bool registered;
 
        struct gendisk *disk;
        struct blk_mq_tag_set tag_set;
@@ -771,6 +773,20 @@ static const struct blk_mq_ops swim_mq_ops = {
        .queue_rq = swim_queue_rq,
 };
 
+static void swim_cleanup_floppy_disk(struct floppy_state *fs)
+{
+       struct gendisk *disk = fs->disk;
+
+       if (!disk)
+               return;
+
+       if (fs->registered)
+               del_gendisk(fs->disk);
+
+       blk_cleanup_disk(disk);
+       blk_mq_free_tag_set(&fs->tag_set);
+}
+
 static int swim_floppy_init(struct swim_priv *swd)
 {
        int err;
@@ -827,7 +843,10 @@ static int swim_floppy_init(struct swim_priv *swd)
                swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE;
                swd->unit[drive].disk->private_data = &swd->unit[drive];
                set_capacity(swd->unit[drive].disk, 2880);
-               add_disk(swd->unit[drive].disk);
+               err = add_disk(swd->unit[drive].disk);
+               if (err)
+                       goto exit_put_disks;
+               swd->unit[drive].registered = true;
        }
 
        return 0;
@@ -835,12 +854,7 @@ static int swim_floppy_init(struct swim_priv *swd)
 exit_put_disks:
        unregister_blkdev(FLOPPY_MAJOR, "fd");
        do {
-               struct gendisk *disk = swd->unit[drive].disk;
-
-               if (!disk)
-                       continue;
-               blk_cleanup_disk(disk);
-               blk_mq_free_tag_set(&swd->unit[drive].tag_set);
+               swim_cleanup_floppy_disk(&swd->unit[drive]);
        } while (drive--);
        return err;
 }
@@ -909,12 +923,8 @@ static int swim_remove(struct platform_device *dev)
        int drive;
        struct resource *res;
 
-       for (drive = 0; drive < swd->floppy_count; drive++) {
-               del_gendisk(swd->unit[drive].disk);
-               blk_cleanup_queue(swd->unit[drive].disk->queue);
-               blk_mq_free_tag_set(&swd->unit[drive].tag_set);
-               put_disk(swd->unit[drive].disk);
-       }
+       for (drive = 0; drive < swd->floppy_count; drive++)
+               swim_cleanup_floppy_disk(&swd->unit[drive]);
 
        unregister_blkdev(FLOPPY_MAJOR, "fd");
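
The swim hunks above consolidate teardown into swim_cleanup_floppy_disk() and add a per-drive 'registered' flag so del_gendisk() is only called for drives whose add_disk() actually succeeded; the same helper then serves both the partial-failure path in swim_floppy_init() and swim_remove(). A small standalone model of that "only undo what was done" bookkeeping, with illustrative types and printouts in place of the real block-layer calls:

    #include <stdbool.h>
    #include <stdio.h>

    struct floppy {
        bool allocated;     /* disk and tag set were allocated */
        bool registered;    /* add_disk() succeeded */
    };

    static void cleanup_floppy(struct floppy *fs)
    {
        if (!fs->allocated)
            return;
        if (fs->registered)
            puts("del_gendisk");
        puts("free disk and tag set");
        fs->allocated = fs->registered = false;
    }

    int main(void)
    {
        /* Drive 0 fully registered; drive 1 failed right after allocation. */
        struct floppy drives[2] = { { true, true }, { true, false } };
        int drive = 1;      /* index where initialization stopped */

        do {
            cleanup_floppy(&drives[drive]);
        } while (drive--);
        return 0;
    }

The do/while over a decreasing index mirrors the exit_put_disks loop in the hunk above.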
 
index 965af0a..4b91c9a 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
+#include <linux/major.h>
 #include <asm/io.h>
 #include <asm/dbdma.h>
 #include <asm/prom.h>
@@ -1229,7 +1230,9 @@ static int swim3_attach(struct macio_dev *mdev,
        disk->flags |= GENHD_FL_REMOVABLE;
        sprintf(disk->disk_name, "fd%d", floppy_count);
        set_capacity(disk, 2880);
-       add_disk(disk);
+       rc = add_disk(disk);
+       if (rc)
+               goto out_cleanup_disk;
 
        disks[floppy_count++] = disk;
        return 0;
index 420cd95..d1676fe 100644 (file)
@@ -297,6 +297,7 @@ struct carm_host {
 
        struct work_struct              fsm_task;
 
+       int probe_err;
        struct completion               probe_comp;
 };
 
@@ -1181,8 +1182,11 @@ static void carm_fsm_task (struct work_struct *work)
                                struct gendisk *disk = port->disk;
 
                                set_capacity(disk, port->capacity);
-                               add_disk(disk);
-                               activated++;
+                               host->probe_err = add_disk(disk);
+                               if (!host->probe_err)
+                                       activated++;
+                               else
+                                       break;
                        }
 
                printk(KERN_INFO DRV_NAME "(%s): %d ports activated\n",
@@ -1192,11 +1196,9 @@ static void carm_fsm_task (struct work_struct *work)
                reschedule = 1;
                break;
        }
-
        case HST_PROBE_FINISHED:
                complete(&host->probe_comp);
                break;
-
        case HST_ERROR:
                /* FIXME: TODO */
                break;
@@ -1507,7 +1509,12 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
                goto err_out_free_irq;
 
        DPRINTK("waiting for probe_comp\n");
+       host->probe_err = -ENODEV;
        wait_for_completion(&host->probe_comp);
+       if (host->probe_err) {
+               rc = host->probe_err;
+               goto err_out_free_irq;
+       }
 
        printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major %d\n",
               host->name, pci_name(pdev), (int) CARM_MAX_PORTS,
index 303caf2..fc4fc95 100644 (file)
@@ -312,7 +312,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
        struct request *req;
        int err;
 
-       req = blk_get_request(q, REQ_OP_DRV_IN, 0);
+       req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
                return PTR_ERR(req);
 
@@ -323,7 +323,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
        blk_execute_rq(vblk->disk, req, false);
        err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
 out:
-       blk_put_request(req);
+       blk_mq_free_request(req);
        return err;
 }
 
@@ -815,9 +815,17 @@ static int virtblk_probe(struct virtio_device *vdev)
        err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
                                   struct virtio_blk_config, blk_size,
                                   &blk_size);
-       if (!err)
+       if (!err) {
+               err = blk_validate_block_size(blk_size);
+               if (err) {
+                       dev_err(&vdev->dev,
+                               "virtio_blk: invalid block size: 0x%x\n",
+                               blk_size);
+                       goto out_cleanup_disk;
+               }
+
                blk_queue_logical_block_size(q, blk_size);
-       else
+       } else
                blk_size = queue_logical_block_size(q);
 
        /* Use topology information if available */
index 33eba3d..914587a 100644 (file)
@@ -98,7 +98,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
                return;
        }
 
-       err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+       err = sync_blockdev(blkif->vbd.bdev);
        if (err) {
                xenbus_dev_error(blkif->be->dev, err, "block flush");
                return;
index 7290210..8e3983e 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/cdrom.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
 #include <linux/bitmap.h>
@@ -2385,7 +2386,13 @@ static void blkfront_connect(struct blkfront_info *info)
        for_each_rinfo(info, rinfo, i)
                kick_pending_request_queues(rinfo);
 
-       device_add_disk(&info->xbdev->dev, info->gd, NULL);
+       err = device_add_disk(&info->xbdev->dev, info->gd, NULL);
+       if (err) {
+               blk_cleanup_disk(info->gd);
+               blk_mq_free_tag_set(&info->tag_set);
+               info->rq = NULL;
+               goto fail;
+       }
 
        info->is_ready = 1;
        return;
index fcaf275..a68297f 100644 (file)
@@ -1598,22 +1598,18 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 /*
  * Handler function for all zram I/O requests.
  */
-static blk_qc_t zram_submit_bio(struct bio *bio)
+static void zram_submit_bio(struct bio *bio)
 {
        struct zram *zram = bio->bi_bdev->bd_disk->private_data;
 
        if (!valid_io_request(zram, bio->bi_iter.bi_sector,
                                        bio->bi_iter.bi_size)) {
                atomic64_inc(&zram->stats.invalid_io);
-               goto error;
+               bio_io_error(bio);
+               return;
        }
 
        __zram_make_request(zram, bio);
-       return BLK_QC_T_NONE;
-
-error:
-       bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static void zram_slot_free_notify(struct block_device *bdev,
index bd2e5b1..9877e41 100644 (file)
@@ -344,6 +344,12 @@ static void cdrom_sysctl_register(void);
 
 static LIST_HEAD(cdrom_list);
 
+static void signal_media_change(struct cdrom_device_info *cdi)
+{
+       cdi->mc_flags = 0x3; /* set media changed bits, on both queues */
+       cdi->last_media_change_ms = ktime_to_ms(ktime_get());
+}
+
 int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
                               struct packet_command *cgc)
 {
@@ -616,6 +622,7 @@ int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi)
        ENSURE(cdo, generic_packet, CDC_GENERIC_PACKET);
        cdi->mc_flags = 0;
        cdi->options = CDO_USE_FFLAGS;
+       cdi->last_media_change_ms = ktime_to_ms(ktime_get());
 
        if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY))
                cdi->options |= (int) CDO_AUTO_CLOSE;
@@ -864,7 +871,7 @@ static void cdrom_mmc3_profile(struct cdrom_device_info *cdi)
 {
        struct packet_command cgc;
        char buffer[32];
-       int ret, mmc3_profile;
+       int mmc3_profile;
 
        init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ);
 
@@ -874,7 +881,7 @@ static void cdrom_mmc3_profile(struct cdrom_device_info *cdi)
        cgc.cmd[8] = sizeof(buffer);            /* Allocation Length */
        cgc.quiet = 1;
 
-       if ((ret = cdi->ops->generic_packet(cdi, &cgc)))
+       if (cdi->ops->generic_packet(cdi, &cgc))
                mmc3_profile = 0xffff;
        else
                mmc3_profile = (buffer[6] << 8) | buffer[7];
@@ -1421,8 +1428,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
                cdi->ops->check_events(cdi, 0, slot);
 
        if (slot == CDSL_NONE) {
-               /* set media changed bits, on both queues */
-               cdi->mc_flags = 0x3;
+               signal_media_change(cdi);
                return cdrom_load_unload(cdi, -1);
        }
 
@@ -1455,7 +1461,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
                slot = curslot;
 
        /* set media changed bits on both queues */
-       cdi->mc_flags = 0x3;
+       signal_media_change(cdi);
        if ((ret = cdrom_load_unload(cdi, slot)))
                return ret;
 
@@ -1521,7 +1527,7 @@ int media_changed(struct cdrom_device_info *cdi, int queue)
        cdi->ioctl_events = 0;
 
        if (changed) {
-               cdi->mc_flags = 0x3;    /* set bit on both queues */
+               signal_media_change(cdi);
                ret |= 1;
                cdi->media_written = 0;
        }
@@ -2336,6 +2342,49 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi,
        return ret;
 }
 
+/*
+ * Media change detection with timing information.
+ *
+ * arg is a pointer to a cdrom_timed_media_change_info struct.
+ * arg->last_media_change may be set by calling code to signal
+ * the timestamp (in ms) of the last known media change (by the caller).
+ * Upon successful return, ioctl call will set arg->last_media_change
+ * to the latest media change timestamp known by the kernel/driver
+ * and set arg->has_changed to 1 if that timestamp is more recent
+ * than the timestamp set by the caller.
+ */
+static int cdrom_ioctl_timed_media_change(struct cdrom_device_info *cdi,
+               unsigned long arg)
+{
+       int ret;
+       struct cdrom_timed_media_change_info __user *info;
+       struct cdrom_timed_media_change_info tmp_info;
+
+       if (!CDROM_CAN(CDC_MEDIA_CHANGED))
+               return -ENOSYS;
+
+       info = (struct cdrom_timed_media_change_info __user *)arg;
+       cd_dbg(CD_DO_IOCTL, "entering CDROM_TIMED_MEDIA_CHANGE\n");
+
+       ret = cdrom_ioctl_media_changed(cdi, CDSL_CURRENT);
+       if (ret < 0)
+               return ret;
+
+       if (copy_from_user(&tmp_info, info, sizeof(tmp_info)) != 0)
+               return -EFAULT;
+
+       tmp_info.media_flags = 0;
+       if (tmp_info.last_media_change - cdi->last_media_change_ms < 0)
+               tmp_info.media_flags |= MEDIA_CHANGED_FLAG;
+
+       tmp_info.last_media_change = cdi->last_media_change_ms;
+
+       if (copy_to_user(info, &tmp_info, sizeof(*info)) != 0)
+               return -EFAULT;
+
+       return 0;
+}
+
 static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi,
                unsigned long arg)
 {
@@ -3313,6 +3362,8 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
                return cdrom_ioctl_eject_sw(cdi, arg);
        case CDROM_MEDIA_CHANGED:
                return cdrom_ioctl_media_changed(cdi, arg);
+       case CDROM_TIMED_MEDIA_CHANGE:
+               return cdrom_ioctl_timed_media_change(cdi, arg);
        case CDROM_SET_OPTIONS:
                return cdrom_ioctl_set_options(cdi, arg);
        case CDROM_CLEAR_OPTIONS:
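For context on the CDROM_TIMED_MEDIA_CHANGE ioctl added above, a minimal userspace sketch follows. It assumes the UAPI half of this series exports struct cdrom_timed_media_change_info, CDROM_TIMED_MEDIA_CHANGE and MEDIA_CHANGED_FLAG through <linux/cdrom.h>, and it uses /dev/sr0 as an example device node; both of those details are assumptions, not taken from the hunks shown here.

/* Hypothetical example: ask the kernel whether the medium changed since a
 * caller-supplied timestamp. Field and macro names follow the driver code
 * above; the header location and device path are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cdrom.h>

int main(void)
{
	struct cdrom_timed_media_change_info info = {
		.last_media_change = 0,	/* last change known to the caller, in ms */
	};
	int fd = open("/dev/sr0", O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, CDROM_TIMED_MEDIA_CHANGE, &info) < 0) {
		perror("CDROM_TIMED_MEDIA_CHANGE");
		close(fd);
		return 1;
	}
	/* The kernel writes back its latest known change timestamp and sets
	 * MEDIA_CHANGED_FLAG if that timestamp is newer than the one passed in. */
	printf("last media change: %lld ms, changed: %s\n",
	       (long long)info.last_media_change,
	       (info.media_flags & MEDIA_CHANGED_FLAG) ? "yes" : "no");
	close(fd);
	return 0;
}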
index 8e1fe75..d50cc1f 100644 (file)
@@ -805,9 +805,14 @@ static int probe_gdrom(struct platform_device *devptr)
                err = -ENOMEM;
                goto probe_fail_free_irqs;
        }
-       add_disk(gd.disk);
+       err = add_disk(gd.disk);
+       if (err)
+               goto probe_fail_add_disk;
+
        return 0;
 
+probe_fail_add_disk:
+       kfree(gd.toc);
 probe_fail_free_irqs:
        free_irq(HW_EVENT_GDROM_DMA, &gd);
        free_irq(HW_EVENT_GDROM_CMD, &gd);
index d6ba644..4a55164 100644 (file)
@@ -76,7 +76,7 @@ config TCG_TIS_SPI_CR50
 
 config TCG_TIS_SYNQUACER
        tristate "TPM Interface Specification 1.2 Interface / TPM 2.0 FIFO Interface (MMIO - SynQuacer)"
-       depends on ARCH_SYNQUACER
+       depends on ARCH_SYNQUACER || COMPILE_TEST
        select TCG_TIS_CORE
        help
          If you have a TPM security chip that is compliant with the
index 784b8b3..97e9168 100644 (file)
@@ -455,6 +455,9 @@ static int tpm2_map_response_body(struct tpm_chip *chip, u32 cc, u8 *rsp,
        if (be32_to_cpu(data->capability) != TPM2_CAP_HANDLES)
                return 0;
 
+       if (be32_to_cpu(data->count) > (UINT_MAX - TPM_HEADER_SIZE - 9) / 4)
+               return -EFAULT;
+
        if (len != TPM_HEADER_SIZE + 9 + 4 * be32_to_cpu(data->count))
                return -EFAULT;
 
index 69579ef..b2659a4 100644 (file)
@@ -48,6 +48,7 @@ static int wait_for_tpm_stat(struct tpm_chip *chip, u8 mask,
                unsigned long timeout, wait_queue_head_t *queue,
                bool check_cancel)
 {
+       struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
        unsigned long stop;
        long rc;
        u8 status;
@@ -80,8 +81,8 @@ again:
                }
        } else {
                do {
-                       usleep_range(TPM_TIMEOUT_USECS_MIN,
-                                    TPM_TIMEOUT_USECS_MAX);
+                       usleep_range(priv->timeout_min,
+                                    priv->timeout_max);
                        status = chip->ops->status(chip);
                        if ((status & mask) == mask)
                                return 0;
@@ -945,7 +946,22 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq,
        chip->timeout_b = msecs_to_jiffies(TIS_TIMEOUT_B_MAX);
        chip->timeout_c = msecs_to_jiffies(TIS_TIMEOUT_C_MAX);
        chip->timeout_d = msecs_to_jiffies(TIS_TIMEOUT_D_MAX);
+       priv->timeout_min = TPM_TIMEOUT_USECS_MIN;
+       priv->timeout_max = TPM_TIMEOUT_USECS_MAX;
        priv->phy_ops = phy_ops;
+
+       rc = tpm_tis_read32(priv, TPM_DID_VID(0), &vendor);
+       if (rc < 0)
+               goto out_err;
+
+       priv->manufacturer_id = vendor;
+
+       if (priv->manufacturer_id == TPM_VID_ATML &&
+               !(chip->flags & TPM_CHIP_FLAG_TPM2)) {
+               priv->timeout_min = TIS_TIMEOUT_MIN_ATML;
+               priv->timeout_max = TIS_TIMEOUT_MAX_ATML;
+       }
+
        dev_set_drvdata(&chip->dev, priv);
 
        if (is_bsw()) {
@@ -988,12 +1004,6 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq,
        if (rc)
                goto out_err;
 
-       rc = tpm_tis_read32(priv, TPM_DID_VID(0), &vendor);
-       if (rc < 0)
-               goto out_err;
-
-       priv->manufacturer_id = vendor;
-
        rc = tpm_tis_read8(priv, TPM_RID(0), &rid);
        if (rc < 0)
                goto out_err;
index b2a3c6c..3be24f2 100644 (file)
@@ -54,6 +54,8 @@ enum tis_defaults {
        TIS_MEM_LEN = 0x5000,
        TIS_SHORT_TIMEOUT = 750,        /* ms */
        TIS_LONG_TIMEOUT = 2000,        /* 2 sec */
+       TIS_TIMEOUT_MIN_ATML = 14700,   /* usecs */
+       TIS_TIMEOUT_MAX_ATML = 15000,   /* usecs */
 };
 
 /* Some timeout values are needed before it is known whether the chip is
@@ -98,6 +100,8 @@ struct tpm_tis_data {
        wait_queue_head_t read_queue;
        const struct tpm_tis_phy_ops *phy_ops;
        unsigned short rng_quality;
+       unsigned int timeout_min; /* usecs */
+       unsigned int timeout_max; /* usecs */
 };
 
 struct tpm_tis_phy_ops {
index 54584b4..aaa59a0 100644 (file)
@@ -267,6 +267,7 @@ static const struct spi_device_id tpm_tis_spi_id[] = {
        { "st33htpm-spi", (unsigned long)tpm_tis_spi_probe },
        { "slb9670", (unsigned long)tpm_tis_spi_probe },
        { "tpm_tis_spi", (unsigned long)tpm_tis_spi_probe },
+       { "tpm_tis-spi", (unsigned long)tpm_tis_spi_probe },
        { "cr50", (unsigned long)cr50_spi_probe },
        {}
 };
index 0506046..510a996 100644 (file)
@@ -58,11 +58,8 @@ static int clk_composite_determine_rate(struct clk_hw *hw,
        long rate;
        int i;
 
-       if (rate_hw && rate_ops && rate_ops->determine_rate) {
-               __clk_hw_set_clk(rate_hw, hw);
-               return rate_ops->determine_rate(rate_hw, req);
-       } else if (rate_hw && rate_ops && rate_ops->round_rate &&
-                  mux_hw && mux_ops && mux_ops->set_parent) {
+       if (rate_hw && rate_ops && rate_ops->round_rate &&
+           mux_hw && mux_ops && mux_ops->set_parent) {
                req->best_parent_hw = NULL;
 
                if (clk_hw_get_flags(hw) & CLK_SET_RATE_NO_REPARENT) {
@@ -107,6 +104,9 @@ static int clk_composite_determine_rate(struct clk_hw *hw,
 
                req->rate = best_rate;
                return 0;
+       } else if (rate_hw && rate_ops && rate_ops->determine_rate) {
+               __clk_hw_set_clk(rate_hw, hw);
+               return rate_ops->determine_rate(rate_hw, req);
        } else if (mux_hw && mux_ops && mux_ops->determine_rate) {
                __clk_hw_set_clk(mux_hw, hw);
                return mux_ops->determine_rate(mux_hw, req);
index 177d03e..40a052b 100644 (file)
@@ -256,6 +256,11 @@ mlxbf2_gpio_probe(struct platform_device *pdev)
                        NULL,
                        0);
 
+       if (ret) {
+               dev_err(dev, "bgpio_init failed\n");
+               return ret;
+       }
+
        gc->direction_input = mlxbf2_gpio_direction_input;
        gc->direction_output = mlxbf2_gpio_direction_output;
        gc->ngpio = npins;
index fa9b4d8..43ca52f 100644 (file)
@@ -224,7 +224,7 @@ static int iproc_gpio_probe(struct platform_device *pdev)
        }
 
        chip->gc.label = dev_name(dev);
-       if (of_property_read_u32(dn, "ngpios", &num_gpios))
+       if (!of_property_read_u32(dn, "ngpios", &num_gpios))
                chip->gc.ngpio = num_gpios;
 
        irq = platform_get_irq(pdev, 0);
index ff80786..01efda4 100644 (file)
@@ -1257,7 +1257,7 @@ static int nv_common_early_init(void *handle)
                        AMD_PG_SUPPORT_VCN_DPG |
                        AMD_PG_SUPPORT_JPEG;
                if (adev->pdev->device == 0x1681)
-                       adev->external_rev_id = adev->rev_id + 0x19;
+                       adev->external_rev_id = 0x20;
                else
                        adev->external_rev_id = adev->rev_id + 0x01;
                break;
index 87daa78..8080bba 100644 (file)
@@ -263,7 +263,7 @@ static ssize_t dp_link_settings_write(struct file *f, const char __user *buf,
        if (!wr_buf)
                return -ENOSPC;
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           (long *)param, buf,
                                           max_param_num,
                                           &param_nums)) {
@@ -487,7 +487,7 @@ static ssize_t dp_phy_settings_write(struct file *f, const char __user *buf,
        if (!wr_buf)
                return -ENOSPC;
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           (long *)param, buf,
                                           max_param_num,
                                           &param_nums)) {
@@ -639,7 +639,7 @@ static ssize_t dp_phy_test_pattern_debugfs_write(struct file *f, const char __us
        if (!wr_buf)
                return -ENOSPC;
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           (long *)param, buf,
                                           max_param_num,
                                           &param_nums)) {
@@ -914,7 +914,7 @@ static ssize_t dp_dsc_passthrough_set(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           &param, buf,
                                           max_param_num,
                                           &param_nums)) {
@@ -1211,7 +1211,7 @@ static ssize_t trigger_hotplug(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                                (long *)param, buf,
                                                max_param_num,
                                                &param_nums)) {
@@ -1396,7 +1396,7 @@ static ssize_t dp_dsc_clock_en_write(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                            (long *)param, buf,
                                            max_param_num,
                                            &param_nums)) {
@@ -1581,7 +1581,7 @@ static ssize_t dp_dsc_slice_width_write(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                            (long *)param, buf,
                                            max_param_num,
                                            &param_nums)) {
@@ -1766,7 +1766,7 @@ static ssize_t dp_dsc_slice_height_write(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                            (long *)param, buf,
                                            max_param_num,
                                            &param_nums)) {
@@ -1944,7 +1944,7 @@ static ssize_t dp_dsc_bits_per_pixel_write(struct file *f, const char __user *bu
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                            (long *)param, buf,
                                            max_param_num,
                                            &param_nums)) {
@@ -2382,7 +2382,7 @@ static ssize_t dp_max_bpc_write(struct file *f, const char __user *buf,
                return -ENOSPC;
        }
 
-       if (parse_write_buffer_into_params(wr_buf, size,
+       if (parse_write_buffer_into_params(wr_buf, wr_buf_size,
                                           (long *)param, buf,
                                           max_param_num,
                                           &param_nums)) {
index 4a4894e..377c4e5 100644 (file)
@@ -366,32 +366,32 @@ static struct wm_table lpddr5_wm_table = {
                        .wm_inst = WM_A,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.65333,
-                       .sr_exit_time_us = 5.32,
-                       .sr_enter_plus_exit_time_us = 6.38,
+                       .sr_exit_time_us = 11.5,
+                       .sr_enter_plus_exit_time_us = 14.5,
                        .valid = true,
                },
                {
                        .wm_inst = WM_B,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.65333,
-                       .sr_exit_time_us = 9.82,
-                       .sr_enter_plus_exit_time_us = 11.196,
+                       .sr_exit_time_us = 11.5,
+                       .sr_enter_plus_exit_time_us = 14.5,
                        .valid = true,
                },
                {
                        .wm_inst = WM_C,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.65333,
-                       .sr_exit_time_us = 9.89,
-                       .sr_enter_plus_exit_time_us = 11.24,
+                       .sr_exit_time_us = 11.5,
+                       .sr_enter_plus_exit_time_us = 14.5,
                        .valid = true,
                },
                {
                        .wm_inst = WM_D,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.65333,
-                       .sr_exit_time_us = 9.748,
-                       .sr_enter_plus_exit_time_us = 11.102,
+                       .sr_exit_time_us = 11.5,
+                       .sr_enter_plus_exit_time_us = 14.5,
                        .valid = true,
                },
        }
@@ -518,14 +518,21 @@ static unsigned int find_clk_for_voltage(
                unsigned int voltage)
 {
        int i;
+       int max_voltage = 0;
+       int clock = 0;
 
        for (i = 0; i < NUM_SOC_VOLTAGE_LEVELS; i++) {
-               if (clock_table->SocVoltage[i] == voltage)
+               if (clock_table->SocVoltage[i] == voltage) {
                        return clocks[i];
+               } else if (clock_table->SocVoltage[i] >= max_voltage &&
+                               clock_table->SocVoltage[i] < voltage) {
+                       max_voltage = clock_table->SocVoltage[i];
+                       clock = clocks[i];
+               }
        }
 
-       ASSERT(0);
-       return 0;
+       ASSERT(clock);
+       return clock;
 }
 
 void dcn31_clk_mgr_helper_populate_bw_params(
index 3f2333e..3afa115 100644 (file)
@@ -76,10 +76,6 @@ void dcn31_init_hw(struct dc *dc)
        if (dc->clk_mgr && dc->clk_mgr->funcs->init_clocks)
                dc->clk_mgr->funcs->init_clocks(dc->clk_mgr);
 
-       // Initialize the dccg
-       if (res_pool->dccg->funcs->dccg_init)
-               res_pool->dccg->funcs->dccg_init(res_pool->dccg);
-
        if (IS_FPGA_MAXIMUS_DC(dc->ctx->dce_environment)) {
 
                REG_WRITE(REFCLK_CNTL, 0);
@@ -106,6 +102,9 @@ void dcn31_init_hw(struct dc *dc)
                hws->funcs.bios_golden_init(dc);
                hws->funcs.disable_vga(dc->hwseq);
        }
+       // Initialize the dccg
+       if (res_pool->dccg->funcs->dccg_init)
+               res_pool->dccg->funcs->dccg_init(res_pool->dccg);
 
        if (dc->debug.enable_mem_low_power.bits.dmcu) {
                // Force ERAM to shutdown if DMCU is not enabled
index 0006bba..79e92ec 100644 (file)
@@ -217,8 +217,8 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_1_soc = {
        .num_states = 5,
        .sr_exit_time_us = 9.0,
        .sr_enter_plus_exit_time_us = 11.0,
-       .sr_exit_z8_time_us = 402.0,
-       .sr_enter_plus_exit_z8_time_us = 520.0,
+       .sr_exit_z8_time_us = 442.0,
+       .sr_enter_plus_exit_z8_time_us = 560.0,
        .writeback_latency_us = 12.0,
        .dram_channel_width_bytes = 4,
        .round_trip_ping_latency_dcfclk_cycles = 106,
@@ -928,7 +928,7 @@ static const struct dc_debug_options debug_defaults_drv = {
        .disable_dcc = DCC_ENABLE,
        .vsr_support = true,
        .performance_trace = false,
-       .max_downscale_src_width = 3840,/*upto 4K*/
+       .max_downscale_src_width = 4096,/*upto true 4K*/
        .disable_pplib_wm_range = false,
        .scl_reset_length10 = true,
        .sanity_checks = false,
@@ -1590,6 +1590,13 @@ static int dcn31_populate_dml_pipes_from_context(
                pipe = &res_ctx->pipe_ctx[i];
                timing = &pipe->stream->timing;
 
+               /*
+                * Immediate flip can be set dynamically after enabling the plane.
+                * We need to require support for immediate flip or underflow can be
+                * intermittently experienced depending on peak b/w requirements.
+                */
+               pipes[pipe_cnt].pipe.src.immediate_flip = true;
+
                pipes[pipe_cnt].pipe.src.unbounded_req_mode = false;
                pipes[pipe_cnt].pipe.src.gpuvm = true;
                pipes[pipe_cnt].pipe.src.dcc_fraction_of_zs_req_luma = 0;
index ce55c9c..d58925c 100644 (file)
@@ -5398,9 +5398,9 @@ void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l
 
                                        v->MaximumReadBandwidthWithPrefetch =
                                                        v->MaximumReadBandwidthWithPrefetch
-                                                                       + dml_max4(
-                                                                                       v->VActivePixelBandwidth[i][j][k],
-                                                                                       v->VActiveCursorBandwidth[i][j][k]
+                                                                       + dml_max3(
+                                                                                       v->VActivePixelBandwidth[i][j][k]
+                                                                                                       + v->VActiveCursorBandwidth[i][j][k]
                                                                                                        + v->NoOfDPP[i][j][k]
                                                                                                                        * (v->meta_row_bandwidth[i][j][k]
                                                                                                                                        + v->dpte_row_bandwidth[i][j][k]),
index 5adc471..3d2f081 100644 (file)
@@ -227,7 +227,7 @@ enum {
 #define FAMILY_YELLOW_CARP                     146
 
 #define YELLOW_CARP_A0 0x01
-#define YELLOW_CARP_B0 0x1A
+#define YELLOW_CARP_B0 0x20
 #define YELLOW_CARP_UNKNOWN 0xFF
 
 #ifndef ASICREV_IS_YELLOW_CARP
index e9bd84e..be61975 100644 (file)
@@ -105,6 +105,7 @@ static enum mod_hdcp_status remove_display_from_topology_v3(
        dtm_cmd->dtm_status = TA_DTM_STATUS__GENERIC_FAILURE;
 
        psp_dtm_invoke(psp, dtm_cmd->cmd_id);
+       mutex_unlock(&psp->dtm_context.mutex);
 
        if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS) {
                status = remove_display_from_topology_v2(hdcp, index);
@@ -115,8 +116,6 @@ static enum mod_hdcp_status remove_display_from_topology_v3(
                HDCP_TOP_REMOVE_DISPLAY_TRACE(hdcp, display->index);
        }
 
-       mutex_unlock(&psp->dtm_context.mutex);
-
        return status;
 }
 
@@ -205,6 +204,7 @@ static enum mod_hdcp_status add_display_to_topology_v3(
        dtm_cmd->dtm_in_message.topology_update_v3.link_hdcp_cap = link->hdcp_supported_informational;
 
        psp_dtm_invoke(psp, dtm_cmd->cmd_id);
+       mutex_unlock(&psp->dtm_context.mutex);
 
        if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS) {
                status = add_display_to_topology_v2(hdcp, display);
@@ -214,8 +214,6 @@ static enum mod_hdcp_status add_display_to_topology_v3(
                HDCP_TOP_ADD_DISPLAY_TRACE(hdcp, display->index);
        }
 
-       mutex_unlock(&psp->dtm_context.mutex);
-
        return status;
 }
 
index f6bdec7..e1b2ce4 100644 (file)
@@ -134,6 +134,12 @@ static const struct dmi_system_id orientation_data[] = {
                  DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T103HAF"),
                },
                .driver_data = (void *)&lcd800x1280_rightside_up,
+       }, {    /* AYA NEO 2021 */
+               .matches = {
+                 DMI_EXACT_MATCH(DMI_SYS_VENDOR, "AYADEVICE"),
+                 DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "AYA NEO 2021"),
+               },
+               .driver_data = (void *)&lcd800x1280_rightside_up,
        }, {    /* GPD MicroPC (generic strings, also match on bios date) */
                .matches = {
                  DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Default string"),
@@ -185,6 +191,12 @@ static const struct dmi_system_id orientation_data[] = {
                  DMI_EXACT_MATCH(DMI_BOARD_NAME, "Default string"),
                },
                .driver_data = (void *)&gpd_win2,
+       }, {    /* GPD Win 3 */
+               .matches = {
+                 DMI_EXACT_MATCH(DMI_SYS_VENDOR, "GPD"),
+                 DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "G1618-03")
+               },
+               .driver_data = (void *)&lcd720x1280_rightside_up,
        }, {    /* I.T.Works TW891 */
                .matches = {
                  DMI_EXACT_MATCH(DMI_SYS_VENDOR, "To be filled by O.E.M."),
index abe3d61..5cf152b 100644 (file)
@@ -1916,6 +1916,9 @@ void intel_dp_sync_state(struct intel_encoder *encoder,
 {
        struct intel_dp *intel_dp = enc_to_intel_dp(encoder);
 
+       if (!crtc_state)
+               return;
+
        /*
         * Don't clobber DPCD if it's been already read out during output
         * setup (eDP) or detect.
index 1257f4f..438bbc7 100644 (file)
@@ -64,7 +64,7 @@ intel_timeline_pin_map(struct intel_timeline *timeline)
 
        timeline->hwsp_map = vaddr;
        timeline->hwsp_seqno = memset(vaddr + ofs, 0, TIMELINE_SEQNO_BYTES);
-       clflush(vaddr + ofs);
+       drm_clflush_virt_range(vaddr + ofs, TIMELINE_SEQNO_BYTES);
 
        return 0;
 }
@@ -225,7 +225,7 @@ void intel_timeline_reset_seqno(const struct intel_timeline *tl)
 
        memset(hwsp_seqno + 1, 0, TIMELINE_SEQNO_BYTES - sizeof(*hwsp_seqno));
        WRITE_ONCE(*hwsp_seqno, tl->seqno);
-       clflush(hwsp_seqno);
+       drm_clflush_virt_range(hwsp_seqno, TIMELINE_SEQNO_BYTES);
 }
 
 void intel_timeline_enter(struct intel_timeline *tl)
index 4037030..9023d4e 100644 (file)
@@ -11048,12 +11048,6 @@ enum skl_power_gate {
 #define  DC_STATE_DEBUG_MASK_CORES     (1 << 0)
 #define  DC_STATE_DEBUG_MASK_MEMORY_UP (1 << 1)
 
-#define BXT_P_CR_MC_BIOS_REQ_0_0_0     _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x7114)
-#define  BXT_REQ_DATA_MASK                     0x3F
-#define  BXT_DRAM_CHANNEL_ACTIVE_SHIFT         12
-#define  BXT_DRAM_CHANNEL_ACTIVE_MASK          (0xF << 12)
-#define  BXT_MEMORY_FREQ_MULTIPLIER_HZ         133333333
-
 #define BXT_D_CR_DRP0_DUNIT8                   0x1000
 #define BXT_D_CR_DRP0_DUNIT9                   0x1200
 #define  BXT_D_CR_DRP0_DUNIT_START             8
@@ -11084,9 +11078,7 @@ enum skl_power_gate {
 #define  BXT_DRAM_TYPE_LPDDR4                  (0x2 << 22)
 #define  BXT_DRAM_TYPE_DDR4                    (0x4 << 22)
 
-#define SKL_MEMORY_FREQ_MULTIPLIER_HZ          266666666
 #define SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU      _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5E04)
-#define  SKL_REQ_DATA_MASK                     (0xF << 0)
 #define  DG1_GEAR_TYPE                         REG_BIT(16)
 
 #define SKL_MAD_INTER_CHANNEL_0_0_0_MCHBAR_MCMAIN _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5000)
index 806ad68..63fec1c 100644 (file)
@@ -794,7 +794,6 @@ DECLARE_EVENT_CLASS(i915_request,
            TP_STRUCT__entry(
                             __field(u32, dev)
                             __field(u64, ctx)
-                            __field(u32, guc_id)
                             __field(u16, class)
                             __field(u16, instance)
                             __field(u32, seqno)
@@ -805,16 +804,14 @@ DECLARE_EVENT_CLASS(i915_request,
                           __entry->dev = rq->engine->i915->drm.primary->index;
                           __entry->class = rq->engine->uabi_class;
                           __entry->instance = rq->engine->uabi_instance;
-                          __entry->guc_id = rq->context->guc_id;
                           __entry->ctx = rq->fence.context;
                           __entry->seqno = rq->fence.seqno;
                           __entry->tail = rq->tail;
                           ),
 
-           TP_printk("dev=%u, engine=%u:%u, guc_id=%u, ctx=%llu, seqno=%u, tail=%u",
+           TP_printk("dev=%u, engine=%u:%u, ctx=%llu, seqno=%u, tail=%u",
                      __entry->dev, __entry->class, __entry->instance,
-                     __entry->guc_id, __entry->ctx, __entry->seqno,
-                     __entry->tail)
+                     __entry->ctx, __entry->seqno, __entry->tail)
 );
 
 DEFINE_EVENT(i915_request, i915_request_add,
index 5259eda..066a911 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/sched/clock.h>
 
 struct drm_i915_private;
 struct timer_list;
index 9186652..7acce64 100644 (file)
@@ -244,7 +244,6 @@ static int
 skl_get_dram_info(struct drm_i915_private *i915)
 {
        struct dram_info *dram_info = &i915->dram_info;
-       u32 mem_freq_khz, val;
        int ret;
 
        dram_info->type = skl_get_dram_type(i915);
@@ -255,17 +254,6 @@ skl_get_dram_info(struct drm_i915_private *i915)
        if (ret)
                return ret;
 
-       val = intel_uncore_read(&i915->uncore,
-                               SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU);
-       mem_freq_khz = DIV_ROUND_UP((val & SKL_REQ_DATA_MASK) *
-                                   SKL_MEMORY_FREQ_MULTIPLIER_HZ, 1000);
-
-       if (dram_info->num_channels * mem_freq_khz == 0) {
-               drm_info(&i915->drm,
-                        "Couldn't get system memory bandwidth\n");
-               return -EINVAL;
-       }
-
        return 0;
 }
 
@@ -350,24 +338,10 @@ static void bxt_get_dimm_info(struct dram_dimm_info *dimm, u32 val)
 static int bxt_get_dram_info(struct drm_i915_private *i915)
 {
        struct dram_info *dram_info = &i915->dram_info;
-       u32 dram_channels;
-       u32 mem_freq_khz, val;
-       u8 num_active_channels, valid_ranks = 0;
+       u32 val;
+       u8 valid_ranks = 0;
        int i;
 
-       val = intel_uncore_read(&i915->uncore, BXT_P_CR_MC_BIOS_REQ_0_0_0);
-       mem_freq_khz = DIV_ROUND_UP((val & BXT_REQ_DATA_MASK) *
-                                   BXT_MEMORY_FREQ_MULTIPLIER_HZ, 1000);
-
-       dram_channels = val & BXT_DRAM_CHANNEL_ACTIVE_MASK;
-       num_active_channels = hweight32(dram_channels);
-
-       if (mem_freq_khz * num_active_channels == 0) {
-               drm_info(&i915->drm,
-                        "Couldn't get system memory bandwidth\n");
-               return -EINVAL;
-       }
-
        /*
         * Now read each DUNIT8/9/10/11 to check the rank of each dimms.
         */
index 1c19a5d..8d8d8e2 100644 (file)
@@ -30,6 +30,7 @@ static void mock_setup(struct drm_plane_state *state)
        mock_device.driver = &mock_driver;
        mock_device.mode_config.prop_fb_damage_clips = &mock_prop;
        mock_plane.dev = &mock_device;
+       mock_obj_props.count = 0;
        mock_plane.base.properties = &mock_obj_props;
        mock_prop.base.id = 1; /* 0 is an invalid id */
        mock_prop.dev = &mock_device;
index 1c5ffe2..abf2d7a 100644 (file)
@@ -190,6 +190,7 @@ static void ttm_transfered_destroy(struct ttm_buffer_object *bo)
        struct ttm_transfer_obj *fbo;
 
        fbo = container_of(bo, struct ttm_transfer_obj, base);
+       dma_resv_fini(&fbo->base.base._resv);
        ttm_bo_put(fbo->bo);
        kfree(fbo);
 }
index a20b810..c00f8e2 100644 (file)
@@ -706,8 +706,9 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
 
        /* Construct the family header first */
        header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
-       memcpy(header->device_name, dev_name(&query->port->agent->device->dev),
-              LS_DEVICE_NAME_MAX);
+       strscpy_pad(header->device_name,
+                   dev_name(&query->port->agent->device->dev),
+                   LS_DEVICE_NAME_MAX);
        header->port_num = query->port->port_num;
 
        if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
index 489b436..3d42bd2 100644 (file)
@@ -878,6 +878,7 @@ void sc_disable(struct send_context *sc)
 {
        u64 reg;
        struct pio_buf *pbuf;
+       LIST_HEAD(wake_list);
 
        if (!sc)
                return;
@@ -912,19 +913,21 @@ void sc_disable(struct send_context *sc)
        spin_unlock(&sc->release_lock);
 
        write_seqlock(&sc->waitlock);
-       while (!list_empty(&sc->piowait)) {
+       if (!list_empty(&sc->piowait))
+               list_move(&sc->piowait, &wake_list);
+       write_sequnlock(&sc->waitlock);
+       while (!list_empty(&wake_list)) {
                struct iowait *wait;
                struct rvt_qp *qp;
                struct hfi1_qp_priv *priv;
 
-               wait = list_first_entry(&sc->piowait, struct iowait, list);
+               wait = list_first_entry(&wake_list, struct iowait, list);
                qp = iowait_to_qp(wait);
                priv = qp->priv;
                list_del_init(&priv->s_iowait.list);
                priv->s_iowait.lock = NULL;
                hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
        }
-       write_sequnlock(&sc->waitlock);
 
        spin_unlock_irq(&sc->alloc_lock);
 }
index 5fb92de..9b544a3 100644 (file)
@@ -1092,12 +1092,12 @@ irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info)
                if (cq->avoid_mem_cflct) {
                        ext_cqe = (__le64 *)((u8 *)cqe + 32);
                        get_64bit_val(ext_cqe, 24, &qword7);
-                       polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
+                       polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword7);
                } else {
                        peek_head = (cq->cq_ring.head + 1) % cq->cq_ring.size;
                        ext_cqe = cq->cq_base[peek_head].buf;
                        get_64bit_val(ext_cqe, 24, &qword7);
-                       polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
+                       polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword7);
                        if (!peek_head)
                                polarity ^= 1;
                }
index 7110ebf..102dc93 100644 (file)
@@ -3399,9 +3399,13 @@ static void irdma_process_cqe(struct ib_wc *entry,
                }
 
                if (cq_poll_info->ud_vlan_valid) {
-                       entry->vlan_id = cq_poll_info->ud_vlan & VLAN_VID_MASK;
-                       entry->wc_flags |= IB_WC_WITH_VLAN;
+                       u16 vlan = cq_poll_info->ud_vlan & VLAN_VID_MASK;
+
                        entry->sl = cq_poll_info->ud_vlan >> VLAN_PRIO_SHIFT;
+                       if (vlan) {
+                               entry->vlan_id = vlan;
+                               entry->wc_flags |= IB_WC_WITH_VLAN;
+                       }
                } else {
                        entry->sl = 0;
                }
index b68c575..b0d6ee0 100644 (file)
@@ -330,8 +330,10 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri)
 
                tc_node->enable = true;
                ret = irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_MODIFY_NODE);
-               if (ret)
+               if (ret) {
+                       vsi->unregister_qset(vsi, tc_node);
                        goto reg_err;
+               }
        }
        ibdev_dbg(to_ibdev(vsi->dev),
                  "WS: Using node %d which represents VSI %d TC %d\n",
@@ -350,6 +352,10 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri)
        }
        goto exit;
 
+reg_err:
+       irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_DELETE_NODE);
+       list_del(&tc_node->siblings);
+       irdma_free_node(vsi, tc_node);
 leaf_add_err:
        if (list_empty(&vsi_node->child_list_head)) {
                if (irdma_ws_cqp_cmd(vsi, vsi_node, IRDMA_OP_WS_DELETE_NODE))
@@ -369,11 +375,6 @@ vsi_add_err:
 exit:
        mutex_unlock(&vsi->dev->ws_mutex);
        return ret;
-
-reg_err:
-       mutex_unlock(&vsi->dev->ws_mutex);
-       irdma_ws_remove(vsi, user_pri);
-       return ret;
 }
 
 /**
index 3be36eb..22e2f4d 100644 (file)
@@ -1339,7 +1339,6 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
                goto err_2;
        }
        mr->mmkey.type = MLX5_MKEY_MR;
-       mr->desc_size = sizeof(struct mlx5_mtt);
        mr->umem = umem;
        set_mr_fields(dev, mr, umem->length, access_flags);
        kvfree(in);
@@ -1533,6 +1532,7 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
                ib_umem_release(&odp->umem);
                return ERR_CAST(mr);
        }
+       xa_init(&mr->implicit_children);
 
        odp->private = mr;
        err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
index b2fca11..e5abbcf 100644 (file)
@@ -4458,6 +4458,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                MLX5_SET(dctc, dctc, mtu, attr->path_mtu);
                MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index);
                MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
+               if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
+                       MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7);
 
                err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in,
                                           MLX5_ST_SZ_BYTES(create_dct_in), out,
index 3cb4feb..8def88c 100644 (file)
@@ -455,6 +455,7 @@ struct qedr_qp {
        /* synchronization objects used with iwarp ep */
        struct kref refcnt;
        struct completion iwarp_cm_comp;
+       struct completion qp_rel_comp;
        unsigned long iwarp_cm_flags; /* enum iwarp_cm_flags */
 };
 
index 1715fbe..a51fc68 100644 (file)
@@ -83,7 +83,7 @@ static void qedr_iw_free_qp(struct kref *ref)
 {
        struct qedr_qp *qp = container_of(ref, struct qedr_qp, refcnt);
 
-       kfree(qp);
+       complete(&qp->qp_rel_comp);
 }
 
 static void
index 3fbf172..dcb3653 100644 (file)
@@ -1357,6 +1357,7 @@ static void qedr_set_common_qp_params(struct qedr_dev *dev,
        if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
                kref_init(&qp->refcnt);
                init_completion(&qp->iwarp_cm_comp);
+               init_completion(&qp->qp_rel_comp);
        }
 
        qp->pd = pd;
@@ -2857,8 +2858,10 @@ int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
        qedr_free_qp_resources(dev, qp, udata);
 
-       if (rdma_protocol_iwarp(&dev->ibdev, 1))
+       if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
                qedr_iw_qp_rem_ref(&qp->ibqp);
+               wait_for_completion(&qp->qp_rel_comp);
+       }
 
        return 0;
 }
index a67599b..ac11943 100644 (file)
@@ -602,7 +602,7 @@ done:
 /*
  * How many pages in this iovec element?
  */
-static int qib_user_sdma_num_pages(const struct iovec *iov)
+static size_t qib_user_sdma_num_pages(const struct iovec *iov)
 {
        const unsigned long addr  = (unsigned long) iov->iov_base;
        const unsigned long  len  = iov->iov_len;
@@ -658,7 +658,7 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev,
 static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
                                   struct qib_user_sdma_queue *pq,
                                   struct qib_user_sdma_pkt *pkt,
-                                  unsigned long addr, int tlen, int npages)
+                                  unsigned long addr, int tlen, size_t npages)
 {
        struct page *pages[8];
        int i, j;
@@ -722,7 +722,7 @@ static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd,
        unsigned long idx;
 
        for (idx = 0; idx < niov; idx++) {
-               const int npages = qib_user_sdma_num_pages(iov + idx);
+               const size_t npages = qib_user_sdma_num_pages(iov + idx);
                const unsigned long addr = (unsigned long) iov[idx].iov_base;
 
                ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr,
@@ -824,8 +824,8 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                unsigned pktnw;
                unsigned pktnwc;
                int nfrags = 0;
-               int npages = 0;
-               int bytes_togo = 0;
+               size_t npages = 0;
+               size_t bytes_togo = 0;
                int tiddma = 0;
                int cfur;
 
@@ -885,7 +885,11 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
 
                        npages += qib_user_sdma_num_pages(&iov[idx]);
 
-                       bytes_togo += slen;
+                       if (check_add_overflow(bytes_togo, slen, &bytes_togo) ||
+                           bytes_togo > type_max(typeof(pkt->bytes_togo))) {
+                               ret = -EINVAL;
+                               goto free_pbc;
+                       }
                        pktnwc += slen >> 2;
                        idx++;
                        nfrags++;
@@ -904,8 +908,7 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                }
 
                if (frag_size) {
-                       int tidsmsize, n;
-                       size_t pktsize;
+                       size_t tidsmsize, n, pktsize, sz, addrlimit;
 
                        n = npages*((2*PAGE_SIZE/frag_size)+1);
                        pktsize = struct_size(pkt, addr, n);
@@ -923,14 +926,24 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                        else
                                tidsmsize = 0;
 
-                       pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL);
+                       if (check_add_overflow(pktsize, tidsmsize, &sz)) {
+                               ret = -EINVAL;
+                               goto free_pbc;
+                       }
+                       pkt = kmalloc(sz, GFP_KERNEL);
                        if (!pkt) {
                                ret = -ENOMEM;
                                goto free_pbc;
                        }
                        pkt->largepkt = 1;
                        pkt->frag_size = frag_size;
-                       pkt->addrlimit = n + ARRAY_SIZE(pkt->addr);
+                       if (check_add_overflow(n, ARRAY_SIZE(pkt->addr),
+                                              &addrlimit) ||
+                           addrlimit > type_max(typeof(pkt->addrlimit))) {
+                               ret = -EINVAL;
+                               goto free_pbc;
+                       }
+                       pkt->addrlimit = addrlimit;
 
                        if (tiddma) {
                                char *tidsm = (char *)pkt + pktsize;
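The qib_user_sdma changes above widen the page and byte counters to size_t and reject requests whose accumulated sizes overflow (check_add_overflow()) or no longer fit the narrower packet fields (type_max()), returning -EINVAL instead of silently wrapping. Below is a standalone sketch of that guard pattern in plain C, using the __builtin_add_overflow primitive that the kernel helper typically wraps; the function and the 16-bit field width are illustrative, not taken from the driver.

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative only: accumulate per-fragment lengths the way bytes_togo is
 * accumulated above, failing if the sum overflows or exceeds the maximum of
 * the (assumed 16-bit) field it will eventually be stored in. */
static int accumulate_len(size_t *total, size_t slen, size_t field_max)
{
	size_t sum;

	if (__builtin_add_overflow(*total, slen, &sum) || sum > field_max)
		return -1;	/* the driver maps this case to -EINVAL */
	*total = sum;
	return 0;
}

int main(void)
{
	size_t total = 0;

	if (accumulate_len(&total, 40000, USHRT_MAX) ||
	    accumulate_len(&total, 40000, USHRT_MAX))
		puts("rejected: byte count would overflow the packet field");
	else
		printf("total = %zu\n", total);
	return 0;
}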
index 49bdd78..3305f27 100644 (file)
@@ -1223,7 +1223,7 @@ int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
        spin_lock(&rdi->n_qps_lock);
        if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
                spin_unlock(&rdi->n_qps_lock);
-               ret = ENOMEM;
+               ret = -ENOMEM;
                goto bail_ip;
        }
 
index 5fc989a..9ed9c95 100644 (file)
 
 #define pr_fmt(fmt) "bcache: %s() " fmt, __func__
 
-#include <linux/bcache.h>
 #include <linux/bio.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 
+#include "bcache_ondisk.h"
 #include "bset.h"
 #include "util.h"
 #include "closure.h"
@@ -395,8 +395,6 @@ struct cached_dev {
        atomic_t                io_errors;
        unsigned int            error_limit;
        unsigned int            offline_seconds;
-
-       char                    backing_dev_name[BDEVNAME_SIZE];
 };
 
 enum alloc_reserve {
@@ -470,8 +468,6 @@ struct cache {
        atomic_long_t           meta_sectors_written;
        atomic_long_t           btree_sectors_written;
        atomic_long_t           sectors_written;
-
-       char                    cache_dev_name[BDEVNAME_SIZE];
 };
 
 struct gc_stat {
similarity index 99%
rename from include/uapi/linux/bcache.h
rename to drivers/md/bcache/bcache_ondisk.h
index cf7399f..9741358 100644 (file)
@@ -43,9 +43,9 @@ static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v)        \
 #define KEY_MAX_U64S           8
 
 KEY_FIELD(KEY_PTRS,    high, 60, 3)
-KEY_FIELD(HEADER_SIZE, high, 58, 2)
+KEY_FIELD(__PAD0,      high, 58, 2)
 KEY_FIELD(KEY_CSUM,    high, 56, 2)
-KEY_FIELD(KEY_PINNED,  high, 55, 1)
+KEY_FIELD(__PAD1,      high, 55, 1)
 KEY_FIELD(KEY_DIRTY,   high, 36, 1)
 
 KEY_FIELD(KEY_SIZE,    high, 20, KEY_SIZE_BITS)
index a50dcfd..d795c84 100644 (file)
@@ -2,10 +2,10 @@
 #ifndef _BCACHE_BSET_H
 #define _BCACHE_BSET_H
 
-#include <linux/bcache.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+#include "bcache_ondisk.h"
 #include "util.h" /* for time_stats */
 
 /*
index 0595559..93b67b8 100644 (file)
@@ -141,7 +141,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
        uint64_t crc = b->key.ptr[0];
        void *data = (void *) i + 8, *end = bset_bkey_last(i);
 
-       crc = bch_crc64_update(crc, data, end - data);
+       crc = crc64_be(crc, data, end - data);
        return crc ^ 0xffffffffffffffffULL;
 }
 
index 116edda..6230dfd 100644 (file)
@@ -127,21 +127,20 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 
        citer.bi_size = UINT_MAX;
        bio_for_each_segment(bv, bio, iter) {
-               void *p1 = kmap_atomic(bv.bv_page);
+               void *p1 = bvec_kmap_local(&bv);
                void *p2;
 
                cbv = bio_iter_iovec(check, citer);
-               p2 = page_address(cbv.bv_page);
+               p2 = bvec_kmap_local(&cbv);
 
-               cache_set_err_on(memcmp(p1 + bv.bv_offset,
-                                       p2 + bv.bv_offset,
-                                       bv.bv_len),
+               cache_set_err_on(memcmp(p1, p2, bv.bv_len),
                                 dc->disk.c,
-                                "verify failed at dev %s sector %llu",
-                                dc->backing_dev_name,
+                                "verify failed at dev %pg sector %llu",
+                                dc->bdev,
                                 (uint64_t) bio->bi_iter.bi_sector);
 
-               kunmap_atomic(p1);
+               kunmap_local(p2);
+               kunmap_local(p1);
                bio_advance_iter(check, &citer, bv.bv_len);
        }
 
index 6d2b7b8..634922c 100644 (file)
@@ -6,7 +6,7 @@
  * Copyright 2020 Coly Li <colyli@suse.de>
  *
  */
-#include <linux/bcache.h>
+#include "bcache_ondisk.h"
 #include "bcache.h"
 #include "features.h"
 
index d1c8fd3..09161b8 100644 (file)
@@ -2,10 +2,11 @@
 #ifndef _BCACHE_FEATURES_H
 #define _BCACHE_FEATURES_H
 
-#include <linux/bcache.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+#include "bcache_ondisk.h"
+
 #define BCH_FEATURE_COMPAT             0
 #define BCH_FEATURE_RO_COMPAT          1
 #define BCH_FEATURE_INCOMPAT           2
index e4388fe..9c6f9ec 100644 (file)
@@ -65,15 +65,15 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
         * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors.
         */
        if (bio->bi_opf & REQ_RAHEAD) {
-               pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore\n",
-                                   dc->backing_dev_name);
+               pr_warn_ratelimited("%pg: Read-ahead I/O failed on backing device, ignore\n",
+                                   dc->bdev);
                return;
        }
 
        errors = atomic_add_return(1, &dc->io_errors);
        if (errors < dc->error_limit)
-               pr_err("%s: IO error on backing device, unrecoverable\n",
-                       dc->backing_dev_name);
+               pr_err("%pg: IO error on backing device, unrecoverable\n",
+                       dc->bdev);
        else
                bch_cached_dev_error(dc);
 }
@@ -123,13 +123,13 @@ void bch_count_io_errors(struct cache *ca,
                errors >>= IO_ERROR_SHIFT;
 
                if (errors < ca->set->error_limit)
-                       pr_err("%s: IO error on %s%s\n",
-                              ca->cache_dev_name, m,
+                       pr_err("%pg: IO error on %s%s\n",
+                              ca->bdev, m,
                               is_read ? ", recovering." : ".");
                else
                        bch_cache_set_error(ca->set,
-                                           "%s: too many IO errors %s\n",
-                                           ca->cache_dev_name, m);
+                                           "%pg: too many IO errors %s\n",
+                                           ca->bdev, m);
        }
 }
 
index 6d1de88..d15aae6 100644 (file)
@@ -46,7 +46,7 @@ static void bio_csum(struct bio *bio, struct bkey *k)
        bio_for_each_segment(bv, bio, iter) {
                void *d = kmap(bv.bv_page) + bv.bv_offset;
 
-               csum = bch_crc64_update(csum, d, bv.bv_len);
+               csum = crc64_be(csum, d, bv.bv_len);
                kunmap(bv.bv_page);
        }
 
@@ -651,8 +651,8 @@ static void backing_request_endio(struct bio *bio)
                 */
                if (unlikely(s->iop.writeback &&
                             bio->bi_opf & REQ_PREFLUSH)) {
-                       pr_err("Can't flush %s: returned bi_status %i\n",
-                               dc->backing_dev_name, bio->bi_status);
+                       pr_err("Can't flush %pg: returned bi_status %i\n",
+                               dc->bdev, bio->bi_status);
                } else {
                        /* set to orig_bio->bi_status in bio_complete() */
                        s->iop.status = bio->bi_status;
@@ -1163,7 +1163,7 @@ static void quit_max_writeback_rate(struct cache_set *c,
 
 /* Cached devices - read & write stuff */
 
-blk_qc_t cached_dev_submit_bio(struct bio *bio)
+void cached_dev_submit_bio(struct bio *bio)
 {
        struct search *s;
        struct block_device *orig_bdev = bio->bi_bdev;
@@ -1176,7 +1176,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
                     dc->io_disable)) {
                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        if (likely(d->c)) {
@@ -1222,8 +1222,6 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
        } else
                /* I/O request sent to backing device */
                detached_dev_do_request(d, bio, orig_bdev, start_time);
-
-       return BLK_QC_T_NONE;
 }
 
 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
@@ -1273,7 +1271,7 @@ static void flash_dev_nodata(struct closure *cl)
        continue_at(cl, search_free, NULL);
 }
 
-blk_qc_t flash_dev_submit_bio(struct bio *bio)
+void flash_dev_submit_bio(struct bio *bio)
 {
        struct search *s;
        struct closure *cl;
@@ -1282,7 +1280,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
        if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio));
@@ -1298,7 +1296,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
                continue_at_nobarrier(&s->cl,
                                      flash_dev_nodata,
                                      bcache_wq);
-               return BLK_QC_T_NONE;
+               return;
        } else if (bio_data_dir(bio)) {
                bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
                                        &KEY(d->id, bio->bi_iter.bi_sector, 0),
@@ -1314,7 +1312,6 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
        }
 
        continue_at(cl, search_free, NULL);
-       return BLK_QC_T_NONE;
 }
 
 static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
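
The hunks above drop the blk_qc_t return value from bcache's submit_bio hooks, following the 5.16 block-layer change that made bio-based ->submit_bio callbacks return void. A minimal sketch of the resulting shape, using hypothetical example_* names rather than bcache symbols:

#include <linux/blkdev.h>

/* hypothetical per-device state; real drivers keep this behind disk->private_data */
struct example_dev {
	bool io_disabled;
};

static void example_submit_bio(struct bio *bio)
{
	struct example_dev *dev = bio->bi_bdev->bd_disk->private_data;

	if (unlikely(dev->io_disabled)) {
		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;			/* previously: return BLK_QC_T_NONE; */
	}

	/* forward the bio; no polling cookie is returned to the caller anymore */
	submit_bio_noacct(bio);
}
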
index 82b3836..38ab485 100644 (file)
@@ -37,10 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c);
 void bch_data_insert(struct closure *cl);
 
 void bch_cached_dev_request_init(struct cached_dev *dc);
-blk_qc_t cached_dev_submit_bio(struct bio *bio);
+void cached_dev_submit_bio(struct bio *bio);
 
 void bch_flash_dev_request_init(struct bcache_device *d);
-blk_qc_t flash_dev_submit_bio(struct bio *bio);
+void flash_dev_submit_bio(struct bio *bio);
 
 extern struct kmem_cache *bch_search_cache;
 
index f2874c7..4a9a65d 100644 (file)
@@ -1002,7 +1002,7 @@ static void calc_cached_dev_sectors(struct cache_set *c)
        struct cached_dev *dc;
 
        list_for_each_entry(dc, &c->cached_devs, list)
-               sectors += bdev_sectors(dc->bdev);
+               sectors += bdev_nr_sectors(dc->bdev);
 
        c->cached_dev_sectors = sectors;
 }
@@ -1026,8 +1026,8 @@ static int cached_dev_status_update(void *arg)
                        dc->offline_seconds = 0;
 
                if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
-                       pr_err("%s: device offline for %d seconds\n",
-                              dc->backing_dev_name,
+                       pr_err("%pg: device offline for %d seconds\n",
+                              dc->bdev,
                               BACKING_DEV_OFFLINE_TIMEOUT);
                        pr_err("%s: disable I/O request due to backing device offline\n",
                               dc->disk.name);
@@ -1058,15 +1058,13 @@ int bch_cached_dev_run(struct cached_dev *dc)
        };
 
        if (dc->io_disable) {
-               pr_err("I/O disabled on cached dev %s\n",
-                      dc->backing_dev_name);
+               pr_err("I/O disabled on cached dev %pg\n", dc->bdev);
                ret = -EIO;
                goto out;
        }
 
        if (atomic_xchg(&dc->running, 1)) {
-               pr_info("cached dev %s is running already\n",
-                      dc->backing_dev_name);
+               pr_info("cached dev %pg is running already\n", dc->bdev);
                ret = -EBUSY;
                goto out;
        }
@@ -1082,7 +1080,9 @@ int bch_cached_dev_run(struct cached_dev *dc)
                closure_sync(&cl);
        }
 
-       add_disk(d->disk);
+       ret = add_disk(d->disk);
+       if (ret)
+               goto out;
        bd_link_disk_holder(dc->bdev, dc->disk.disk);
        /*
         * won't show up in the uevent file, use udevadm monitor -e instead
@@ -1154,16 +1154,16 @@ static void cached_dev_detach_finish(struct work_struct *w)
 
        mutex_lock(&bch_register_lock);
 
-       calc_cached_dev_sectors(dc->disk.c);
        bcache_device_detach(&dc->disk);
        list_move(&dc->list, &uncached_devices);
+       calc_cached_dev_sectors(dc->disk.c);
 
        clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
        clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
 
        mutex_unlock(&bch_register_lock);
 
-       pr_info("Caching disabled for %s\n", dc->backing_dev_name);
+       pr_info("Caching disabled for %pg\n", dc->bdev);
 
        /* Drop ref we took in cached_dev_detach() */
        closure_put(&dc->disk.cl);
@@ -1203,29 +1203,27 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
                return -ENOENT;
 
        if (dc->disk.c) {
-               pr_err("Can't attach %s: already attached\n",
-                      dc->backing_dev_name);
+               pr_err("Can't attach %pg: already attached\n", dc->bdev);
                return -EINVAL;
        }
 
        if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
-               pr_err("Can't attach %s: shutting down\n",
-                      dc->backing_dev_name);
+               pr_err("Can't attach %pg: shutting down\n", dc->bdev);
                return -EINVAL;
        }
 
        if (dc->sb.block_size < c->cache->sb.block_size) {
                /* Will die */
-               pr_err("Couldn't attach %s: block size less than set's block size\n",
-                      dc->backing_dev_name);
+               pr_err("Couldn't attach %pg: block size less than set's block size\n",
+                      dc->bdev);
                return -EINVAL;
        }
 
        /* Check whether already attached */
        list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
                if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
-                       pr_err("Tried to attach %s but duplicate UUID already attached\n",
-                               dc->backing_dev_name);
+                       pr_err("Tried to attach %pg but duplicate UUID already attached\n",
+                               dc->bdev);
 
                        return -EINVAL;
                }
@@ -1243,15 +1241,13 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 
        if (!u) {
                if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
-                       pr_err("Couldn't find uuid for %s in set\n",
-                              dc->backing_dev_name);
+                       pr_err("Couldn't find uuid for %pg in set\n", dc->bdev);
                        return -ENOENT;
                }
 
                u = uuid_find_empty(c);
                if (!u) {
-                       pr_err("Not caching %s, no room for UUID\n",
-                              dc->backing_dev_name);
+                       pr_err("Not caching %pg, no room for UUID\n", dc->bdev);
                        return -EINVAL;
                }
        }
@@ -1319,8 +1315,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
                 */
                kthread_stop(dc->writeback_thread);
                cancel_writeback_rate_update_dwork(dc);
-               pr_err("Couldn't run cached device %s\n",
-                      dc->backing_dev_name);
+               pr_err("Couldn't run cached device %pg\n", dc->bdev);
                return ret;
        }
 
@@ -1336,8 +1331,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
        /* Allow the writeback thread to proceed */
        up_write(&dc->writeback_lock);
 
-       pr_info("Caching %s as %s on set %pU\n",
-               dc->backing_dev_name,
+       pr_info("Caching %pg as %s on set %pU\n",
+               dc->bdev,
                dc->disk.disk->disk_name,
                dc->disk.c->set_uuid);
        return 0;
@@ -1461,7 +1456,6 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
        struct cache_set *c;
        int ret = -ENOMEM;
 
-       bdevname(bdev, dc->backing_dev_name);
        memcpy(&dc->sb, sb, sizeof(struct cache_sb));
        dc->bdev = bdev;
        dc->bdev->bd_holder = dc;
@@ -1476,7 +1470,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
        if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
                goto err;
 
-       pr_info("registered backing device %s\n", dc->backing_dev_name);
+       pr_info("registered backing device %pg\n", dc->bdev);
 
        list_add(&dc->list, &uncached_devices);
        /* attach to a matched cache set if it exists */
@@ -1493,7 +1487,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 
        return 0;
 err:
-       pr_notice("error %s: %s\n", dc->backing_dev_name, err);
+       pr_notice("error %pg: %s\n", dc->bdev, err);
        bcache_device_stop(&dc->disk);
        return ret;
 }
@@ -1534,10 +1528,11 @@ static void flash_dev_flush(struct closure *cl)
 
 static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 {
+       int err = -ENOMEM;
        struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
                                          GFP_KERNEL);
        if (!d)
-               return -ENOMEM;
+               goto err_ret;
 
        closure_init(&d->cl, NULL);
        set_closure_fn(&d->cl, flash_dev_flush, system_wq);
@@ -1551,9 +1546,12 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
        bcache_device_attach(d, c, u - c->uuids);
        bch_sectors_dirty_init(d);
        bch_flash_dev_request_init(d);
-       add_disk(d->disk);
+       err = add_disk(d->disk);
+       if (err)
+               goto err;
 
-       if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
+       err = kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache");
+       if (err)
                goto err;
 
        bcache_device_link(d, c, "volume");
@@ -1567,7 +1565,8 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
        return 0;
 err:
        kobject_put(&d->kobj);
-       return -ENOMEM;
+err_ret:
+       return err;
 }
 
 static int flash_devs_run(struct cache_set *c)
@@ -1621,8 +1620,8 @@ bool bch_cached_dev_error(struct cached_dev *dc)
        /* make others know io_disable is true earlier */
        smp_mb();
 
-       pr_err("stop %s: too many IO errors on backing device %s\n",
-              dc->disk.disk->disk_name, dc->backing_dev_name);
+       pr_err("stop %s: too many IO errors on backing device %pg\n",
+              dc->disk.disk->disk_name, dc->bdev);
 
        bcache_device_stop(&dc->disk);
        return true;
@@ -2338,7 +2337,7 @@ err_btree_alloc:
 err_free:
        module_put(THIS_MODULE);
        if (err)
-               pr_notice("error %s: %s\n", ca->cache_dev_name, err);
+               pr_notice("error %pg: %s\n", ca->bdev, err);
        return ret;
 }
 
@@ -2348,7 +2347,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
        const char *err = NULL; /* must be set for any error case */
        int ret = 0;
 
-       bdevname(bdev, ca->cache_dev_name);
        memcpy(&ca->sb, sb, sizeof(struct cache_sb));
        ca->bdev = bdev;
        ca->bdev->bd_holder = ca;
@@ -2390,14 +2388,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
                goto out;
        }
 
-       pr_info("registered cache device %s\n", ca->cache_dev_name);
+       pr_info("registered cache device %pg\n", ca->bdev);
 
 out:
        kobject_put(&ca->kobj);
 
 err:
        if (err)
-               pr_notice("error %s: %s\n", ca->cache_dev_name, err);
+               pr_notice("error %pg: %s\n", ca->bdev, err);
 
        return ret;
 }
@@ -2617,8 +2615,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
        if (SB_IS_BDEV(sb)) {
                struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
 
-               if (!dc)
+               if (!dc) {
+                       ret = -ENOMEM;
+                       err = "cannot allocate memory";
                        goto out_put_sb_page;
+               }
 
                mutex_lock(&bch_register_lock);
                ret = register_bdev(sb, sb_disk, bdev, dc);
@@ -2629,11 +2630,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
        } else {
                struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 
-               if (!ca)
+               if (!ca) {
+                       ret = -ENOMEM;
+                       err = "cannot allocate memory";
                        goto out_put_sb_page;
+               }
 
                /* blkdev_put() will be called in bch_cache_release() */
-               if (register_cache(sb, sb_disk, bdev, ca) != 0)
+               ret = register_cache(sb, sb_disk, bdev, ca);
+               if (ret)
                        goto out_free_sb;
        }
 
@@ -2750,7 +2755,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
                 * The reason bch_register_lock is not held to call
                 * bch_cache_set_stop() and bcache_device_stop() is to
                 * avoid potential deadlock during reboot, because cache
-                * set or bcache device stopping process will acqurie
+                * set or bcache device stopping process will acquire
                 * bch_register_lock too.
                 *
                 * We are safe here because bcache_is_reboot sets to
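
Several registration paths in this series (the bcache cached/flash devices above, and dm later in the diff) now check the return value of add_disk(), which reports registration failures instead of returning void. A minimal sketch of the pattern, with a hypothetical example_* name; the unwind details depend on the driver:

#include <linux/genhd.h>

static int example_register_disk(struct gendisk *disk)
{
	int ret;

	ret = add_disk(disk);	/* can fail, e.g. on sysfs/kobject registration */
	if (ret)
		return ret;	/* caller unwinds whatever it set up before add_disk() */

	/* post-registration setup (holders, sysfs links, ...) goes here */
	return 0;
}
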
index 05ac1d6..1f0dce3 100644 (file)
@@ -271,7 +271,7 @@ SHOW(__bch_cached_dev)
        }
 
        if (attr == &sysfs_backing_dev_name) {
-               snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name);
+               snprintf(buf, BDEVNAME_SIZE + 1, "%pg", dc->bdev);
                strcat(buf, "\n");
                return strlen(buf);
        }
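
The cached device-name strings removed above (backing_dev_name, cache_dev_name, the bdevname() copies) are replaced throughout this series by the %pg printk specifier, which formats a struct block_device pointer directly. A small illustrative sketch, not bcache code:

#include <linux/blkdev.h>
#include <linux/printk.h>

static void example_report_error(struct block_device *bdev, int error)
{
	/* %pg prints the device name (e.g. "sda1" or "nvme0n1p2") from the bdev itself */
	pr_err("I/O error on %pg: error %d\n", bdev, error);
}
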
index 215df32..c1752ba 100644 (file)
@@ -51,13 +51,27 @@ STORE(fn)                                                           \
 #define sysfs_printf(file, fmt, ...)                                   \
 do {                                                                   \
        if (attr == &sysfs_ ## file)                                    \
-               return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
+               return sysfs_emit(buf, fmt "\n", __VA_ARGS__);  \
 } while (0)
 
 #define sysfs_print(file, var)                                         \
 do {                                                                   \
        if (attr == &sysfs_ ## file)                                    \
-               return snprint(buf, PAGE_SIZE, var);                    \
+               return sysfs_emit(buf,                                          \
+                               __builtin_types_compatible_p(typeof(var), int)          \
+                                        ? "%i\n" :                             \
+                               __builtin_types_compatible_p(typeof(var), unsigned int) \
+                                        ? "%u\n" :                             \
+                               __builtin_types_compatible_p(typeof(var), long)         \
+                                        ? "%li\n" :                    \
+                               __builtin_types_compatible_p(typeof(var), unsigned long)\
+                                        ? "%lu\n" :                    \
+                               __builtin_types_compatible_p(typeof(var), int64_t)      \
+                                        ? "%lli\n" :                   \
+                               __builtin_types_compatible_p(typeof(var), uint64_t)     \
+                                        ? "%llu\n" :                   \
+                               __builtin_types_compatible_p(typeof(var), const char *) \
+                                        ? "%s\n" : "%i\n", var);       \
 } while (0)
 
 #define sysfs_hprint(file, val)                                                \
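
The sysfs show-path macros above switch from snprintf(buf, PAGE_SIZE, ...) to sysfs_emit(), which knows the sysfs buffer is PAGE_SIZE bytes and so drops the explicit size argument. An illustrative show() callback with placeholder names:

#include <linux/sysfs.h>

static ssize_t example_hits_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	unsigned long hits = 0;		/* placeholder value for the sketch */

	/* was: return snprintf(buf, PAGE_SIZE, "%lu\n", hits); */
	return sysfs_emit(buf, "%lu\n", hits);
}
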
index b64460a..6f3cb7c 100644 (file)
@@ -340,23 +340,6 @@ static inline int bch_strtoul_h(const char *cp, long *res)
        _r;                                                             \
 })
 
-#define snprint(buf, size, var)                                                \
-       snprintf(buf, size,                                             \
-               __builtin_types_compatible_p(typeof(var), int)          \
-                    ? "%i\n" :                                         \
-               __builtin_types_compatible_p(typeof(var), unsigned int) \
-                    ? "%u\n" :                                         \
-               __builtin_types_compatible_p(typeof(var), long)         \
-                    ? "%li\n" :                                        \
-               __builtin_types_compatible_p(typeof(var), unsigned long)\
-                    ? "%lu\n" :                                        \
-               __builtin_types_compatible_p(typeof(var), int64_t)      \
-                    ? "%lli\n" :                                       \
-               __builtin_types_compatible_p(typeof(var), uint64_t)     \
-                    ? "%llu\n" :                                       \
-               __builtin_types_compatible_p(typeof(var), const char *) \
-                    ? "%s\n" : "%i\n", var)
-
 ssize_t bch_hprint(char *buf, int64_t v);
 
 bool bch_is_zero(const char *p, size_t n);
@@ -548,14 +531,6 @@ static inline uint64_t bch_crc64(const void *p, size_t len)
        return crc ^ 0xffffffffffffffffULL;
 }
 
-static inline uint64_t bch_crc64_update(uint64_t crc,
-                                       const void *p,
-                                       size_t len)
-{
-       crc = crc64_be(crc, p, len);
-       return crc;
-}
-
 /*
  * A stepwise-linear pseudo-exponential.  This returns 1 << (x >>
  * frac_bits), with the less-significant bits filled in by linear
@@ -584,8 +559,4 @@ static inline unsigned int fract_exp_two(unsigned int x,
 void bch_bio_map(struct bio *bio, void *base);
 int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
 
-static inline sector_t bdev_sectors(struct block_device *bdev)
-{
-       return bdev->bd_inode->i_size >> 9;
-}
 #endif /* _BCACHE_UTIL_H */
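
With bcache's local bdev_sectors() helper gone, the open-coded i_size_read(bdev->bd_inode) arithmetic is replaced by the block layer helpers used in the rest of this diff. A short sketch of the equivalences, assuming the helper names exactly as they appear in these hunks:

#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/printk.h>

static void example_report_size(struct block_device *bdev)
{
	sector_t sectors = bdev_nr_sectors(bdev); /* was i_size_read(bdev->bd_inode) >> SECTOR_SHIFT */
	loff_t bytes = bdev_nr_bytes(bdev);       /* was i_size_read(bdev->bd_inode) */

	pr_info("%pg: %llu sectors, %lld bytes\n",
		bdev, (unsigned long long)sectors, (long long)bytes);
}
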
index 8120da2..c7560f6 100644 (file)
@@ -45,7 +45,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
         * backing volume uses about 2% of the cache for dirty data.
         */
        uint32_t bdev_share =
-               div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
+               div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
                                c->cached_dev_sectors);
 
        uint64_t cache_dirty_target =
index a3b7135..745e3ab 100644 (file)
@@ -8,6 +8,7 @@
 #define DM_BIO_RECORD_H
 
 #include <linux/bio.h>
+#include <linux/blk-integrity.h>
 
 /*
  * There are lots of mutable fields in the bio struct that get
index 50f3e67..104ebc1 100644 (file)
@@ -1525,7 +1525,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
 
 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
 {
-       sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t s = bdev_nr_sectors(c->bdev);
        if (s >= c->start)
                s -= c->start;
        else
index 89a7320..2874f22 100644 (file)
@@ -334,7 +334,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
        int r;
        struct dm_block *sblock;
        struct cache_disk_superblock *disk_super;
-       sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t bdev_size = bdev_nr_sectors(cmd->bdev);
 
        /* FIXME: see if we can lose the max sectors limit */
        if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
index bdd5004..447d030 100644 (file)
@@ -1940,7 +1940,7 @@ static void cache_dtr(struct dm_target *ti)
 
 static sector_t get_dev_size(struct dm_dev *dev)
 {
-       return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(dev->bdev);
 }
 
 /*----------------------------------------------------------------*/
index edd22e4..4599632 100644 (file)
@@ -1514,7 +1514,7 @@ error:
 
 static sector_t get_dev_size(struct dm_dev *dev)
 {
-       return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(dev->bdev);
 }
 
 /*---------------------------------------------------------------------------*/
index 55dccdf..b855fef 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/ktime.h>
 #include <linux/genhd.h>
 #include <linux/blk-mq.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 
 #include <trace/events/block.h>
 
@@ -200,7 +200,7 @@ struct dm_table {
        struct dm_md_mempools *mempools;
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-       struct blk_keyslot_manager *ksm;
+       struct blk_crypto_profile *crypto_profile;
 #endif
 };
 
index 916b7da..292f789 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/key.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/crypto.h>
index 3163e2b..0367220 100644 (file)
@@ -415,7 +415,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
                        char *result, unsigned int maxlen)
 {
        struct dust_device *dd = ti->private;
-       sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t size = bdev_nr_sectors(dd->dev->bdev);
        bool invalid_msg = false;
        int r = -EINVAL;
        unsigned long long tmp, block;
@@ -544,8 +544,7 @@ static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (dd->start ||
-           ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+       if (dd->start || ti->len != bdev_nr_sectors(dev->bdev))
                return 1;
 
        return 0;
index d259896..7ce5d50 100644 (file)
@@ -416,7 +416,7 @@ static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
         * Only pass ioctls through if the device sizes match exactly.
         */
        *bdev = dev->bdev;
-       return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT);
+       return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev));
 }
 
 static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
index 2a78f68..1f6bf15 100644 (file)
@@ -1681,7 +1681,7 @@ static int era_message(struct dm_target *ti, unsigned argc, char **argv,
 
 static sector_t get_dev_size(struct dm_dev *dev)
 {
-       return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(dev->bdev);
 }
 
 static int era_iterate_devices(struct dm_target *ti,
index 3f4139a..b5f20eb 100644 (file)
@@ -168,7 +168,7 @@ static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
  */
 static inline sector_t get_dev_size(struct block_device *bdev)
 {
-       return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(bdev);
 }
 
 static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
index 4b94ffe..345229d 100644 (file)
@@ -456,8 +456,7 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (fc->start ||
-           ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+       if (fc->start || ti->len != bdev_nr_sectors((*bdev)))
                return 1;
        return 0;
 }
index 2c5edfb..9579999 100644 (file)
@@ -12,6 +12,7 @@
 #include "dm-ima.h"
 
 #include <linux/ima.h>
+#include <linux/sched/mm.h>
 #include <crypto/hash.h>
 #include <linux/crypto.h>
 #include <crypto/hash_info.h>
index dc03b70..d0f788e 100644 (file)
@@ -4113,11 +4113,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                }
        }
 
-       ic->data_device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       ic->data_device_sectors = bdev_nr_sectors(ic->dev->bdev);
        if (!ic->meta_dev)
                ic->meta_device_sectors = ic->data_device_sectors;
        else
-               ic->meta_device_sectors = i_size_read(ic->meta_dev->bdev->bd_inode) >> SECTOR_SHIFT;
+               ic->meta_device_sectors = bdev_nr_sectors(ic->meta_dev->bdev);
 
        if (!journal_sectors) {
                journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
@@ -4367,7 +4367,7 @@ try_smaller_buffer:
        DEBUG_print("   journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
        DEBUG_print("   journal_entries %u\n", ic->journal_entries);
        DEBUG_print("   log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
-       DEBUG_print("   data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
+       DEBUG_print("   data_device_sectors 0x%llx\n", bdev_nr_sectors(ic->dev->bdev));
        DEBUG_print("   initial_sectors 0x%x\n", ic->initial_sectors);
        DEBUG_print("   metadata_run 0x%x\n", ic->metadata_run);
        DEBUG_print("   log2_metadata_run %d\n", ic->log2_metadata_run);
index 679b4c0..66ba167 100644 (file)
@@ -135,8 +135,7 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (lc->start ||
-           ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+       if (lc->start || ti->len != bdev_nr_sectors(dev->bdev))
                return 1;
        return 0;
 }
index d93a4db..46de085 100644 (file)
@@ -446,7 +446,7 @@ static int log_super(struct log_writes_c *lc)
 
 static inline sector_t logdev_last_sector(struct log_writes_c *lc)
 {
-       return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(lc->logdev->bdev);
 }
 
 static int log_writes_kthread(void *arg)
@@ -851,7 +851,7 @@ static int log_writes_prepare_ioctl(struct dm_target *ti,
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+       if (ti->len != bdev_nr_sectors(dev->bdev))
                return 1;
        return 0;
 }
index 1ecf75e..06f3289 100644 (file)
@@ -447,7 +447,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
                                bdev_logical_block_size(lc->header_location.
                                                            bdev));
 
-               if (buf_size > i_size_read(dev->bdev->bd_inode)) {
+               if (buf_size > bdev_nr_bytes(dev->bdev)) {
                        DMWARN("log device %s too small: need %llu bytes",
                                dev->name, (unsigned long long)buf_size);
                        kfree(lc);
index 694aaca..90dc9cc 100644 (file)
@@ -530,7 +530,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 
        bdev = pgpath->path.dev->bdev;
        q = bdev_get_queue(bdev);
-       clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
+       clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE,
                        BLK_MQ_REQ_NOWAIT);
        if (IS_ERR(clone)) {
                /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
@@ -579,7 +579,7 @@ static void multipath_release_clone(struct request *clone,
                                                    clone->io_start_time_ns);
        }
 
-       blk_put_request(clone);
+       blk_mq_free_request(clone);
 }
 
 /*
@@ -2061,7 +2061,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-       if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+       if (!r && ti->len != bdev_nr_sectors((*bdev)))
                return 1;
        return r;
 }
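
The dm-mpath hunk above allocates the clone request through the blk-mq API directly instead of the old blk_get_request()/blk_put_request() wrappers. A condensed sketch of that allocation pattern, with an illustrative function name:

#include <linux/blk-mq.h>
#include <linux/err.h>

static int example_issue_clone(struct request_queue *q, struct request *rq)
{
	struct request *clone;

	clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE,
				     BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(clone))
		return PTR_ERR(clone);	/* e.g. -EBUSY or -EWOULDBLOCK: requeue later */

	/* ... set up and dispatch the clone ... */

	blk_mq_free_request(clone);	/* was blk_put_request() */
	return 0;
}
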
index 1856a1b..875bca3 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/sched/clock.h>
 
 
 #define DM_MSG_PREFIX  "multipath historical-service-time"
index d9ef521..2b26435 100644 (file)
@@ -1261,7 +1261,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
                        md_rdev_init(jdev);
                        jdev->mddev = &rs->md;
                        jdev->bdev = rs->journal_dev.dev->bdev;
-                       jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
+                       jdev->sectors = bdev_nr_sectors(jdev->bdev);
                        if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
                                rs->ti->error = "No space for raid4/5/6 journal";
                                return -ENOSPC;
@@ -1607,7 +1607,7 @@ static int _check_data_dev_sectors(struct raid_set *rs)
 
        rdev_for_each(rdev, &rs->md)
                if (!test_bit(Journal, &rdev->flags) && rdev->bdev) {
-                       ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode)));
+                       ds = min(ds, bdev_nr_sectors(rdev->bdev));
                        if (ds < rs->md.dev_sectors) {
                                rs->ti->error = "Component device(s) too small";
                                return -EINVAL;
@@ -2662,7 +2662,7 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
         * Make sure we got a minimum amount of free sectors per device
         */
        if (rs->data_offset &&
-           to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) {
+           bdev_nr_sectors(rdev->bdev) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) {
                rs->ti->error = data_offset ? "No space for forward reshape" :
                                              "No space for backward reshape";
                return -ENOSPC;
index a896dea..579ab61 100644 (file)
@@ -7,7 +7,6 @@
 #include "dm-core.h"
 #include "dm-rq.h"
 
-#include <linux/elevator.h> /* for rq_end_sector() */
 #include <linux/blk-mq.h>
 
 #define DM_MSG_PREFIX "core-rq"
index 028a92f..534dc2c 100644 (file)
@@ -529,7 +529,7 @@ static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
         * Only pass ioctls through if the device sizes match exactly.
         */
        if (ti->len + sctx->path_list[path_nr].start !=
-           i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+           bdev_nr_sectors((*bdev)))
                return 1;
        return 0;
 }
index 2111daa..bcddc5e 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/namei.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
@@ -169,7 +170,7 @@ static void free_devices(struct list_head *devices, struct mapped_device *md)
        }
 }
 
-static void dm_table_destroy_keyslot_manager(struct dm_table *t);
+static void dm_table_destroy_crypto_profile(struct dm_table *t);
 
 void dm_table_destroy(struct dm_table *t)
 {
@@ -199,7 +200,7 @@ void dm_table_destroy(struct dm_table *t)
 
        dm_free_md_mempools(t->mempools);
 
-       dm_table_destroy_keyslot_manager(t);
+       dm_table_destroy_crypto_profile(t);
 
        kfree(t);
 }
@@ -226,8 +227,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 {
        struct queue_limits *limits = data;
        struct block_device *bdev = dev->bdev;
-       sector_t dev_size =
-               i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t dev_size = bdev_nr_sectors(bdev);
        unsigned short logical_block_size_sectors =
                limits->logical_block_size >> SECTOR_SHIFT;
        char b[BDEVNAME_SIZE];
@@ -1186,8 +1186,8 @@ static int dm_table_register_integrity(struct dm_table *t)
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 
-struct dm_keyslot_manager {
-       struct blk_keyslot_manager ksm;
+struct dm_crypto_profile {
+       struct blk_crypto_profile profile;
        struct mapped_device *md;
 };
 
@@ -1213,13 +1213,11 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
  * When an inline encryption key is evicted from a device-mapper device, evict
  * it from all the underlying devices.
  */
-static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int dm_keyslot_evict(struct blk_crypto_profile *profile,
                            const struct blk_crypto_key *key, unsigned int slot)
 {
-       struct dm_keyslot_manager *dksm = container_of(ksm,
-                                                      struct dm_keyslot_manager,
-                                                      ksm);
-       struct mapped_device *md = dksm->md;
+       struct mapped_device *md =
+               container_of(profile, struct dm_crypto_profile, profile)->md;
        struct dm_keyslot_evict_args args = { key };
        struct dm_table *t;
        int srcu_idx;
@@ -1239,150 +1237,148 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
        return args.err;
 }
 
-static const struct blk_ksm_ll_ops dm_ksm_ll_ops = {
-       .keyslot_evict = dm_keyslot_evict,
-};
-
-static int device_intersect_crypto_modes(struct dm_target *ti,
-                                        struct dm_dev *dev, sector_t start,
-                                        sector_t len, void *data)
+static int
+device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev,
+                                    sector_t start, sector_t len, void *data)
 {
-       struct blk_keyslot_manager *parent = data;
-       struct blk_keyslot_manager *child = bdev_get_queue(dev->bdev)->ksm;
+       struct blk_crypto_profile *parent = data;
+       struct blk_crypto_profile *child =
+               bdev_get_queue(dev->bdev)->crypto_profile;
 
-       blk_ksm_intersect_modes(parent, child);
+       blk_crypto_intersect_capabilities(parent, child);
        return 0;
 }
 
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm)
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile)
 {
-       struct dm_keyslot_manager *dksm = container_of(ksm,
-                                                      struct dm_keyslot_manager,
-                                                      ksm);
+       struct dm_crypto_profile *dmcp = container_of(profile,
+                                                     struct dm_crypto_profile,
+                                                     profile);
 
-       if (!ksm)
+       if (!profile)
                return;
 
-       blk_ksm_destroy(ksm);
-       kfree(dksm);
+       blk_crypto_profile_destroy(profile);
+       kfree(dmcp);
 }
 
-static void dm_table_destroy_keyslot_manager(struct dm_table *t)
+static void dm_table_destroy_crypto_profile(struct dm_table *t)
 {
-       dm_destroy_keyslot_manager(t->ksm);
-       t->ksm = NULL;
+       dm_destroy_crypto_profile(t->crypto_profile);
+       t->crypto_profile = NULL;
 }
 
 /*
- * Constructs and initializes t->ksm with a keyslot manager that
- * represents the common set of crypto capabilities of the devices
- * described by the dm_table. However, if the constructed keyslot
- * manager does not support a superset of the crypto capabilities
- * supported by the current keyslot manager of the mapped_device,
- * it returns an error instead, since we don't support restricting
- * crypto capabilities on table changes. Finally, if the constructed
- * keyslot manager doesn't actually support any crypto modes at all,
- * it just returns NULL.
+ * Constructs and initializes t->crypto_profile with a crypto profile that
+ * represents the common set of crypto capabilities of the devices described by
+ * the dm_table.  However, if the constructed crypto profile doesn't support all
+ * crypto capabilities that are supported by the current mapped_device, it
+ * returns an error instead, since we don't support removing crypto capabilities
+ * on table changes.  Finally, if the constructed crypto profile is "empty" (has
+ * no crypto capabilities at all), it just sets t->crypto_profile to NULL.
  */
-static int dm_table_construct_keyslot_manager(struct dm_table *t)
+static int dm_table_construct_crypto_profile(struct dm_table *t)
 {
-       struct dm_keyslot_manager *dksm;
-       struct blk_keyslot_manager *ksm;
+       struct dm_crypto_profile *dmcp;
+       struct blk_crypto_profile *profile;
        struct dm_target *ti;
        unsigned int i;
-       bool ksm_is_empty = true;
+       bool empty_profile = true;
 
-       dksm = kmalloc(sizeof(*dksm), GFP_KERNEL);
-       if (!dksm)
+       dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL);
+       if (!dmcp)
                return -ENOMEM;
-       dksm->md = t->md;
+       dmcp->md = t->md;
 
-       ksm = &dksm->ksm;
-       blk_ksm_init_passthrough(ksm);
-       ksm->ksm_ll_ops = dm_ksm_ll_ops;
-       ksm->max_dun_bytes_supported = UINT_MAX;
-       memset(ksm->crypto_modes_supported, 0xFF,
-              sizeof(ksm->crypto_modes_supported));
+       profile = &dmcp->profile;
+       blk_crypto_profile_init(profile, 0);
+       profile->ll_ops.keyslot_evict = dm_keyslot_evict;
+       profile->max_dun_bytes_supported = UINT_MAX;
+       memset(profile->modes_supported, 0xFF,
+              sizeof(profile->modes_supported));
 
        for (i = 0; i < dm_table_get_num_targets(t); i++) {
                ti = dm_table_get_target(t, i);
 
                if (!dm_target_passes_crypto(ti->type)) {
-                       blk_ksm_intersect_modes(ksm, NULL);
+                       blk_crypto_intersect_capabilities(profile, NULL);
                        break;
                }
                if (!ti->type->iterate_devices)
                        continue;
-               ti->type->iterate_devices(ti, device_intersect_crypto_modes,
-                                         ksm);
+               ti->type->iterate_devices(ti,
+                                         device_intersect_crypto_capabilities,
+                                         profile);
        }
 
-       if (t->md->queue && !blk_ksm_is_superset(ksm, t->md->queue->ksm)) {
+       if (t->md->queue &&
+           !blk_crypto_has_capabilities(profile,
+                                        t->md->queue->crypto_profile)) {
                DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!");
-               dm_destroy_keyslot_manager(ksm);
+               dm_destroy_crypto_profile(profile);
                return -EINVAL;
        }
 
        /*
-        * If the new KSM doesn't actually support any crypto modes, we may as
-        * well represent it with a NULL ksm.
+        * If the new profile doesn't actually support any crypto capabilities,
+        * we may as well represent it with a NULL profile.
         */
-       ksm_is_empty = true;
-       for (i = 0; i < ARRAY_SIZE(ksm->crypto_modes_supported); i++) {
-               if (ksm->crypto_modes_supported[i]) {
-                       ksm_is_empty = false;
+       for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) {
+               if (profile->modes_supported[i]) {
+                       empty_profile = false;
                        break;
                }
        }
 
-       if (ksm_is_empty) {
-               dm_destroy_keyslot_manager(ksm);
-               ksm = NULL;
+       if (empty_profile) {
+               dm_destroy_crypto_profile(profile);
+               profile = NULL;
        }
 
        /*
-        * t->ksm is only set temporarily while the table is being set
-        * up, and it gets set to NULL after the capabilities have
-        * been transferred to the request_queue.
+        * t->crypto_profile is only set temporarily while the table is being
+        * set up, and it gets set to NULL after the profile has been
+        * transferred to the request_queue.
         */
-       t->ksm = ksm;
+       t->crypto_profile = profile;
 
        return 0;
 }
 
-static void dm_update_keyslot_manager(struct request_queue *q,
-                                     struct dm_table *t)
+static void dm_update_crypto_profile(struct request_queue *q,
+                                    struct dm_table *t)
 {
-       if (!t->ksm)
+       if (!t->crypto_profile)
                return;
 
-       /* Make the ksm less restrictive */
-       if (!q->ksm) {
-               blk_ksm_register(t->ksm, q);
+       /* Make the crypto profile less restrictive. */
+       if (!q->crypto_profile) {
+               blk_crypto_register(t->crypto_profile, q);
        } else {
-               blk_ksm_update_capabilities(q->ksm, t->ksm);
-               dm_destroy_keyslot_manager(t->ksm);
+               blk_crypto_update_capabilities(q->crypto_profile,
+                                              t->crypto_profile);
+               dm_destroy_crypto_profile(t->crypto_profile);
        }
-       t->ksm = NULL;
+       t->crypto_profile = NULL;
 }
 
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
 
-static int dm_table_construct_keyslot_manager(struct dm_table *t)
+static int dm_table_construct_crypto_profile(struct dm_table *t)
 {
        return 0;
 }
 
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm)
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile)
 {
 }
 
-static void dm_table_destroy_keyslot_manager(struct dm_table *t)
+static void dm_table_destroy_crypto_profile(struct dm_table *t)
 {
 }
 
-static void dm_update_keyslot_manager(struct request_queue *q,
-                                     struct dm_table *t)
+static void dm_update_crypto_profile(struct request_queue *q,
+                                    struct dm_table *t)
 {
 }
 
@@ -1414,9 +1410,9 @@ int dm_table_complete(struct dm_table *t)
                return r;
        }
 
-       r = dm_table_construct_keyslot_manager(t);
+       r = dm_table_construct_crypto_profile(t);
        if (r) {
-               DMERR("could not construct keyslot manager.");
+               DMERR("could not construct crypto profile.");
                return r;
        }
 
@@ -2070,7 +2066,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
                        return r;
        }
 
-       dm_update_keyslot_manager(q, t);
+       dm_update_crypto_profile(q, t);
        disk_update_readahead(t->md->disk);
 
        return 0;
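
The dm-table.c rework above renames the keyslot-manager plumbing to the blk_crypto_profile API. A condensed sketch of the profile setup it performs, reconstructed only from the calls visible in these hunks (example_* names are placeholders and the real API may carry additional requirements):

#include <linux/blkdev.h>
#include <linux/blk-crypto-profile.h>
#include <linux/string.h>
#include <linux/kernel.h>

/* evict callback shape, mirrored from dm_keyslot_evict() above */
static int example_keyslot_evict(struct blk_crypto_profile *profile,
				 const struct blk_crypto_key *key,
				 unsigned int slot)
{
	return 0;	/* a real driver would evict the key from its lower devices */
}

static int example_build_profile(struct blk_crypto_profile *profile,
				 struct request_queue *lower_q)
{
	blk_crypto_profile_init(profile, 0);	/* 0 keyslots: passthrough profile */
	profile->ll_ops.keyslot_evict = example_keyslot_evict;
	profile->max_dun_bytes_supported = UINT_MAX;
	memset(profile->modes_supported, 0xFF, sizeof(profile->modes_supported));

	/* narrow the advertised modes to what the underlying queue supports */
	blk_crypto_intersect_capabilities(profile, lower_q->crypto_profile);
	return 0;
}
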
index c88ed14..1a96a07 100644 (file)
@@ -549,7 +549,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
        int r;
        struct dm_block *sblock;
        struct thin_disk_superblock *disk_super;
-       sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
+       sector_t bdev_size = bdev_nr_sectors(pmd->bdev);
 
        if (bdev_size > THIN_METADATA_MAX_SECTORS)
                bdev_size = THIN_METADATA_MAX_SECTORS;
index 4c67b77..ec119d2 100644 (file)
@@ -3212,7 +3212,7 @@ static int metadata_pre_commit_callback(void *context)
 
 static sector_t get_dev_size(struct block_device *bdev)
 {
-       return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+       return bdev_nr_sectors(bdev);
 }
 
 static void warn_if_metadata_device_too_big(struct block_device *bdev)
index 88288c8..a7efe83 100644 (file)
@@ -18,6 +18,7 @@
 #include "dm-verity-verify-sig.h"
 #include <linux/module.h>
 #include <linux/reboot.h>
+#include <linux/scatterlist.h>
 
 #define DM_MSG_PREFIX                  "verity"
 
@@ -833,8 +834,7 @@ static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
 
        *bdev = v->data_dev->bdev;
 
-       if (v->data_start ||
-           ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
+       if (v->data_start || ti->len != bdev_nr_sectors(v->data_dev->bdev))
                return 1;
        return 0;
 }
index 1832044..0178060 100644 (file)
@@ -2341,7 +2341,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
                ti->error = "Cache data device lookup failed";
                goto bad;
        }
-       wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
+       wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);
 
        /*
         * Parse the cache block size
index ae1bc48..8dc21c0 100644 (file)
@@ -733,7 +733,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path,
        dev->dev_idx = idx;
        (void)bdevname(dev->bdev, dev->name);
 
-       dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+       dev->capacity = bdev_nr_sectors(bdev);
        if (ti->begin) {
                ti->error = "Partial mapping is not supported";
                goto err;
index 76d9da4..63aa522 100644 (file)
@@ -29,7 +29,7 @@
 #include <linux/refcount.h>
 #include <linux/part_stat.h>
 #include <linux/blk-crypto.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -1183,14 +1183,13 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
        mutex_unlock(&md->swap_bios_lock);
 }
 
-static blk_qc_t __map_bio(struct dm_target_io *tio)
+static void __map_bio(struct dm_target_io *tio)
 {
        int r;
        sector_t sector;
        struct bio *clone = &tio->clone;
        struct dm_io *io = tio->io;
        struct dm_target *ti = tio->ti;
-       blk_qc_t ret = BLK_QC_T_NONE;
 
        clone->bi_end_io = clone_endio;
 
@@ -1226,7 +1225,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
        case DM_MAPIO_REMAPPED:
                /* the bio has been remapped so dispatch it */
                trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
-               ret = submit_bio_noacct(clone);
+               submit_bio_noacct(clone);
                break;
        case DM_MAPIO_KILL:
                if (unlikely(swap_bios_limit(ti, clone))) {
@@ -1248,8 +1247,6 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
                DMWARN("unimplemented target map return value: %d", r);
                BUG();
        }
-
-       return ret;
 }
 
 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
@@ -1336,7 +1333,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
        }
 }
 
-static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
+static void __clone_and_map_simple_bio(struct clone_info *ci,
                                           struct dm_target_io *tio, unsigned *len)
 {
        struct bio *clone = &tio->clone;
@@ -1346,8 +1343,7 @@ static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
        __bio_clone_fast(clone, ci->bio);
        if (len)
                bio_setup_sector(clone, ci->sector, *len);
-
-       return __map_bio(tio);
+       __map_bio(tio);
 }
 
 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
@@ -1361,7 +1357,7 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
 
        while ((bio = bio_list_pop(&blist))) {
                tio = container_of(bio, struct dm_target_io, clone);
-               (void) __clone_and_map_simple_bio(ci, tio, len);
+               __clone_and_map_simple_bio(ci, tio, len);
        }
 }
 
@@ -1405,7 +1401,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
                free_tio(tio);
                return r;
        }
-       (void) __map_bio(tio);
+       __map_bio(tio);
 
        return 0;
 }
@@ -1520,11 +1516,10 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
 /*
  * Entry point to split a bio into clones and submit them to the targets.
  */
-static blk_qc_t __split_and_process_bio(struct mapped_device *md,
+static void __split_and_process_bio(struct mapped_device *md,
                                        struct dm_table *map, struct bio *bio)
 {
        struct clone_info ci;
-       blk_qc_t ret = BLK_QC_T_NONE;
        int error = 0;
 
        init_clone_info(&ci, md, map, bio);
@@ -1567,19 +1562,17 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 
                        bio_chain(b, bio);
                        trace_block_split(b, bio->bi_iter.bi_sector);
-                       ret = submit_bio_noacct(bio);
+                       submit_bio_noacct(bio);
                }
        }
 
        /* drop the extra reference count */
        dm_io_dec_pending(ci.io, errno_to_blk_status(error));
-       return ret;
 }
 
-static blk_qc_t dm_submit_bio(struct bio *bio)
+static void dm_submit_bio(struct bio *bio)
 {
        struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
-       blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;
        struct dm_table *map;
 
@@ -1609,10 +1602,9 @@ static blk_qc_t dm_submit_bio(struct bio *bio)
        if (is_abnormal_io(bio))
                blk_queue_split(&bio);
 
-       ret = __split_and_process_bio(md, map, bio);
+       __split_and_process_bio(md, map, bio);
 out:
        dm_put_live_table(md, srcu_idx);
-       return ret;
 }
 
 /*-----------------------------------------------------------------
@@ -1671,14 +1663,14 @@ static const struct dax_operations dm_dax_ops;
 static void dm_wq_work(struct work_struct *work);
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+static void dm_queue_destroy_crypto_profile(struct request_queue *q)
 {
-       dm_destroy_keyslot_manager(q->ksm);
+       dm_destroy_crypto_profile(q->crypto_profile);
 }
 
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
 
-static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
 {
 }
 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
@@ -1704,7 +1696,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
                        dm_sysfs_exit(md);
                        del_gendisk(md->disk);
                }
-               dm_queue_destroy_keyslot_manager(md->queue);
+               dm_queue_destroy_crypto_profile(md->queue);
                blk_cleanup_disk(md->disk);
        }
 
@@ -2086,7 +2078,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
        if (r)
                return r;
 
-       add_disk(md->disk);
+       r = add_disk(md->disk);
+       if (r)
+               return r;
 
        r = dm_sysfs_init(md);
        if (r) {
index 6c0c3d0..5111ed9 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/sched/signal.h>
 #include <linux/kthread.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/badblocks.h>
 #include <linux/sysctl.h>
 #include <linux/seq_file.h>
@@ -51,6 +52,7 @@
 #include <linux/hdreg.h>
 #include <linux/proc_fs.h>
 #include <linux/random.h>
+#include <linux/major.h>
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/file.h>
@@ -352,7 +354,7 @@ static bool create_on_open = true;
  */
 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
-void md_new_event(struct mddev *mddev)
+void md_new_event(void)
 {
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
@@ -441,19 +443,19 @@ check_suspended:
 }
 EXPORT_SYMBOL(md_handle_request);
 
-static blk_qc_t md_submit_bio(struct bio *bio)
+static void md_submit_bio(struct bio *bio)
 {
        const int rw = bio_data_dir(bio);
        struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
 
        if (mddev == NULL || mddev->pers == NULL) {
                bio_io_error(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
                bio_io_error(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        blk_queue_split(&bio);
@@ -462,15 +464,13 @@ static blk_qc_t md_submit_bio(struct bio *bio)
                if (bio_sectors(bio) != 0)
                        bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        /* bio could be mergeable after passing to underlayer */
        bio->bi_opf &= ~REQ_NOMERGE;
 
        md_handle_request(mddev, bio);
-
-       return BLK_QC_T_NONE;
 }
 
 /* mddev_suspend makes sure no new requests are submitted
@@ -888,8 +888,7 @@ static struct md_personality *find_pers(int level, char *clevel)
 /* return the offset of the super block in 512byte sectors */
 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
 {
-       sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
-       return MD_NEW_SIZE_SECTORS(num_sectors);
+       return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
 }
 
 static int alloc_disk_sb(struct md_rdev *rdev)
@@ -1631,8 +1630,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
         */
        switch(minor_version) {
        case 0:
-               sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
-               sb_start -= 8*2;
+               sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
                sb_start &= ~(sector_t)(4*2-1);
                break;
        case 1:
@@ -1787,10 +1785,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
                else
                        ret = 0;
        }
-       if (minor_version) {
-               sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
-               sectors -= rdev->data_offset;
-       } else
+       if (minor_version)
+               sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
+       else
                sectors = rdev->sb_start;
        if (sectors < le64_to_cpu(sb->data_size))
                return -EINVAL;
@@ -2168,8 +2165,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
                return 0; /* too confusing */
        if (rdev->sb_start < rdev->data_offset) {
                /* minor versions 1 and 2; superblock before data */
-               max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
-               max_sectors -= rdev->data_offset;
+               max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
                if (!num_sectors || num_sectors > max_sectors)
                        num_sectors = max_sectors;
        } else if (rdev->mddev->bitmap_info.offset) {
@@ -2178,7 +2174,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
        } else {
                /* minor version 0; superblock after data */
                sector_t sb_start, bm_space;
-               sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
+               sector_t dev_size = bdev_nr_sectors(rdev->bdev);
 
                /* 8K is for superblock */
                sb_start = dev_size - 8*2;
@@ -2886,7 +2882,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
        if (mddev->degraded)
                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-       md_new_event(mddev);
+       md_new_event();
        md_wakeup_thread(mddev->thread);
        return 0;
 }
@@ -2976,7 +2972,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
         *  -write_error - clears WriteErrorSeen
         *  {,-}failfast - set/clear FailFast
         */
+
+       struct mddev *mddev = rdev->mddev;
        int err = -EINVAL;
+       bool need_update_sb = false;
+
        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
                md_error(rdev->mddev, rdev);
                if (test_bit(Faulty, &rdev->flags))
@@ -2991,7 +2991,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
                if (rdev->raid_disk >= 0)
                        err = -EBUSY;
                else {
-                       struct mddev *mddev = rdev->mddev;
                        err = 0;
                        if (mddev_is_clustered(mddev))
                                err = md_cluster_ops->remove_disk(mddev, rdev);
@@ -3002,16 +3001,18 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
                                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                                        md_wakeup_thread(mddev->thread);
                                }
-                               md_new_event(mddev);
+                               md_new_event();
                        }
                }
        } else if (cmd_match(buf, "writemostly")) {
                set_bit(WriteMostly, &rdev->flags);
                mddev_create_serial_pool(rdev->mddev, rdev, false);
+               need_update_sb = true;
                err = 0;
        } else if (cmd_match(buf, "-writemostly")) {
                mddev_destroy_serial_pool(rdev->mddev, rdev, false);
                clear_bit(WriteMostly, &rdev->flags);
+               need_update_sb = true;
                err = 0;
        } else if (cmd_match(buf, "blocked")) {
                set_bit(Blocked, &rdev->flags);
@@ -3037,9 +3038,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
                err = 0;
        } else if (cmd_match(buf, "failfast")) {
                set_bit(FailFast, &rdev->flags);
+               need_update_sb = true;
                err = 0;
        } else if (cmd_match(buf, "-failfast")) {
                clear_bit(FailFast, &rdev->flags);
+               need_update_sb = true;
                err = 0;
        } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
                   !test_bit(Journal, &rdev->flags)) {
@@ -3118,6 +3121,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
                clear_bit(ExternalBbl, &rdev->flags);
                err = 0;
        }
+       if (need_update_sb)
+               md_update_sb(mddev, 1);
        if (!err)
                sysfs_notify_dirent_safe(rdev->sysfs_state);
        return err ? err : len;
@@ -3382,7 +3387,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
                        if (!sectors)
                                return -EBUSY;
                } else if (!sectors)
-                       sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
+                       sectors = bdev_nr_sectors(rdev->bdev) -
                                rdev->data_offset;
                if (!my_mddev->pers->resize)
                        /* Cannot change size for RAID0 or Linear etc */
@@ -3709,7 +3714,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
 
        kobject_init(&rdev->kobj, &rdev_ktype);
 
-       size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
+       size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
        if (!size) {
                pr_warn("md: %s has zero or unknown size, marking faulty!\n",
                        bdevname(rdev->bdev,b));
@@ -4099,7 +4104,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
        if (!mddev->thread)
                md_update_sb(mddev, 1);
        sysfs_notify_dirent_safe(mddev->sysfs_level);
-       md_new_event(mddev);
+       md_new_event();
        rv = len;
 out_unlock:
        mddev_unlock(mddev);
@@ -4620,7 +4625,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
                export_rdev(rdev);
        mddev_unlock(mddev);
        if (!err)
-               md_new_event(mddev);
+               md_new_event();
        return err ? err : len;
 }
 
@@ -5490,6 +5495,10 @@ static struct attribute *md_default_attrs[] = {
        NULL,
 };
 
+static const struct attribute_group md_default_group = {
+       .attrs = md_default_attrs,
+};
+
 static struct attribute *md_redundancy_attrs[] = {
        &md_scan_mode.attr,
        &md_last_scan_mode.attr,
@@ -5512,6 +5521,12 @@ static const struct attribute_group md_redundancy_group = {
        .attrs = md_redundancy_attrs,
 };
 
+static const struct attribute_group *md_attr_groups[] = {
+       &md_default_group,
+       &md_bitmap_group,
+       NULL,
+};
+
 static ssize_t
 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
@@ -5587,7 +5602,7 @@ static const struct sysfs_ops md_sysfs_ops = {
 static struct kobj_type md_ktype = {
        .release        = md_free,
        .sysfs_ops      = &md_sysfs_ops,
-       .default_attrs  = md_default_attrs,
+       .default_groups = md_attr_groups,
 };
 
 int mdp_major = 0;
@@ -5596,7 +5611,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
 {
        struct mddev *mddev = container_of(ws, struct mddev, del_work);
 
-       sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
        kobject_del(&mddev->kobj);
        kobject_put(&mddev->kobj);
 }
@@ -5663,7 +5677,7 @@ static int md_alloc(dev_t dev, char *name)
                            strcmp(mddev2->gendisk->disk_name, name) == 0) {
                                spin_unlock(&all_mddevs_lock);
                                error = -EEXIST;
-                               goto abort;
+                               goto out_unlock_disks_mutex;
                        }
                spin_unlock(&all_mddevs_lock);
        }
@@ -5676,7 +5690,7 @@ static int md_alloc(dev_t dev, char *name)
        error = -ENOMEM;
        disk = blk_alloc_disk(NUMA_NO_NODE);
        if (!disk)
-               goto abort;
+               goto out_unlock_disks_mutex;
 
        disk->major = MAJOR(mddev->unit);
        disk->first_minor = unit << shift;
@@ -5700,27 +5714,25 @@ static int md_alloc(dev_t dev, char *name)
        disk->flags |= GENHD_FL_EXT_DEVT;
        disk->events |= DISK_EVENT_MEDIA_CHANGE;
        mddev->gendisk = disk;
-       add_disk(disk);
+       error = add_disk(disk);
+       if (error)
+               goto out_cleanup_disk;
 
        error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
-       if (error) {
-               /* This isn't possible, but as kobject_init_and_add is marked
-                * __must_check, we must do something with the result
-                */
-               pr_debug("md: cannot register %s/md - name in use\n",
-                        disk->disk_name);
-               error = 0;
-       }
-       if (mddev->kobj.sd &&
-           sysfs_create_group(&mddev->kobj, &md_bitmap_group))
-               pr_debug("pointless warning\n");
- abort:
+       if (error)
+               goto out_del_gendisk;
+
+       kobject_uevent(&mddev->kobj, KOBJ_ADD);
+       mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
+       mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
+       goto out_unlock_disks_mutex;
+
+out_del_gendisk:
+       del_gendisk(disk);
+out_cleanup_disk:
+       blk_cleanup_disk(disk);
+out_unlock_disks_mutex:
        mutex_unlock(&disks_mutex);
-       if (!error && mddev->kobj.sd) {
-               kobject_uevent(&mddev->kobj, KOBJ_ADD);
-               mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
-               mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
-       }
        mddev_put(mddev);
        return error;
 }
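
Distilled from the hunk above: once add_disk() can fail, the allocation path unwinds through labelled error exits instead of the old "pointless warning" handling; a rough sketch of the pattern (function name is illustrative):

static int example_add_md_disk(struct mddev *mddev, struct gendisk *disk)
{
        int error;

        error = add_disk(disk);         /* add_disk() now returns an error */
        if (error)
                goto out_cleanup_disk;

        error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
        if (error)
                goto out_del_gendisk;

        kobject_uevent(&mddev->kobj, KOBJ_ADD);
        return 0;

out_del_gendisk:
        del_gendisk(disk);
out_cleanup_disk:
        blk_cleanup_disk(disk);
        return error;
}
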
@@ -6034,7 +6046,7 @@ int md_run(struct mddev *mddev)
        if (mddev->sb_flags)
                md_update_sb(mddev, 0);
 
-       md_new_event(mddev);
+       md_new_event();
        return 0;
 
 bitmap_abort:
@@ -6424,7 +6436,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
                if (mddev->hold_active == UNTIL_STOP)
                        mddev->hold_active = 0;
        }
-       md_new_event(mddev);
+       md_new_event();
        sysfs_notify_dirent_safe(mddev->sysfs_state);
        return 0;
 }
@@ -6880,7 +6892,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
 
                if (!mddev->persistent) {
                        pr_debug("md: nonpersistent superblock ...\n");
-                       rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
+                       rdev->sb_start = bdev_nr_sectors(rdev->bdev);
                } else
                        rdev->sb_start = calc_dev_sboffset(rdev);
                rdev->sectors = rdev->sb_start;
@@ -6928,7 +6940,7 @@ kick_rdev:
                md_wakeup_thread(mddev->thread);
        else
                md_update_sb(mddev, 1);
-       md_new_event(mddev);
+       md_new_event();
 
        return 0;
 busy:
@@ -6967,7 +6979,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
        if (mddev->persistent)
                rdev->sb_start = calc_dev_sboffset(rdev);
        else
-               rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
+               rdev->sb_start = bdev_nr_sectors(rdev->bdev);
 
        rdev->sectors = rdev->sb_start;
 
@@ -7001,7 +7013,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
         */
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
-       md_new_event(mddev);
+       md_new_event();
        return 0;
 
 abort_export:
@@ -7975,7 +7987,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
        md_wakeup_thread(mddev->thread);
        if (mddev->event_work.func)
                queue_work(md_misc_wq, &mddev->event_work);
-       md_new_event(mddev);
+       md_new_event();
 }
 EXPORT_SYMBOL(md_error);
 
@@ -8859,7 +8871,7 @@ void md_do_sync(struct md_thread *thread)
                mddev->curr_resync = 3; /* no longer delayed */
        mddev->curr_resync_completed = j;
        sysfs_notify_dirent_safe(mddev->sysfs_completed);
-       md_new_event(mddev);
+       md_new_event();
        update_time = jiffies;
 
        blk_start_plug(&plug);
@@ -8930,7 +8942,7 @@ void md_do_sync(struct md_thread *thread)
                        /* this is the earliest that rebuild will be
                         * visible in /proc/mdstat
                         */
-                       md_new_event(mddev);
+                       md_new_event();
 
                if (last_check + window > io_sectors || j == max_sectors)
                        continue;
@@ -9154,7 +9166,7 @@ static int remove_and_add_spares(struct mddev *mddev,
                        sysfs_link_rdev(mddev, rdev);
                        if (!test_bit(Journal, &rdev->flags))
                                spares++;
-                       md_new_event(mddev);
+                       md_new_event();
                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                }
        }
@@ -9188,7 +9200,7 @@ static void md_start_sync(struct work_struct *ws)
        } else
                md_wakeup_thread(mddev->sync_thread);
        sysfs_notify_dirent_safe(mddev->sysfs_action);
-       md_new_event(mddev);
+       md_new_event();
 }
 
 /*
@@ -9447,7 +9459,7 @@ void md_reap_sync_thread(struct mddev *mddev)
        /* flag recovery needed just to double check */
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        sysfs_notify_dirent_safe(mddev->sysfs_action);
-       md_new_event(mddev);
+       md_new_event();
        if (mddev->event_work.func)
                queue_work(md_misc_wq, &mddev->event_work);
 }
index 4c96c36..53ea7a6 100644 (file)
@@ -731,7 +731,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
                        struct page *page, int op, int op_flags,
                        bool metadata_op);
 extern void md_do_sync(struct md_thread *thread);
-extern void md_new_event(struct mddev *mddev);
+extern void md_new_event(void);
 extern void md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
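
The prototype change above is what lets every call site in this series drop the mddev argument; a one-function illustration of a converted caller (name is illustrative):

/* Illustrative caller: the event is no longer tied to a specific array. */
static void example_notify(struct mddev *mddev)
{
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_new_event();                 /* was: md_new_event(mddev); */
        md_wakeup_thread(mddev->thread);
}
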
index 19598bd..7dc8026 100644 (file)
@@ -1496,7 +1496,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                if (!r1_bio->bios[i])
                        continue;
 
-               if (first_clone) {
+               if (first_clone && test_bit(WriteMostly, &rdev->flags)) {
                        /* do behind I/O ?
                         * Not if there are too many, or cannot
                         * allocate memory, or a reader on WriteMostly
@@ -1529,13 +1529,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
                r1_bio->bios[i] = mbio;
 
-               mbio->bi_iter.bi_sector = (r1_bio->sector +
-                                  conf->mirrors[i].rdev->data_offset);
-               bio_set_dev(mbio, conf->mirrors[i].rdev->bdev);
+               mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
+               bio_set_dev(mbio, rdev->bdev);
                mbio->bi_end_io = raid1_end_write_request;
                mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
-               if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
-                   !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
+               if (test_bit(FailFast, &rdev->flags) &&
+                   !test_bit(WriteMostly, &rdev->flags) &&
                    conf->raid_disks - mddev->degraded > 1)
                        mbio->bi_opf |= MD_FAILFAST;
                mbio->bi_private = r1_bio;
@@ -1546,7 +1545,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                        trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
                                              r1_bio->sector);
                /* flush_pending_writes() needs access to the rdev so...*/
-               mbio->bi_bdev = (void *)conf->mirrors[i].rdev;
+               mbio->bi_bdev = (void *)rdev;
 
                cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
                if (cb)
index aa26365..dde98f6 100644 (file)
@@ -4647,7 +4647,7 @@ out:
        }
        conf->reshape_checkpoint = jiffies;
        md_wakeup_thread(mddev->sync_thread);
-       md_new_event(mddev);
+       md_new_event();
        return 0;
 
 abort:
index 02ed53b..9c1a587 100644 (file)
@@ -7732,10 +7732,7 @@ static int raid5_run(struct mddev *mddev)
                 * discard data disk but write parity disk
                 */
                stripe = stripe * PAGE_SIZE;
-               /* Round up to power of 2, as discard handling
-                * currently assumes that */
-               while ((stripe-1) & stripe)
-                       stripe = (stripe | (stripe-1)) + 1;
+               stripe = roundup_pow_of_two(stripe);
                mddev->queue->limits.discard_alignment = stripe;
                mddev->queue->limits.discard_granularity = stripe;
 
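
roundup_pow_of_two() from <linux/log2.h> produces the same result as the removed open-coded loop; a small worked example, assuming a 3-data-disk layout with 4 KiB chunks:

#include <linux/log2.h>

/* Example: 3 data disks * 4096-byte chunk -> 12288 bytes per stripe. */
static unsigned long example_discard_granularity(void)
{
        unsigned long stripe = 3 * 4096;

        /* The removed loop rounded 12288 up to 16384 the hard way:
         *   while ((stripe - 1) & stripe)
         *           stripe = (stripe | (stripe - 1)) + 1;
         */
        return roundup_pow_of_two(stripe);      /* 16384 */
}
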
@@ -8282,7 +8279,7 @@ static int raid5_start_reshape(struct mddev *mddev)
        }
        conf->reshape_checkpoint = jiffies;
        md_wakeup_thread(mddev->sync_thread);
-       md_new_event(mddev);
+       md_new_event();
        return 0;
 }
 
index 431af5e..74882fa 100644 (file)
@@ -258,7 +258,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
        mq = &md->queue;
 
        /* Dispatch locking to the block layer */
-       req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, 0);
+       req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_OUT, 0);
        if (IS_ERR(req)) {
                count = PTR_ERR(req);
                goto out_put;
@@ -266,7 +266,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
        req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP;
        blk_execute_rq(NULL, req, 0);
        ret = req_to_mmc_queue_req(req)->drv_op_result;
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
        if (!ret) {
                pr_info("%s: Locking boot partition ro until next power on\n",
@@ -646,7 +646,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
         * Dispatch the ioctl() into the block request queue.
         */
        mq = &md->queue;
-       req = blk_get_request(mq->queue,
+       req = blk_mq_alloc_request(mq->queue,
                idata->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -660,7 +660,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
        blk_execute_rq(NULL, req, 0);
        ioc_err = req_to_mmc_queue_req(req)->drv_op_result;
        err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata);
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
 cmd_done:
        kfree(idata->buf);
@@ -716,7 +716,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
         * Dispatch the ioctl()s into the block request queue.
         */
        mq = &md->queue;
-       req = blk_get_request(mq->queue,
+       req = blk_mq_alloc_request(mq->queue,
                idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -733,7 +733,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
        for (i = 0; i < num_of_cmds && !err; i++)
                err = mmc_blk_ioctl_copy_to_user(&cmds[i], idata[i]);
 
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
 cmd_err:
        for (i = 0; i < num_of_cmds; i++) {
@@ -2730,7 +2730,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
        int ret;
 
        /* Ask the block layer about the card status */
-       req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0);
+       req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
                return PTR_ERR(req);
        req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
@@ -2740,7 +2740,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
                *val = ret;
                ret = 0;
        }
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
        return ret;
 }
@@ -2766,7 +2766,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
                return -ENOMEM;
 
        /* Ask the block layer for the EXT CSD */
-       req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0);
+       req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out_free;
@@ -2775,7 +2775,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
        req_to_mmc_queue_req(req)->drv_op_data = &ext_csd;
        blk_execute_rq(NULL, req, 0);
        err = req_to_mmc_queue_req(req)->drv_op_result;
-       blk_put_request(req);
+       blk_mq_free_request(req);
        if (err) {
                pr_err("FAILED %d\n", err);
                goto out_free;
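
All of the conversions in this file follow the same allocate/execute/free shape; a condensed sketch mirroring mmc_dbg_card_status_get() above, with the error handling kept minimal:

static int example_card_status(struct mmc_queue *mq, u64 *val)
{
        struct request *req;
        int ret;

        /* blk_get_request()/blk_put_request() became the blk-mq variants. */
        req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
                return PTR_ERR(req);

        req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
        blk_execute_rq(NULL, req, 0);
        ret = req_to_mmc_queue_req(req)->drv_op_result;
        if (ret >= 0) {
                *val = ret;
                ret = 0;
        }
        blk_mq_free_request(req);
        return ret;
}
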
index 6755780..fec4fbf 100644 (file)
@@ -16,13 +16,13 @@ void mmc_crypto_set_initial_state(struct mmc_host *host)
 {
        /* Reset might clear all keys, so reprogram all the keys. */
        if (host->caps2 & MMC_CAP2_CRYPTO)
-               blk_ksm_reprogram_all_keys(&host->ksm);
+               blk_crypto_reprogram_all_keys(&host->crypto_profile);
 }
 
 void mmc_crypto_setup_queue(struct request_queue *q, struct mmc_host *host)
 {
        if (host->caps2 & MMC_CAP2_CRYPTO)
-               blk_ksm_register(&host->ksm, q);
+               blk_crypto_register(&host->crypto_profile, q);
 }
 EXPORT_SYMBOL_GPL(mmc_crypto_setup_queue);
 
@@ -30,12 +30,15 @@ void mmc_crypto_prepare_req(struct mmc_queue_req *mqrq)
 {
        struct request *req = mmc_queue_req_to_req(mqrq);
        struct mmc_request *mrq = &mqrq->brq.mrq;
+       struct blk_crypto_keyslot *keyslot;
 
        if (!req->crypt_ctx)
                return;
 
        mrq->crypto_ctx = req->crypt_ctx;
-       if (req->crypt_keyslot)
-               mrq->crypto_key_slot = blk_ksm_get_slot_idx(req->crypt_keyslot);
+
+       keyslot = req->crypt_keyslot;
+       if (keyslot)
+               mrq->crypto_key_slot = blk_crypto_keyslot_index(keyslot);
 }
 EXPORT_SYMBOL_GPL(mmc_crypto_prepare_req);
index 4646b7a..c9db24e 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/pm_runtime.h>
+#include <linux/scatterlist.h>
 
 #include <linux/mmc/host.h>
 #include <linux/mmc/card.h>
index 95b3511..ccc148c 100644 (file)
@@ -506,7 +506,7 @@ config MMC_OMAP_HS
 
 config MMC_WBSD
        tristate "Winbond W83L51xD SD/MMC Card Interface support"
-       depends on ISA_DMA_API
+       depends on ISA_DMA_API && !M68K
        help
          This selects the Winbond(R) W83L51xD Secure digital and
          Multimedia card Interface.
index 38559a9..31f8412 100644 (file)
@@ -282,6 +282,9 @@ static void __cqhci_enable(struct cqhci_host *cq_host)
 
        cqhci_writel(cq_host, cqcfg, CQHCI_CFG);
 
+       if (cqhci_readl(cq_host, CQHCI_CTL) & CQHCI_HALT)
+               cqhci_writel(cq_host, 0, CQHCI_CTL);
+
        mmc->cqe_on = true;
 
        if (cq_host->ops->enable)
index 6419cfb..d5f4b69 100644 (file)
@@ -6,7 +6,7 @@
  */
 
 #include <linux/blk-crypto.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 #include <linux/mmc/host.h>
 
 #include "cqhci-crypto.h"
@@ -23,9 +23,10 @@ static const struct cqhci_crypto_alg_entry {
 };
 
 static inline struct cqhci_host *
-cqhci_host_from_ksm(struct blk_keyslot_manager *ksm)
+cqhci_host_from_crypto_profile(struct blk_crypto_profile *profile)
 {
-       struct mmc_host *mmc = container_of(ksm, struct mmc_host, ksm);
+       struct mmc_host *mmc =
+               container_of(profile, struct mmc_host, crypto_profile);
 
        return mmc->cqe_private;
 }
@@ -57,12 +58,12 @@ static int cqhci_crypto_program_key(struct cqhci_host *cq_host,
        return 0;
 }
 
-static int cqhci_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
+static int cqhci_crypto_keyslot_program(struct blk_crypto_profile *profile,
                                        const struct blk_crypto_key *key,
                                        unsigned int slot)
 
 {
-       struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm);
+       struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile);
        const union cqhci_crypto_cap_entry *ccap_array =
                cq_host->crypto_cap_array;
        const struct cqhci_crypto_alg_entry *alg =
@@ -115,11 +116,11 @@ static int cqhci_crypto_clear_keyslot(struct cqhci_host *cq_host, int slot)
        return cqhci_crypto_program_key(cq_host, &cfg, slot);
 }
 
-static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int cqhci_crypto_keyslot_evict(struct blk_crypto_profile *profile,
                                      const struct blk_crypto_key *key,
                                      unsigned int slot)
 {
-       struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm);
+       struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile);
 
        return cqhci_crypto_clear_keyslot(cq_host, slot);
 }
@@ -132,7 +133,7 @@ static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
  * "enabled" when these are called, i.e. CQHCI_ENABLE might not be set in the
  * CQHCI_CFG register.  But the hardware allows that.
  */
-static const struct blk_ksm_ll_ops cqhci_ksm_ops = {
+static const struct blk_crypto_ll_ops cqhci_crypto_ops = {
        .keyslot_program        = cqhci_crypto_keyslot_program,
        .keyslot_evict          = cqhci_crypto_keyslot_evict,
 };
@@ -157,8 +158,8 @@ cqhci_find_blk_crypto_mode(union cqhci_crypto_cap_entry cap)
  *
  * If the driver previously set MMC_CAP2_CRYPTO and the CQE declares
  * CQHCI_CAP_CS, initialize the crypto support.  This involves reading the
- * crypto capability registers, initializing the keyslot manager, clearing all
- * keyslots, and enabling 128-bit task descriptors.
+ * crypto capability registers, initializing the blk_crypto_profile, clearing
+ * all keyslots, and enabling 128-bit task descriptors.
  *
  * Return: 0 if crypto was initialized or isn't supported; whether
  *        MMC_CAP2_CRYPTO remains set indicates which one of those cases it is.
@@ -168,7 +169,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
 {
        struct mmc_host *mmc = cq_host->mmc;
        struct device *dev = mmc_dev(mmc);
-       struct blk_keyslot_manager *ksm = &mmc->ksm;
+       struct blk_crypto_profile *profile = &mmc->crypto_profile;
        unsigned int num_keyslots;
        unsigned int cap_idx;
        enum blk_crypto_mode_num blk_mode_num;
@@ -199,15 +200,15 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
         */
        num_keyslots = cq_host->crypto_capabilities.config_count + 1;
 
-       err = devm_blk_ksm_init(dev, ksm, num_keyslots);
+       err = devm_blk_crypto_profile_init(dev, profile, num_keyslots);
        if (err)
                goto out;
 
-       ksm->ksm_ll_ops = cqhci_ksm_ops;
-       ksm->dev = dev;
+       profile->ll_ops = cqhci_crypto_ops;
+       profile->dev = dev;
 
        /* Unfortunately, CQHCI crypto only supports 32 DUN bits. */
-       ksm->max_dun_bytes_supported = 4;
+       profile->max_dun_bytes_supported = 4;
 
        /*
         * Cache all the crypto capabilities and advertise the supported crypto
@@ -223,7 +224,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
                                        cq_host->crypto_cap_array[cap_idx]);
                if (blk_mode_num == BLK_ENCRYPTION_MODE_INVALID)
                        continue;
-               ksm->crypto_modes_supported[blk_mode_num] |=
+               profile->modes_supported[blk_mode_num] |=
                        cq_host->crypto_cap_array[cap_idx].sdus_mask * 512;
        }
 
index 0c75810..1f8a3c0 100644 (file)
@@ -464,6 +464,18 @@ static s8 dw_mci_exynos_get_best_clksmpl(u8 candiates)
                }
        }
 
+       /*
+        * If there is no candidate value, then it needs to return -EIO.
+        * If there are candidate values but the best clk sample value is not
+        * found, then use the first candidate clock sample value.
+        */
+       for (i = 0; i < iter; i++) {
+               __c = ror8(candiates, i);
+               if ((__c & 0x1) == 0x1) {
+                       loc = i;
+                       goto out;
+               }
+       }
 out:
        return loc;
 }
@@ -494,6 +506,8 @@ static int dw_mci_exynos_execute_tuning(struct dw_mci_slot *slot, u32 opcode)
                priv->tuned_sample = found;
        } else {
                ret = -EIO;
+               dev_warn(&mmc->class_dev,
+                       "There is no candiates value about clksmpl!\n");
        }
 
        return ret;
index 4dfc246..b06b4dc 100644 (file)
@@ -2577,6 +2577,25 @@ static int msdc_drv_probe(struct platform_device *pdev)
                host->dma_mask = DMA_BIT_MASK(32);
        mmc_dev(mmc)->dma_mask = &host->dma_mask;
 
+       host->timeout_clks = 3 * 1048576;
+       host->dma.gpd = dma_alloc_coherent(&pdev->dev,
+                               2 * sizeof(struct mt_gpdma_desc),
+                               &host->dma.gpd_addr, GFP_KERNEL);
+       host->dma.bd = dma_alloc_coherent(&pdev->dev,
+                               MAX_BD_NUM * sizeof(struct mt_bdma_desc),
+                               &host->dma.bd_addr, GFP_KERNEL);
+       if (!host->dma.gpd || !host->dma.bd) {
+               ret = -ENOMEM;
+               goto release_mem;
+       }
+       msdc_init_gpd_bd(host, &host->dma);
+       INIT_DELAYED_WORK(&host->req_timeout, msdc_request_timeout);
+       spin_lock_init(&host->lock);
+
+       platform_set_drvdata(pdev, mmc);
+       msdc_ungate_clock(host);
+       msdc_init_hw(host);
+
        if (mmc->caps2 & MMC_CAP2_CQE) {
                host->cq_host = devm_kzalloc(mmc->parent,
                                             sizeof(*host->cq_host),
@@ -2597,25 +2616,6 @@ static int msdc_drv_probe(struct platform_device *pdev)
                mmc->max_seg_size = 64 * 1024;
        }
 
-       host->timeout_clks = 3 * 1048576;
-       host->dma.gpd = dma_alloc_coherent(&pdev->dev,
-                               2 * sizeof(struct mt_gpdma_desc),
-                               &host->dma.gpd_addr, GFP_KERNEL);
-       host->dma.bd = dma_alloc_coherent(&pdev->dev,
-                               MAX_BD_NUM * sizeof(struct mt_bdma_desc),
-                               &host->dma.bd_addr, GFP_KERNEL);
-       if (!host->dma.gpd || !host->dma.bd) {
-               ret = -ENOMEM;
-               goto release_mem;
-       }
-       msdc_init_gpd_bd(host, &host->dma);
-       INIT_DELAYED_WORK(&host->req_timeout, msdc_request_timeout);
-       spin_lock_init(&host->lock);
-
-       platform_set_drvdata(pdev, mmc);
-       msdc_ungate_clock(host);
-       msdc_init_hw(host);
-
        ret = devm_request_irq(&pdev->dev, host->irq, msdc_irq,
                               IRQF_TRIGGER_NONE, pdev->name, host);
        if (ret)
index f18d169..e658f01 100644 (file)
@@ -1187,6 +1187,7 @@ static void esdhc_reset_tuning(struct sdhci_host *host)
        struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
        struct pltfm_imx_data *imx_data = sdhci_pltfm_priv(pltfm_host);
        u32 ctrl;
+       int ret;
 
        /* Reset the tuning circuit */
        if (esdhc_is_usdhc(imx_data)) {
@@ -1199,7 +1200,22 @@ static void esdhc_reset_tuning(struct sdhci_host *host)
                } else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) {
                        ctrl = readl(host->ioaddr + SDHCI_AUTO_CMD_STATUS);
                        ctrl &= ~ESDHC_MIX_CTRL_SMPCLK_SEL;
+                       ctrl &= ~ESDHC_MIX_CTRL_EXE_TUNE;
                        writel(ctrl, host->ioaddr + SDHCI_AUTO_CMD_STATUS);
+                       /* Make sure ESDHC_MIX_CTRL_EXE_TUNE is cleared */
+                       ret = readl_poll_timeout(host->ioaddr + SDHCI_AUTO_CMD_STATUS,
+                               ctrl, !(ctrl & ESDHC_MIX_CTRL_EXE_TUNE), 1, 50);
+                       if (ret == -ETIMEDOUT)
+                               dev_warn(mmc_dev(host->mmc),
+                                "Warning! clear execute tuning bit failed\n");
+                       /*
+                        * SDHCI_INT_DATA_AVAIL is a W1C bit; setting it clears the usdhc
+                        * IP's internal execute_tuning_with_clr_buf logic flag, which in
+                        * turn makes sure the normal data transfer logic is correct.
+                        */
+                       ctrl = readl(host->ioaddr + SDHCI_INT_STATUS);
+                       ctrl |= SDHCI_INT_DATA_AVAIL;
+                       writel(ctrl, host->ioaddr + SDHCI_INT_STATUS);
                }
        }
 }
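
readl_poll_timeout() from <linux/iopoll.h> is what bounds the wait for the execute-tuning bit to clear; a minimal sketch of the polling idiom, assuming a register and busy bit like the ones above:

#include <linux/iopoll.h>

/* Poll until the busy bit clears, checking every 1 us for up to 50 us. */
static int example_wait_tune_done(void __iomem *reg, u32 busy_bit)
{
        u32 val;

        /* returns 0 on success, -ETIMEDOUT if the bit is still set after 50 us */
        return readl_poll_timeout(reg, val, !(val & busy_bit), 1, 50);
}
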
index be19785..d0f2edf 100644 (file)
@@ -616,16 +616,12 @@ static int intel_select_drive_strength(struct mmc_card *card,
        return intel_host->drv_strength;
 }
 
-static int bxt_get_cd(struct mmc_host *mmc)
+static int sdhci_get_cd_nogpio(struct mmc_host *mmc)
 {
-       int gpio_cd = mmc_gpio_get_cd(mmc);
        struct sdhci_host *host = mmc_priv(mmc);
        unsigned long flags;
        int ret = 0;
 
-       if (!gpio_cd)
-               return 0;
-
        spin_lock_irqsave(&host->lock, flags);
 
        if (host->flags & SDHCI_DEVICE_DEAD)
@@ -638,6 +634,21 @@ out:
        return ret;
 }
 
+static int bxt_get_cd(struct mmc_host *mmc)
+{
+       int gpio_cd = mmc_gpio_get_cd(mmc);
+
+       if (!gpio_cd)
+               return 0;
+
+       return sdhci_get_cd_nogpio(mmc);
+}
+
+static int mrfld_get_cd(struct mmc_host *mmc)
+{
+       return sdhci_get_cd_nogpio(mmc);
+}
+
 #define SDHCI_INTEL_PWR_TIMEOUT_CNT    20
 #define SDHCI_INTEL_PWR_TIMEOUT_UDELAY 100
 
@@ -1341,6 +1352,14 @@ static int intel_mrfld_mmc_probe_slot(struct sdhci_pci_slot *slot)
                                         MMC_CAP_1_8V_DDR;
                break;
        case INTEL_MRFLD_SD:
+               slot->cd_idx = 0;
+               slot->cd_override_level = true;
+               /*
+                * There are two PCB designs of SD card slot with the opposite
+                * card detection sense. Quirk this out by ignoring GPIO state
+                * completely in the custom ->get_cd() callback.
+                */
+               slot->host->mmc_host_ops.get_cd = mrfld_get_cd;
                slot->host->quirks2 |= SDHCI_QUIRK2_NO_1_8_V;
                break;
        case INTEL_MRFLD_SDIO:
index 8eefa7d..2d80a04 100644 (file)
@@ -2042,6 +2042,12 @@ void sdhci_set_power_noreg(struct sdhci_host *host, unsigned char mode,
                        break;
                case MMC_VDD_32_33:
                case MMC_VDD_33_34:
+               /*
+                * 3.4 ~ 3.6V are valid only for those platforms where it's
+                * known that the voltage range is supported by hardware.
+                */
+               case MMC_VDD_34_35:
+               case MMC_VDD_35_36:
                        pwr = SDHCI_POWER_330;
                        break;
                default:
index 7dfc26f..e2affa5 100644 (file)
@@ -195,6 +195,10 @@ static void tmio_mmc_reset(struct tmio_mmc_host *host)
        sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, host->sdcard_irq_mask_all);
        host->sdcard_irq_mask = host->sdcard_irq_mask_all;
 
+       if (host->native_hotplug)
+               tmio_mmc_enable_mmc_irqs(host,
+                               TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT);
+
        tmio_mmc_set_bus_width(host, host->mmc->ios.bus_width);
 
        if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) {
@@ -956,8 +960,15 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        case MMC_POWER_OFF:
                tmio_mmc_power_off(host);
                /* For R-Car Gen2+, we need to reset SDHI specific SCC */
-               if (host->pdata->flags & TMIO_MMC_MIN_RCAR2)
+               if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) {
                        host->reset(host);
+
+                       if (host->native_hotplug)
+                               tmio_mmc_enable_mmc_irqs(host,
+                                               TMIO_STAT_CARD_REMOVE |
+                                               TMIO_STAT_CARD_INSERT);
+               }
+
                host->set_clock(host, 0);
                break;
        case MMC_POWER_UP:
@@ -1185,10 +1196,6 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host)
        _host->set_clock(_host, 0);
        tmio_mmc_reset(_host);
 
-       if (_host->native_hotplug)
-               tmio_mmc_enable_mmc_irqs(_host,
-                               TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT);
-
        spin_lock_init(&_host->lock);
        mutex_init(&_host->ios_lock);
 
index 4950d10..97beece 100644 (file)
@@ -576,7 +576,7 @@ static void check_vub300_port_status(struct vub300_mmc_host *vub300)
                                GET_SYSTEM_PORT_STATUS,
                                USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                0x0000, 0x0000, &vub300->system_port_status,
-                               sizeof(vub300->system_port_status), HZ);
+                               sizeof(vub300->system_port_status), 1000);
        if (sizeof(vub300->system_port_status) == retval)
                new_system_port_status(vub300);
 }
@@ -1241,7 +1241,7 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300,
                                                SET_INTERRUPT_PSEUDOCODE,
                                                USB_DIR_OUT | USB_TYPE_VENDOR |
                                                USB_RECIP_DEVICE, 0x0000, 0x0000,
-                                               xfer_buffer, xfer_length, HZ);
+                                               xfer_buffer, xfer_length, 1000);
                        kfree(xfer_buffer);
                        if (retval < 0)
                                goto copy_error_message;
@@ -1284,7 +1284,7 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300,
                                                SET_TRANSFER_PSEUDOCODE,
                                                USB_DIR_OUT | USB_TYPE_VENDOR |
                                                USB_RECIP_DEVICE, 0x0000, 0x0000,
-                                               xfer_buffer, xfer_length, HZ);
+                                               xfer_buffer, xfer_length, 1000);
                        kfree(xfer_buffer);
                        if (retval < 0)
                                goto copy_error_message;
@@ -1991,7 +1991,7 @@ static void __set_clock_speed(struct vub300_mmc_host *vub300, u8 buf[8],
                usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0),
                                SET_CLOCK_SPEED,
                                USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-                               0x00, 0x00, buf, buf_array_size, HZ);
+                               0x00, 0x00, buf, buf_array_size, 1000);
        if (retval != 8) {
                dev_err(&vub300->udev->dev, "SET_CLOCK_SPEED"
                        " %dkHz failed with retval=%d\n", kHzClock, retval);
@@ -2013,14 +2013,14 @@ static void vub300_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
                usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0),
                                SET_SD_POWER,
                                USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-                               0x0000, 0x0000, NULL, 0, HZ);
+                               0x0000, 0x0000, NULL, 0, 1000);
                /* must wait for the VUB300 u-proc to boot up */
                msleep(600);
        } else if ((ios->power_mode == MMC_POWER_UP) && !vub300->card_powered) {
                usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0),
                                SET_SD_POWER,
                                USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-                               0x0001, 0x0000, NULL, 0, HZ);
+                               0x0001, 0x0000, NULL, 0, 1000);
                msleep(600);
                vub300->card_powered = 1;
        } else if (ios->power_mode == MMC_POWER_ON) {
@@ -2275,14 +2275,14 @@ static int vub300_probe(struct usb_interface *interface,
                                GET_HC_INF0,
                                USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                0x0000, 0x0000, &vub300->hc_info,
-                               sizeof(vub300->hc_info), HZ);
+                               sizeof(vub300->hc_info), 1000);
        if (retval < 0)
                goto error5;
        retval =
                usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0),
                                SET_ROM_WAIT_STATES,
                                USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-                               firmware_rom_wait_states, 0x0000, NULL, 0, HZ);
+                               firmware_rom_wait_states, 0x0000, NULL, 0, 1000);
        if (retval < 0)
                goto error5;
        dev_info(&vub300->udev->dev,
@@ -2297,7 +2297,7 @@ static int vub300_probe(struct usb_interface *interface,
                                GET_SYSTEM_PORT_STATUS,
                                USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                0x0000, 0x0000, &vub300->system_port_status,
-                               sizeof(vub300->system_port_status), HZ);
+                               sizeof(vub300->system_port_status), 1000);
        if (retval < 0) {
                goto error4;
        } else if (sizeof(vub300->system_port_status) == retval) {
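
The HZ -> 1000 changes in this file exist because usb_control_msg()'s final argument is a timeout in milliseconds, while HZ is the jiffies-per-second rate (often 100 or 250), so passing HZ made the timeout depend on the kernel config. A minimal sketch of a fixed one-second control transfer, with illustrative request values:

#include <linux/usb.h>

/* One-second timeout, independent of CONFIG_HZ. */
static int example_vendor_read(struct usb_device *udev, u8 request,
                               void *buf, u16 len)
{
        return usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), request,
                               USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                               0x0000, 0x0000, buf, len, 1000 /* ms */);
}
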
index b8ae1ec..4eaba6f 100644 (file)
@@ -384,7 +384,9 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
        if (new->readonly)
                set_disk_ro(gd, 1);
 
-       device_add_disk(&new->mtd->dev, gd, NULL);
+       ret = device_add_disk(&new->mtd->dev, gd, NULL);
+       if (ret)
+               goto out_cleanup_disk;
 
        if (new->disk_attributes) {
                ret = sysfs_create_group(&disk_to_dev(gd)->kobj,
@@ -393,6 +395,8 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
        }
        return 0;
 
+out_cleanup_disk:
+       blk_cleanup_disk(new->disk);
 out_free_tag_set:
        blk_mq_free_tag_set(new->tag_set);
 out_kfree_tag_set:
index 38b6aa8..5ff0011 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/major.h>
 #include <linux/backing-dev.h>
+#include <linux/blkdev.h>
 #include <linux/fs_context.h>
 #include "mtdcore.h"
 
index 2b66c59..e54f962 100644 (file)
@@ -137,7 +137,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = {
                .name = "uc",
                .cmd = HNAE3_DBG_CMD_MAC_UC,
                .dentry = HNS3_DBG_DENTRY_MAC,
-               .buf_len = HNS3_DBG_READ_LEN,
+               .buf_len = HNS3_DBG_READ_LEN_128KB,
                .init = hns3_dbg_common_file_init,
        },
        {
@@ -256,7 +256,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = {
                .name = "tqp",
                .cmd = HNAE3_DBG_CMD_REG_TQP,
                .dentry = HNS3_DBG_DENTRY_REG,
-               .buf_len = HNS3_DBG_READ_LEN,
+               .buf_len = HNS3_DBG_READ_LEN_128KB,
                .init = hns3_dbg_common_file_init,
        },
        {
@@ -298,7 +298,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = {
                .name = "fd_tcam",
                .cmd = HNAE3_DBG_CMD_FD_TCAM,
                .dentry = HNS3_DBG_DENTRY_FD,
-               .buf_len = HNS3_DBG_READ_LEN,
+               .buf_len = HNS3_DBG_READ_LEN_1MB,
                .init = hns3_dbg_common_file_init,
        },
        {
@@ -462,7 +462,7 @@ static const struct hns3_dbg_item rx_queue_info_items[] = {
        { "TAIL", 2 },
        { "HEAD", 2 },
        { "FBDNUM", 2 },
-       { "PKTNUM", 2 },
+       { "PKTNUM", 5 },
        { "COPYBREAK", 2 },
        { "RING_EN", 2 },
        { "RX_RING_EN", 2 },
@@ -565,7 +565,7 @@ static const struct hns3_dbg_item tx_queue_info_items[] = {
        { "HEAD", 2 },
        { "FBDNUM", 2 },
        { "OFFSET", 2 },
-       { "PKTNUM", 2 },
+       { "PKTNUM", 5 },
        { "RING_EN", 2 },
        { "TX_RING_EN", 2 },
        { "BASE_ADDR", 10 },
@@ -790,13 +790,13 @@ static int hns3_dbg_rx_bd_info(struct hns3_dbg_data *d, char *buf, int len)
 }
 
 static const struct hns3_dbg_item tx_bd_info_items[] = {
-       { "BD_IDX", 5 },
-       { "ADDRESS", 2 },
+       { "BD_IDX", 2 },
+       { "ADDRESS", 13 },
        { "VLAN_TAG", 2 },
        { "SIZE", 2 },
        { "T_CS_VLAN_TSO", 2 },
        { "OT_VLAN_TAG", 3 },
-       { "TV", 2 },
+       { "TV", 5 },
        { "OLT_VLAN_LEN", 2 },
        { "PAYLEN_OL4CS", 2 },
        { "BD_FE_SC_VLD", 2 },
index 32f62cd..9cda8b3 100644 (file)
@@ -391,7 +391,7 @@ static int hclge_dbg_dump_mac(struct hclge_dev *hdev, char *buf, int len)
 static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len,
                                   int *pos)
 {
-       struct hclge_dbg_bitmap_cmd *bitmap;
+       struct hclge_dbg_bitmap_cmd req;
        struct hclge_desc desc;
        u16 qset_id, qset_num;
        int ret;
@@ -408,12 +408,12 @@ static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len,
                if (ret)
                        return ret;
 
-               bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1];
+               req.bitmap = (u8)le32_to_cpu(desc.data[1]);
 
                *pos += scnprintf(buf + *pos, len - *pos,
                                  "%04u           %#x            %#x             %#x               %#x\n",
-                                 qset_id, bitmap->bit0, bitmap->bit1,
-                                 bitmap->bit2, bitmap->bit3);
+                                 qset_id, req.bit0, req.bit1, req.bit2,
+                                 req.bit3);
        }
 
        return 0;
@@ -422,7 +422,7 @@ static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len,
 static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len,
                                  int *pos)
 {
-       struct hclge_dbg_bitmap_cmd *bitmap;
+       struct hclge_dbg_bitmap_cmd req;
        struct hclge_desc desc;
        u8 pri_id, pri_num;
        int ret;
@@ -439,12 +439,11 @@ static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len,
                if (ret)
                        return ret;
 
-               bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1];
+               req.bitmap = (u8)le32_to_cpu(desc.data[1]);
 
                *pos += scnprintf(buf + *pos, len - *pos,
                                  "%03u       %#x           %#x                %#x\n",
-                                 pri_id, bitmap->bit0, bitmap->bit1,
-                                 bitmap->bit2);
+                                 pri_id, req.bit0, req.bit1, req.bit2);
        }
 
        return 0;
@@ -453,7 +452,7 @@ static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len,
 static int hclge_dbg_dump_dcb_pg(struct hclge_dev *hdev, char *buf, int len,
                                 int *pos)
 {
-       struct hclge_dbg_bitmap_cmd *bitmap;
+       struct hclge_dbg_bitmap_cmd req;
        struct hclge_desc desc;
        u8 pg_id;
        int ret;
@@ -466,12 +465,11 @@ static int hclge_dbg_dump_dcb_pg(struct hclge_dev *hdev, char *buf, int len,
                if (ret)
                        return ret;
 
-               bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1];
+               req.bitmap = (u8)le32_to_cpu(desc.data[1]);
 
                *pos += scnprintf(buf + *pos, len - *pos,
                                  "%03u      %#x           %#x               %#x\n",
-                                 pg_id, bitmap->bit0, bitmap->bit1,
-                                 bitmap->bit2);
+                                 pg_id, req.bit0, req.bit1, req.bit2);
        }
 
        return 0;
@@ -511,7 +509,7 @@ static int hclge_dbg_dump_dcb_queue(struct hclge_dev *hdev, char *buf, int len,
 static int hclge_dbg_dump_dcb_port(struct hclge_dev *hdev, char *buf, int len,
                                   int *pos)
 {
-       struct hclge_dbg_bitmap_cmd *bitmap;
+       struct hclge_dbg_bitmap_cmd req;
        struct hclge_desc desc;
        u8 port_id = 0;
        int ret;
@@ -521,12 +519,12 @@ static int hclge_dbg_dump_dcb_port(struct hclge_dev *hdev, char *buf, int len,
        if (ret)
                return ret;
 
-       bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1];
+       req.bitmap = (u8)le32_to_cpu(desc.data[1]);
 
        *pos += scnprintf(buf + *pos, len - *pos, "port_mask: %#x\n",
-                        bitmap->bit0);
+                        req.bit0);
        *pos += scnprintf(buf + *pos, len - *pos, "port_shaping_pass: %#x\n",
-                        bitmap->bit1);
+                        req.bit1);
 
        return 0;
 }
index dcd40cc..d891390 100644 (file)
@@ -2847,33 +2847,29 @@ static void hclge_mbx_task_schedule(struct hclge_dev *hdev)
 {
        if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
            !test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, &hdev->state))
-               mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
-                                   hclge_wq, &hdev->service_task, 0);
+               mod_delayed_work(hclge_wq, &hdev->service_task, 0);
 }
 
 static void hclge_reset_task_schedule(struct hclge_dev *hdev)
 {
        if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
+           test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state) &&
            !test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state))
-               mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
-                                   hclge_wq, &hdev->service_task, 0);
+               mod_delayed_work(hclge_wq, &hdev->service_task, 0);
 }
 
 static void hclge_errhand_task_schedule(struct hclge_dev *hdev)
 {
        if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
            !test_and_set_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state))
-               mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
-                                   hclge_wq, &hdev->service_task, 0);
+               mod_delayed_work(hclge_wq, &hdev->service_task, 0);
 }
 
 void hclge_task_schedule(struct hclge_dev *hdev, unsigned long delay_time)
 {
        if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
            !test_bit(HCLGE_STATE_RST_FAIL, &hdev->state))
-               mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
-                                   hclge_wq, &hdev->service_task,
-                                   delay_time);
+               mod_delayed_work(hclge_wq, &hdev->service_task, delay_time);
 }
 
 static int hclge_get_mac_link_status(struct hclge_dev *hdev, int *link_status)
@@ -3491,33 +3487,14 @@ static void hclge_get_misc_vector(struct hclge_dev *hdev)
        hdev->num_msi_used += 1;
 }
 
-static void hclge_irq_affinity_notify(struct irq_affinity_notify *notify,
-                                     const cpumask_t *mask)
-{
-       struct hclge_dev *hdev = container_of(notify, struct hclge_dev,
-                                             affinity_notify);
-
-       cpumask_copy(&hdev->affinity_mask, mask);
-}
-
-static void hclge_irq_affinity_release(struct kref *ref)
-{
-}
-
 static void hclge_misc_affinity_setup(struct hclge_dev *hdev)
 {
        irq_set_affinity_hint(hdev->misc_vector.vector_irq,
                              &hdev->affinity_mask);
-
-       hdev->affinity_notify.notify = hclge_irq_affinity_notify;
-       hdev->affinity_notify.release = hclge_irq_affinity_release;
-       irq_set_affinity_notifier(hdev->misc_vector.vector_irq,
-                                 &hdev->affinity_notify);
 }
 
 static void hclge_misc_affinity_teardown(struct hclge_dev *hdev)
 {
-       irq_set_affinity_notifier(hdev->misc_vector.vector_irq, NULL);
        irq_set_affinity_hint(hdev->misc_vector.vector_irq, NULL);
 }
 
@@ -13052,7 +13029,7 @@ static int hclge_init(void)
 {
        pr_info("%s is initializing\n", HCLGE_NAME);
 
-       hclge_wq = alloc_workqueue("%s", 0, 0, HCLGE_NAME);
+       hclge_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, HCLGE_NAME);
        if (!hclge_wq) {
                pr_err("%s: failed to create workqueue\n", HCLGE_NAME);
                return -ENOMEM;
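
Together with the mod_delayed_work_on() -> mod_delayed_work() changes earlier in this file, the driver now lets an unbound workqueue pick the CPU instead of pinning service work to the first CPU of the affinity mask; a condensed sketch of the pair (names are illustrative):

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int example_init(void)
{
        /* WQ_UNBOUND: the scheduler, not the queue, decides where work runs. */
        example_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "example");
        return example_wq ? 0 : -ENOMEM;
}

static void example_schedule(struct delayed_work *task, unsigned long delay)
{
        /* was: mod_delayed_work_on(cpumask_first(&affinity_mask), wq, task, delay) */
        mod_delayed_work(example_wq, task, delay);
}
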
index de6afbc..69cd8f8 100644 (file)
@@ -944,7 +944,6 @@ struct hclge_dev {
 
        /* affinity mask and notify for misc interrupt */
        cpumask_t affinity_mask;
-       struct irq_affinity_notify affinity_notify;
        struct hclge_ptp *ptp;
        struct devlink *devlink;
 };
index bef6b98..cf00ad7 100644 (file)
@@ -2232,6 +2232,7 @@ static void hclgevf_get_misc_vector(struct hclgevf_dev *hdev)
 void hclgevf_reset_task_schedule(struct hclgevf_dev *hdev)
 {
        if (!test_bit(HCLGEVF_STATE_REMOVING, &hdev->state) &&
+           test_bit(HCLGEVF_STATE_SERVICE_INITED, &hdev->state) &&
            !test_and_set_bit(HCLGEVF_STATE_RST_SERVICE_SCHED,
                              &hdev->state))
                mod_delayed_work(hclgevf_wq, &hdev->service_task, 0);
@@ -3449,6 +3450,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
 
        hclgevf_init_rxd_adv_layout(hdev);
 
+       set_bit(HCLGEVF_STATE_SERVICE_INITED, &hdev->state);
+
        hdev->last_reset_time = jiffies;
        dev_info(&hdev->pdev->dev, "finished initializing %s driver\n",
                 HCLGEVF_DRIVER_NAME);
@@ -3899,7 +3902,7 @@ static int hclgevf_init(void)
 {
        pr_info("%s is initializing\n", HCLGEVF_NAME);
 
-       hclgevf_wq = alloc_workqueue("%s", 0, 0, HCLGEVF_NAME);
+       hclgevf_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, HCLGEVF_NAME);
        if (!hclgevf_wq) {
                pr_err("%s: failed to create workqueue\n", HCLGEVF_NAME);
                return -ENOMEM;
index 883130a..28288d7 100644 (file)
@@ -146,6 +146,7 @@ enum hclgevf_states {
        HCLGEVF_STATE_REMOVING,
        HCLGEVF_STATE_NIC_REGISTERED,
        HCLGEVF_STATE_ROCE_REGISTERED,
+       HCLGEVF_STATE_SERVICE_INITED,
        /* task states */
        HCLGEVF_STATE_RST_SERVICE_SCHED,
        HCLGEVF_STATE_RST_HANDLING,
index 37c18c6..e375ac8 100644 (file)
@@ -100,9 +100,9 @@ static void ice_display_lag_info(struct ice_lag *lag)
  */
 static void ice_lag_info_event(struct ice_lag *lag, void *ptr)
 {
-       struct net_device *event_netdev, *netdev_tmp;
        struct netdev_notifier_bonding_info *info;
        struct netdev_bonding_info *bonding_info;
+       struct net_device *event_netdev;
        const char *lag_netdev_name;
 
        event_netdev = netdev_notifier_info_to_dev(ptr);
@@ -123,19 +123,6 @@ static void ice_lag_info_event(struct ice_lag *lag, void *ptr)
                goto lag_out;
        }
 
-       rcu_read_lock();
-       for_each_netdev_in_bond_rcu(lag->upper_netdev, netdev_tmp) {
-               if (!netif_is_ice(netdev_tmp))
-                       continue;
-
-               if (netdev_tmp && netdev_tmp != lag->netdev &&
-                   lag->peer_netdev != netdev_tmp) {
-                       dev_hold(netdev_tmp);
-                       lag->peer_netdev = netdev_tmp;
-               }
-       }
-       rcu_read_unlock();
-
        if (bonding_info->slave.state)
                ice_lag_set_backup(lag);
        else
@@ -319,6 +306,9 @@ ice_lag_event_handler(struct notifier_block *notif_blk, unsigned long event,
        case NETDEV_BONDING_INFO:
                ice_lag_info_event(lag, ptr);
                break;
+       case NETDEV_UNREGISTER:
+               ice_lag_unlink(lag, ptr);
+               break;
        default:
                break;
        }
index 80380ae..d1ef3d4 100644 (file)
@@ -1571,6 +1571,9 @@ err_kworker:
  */
 void ice_ptp_release(struct ice_pf *pf)
 {
+       if (!test_bit(ICE_FLAG_PTP, pf->flags))
+               return;
+
        /* Disable timestamping for both Tx and Rx */
        ice_ptp_cfg_timestamp(pf, false);
 
index 9338765..49d822a 100644 (file)
@@ -226,18 +226,85 @@ static const struct file_operations rvu_dbg_##name##_fops = { \
 
 static void print_nix_qsize(struct seq_file *filp, struct rvu_pfvf *pfvf);
 
+static void get_lf_str_list(struct rvu_block block, int pcifunc,
+                           char *lfs)
+{
+       int lf = 0, seq = 0, len = 0, prev_lf = block.lf.max;
+
+       for_each_set_bit(lf, block.lf.bmap, block.lf.max) {
+               if (lf >= block.lf.max)
+                       break;
+
+               if (block.fn_map[lf] != pcifunc)
+                       continue;
+
+               if (lf == prev_lf + 1) {
+                       prev_lf = lf;
+                       seq = 1;
+                       continue;
+               }
+
+               if (seq)
+                       len += sprintf(lfs + len, "-%d,%d", prev_lf, lf);
+               else
+                       len += (len ? sprintf(lfs + len, ",%d", lf) :
+                                     sprintf(lfs + len, "%d", lf));
+
+               prev_lf = lf;
+               seq = 0;
+       }
+
+       if (seq)
+               len += sprintf(lfs + len, "-%d", prev_lf);
+
+       lfs[len] = '\0';
+}
+
+static int get_max_column_width(struct rvu *rvu)
+{
+       int index, pf, vf, lf_str_size = 12, buf_size = 256;
+       struct rvu_block block;
+       u16 pcifunc;
+       char *buf;
+
+       buf = kzalloc(buf_size, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
+               for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
+                       pcifunc = pf << 10 | vf;
+                       if (!pcifunc)
+                               continue;
+
+                       for (index = 0; index < BLK_COUNT; index++) {
+                               block = rvu->hw->block[index];
+                               if (!strlen(block.name))
+                                       continue;
+
+                               get_lf_str_list(block, pcifunc, buf);
+                               if (lf_str_size <= strlen(buf))
+                                       lf_str_size = strlen(buf) + 1;
+                       }
+               }
+       }
+
+       kfree(buf);
+       return lf_str_size;
+}
+
 /* Dumps current provisioning status of all RVU block LFs */
 static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
                                          char __user *buffer,
                                          size_t count, loff_t *ppos)
 {
-       int index, off = 0, flag = 0, go_back = 0, len = 0;
+       int index, off = 0, flag = 0, len = 0, i = 0;
        struct rvu *rvu = filp->private_data;
-       int lf, pf, vf, pcifunc;
+       int bytes_not_copied = 0;
        struct rvu_block block;
-       int bytes_not_copied;
-       int lf_str_size = 12;
+       int pf, vf, pcifunc;
        int buf_size = 2048;
+       int lf_str_size;
        char *lfs;
        char *buf;
 
@@ -249,6 +316,9 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
        if (!buf)
                return -ENOSPC;
 
+       /* Get the maximum width of a column */
+       lf_str_size = get_max_column_width(rvu);
+
        lfs = kzalloc(lf_str_size, GFP_KERNEL);
        if (!lfs) {
                kfree(buf);
@@ -262,65 +332,69 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
                                         "%-*s", lf_str_size,
                                         rvu->hw->block[index].name);
                }
+
        off += scnprintf(&buf[off], buf_size - 1 - off, "\n");
+       bytes_not_copied = copy_to_user(buffer + (i * off), buf, off);
+       if (bytes_not_copied)
+               goto out;
+
+       i++;
+       *ppos += off;
        for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
                for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
+                       off = 0;
+                       flag = 0;
                        pcifunc = pf << 10 | vf;
                        if (!pcifunc)
                                continue;
 
                        if (vf) {
                                sprintf(lfs, "PF%d:VF%d", pf, vf - 1);
-                               go_back = scnprintf(&buf[off],
-                                                   buf_size - 1 - off,
-                                                   "%-*s", lf_str_size, lfs);
+                               off = scnprintf(&buf[off],
+                                               buf_size - 1 - off,
+                                               "%-*s", lf_str_size, lfs);
                        } else {
                                sprintf(lfs, "PF%d", pf);
-                               go_back = scnprintf(&buf[off],
-                                                   buf_size - 1 - off,
-                                                   "%-*s", lf_str_size, lfs);
+                               off = scnprintf(&buf[off],
+                                               buf_size - 1 - off,
+                                               "%-*s", lf_str_size, lfs);
                        }
 
-                       off += go_back;
-                       for (index = 0; index < BLKTYPE_MAX; index++) {
+                       for (index = 0; index < BLK_COUNT; index++) {
                                block = rvu->hw->block[index];
                                if (!strlen(block.name))
                                        continue;
                                len = 0;
                                lfs[len] = '\0';
-                               for (lf = 0; lf < block.lf.max; lf++) {
-                                       if (block.fn_map[lf] != pcifunc)
-                                               continue;
+                               get_lf_str_list(block, pcifunc, lfs);
+                               if (strlen(lfs))
                                        flag = 1;
-                                       len += sprintf(&lfs[len], "%d,", lf);
-                               }
 
-                               if (flag)
-                                       len--;
-                               lfs[len] = '\0';
                                off += scnprintf(&buf[off], buf_size - 1 - off,
                                                 "%-*s", lf_str_size, lfs);
-                               if (!strlen(lfs))
-                                       go_back += lf_str_size;
                        }
-                       if (!flag)
-                               off -= go_back;
-                       else
-                               flag = 0;
-                       off--;
-                       off +=  scnprintf(&buf[off], buf_size - 1 - off, "\n");
+                       if (flag) {
+                               off +=  scnprintf(&buf[off],
+                                                 buf_size - 1 - off, "\n");
+                               bytes_not_copied = copy_to_user(buffer +
+                                                               (i * off),
+                                                               buf, off);
+                               if (bytes_not_copied)
+                                       goto out;
+
+                               i++;
+                               *ppos += off;
+                       }
                }
        }
 
-       bytes_not_copied = copy_to_user(buffer, buf, off);
+out:
        kfree(lfs);
        kfree(buf);
-
        if (bytes_not_copied)
                return -EFAULT;
 
-       *ppos = off;
-       return off;
+       return *ppos;
 }
 
 RVU_DEBUG_FOPS(rsrc_status, rsrc_attach_status, NULL);
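
The rewritten dump above leans on two things: get_max_column_width() pre-computes the widest LF list, and every cell is then padded with printf's dynamic field width ("%-*s"). A minimal user-space sketch of that padding idiom, with made-up row data rather than real driver output:

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Hypothetical LF lists; the driver builds these with get_lf_str_list(). */
        const char *owner[] = { "PF0", "PF1:VF0", "PF2" };
        const char *lfs[]   = { "0,1,2", "3", "4,5,6,7,8,9,10,11" };
        int width = 12, i;

        /* First pass: widest cell plus a trailing NUL, as in get_max_column_width(). */
        for (i = 0; i < 3; i++)
                if ((int)strlen(lfs[i]) + 1 > width)
                        width = (int)strlen(lfs[i]) + 1;

        /* Second pass: "%-*s" left-justifies every cell to that width. */
        printf("%-*s%-*s\n", width, "pcifunc", width, "NPA");
        for (i = 0; i < 3; i++)
                printf("%-*s%-*s\n", width, owner[i], width, lfs[i]);
        return 0;
}
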
@@ -504,7 +578,7 @@ static ssize_t rvu_dbg_qsize_write(struct file *filp,
        if (cmd_buf)
                ret = -EINVAL;
 
-       if (!strncmp(subtoken, "help", 4) || ret < 0) {
+       if (ret < 0 || !strncmp(subtoken, "help", 4)) {
                dev_info(rvu->dev, "Use echo <%s-lf > qsize\n", blk_string);
                goto qsize_write_done;
        }
@@ -1719,6 +1793,10 @@ static int rvu_dbg_nix_band_prof_ctx_display(struct seq_file *m, void *unused)
        u16 pcifunc;
        char *str;
 
+       /* Ingress policers do not exist on all platforms */
+       if (!nix_hw->ipolicer)
+               return 0;
+
        for (layer = 0; layer < BAND_PROF_NUM_LAYERS; layer++) {
                if (layer == BAND_PROF_INVAL_LAYER)
                        continue;
@@ -1768,6 +1846,10 @@ static int rvu_dbg_nix_band_prof_rsrc_display(struct seq_file *m, void *unused)
        int layer;
        char *str;
 
+       /* Ingress policers do not exist on all platforms */
+       if (!nix_hw->ipolicer)
+               return 0;
+
        seq_puts(m, "\nBandwidth profile resource free count\n");
        seq_puts(m, "=====================================\n");
        for (layer = 0; layer < BAND_PROF_NUM_LAYERS; layer++) {
index 9ef4e94..6970540 100644 (file)
@@ -2507,6 +2507,9 @@ static void nix_free_tx_vtag_entries(struct rvu *rvu, u16 pcifunc)
                return;
 
        nix_hw = get_nix_hw(rvu->hw, blkaddr);
+       if (!nix_hw)
+               return;
+
        vlan = &nix_hw->txvlan;
 
        mutex_lock(&vlan->rsrc_lock);
index 13b0259..fcace73 100644 (file)
@@ -353,13 +353,10 @@ static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci,
        struct sk_buff *skb;
        int err;
 
-       elem_info->u.rdq.skb = NULL;
        skb = netdev_alloc_skb_ip_align(NULL, buf_len);
        if (!skb)
                return -ENOMEM;
 
-       /* Assume that wqe was previously zeroed. */
-
        err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data,
                                     buf_len, DMA_FROM_DEVICE);
        if (err)
@@ -597,21 +594,26 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
        struct pci_dev *pdev = mlxsw_pci->pdev;
        struct mlxsw_pci_queue_elem_info *elem_info;
        struct mlxsw_rx_info rx_info = {};
-       char *wqe;
+       char wqe[MLXSW_PCI_WQE_SIZE];
        struct sk_buff *skb;
        u16 byte_count;
        int err;
 
        elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
-       skb = elem_info->u.sdq.skb;
-       if (!skb)
-               return;
-       wqe = elem_info->elem;
-       mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+       skb = elem_info->u.rdq.skb;
+       memcpy(wqe, elem_info->elem, MLXSW_PCI_WQE_SIZE);
 
        if (q->consumer_counter++ != consumer_counter_limit)
                dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n");
 
+       err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info);
+       if (err) {
+               dev_err_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
+               goto out;
+       }
+
+       mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+
        if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) {
                rx_info.is_lag = true;
                rx_info.u.lag_id = mlxsw_pci_cqe_lag_id_get(cqe_v, cqe);
@@ -647,10 +649,7 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
        skb_put(skb, byte_count);
        mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info);
 
-       memset(wqe, 0, q->elem_size);
-       err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info);
-       if (err)
-               dev_dbg_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
+out:
        /* Everything is set up, ring doorbell to pass elem to HW */
        q->producer_counter++;
        mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);
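
The reshuffled RDQ completion above follows a common receive-ring rule: arm the descriptor with a fresh buffer before the completed one is handed to the stack, and if the allocation fails keep the old buffer in place and drop the packet instead of leaving a hole in the ring. A rough, self-contained sketch of that ordering (plain user-space C with hypothetical names, not the mlxsw API):

#include <stdio.h>
#include <stdlib.h>

struct rx_slot { void *buf; };

/* Stand-in for handing a completed buffer up the stack. */
static void deliver(void *buf)
{
        printf("delivered buffer %p\n", buf);
        free(buf);
}

static void rx_complete_one(struct rx_slot *slot)
{
        void *done  = slot->buf;
        void *fresh = malloc(2048);

        if (!fresh)
                return;          /* drop: the slot keeps its old, still-valid buffer */

        slot->buf = fresh;       /* the ring never points at a buffer we gave away */
        deliver(done);
}

int main(void)
{
        struct rx_slot slot = { .buf = malloc(2048) };

        rx_complete_one(&slot);
        free(slot.buf);
        return 0;
}
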
index 9e8561c..4d5a5d6 100644 (file)
@@ -1743,6 +1743,16 @@ static int lan743x_tx_ring_init(struct lan743x_tx *tx)
                ret = -EINVAL;
                goto cleanup;
        }
+       if (dma_set_mask_and_coherent(&tx->adapter->pdev->dev,
+                                     DMA_BIT_MASK(64))) {
+               if (dma_set_mask_and_coherent(&tx->adapter->pdev->dev,
+                                             DMA_BIT_MASK(32))) {
+                       dev_warn(&tx->adapter->pdev->dev,
+                                "lan743x_: No suitable DMA available\n");
+                       ret = -ENOMEM;
+                       goto cleanup;
+               }
+       }
        ring_allocation_size = ALIGN(tx->ring_size *
                                     sizeof(struct lan743x_tx_descriptor),
                                     PAGE_SIZE);
@@ -1934,7 +1944,8 @@ static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index)
                                  index);
 }
 
-static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index)
+static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index,
+                                       gfp_t gfp)
 {
        struct net_device *netdev = rx->adapter->netdev;
        struct device *dev = &rx->adapter->pdev->dev;
@@ -1948,7 +1959,7 @@ static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index)
 
        descriptor = &rx->ring_cpu_ptr[index];
        buffer_info = &rx->buffer_info[index];
-       skb = __netdev_alloc_skb(netdev, buffer_length, GFP_ATOMIC | GFP_DMA);
+       skb = __netdev_alloc_skb(netdev, buffer_length, gfp);
        if (!skb)
                return -ENOMEM;
        dma_ptr = dma_map_single(dev, skb->data, buffer_length, DMA_FROM_DEVICE);
@@ -2110,7 +2121,8 @@ static int lan743x_rx_process_buffer(struct lan743x_rx *rx)
 
        /* save existing skb, allocate new skb and map to dma */
        skb = buffer_info->skb;
-       if (lan743x_rx_init_ring_element(rx, rx->last_head)) {
+       if (lan743x_rx_init_ring_element(rx, rx->last_head,
+                                        GFP_ATOMIC | GFP_DMA)) {
                /* failed to allocate next skb.
                 * Memory is very low.
                 * Drop this packet and reuse buffer.
@@ -2276,6 +2288,16 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx)
                ret = -EINVAL;
                goto cleanup;
        }
+       if (dma_set_mask_and_coherent(&rx->adapter->pdev->dev,
+                                     DMA_BIT_MASK(64))) {
+               if (dma_set_mask_and_coherent(&rx->adapter->pdev->dev,
+                                             DMA_BIT_MASK(32))) {
+                       dev_warn(&rx->adapter->pdev->dev,
+                                "lan743x_: No suitable DMA available\n");
+                       ret = -ENOMEM;
+                       goto cleanup;
+               }
+       }
        ring_allocation_size = ALIGN(rx->ring_size *
                                     sizeof(struct lan743x_rx_descriptor),
                                     PAGE_SIZE);
@@ -2315,13 +2337,16 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx)
 
        rx->last_head = 0;
        for (index = 0; index < rx->ring_size; index++) {
-               ret = lan743x_rx_init_ring_element(rx, index);
+               ret = lan743x_rx_init_ring_element(rx, index, GFP_KERNEL);
                if (ret)
                        goto cleanup;
        }
        return 0;
 
 cleanup:
+       netif_warn(rx->adapter, ifup, rx->adapter->netdev,
+                  "Error allocating memory for LAN743x\n");
+
        lan743x_rx_ring_cleanup(rx);
        return ret;
 }
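
The two ring-init hunks above use the standard 64-bit-then-32-bit DMA mask negotiation. The idiom reduced to a small probe-time helper, as a kernel-side fragment rather than a standalone program (the function name is a placeholder; dma_set_mask_and_coherent() is the real API and returns 0 on success):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static int example_set_dma_mask(struct pci_dev *pdev)
{
        /* Prefer full 64-bit addressing, fall back to 32-bit if the platform can't. */
        if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)))
                return 0;
        if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
                return 0;

        dev_warn(&pdev->dev, "no suitable DMA mask available\n");
        return -ENOMEM;
}

Many drivers do this once in probe; lan743x repeats it per ring here, presumably so each ring-init path fails early with its own warning.
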
@@ -3019,6 +3044,8 @@ static int lan743x_pm_resume(struct device *dev)
        if (ret) {
                netif_err(adapter, probe, adapter->netdev,
                          "lan743x_hardware_init returned %d\n", ret);
+               lan743x_pci_cleanup(adapter);
+               return ret;
        }
 
        /* open netdev when netdev is at running state while resume.
index 11c83a9..f469950 100644 (file)
@@ -182,15 +182,21 @@ static int
 nfp_bpf_check_mtu(struct nfp_app *app, struct net_device *netdev, int new_mtu)
 {
        struct nfp_net *nn = netdev_priv(netdev);
-       unsigned int max_mtu;
+       struct nfp_bpf_vnic *bv;
+       struct bpf_prog *prog;
 
        if (~nn->dp.ctrl & NFP_NET_CFG_CTRL_BPF)
                return 0;
 
-       max_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32;
-       if (new_mtu > max_mtu) {
-               nn_info(nn, "BPF offload active, MTU over %u not supported\n",
-                       max_mtu);
+       if (nn->xdp_hw.prog) {
+               prog = nn->xdp_hw.prog;
+       } else {
+               bv = nn->app_priv;
+               prog = bv->tc_prog;
+       }
+
+       if (nfp_bpf_offload_check_mtu(nn, prog, new_mtu)) {
+               nn_info(nn, "BPF offload active, potential packet access beyond hardware packet boundary");
                return -EBUSY;
        }
        return 0;
index d0e17ee..16841bb 100644 (file)
@@ -560,6 +560,8 @@ bool nfp_is_subprog_start(struct nfp_insn_meta *meta);
 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog);
 int nfp_bpf_jit(struct nfp_prog *prog);
 bool nfp_bpf_supported_opcode(u8 code);
+bool nfp_bpf_offload_check_mtu(struct nfp_net *nn, struct bpf_prog *prog,
+                              unsigned int mtu);
 
 int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx,
                    int prev_insn_idx);
index 5385185..9d97cd2 100644 (file)
@@ -481,19 +481,28 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data,
        return 0;
 }
 
+bool nfp_bpf_offload_check_mtu(struct nfp_net *nn, struct bpf_prog *prog,
+                              unsigned int mtu)
+{
+       unsigned int fw_mtu, pkt_off;
+
+       fw_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32;
+       pkt_off = min(prog->aux->max_pkt_offset, mtu);
+
+       return fw_mtu < pkt_off;
+}
+
 static int
 nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog,
                 struct netlink_ext_ack *extack)
 {
        struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
-       unsigned int fw_mtu, pkt_off, max_stack, max_prog_len;
+       unsigned int max_stack, max_prog_len;
        dma_addr_t dma_addr;
        void *img;
        int err;
 
-       fw_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32;
-       pkt_off = min(prog->aux->max_pkt_offset, nn->dp.netdev->mtu);
-       if (fw_mtu < pkt_off) {
+       if (nfp_bpf_offload_check_mtu(nn, prog, nn->dp.netdev->mtu)) {
                NL_SET_ERR_MSG_MOD(extack, "BPF offload not supported with potential packet access beyond HW packet split boundary");
                return -EOPNOTSUPP;
        }
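
To make the numbers in nfp_bpf_offload_check_mtu() concrete: the firmware limit is reg * 64 - 32 bytes, and the check fires when min(max_pkt_offset, mtu) exceeds it. A runnable toy calculation with made-up inputs (the register value 64 is purely illustrative, not a documented default):

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned int reg            = 64;              /* pretend NFP_NET_CFG_BPF_INL_MTU readout */
        unsigned int fw_mtu         = reg * 64 - 32;   /* 64 * 64 - 32 = 4064 */
        unsigned int max_pkt_offset = 4096;            /* from the BPF program (made up) */
        unsigned int mtu            = 9000;            /* requested MTU (made up) */
        unsigned int pkt_off        = min_u(max_pkt_offset, mtu);

        printf("fw_mtu=%u pkt_off=%u -> %s\n", fw_mtu, pkt_off,
               fw_mtu < pkt_off ? "rejected" : "allowed");
        return 0;
}
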
index d29fe56..c910fa2 100644 (file)
@@ -1015,9 +1015,6 @@ static int lpc_eth_close(struct net_device *ndev)
        napi_disable(&pldat->napi);
        netif_stop_queue(ndev);
 
-       if (ndev->phydev)
-               phy_stop(ndev->phydev);
-
        spin_lock_irqsave(&pldat->lock, flags);
        __lpc_eth_reset(pldat);
        netif_carrier_off(ndev);
@@ -1025,6 +1022,8 @@ static int lpc_eth_close(struct net_device *ndev)
        writel(0, LPC_ENET_MAC2(pldat->net_base));
        spin_unlock_irqrestore(&pldat->lock, flags);
 
+       if (ndev->phydev)
+               phy_stop(ndev->phydev);
        clk_disable_unprepare(pldat->clk);
 
        return 0;
index 46a6ff9..2918947 100644 (file)
@@ -157,6 +157,7 @@ static const struct pci_device_id rtl8169_pci_tbl[] = {
        { PCI_VDEVICE(REALTEK,  0x8129) },
        { PCI_VDEVICE(REALTEK,  0x8136), RTL_CFG_NO_GBIT },
        { PCI_VDEVICE(REALTEK,  0x8161) },
+       { PCI_VDEVICE(REALTEK,  0x8162) },
        { PCI_VDEVICE(REALTEK,  0x8167) },
        { PCI_VDEVICE(REALTEK,  0x8168) },
        { PCI_VDEVICE(NCUBE,    0x8168) },
index f124a8a..a3bfb15 100644 (file)
@@ -243,62 +243,10 @@ static void phy_sanitize_settings(struct phy_device *phydev)
        }
 }
 
-int phy_ethtool_ksettings_set(struct phy_device *phydev,
-                             const struct ethtool_link_ksettings *cmd)
-{
-       __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
-       u8 autoneg = cmd->base.autoneg;
-       u8 duplex = cmd->base.duplex;
-       u32 speed = cmd->base.speed;
-
-       if (cmd->base.phy_address != phydev->mdio.addr)
-               return -EINVAL;
-
-       linkmode_copy(advertising, cmd->link_modes.advertising);
-
-       /* We make sure that we don't pass unsupported values in to the PHY */
-       linkmode_and(advertising, advertising, phydev->supported);
-
-       /* Verify the settings we care about. */
-       if (autoneg != AUTONEG_ENABLE && autoneg != AUTONEG_DISABLE)
-               return -EINVAL;
-
-       if (autoneg == AUTONEG_ENABLE && linkmode_empty(advertising))
-               return -EINVAL;
-
-       if (autoneg == AUTONEG_DISABLE &&
-           ((speed != SPEED_1000 &&
-             speed != SPEED_100 &&
-             speed != SPEED_10) ||
-            (duplex != DUPLEX_HALF &&
-             duplex != DUPLEX_FULL)))
-               return -EINVAL;
-
-       phydev->autoneg = autoneg;
-
-       if (autoneg == AUTONEG_DISABLE) {
-               phydev->speed = speed;
-               phydev->duplex = duplex;
-       }
-
-       linkmode_copy(phydev->advertising, advertising);
-
-       linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
-                        phydev->advertising, autoneg == AUTONEG_ENABLE);
-
-       phydev->master_slave_set = cmd->base.master_slave_cfg;
-       phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl;
-
-       /* Restart the PHY */
-       phy_start_aneg(phydev);
-
-       return 0;
-}
-EXPORT_SYMBOL(phy_ethtool_ksettings_set);
-
 void phy_ethtool_ksettings_get(struct phy_device *phydev,
                               struct ethtool_link_ksettings *cmd)
 {
+       mutex_lock(&phydev->lock);
        linkmode_copy(cmd->link_modes.supported, phydev->supported);
        linkmode_copy(cmd->link_modes.advertising, phydev->advertising);
        linkmode_copy(cmd->link_modes.lp_advertising, phydev->lp_advertising);
@@ -317,6 +265,7 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev,
        cmd->base.autoneg = phydev->autoneg;
        cmd->base.eth_tp_mdix_ctrl = phydev->mdix_ctrl;
        cmd->base.eth_tp_mdix = phydev->mdix;
+       mutex_unlock(&phydev->lock);
 }
 EXPORT_SYMBOL(phy_ethtool_ksettings_get);
 
@@ -751,7 +700,7 @@ static int phy_check_link_status(struct phy_device *phydev)
 }
 
 /**
- * phy_start_aneg - start auto-negotiation for this PHY device
+ * _phy_start_aneg - start auto-negotiation for this PHY device
  * @phydev: the phy_device struct
  *
  * Description: Sanitizes the settings (if we're not autonegotiating
@@ -759,25 +708,43 @@ static int phy_check_link_status(struct phy_device *phydev)
  *   If the PHYCONTROL Layer is operating, we change the state to
  *   reflect the beginning of Auto-negotiation or forcing.
  */
-int phy_start_aneg(struct phy_device *phydev)
+static int _phy_start_aneg(struct phy_device *phydev)
 {
        int err;
 
+       lockdep_assert_held(&phydev->lock);
+
        if (!phydev->drv)
                return -EIO;
 
-       mutex_lock(&phydev->lock);
-
        if (AUTONEG_DISABLE == phydev->autoneg)
                phy_sanitize_settings(phydev);
 
        err = phy_config_aneg(phydev);
        if (err < 0)
-               goto out_unlock;
+               return err;
 
        if (phy_is_started(phydev))
                err = phy_check_link_status(phydev);
-out_unlock:
+
+       return err;
+}
+
+/**
+ * phy_start_aneg - start auto-negotiation for this PHY device
+ * @phydev: the phy_device struct
+ *
+ * Description: Sanitizes the settings (if we're not autonegotiating
+ *   them), and then calls the driver's config_aneg function.
+ *   If the PHYCONTROL Layer is operating, we change the state to
+ *   reflect the beginning of Auto-negotiation or forcing.
+ */
+int phy_start_aneg(struct phy_device *phydev)
+{
+       int err;
+
+       mutex_lock(&phydev->lock);
+       err = _phy_start_aneg(phydev);
        mutex_unlock(&phydev->lock);
 
        return err;
@@ -800,6 +767,61 @@ static int phy_poll_aneg_done(struct phy_device *phydev)
        return ret < 0 ? ret : 0;
 }
 
+int phy_ethtool_ksettings_set(struct phy_device *phydev,
+                             const struct ethtool_link_ksettings *cmd)
+{
+       __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
+       u8 autoneg = cmd->base.autoneg;
+       u8 duplex = cmd->base.duplex;
+       u32 speed = cmd->base.speed;
+
+       if (cmd->base.phy_address != phydev->mdio.addr)
+               return -EINVAL;
+
+       linkmode_copy(advertising, cmd->link_modes.advertising);
+
+       /* We make sure that we don't pass unsupported values in to the PHY */
+       linkmode_and(advertising, advertising, phydev->supported);
+
+       /* Verify the settings we care about. */
+       if (autoneg != AUTONEG_ENABLE && autoneg != AUTONEG_DISABLE)
+               return -EINVAL;
+
+       if (autoneg == AUTONEG_ENABLE && linkmode_empty(advertising))
+               return -EINVAL;
+
+       if (autoneg == AUTONEG_DISABLE &&
+           ((speed != SPEED_1000 &&
+             speed != SPEED_100 &&
+             speed != SPEED_10) ||
+            (duplex != DUPLEX_HALF &&
+             duplex != DUPLEX_FULL)))
+               return -EINVAL;
+
+       mutex_lock(&phydev->lock);
+       phydev->autoneg = autoneg;
+
+       if (autoneg == AUTONEG_DISABLE) {
+               phydev->speed = speed;
+               phydev->duplex = duplex;
+       }
+
+       linkmode_copy(phydev->advertising, advertising);
+
+       linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+                        phydev->advertising, autoneg == AUTONEG_ENABLE);
+
+       phydev->master_slave_set = cmd->base.master_slave_cfg;
+       phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl;
+
+       /* Restart the PHY */
+       _phy_start_aneg(phydev);
+
+       mutex_unlock(&phydev->lock);
+       return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_ksettings_set);
+
 /**
  * phy_speed_down - set speed to lowest speed supported by both link partners
  * @phydev: the phy_device struct
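
The phy_start_aneg() split above is the usual locked-wrapper/unlocked-worker refactor: the leading-underscore worker asserts the lock and does the work, the public entry point only takes and releases the mutex, and paths that already hold the lock (here phy_ethtool_ksettings_set()) call the worker directly. The shape of the pattern as a kernel-side sketch with placeholder names:

#include <linux/lockdep.h>
#include <linux/mutex.h>

struct widget {
        struct mutex lock;
        int state;
};

/* Worker: callers must already hold widget->lock. */
static int _widget_update(struct widget *w, int state)
{
        lockdep_assert_held(&w->lock);
        w->state = state;
        return 0;
}

/* Public entry point: takes the lock, then delegates to the worker. */
static int widget_update(struct widget *w, int state)
{
        int err;

        mutex_lock(&w->lock);
        err = _widget_update(w, state);
        mutex_unlock(&w->lock);
        return err;
}

In the phy code this lets ksettings_set update speed, duplex and advertising and restart autonegotiation under a single mutex hold, without recursively taking the lock.
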
index 793f8fb..63cd72c 100644 (file)
@@ -4122,6 +4122,12 @@ static int lan78xx_probe(struct usb_interface *intf,
 
        dev->maxpacket = usb_maxpacket(dev->udev, dev->pipe_out, 1);
 
+       /* Reject broken descriptors. */
+       if (dev->maxpacket == 0) {
+               ret = -ENODEV;
+               goto out4;
+       }
+
        /* driver requires remote-wakeup capability during autosuspend. */
        intf->needs_remote_wakeup = 1;
 
index 80432ee..a33d7fb 100644 (file)
@@ -1790,6 +1790,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
        dev->maxpacket = usb_maxpacket (dev->udev, dev->out, 1);
        if (dev->maxpacket == 0) {
                /* that is a broken device */
+               status = -ENODEV;
                goto out4;
        }
 
index 142f706..8799854 100644 (file)
@@ -3833,7 +3833,6 @@ vmxnet3_suspend(struct device *device)
        vmxnet3_free_intr_resources(adapter);
 
        netif_device_detach(netdev);
-       netif_tx_stop_all_queues(netdev);
 
        /* Create wake-up filters. */
        pmConf = adapter->pm_conf;
index e31b984..fc41ba9 100644 (file)
@@ -1730,6 +1730,10 @@ static int netfront_resume(struct xenbus_device *dev)
 
        dev_dbg(&dev->dev, "%s\n", dev->nodename);
 
+       netif_tx_lock_bh(info->netdev);
+       netif_device_detach(info->netdev);
+       netif_tx_unlock_bh(info->netdev);
+
        xennet_disconnect_backend(info);
        return 0;
 }
@@ -2349,6 +2353,10 @@ static int xennet_connect(struct net_device *dev)
         * domain a kick because we've probably just requeued some
         * packets.
         */
+       netif_tx_lock_bh(np->netdev);
+       netif_device_attach(np->netdev);
+       netif_tx_unlock_bh(np->netdev);
+
        netif_carrier_on(np->netdev);
        for (j = 0; j < num_queues; ++j) {
                queue = &np->queues[j];
index 517376c..16ceb76 100644 (file)
@@ -1006,11 +1006,11 @@ static u64 port100_get_command_type_mask(struct port100 *dev)
 
        skb = port100_alloc_skb(dev, 0);
        if (!skb)
-               return -ENOMEM;
+               return 0;
 
        resp = port100_send_cmd_sync(dev, PORT100_CMD_GET_COMMAND_TYPE, skb);
        if (IS_ERR(resp))
-               return PTR_ERR(resp);
+               return 0;
 
        if (resp->len < 8)
                mask = 0;
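
The port100 change above matters because port100_get_command_type_mask() returns a u64 bitmask, not an errno; the old 'return -ENOMEM' and 'return PTR_ERR(resp)' paths handed callers a huge, mostly-all-ones "mask", whereas returning 0 lets the failure read as "no supported command types". A runnable illustration of what the implicit conversion did:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* What the old 'return -ENOMEM;' looked like once stored in a u64 mask. */
        uint64_t mask = (uint64_t)-ENOMEM;

        printf("-ENOMEM seen as a u64 mask: 0x%016llx\n",
               (unsigned long long)mask);          /* 0xfffffffffffffff4 on Linux */
        printf("0 (the fixed return) reports no supported command types\n");
        return 0;
}
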
index 088d3dd..b6c6866 100644 (file)
@@ -162,7 +162,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
        return err;
 }
 
-static blk_qc_t nd_blk_submit_bio(struct bio *bio)
+static void nd_blk_submit_bio(struct bio *bio)
 {
        struct bio_integrity_payload *bip;
        struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data;
@@ -173,7 +173,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio)
        bool do_acct;
 
        if (!bio_integrity_prep(bio))
-               return BLK_QC_T_NONE;
+               return;
 
        bip = bio_integrity(bio);
        rw = bio_data_dir(bio);
@@ -199,7 +199,6 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio)
                bio_end_io_acct(bio, start);
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
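
This hunk, and the matching btt and pmem hunks below, are part of the 5.16 block-layer rework in which ->submit_bio returns void instead of a blk_qc_t; the polling cookie now travels in the bio itself (the nvme multipath hunk further down clears bio->bi_cookie for the same reason). A minimal sketch of a bio-based driver method under the new signature, with placeholder names and a trivial pretend handler:

#include <linux/blkdev.h>
#include <linux/module.h>

struct example_dev {
        /* real driver state would live here */
        int dummy;
};

/* Pretend service routine; a real driver would queue or complete I/O here. */
static bool example_handle(struct example_dev *dev, struct bio *bio)
{
        return true;
}

/* 5.16+: no blk_qc_t return value; just end the bio when done. */
static void example_submit_bio(struct bio *bio)
{
        struct example_dev *dev = bio->bi_bdev->bd_disk->private_data;

        if (!example_handle(dev, bio)) {
                bio_io_error(bio);
                return;
        }
        bio_endio(bio);
}

static const struct block_device_operations example_fops = {
        .owner          = THIS_MODULE,
        .submit_bio     = example_submit_bio,
};
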
index 92dec49..4295fa8 100644 (file)
@@ -1440,7 +1440,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
        return ret;
 }
 
-static blk_qc_t btt_submit_bio(struct bio *bio)
+static void btt_submit_bio(struct bio *bio)
 {
        struct bio_integrity_payload *bip = bio_integrity(bio);
        struct btt *btt = bio->bi_bdev->bd_disk->private_data;
@@ -1451,7 +1451,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio)
        bool do_acct;
 
        if (!bio_integrity_prep(bio))
-               return BLK_QC_T_NONE;
+               return;
 
        do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
        if (do_acct)
@@ -1483,7 +1483,6 @@ static blk_qc_t btt_submit_bio(struct bio *bio)
                bio_end_io_acct(bio, start);
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int btt_rw_page(struct block_device *bdev, sector_t sector,
index 7de592d..6a45fa9 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/device.h>
 #include <linux/ctype.h>
 #include <linux/ndctl.h>
index ef4950f..c74d7bc 100644 (file)
@@ -190,7 +190,7 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem,
        return rc;
 }
 
-static blk_qc_t pmem_submit_bio(struct bio *bio)
+static void pmem_submit_bio(struct bio *bio)
 {
        int ret = 0;
        blk_status_t rc = 0;
@@ -229,7 +229,6 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
                bio->bi_status = errno_to_blk_status(ret);
 
        bio_endio(bio);
-       return BLK_QC_T_NONE;
 }
 
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
@@ -333,26 +332,6 @@ static const struct attribute_group *pmem_attribute_groups[] = {
        NULL,
 };
 
-static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
-{
-       struct pmem_device *pmem = pgmap->owner;
-
-       blk_cleanup_disk(pmem->disk);
-}
-
-static void pmem_release_queue(void *pgmap)
-{
-       pmem_pagemap_cleanup(pgmap);
-}
-
-static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
-{
-       struct request_queue *q =
-               container_of(pgmap->ref, struct request_queue, q_usage_counter);
-
-       blk_freeze_queue_start(q);
-}
-
 static void pmem_release_disk(void *__pmem)
 {
        struct pmem_device *pmem = __pmem;
@@ -360,12 +339,9 @@ static void pmem_release_disk(void *__pmem)
        kill_dax(pmem->dax_dev);
        put_dax(pmem->dax_dev);
        del_gendisk(pmem->disk);
-}
 
-static const struct dev_pagemap_ops fsdax_pagemap_ops = {
-       .kill                   = pmem_pagemap_kill,
-       .cleanup                = pmem_pagemap_cleanup,
-};
+       blk_cleanup_disk(pmem->disk);
+}
 
 static int pmem_attach_disk(struct device *dev,
                struct nd_namespace_common *ndns)
@@ -427,10 +403,8 @@ static int pmem_attach_disk(struct device *dev,
        pmem->disk = disk;
        pmem->pgmap.owner = pmem;
        pmem->pfn_flags = PFN_DEV;
-       pmem->pgmap.ref = &q->q_usage_counter;
        if (is_nd_pfn(dev)) {
                pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
-               pmem->pgmap.ops = &fsdax_pagemap_ops;
                addr = devm_memremap_pages(dev, &pmem->pgmap);
                pfn_sb = nd_pfn->pfn_sb;
                pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -444,16 +418,12 @@ static int pmem_attach_disk(struct device *dev,
                pmem->pgmap.range.end = res->end;
                pmem->pgmap.nr_range = 1;
                pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
-               pmem->pgmap.ops = &fsdax_pagemap_ops;
                addr = devm_memremap_pages(dev, &pmem->pgmap);
                pmem->pfn_flags |= PFN_MAP;
                bb_range = pmem->pgmap.range;
        } else {
                addr = devm_memremap(dev, pmem->phys_addr,
                                pmem->size, ARCH_MEMREMAP_PMEM);
-               if (devm_add_action_or_reset(dev, pmem_release_queue,
-                                       &pmem->pgmap))
-                       return -ENOMEM;
                bb_range.start =  res->start;
                bb_range.end = res->end;
        }
index f8dd664..838b5e2 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-integrity.h>
 #include <linux/compat.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
@@ -118,25 +119,6 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
                                   struct nvme_command *cmd);
 
-/*
- * Prepare a queue for teardown.
- *
- * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
- * the capacity to 0 after that to avoid blocking dispatchers that may be
- * holding bd_mutex.  This will end buffered writers dirtying pages that can't
- * be synced.
- */
-static void nvme_set_queue_dying(struct nvme_ns *ns)
-{
-       if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
-               return;
-
-       blk_set_queue_dying(ns->queue);
-       blk_mq_unquiesce_queue(ns->queue);
-
-       set_capacity_and_notify(ns->disk, 0);
-}
-
 void nvme_queue_scan(struct nvme_ctrl *ctrl)
 {
        /*
@@ -221,7 +203,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 {
        dev_info(ctrl->device,
-                "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
+                "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
 
        flush_work(&ctrl->reset_work);
        nvme_stop_ctrl(ctrl);
@@ -345,15 +327,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
        return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-       blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
        if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
            req_op(req) == REQ_OP_ZONE_APPEND)
                req->__sector = nvme_lba_to_sect(req->q->queuedata,
                        le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+       blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+       nvme_end_req_zoned(req);
        nvme_trace_bio_complete(req);
        blk_mq_end_request(req, status);
 }
@@ -380,6 +366,13 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch_req(struct request *req)
+{
+       nvme_cleanup_cmd(req);
+       nvme_end_req_zoned(req);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
@@ -631,7 +624,7 @@ static inline void nvme_init_request(struct request *req,
 
        req->cmd_flags |= REQ_FAILFAST_DRIVER;
        if (req->mq_hctx->type == HCTX_TYPE_POLL)
-               req->cmd_flags |= REQ_HIPRI;
+               req->cmd_flags |= REQ_POLLED;
        nvme_clear_nvme_request(req);
        memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
 }
@@ -822,6 +815,7 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 static inline void nvme_setup_flush(struct nvme_ns *ns,
                struct nvme_command *cmnd)
 {
+       memset(cmnd, 0, sizeof(*cmnd));
        cmnd->common.opcode = nvme_cmd_flush;
        cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 }
@@ -873,6 +867,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
                return BLK_STS_IOERR;
        }
 
+       memset(cmnd, 0, sizeof(*cmnd));
        cmnd->dsm.opcode = nvme_cmd_dsm;
        cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
        cmnd->dsm.nr = cpu_to_le32(segments - 1);
@@ -889,6 +884,8 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
                struct request *req, struct nvme_command *cmnd)
 {
+       memset(cmnd, 0, sizeof(*cmnd));
+
        if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
                return nvme_setup_discard(ns, req, cmnd);
 
@@ -922,9 +919,15 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
                dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
        cmnd->rw.opcode = op;
+       cmnd->rw.flags = 0;
        cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
+       cmnd->rw.rsvd2 = 0;
+       cmnd->rw.metadata = 0;
        cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
        cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+       cmnd->rw.reftag = 0;
+       cmnd->rw.apptag = 0;
+       cmnd->rw.appmask = 0;
 
        if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
                nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
@@ -981,10 +984,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
        struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
        blk_status_t ret = BLK_STS_OK;
 
-       if (!(req->rq_flags & RQF_DONTPREP)) {
+       if (!(req->rq_flags & RQF_DONTPREP))
                nvme_clear_nvme_request(req);
-               memset(cmd, 0, sizeof(*cmd));
-       }
 
        switch (req_op(req)) {
        case REQ_OP_DRV_IN:
@@ -2600,6 +2601,24 @@ static ssize_t nvme_subsys_show_nqn(struct device *dev,
 }
 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
 
+static ssize_t nvme_subsys_show_type(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *buf)
+{
+       struct nvme_subsystem *subsys =
+               container_of(dev, struct nvme_subsystem, dev);
+
+       switch (subsys->subtype) {
+       case NVME_NQN_DISC:
+               return sysfs_emit(buf, "discovery\n");
+       case NVME_NQN_NVME:
+               return sysfs_emit(buf, "nvm\n");
+       default:
+               return sysfs_emit(buf, "reserved\n");
+       }
+}
+static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);
+
 #define nvme_subsys_show_str_function(field)                           \
 static ssize_t subsys_##field##_show(struct device *dev,               \
                            struct device_attribute *attr, char *buf)   \
@@ -2620,6 +2639,7 @@ static struct attribute *nvme_subsys_attrs[] = {
        &subsys_attr_serial.attr,
        &subsys_attr_firmware_rev.attr,
        &subsys_attr_subsysnqn.attr,
+       &subsys_attr_subsystype.attr,
 #ifdef CONFIG_NVME_MULTIPATH
        &subsys_attr_iopolicy.attr,
 #endif
@@ -2690,6 +2710,21 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
        memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
        subsys->vendor_id = le16_to_cpu(id->vid);
        subsys->cmic = id->cmic;
+
+       /* Versions prior to 1.4 don't necessarily report a valid type */
+       if (id->cntrltype == NVME_CTRL_DISC ||
+           !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
+               subsys->subtype = NVME_NQN_DISC;
+       else
+               subsys->subtype = NVME_NQN_NVME;
+
+       if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
+               dev_err(ctrl->device,
+                       "Subsystem %s is not a discovery controller",
+                       subsys->subnqn);
+               kfree(subsys);
+               return -EINVAL;
+       }
        subsys->awupf = le16_to_cpu(id->awupf);
 #ifdef CONFIG_NVME_MULTIPATH
        subsys->iopolicy = NVME_IOPOLICY_NUMA;
@@ -4473,6 +4508,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
+static void nvme_start_ns_queue(struct nvme_ns *ns)
+{
+       if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
+               blk_mq_unquiesce_queue(ns->queue);
+}
+
+static void nvme_stop_ns_queue(struct nvme_ns *ns)
+{
+       if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
+               blk_mq_quiesce_queue(ns->queue);
+}
+
+/*
+ * Prepare a queue for teardown.
+ *
+ * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
+ * the capacity to 0 after that to avoid blocking dispatchers that may be
+ * holding bd_mutex.  This will end buffered writers dirtying pages that can't
+ * be synced.
+ */
+static void nvme_set_queue_dying(struct nvme_ns *ns)
+{
+       if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+               return;
+
+       blk_set_queue_dying(ns->queue);
+       nvme_start_ns_queue(ns);
+
+       set_capacity_and_notify(ns->disk, 0);
+}
+
 /**
  * nvme_kill_queues(): Ends all namespace queues
  * @ctrl: the dead controller that needs to end
@@ -4488,7 +4554,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 
        /* Forcibly unquiesce queues to avoid blocking dispatch */
        if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
-               blk_mq_unquiesce_queue(ctrl->admin_q);
+               nvme_start_admin_queue(ctrl);
 
        list_for_each_entry(ns, &ctrl->namespaces, list)
                nvme_set_queue_dying(ns);
@@ -4551,7 +4617,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_quiesce_queue(ns->queue);
+               nvme_stop_ns_queue(ns);
        up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
@@ -4562,11 +4628,25 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_unquiesce_queue(ns->queue);
+               nvme_start_ns_queue(ns);
        up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
+{
+       if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+               blk_mq_quiesce_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
+
+void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
+{
+       if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+               blk_mq_unquiesce_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
+
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns;
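
The new NVME_NS_STOPPED and NVME_CTRL_ADMIN_Q_STOPPED bits make the stop/start helpers above idempotent, so however many times the teardown and recovery paths call them, each queue sees exactly one paired blk_mq_quiesce_queue()/blk_mq_unquiesce_queue(). The guard idiom on its own, with placeholder names (only the bitops and blk-mq calls are real kernel APIs):

#include <linux/bitops.h>
#include <linux/blk-mq.h>

#define EXAMPLE_Q_STOPPED       0       /* hypothetical flag bit */

struct example_dev {
        unsigned long flags;
        struct request_queue *queue;
};

/* Quiesce only on the first stop request; later calls are no-ops. */
static void example_stop_queue(struct example_dev *dev)
{
        if (!test_and_set_bit(EXAMPLE_Q_STOPPED, &dev->flags))
                blk_mq_quiesce_queue(dev->queue);
}

/* Unquiesce only if a stop is actually outstanding. */
static void example_start_queue(struct example_dev *dev)
{
        if (test_and_clear_bit(EXAMPLE_Q_STOPPED, &dev->flags))
                blk_mq_unquiesce_queue(dev->queue);
}
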
index 668c6bb..c5a2b71 100644 (file)
@@ -548,6 +548,7 @@ static const match_table_t opt_tokens = {
        { NVMF_OPT_NR_POLL_QUEUES,      "nr_poll_queues=%d"     },
        { NVMF_OPT_TOS,                 "tos=%d"                },
        { NVMF_OPT_FAIL_FAST_TMO,       "fast_io_fail_tmo=%d"   },
+       { NVMF_OPT_DISCOVERY,           "discovery"             },
        { NVMF_OPT_ERR,                 NULL                    }
 };
 
@@ -823,6 +824,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
                        }
                        opts->tos = token;
                        break;
+               case NVMF_OPT_DISCOVERY:
+                       opts->discovery_nqn = true;
+                       break;
                default:
                        pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
                                p);
@@ -949,7 +953,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
 #define NVMF_ALLOWED_OPTS      (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
                                 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
                                 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
-                                NVMF_OPT_DISABLE_SQFLOW |\
+                                NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
                                 NVMF_OPT_FAIL_FAST_TMO)
 
 static struct nvme_ctrl *
index a146cb9..c3203ff 100644 (file)
@@ -67,6 +67,7 @@ enum {
        NVMF_OPT_TOS            = 1 << 19,
        NVMF_OPT_FAIL_FAST_TMO  = 1 << 20,
        NVMF_OPT_HOST_IFACE     = 1 << 21,
+       NVMF_OPT_DISCOVERY      = 1 << 22,
 };
 
 /**
@@ -178,6 +179,13 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
        return true;
 }
 
+static inline char *nvmf_ctrl_subsysnqn(struct nvme_ctrl *ctrl)
+{
+       if (!ctrl->subsys)
+               return ctrl->opts->subsysnqn;
+       return ctrl->subsys->subnqn;
+}
+
 int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
 int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
 int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
index aa14ad9..71b3108 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/nvme-fc.h>
 #include "fc.h"
 #include <scsi/scsi_transport_fc.h>
+#include <linux/blk-mq-pci.h>
 
 /* *************************** Data Structures/Defines ****************** */
 
@@ -2382,7 +2383,7 @@ nvme_fc_ctrl_free(struct kref *ref)
        list_del(&ctrl->ctrl_list);
        spin_unlock_irqrestore(&ctrl->rport->lock, flags);
 
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
        blk_cleanup_queue(ctrl->ctrl.admin_q);
        blk_cleanup_queue(ctrl->ctrl.fabrics_q);
        blk_mq_free_tag_set(&ctrl->admin_tag_set);
@@ -2510,7 +2511,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
        /*
         * clean up the admin queue. Same thing as above.
         */
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        blk_sync_queue(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_fc_terminate_exchange, &ctrl->ctrl);
@@ -2841,6 +2842,28 @@ nvme_fc_complete_rq(struct request *rq)
        nvme_fc_ctrl_put(ctrl);
 }
 
+static int nvme_fc_map_queues(struct blk_mq_tag_set *set)
+{
+       struct nvme_fc_ctrl *ctrl = set->driver_data;
+       int i;
+
+       for (i = 0; i < set->nr_maps; i++) {
+               struct blk_mq_queue_map *map = &set->map[i];
+
+               if (!map->nr_queues) {
+                       WARN_ON(i == HCTX_TYPE_DEFAULT);
+                       continue;
+               }
+
+               /* Call LLDD map queue functionality if defined */
+               if (ctrl->lport->ops->map_queues)
+                       ctrl->lport->ops->map_queues(&ctrl->lport->localport,
+                                                    map);
+               else
+                       blk_mq_map_queues(map);
+       }
+       return 0;
+}
 
 static const struct blk_mq_ops nvme_fc_mq_ops = {
        .queue_rq       = nvme_fc_queue_rq,
@@ -2849,6 +2872,7 @@ static const struct blk_mq_ops nvme_fc_mq_ops = {
        .exit_request   = nvme_fc_exit_request,
        .init_hctx      = nvme_fc_init_hctx,
        .timeout        = nvme_fc_timeout,
+       .map_queues     = nvme_fc_map_queues,
 };
 
 static int
@@ -3095,7 +3119,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
        ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
                                                (ilog2(SZ_4K) - 9);
 
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        ret = nvme_init_ctrl_finish(&ctrl->ctrl);
        if (ret || test_bit(ASSOC_FAILED, &ctrl->flags))
@@ -3249,7 +3273,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
        nvme_fc_free_queue(&ctrl->queues[0]);
 
        /* re-enable the admin_q so anything new can fast fail */
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        /* resume the io queues so that things will fast fail */
        nvme_start_queues(&ctrl->ctrl);
@@ -3572,7 +3596,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
        dev_info(ctrl->ctrl.device,
                "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
-               ctrl->cnum, ctrl->ctrl.opts->subsysnqn);
+               ctrl->cnum, nvmf_ctrl_subsysnqn(&ctrl->ctrl));
 
        return &ctrl->ctrl;
 
index fba0661..7f2071f 100644 (file)
@@ -85,8 +85,13 @@ void nvme_failover_req(struct request *req)
        }
 
        spin_lock_irqsave(&ns->head->requeue_lock, flags);
-       for (bio = req->bio; bio; bio = bio->bi_next)
+       for (bio = req->bio; bio; bio = bio->bi_next) {
                bio_set_dev(bio, ns->head->disk->part0);
+               if (bio->bi_opf & REQ_POLLED) {
+                       bio->bi_opf &= ~REQ_POLLED;
+                       bio->bi_cookie = BLK_QC_T_NONE;
+               }
+       }
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 
@@ -100,8 +105,11 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
-               if (ns->head->disk)
-                       kblockd_schedule_work(&ns->head->requeue_work);
+               if (!ns->head->disk)
+                       continue;
+               kblockd_schedule_work(&ns->head->requeue_work);
+               if (ctrl->state == NVME_CTRL_LIVE)
+                       disk_uevent(ns->head->disk, KOBJ_CHANGE);
        }
        up_read(&ctrl->namespaces_rwsem);
 }
@@ -138,13 +146,12 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns;
 
-       mutex_lock(&ctrl->scan_lock);
        down_read(&ctrl->namespaces_rwsem);
-       list_for_each_entry(ns, &ctrl->namespaces, list)
-               if (nvme_mpath_clear_current_path(ns))
-                       kblockd_schedule_work(&ns->head->requeue_work);
+       list_for_each_entry(ns, &ctrl->namespaces, list) {
+               nvme_mpath_clear_current_path(ns);
+               kblockd_schedule_work(&ns->head->requeue_work);
+       }
        up_read(&ctrl->namespaces_rwsem);
-       mutex_unlock(&ctrl->scan_lock);
 }
 
 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -312,12 +319,11 @@ static bool nvme_available_path(struct nvme_ns_head *head)
        return false;
 }
 
-static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
+static void nvme_ns_head_submit_bio(struct bio *bio)
 {
        struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
        struct device *dev = disk_to_dev(head->disk);
        struct nvme_ns *ns;
-       blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;
 
        /*
@@ -334,7 +340,7 @@ static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
                bio->bi_opf |= REQ_NVME_MPATH;
                trace_block_bio_remap(bio, disk_devt(ns->head->disk),
                                      bio->bi_iter.bi_sector);
-               ret = submit_bio_noacct(bio);
+               submit_bio_noacct(bio);
        } else if (nvme_available_path(head)) {
                dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 
@@ -349,7 +355,6 @@ static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
        }
 
        srcu_read_unlock(&head->srcu, srcu_idx);
-       return ret;
 }
 
 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
@@ -479,6 +484,15 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 
        blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
        blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
+       /*
+        * This assumes all controllers that refer to a namespace either
+        * support poll queues or not.  That is not a strict guarantee,
+        * but if the assumption is wrong the effect is only suboptimal
+        * performance, not a correctness problem.
+        */
+       if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
+           ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
+               blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
 
        /* set to a default value of 512 until the disk is validated */
        blk_queue_logical_block_size(head->disk->queue, 512);
@@ -494,13 +508,23 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 static void nvme_mpath_set_live(struct nvme_ns *ns)
 {
        struct nvme_ns_head *head = ns->head;
+       int rc;
 
        if (!head->disk)
                return;
 
+       /*
+        * test_and_set_bit() is used because it is protecting against two nvme
+        * paths simultaneously calling device_add_disk() on the same namespace
+        * head.
+        */
        if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
-               device_add_disk(&head->subsys->dev, head->disk,
-                               nvme_ns_id_attr_groups);
+               rc = device_add_disk(&head->subsys->dev, head->disk,
+                                    nvme_ns_id_attr_groups);
+               if (rc) {
+                       clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
+                       return;
+               }
                nvme_add_ns_head_cdev(head);
        }
 
@@ -538,7 +562,7 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
                        return -EINVAL;
 
                nr_nsids = le32_to_cpu(desc->nnsids);
-               nsid_buf_size = nr_nsids * sizeof(__le32);
+               nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
 
                if (WARN_ON_ONCE(desc->grpid == 0))
                        return -EINVAL;
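
The nsid_buf_size line above replaces an open-coded nr * sizeof() with flex_array_size(), the helper from <linux/overflow.h> that sizes a flexible-array member and saturates instead of wrapping on multiplication overflow. The idiom in isolation (struct and function names are illustrative):

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/string.h>

struct example_desc {
        u32     nnsids;
        __le32  nsids[];        /* flexible array member */
};

static void *example_copy_nsids(const struct example_desc *desc)
{
        /* Evaluates to SIZE_MAX on overflow rather than a small wrapped value. */
        size_t bytes = flex_array_size(desc, nsids, desc->nnsids);

        return kmemdup(desc->nsids, bytes, GFP_KERNEL);
}
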
index ed79a6c..b334af8 100644 (file)
@@ -342,6 +342,7 @@ struct nvme_ctrl {
        int nr_reconnects;
        unsigned long flags;
 #define NVME_CTRL_FAILFAST_EXPIRED     0
+#define NVME_CTRL_ADMIN_Q_STOPPED      1
        struct nvmf_ctrl_options *opts;
 
        struct page *discard_page;
@@ -372,6 +373,7 @@ struct nvme_subsystem {
        char                    model[40];
        char                    firmware_rev[8];
        u8                      cmic;
+       enum nvme_subsys_type   subtype;
        u16                     vendor_id;
        u16                     awupf;  /* 0's based awupf value. */
        struct ida              ns_ida;
@@ -463,6 +465,7 @@ struct nvme_ns {
 #define NVME_NS_ANA_PENDING    2
 #define NVME_NS_FORCE_RO       3
 #define NVME_NS_READY          4
+#define NVME_NS_STOPPED                5
 
        struct cdev             cdev;
        struct device           cdev_device;
@@ -638,6 +641,20 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch_req(struct request *req);
+
+static __always_inline void nvme_complete_batch(struct io_comp_batch *iob,
+                                               void (*fn)(struct request *rq))
+{
+       struct request *req;
+
+       rq_list_for_each(&iob->req_list, req) {
+               fn(req);
+               nvme_complete_batch_req(req);
+       }
+       blk_mq_end_request_batch(iob);
+}
+
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
@@ -665,6 +682,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_stop_admin_queue(struct nvme_ctrl *ctrl);
+void nvme_start_admin_queue(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
 void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
index 149ecf7..ca2ee80 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-mq-pci.h>
+#include <linux/blk-integrity.h>
 #include <linux/dmi.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
@@ -244,8 +245,15 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
 {
        unsigned int mem_size = nvme_dbbuf_size(dev);
 
-       if (dev->dbbuf_dbs)
+       if (dev->dbbuf_dbs) {
+               /*
+                * Clear the dbbuf memory so the driver doesn't observe stale
+                * values from the previous instantiation.
+                */
+               memset(dev->dbbuf_dbs, 0, mem_size);
+               memset(dev->dbbuf_eis, 0, mem_size);
                return 0;
+       }
 
        dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
                                            &dev->dbbuf_dbs_dma_addr,
@@ -958,7 +966,7 @@ out_free_cmd:
        return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_dev *dev = iod->nvmeq->dev;
@@ -968,9 +976,19 @@ static void nvme_pci_complete_rq(struct request *req)
                               rq_integrity_vec(req)->bv_len, rq_data_dir(req));
        if (blk_rq_nr_phys_segments(req))
                nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+       nvme_pci_unmap_rq(req);
        nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_comp_batch *iob)
+{
+       nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -995,7 +1013,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
        return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+                                  struct io_comp_batch *iob, u16 idx)
 {
        struct nvme_completion *cqe = &nvmeq->cqes[idx];
        __u16 command_id = READ_ONCE(cqe->command_id);
@@ -1022,7 +1041,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
        }
 
        trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-       if (!nvme_try_complete_req(req, cqe->status, cqe->result))
+       if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
+           !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
+                                       nvme_pci_complete_batch))
                nvme_pci_complete_rq(req);
 }
 
@@ -1038,7 +1059,8 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
        }
 }
 
-static inline int nvme_process_cq(struct nvme_queue *nvmeq)
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
+                              struct io_comp_batch *iob)
 {
        int found = 0;
 
@@ -1049,7 +1071,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
                 * the cqe requires a full read memory barrier
                 */
                dma_rmb();
-               nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+               nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
                nvme_update_cq_head(nvmeq);
        }
 
@@ -1061,9 +1083,13 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 static irqreturn_t nvme_irq(int irq, void *data)
 {
        struct nvme_queue *nvmeq = data;
+       DEFINE_IO_COMP_BATCH(iob);
 
-       if (nvme_process_cq(nvmeq))
+       if (nvme_poll_cq(nvmeq, &iob)) {
+               if (!rq_list_empty(iob.req_list))
+                       nvme_pci_complete_batch(&iob);
                return IRQ_HANDLED;
+       }
        return IRQ_NONE;
 }
 
@@ -1087,11 +1113,11 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
        WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
 
        disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
-       nvme_process_cq(nvmeq);
+       nvme_poll_cq(nvmeq, NULL);
        enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct nvme_queue *nvmeq = hctx->driver_data;
        bool found;
@@ -1100,7 +1126,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx)
                return 0;
 
        spin_lock(&nvmeq->cq_poll_lock);
-       found = nvme_process_cq(nvmeq);
+       found = nvme_poll_cq(nvmeq, iob);
        spin_unlock(&nvmeq->cq_poll_lock);
 
        return found;
@@ -1273,7 +1299,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
         * Did we miss an interrupt?
         */
        if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
-               nvme_poll(req->mq_hctx);
+               nvme_poll(req->mq_hctx, NULL);
        else
                nvme_poll_irqdisable(nvmeq);
 
@@ -1395,7 +1421,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 
        nvmeq->dev->online_queues--;
        if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
-               blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
+               nvme_stop_admin_queue(&nvmeq->dev->ctrl);
        if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
                pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
        return 0;
@@ -1433,7 +1459,7 @@ static void nvme_reap_pending_cqes(struct nvme_dev *dev)
 
        for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
                spin_lock(&dev->queues[i].cq_poll_lock);
-               nvme_process_cq(&dev->queues[i]);
+               nvme_poll_cq(&dev->queues[i], NULL);
                spin_unlock(&dev->queues[i].cq_poll_lock);
        }
 }
@@ -1654,7 +1680,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
                 * user requests may be waiting on a stopped queue. Start the
                 * queue to flush these to completion.
                 */
-               blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+               nvme_start_admin_queue(&dev->ctrl);
                blk_cleanup_queue(dev->ctrl.admin_q);
                blk_mq_free_tag_set(&dev->admin_tagset);
        }
@@ -1688,7 +1714,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
                        return -ENODEV;
                }
        } else
-               blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+               nvme_start_admin_queue(&dev->ctrl);
 
        return 0;
 }
@@ -2623,7 +2649,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
        if (shutdown) {
                nvme_start_queues(&dev->ctrl);
                if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
-                       blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+                       nvme_start_admin_queue(&dev->ctrl);
        }
        mutex_unlock(&dev->shutdown_lock);
 }
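
The blk_mq_quiesce_queue()/blk_mq_unquiesce_queue() calls on the admin queue, here and in the fabrics drivers below, are replaced by new nvme core helpers. Judging by the NVME_CTRL_ADMIN_Q_STOPPED flag that the nvme-loop hunk further down clears for a fresh admin queue, the helpers are most likely flag-guarded so repeated stop/start calls do not nest; a plausible sketch (an assumption about the core implementation, not a copy of it):

    void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
    {
            /* quiesce only on the first stop; later calls are no-ops */
            if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
                    blk_mq_quiesce_queue(ctrl->admin_q);
    }

    void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
    {
            /* unquiesce only if a stop is actually pending */
            if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
                    blk_mq_unquiesce_queue(ctrl->admin_q);
    }

Either way, the call sites below are purely mechanical substitutions.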
index 042c594..850f84d 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/atomic.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-mq-rdma.h>
+#include <linux/blk-integrity.h>
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -918,7 +919,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
        else
                ctrl->ctrl.max_integrity_segments = 0;
 
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        error = nvme_init_ctrl_finish(&ctrl->ctrl);
        if (error)
@@ -927,7 +928,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
        return 0;
 
 out_quiesce_queue:
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        blk_sync_queue(ctrl->ctrl.admin_q);
 out_stop_queue:
        nvme_rdma_stop_queue(&ctrl->queues[0]);
@@ -1025,12 +1026,12 @@ out_free_io_queues:
 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
                bool remove)
 {
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        blk_sync_queue(ctrl->ctrl.admin_q);
        nvme_rdma_stop_queue(&ctrl->queues[0]);
        nvme_cancel_admin_tagset(&ctrl->ctrl);
        if (remove)
-               blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+               nvme_start_admin_queue(&ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl, remove);
 }
 
@@ -1095,11 +1096,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
                return ret;
 
        if (ctrl->ctrl.icdoff) {
+               ret = -EOPNOTSUPP;
                dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
                goto destroy_admin;
        }
 
        if (!(ctrl->ctrl.sgls & (1 << 2))) {
+               ret = -EOPNOTSUPP;
                dev_err(ctrl->ctrl.device,
                        "Mandatory keyed sgls are not supported!\n");
                goto destroy_admin;
@@ -1111,6 +1114,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
                        ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
        }
 
+       if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
+               dev_warn(ctrl->ctrl.device,
+                       "ctrl sqsize %u > max queue size %u, clamping down\n",
+                       ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
+               ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
+       }
+
        if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
                dev_warn(ctrl->ctrl.device,
                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
@@ -1153,7 +1163,7 @@ destroy_io:
                nvme_rdma_destroy_io_queues(ctrl, new);
        }
 destroy_admin:
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        blk_sync_queue(ctrl->ctrl.admin_q);
        nvme_rdma_stop_queue(&ctrl->queues[0]);
        nvme_cancel_admin_tagset(&ctrl->ctrl);
@@ -1193,7 +1203,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
        nvme_rdma_teardown_io_queues(ctrl, false);
        nvme_start_queues(&ctrl->ctrl);
        nvme_rdma_teardown_admin_queue(ctrl, false);
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
@@ -2105,7 +2115,7 @@ unmap_qe:
        return ret;
 }
 
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct nvme_rdma_queue *queue = hctx->driver_data;
 
@@ -2231,7 +2241,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
        cancel_delayed_work_sync(&ctrl->reconnect_work);
 
        nvme_rdma_teardown_io_queues(ctrl, shutdown);
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        if (shutdown)
                nvme_shutdown_ctrl(&ctrl->ctrl);
        else
@@ -2385,7 +2395,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                goto out_uninit_ctrl;
 
        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
-               ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+               nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
 
        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
index 3c1c29d..33bc83d 100644 (file)
@@ -926,12 +926,14 @@ static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 {
        struct nvme_tcp_queue *queue = req->queue;
+       int req_data_len = req->data_len;
 
        while (true) {
                struct page *page = nvme_tcp_req_cur_page(req);
                size_t offset = nvme_tcp_req_cur_offset(req);
                size_t len = nvme_tcp_req_cur_length(req);
                bool last = nvme_tcp_pdu_last_send(req, len);
+               int req_data_sent = req->data_sent;
                int ret, flags = MSG_DONTWAIT;
 
                if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
@@ -958,7 +960,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
                 * in the request where we don't want to modify it as we may
                 * compete with the RX path completing the request.
                 */
-               if (req->data_sent + ret < req->data_len)
+               if (req_data_sent + ret < req_data_len)
                        nvme_tcp_advance_req(req, ret);
 
                /* fully successful last send in current PDU */
@@ -1048,10 +1050,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
 {
        struct nvme_tcp_queue *queue = req->queue;
+       size_t offset = req->offset;
        int ret;
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
        struct kvec iov = {
-               .iov_base = &req->ddgst + req->offset,
+               .iov_base = (u8 *)&req->ddgst + req->offset,
                .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
        };
 
@@ -1064,7 +1067,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
        if (unlikely(ret <= 0))
                return ret;
 
-       if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
+       if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
                nvme_tcp_done_send_req(queue);
                return 1;
        }
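
Both kvec fixes in this file (and the matching one in the TCP target later on) are byte-offset bugs: req->ddgst is a __le32, so &req->ddgst + req->offset advances by offset * sizeof(__le32) rather than by offset bytes, pointing past the 4-byte digest whenever a partial sendmsg() forces a resume at a non-zero offset. Casting through u8 * makes the arithmetic byte-wise. A standalone illustration (hypothetical helper, not driver code):

    #include <linux/types.h>

    static void *digest_resume_ptr(__le32 *ddgst, size_t offset)
    {
            /* "ddgst + offset" would step in 4-byte units; this steps in bytes */
            return (u8 *)ddgst + offset;
    }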
@@ -1915,7 +1918,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
        if (error)
                goto out_stop_queue;
 
-       blk_mq_unquiesce_queue(ctrl->admin_q);
+       nvme_start_admin_queue(ctrl);
 
        error = nvme_init_ctrl_finish(ctrl);
        if (error)
@@ -1924,7 +1927,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
        return 0;
 
 out_quiesce_queue:
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        blk_sync_queue(ctrl->admin_q);
 out_stop_queue:
        nvme_tcp_stop_queue(ctrl, 0);
@@ -1946,12 +1949,12 @@ out_free_queue:
 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
                bool remove)
 {
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        blk_sync_queue(ctrl->admin_q);
        nvme_tcp_stop_queue(ctrl, 0);
        nvme_cancel_admin_tagset(ctrl);
        if (remove)
-               blk_mq_unquiesce_queue(ctrl->admin_q);
+               nvme_start_admin_queue(ctrl);
        nvme_tcp_destroy_admin_queue(ctrl, remove);
 }
 
@@ -1960,7 +1963,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
 {
        if (ctrl->queue_count <= 1)
                return;
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        nvme_start_freeze(ctrl);
        nvme_stop_queues(ctrl);
        nvme_sync_io_queues(ctrl);
@@ -2055,7 +2058,7 @@ destroy_io:
                nvme_tcp_destroy_io_queues(ctrl, new);
        }
 destroy_admin:
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        blk_sync_queue(ctrl->admin_q);
        nvme_tcp_stop_queue(ctrl, 0);
        nvme_cancel_admin_tagset(ctrl);
@@ -2098,7 +2101,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
        /* unquiesce to fail fast pending requests */
        nvme_start_queues(ctrl);
        nvme_tcp_teardown_admin_queue(ctrl, false);
-       blk_mq_unquiesce_queue(ctrl->admin_q);
+       nvme_start_admin_queue(ctrl);
 
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
@@ -2116,7 +2119,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
        cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
 
        nvme_tcp_teardown_io_queues(ctrl, shutdown);
-       blk_mq_quiesce_queue(ctrl->admin_q);
+       nvme_stop_admin_queue(ctrl);
        if (shutdown)
                nvme_shutdown_ctrl(ctrl);
        else
@@ -2429,7 +2432,7 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
        return 0;
 }
 
-static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct nvme_tcp_queue *queue = hctx->driver_data;
        struct sock *sk = queue->sock->sk;
@@ -2582,7 +2585,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
                goto out_uninit_ctrl;
 
        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
-               ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+               nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
 
        mutex_lock(&nvme_tcp_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
index d950104..bfc259e 100644 (file)
@@ -233,6 +233,8 @@ out_free:
 blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
                struct nvme_command *c, enum nvme_zone_mgmt_action action)
 {
+       memset(c, 0, sizeof(*c));
+
        c->zms.opcode = nvme_cmd_zone_mgmt_send;
        c->zms.nsid = cpu_to_le32(ns->head->ns_id);
        c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
index aa6d84d..6fb2474 100644 (file)
@@ -264,7 +264,7 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
        desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
        desc->state = req->port->ana_state[grpid];
        memset(desc->rsvd17, 0, sizeof(desc->rsvd17));
-       return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32);
+       return struct_size(desc, nsids, count);
 }
 
 static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
@@ -278,8 +278,8 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
        u16 status;
 
        status = NVME_SC_INTERNAL;
-       desc = kmalloc(sizeof(struct nvme_ana_group_desc) +
-                       NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL);
+       desc = kmalloc(struct_size(desc, nsids, NVMET_MAX_NAMESPACES),
+                      GFP_KERNEL);
        if (!desc)
                goto out;
 
@@ -374,13 +374,19 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
        id->rab = 6;
 
+       if (nvmet_is_disc_subsys(ctrl->subsys))
+               id->cntrltype = NVME_CTRL_DISC;
+       else
+               id->cntrltype = NVME_CTRL_IO;
+
        /*
         * XXX: figure out how we can assign a IEEE OUI, but until then
         * the safest is to leave it as zeroes.
         */
 
        /* we support multiple ports, multiples hosts and ANA: */
-       id->cmic = (1 << 0) | (1 << 1) | (1 << 3);
+       id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL |
+               NVME_CTRL_CMIC_ANA;
 
        /* Limit MDTS according to transport capability */
        if (ctrl->ops->get_mdts)
@@ -536,7 +542,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
         * Our namespace might always be shared.  Not just with other
         * controllers, but also with any other user of the block device.
         */
-       id->nmic = (1 << 0);
+       id->nmic = NVME_NS_NMIC_SHARED;
        id->anagrpid = cpu_to_le32(req->ns->anagrpid);
 
        memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid));
@@ -1008,7 +1014,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 
        if (nvme_is_fabrics(cmd))
                return nvmet_parse_fabrics_cmd(req);
-       if (nvmet_req_subsys(req)->type == NVME_NQN_DISC)
+       if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
                return nvmet_parse_discovery_cmd(req);
 
        ret = nvmet_check_ctrl_status(req);
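
The ANA log-page changes earlier in this file replace the open-coded "sizeof(struct) + n * sizeof(element)" math with struct_size() from <linux/overflow.h>, which works because the group descriptor ends in a flexible __le32 nsids[] array. Besides being shorter, struct_size() saturates to SIZE_MAX on overflow, so a bogus count cannot yield a short allocation. A hedged, self-contained example with a stand-in struct:

    #include <linux/overflow.h>
    #include <linux/types.h>

    struct foo_desc {
            __le32 grpid;
            __le32 nnsids;
            __le32 nsids[];                 /* flexible array member */
    };

    static size_t foo_desc_bytes(struct foo_desc *desc, u32 count)
    {
            /*
             * Equivalent to sizeof(*desc) + count * sizeof(desc->nsids[0]),
             * but saturating instead of wrapping on overflow, so a later
             * kmalloc() fails rather than allocating too little.
             */
            return struct_size(desc, nsids, count);
    }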
index be5d824..091a0ca 100644 (file)
@@ -1233,6 +1233,44 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item,
 }
 CONFIGFS_ATTR(nvmet_subsys_, attr_model);
 
+static ssize_t nvmet_subsys_attr_discovery_nqn_show(struct config_item *item,
+                       char *page)
+{
+       return snprintf(page, PAGE_SIZE, "%s\n",
+                       nvmet_disc_subsys->subsysnqn);
+}
+
+static ssize_t nvmet_subsys_attr_discovery_nqn_store(struct config_item *item,
+                       const char *page, size_t count)
+{
+       struct nvmet_subsys *subsys = to_subsys(item);
+       char *subsysnqn;
+       int len;
+
+       len = strcspn(page, "\n");
+       if (!len)
+               return -EINVAL;
+
+       subsysnqn = kmemdup_nul(page, len, GFP_KERNEL);
+       if (!subsysnqn)
+               return -ENOMEM;
+
+       /*
+        * The discovery NQN must be different from the subsystem NQN.
+        */
+       if (!strcmp(subsysnqn, subsys->subsysnqn)) {
+               kfree(subsysnqn);
+               return -EBUSY;
+       }
+       down_write(&nvmet_config_sem);
+       kfree(nvmet_disc_subsys->subsysnqn);
+       nvmet_disc_subsys->subsysnqn = subsysnqn;
+       up_write(&nvmet_config_sem);
+
+       return count;
+}
+CONFIGFS_ATTR(nvmet_subsys_, attr_discovery_nqn);
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item,
                                                char *page)
@@ -1262,6 +1300,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
        &nvmet_subsys_attr_attr_cntlid_min,
        &nvmet_subsys_attr_attr_cntlid_max,
        &nvmet_subsys_attr_attr_model,
+       &nvmet_subsys_attr_attr_discovery_nqn,
 #ifdef CONFIG_BLK_DEV_INTEGRITY
        &nvmet_subsys_attr_attr_pi_enable,
 #endif
@@ -1553,6 +1592,8 @@ static void nvmet_port_release(struct config_item *item)
 {
        struct nvmet_port *port = to_nvmet_port(item);
 
+       /* Let in-flight controller teardown complete */
+       flush_scheduled_work();
        list_del(&port->global_entry);
 
        kfree(port->ana_state);
index b8425fa..5119c68 100644 (file)
@@ -1140,7 +1140,7 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
         * should verify iosqes,iocqes are zeroed, however that
         * would break backwards compatibility, so don't enforce it.
         */
-       if (ctrl->subsys->type != NVME_NQN_DISC &&
+       if (!nvmet_is_disc_subsys(ctrl->subsys) &&
            (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
             nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) {
                ctrl->csts = NVME_CSTS_CFS;
@@ -1205,7 +1205,10 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
        /* CC.EN timeout in 500msec units: */
        ctrl->cap |= (15ULL << 24);
        /* maximum queue entries supported: */
-       ctrl->cap |= NVMET_QUEUE_SIZE - 1;
+       if (ctrl->ops->get_max_queue_size)
+               ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1;
+       else
+               ctrl->cap |= NVMET_QUEUE_SIZE - 1;
 
        if (nvmet_is_passthru_subsys(ctrl->subsys))
                nvmet_passthrough_override_cap(ctrl);
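
The new get_max_queue_size() fabrics op feeds directly into the low bits of the controller's CAP property. CAP.MQES (bits 15:0) is zero's based per the NVMe specification, which is why one is subtracted before OR-ing it in; a host reading the property back does the inverse. Reader-side sketch (bit layout per the spec; the helper name is hypothetical, the host headers provide an equivalent NVME_CAP_MQES() accessor):

    #include <linux/types.h>

    static u32 foo_cap_max_queue_entries(u64 cap)
    {
            /* CAP.MQES, bits 15:0, zero's based: value N means N + 1 entries */
            return (u32)(cap & 0xffff) + 1;
    }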
@@ -1278,7 +1281,7 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
        if (subsys->allow_any_host)
                return true;
 
-       if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
+       if (nvmet_is_disc_subsys(subsys)) /* allow all access to disc subsys */
                return true;
 
        list_for_each_entry(p, &subsys->hosts, entry) {
@@ -1367,6 +1370,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
        mutex_init(&ctrl->lock);
 
        ctrl->port = req->port;
+       ctrl->ops = req->ops;
 
        INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
        INIT_LIST_HEAD(&ctrl->async_events);
@@ -1405,13 +1409,11 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
        }
        ctrl->cntlid = ret;
 
-       ctrl->ops = req->ops;
-
        /*
         * Discovery controllers may use some arbitrary high value
         * in order to cleanup stale discovery sessions
         */
-       if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
+       if (nvmet_is_disc_subsys(ctrl->subsys) && !kato)
                kato = NVMET_DISC_KATO_MS;
 
        /* keep-alive timeout in seconds */
@@ -1491,7 +1493,8 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
        if (!port)
                return NULL;
 
-       if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
+       if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn) ||
+           !strcmp(nvmet_disc_subsys->subsysnqn, subsysnqn)) {
                if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
                        return NULL;
                return nvmet_disc_subsys;
@@ -1538,6 +1541,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
                subsys->max_qid = NVMET_NR_QUEUES;
                break;
        case NVME_NQN_DISC:
+       case NVME_NQN_CURR:
                subsys->max_qid = 0;
                break;
        default:
index 7aa62bc..c2162ee 100644 (file)
@@ -146,7 +146,7 @@ static size_t discovery_log_entries(struct nvmet_req *req)
        struct nvmet_ctrl *ctrl = req->sq->ctrl;
        struct nvmet_subsys_link *p;
        struct nvmet_port *r;
-       size_t entries = 0;
+       size_t entries = 1;
 
        list_for_each_entry(p, &req->port->subsystems, entry) {
                if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
@@ -171,6 +171,7 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req)
        u32 numrec = 0;
        u16 status = 0;
        void *buffer;
+       char traddr[NVMF_TRADDR_SIZE];
 
        if (!nvmet_check_transfer_len(req, data_len))
                return;
@@ -203,15 +204,19 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req)
                status = NVME_SC_INTERNAL;
                goto out;
        }
-
        hdr = buffer;
-       list_for_each_entry(p, &req->port->subsystems, entry) {
-               char traddr[NVMF_TRADDR_SIZE];
 
+       nvmet_set_disc_traddr(req, req->port, traddr);
+
+       nvmet_format_discovery_entry(hdr, req->port,
+                                    nvmet_disc_subsys->subsysnqn,
+                                    traddr, NVME_NQN_CURR, numrec);
+       numrec++;
+
+       list_for_each_entry(p, &req->port->subsystems, entry) {
                if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
                        continue;
 
-               nvmet_set_disc_traddr(req, req->port, traddr);
                nvmet_format_discovery_entry(hdr, req->port,
                                p->subsys->subsysnqn, traddr,
                                NVME_NQN_NVME, numrec);
@@ -268,6 +273,8 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req)
        memcpy_and_pad(id->fr, sizeof(id->fr),
                       UTS_RELEASE, strlen(UTS_RELEASE), ' ');
 
+       id->cntrltype = NVME_CTRL_DISC;
+
        /* no limit on data transfer sizes for now */
        id->mdts = 0;
        id->cntlid = cpu_to_le16(ctrl->cntlid);
@@ -387,7 +394,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 int __init nvmet_init_discovery(void)
 {
        nvmet_disc_subsys =
-               nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC);
+               nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_CURR);
        return PTR_ERR_OR_ZERO(nvmet_disc_subsys);
 }
 
index 7d0454c..70fb587 100644 (file)
@@ -221,7 +221,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
                goto out;
        }
 
-       pr_info("creating controller %d for subsystem %s for NQN %s%s.\n",
+       pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n",
+               nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm",
                ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
                ctrl->pi_support ? " T10-PI is enabled" : "");
        req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
index 0fc2781..70ca9df 100644 (file)
@@ -5,6 +5,7 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/module.h>
 #include "nvmet.h"
 
@@ -86,7 +87,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
                ns->bdev = NULL;
                return ret;
        }
-       ns->size = i_size_read(ns->bdev->bd_inode);
+       ns->size = bdev_nr_bytes(ns->bdev);
        ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
 
        ns->pi_type = 0;
@@ -107,7 +108,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 
 void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns)
 {
-       ns->size = i_size_read(ns->bdev->bd_inode);
+       ns->size = bdev_nr_bytes(ns->bdev);
 }
 
 u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
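
Both hunks above swap i_size_read(bdev->bd_inode) for bdev_nr_bytes(), a helper added in this cycle's block updates so drivers no longer reach into bd_inode for the device size. Sketch of the intended usage, assuming the 5.16 helpers (bdev_nr_sectors() counts 512-byte units):

    #include <linux/blkdev.h>

    static loff_t foo_capacity_bytes(struct block_device *bdev)
    {
            /*
             * bdev_nr_bytes() reports the capacity in bytes;
             * bdev_nr_sectors() reports it in 512-byte sectors, so
             * bytes == (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT.
             */
            return bdev_nr_bytes(bdev);
    }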
index 1dd1a0f..6aa30f3 100644 (file)
@@ -125,7 +125,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
        return call_iter(iocb, &iter);
 }
 
-static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
+static void nvmet_file_io_done(struct kiocb *iocb, long ret)
 {
        struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb);
        u16 status = NVME_SC_SUCCESS;
@@ -222,7 +222,7 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags)
        }
 
 complete:
-       nvmet_file_io_done(&req->f.iocb, ret, 0);
+       nvmet_file_io_done(&req->f.iocb, ret);
        return true;
 }
 
index 0285ccc..eb10942 100644 (file)
@@ -384,6 +384,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
                error = PTR_ERR(ctrl->ctrl.admin_q);
                goto out_cleanup_fabrics_q;
        }
+       /* reset stopped state for the fresh admin queue */
+       clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags);
 
        error = nvmf_connect_admin_queue(&ctrl->ctrl);
        if (error)
@@ -398,7 +400,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
        ctrl->ctrl.max_hw_sectors =
                (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
 
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_start_admin_queue(&ctrl->ctrl);
 
        error = nvme_init_ctrl_finish(&ctrl->ctrl);
        if (error)
@@ -428,7 +430,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
                nvme_loop_destroy_io_queues(ctrl);
        }
 
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_stop_admin_queue(&ctrl->ctrl);
        if (ctrl->ctrl.state == NVME_CTRL_LIVE)
                nvme_shutdown_ctrl(&ctrl->ctrl);
 
index 7143c7f..af19342 100644 (file)
@@ -309,6 +309,7 @@ struct nvmet_fabrics_ops {
        u16 (*install_queue)(struct nvmet_sq *nvme_sq);
        void (*discovery_chg)(struct nvmet_port *port);
        u8 (*get_mdts)(const struct nvmet_ctrl *ctrl);
+       u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl);
 };
 
 #define NVMET_MAX_INLINE_BIOVEC        8
@@ -576,6 +577,11 @@ static inline struct nvmet_subsys *nvmet_req_subsys(struct nvmet_req *req)
        return req->sq->ctrl->subsys;
 }
 
+static inline bool nvmet_is_disc_subsys(struct nvmet_subsys *subsys)
+{
+       return subsys->type != NVME_NQN_NVME;
+}
+
 #ifdef CONFIG_NVME_TARGET_PASSTHRU
 void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys);
 int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
index 891174c..1deb404 100644 (file)
@@ -5,6 +5,7 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/atomic.h>
+#include <linux/blk-integrity.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/err.h>
@@ -1818,12 +1819,36 @@ restart:
        mutex_unlock(&nvmet_rdma_queue_mutex);
 }
 
+static void nvmet_rdma_destroy_port_queues(struct nvmet_rdma_port *port)
+{
+       struct nvmet_rdma_queue *queue, *tmp;
+       struct nvmet_port *nport = port->nport;
+
+       mutex_lock(&nvmet_rdma_queue_mutex);
+       list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
+                                queue_list) {
+               if (queue->port != nport)
+                       continue;
+
+               list_del_init(&queue->queue_list);
+               __nvmet_rdma_queue_disconnect(queue);
+       }
+       mutex_unlock(&nvmet_rdma_queue_mutex);
+}
+
 static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port)
 {
        struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL);
 
        if (cm_id)
                rdma_destroy_id(cm_id);
+
+       /*
+        * Destroy the remaining queues, which do not belong to any
+        * controller yet. Doing this after the RDMA-CM ID has been
+        * destroyed guarantees that no new queue will be created.
+        */
+       nvmet_rdma_destroy_port_queues(port);
 }
 
 static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port)
@@ -1975,6 +2000,11 @@ static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
        return NVMET_RDMA_MAX_MDTS;
 }
 
+static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl)
+{
+       return NVME_RDMA_MAX_QUEUE_SIZE;
+}
+
 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .owner                  = THIS_MODULE,
        .type                   = NVMF_TRTYPE_RDMA,
@@ -1986,6 +2016,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .delete_ctrl            = nvmet_rdma_delete_ctrl,
        .disc_traddr            = nvmet_rdma_disc_port_addr,
        .get_mdts               = nvmet_rdma_get_mdts,
+       .get_max_queue_size     = nvmet_rdma_get_max_queue_size,
 };
 
 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
index 07ee347..84c387e 100644 (file)
@@ -702,7 +702,7 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
        struct nvmet_tcp_queue *queue = cmd->queue;
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
        struct kvec iov = {
-               .iov_base = &cmd->exp_ddgst + cmd->offset,
+               .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset,
                .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
        };
        int ret;
@@ -1096,7 +1096,7 @@ recv:
        }
 
        if (queue->hdr_digest &&
-           nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
+           nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
                nvmet_tcp_fatal_error(queue); /* fatal */
                return -EPROTO;
        }
@@ -1428,6 +1428,7 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
+       struct page *page;
        struct nvmet_tcp_queue *queue =
                container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1447,6 +1448,8 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
                nvmet_tcp_free_crypto(queue);
        ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
 
+       page = virt_to_head_page(queue->pf_cache.va);
+       __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
        kfree(queue);
 }
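
The release-work hunk above drains the queue's page_frag_cache: the cache keeps a bias of extra references (pagecnt_bias) on its current backing page so that individual fragment allocations stay cheap, and those references have to be returned when the owner goes away, otherwise the compound page is leaked. A hedged sketch of the generic alloc/teardown pairing (foo_* names are placeholders; the page_frag API itself is real):

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static void *foo_alloc_frag(struct page_frag_cache *cache, unsigned int len)
    {
            /* hands out sub-page fragments backed by the cache's page */
            return page_frag_alloc(cache, len, GFP_KERNEL);
    }

    static void foo_drain_frag_cache(struct page_frag_cache *cache)
    {
            struct page *page;

            if (!cache->va)                 /* nothing was ever allocated */
                    return;
            page = virt_to_head_page(cache->va);
            /* give back the references the cache still holds */
            __page_frag_cache_drain(page, cache->pagecnt_bias);
    }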
 
@@ -1737,6 +1740,17 @@ err_port:
        return ret;
 }
 
+static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port)
+{
+       struct nvmet_tcp_queue *queue;
+
+       mutex_lock(&nvmet_tcp_queue_mutex);
+       list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
+               if (queue->port == port)
+                       kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+       mutex_unlock(&nvmet_tcp_queue_mutex);
+}
+
 static void nvmet_tcp_remove_port(struct nvmet_port *nport)
 {
        struct nvmet_tcp_port *port = nport->priv;
@@ -1746,6 +1760,11 @@ static void nvmet_tcp_remove_port(struct nvmet_port *nport)
        port->sock->sk->sk_user_data = NULL;
        write_unlock_bh(&port->sock->sk->sk_callback_lock);
        cancel_work_sync(&port->accept_work);
+       /*
+        * Destroy the remaining queues, which do not belong to any
+        * controller yet.
+        */
+       nvmet_tcp_destroy_port_queues(port);
 
        sock_release(port->sock);
        kfree(port);
index be799a5..b0056ae 100644 (file)
@@ -147,8 +147,8 @@ config RESET_OXNAS
        bool
 
 config RESET_PISTACHIO
-       bool "Pistachio Reset Driver" if COMPILE_TEST
-       default MACH_PISTACHIO
+       bool "Pistachio Reset Driver"
+       depends on MIPS || COMPILE_TEST
        help
          This enables the reset driver for ImgTec Pistachio SoCs.
 
index b6f074d..433fa0c 100644 (file)
@@ -38,7 +38,7 @@ static int brcm_rescal_reset_set(struct reset_controller_dev *rcdev,
        }
 
        ret = readl_poll_timeout(base + BRCM_RESCAL_STATUS, reg,
-                                !(reg & BRCM_RESCAL_STATUS_BIT), 100, 1000);
+                                (reg & BRCM_RESCAL_STATUS_BIT), 100, 1000);
        if (ret) {
                dev_err(data->dev, "time out on SATA/PCIe rescal\n");
                return ret;
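
The brcm-rescal fix flips the polling condition rather than the timeout handling: readl_poll_timeout(addr, val, cond, sleep_us, timeout_us) from <linux/iopoll.h> keeps re-reading the register into val until cond evaluates true or the timeout elapses (returning -ETIMEDOUT), so dropping the negation makes the driver wait for BRCM_RESCAL_STATUS_BIT to become set, i.e. for the rescal sequence to report completion. Generic usage sketch with hypothetical register names:

    #include <linux/bits.h>
    #include <linux/iopoll.h>

    #define FOO_STATUS      0x0             /* hypothetical status register */
    #define FOO_DONE        BIT(0)          /* hypothetical completion bit  */

    static int foo_wait_done(void __iomem *base)
    {
            u32 reg;

            /* poll every 100us, give up after 1000us total */
            return readl_poll_timeout(base + FOO_STATUS, reg,
                                      reg & FOO_DONE, 100, 1000);
    }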
index 2a72f86..8c6492e 100644 (file)
@@ -92,3 +92,29 @@ void __init socfpga_reset_init(void)
        for_each_matching_node(np, socfpga_early_reset_dt_ids)
                a10_reset_init(np);
 }
+
+/*
+ * The early driver is problematic, because it doesn't register
+ * itself as a driver. This causes certain device links to prevent
+ * consumer devices from probing. The hacky solution is to register
+ * an empty driver, whose only job is to attach itself to the reset
+ * manager and call probe.
+ */
+static const struct of_device_id socfpga_reset_dt_ids[] = {
+       { .compatible = "altr,rst-mgr", },
+       { /* sentinel */ },
+};
+
+static int reset_simple_probe(struct platform_device *pdev)
+{
+       return 0;
+}
+
+static struct platform_driver reset_socfpga_driver = {
+       .probe  = reset_simple_probe,
+       .driver = {
+               .name           = "socfpga-reset",
+               .of_match_table = socfpga_reset_dt_ids,
+       },
+};
+builtin_platform_driver(reset_socfpga_driver);
index 24d3395..4c5bba5 100644 (file)
@@ -20,6 +20,7 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc,
        struct tegra_bpmp *bpmp = to_tegra_bpmp(rstc);
        struct mrq_reset_request request;
        struct tegra_bpmp_message msg;
+       int err;
 
        memset(&request, 0, sizeof(request));
        request.cmd = command;
@@ -30,7 +31,13 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc,
        msg.tx.data = &request;
        msg.tx.size = sizeof(request);
 
-       return tegra_bpmp_transfer(bpmp, &msg);
+       err = tegra_bpmp_transfer(bpmp, &msg);
+       if (err)
+               return err;
+       if (msg.rx.ret)
+               return -EINVAL;
+
+       return 0;
 }
 
 static int tegra_bpmp_reset_module(struct reset_controller_dev *rstc,
index e34c6cc..8e87a31 100644 (file)
@@ -2077,12 +2077,15 @@ static void __dasd_device_check_path_events(struct dasd_device *device)
 
        if (device->stopped & ~(DASD_STOPPED_DC_WAIT))
                return;
+
+       dasd_path_clear_all_verify(device);
+       dasd_path_clear_all_fcsec(device);
+
        rc = device->discipline->pe_handler(device, tbvpm, fcsecpm);
        if (rc) {
+               dasd_path_add_tbvpm(device, tbvpm);
+               dasd_path_add_fcsecpm(device, fcsecpm);
                dasd_device_set_timer(device, 50);
-       } else {
-               dasd_path_clear_all_verify(device);
-               dasd_path_clear_all_fcsec(device);
        }
 };
 
index 4691a3c..299001a 100644 (file)
@@ -201,7 +201,7 @@ dasd_3990_erp_DCTL(struct dasd_ccw_req * erp, char modifier)
        struct ccw1 *ccw;
        struct dasd_ccw_req *dctl_cqr;
 
-       dctl_cqr = dasd_alloc_erp_request((char *) &erp->magic, 1,
+       dctl_cqr = dasd_alloc_erp_request(erp->magic, 1,
                                          sizeof(struct DCTL_data),
                                          device);
        if (IS_ERR(dctl_cqr)) {
@@ -1652,7 +1652,7 @@ dasd_3990_erp_action_1B_32(struct dasd_ccw_req * default_erp, char *sense)
        }
 
        /* Build new ERP request including DE/LO */
-       erp = dasd_alloc_erp_request((char *) &cqr->magic,
+       erp = dasd_alloc_erp_request(cqr->magic,
                                     2 + 1,/* DE/LO + TIC */
                                     sizeof(struct DE_eckd_data) +
                                     sizeof(struct LO_eckd_data), device);
@@ -2388,7 +2388,7 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr)
        }
 
        /* allocate additional request block */
-       erp = dasd_alloc_erp_request((char *) &cqr->magic,
+       erp = dasd_alloc_erp_request(cqr->magic,
                                     cplength, datasize, device);
        if (IS_ERR(erp)) {
                 if (cqr->retries <= 0) {
index 460e0f1..8410a25 100644 (file)
@@ -560,8 +560,8 @@ static int prefix_LRE(struct ccw1 *ccw, struct PFX_eckd_data *pfxdata,
                return -EINVAL;
        }
        pfxdata->format = format;
-       pfxdata->base_address = basepriv->ned->unit_addr;
-       pfxdata->base_lss = basepriv->ned->ID;
+       pfxdata->base_address = basepriv->conf.ned->unit_addr;
+       pfxdata->base_lss = basepriv->conf.ned->ID;
        pfxdata->validity.define_extent = 1;
 
        /* private uid is kept up to date, conf_data may be outdated */
@@ -736,32 +736,30 @@ dasd_eckd_cdl_reclen(int recid)
        return LABEL_SIZE;
 }
 /* create unique id from private structure. */
-static void create_uid(struct dasd_eckd_private *private)
+static void create_uid(struct dasd_conf *conf, struct dasd_uid *uid)
 {
        int count;
-       struct dasd_uid *uid;
 
-       uid = &private->uid;
        memset(uid, 0, sizeof(struct dasd_uid));
-       memcpy(uid->vendor, private->ned->HDA_manufacturer,
+       memcpy(uid->vendor, conf->ned->HDA_manufacturer,
               sizeof(uid->vendor) - 1);
        EBCASC(uid->vendor, sizeof(uid->vendor) - 1);
-       memcpy(uid->serial, &private->ned->serial,
+       memcpy(uid->serial, &conf->ned->serial,
               sizeof(uid->serial) - 1);
        EBCASC(uid->serial, sizeof(uid->serial) - 1);
-       uid->ssid = private->gneq->subsystemID;
-       uid->real_unit_addr = private->ned->unit_addr;
-       if (private->sneq) {
-               uid->type = private->sneq->sua_flags;
+       uid->ssid = conf->gneq->subsystemID;
+       uid->real_unit_addr = conf->ned->unit_addr;
+       if (conf->sneq) {
+               uid->type = conf->sneq->sua_flags;
                if (uid->type == UA_BASE_PAV_ALIAS)
-                       uid->base_unit_addr = private->sneq->base_unit_addr;
+                       uid->base_unit_addr = conf->sneq->base_unit_addr;
        } else {
                uid->type = UA_BASE_DEVICE;
        }
-       if (private->vdsneq) {
+       if (conf->vdsneq) {
                for (count = 0; count < 16; count++) {
                        sprintf(uid->vduit+2*count, "%02x",
-                               private->vdsneq->uit[count]);
+                               conf->vdsneq->uit[count]);
                }
        }
 }
@@ -776,10 +774,10 @@ static int dasd_eckd_generate_uid(struct dasd_device *device)
 
        if (!private)
                return -ENODEV;
-       if (!private->ned || !private->gneq)
+       if (!private->conf.ned || !private->conf.gneq)
                return -ENODEV;
        spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
-       create_uid(private);
+       create_uid(&private->conf, &private->uid);
        spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
        return 0;
 }
@@ -803,14 +801,15 @@ static int dasd_eckd_get_uid(struct dasd_device *device, struct dasd_uid *uid)
  * return 0 for match
  */
 static int dasd_eckd_compare_path_uid(struct dasd_device *device,
-                                     struct dasd_eckd_private *private)
+                                     struct dasd_conf *path_conf)
 {
        struct dasd_uid device_uid;
+       struct dasd_uid path_uid;
 
-       create_uid(private);
+       create_uid(path_conf, &path_uid);
        dasd_eckd_get_uid(device, &device_uid);
 
-       return memcmp(&device_uid, &private->uid, sizeof(struct dasd_uid));
+       return memcmp(&device_uid, &path_uid, sizeof(struct dasd_uid));
 }
 
 static void dasd_eckd_fill_rcd_cqr(struct dasd_device *device,
@@ -946,34 +945,34 @@ out_error:
        return ret;
 }
 
-static int dasd_eckd_identify_conf_parts(struct dasd_eckd_private *private)
+static int dasd_eckd_identify_conf_parts(struct dasd_conf *conf)
 {
 
        struct dasd_sneq *sneq;
        int i, count;
 
-       private->ned = NULL;
-       private->sneq = NULL;
-       private->vdsneq = NULL;
-       private->gneq = NULL;
-       count = private->conf_len / sizeof(struct dasd_sneq);
-       sneq = (struct dasd_sneq *)private->conf_data;
+       conf->ned = NULL;
+       conf->sneq = NULL;
+       conf->vdsneq = NULL;
+       conf->gneq = NULL;
+       count = conf->len / sizeof(struct dasd_sneq);
+       sneq = (struct dasd_sneq *)conf->data;
        for (i = 0; i < count; ++i) {
                if (sneq->flags.identifier == 1 && sneq->format == 1)
-                       private->sneq = sneq;
+                       conf->sneq = sneq;
                else if (sneq->flags.identifier == 1 && sneq->format == 4)
-                       private->vdsneq = (struct vd_sneq *)sneq;
+                       conf->vdsneq = (struct vd_sneq *)sneq;
                else if (sneq->flags.identifier == 2)
-                       private->gneq = (struct dasd_gneq *)sneq;
+                       conf->gneq = (struct dasd_gneq *)sneq;
                else if (sneq->flags.identifier == 3 && sneq->res1 == 1)
-                       private->ned = (struct dasd_ned *)sneq;
+                       conf->ned = (struct dasd_ned *)sneq;
                sneq++;
        }
-       if (!private->ned || !private->gneq) {
-               private->ned = NULL;
-               private->sneq = NULL;
-               private->vdsneq = NULL;
-               private->gneq = NULL;
+       if (!conf->ned || !conf->gneq) {
+               conf->ned = NULL;
+               conf->sneq = NULL;
+               conf->vdsneq = NULL;
+               conf->gneq = NULL;
                return -EINVAL;
        }
        return 0;
@@ -1016,9 +1015,9 @@ static void dasd_eckd_store_conf_data(struct dasd_device *device,
         * with the new one if this points to the same data
         */
        cdp = device->path[chp].conf_data;
-       if (private->conf_data == cdp) {
-               private->conf_data = (void *)conf_data;
-               dasd_eckd_identify_conf_parts(private);
+       if (private->conf.data == cdp) {
+               private->conf.data = (void *)conf_data;
+               dasd_eckd_identify_conf_parts(&private->conf);
        }
        ccw_device_get_schid(device->cdev, &sch_id);
        device->path[chp].conf_data = conf_data;
@@ -1036,8 +1035,8 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device)
        struct dasd_eckd_private *private = device->private;
        int i;
 
-       private->conf_data = NULL;
-       private->conf_len = 0;
+       private->conf.data = NULL;
+       private->conf.len = 0;
        for (i = 0; i < 8; i++) {
                kfree(device->path[i].conf_data);
                device->path[i].conf_data = NULL;
@@ -1071,15 +1070,55 @@ static void dasd_eckd_read_fc_security(struct dasd_device *device)
        }
 }
 
+static void dasd_eckd_get_uid_string(struct dasd_conf *conf,
+                                    char *print_uid)
+{
+       struct dasd_uid uid;
+
+       create_uid(conf, &uid);
+       if (strlen(uid.vduit) > 0)
+               snprintf(print_uid, sizeof(*print_uid),
+                        "%s.%s.%04x.%02x.%s",
+                        uid.vendor, uid.serial, uid.ssid,
+                        uid.real_unit_addr, uid.vduit);
+       else
+               snprintf(print_uid, sizeof(*print_uid),
+                        "%s.%s.%04x.%02x",
+                        uid.vendor, uid.serial, uid.ssid,
+                        uid.real_unit_addr);
+}
+
+static int dasd_eckd_check_cabling(struct dasd_device *device,
+                                  void *conf_data, __u8 lpm)
+{
+       struct dasd_eckd_private *private = device->private;
+       char print_path_uid[60], print_device_uid[60];
+       struct dasd_conf path_conf;
+
+       path_conf.data = conf_data;
+       path_conf.len = DASD_ECKD_RCD_DATA_SIZE;
+       if (dasd_eckd_identify_conf_parts(&path_conf))
+               return 1;
+
+       if (dasd_eckd_compare_path_uid(device, &path_conf)) {
+               dasd_eckd_get_uid_string(&path_conf, print_path_uid);
+               dasd_eckd_get_uid_string(&private->conf, print_device_uid);
+               dev_err(&device->cdev->dev,
+                       "Not all channel paths lead to the same device, path %02X leads to device %s instead of %s\n",
+                       lpm, print_path_uid, print_device_uid);
+               return 1;
+       }
+
+       return 0;
+}
+
 static int dasd_eckd_read_conf(struct dasd_device *device)
 {
        void *conf_data;
        int conf_len, conf_data_saved;
        int rc, path_err, pos;
        __u8 lpm, opm;
-       struct dasd_eckd_private *private, path_private;
-       struct dasd_uid *uid;
-       char print_path_uid[60], print_device_uid[60];
+       struct dasd_eckd_private *private;
 
        private = device->private;
        opm = ccw_device_get_path_mask(device->cdev);
@@ -1109,11 +1148,11 @@ static int dasd_eckd_read_conf(struct dasd_device *device)
                if (!conf_data_saved) {
                        /* initially clear previously stored conf_data */
                        dasd_eckd_clear_conf_data(device);
-                       private->conf_data = conf_data;
-                       private->conf_len = conf_len;
-                       if (dasd_eckd_identify_conf_parts(private)) {
-                               private->conf_data = NULL;
-                               private->conf_len = 0;
+                       private->conf.data = conf_data;
+                       private->conf.len = conf_len;
+                       if (dasd_eckd_identify_conf_parts(&private->conf)) {
+                               private->conf.data = NULL;
+                               private->conf.len = 0;
                                kfree(conf_data);
                                continue;
                        }
@@ -1123,59 +1162,11 @@ static int dasd_eckd_read_conf(struct dasd_device *device)
                         */
                        dasd_eckd_generate_uid(device);
                        conf_data_saved++;
-               } else {
-                       path_private.conf_data = conf_data;
-                       path_private.conf_len = DASD_ECKD_RCD_DATA_SIZE;
-                       if (dasd_eckd_identify_conf_parts(
-                                   &path_private)) {
-                               path_private.conf_data = NULL;
-                               path_private.conf_len = 0;
-                               kfree(conf_data);
-                               continue;
-                       }
-                       if (dasd_eckd_compare_path_uid(
-                                   device, &path_private)) {
-                               uid = &path_private.uid;
-                               if (strlen(uid->vduit) > 0)
-                                       snprintf(print_path_uid,
-                                                sizeof(print_path_uid),
-                                                "%s.%s.%04x.%02x.%s",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid, uid->real_unit_addr,
-                                                uid->vduit);
-                               else
-                                       snprintf(print_path_uid,
-                                                sizeof(print_path_uid),
-                                                "%s.%s.%04x.%02x",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid,
-                                                uid->real_unit_addr);
-                               uid = &private->uid;
-                               if (strlen(uid->vduit) > 0)
-                                       snprintf(print_device_uid,
-                                                sizeof(print_device_uid),
-                                                "%s.%s.%04x.%02x.%s",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid, uid->real_unit_addr,
-                                                uid->vduit);
-                               else
-                                       snprintf(print_device_uid,
-                                                sizeof(print_device_uid),
-                                                "%s.%s.%04x.%02x",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid,
-                                                uid->real_unit_addr);
-                               dev_err(&device->cdev->dev,
-                                       "Not all channel paths lead to "
-                                       "the same device, path %02X leads to "
-                                       "device %s instead of %s\n", lpm,
-                                       print_path_uid, print_device_uid);
-                               path_err = -EINVAL;
-                               dasd_path_add_cablepm(device, lpm);
-                               continue;
-                       }
-                       path_private.conf_data = NULL;
-                       path_private.conf_len = 0;
+               } else if (dasd_eckd_check_cabling(device, conf_data, lpm)) {
+                       dasd_path_add_cablepm(device, lpm);
+                       path_err = -EINVAL;
+                       kfree(conf_data);
+                       continue;
                }
 
                pos = pathmask_to_pos(lpm);
@@ -1197,8 +1188,6 @@ static int dasd_eckd_read_conf(struct dasd_device *device)
                }
        }
 
-       dasd_eckd_read_fc_security(device);
-
        return path_err;
 }
 
@@ -1213,7 +1202,7 @@ static u32 get_fcx_max_data(struct dasd_device *device)
                return 0;
        /* is transport mode supported? */
        fcx_in_css = css_general_characteristics.fcx;
-       fcx_in_gneq = private->gneq->reserved2[7] & 0x04;
+       fcx_in_gneq = private->conf.gneq->reserved2[7] & 0x04;
        fcx_in_features = private->features.feature[40] & 0x80;
        tpm = fcx_in_css && fcx_in_gneq && fcx_in_features;
 
@@ -1282,9 +1271,9 @@ static int rebuild_device_uid(struct dasd_device *device,
                                        "returned error %d", rc);
                        break;
                }
-               memcpy(private->conf_data, data->rcd_buffer,
+               memcpy(private->conf.data, data->rcd_buffer,
                       DASD_ECKD_RCD_DATA_SIZE);
-               if (dasd_eckd_identify_conf_parts(private)) {
+               if (dasd_eckd_identify_conf_parts(&private->conf)) {
                        rc = -ENODEV;
                } else /* first valid path is enough */
                        break;
@@ -1299,11 +1288,10 @@ static int rebuild_device_uid(struct dasd_device *device,
 static void dasd_eckd_path_available_action(struct dasd_device *device,
                                            struct pe_handler_work_data *data)
 {
-       struct dasd_eckd_private path_private;
-       struct dasd_uid *uid;
        __u8 path_rcd_buf[DASD_ECKD_RCD_DATA_SIZE];
        __u8 lpm, opm, npm, ppm, epm, hpfpm, cablepm;
        struct dasd_conf_data *conf_data;
+       struct dasd_conf path_conf;
        unsigned long flags;
        char print_uid[60];
        int rc, pos;
@@ -1367,11 +1355,11 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                 */
                memcpy(&path_rcd_buf, data->rcd_buffer,
                       DASD_ECKD_RCD_DATA_SIZE);
-               path_private.conf_data = (void *) &path_rcd_buf;
-               path_private.conf_len = DASD_ECKD_RCD_DATA_SIZE;
-               if (dasd_eckd_identify_conf_parts(&path_private)) {
-                       path_private.conf_data = NULL;
-                       path_private.conf_len = 0;
+               path_conf.data = (void *)&path_rcd_buf;
+               path_conf.len = DASD_ECKD_RCD_DATA_SIZE;
+               if (dasd_eckd_identify_conf_parts(&path_conf)) {
+                       path_conf.data = NULL;
+                       path_conf.len = 0;
                        continue;
                }
 
@@ -1382,7 +1370,7 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                 * the first working path UID will be used as device UID
                 */
                if (dasd_path_get_opm(device) &&
-                   dasd_eckd_compare_path_uid(device, &path_private)) {
+                   dasd_eckd_compare_path_uid(device, &path_conf)) {
                        /*
                         * the comparison was not successful
                         * rebuild the device UID with at least one
@@ -1396,20 +1384,8 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                         */
                        if (rebuild_device_uid(device, data) ||
                            dasd_eckd_compare_path_uid(
-                                   device, &path_private)) {
-                               uid = &path_private.uid;
-                               if (strlen(uid->vduit) > 0)
-                                       snprintf(print_uid, sizeof(print_uid),
-                                                "%s.%s.%04x.%02x.%s",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid, uid->real_unit_addr,
-                                                uid->vduit);
-                               else
-                                       snprintf(print_uid, sizeof(print_uid),
-                                                "%s.%s.%04x.%02x",
-                                                uid->vendor, uid->serial,
-                                                uid->ssid,
-                                                uid->real_unit_addr);
+                                   device, &path_conf)) {
+                               dasd_eckd_get_uid_string(&path_conf, print_uid);
                                dev_err(&device->cdev->dev,
                                        "The newly added channel path %02X "
                                        "will not be used because it leads "
@@ -1427,6 +1403,14 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                if (conf_data) {
                        memcpy(conf_data, data->rcd_buffer,
                               DASD_ECKD_RCD_DATA_SIZE);
+               } else {
+                       /*
+                        * The path is operational but the path config data
+                        * could not be stored due to a low memory condition.
+                        * Add it to the error path mask and schedule a path
+                        * verification later so that it can be added again.
+                        */
+                       epm |= lpm;
                }
                pos = pathmask_to_pos(lpm);
                dasd_eckd_store_conf_data(device, conf_data, pos);
@@ -1447,7 +1431,10 @@ static void dasd_eckd_path_available_action(struct dasd_device *device,
                }
                dasd_path_add_nppm(device, npm);
                dasd_path_add_ppm(device, ppm);
-               dasd_path_add_tbvpm(device, epm);
+               if (epm) {
+                       dasd_path_add_tbvpm(device, epm);
+                       dasd_device_set_timer(device, 50);
+               }
                dasd_path_add_cablepm(device, cablepm);
                dasd_path_add_nohpfpm(device, hpfpm);
                spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
@@ -1625,8 +1612,8 @@ static int dasd_eckd_read_vol_info(struct dasd_device *device)
        prssdp = cqr->data;
        prssdp->order = PSF_ORDER_PRSSD;
        prssdp->suborder = PSF_SUBORDER_VSQ;    /* Volume Storage Query */
-       prssdp->lss = private->ned->ID;
-       prssdp->volume = private->ned->unit_addr;
+       prssdp->lss = private->conf.ned->ID;
+       prssdp->volume = private->conf.ned->unit_addr;
 
        ccw = cqr->cpaddr;
        ccw->cmd_code = DASD_ECKD_CCW_PSF;
@@ -2085,11 +2072,11 @@ dasd_eckd_check_characteristics(struct dasd_device *device)
        device->path_thrhld = DASD_ECKD_PATH_THRHLD;
        device->path_interval = DASD_ECKD_PATH_INTERVAL;
 
-       if (private->gneq) {
+       if (private->conf.gneq) {
                value = 1;
-               for (i = 0; i < private->gneq->timeout.value; i++)
+               for (i = 0; i < private->conf.gneq->timeout.value; i++)
                        value = 10 * value;
-               value = value * private->gneq->timeout.number;
+               value = value * private->conf.gneq->timeout.number;
                /* do not accept useless values */
                if (value != 0 && value <= DASD_EXPIRES_MAX)
                        device->default_expires = value;
@@ -2121,6 +2108,7 @@ dasd_eckd_check_characteristics(struct dasd_device *device)
        if (rc)
                goto out_err3;
 
+       dasd_eckd_read_fc_security(device);
        dasd_path_create_kobjects(device);
 
        /* Read Feature Codes */
@@ -2195,10 +2183,10 @@ static void dasd_eckd_uncheck_device(struct dasd_device *device)
                return;
 
        dasd_alias_disconnect_device_from_lcu(device);
-       private->ned = NULL;
-       private->sneq = NULL;
-       private->vdsneq = NULL;
-       private->gneq = NULL;
+       private->conf.ned = NULL;
+       private->conf.sneq = NULL;
+       private->conf.vdsneq = NULL;
+       private->conf.gneq = NULL;
        dasd_eckd_clear_conf_data(device);
        dasd_path_remove_kobjects(device);
 }
@@ -3750,8 +3738,8 @@ dasd_eckd_dso_ras(struct dasd_device *device, struct dasd_block *block,
         * subset.
         */
        ras_data->op_flags.guarantee_init = !!(features->feature[56] & 0x01);
-       ras_data->lss = private->ned->ID;
-       ras_data->dev_addr = private->ned->unit_addr;
+       ras_data->lss = private->conf.ned->ID;
+       ras_data->dev_addr = private->conf.ned->unit_addr;
        ras_data->nr_exts = nr_exts;
 
        if (by_extent) {
@@ -4293,8 +4281,8 @@ static int prepare_itcw(struct itcw *itcw,
 
        memset(&pfxdata, 0, sizeof(pfxdata));
        pfxdata.format = 1; /* PFX with LRE */
-       pfxdata.base_address = basepriv->ned->unit_addr;
-       pfxdata.base_lss = basepriv->ned->ID;
+       pfxdata.base_address = basepriv->conf.ned->unit_addr;
+       pfxdata.base_lss = basepriv->conf.ned->ID;
        pfxdata.validity.define_extent = 1;
 
        /* private uid is kept up to date, conf_data may be outdated */
@@ -4963,9 +4951,9 @@ dasd_eckd_fill_info(struct dasd_device * device,
        info->characteristics_size = sizeof(private->rdc_data);
        memcpy(info->characteristics, &private->rdc_data,
               sizeof(private->rdc_data));
-       info->confdata_size = min((unsigned long)private->conf_len,
-                                 sizeof(info->configuration_data));
-       memcpy(info->configuration_data, private->conf_data,
+       info->confdata_size = min_t(unsigned long, private->conf.len,
+                                   sizeof(info->configuration_data));
+       memcpy(info->configuration_data, private->conf.data,
               info->confdata_size);
        return 0;
 }
@@ -5808,6 +5796,8 @@ static int dasd_eckd_reload_device(struct dasd_device *device)
        if (rc)
                goto out_err;
 
+       dasd_eckd_read_fc_security(device);
+
        rc = dasd_eckd_generate_uid(device);
        if (rc)
                goto out_err;
@@ -5820,15 +5810,7 @@ static int dasd_eckd_reload_device(struct dasd_device *device)
        dasd_eckd_get_uid(device, &uid);
 
        if (old_base != uid.base_unit_addr) {
-               if (strlen(uid.vduit) > 0)
-                       snprintf(print_uid, sizeof(print_uid),
-                                "%s.%s.%04x.%02x.%s", uid.vendor, uid.serial,
-                                uid.ssid, uid.base_unit_addr, uid.vduit);
-               else
-                       snprintf(print_uid, sizeof(print_uid),
-                                "%s.%s.%04x.%02x", uid.vendor, uid.serial,
-                                uid.ssid, uid.base_unit_addr);
-
+               dasd_eckd_get_uid_string(&private->conf, print_uid);
                dev_info(&device->cdev->dev,
                         "An Alias device was reassigned to a new base device "
                         "with UID: %s\n", print_uid);
@@ -5966,8 +5948,8 @@ static int dasd_eckd_query_host_access(struct dasd_device *device,
        prssdp->order = PSF_ORDER_PRSSD;
        prssdp->suborder = PSF_SUBORDER_QHA;    /* query host access */
        /* LSS and Volume that will be queried */
-       prssdp->lss = private->ned->ID;
-       prssdp->volume = private->ned->unit_addr;
+       prssdp->lss = private->conf.ned->ID;
+       prssdp->volume = private->conf.ned->unit_addr;
        /* all other bytes of prssdp must be zero */
 
        ccw = cqr->cpaddr;
index 65e4630..a91b265 100644 (file)
@@ -658,16 +658,19 @@ struct dasd_conf_data {
        struct dasd_gneq gneq;
 } __packed;
 
-struct dasd_eckd_private {
-       struct dasd_eckd_characteristics rdc_data;
-       u8 *conf_data;
-       int conf_len;
-
+struct dasd_conf {
+       u8 *data;
+       int len;
        /* pointers to specific parts in the conf_data */
        struct dasd_ned *ned;
        struct dasd_sneq *sneq;
        struct vd_sneq *vdsneq;
        struct dasd_gneq *gneq;
+};
+
+struct dasd_eckd_private {
+       struct dasd_eckd_characteristics rdc_data;
+       struct dasd_conf conf;
 
        struct eckd_count count_area[5];
        int init_cqr_status;
index ba4fa37..c07e6e7 100644 (file)
@@ -24,7 +24,7 @@
 #include "dasd_int.h"
 
 struct dasd_ccw_req *
-dasd_alloc_erp_request(char *magic, int cplength, int datasize,
+dasd_alloc_erp_request(unsigned int magic, int cplength, int datasize,
                       struct dasd_device * device)
 {
        unsigned long flags;
@@ -33,8 +33,8 @@ dasd_alloc_erp_request(char *magic, int cplength, int datasize,
        int size;
 
        /* Sanity checks */
-       BUG_ON( magic == NULL || datasize > PAGE_SIZE ||
-            (cplength*sizeof(struct ccw1)) > PAGE_SIZE);
+       BUG_ON(datasize > PAGE_SIZE ||
+              (cplength*sizeof(struct ccw1)) > PAGE_SIZE);
 
        size = (sizeof(struct dasd_ccw_req) + 7L) & -8L;
        if (cplength > 0)
@@ -62,7 +62,7 @@ dasd_alloc_erp_request(char *magic, int cplength, int datasize,
                cqr->data = data;
                memset(cqr->data, 0, datasize);
        }
-       strncpy((char *) &cqr->magic, magic, 4);
+       cqr->magic = magic;
        ASCEBC((char *) &cqr->magic, 4);
        set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
        dasd_get_device(device);
index fa966e0..3a6f3af 100644 (file)
@@ -14,6 +14,7 @@
 #define KMSG_COMPONENT "dasd"
 
 #include <linux/interrupt.h>
+#include <linux/major.h>
 #include <linux/fs.h>
 #include <linux/blkpg.h>
 
index 155428b..8b45801 100644 (file)
@@ -887,7 +887,7 @@ void dasd_proc_exit(void);
 /* externals in dasd_erp.c */
 struct dasd_ccw_req *dasd_default_erp_action(struct dasd_ccw_req *);
 struct dasd_ccw_req *dasd_default_erp_postaction(struct dasd_ccw_req *);
-struct dasd_ccw_req *dasd_alloc_erp_request(char *, int, int,
+struct dasd_ccw_req *dasd_alloc_erp_request(unsigned int, int, int,
                                            struct dasd_device *);
 void dasd_free_erp_request(struct dasd_ccw_req *, struct dasd_device *);
 void dasd_log_sense(struct dasd_ccw_req *, struct irb *);
@@ -1305,6 +1305,15 @@ static inline void dasd_path_add_ppm(struct dasd_device *device, __u8 pm)
                        dasd_path_preferred(device, chp);
 }
 
+static inline void dasd_path_add_fcsecpm(struct dasd_device *device, __u8 pm)
+{
+       int chp;
+
+       for (chp = 0; chp < 8; chp++)
+               if (pm & (0x80 >> chp))
+                       dasd_path_fcsec(device, chp);
+}
+
 /*
  * set functions for path masks
  * the existing path mask will be replaced by the given path mask
index 468cbeb..95349f9 100644 (file)
@@ -650,8 +650,8 @@ int dasd_ioctl(struct block_device *bdev, fmode_t mode,
 
 /**
  * dasd_biodasdinfo() - fill out the dasd information structure
- * @disk [in]: pointer to gendisk structure that references a DASD
- * @info [out]: pointer to the dasd_information2_t structure
+ * @disk: [in] pointer to gendisk structure that references a DASD
+ * @info: [out] pointer to the dasd_information2_t structure
  *
  * Provide access to DASD specific information.
  * The gendisk structure is checked if it belongs to the DASD driver by
index 5be3d1c..59e513d 100644 (file)
@@ -30,7 +30,7 @@
 
 static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
-static blk_qc_t dcssblk_submit_bio(struct bio *bio);
+static void dcssblk_submit_bio(struct bio *bio);
 static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
                long nr_pages, void **kaddr, pfn_t *pfn);
 
@@ -854,7 +854,7 @@ dcssblk_release(struct gendisk *disk, fmode_t mode)
        up_write(&dcssblk_devices_sem);
 }
 
-static blk_qc_t
+static void
 dcssblk_submit_bio(struct bio *bio)
 {
        struct dcssblk_dev_info *dev_info;
@@ -907,10 +907,9 @@ dcssblk_submit_bio(struct bio *bio)
                bytes_done += bvec.bv_len;
        }
        bio_endio(bio);
-       return BLK_QC_T_NONE;
+       return;
 fail:
        bio_io_error(bio);
-       return BLK_QC_T_NONE;
 }
 
 static long
index 3ab669d..27884f3 100644 (file)
@@ -3,6 +3,7 @@
  * Copyright (c) 2017 Hisilicon Limited.
  */
 
+#include <linux/sched/clock.h>
 #include "hisi_sas.h"
 #define DRV_NAME "hisi_sas_v3_hw"
 
index 1f1586a..01f7999 100644 (file)
@@ -1696,6 +1696,7 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
 
        spin_lock_irqsave(&evt->queue->l_lock, flags);
        list_add_tail(&evt->queue_list, &evt->queue->sent);
+       atomic_set(&evt->active, 1);
 
        mb();
 
@@ -1710,6 +1711,7 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
                                     be64_to_cpu(crq_as_u64[1]));
 
        if (rc) {
+               atomic_set(&evt->active, 0);
                list_del(&evt->queue_list);
                spin_unlock_irqrestore(&evt->queue->l_lock, flags);
                del_timer(&evt->timer);
@@ -1737,7 +1739,6 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
 
                evt->done(evt);
        } else {
-               atomic_set(&evt->active, 1);
                spin_unlock_irqrestore(&evt->queue->l_lock, flags);
                ibmvfc_trc_start(evt);
        }
index befeb7c..337e6ed 100644 (file)
@@ -22,6 +22,7 @@
  *******************************************************************/
 
 #include <scsi/scsi_host.h>
+#include <linux/hashtable.h>
 #include <linux/ktime.h>
 #include <linux/workqueue.h>
 
index d383d4a..ad1b6c2 100644 (file)
@@ -5065,9 +5065,12 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
        if (scmd->prot_flags & SCSI_PROT_GUARD_CHECK)
                eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD;
 
-       if (scmd->prot_flags & SCSI_PROT_REF_CHECK) {
-               eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG |
-                       MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG;
+       if (scmd->prot_flags & SCSI_PROT_REF_CHECK)
+               eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG;
+
+       if (scmd->prot_flags & SCSI_PROT_REF_INCREMENT) {
+               eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG;
+
                mpi_request->CDB.EEDP32.PrimaryReferenceTag =
                        cpu_to_be32(scsi_prot_ref_tag(scmd));
        }
index 1c5da2d..253055c 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/delay.h>
 #include <linux/nvme.h>
 #include <linux/nvme-fc.h>
+#include <linux/blk-mq-pci.h>
+#include <linux/blk-mq.h>
 
 static struct nvme_fc_port_template qla_nvme_fc_transport;
 
@@ -642,6 +644,18 @@ static int qla_nvme_post_cmd(struct nvme_fc_local_port *lport,
        return rval;
 }
 
+static void qla_nvme_map_queues(struct nvme_fc_local_port *lport,
+               struct blk_mq_queue_map *map)
+{
+       struct scsi_qla_host *vha = lport->private;
+       int rc;
+
+       rc = blk_mq_pci_map_queues(map, vha->hw->pdev, vha->irq_offset);
+       if (rc)
+               ql_log(ql_log_warn, vha, 0x21de,
+                      "pci map queue failed 0x%x", rc);
+}
+
 static void qla_nvme_localport_delete(struct nvme_fc_local_port *lport)
 {
        struct scsi_qla_host *vha = lport->private;
@@ -676,6 +690,7 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = {
        .ls_abort       = qla_nvme_ls_abort,
        .fcp_io         = qla_nvme_post_cmd,
        .fcp_abort      = qla_nvme_fcp_abort,
+       .map_queues     = qla_nvme_map_queues,
        .max_hw_queues  = 8,
        .max_sgl_segments = 1024,
        .max_dif_sgl_segments = 64,
index 81c3853..081b84b 100644 (file)
@@ -25,8 +25,8 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
                return -EOPNOTSUPP;
        }
 
-       rq = blk_get_request(q, hdr->dout_xfer_len ?
-                            REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
+       rq = scsi_alloc_request(q, hdr->dout_xfer_len ?
+                               REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        rq->timeout = timeout;
@@ -95,7 +95,7 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
 out_free_cmd:
        scsi_req_free_cmd(scsi_req(rq));
 out_put_request:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return ret;
 }
 
index 66f5074..40b473e 100644 (file)
@@ -5384,7 +5384,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 {
        bool new_sd_dp;
        bool inject = false;
-       bool hipri = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_HIPRI;
+       bool polled = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_POLLED;
        int k, num_in_q, qdepth;
        unsigned long iflags;
        u64 ns_from_boot = 0;
@@ -5471,7 +5471,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
        if (sdebug_host_max_queue)
                sd_dp->hc_idx = get_tag(cmnd);
 
-       if (hipri)
+       if (polled)
                ns_from_boot = ktime_get_boottime_ns();
 
        /* one of the resp_*() response functions is called here */
@@ -5531,7 +5531,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
                                kt -= d;
                        }
                }
-               if (hipri) {
+               if (polled) {
                        sd_dp->cmpl_ts = ktime_add(ns_to_ktime(ns_from_boot), kt);
                        spin_lock_irqsave(&sqp->qc_lock, iflags);
                        if (!sd_dp->init_poll) {
@@ -5562,7 +5562,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
                if (unlikely((sdebug_opts & SDEBUG_OPT_CMD_ABORT) &&
                             atomic_read(&sdeb_inject_pending)))
                        sd_dp->aborted = true;
-               if (hipri) {
+               if (polled) {
                        sd_dp->cmpl_ts = ns_to_ktime(ns_from_boot);
                        spin_lock_irqsave(&sqp->qc_lock, iflags);
                        if (!sd_dp->init_poll) {
@@ -7331,7 +7331,7 @@ static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
                        if (kt_from_boot < sd_dp->cmpl_ts)
                                continue;
 
-               } else          /* ignoring non REQ_HIPRI requests */
+               } else          /* ignoring non REQ_POLLED requests */
                        continue;
                devip = (struct sdebug_dev_info *)scp->device->hostdata;
                if (likely(devip))
index b6c86cc..36870b4 100644 (file)
@@ -1979,7 +1979,7 @@ maybe_retry:
 
 static void eh_lock_door_done(struct request *req, blk_status_t status)
 {
-       blk_put_request(req);
+       blk_mq_free_request(req);
 }
 
 /**
@@ -1998,7 +1998,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
        struct request *req;
        struct scsi_request *rq;
 
-       req = blk_get_request(sdev->request_queue, REQ_OP_DRV_IN, 0);
+       req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
                return;
        rq = scsi_req(req);
index 6ff2207..34412ea 100644 (file)
@@ -438,7 +438,7 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk,
                at_head = 1;
 
        ret = -ENOMEM;
-       rq = blk_get_request(sdev->request_queue, writing ?
+       rq = scsi_alloc_request(sdev->request_queue, writing ?
                             REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
@@ -490,7 +490,7 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk,
 out_free_cdb:
        scsi_req_free_cmd(req);
 out_put_request:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return ret;
 }
 
@@ -561,7 +561,7 @@ static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk,
 
        }
 
-       rq = blk_get_request(q, in_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
+       rq = scsi_alloc_request(q, in_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto error_free_buffer;
@@ -634,7 +634,7 @@ static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk,
        }
 
 error:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
 
 error_free_buffer:
        kfree(buffer);
index 5726738..9c2b99e 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/hardirq.h>
 #include <linux/scatterlist.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-integrity.h>
 #include <linux/ratelimit.h>
 #include <asm/unaligned.h>
 
@@ -215,7 +216,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
        struct scsi_request *rq;
        int ret;
 
-       req = blk_get_request(sdev->request_queue,
+       req = scsi_alloc_request(sdev->request_queue,
                        data_direction == DMA_TO_DEVICE ?
                        REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
                        rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0);
@@ -259,7 +260,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
                scsi_normalize_sense(rq->sense, rq->sense_len, sshdr);
        ret = rq->result;
  out:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
        return ret;
 }
@@ -1078,9 +1079,6 @@ EXPORT_SYMBOL(scsi_alloc_sgtables);
  * This function initializes the members of struct scsi_cmnd that must be
  * initialized before request processing starts and that won't be
  * reinitialized if a SCSI command is requeued.
- *
- * Called from inside blk_get_request() for pass-through requests and from
- * inside scsi_init_command() for filesystem requests.
  */
 static void scsi_initialize_rq(struct request *rq)
 {
@@ -1097,6 +1095,18 @@ static void scsi_initialize_rq(struct request *rq)
        cmd->retries = 0;
 }
 
+struct request *scsi_alloc_request(struct request_queue *q,
+               unsigned int op, blk_mq_req_flags_t flags)
+{
+       struct request *rq;
+
+       rq = blk_mq_alloc_request(q, op, flags);
+       if (!IS_ERR(rq))
+               scsi_initialize_rq(rq);
+       return rq;
+}
+EXPORT_SYMBOL_GPL(scsi_alloc_request);
+
 /*
  * Only called when the request isn't completed by SCSI, and not freed by
  * SCSI
@@ -1783,7 +1793,7 @@ static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
 }
 
 
-static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx)
+static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
        struct Scsi_Host *shost = hctx->driver_data;
 
@@ -1863,7 +1873,6 @@ static const struct blk_mq_ops scsi_mq_ops_no_commit = {
 #endif
        .init_request   = scsi_mq_init_request,
        .exit_request   = scsi_mq_exit_request,
-       .initialize_rq_fn = scsi_initialize_rq,
        .cleanup_rq     = scsi_cleanup_rq,
        .busy           = scsi_mq_lld_busy,
        .map_queues     = scsi_map_queues,
@@ -1893,7 +1902,6 @@ static const struct blk_mq_ops scsi_mq_ops = {
 #endif
        .init_request   = scsi_mq_init_request,
        .exit_request   = scsi_mq_exit_request,
-       .initialize_rq_fn = scsi_initialize_rq,
        .cleanup_rq     = scsi_cleanup_rq,
        .busy           = scsi_mq_lld_busy,
        .map_queues     = scsi_map_queues,
@@ -1959,6 +1967,14 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q)
 
        return sdev;
 }
+/*
+ * pktcdvd should have been integrated into the SCSI layers, but for historical
+ * reasons, like the old IDE driver, it isn't.  This export allows it to safely
+ * probe if a given device is a SCSI one and only attach to that.
+ */
+#ifdef CONFIG_CDROM_PKTCDVD_MODULE
+EXPORT_SYMBOL_GPL(scsi_device_from_queue);
+#endif
 
 /**
  * scsi_block_requests - Utility function used by low-level drivers to prevent
index fe22191..2808c0c 100644 (file)
@@ -280,7 +280,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
        sdev->request_queue = q;
        q->queuedata = sdev;
        __scsi_init_queue(sdev->host, q);
-       blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
        WARN_ON_ONCE(!blk_get_queue(q));
 
        depth = sdev->host->cmd_per_lun ?: 1;
index fce6333..252e43d 100644 (file)
@@ -48,6 +48,7 @@
 #include <linux/blkpg.h>
 #include <linux/blk-pm.h>
 #include <linux/delay.h>
+#include <linux/major.h>
 #include <linux/mutex.h>
 #include <linux/string_helpers.h>
 #include <linux/async.h>
@@ -1756,6 +1757,44 @@ static void sd_rescan(struct device *dev)
        sd_revalidate_disk(sdkp->disk);
 }
 
+static int sd_get_unique_id(struct gendisk *disk, u8 id[16],
+               enum blk_unique_id type)
+{
+       struct scsi_device *sdev = scsi_disk(disk)->device;
+       const struct scsi_vpd *vpd;
+       const unsigned char *d;
+       int ret = -ENXIO, len;
+
+       rcu_read_lock();
+       vpd = rcu_dereference(sdev->vpd_pg83);
+       if (!vpd)
+               goto out_unlock;
+
+       ret = -EINVAL;
+       for (d = vpd->data + 4; d < vpd->data + vpd->len; d += d[3] + 4) {
+               /* we only care about designators with LU association */
+               if (((d[1] >> 4) & 0x3) != 0x00)
+                       continue;
+               if ((d[1] & 0xf) != type)
+                       continue;
+
+               /*
+                * Only exit early if a 16-byte descriptor was found.  Otherwise
+                * keep looking as one with more entropy might still show up.
+                */
+               len = d[3];
+               if (len != 8 && len != 12 && len != 16)
+                       continue;
+               ret = len;
+               memcpy(id, d + 4, len);
+               if (len == 16)
+                       break;
+       }
+out_unlock:
+       rcu_read_unlock();
+       return ret;
+}
+
 static char sd_pr_type(enum pr_type type)
 {
        switch (type) {
@@ -1860,6 +1899,7 @@ static const struct block_device_operations sd_fops = {
        .check_events           = sd_check_events,
        .unlock_native_capacity = sd_unlock_native_capacity,
        .report_zones           = sd_zbc_report_zones,
+       .get_unique_id          = sd_get_unique_id,
        .pr_ops                 = &sd_pr_ops,
 };
 
@@ -3087,6 +3127,86 @@ static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer)
                sdkp->security = 1;
 }
 
+static inline sector_t sd64_to_sectors(struct scsi_disk *sdkp, u8 *buf)
+{
+       return logical_to_sectors(sdkp->device, get_unaligned_be64(buf));
+}
+
+/**
+ * sd_read_cpr - Query concurrent positioning ranges
+ * @sdkp:      disk to query
+ */
+static void sd_read_cpr(struct scsi_disk *sdkp)
+{
+       struct blk_independent_access_ranges *iars = NULL;
+       unsigned char *buffer = NULL;
+       unsigned int nr_cpr = 0;
+       int i, vpd_len, buf_len = SD_BUF_SIZE;
+       u8 *desc;
+
+       /*
+        * We need to have the capacity set first for the block layer to be
+        * able to check the ranges.
+        */
+       if (sdkp->first_scan)
+               return;
+
+       if (!sdkp->capacity)
+               goto out;
+
+       /*
+        * Concurrent Positioning Ranges VPD: there can be at most 256 ranges,
+        * leading to a maximum page size of 64 + 256*32 bytes.
+        */
+       buf_len = 64 + 256*32;
+       buffer = kmalloc(buf_len, GFP_KERNEL);
+       if (!buffer || scsi_get_vpd_page(sdkp->device, 0xb9, buffer, buf_len))
+               goto out;
+
+       /* We must have at least a 64B header and one 32B range descriptor */
+       vpd_len = get_unaligned_be16(&buffer[2]) + 3;
+       if (vpd_len > buf_len || vpd_len < 64 + 32 || (vpd_len & 31)) {
+               sd_printk(KERN_ERR, sdkp,
+                         "Invalid Concurrent Positioning Ranges VPD page\n");
+               goto out;
+       }
+
+       nr_cpr = (vpd_len - 64) / 32;
+       if (nr_cpr == 1) {
+               nr_cpr = 0;
+               goto out;
+       }
+
+       iars = disk_alloc_independent_access_ranges(sdkp->disk, nr_cpr);
+       if (!iars) {
+               nr_cpr = 0;
+               goto out;
+       }
+
+       desc = &buffer[64];
+       for (i = 0; i < nr_cpr; i++, desc += 32) {
+               if (desc[0] != i) {
+                       sd_printk(KERN_ERR, sdkp,
+                               "Invalid Concurrent Positioning Range number\n");
+                       nr_cpr = 0;
+                       break;
+               }
+
+               iars->ia_range[i].sector = sd64_to_sectors(sdkp, desc + 8);
+               iars->ia_range[i].nr_sectors = sd64_to_sectors(sdkp, desc + 16);
+       }
+
+out:
+       disk_set_independent_access_ranges(sdkp->disk, iars);
+       if (nr_cpr && sdkp->nr_actuators != nr_cpr) {
+               sd_printk(KERN_NOTICE, sdkp,
+                         "%u concurrent positioning ranges\n", nr_cpr);
+               sdkp->nr_actuators = nr_cpr;
+       }
+
+       kfree(buffer);
+}
+
 /*
  * Determine the device's preferred I/O size for reads and writes
  * unless the reported value is unreasonably small, large, not a
@@ -3202,6 +3322,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
                sd_read_app_tag_own(sdkp, buffer);
                sd_read_write_same(sdkp, buffer);
                sd_read_security(sdkp, buffer);
+               sd_read_cpr(sdkp);
        }
 
        /*
index b59136c..2e5932b 100644 (file)
@@ -106,6 +106,7 @@ struct scsi_disk {
        u8              protection_type;/* Data Integrity Field */
        u8              provisioning_mode;
        u8              zeroing_mode;
+       u8              nr_actuators;           /* Number of actuators */
        unsigned        ATO : 1;        /* state of disk ATO bit */
        unsigned        cache_override : 1; /* temp override of WCE,RCD */
        unsigned        WCE : 1;        /* state of disk WCE bit */
index 4cadb26..3499506 100644 (file)
@@ -6,7 +6,7 @@
  * Written by: Martin K. Petersen <martin.petersen@oracle.com>
  */
 
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/t10-pi.h>
 
 #include <scsi/scsi.h>
index 8f05248..141099a 100644 (file)
@@ -31,6 +31,7 @@ static int sg_version_num = 30536;    /* 2 digits for each component */
 #include <linux/errno.h>
 #include <linux/mtio.h>
 #include <linux/ioctl.h>
+#include <linux/major.h>
 #include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/init.h>
@@ -814,7 +815,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
        if (atomic_read(&sdp->detaching)) {
                if (srp->bio) {
                        scsi_req_free_cmd(scsi_req(srp->rq));
-                       blk_put_request(srp->rq);
+                       blk_mq_free_request(srp->rq);
                        srp->rq = NULL;
                }
 
@@ -1389,7 +1390,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status)
         */
        srp->rq = NULL;
        scsi_req_free_cmd(scsi_req(rq));
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
 
        write_lock_irqsave(&sfp->rq_list_lock, iflags);
        if (unlikely(srp->orphan)) {
@@ -1717,13 +1718,13 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
         *
         * With scsi-mq enabled, there are a fixed number of preallocated
         * requests equal in number to shost->can_queue.  If all of the
-        * preallocated requests are already in use, then blk_get_request()
+        * preallocated requests are already in use, then scsi_alloc_request()
         * will sleep until an active command completes, freeing up a request.
         * Although waiting in an asynchronous interface is less than ideal, we
         * do not want to use BLK_MQ_REQ_NOWAIT here because userspace might
         * not expect an EWOULDBLOCK from this condition.
         */
-       rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
+       rq = scsi_alloc_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
                        REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq)) {
                kfree(long_cmdp);
@@ -1829,7 +1830,7 @@ sg_finish_rem_req(Sg_request *srp)
 
        if (srp->rq) {
                scsi_req_free_cmd(scsi_req(srp->rq));
-               blk_put_request(srp->rq);
+               blk_mq_free_request(srp->rq);
        }
 
        if (srp->res_used)
index 8b17b35..3009b98 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/cdrom.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/major.h>
 #include <linux/blkdev.h>
 #include <linux/blk-pm.h>
 #include <linux/mutex.h>
@@ -966,7 +967,7 @@ static int sr_read_cdda_bpc(struct cdrom_device_info *cdi, void __user *ubuf,
        struct bio *bio;
        int ret;
 
-       rq = blk_get_request(disk->queue, REQ_OP_DRV_IN, 0);
+       rq = scsi_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        req = scsi_req(rq);
@@ -1002,7 +1003,7 @@ static int sr_read_cdda_bpc(struct cdrom_device_info *cdi, void __user *ubuf,
        if (blk_rq_unmap_user(bio))
                ret = -EFAULT;
 out_put_request:
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
        return ret;
 }
 
index ae8636d..c2d5608 100644 (file)
@@ -32,6 +32,7 @@ static const char *verstr = "20160209";
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/mtio.h>
+#include <linux/major.h>
 #include <linux/cdrom.h>
 #include <linux/ioctl.h>
 #include <linux/fcntl.h>
@@ -529,7 +530,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status)
                complete(SRpnt->waiting);
 
        blk_rq_unmap_user(tmp);
-       blk_put_request(req);
+       blk_mq_free_request(req);
 }
 
 static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
@@ -542,7 +543,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
        int err = 0;
        struct scsi_tape *STp = SRpnt->stp;
 
-       req = blk_get_request(SRpnt->stp->device->request_queue,
+       req = scsi_alloc_request(SRpnt->stp->device->request_queue,
                        data_direction == DMA_TO_DEVICE ?
                        REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(req))
@@ -556,7 +557,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
                err = blk_rq_map_user(req->q, req, mdata, NULL, bufflen,
                                      GFP_KERNEL);
                if (err) {
-                       blk_put_request(req);
+                       blk_mq_free_request(req);
                        return err;
                }
        }
index a14dd8c..bb2dd79 100644 (file)
@@ -642,9 +642,9 @@ static int exynos_ufs_pre_pwr_mode(struct ufs_hba *hba,
        }
 
        /* setting for three timeout values for traffic class #0 */
-       ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA0), 8064);
-       ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA1), 28224);
-       ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA2), 20160);
+       ufshcd_dme_set(hba, UIC_ARG_MIB(DL_FC0PROTTIMEOUTVAL), 8064);
+       ufshcd_dme_set(hba, UIC_ARG_MIB(DL_TC0REPLAYTIMEOUTVAL), 28224);
+       ufshcd_dme_set(hba, UIC_ARG_MIB(DL_AFC0REQTIMEOUTVAL), 20160);
 
        return 0;
 out:
index d70cdcd..67402ba 100644 (file)
@@ -48,11 +48,12 @@ out:
        return err;
 }
 
-static int ufshcd_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
+static int ufshcd_crypto_keyslot_program(struct blk_crypto_profile *profile,
                                         const struct blk_crypto_key *key,
                                         unsigned int slot)
 {
-       struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm);
+       struct ufs_hba *hba =
+               container_of(profile, struct ufs_hba, crypto_profile);
        const union ufs_crypto_cap_entry *ccap_array = hba->crypto_cap_array;
        const struct ufs_crypto_alg_entry *alg =
                        &ufs_crypto_algs[key->crypto_cfg.crypto_mode];
@@ -105,11 +106,12 @@ static int ufshcd_clear_keyslot(struct ufs_hba *hba, int slot)
        return ufshcd_program_key(hba, &cfg, slot);
 }
 
-static int ufshcd_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int ufshcd_crypto_keyslot_evict(struct blk_crypto_profile *profile,
                                       const struct blk_crypto_key *key,
                                       unsigned int slot)
 {
-       struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm);
+       struct ufs_hba *hba =
+               container_of(profile, struct ufs_hba, crypto_profile);
 
        return ufshcd_clear_keyslot(hba, slot);
 }
@@ -120,11 +122,11 @@ bool ufshcd_crypto_enable(struct ufs_hba *hba)
                return false;
 
        /* Reset might clear all keys, so reprogram all the keys. */
-       blk_ksm_reprogram_all_keys(&hba->ksm);
+       blk_crypto_reprogram_all_keys(&hba->crypto_profile);
        return true;
 }
 
-static const struct blk_ksm_ll_ops ufshcd_ksm_ops = {
+static const struct blk_crypto_ll_ops ufshcd_crypto_ops = {
        .keyslot_program        = ufshcd_crypto_keyslot_program,
        .keyslot_evict          = ufshcd_crypto_keyslot_evict,
 };
@@ -179,15 +181,16 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
        }
 
        /* The actual number of configurations supported is (CFGC+1) */
-       err = devm_blk_ksm_init(hba->dev, &hba->ksm,
-                               hba->crypto_capabilities.config_count + 1);
+       err = devm_blk_crypto_profile_init(
+                       hba->dev, &hba->crypto_profile,
+                       hba->crypto_capabilities.config_count + 1);
        if (err)
                goto out;
 
-       hba->ksm.ksm_ll_ops = ufshcd_ksm_ops;
+       hba->crypto_profile.ll_ops = ufshcd_crypto_ops;
        /* UFS only supports 8 bytes for any DUN */
-       hba->ksm.max_dun_bytes_supported = 8;
-       hba->ksm.dev = hba->dev;
+       hba->crypto_profile.max_dun_bytes_supported = 8;
+       hba->crypto_profile.dev = hba->dev;
 
        /*
         * Cache all the UFS crypto capabilities and advertise the supported
@@ -202,7 +205,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
                blk_mode_num = ufshcd_find_blk_crypto_mode(
                                                hba->crypto_cap_array[cap_idx]);
                if (blk_mode_num != BLK_ENCRYPTION_MODE_INVALID)
-                       hba->ksm.crypto_modes_supported[blk_mode_num] |=
+                       hba->crypto_profile.modes_supported[blk_mode_num] |=
                                hba->crypto_cap_array[cap_idx].sdus_mask * 512;
        }
 
@@ -230,9 +233,8 @@ void ufshcd_init_crypto(struct ufs_hba *hba)
                ufshcd_clear_keyslot(hba, slot);
 }
 
-void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
-                                           struct request_queue *q)
+void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q)
 {
        if (hba->caps & UFSHCD_CAP_CRYPTO)
-               blk_ksm_register(&hba->ksm, q);
+               blk_crypto_register(&hba->crypto_profile, q);
 }
index 78a58e7..e18c012 100644 (file)
@@ -18,7 +18,7 @@ static inline void ufshcd_prepare_lrbp_crypto(struct request *rq,
                return;
        }
 
-       lrbp->crypto_key_slot = blk_ksm_get_slot_idx(rq->crypt_keyslot);
+       lrbp->crypto_key_slot = blk_crypto_keyslot_index(rq->crypt_keyslot);
        lrbp->data_unit_num = rq->crypt_ctx->bc_dun[0];
 }
 
@@ -40,8 +40,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba);
 
 void ufshcd_init_crypto(struct ufs_hba *hba);
 
-void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
-                                           struct request_queue *q);
+void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q);
 
 #else /* CONFIG_SCSI_UFS_CRYPTO */
 
@@ -64,8 +63,8 @@ static inline int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
 
 static inline void ufshcd_init_crypto(struct ufs_hba *hba) { }
 
-static inline void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
-                                               struct request_queue *q) { }
+static inline void ufshcd_crypto_register(struct ufs_hba *hba,
+                                         struct request_queue *q) { }
 
 #endif /* CONFIG_SCSI_UFS_CRYPTO */
 
index 95be7ec..db1bc86 100644 (file)
@@ -2737,12 +2737,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 
        lrbp->req_abort_skip = false;
 
-       err = ufshpb_prep(hba, lrbp);
-       if (err == -EAGAIN) {
-               lrbp->cmd = NULL;
-               ufshcd_release(hba);
-               goto out;
-       }
+       ufshpb_prep(hba, lrbp);
 
        ufshcd_comp_scsi_upiu(hba, lrbp);
 
@@ -2925,7 +2920,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
         * Even though we use wait_event() which sleeps indefinitely,
         * the maximum wait time is bounded by SCSI request timeout.
         */
-       req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
+       req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out_unlock;
@@ -2952,7 +2947,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
                                    (struct utp_upiu_req *)lrbp->ucd_rsp_ptr);
 
 out:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 out_unlock:
        up_read(&hba->clk_scaling_lock);
        return err;
@@ -4986,7 +4981,7 @@ static int ufshcd_slave_configure(struct scsi_device *sdev)
        else if (ufshcd_is_rpm_autosuspend_allowed(hba))
                sdev->rpm_autosuspend = 1;
 
-       ufshcd_crypto_setup_rq_keyslot_manager(hba, q);
+       ufshcd_crypto_register(hba, q);
 
        return 0;
 }
@@ -6517,9 +6512,9 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba,
        int task_tag, err;
 
        /*
-        * blk_get_request() is used here only to get a free tag.
+        * blk_mq_alloc_request() is used here only to get a free tag.
         */
-       req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
+       req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
        if (IS_ERR(req))
                return PTR_ERR(req);
 
@@ -6575,7 +6570,7 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba,
        spin_unlock_irqrestore(hba->host->host_lock, flags);
 
        ufshcd_release(hba);
-       blk_put_request(req);
+       blk_mq_free_request(req);
 
        return err;
 }
@@ -6660,7 +6655,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 
        down_read(&hba->clk_scaling_lock);
 
-       req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
+       req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out_unlock;
@@ -6741,7 +6736,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
                                    (struct utp_upiu_req *)lrbp->ucd_rsp_ptr);
 
 out:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 out_unlock:
        up_read(&hba->clk_scaling_lock);
        return err;
@@ -7912,7 +7907,7 @@ static void ufshcd_request_sense_done(struct request *rq, blk_status_t error)
        if (error != BLK_STS_OK)
                pr_err("%s: REQUEST SENSE failed (%d)\n", __func__, error);
        kfree(rq->end_io_data);
-       blk_put_request(rq);
+       blk_mq_free_request(rq);
 }
 
 static int
@@ -7932,7 +7927,7 @@ ufshcd_request_sense_async(struct ufs_hba *hba, struct scsi_device *sdev)
        if (!buffer)
                return -ENOMEM;
 
-       req = blk_get_request(sdev->request_queue, REQ_OP_DRV_IN,
+       req = blk_mq_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
                              /*flags=*/BLK_MQ_REQ_PM);
        if (IS_ERR(req)) {
                ret = PTR_ERR(req);
@@ -7957,7 +7952,7 @@ ufshcd_request_sense_async(struct ufs_hba *hba, struct scsi_device *sdev)
        return 0;
 
 out_put:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 out_free:
        kfree(buffer);
        return ret;
index 41f6e06..62bdc41 100644 (file)
@@ -32,7 +32,7 @@
 #include <linux/regulator/consumer.h>
 #include <linux/bitfield.h>
 #include <linux/devfreq.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 #include "unipro.h"
 
 #include <asm/irq.h>
@@ -766,7 +766,7 @@ struct ufs_hba_monitor {
  * @crypto_capabilities: Content of crypto capabilities register (0x100)
  * @crypto_cap_array: Array of crypto capabilities
  * @crypto_cfg_register: Start of the crypto cfg array
- * @ksm: the keyslot manager tied to this hba
+ * @crypto_profile: the crypto profile of this hba (if applicable)
  */
 struct ufs_hba {
        void __iomem *mmio_base;
@@ -911,7 +911,7 @@ struct ufs_hba {
        union ufs_crypto_capabilities crypto_capabilities;
        union ufs_crypto_cap_entry *crypto_cap_array;
        u32 crypto_cfg_register;
-       struct blk_keyslot_manager ksm;
+       struct blk_crypto_profile crypto_profile;
 #endif
 #ifdef CONFIG_DEBUG_FS
        struct dentry *debugfs_root;
index 589af5f..182bcbf 100644 (file)
@@ -84,16 +84,6 @@ static bool ufshpb_is_supported_chunk(struct ufshpb_lu *hpb, int transfer_len)
        return transfer_len <= hpb->pre_req_max_tr_len;
 }
 
-/*
- * In this driver, WRITE_BUFFER CMD support 36KB (len=9) ~ 1MB (len=256) as
- * default. It is possible to change range of transfer_len through sysfs.
- */
-static inline bool ufshpb_is_required_wb(struct ufshpb_lu *hpb, int len)
-{
-       return len > hpb->pre_req_min_tr_len &&
-              len <= hpb->pre_req_max_tr_len;
-}
-
 static bool ufshpb_is_general_lun(int lun)
 {
        return lun < UFS_UPIU_MAX_UNIT_NUM_ID;
@@ -334,7 +324,7 @@ ufshpb_get_pos_from_lpn(struct ufshpb_lu *hpb, unsigned long lpn, int *rgn_idx,
 
 static void
 ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
-                           __be64 ppn, u8 transfer_len, int read_id)
+                           __be64 ppn, u8 transfer_len)
 {
        unsigned char *cdb = lrbp->cmd->cmnd;
        __be64 ppn_tmp = ppn;
@@ -346,256 +336,11 @@ ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
        /* ppn value is stored as big-endian in the host memory */
        memcpy(&cdb[6], &ppn_tmp, sizeof(__be64));
        cdb[14] = transfer_len;
-       cdb[15] = read_id;
+       cdb[15] = 0;
 
        lrbp->cmd->cmd_len = UFS_CDB_SIZE;
 }
 
-static inline void ufshpb_set_write_buf_cmd(unsigned char *cdb,
-                                           unsigned long lpn, unsigned int len,
-                                           int read_id)
-{
-       cdb[0] = UFSHPB_WRITE_BUFFER;
-       cdb[1] = UFSHPB_WRITE_BUFFER_PREFETCH_ID;
-
-       put_unaligned_be32(lpn, &cdb[2]);
-       cdb[6] = read_id;
-       put_unaligned_be16(len * HPB_ENTRY_SIZE, &cdb[7]);
-
-       cdb[9] = 0x00;  /* Control = 0x00 */
-}
-
-static struct ufshpb_req *ufshpb_get_pre_req(struct ufshpb_lu *hpb)
-{
-       struct ufshpb_req *pre_req;
-
-       if (hpb->num_inflight_pre_req >= hpb->throttle_pre_req) {
-               dev_info(&hpb->sdev_ufs_lu->sdev_dev,
-                        "pre_req throttle. inflight %d throttle %d",
-                        hpb->num_inflight_pre_req, hpb->throttle_pre_req);
-               return NULL;
-       }
-
-       pre_req = list_first_entry_or_null(&hpb->lh_pre_req_free,
-                                          struct ufshpb_req, list_req);
-       if (!pre_req) {
-               dev_info(&hpb->sdev_ufs_lu->sdev_dev, "There is no pre_req");
-               return NULL;
-       }
-
-       list_del_init(&pre_req->list_req);
-       hpb->num_inflight_pre_req++;
-
-       return pre_req;
-}
-
-static inline void ufshpb_put_pre_req(struct ufshpb_lu *hpb,
-                                     struct ufshpb_req *pre_req)
-{
-       pre_req->req = NULL;
-       bio_reset(pre_req->bio);
-       list_add_tail(&pre_req->list_req, &hpb->lh_pre_req_free);
-       hpb->num_inflight_pre_req--;
-}
-
-static void ufshpb_pre_req_compl_fn(struct request *req, blk_status_t error)
-{
-       struct ufshpb_req *pre_req = (struct ufshpb_req *)req->end_io_data;
-       struct ufshpb_lu *hpb = pre_req->hpb;
-       unsigned long flags;
-
-       if (error) {
-               struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
-               struct scsi_sense_hdr sshdr;
-
-               dev_err(&hpb->sdev_ufs_lu->sdev_dev, "block status %d", error);
-               scsi_command_normalize_sense(cmd, &sshdr);
-               dev_err(&hpb->sdev_ufs_lu->sdev_dev,
-                       "code %x sense_key %x asc %x ascq %x",
-                       sshdr.response_code,
-                       sshdr.sense_key, sshdr.asc, sshdr.ascq);
-               dev_err(&hpb->sdev_ufs_lu->sdev_dev,
-                       "byte4 %x byte5 %x byte6 %x additional_len %x",
-                       sshdr.byte4, sshdr.byte5,
-                       sshdr.byte6, sshdr.additional_length);
-       }
-
-       blk_mq_free_request(req);
-       spin_lock_irqsave(&hpb->rgn_state_lock, flags);
-       ufshpb_put_pre_req(pre_req->hpb, pre_req);
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-}
-
-static int ufshpb_prep_entry(struct ufshpb_req *pre_req, struct page *page)
-{
-       struct ufshpb_lu *hpb = pre_req->hpb;
-       struct ufshpb_region *rgn;
-       struct ufshpb_subregion *srgn;
-       __be64 *addr;
-       int offset = 0;
-       int copied;
-       unsigned long lpn = pre_req->wb.lpn;
-       int rgn_idx, srgn_idx, srgn_offset;
-       unsigned long flags;
-
-       addr = page_address(page);
-       ufshpb_get_pos_from_lpn(hpb, lpn, &rgn_idx, &srgn_idx, &srgn_offset);
-
-       spin_lock_irqsave(&hpb->rgn_state_lock, flags);
-
-next_offset:
-       rgn = hpb->rgn_tbl + rgn_idx;
-       srgn = rgn->srgn_tbl + srgn_idx;
-
-       if (!ufshpb_is_valid_srgn(rgn, srgn))
-               goto mctx_error;
-
-       if (!srgn->mctx)
-               goto mctx_error;
-
-       copied = ufshpb_fill_ppn_from_page(hpb, srgn->mctx, srgn_offset,
-                                          pre_req->wb.len - offset,
-                                          &addr[offset]);
-
-       if (copied < 0)
-               goto mctx_error;
-
-       offset += copied;
-       srgn_offset += copied;
-
-       if (srgn_offset == hpb->entries_per_srgn) {
-               srgn_offset = 0;
-
-               if (++srgn_idx == hpb->srgns_per_rgn) {
-                       srgn_idx = 0;
-                       rgn_idx++;
-               }
-       }
-
-       if (offset < pre_req->wb.len)
-               goto next_offset;
-
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-       return 0;
-mctx_error:
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-       return -ENOMEM;
-}
-
-static int ufshpb_pre_req_add_bio_page(struct ufshpb_lu *hpb,
-                                      struct request_queue *q,
-                                      struct ufshpb_req *pre_req)
-{
-       struct page *page = pre_req->wb.m_page;
-       struct bio *bio = pre_req->bio;
-       int entries_bytes, ret;
-
-       if (!page)
-               return -ENOMEM;
-
-       if (ufshpb_prep_entry(pre_req, page))
-               return -ENOMEM;
-
-       entries_bytes = pre_req->wb.len * sizeof(__be64);
-
-       ret = bio_add_pc_page(q, bio, page, entries_bytes, 0);
-       if (ret != entries_bytes) {
-               dev_err(&hpb->sdev_ufs_lu->sdev_dev,
-                       "bio_add_pc_page fail: %d", ret);
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-static inline int ufshpb_get_read_id(struct ufshpb_lu *hpb)
-{
-       if (++hpb->cur_read_id >= MAX_HPB_READ_ID)
-               hpb->cur_read_id = 1;
-       return hpb->cur_read_id;
-}
-
-static int ufshpb_execute_pre_req(struct ufshpb_lu *hpb, struct scsi_cmnd *cmd,
-                                 struct ufshpb_req *pre_req, int read_id)
-{
-       struct scsi_device *sdev = cmd->device;
-       struct request_queue *q = sdev->request_queue;
-       struct request *req;
-       struct scsi_request *rq;
-       struct bio *bio = pre_req->bio;
-
-       pre_req->hpb = hpb;
-       pre_req->wb.lpn = sectors_to_logical(cmd->device,
-                                            blk_rq_pos(scsi_cmd_to_rq(cmd)));
-       pre_req->wb.len = sectors_to_logical(cmd->device,
-                                            blk_rq_sectors(scsi_cmd_to_rq(cmd)));
-       if (ufshpb_pre_req_add_bio_page(hpb, q, pre_req))
-               return -ENOMEM;
-
-       req = pre_req->req;
-
-       /* 1. request setup */
-       blk_rq_append_bio(req, bio);
-       req->rq_disk = NULL;
-       req->end_io_data = (void *)pre_req;
-       req->end_io = ufshpb_pre_req_compl_fn;
-
-       /* 2. scsi_request setup */
-       rq = scsi_req(req);
-       rq->retries = 1;
-
-       ufshpb_set_write_buf_cmd(rq->cmd, pre_req->wb.lpn, pre_req->wb.len,
-                                read_id);
-       rq->cmd_len = scsi_command_size(rq->cmd);
-
-       if (blk_insert_cloned_request(q, req) != BLK_STS_OK)
-               return -EAGAIN;
-
-       hpb->stats.pre_req_cnt++;
-
-       return 0;
-}
-
-static int ufshpb_issue_pre_req(struct ufshpb_lu *hpb, struct scsi_cmnd *cmd,
-                               int *read_id)
-{
-       struct ufshpb_req *pre_req;
-       struct request *req = NULL;
-       unsigned long flags;
-       int _read_id;
-       int ret = 0;
-
-       req = blk_get_request(cmd->device->request_queue,
-                             REQ_OP_DRV_OUT | REQ_SYNC, BLK_MQ_REQ_NOWAIT);
-       if (IS_ERR(req))
-               return -EAGAIN;
-
-       spin_lock_irqsave(&hpb->rgn_state_lock, flags);
-       pre_req = ufshpb_get_pre_req(hpb);
-       if (!pre_req) {
-               ret = -EAGAIN;
-               goto unlock_out;
-       }
-       _read_id = ufshpb_get_read_id(hpb);
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-
-       pre_req->req = req;
-
-       ret = ufshpb_execute_pre_req(hpb, cmd, pre_req, _read_id);
-       if (ret)
-               goto free_pre_req;
-
-       *read_id = _read_id;
-
-       return ret;
-free_pre_req:
-       spin_lock_irqsave(&hpb->rgn_state_lock, flags);
-       ufshpb_put_pre_req(hpb, pre_req);
-unlock_out:
-       spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
-       blk_put_request(req);
-       return ret;
-}
-
 /*
  * This function will set up HPB read command using host-side L2P map data.
  */
@@ -609,7 +354,6 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
        __be64 ppn;
        unsigned long flags;
        int transfer_len, rgn_idx, srgn_idx, srgn_offset;
-       int read_id = 0;
        int err = 0;
 
        hpb = ufshpb_get_hpb_data(cmd->device);
@@ -685,24 +429,8 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
                dev_err(hba->dev, "get ppn failed. err %d\n", err);
                return err;
        }
-       if (!ufshpb_is_legacy(hba) &&
-           ufshpb_is_required_wb(hpb, transfer_len)) {
-               err = ufshpb_issue_pre_req(hpb, cmd, &read_id);
-               if (err) {
-                       unsigned long timeout;
-
-                       timeout = cmd->jiffies_at_alloc + msecs_to_jiffies(
-                                 hpb->params.requeue_timeout_ms);
-
-                       if (time_before(jiffies, timeout))
-                               return -EAGAIN;
-
-                       hpb->stats.miss_cnt++;
-                       return 0;
-               }
-       }
 
-       ufshpb_set_hpb_read_to_upiu(hba, lrbp, ppn, transfer_len, read_id);
+       ufshpb_set_hpb_read_to_upiu(hba, lrbp, ppn, transfer_len);
 
        hpb->stats.hit_cnt++;
        return 0;
@@ -721,7 +449,7 @@ static struct ufshpb_req *ufshpb_get_req(struct ufshpb_lu *hpb,
                return NULL;
 
 retry:
-       req = blk_get_request(hpb->sdev_ufs_lu->request_queue, dir,
+       req = blk_mq_alloc_request(hpb->sdev_ufs_lu->request_queue, dir,
                              BLK_MQ_REQ_NOWAIT);
 
        if (!atomic && (PTR_ERR(req) == -EWOULDBLOCK) && (--retries > 0)) {
@@ -745,7 +473,7 @@ free_rq:
 
 static void ufshpb_put_req(struct ufshpb_lu *hpb, struct ufshpb_req *rq)
 {
-       blk_put_request(rq->req);
+       blk_mq_free_request(rq->req);
        kmem_cache_free(hpb->map_req_cache, rq);
 }
 
@@ -1841,16 +1569,11 @@ static void ufshpb_lu_parameter_init(struct ufs_hba *hba,
        u32 entries_per_rgn;
        u64 rgn_mem_size, tmp;
 
-       /* for pre_req */
-       hpb->pre_req_min_tr_len = hpb_dev_info->max_hpb_single_cmd + 1;
-
        if (ufshpb_is_legacy(hba))
                hpb->pre_req_max_tr_len = HPB_LEGACY_CHUNK_HIGH;
        else
                hpb->pre_req_max_tr_len = HPB_MULTI_CHUNK_HIGH;
 
-       hpb->cur_read_id = 0;
-
        hpb->lu_pinned_start = hpb_lu_info->pinned_start;
        hpb->lu_pinned_end = hpb_lu_info->num_pinned ?
                (hpb_lu_info->pinned_start + hpb_lu_info->num_pinned - 1)
index a79e073..f15d8fd 100644 (file)
@@ -241,8 +241,6 @@ struct ufshpb_lu {
        spinlock_t param_lock;
 
        struct list_head lh_pre_req_free;
-       int cur_read_id;
-       int pre_req_min_tr_len;
        int pre_req_max_tr_len;
 
        /* cached L2P map management worker */
index 07d0250..b8455fc 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/virtio_scsi.h>
 #include <linux/cpu.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_cmnd.h>
index 44fc9ee..ca40923 100644 (file)
@@ -134,7 +134,7 @@ static int dfl_spi_altera_probe(struct dfl_device *dfl_dev)
        if (!master)
                return -ENOMEM;
 
-       master->bus_num = dfl_dev->id;
+       master->bus_num = -1;
 
        hw = spi_master_get_devdata(master);
 
index f7a7c14..65147aa 100644 (file)
@@ -48,7 +48,7 @@ static int altera_spi_probe(struct platform_device *pdev)
                return err;
 
        /* setup the master state. */
-       master->bus_num = pdev->id;
+       master->bus_num = -1;
 
        if (pdata) {
                if (pdata->num_chipselect > ALTERA_SPI_MAX_CS) {
index feebda6..e4484ac 100644 (file)
@@ -1716,12 +1716,13 @@ static int verify_controller_parameters(struct pl022 *pl022,
                                return -EINVAL;
                        }
                } else {
-                       if (chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX)
+                       if (chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX) {
                                dev_err(&pl022->adev->dev,
                                        "Microwire half duplex mode requested,"
                                        " but this is only available in the"
                                        " ST version of PL022\n");
-                       return -EINVAL;
+                               return -EINVAL;
+                       }
                }
        }
        return 0;
index ef4a8e1..8190b84 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/falloc.h>
 #include <linux/uio.h>
+#include <linux/scatterlist.h>
 #include <scsi/scsi_proto.h>
 #include <asm/unaligned.h>
 
@@ -244,7 +245,7 @@ struct target_core_file_cmd {
        struct bio_vec  bvecs[];
 };
 
-static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+static void cmd_rw_aio_complete(struct kiocb *iocb, long ret)
 {
        struct target_core_file_cmd *cmd;
 
@@ -302,7 +303,7 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
                ret = call_read_iter(file, &aio_cmd->iocb, &iter);
 
        if (ret != -EIOCBQUEUED)
-               cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0);
+               cmd_rw_aio_complete(&aio_cmd->iocb, ret);
 
        return 0;
 }
index 4069a1e..b1ef041 100644 (file)
 #include <linux/timer.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/bio.h>
 #include <linux/genhd.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/scatterlist.h>
 #include <scsi/scsi_proto.h>
 #include <asm/unaligned.h>
 
@@ -230,9 +232,9 @@ static unsigned long long iblock_emulate_read_cap_with_block_size(
        struct block_device *bd,
        struct request_queue *q)
 {
-       unsigned long long blocks_long = (div_u64(i_size_read(bd->bd_inode),
-                                       bdev_logical_block_size(bd)) - 1);
        u32 block_size = bdev_logical_block_size(bd);
+       unsigned long long blocks_long =
+               div_u64(bdev_nr_bytes(bd), block_size) - 1;
 
        if (block_size == dev->dev_attrib.block_size)
                return blocks_long;
index 75ef52f..7fa57fb 100644 (file)
@@ -980,11 +980,10 @@ pscsi_execute_cmd(struct se_cmd *cmd)
        memcpy(pt->pscsi_cdb, cmd->t_task_cdb,
                scsi_command_size(cmd->t_task_cdb));
 
-       req = blk_get_request(pdv->pdv_sd->request_queue,
+       req = scsi_alloc_request(pdv->pdv_sd->request_queue,
                        cmd->data_direction == DMA_TO_DEVICE ?
                        REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
        if (IS_ERR(req)) {
-               pr_err("PSCSI: blk_get_request() failed\n");
                ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
                goto fail;
        }
@@ -1012,7 +1011,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
        return 0;
 
 fail_put_request:
-       blk_put_request(req);
+       blk_mq_free_request(req);
 fail:
        kfree(pt);
        return ret;
@@ -1067,7 +1066,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status)
                break;
        }
 
-       blk_put_request(req);
+       blk_mq_free_request(req);
        kfree(pt);
 }
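
(A minimal sketch of the request lifecycle after this conversion; "q" and the
surrounding code are placeholders and header includes are omitted — only
scsi_alloc_request(), IS_ERR() and blk_mq_free_request() are taken from the
hunk above.)

	struct request *req;

	req = scsi_alloc_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);
	/* ... fill in the CDB and issue the passthrough command ... */
	blk_mq_free_request(req);	/* replaces blk_put_request() */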
 
index 8260f38..e20c19a 100644 (file)
@@ -831,7 +831,7 @@ static void ffs_user_copy_worker(struct work_struct *work)
                kthread_unuse_mm(io_data->mm);
        }
 
-       io_data->kiocb->ki_complete(io_data->kiocb, ret, ret);
+       io_data->kiocb->ki_complete(io_data->kiocb, ret);
 
        if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd)
                eventfd_signal(io_data->ffs->ffs_eventfd, 1);
index 539220d..78be947 100644 (file)
@@ -469,7 +469,7 @@ static void ep_user_copy_worker(struct work_struct *work)
                ret = -EFAULT;
 
        /* completing the iocb can drop the ctx and mm, don't touch mm after */
-       iocb->ki_complete(iocb, ret, ret);
+       iocb->ki_complete(iocb, ret);
 
        kfree(priv->buf);
        kfree(priv->to_free);
@@ -496,11 +496,8 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
                kfree(priv->to_free);
                kfree(priv);
                iocb->private = NULL;
-               /* aio_complete() reports bytes-transferred _and_ faults */
-
                iocb->ki_complete(iocb,
-                               req->actual ? req->actual : (long)req->status,
-                               req->status);
+                               req->actual ? req->actual : (long)req->status);
        } else {
                /* ep_copy_to_user() won't report both; we hide some faults */
                if (unlikely(0 != req->status))
index 26e3d90..841667a 100644 (file)
@@ -80,6 +80,7 @@ struct vduse_dev {
        struct vdpa_callback config_cb;
        struct work_struct inject;
        spinlock_t irq_lock;
+       struct rw_semaphore rwsem;
        int minor;
        bool broken;
        bool connected;
@@ -410,6 +411,8 @@ static void vduse_dev_reset(struct vduse_dev *dev)
        if (domain->bounce_map)
                vduse_domain_reset_bounce_map(domain);
 
+       down_write(&dev->rwsem);
+
        dev->status = 0;
        dev->driver_features = 0;
        dev->generation++;
@@ -443,6 +446,8 @@ static void vduse_dev_reset(struct vduse_dev *dev)
                flush_work(&vq->inject);
                flush_work(&vq->kick);
        }
+
+       up_write(&dev->rwsem);
 }
 
 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
@@ -885,6 +890,23 @@ static void vduse_vq_irq_inject(struct work_struct *work)
        spin_unlock_irq(&vq->irq_lock);
 }
 
+static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
+                                   struct work_struct *irq_work)
+{
+       int ret = -EINVAL;
+
+       down_read(&dev->rwsem);
+       if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
+               goto unlock;
+
+       ret = 0;
+       queue_work(vduse_irq_wq, irq_work);
+unlock:
+       up_read(&dev->rwsem);
+
+       return ret;
+}
+
 static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg)
 {
@@ -966,8 +988,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
                break;
        }
        case VDUSE_DEV_INJECT_CONFIG_IRQ:
-               ret = 0;
-               queue_work(vduse_irq_wq, &dev->inject);
+               ret = vduse_dev_queue_irq_work(dev, &dev->inject);
                break;
        case VDUSE_VQ_SETUP: {
                struct vduse_vq_config config;
@@ -1053,9 +1074,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
                if (index >= dev->vq_num)
                        break;
 
-               ret = 0;
                index = array_index_nospec(index, dev->vq_num);
-               queue_work(vduse_irq_wq, &dev->vqs[index].inject);
+               ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
                break;
        }
        default:
@@ -1136,6 +1156,7 @@ static struct vduse_dev *vduse_dev_create(void)
        INIT_LIST_HEAD(&dev->send_list);
        INIT_LIST_HEAD(&dev->recv_list);
        spin_lock_init(&dev->irq_lock);
+       init_rwsem(&dev->rwsem);
 
        INIT_WORK(&dev->inject, vduse_dev_irq_inject);
        init_waitqueue_head(&dev->waitq);
index dd95dfd..3035bb6 100644 (file)
@@ -576,7 +576,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
        /* Last one doesn't continue. */
        desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
        if (!indirect && vq->use_dma_api)
-               vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags =
+               vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
                        ~VRING_DESC_F_NEXT;
 
        if (indirect) {
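
(Illustration only, not code from the patch: the snippet just shows the bit
arithmetic behind the one-character fix above.)

	u16 flags = VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT;

	flags &= ~VRING_DESC_F_NEXT;	/* clears NEXT only; INDIRECT survives */
	/* plain "flags = ~VRING_DESC_F_NEXT" would instead set every bit
	 * except NEXT, corrupting the saved descriptor flags. */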
index 643c6c2..ced2fc0 100644 (file)
@@ -71,8 +71,6 @@
 #define TCOBASE(p)     ((p)->tco_res->start)
 /* SMI Control and Enable Register */
 #define SMI_EN(p)      ((p)->smi_res->start)
-#define TCO_EN         (1 << 13)
-#define GBL_SMI_EN     (1 << 0)
 
 #define TCO_RLD(p)     (TCOBASE(p) + 0x00) /* TCO Timer Reload/Curr. Value */
 #define TCOv1_TMR(p)   (TCOBASE(p) + 0x01) /* TCOv1 Timer Initial Value*/
@@ -357,12 +355,8 @@ static int iTCO_wdt_set_timeout(struct watchdog_device *wd_dev, unsigned int t)
 
        tmrval = seconds_to_ticks(p, t);
 
-       /*
-        * If TCO SMIs are off, the timer counts down twice before rebooting.
-        * Otherwise, the BIOS generally reboots when the SMI triggers.
-        */
-       if (p->smi_res &&
-           (inl(SMI_EN(p)) & (TCO_EN | GBL_SMI_EN)) != (TCO_EN | GBL_SMI_EN))
+       /* For TCO v1 the timer counts down twice before rebooting */
+       if (p->iTCO_version == 1)
                tmrval /= 2;
 
        /* from the specs: */
@@ -527,7 +521,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev)
                 * Disables TCO logic generating an SMI#
                 */
                val32 = inl(SMI_EN(p));
-               val32 &= ~TCO_EN;       /* Turn off SMI clearing watchdog */
+               val32 &= 0xffffdfff;    /* Turn off SMI clearing watchdog */
                outl(val32, SMI_EN(p));
        }
 
index 2693ffb..31b03fa 100644 (file)
@@ -119,7 +119,7 @@ static int ixp4xx_wdt_probe(struct platform_device *pdev)
        iwdt = devm_kzalloc(dev, sizeof(*iwdt), GFP_KERNEL);
        if (!iwdt)
                return -ENOMEM;
-       iwdt->base = dev->platform_data;
+       iwdt->base = (void __iomem *)dev->platform_data;
 
        /*
         * Retrieve rate from a fixed clock from the device tree if
index 1616f93..74d785b 100644 (file)
@@ -268,8 +268,12 @@ static int omap_wdt_probe(struct platform_device *pdev)
                        wdev->wdog.bootstatus = WDIOF_CARDRESET;
        }
 
-       if (!early_enable)
+       if (early_enable) {
+               omap_wdt_start(&wdev->wdog);
+               set_bit(WDOG_HW_RUNNING, &wdev->wdog.status);
+       } else {
                omap_wdt_disable(wdev);
+       }
 
        ret = watchdog_register_device(&wdev->wdog);
        if (ret) {
index ee9ff38..9791c74 100644 (file)
@@ -130,7 +130,7 @@ static u64 sbsa_gwdt_reg_read(struct sbsa_gwdt *gwdt)
        if (gwdt->version == 0)
                return readl(gwdt->control_base + SBSA_GWDT_WOR);
        else
-               return readq(gwdt->control_base + SBSA_GWDT_WOR);
+               return lo_hi_readq(gwdt->control_base + SBSA_GWDT_WOR);
 }
 
 static void sbsa_gwdt_reg_write(u64 val, struct sbsa_gwdt *gwdt)
@@ -138,7 +138,7 @@ static void sbsa_gwdt_reg_write(u64 val, struct sbsa_gwdt *gwdt)
        if (gwdt->version == 0)
                writel((u32)val, gwdt->control_base + SBSA_GWDT_WOR);
        else
-               writeq(val, gwdt->control_base + SBSA_GWDT_WOR);
+               lo_hi_writeq(val, gwdt->control_base + SBSA_GWDT_WOR);
 }
 
 /*
@@ -411,4 +411,3 @@ MODULE_AUTHOR("Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>");
 MODULE_AUTHOR("Al Stone <al.stone@linaro.org>");
 MODULE_AUTHOR("Timur Tabi <timur@codeaurora.org>");
 MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:" DRV_NAME);
index c6c2a51..c609005 100644 (file)
@@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
         * blocks, we will have to change it.
         */
 
-       size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+       size = bdev_nr_sectors(sb->s_bdev);
        pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
 
        affs_set_blocksize(sb, PAGE_SIZE);
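
(The same open-coded bd_inode size reads are converted throughout this series;
a standalone sketch of the two helpers, with "bdev" as a placeholder.)

	loff_t   bytes   = bdev_nr_bytes(bdev);		/* capacity in bytes */
	sector_t sectors = bdev_nr_sectors(bdev);	/* capacity in 512-byte sectors */

	/* sectors == bytes >> SECTOR_SHIFT, i.e. the ">> 9" that used to be
	 * open-coded on i_size_read(bdev->bd_inode) as above. */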
index f24370f..8b1d9c2 100644 (file)
@@ -861,7 +861,8 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  */
 vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
 {
-       struct page *page = thp_head(vmf->page);
+       struct folio *folio = page_folio(vmf->page);
+       struct page *page = &folio->page;
        struct file *file = vmf->vma->vm_file;
        struct inode *inode = file_inode(file);
        struct afs_vnode *vnode = AFS_FS_I(inode);
@@ -884,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
                goto out;
 #endif
 
-       if (wait_on_page_writeback_killable(page))
+       if (folio_wait_writeback_killable(folio))
                goto out;
 
        if (lock_page_killable(page) < 0)
@@ -894,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
         * details the portion of the page we need to write back and we might
         * need to redirty the page if there's a problem.
         */
-       if (wait_on_page_writeback_killable(page) < 0) {
-               unlock_page(page);
+       if (folio_wait_writeback_killable(folio) < 0) {
+               folio_unlock(folio);
                goto out;
        }
 
index 51b08ab..836dc7e 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1417,7 +1417,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb)
        spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 }
 
-static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void aio_complete_rw(struct kiocb *kiocb, long res)
 {
        struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
 
@@ -1437,7 +1437,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
        }
 
        iocb->ki_res.res = res;
-       iocb->ki_res.res2 = res2;
+       iocb->ki_res.res2 = 0;
        iocb_put(iocb);
 }
 
@@ -1508,7 +1508,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
                ret = -EINTR;
                fallthrough;
        default:
-               req->ki_complete(req, ret, 0);
+               req->ki_complete(req, ret);
        }
 }
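
(The same two-argument to one-argument ->ki_complete() conversion repeats in
the target, USB gadget, cachefiles, ceph, cifs and direct-io hunks of this
series; a minimal sketch of a completion site after the change, with "iocb"
and "ret" as placeholders.)

	iocb->ki_complete(iocb, ret);	/* was: iocb->ki_complete(iocb, ret, 0) */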
 
index 1d071c8..32da97c 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/kthread.h>
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
@@ -173,9 +174,10 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
                /* Hash through the page sector by sector */
                for (pg_offset = 0; pg_offset < bytes_left;
                     pg_offset += sectorsize) {
-                       kaddr = page_address(page);
+                       kaddr = kmap_atomic(page);
                        crypto_shash_digest(shash, kaddr + pg_offset,
                                            sectorsize, csum);
+                       kunmap_atomic(kaddr);
 
                        if (memcmp(&csum, cb_sum, csum_size) != 0) {
                                btrfs_print_data_csum_error(inode, disk_start,
index 74c8e18..c3983bd 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/mm.h>
+#include <linux/error-injection.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
index 59ef388..c85a7d4 100644 (file)
@@ -281,8 +281,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
        }
 
 
-       if (i_size_read(bdev->bd_inode) <
-           btrfs_device_get_total_bytes(srcdev)) {
+       if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
                btrfs_err(fs_info,
                          "target device is smaller than source device!");
                ret = -EINVAL;
index c725433..59c3be8 100644 (file)
@@ -3748,7 +3748,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
        else if (ret)
                return ERR_PTR(ret);
 
-       if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+       if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
                return ERR_PTR(-EINVAL);
 
        page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
index 5fec009..b8c911a 100644 (file)
@@ -6,6 +6,7 @@
 #include <crypto/hash.h>
 #include <linux/kernel.h>
 #include <linux/bio.h>
+#include <linux/blk-cgroup.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -287,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);
 
-                       kaddr = page_address(cpage);
+                       kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
+                       kunmap_atomic(kaddr);
 
                        i++;
                        ptr += cur_size;
@@ -8261,7 +8263,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
        return dip;
 }
 
-static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+static void btrfs_submit_direct(const struct iomap_iter *iter,
                struct bio *dio_bio, loff_t file_offset)
 {
        struct inode *inode = iter->inode;
@@ -8291,7 +8293,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
                }
                dio_bio->bi_status = BLK_STS_RESOURCE;
                bio_endio(dio_bio);
-               return BLK_QC_T_NONE;
+               return;
        }
 
        if (!write) {
@@ -8384,15 +8386,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
 
                free_extent_map(em);
        } while (submit_len > 0);
-       return BLK_QC_T_NONE;
+       return;
 
 out_err_em:
        free_extent_map(em);
 out_err:
        dip->dio_bio->bi_status = status;
        btrfs_dio_private_put(dip);
-
-       return BLK_QC_T_NONE;
 }
 
 const struct iomap_ops btrfs_dio_iomap_ops = {
index 92424a2..02ff085 100644 (file)
@@ -1691,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        }
 
        if (!strcmp(sizestr, "max"))
-               new_size = device->bdev->bd_inode->i_size;
+               new_size = bdev_nr_bytes(device->bdev);
        else {
                if (sizestr[0] == '-') {
                        mod = -1;
@@ -1732,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                ret = -EINVAL;
                goto out_finish;
        }
-       if (new_size > device->bdev->bd_inode->i_size) {
+       if (new_size > bdev_nr_bytes(device->bdev)) {
                ret = -EFBIG;
                goto out_finish;
        }
index 00cffc1..65cb076 100644 (file)
@@ -131,6 +131,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
        u32 sector_bytes_left;
        u32 orig_out;
        struct page *cur_page;
+       char *kaddr;
 
        /*
         * We never allow a segment header crossing sector boundary, previous
@@ -147,7 +148,8 @@ static int copy_compressed_data_to_page(char *compressed_data,
                out_pages[*cur_out / PAGE_SIZE] = cur_page;
        }
 
-       write_compress_length(page_address(cur_page) + offset_in_page(*cur_out),
+       kaddr = kmap(cur_page);
+       write_compress_length(kaddr + offset_in_page(*cur_out),
                              compressed_size);
        *cur_out += LZO_LEN;
 
@@ -158,6 +160,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
                u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
                                     orig_out + compressed_size - *cur_out);
 
+               kunmap(cur_page);
                cur_page = out_pages[*cur_out / PAGE_SIZE];
                /* Allocate a new page */
                if (!cur_page) {
@@ -166,8 +169,9 @@ static int copy_compressed_data_to_page(char *compressed_data,
                                return -ENOMEM;
                        out_pages[*cur_out / PAGE_SIZE] = cur_page;
                }
+               kaddr = kmap(cur_page);
 
-               memcpy(page_address(cur_page) + offset_in_page(*cur_out),
+               memcpy(kaddr + offset_in_page(*cur_out),
                       compressed_data + *cur_out - orig_out, copy_len);
 
                *cur_out += copy_len;
@@ -179,12 +183,15 @@ static int copy_compressed_data_to_page(char *compressed_data,
         */
        sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
        if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
-               return 0;
+               goto out;
 
        /* The remaining size is not enough, pad it with zeros */
-       memset(page_address(cur_page) + offset_in_page(*cur_out), 0,
+       memset(kaddr + offset_in_page(*cur_out), 0,
               sector_bytes_left);
        *cur_out += sector_bytes_left;
+
+out:
+       kunmap(cur_page);
        return 0;
 }
 
@@ -195,6 +202,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
        struct workspace *workspace = list_entry(ws, struct workspace, list);
        const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
        struct page *page_in = NULL;
+       char *sizes_ptr;
        int ret = 0;
        /* Points to the file offset of input data */
        u64 cur_in = start;
@@ -212,6 +220,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
         */
        cur_out += LZO_LEN;
        while (cur_in < start + len) {
+               char *data_in;
                const u32 sectorsize_mask = sectorsize - 1;
                u32 sector_off = (cur_in - start) & sectorsize_mask;
                u32 in_len;
@@ -226,10 +235,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
                /* Compress at most one sector of data each time */
                in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
                ASSERT(in_len);
-               ret = lzo1x_1_compress(page_address(page_in) +
+               data_in = kmap(page_in);
+               ret = lzo1x_1_compress(data_in +
                                       offset_in_page(cur_in), in_len,
                                       workspace->cbuf, &out_len,
                                       workspace->mem);
+               kunmap(page_in);
                if (ret < 0) {
                        pr_debug("BTRFS: lzo in loop returned %d\n", ret);
                        ret = -EIO;
@@ -260,7 +271,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
        }
 
        /* Store the size of all chunks of compressed data */
-       write_compress_length(page_address(pages[0]), cur_out);
+       sizes_ptr = kmap_local_page(pages[0]);
+       write_compress_length(sizes_ptr, cur_out);
+       kunmap_local(sizes_ptr);
 
        ret = 0;
        *total_out = cur_out;
@@ -281,6 +294,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
        u32 orig_in = *cur_in;
 
        while (*cur_in < orig_in + len) {
+               char *kaddr;
                struct page *cur_page;
                u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
                                          orig_in + len - *cur_in);
@@ -288,9 +302,11 @@ static void copy_compressed_segment(struct compressed_bio *cb,
                ASSERT(copy_len);
                cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
 
+               kaddr = kmap(cur_page);
                memcpy(dest + *cur_in - orig_in,
-                       page_address(cur_page) + offset_in_page(*cur_in),
+                       kaddr + offset_in_page(*cur_in),
                        copy_len);
+               kunmap(cur_page);
 
                *cur_in += copy_len;
        }
@@ -301,6 +317,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        struct workspace *workspace = list_entry(ws, struct workspace, list);
        const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
        const u32 sectorsize = fs_info->sectorsize;
+       char *kaddr;
        int ret;
        /* Compressed data length, can be unaligned */
        u32 len_in;
@@ -309,7 +326,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        /* Bytes decompressed so far */
        u32 cur_out = 0;
 
-       len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+       kaddr = kmap(cb->compressed_pages[0]);
+       len_in = read_compress_length(kaddr);
+       kunmap(cb->compressed_pages[0]);
        cur_in += LZO_LEN;
 
        /*
@@ -343,8 +362,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                       (cur_in + LZO_LEN - 1) / sectorsize);
                cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
                ASSERT(cur_page);
-               seg_len = read_compress_length(page_address(cur_page) +
-                                              offset_in_page(cur_in));
+               kaddr = kmap(cur_page);
+               seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+               kunmap(cur_page);
                cur_in += LZO_LEN;
 
                /* Copy the compressed segment payload into workspace */
@@ -429,7 +449,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
        destlen = min_t(unsigned long, destlen, PAGE_SIZE);
        bytes = min_t(unsigned long, destlen, out_len - start_byte);
 
-       kaddr = page_address(dest_page);
+       kaddr = kmap_local_page(dest_page);
        memcpy(kaddr, workspace->buf + start_byte, bytes);
 
        /*
@@ -439,6 +459,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
         */
        if (bytes < destlen)
                memset(kaddr+bytes, 0, destlen-bytes);
+       kunmap_local(kaddr);
 out:
        return ret;
 }
index 546bf11..61ac57b 100644 (file)
@@ -509,7 +509,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
        }
 
        if (flush)
-               filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+               sync_blockdev(*bdev);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
@@ -1293,7 +1293,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
        pgoff_t index;
 
        /* make sure our super fits in the device */
-       if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+       if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
                return ERR_PTR(-EINVAL);
 
        /* make sure our super fits in the page */
@@ -2657,8 +2657,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
        device->io_width = fs_info->sectorsize;
        device->io_align = fs_info->sectorsize;
        device->sector_size = fs_info->sectorsize;
-       device->total_bytes = round_down(i_size_read(bdev->bd_inode),
-                                        fs_info->sectorsize);
+       device->total_bytes =
+               round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
        device->disk_total_bytes = device->total_bytes;
        device->commit_total_bytes = device->total_bytes;
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -7313,7 +7313,7 @@ static int read_one_dev(struct extent_buffer *leaf,
 
        fill_device_from_item(leaf, dev_item, device);
        if (device->bdev) {
-               u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+               u64 max_total_bytes = bdev_nr_bytes(device->bdev);
 
                if (device->total_bytes > max_total_bytes) {
                        btrfs_err(fs_info,
index 8afa900..767a0c6 100644 (file)
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                ret = -ENOMEM;
                goto out;
        }
-       cpage_out = page_address(out_page);
+       cpage_out = kmap(out_page);
        pages[0] = out_page;
        nr_pages = 1;
 
@@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                                int i;
 
                                for (i = 0; i < in_buf_pages; i++) {
-                                       if (in_page)
+                                       if (in_page) {
+                                               kunmap(in_page);
                                                put_page(in_page);
+                                       }
                                        in_page = find_get_page(mapping,
                                                                start >> PAGE_SHIFT);
-                                       data_in = page_address(in_page);
+                                       data_in = kmap(in_page);
                                        memcpy(workspace->buf + i * PAGE_SIZE,
                                               data_in, PAGE_SIZE);
                                        start += PAGE_SIZE;
                                }
                                workspace->strm.next_in = workspace->buf;
                        } else {
-                               if (in_page)
+                               if (in_page) {
+                                       kunmap(in_page);
                                        put_page(in_page);
+                               }
                                in_page = find_get_page(mapping,
                                                        start >> PAGE_SHIFT);
-                               data_in = page_address(in_page);
+                               data_in = kmap(in_page);
                                start += PAGE_SIZE;
                                workspace->strm.next_in = data_in;
                        }
@@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                 * the stream end if required
                 */
                if (workspace->strm.avail_out == 0) {
+                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
@@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                                ret = -ENOMEM;
                                goto out;
                        }
-                       cpage_out = page_address(out_page);
+                       cpage_out = kmap(out_page);
                        pages[nr_pages] = out_page;
                        nr_pages++;
                        workspace->strm.avail_out = PAGE_SIZE;
@@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                        goto out;
                } else if (workspace->strm.avail_out == 0) {
                        /* get another page for the stream end */
+                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
@@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
                                ret = -ENOMEM;
                                goto out;
                        }
-                       cpage_out = page_address(out_page);
+                       cpage_out = kmap(out_page);
                        pages[nr_pages] = out_page;
                        nr_pages++;
                        workspace->strm.avail_out = PAGE_SIZE;
@@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
        *total_in = workspace->strm.total_in;
 out:
        *out_pages = nr_pages;
-       if (in_page)
+       if (out_page)
+               kunmap(out_page);
+
+       if (in_page) {
+               kunmap(in_page);
                put_page(in_page);
+       }
        return ret;
 }
 
@@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        unsigned long buf_start;
        struct page **pages_in = cb->compressed_pages;
 
-       data_in = page_address(pages_in[page_in_index]);
+       data_in = kmap(pages_in[page_in_index]);
        workspace->strm.next_in = data_in;
        workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
        workspace->strm.total_in = 0;
@@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
        if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
                pr_warn("BTRFS: inflateInit failed\n");
+               kunmap(pages_in[page_in_index]);
                return -EIO;
        }
        while (workspace->strm.total_in < srclen) {
@@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
                if (workspace->strm.avail_in == 0) {
                        unsigned long tmp;
-
+                       kunmap(pages_in[page_in_index]);
                        page_in_index++;
                        if (page_in_index >= total_pages_in) {
                                data_in = NULL;
                                break;
                        }
-                       data_in = page_address(pages_in[page_in_index]);
+                       data_in = kmap(pages_in[page_in_index]);
                        workspace->strm.next_in = data_in;
                        tmp = srclen - workspace->strm.total_in;
                        workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                ret = 0;
 done:
        zlib_inflateEnd(&workspace->strm);
+       if (data_in)
+               kunmap(pages_in[page_in_index]);
        if (!ret)
                zero_fill_bio(cb->orig_bio);
        return ret;
index 56dce9f..f06b680 100644 (file)
@@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
        /* map in the first page of input data */
        in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-       workspace->in_buf.src = page_address(in_page);
+       workspace->in_buf.src = kmap(in_page);
        workspace->in_buf.pos = 0;
        workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
 
@@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                goto out;
        }
        pages[nr_pages++] = out_page;
-       workspace->out_buf.dst = page_address(out_page);
+       workspace->out_buf.dst = kmap(out_page);
        workspace->out_buf.pos = 0;
        workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
 
@@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                if (workspace->out_buf.pos == workspace->out_buf.size) {
                        tot_out += PAGE_SIZE;
                        max_out -= PAGE_SIZE;
+                       kunmap(out_page);
                        if (nr_pages == nr_dest_pages) {
                                out_page = NULL;
                                ret = -E2BIG;
@@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                                goto out;
                        }
                        pages[nr_pages++] = out_page;
-                       workspace->out_buf.dst = page_address(out_page);
+                       workspace->out_buf.dst = kmap(out_page);
                        workspace->out_buf.pos = 0;
                        workspace->out_buf.size = min_t(size_t, max_out,
                                                        PAGE_SIZE);
@@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                /* Check if we need more input */
                if (workspace->in_buf.pos == workspace->in_buf.size) {
                        tot_in += PAGE_SIZE;
+                       kunmap(in_page);
                        put_page(in_page);
 
                        start += PAGE_SIZE;
                        len -= PAGE_SIZE;
                        in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-                       workspace->in_buf.src = page_address(in_page);
+                       workspace->in_buf.src = kmap(in_page);
                        workspace->in_buf.pos = 0;
                        workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
                }
@@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
                tot_out += PAGE_SIZE;
                max_out -= PAGE_SIZE;
+               kunmap(out_page);
                if (nr_pages == nr_dest_pages) {
                        out_page = NULL;
                        ret = -E2BIG;
@@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
                        goto out;
                }
                pages[nr_pages++] = out_page;
-               workspace->out_buf.dst = page_address(out_page);
+               workspace->out_buf.dst = kmap(out_page);
                workspace->out_buf.pos = 0;
                workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
        }
@@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 out:
        *out_pages = nr_pages;
        /* Cleanup */
-       if (in_page)
+       if (in_page) {
+               kunmap(in_page);
                put_page(in_page);
+       }
+       if (out_page)
+               kunmap(out_page);
        return ret;
 }
 
@@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                goto done;
        }
 
-       workspace->in_buf.src = page_address(pages_in[page_in_index]);
+       workspace->in_buf.src = kmap(pages_in[page_in_index]);
        workspace->in_buf.pos = 0;
        workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
 
@@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
                        break;
 
                if (workspace->in_buf.pos == workspace->in_buf.size) {
-                       page_in_index++;
+                       kunmap(pages_in[page_in_index++]);
                        if (page_in_index >= total_pages_in) {
                                workspace->in_buf.src = NULL;
                                ret = -EIO;
                                goto done;
                        }
                        srclen -= PAGE_SIZE;
-                       workspace->in_buf.src = page_address(pages_in[page_in_index]);
+                       workspace->in_buf.src = kmap(pages_in[page_in_index]);
                        workspace->in_buf.pos = 0;
                        workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
                }
@@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
        ret = 0;
        zero_fill_bio(cb->orig_bio);
 done:
+       if (workspace->in_buf.src)
+               kunmap(pages_in[page_in_index]);
        return ret;
 }
 
index c615387..46bc589 100644 (file)
@@ -878,7 +878,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 {
        sector_t retval = ~((sector_t)0);
-       loff_t sz = i_size_read(bdev->bd_inode);
+       loff_t sz = bdev_nr_bytes(bdev);
 
        if (sz) {
                unsigned int sizebits = blksize_bits(size);
@@ -897,7 +897,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
        struct buffer_head *head = page_buffers(page);
        struct buffer_head *bh = head;
        int uptodate = PageUptodate(page);
-       sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
+       sector_t end_block = blkdev_max_block(bdev, size);
 
        do {
                if (!buffer_mapped(bh)) {
index fac2e8e..effe37e 100644 (file)
@@ -37,11 +37,11 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
 /*
  * Handle completion of a read from the cache.
  */
-static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_read_complete(struct kiocb *iocb, long ret)
 {
        struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
 
-       _enter("%ld,%ld", ret, ret2);
+       _enter("%ld", ret);
 
        if (ki->term_func) {
                if (ret >= 0)
@@ -139,7 +139,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
                fallthrough;
        default:
                ki->was_async = false;
-               cachefiles_read_complete(&ki->iocb, ret, 0);
+               cachefiles_read_complete(&ki->iocb, ret);
                if (ret > 0)
                        ret = 0;
                break;
@@ -159,12 +159,12 @@ presubmission_error:
 /*
  * Handle completion of a write to the cache.
  */
-static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_write_complete(struct kiocb *iocb, long ret)
 {
        struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
        struct inode *inode = file_inode(ki->iocb.ki_filp);
 
-       _enter("%ld,%ld", ret, ret2);
+       _enter("%ld", ret);
 
        /* Tell lockdep we inherited freeze protection from submission thread */
        __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
@@ -244,7 +244,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
                fallthrough;
        default:
                ki->was_async = false;
-               cachefiles_write_complete(&ki->iocb, ret, 0);
+               cachefiles_write_complete(&ki->iocb, ret);
                if (ret > 0)
                        ret = 0;
                break;
index 8ffc40e..fcf4f3b 100644 (file)
@@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
        struct cachefiles_object *object;
        struct fscache_retrieval *op = monitor->op;
        struct wait_page_key *key = _key;
-       struct page *page = wait->private;
+       struct folio *folio = wait->private;
 
        ASSERT(key);
 
        _enter("{%lu},%u,%d,{%p,%u}",
               monitor->netfs_page->index, mode, sync,
-              key->page, key->bit_nr);
+              key->folio, key->bit_nr);
 
-       if (key->page != page || key->bit_nr != PG_locked)
+       if (key->folio != folio || key->bit_nr != PG_locked)
                return 0;
 
-       _debug("--- monitor %p %lx ---", page, page->flags);
+       _debug("--- monitor %p %lx ---", folio, folio->flags);
 
-       if (!PageUptodate(page) && !PageError(page)) {
+       if (!folio_test_uptodate(folio) && !folio_test_error(folio)) {
                /* unlocked, not uptodate and not erronous? */
                _debug("page probably truncated");
        }
@@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
        put_page(backpage2);
 
        INIT_LIST_HEAD(&monitor->op_link);
-       add_page_wait_queue(backpage, &monitor->monitor);
+       folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
 
        if (trylock_page(backpage)) {
                ret = -EIO;
@@ -294,7 +294,7 @@ monitor_backing_page:
        get_page(backpage);
        monitor->back_page = backpage;
        monitor->monitor.private = backpage;
-       add_page_wait_queue(backpage, &monitor->monitor);
+       folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
        monitor = NULL;
 
        /* but the page may have been read before the monitor was installed, so
@@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
                get_page(backpage);
                monitor->back_page = backpage;
                monitor->monitor.private = backpage;
-               add_page_wait_queue(backpage, &monitor->monitor);
+               folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
                monitor = NULL;
 
                /* but the page may have been read before the monitor was
index e61018d..b129ea5 100644 (file)
@@ -1022,7 +1022,7 @@ static void ceph_aio_complete(struct inode *inode,
        ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
                                                CEPH_CAP_FILE_RD));
 
-       aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+       aio_req->iocb->ki_complete(aio_req->iocb, ret);
 
        ceph_free_cap_flush(aio_req->prealloc_cf);
        kfree(aio_req);
index bdeb271..d8c3106 100644 (file)
@@ -302,9 +302,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
-       /* No mandatory locks */
-       if (fl->fl_type & LOCK_MAND)
-               return -EOPNOTSUPP;
 
        dout("ceph_flock, fl_file: %p\n", fl->fl_file);
 
index 13f3182..1b855fc 100644 (file)
@@ -3184,7 +3184,7 @@ restart_loop:
        mutex_unlock(&ctx->aio_mutex);
 
        if (ctx->iocb && ctx->iocb->ki_complete)
-               ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+               ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
        else
                complete(&ctx->done);
 }
@@ -3917,7 +3917,7 @@ again:
        mutex_unlock(&ctx->aio_mutex);
 
        if (ctx->iocb && ctx->iocb->ki_complete)
-               ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+               ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
        else
                complete(&ctx->done);
 }
index 2be6526..666aa38 100644 (file)
@@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
                return read_buffers[i] + blk_offset;
        }
 
-       devsize = mapping->host->i_size >> PAGE_SHIFT;
+       devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
 
        /* Ok, read in BLKS_PER_BUF pages completely first. */
        for (i = 0; i < BLKS_PER_BUF; i++) {
index 68a2de6..bfc2a5b 100644 (file)
@@ -1,23 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * This contains encryption functions for per-file encryption.
+ * Utility functions for file contents encryption/decryption on
+ * block device-based filesystems.
  *
  * Copyright (C) 2015, Google, Inc.
  * Copyright (C) 2015, Motorola Mobility
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- *     Uday Savagaonkar, 2014
- * Encryption policy handling additions
- *     Ildar Muslukhov, 2014
- * Add fscrypt_pullback_bio_page()
- *     Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
  */
 
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 #include "fscrypt_private.h"
 
+/**
+ * fscrypt_decrypt_bio() - decrypt the contents of a bio
+ * @bio: the bio to decrypt
+ *
+ * Decrypt the contents of a "read" bio following successful completion of the
+ * underlying disk read.  The bio must be reading a whole number of blocks of an
+ * encrypted file directly into the page cache.  If the bio is reading the
+ * ciphertext into bounce pages instead of the page cache (for example, because
+ * the file is also compressed, so decompression is required after decryption),
+ * then this function isn't applicable.  This function may sleep, so it must be
+ * called from a workqueue rather than from the bio's bi_end_io callback.
+ *
+ * This function sets PG_error on any pages that contain any blocks that failed
+ * to be decrypted.  The filesystem must not mark such pages uptodate.
+ */
 void fscrypt_decrypt_bio(struct bio *bio)
 {
        struct bio_vec *bv;
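
(A minimal sketch of the calling pattern the kernel-doc above asks for; struct
my_read_done, my_read_workfn and the unlock step are hypothetical and not part
of fscrypt.)

	#include <linux/bio.h>
	#include <linux/fscrypt.h>
	#include <linux/workqueue.h>

	struct my_read_done {
		struct work_struct work;
		struct bio *bio;
	};

	/* scheduled from the bio's bi_end_io handler instead of decrypting there */
	static void my_read_workfn(struct work_struct *work)
	{
		struct my_read_done *d = container_of(work, struct my_read_done, work);

		fscrypt_decrypt_bio(d->bio);	/* may sleep; failed blocks get PG_error */
		/* ... unlock the pages, marking only the non-error ones uptodate ... */
	}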
index eb538c2..a9be4bc 100644 (file)
@@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
 
        if (fscrypt_has_encryption_key(dir)) {
                if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
-                                                 iname->len,
-                                                 dir->i_sb->s_cop->max_namelen,
+                                                 iname->len, NAME_MAX,
                                                  &fname->crypto_buf.len))
                        return -ENAMETOOLONG;
                fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
index 3fa965e..5b0a9e6 100644 (file)
 
 #define FSCRYPT_FILE_NONCE_SIZE        16
 
+/*
+ * Minimum size of an fscrypt master key.  Note: a longer key will be required
+ * if ciphers with a 256-bit security strength are used.  This is just the
+ * absolute minimum, which applies when only 128-bit encryption is used.
+ */
 #define FSCRYPT_MIN_KEY_SIZE   16
 
 #define FSCRYPT_CONTEXT_V1     1
@@ -413,7 +418,11 @@ struct fscrypt_master_key_secret {
         */
        struct fscrypt_hkdf     hkdf;
 
-       /* Size of the raw key in bytes.  Set even if ->raw isn't set. */
+       /*
+        * Size of the raw key in bytes.  This remains set even if ->raw was
+        * zeroized due to no longer being needed.  I.e. we still remember the
+        * size of the key even if we don't need to remember the key itself.
+        */
        u32                     size;
 
        /* For v1 policy keys: the raw key.  Wiped for v2 policy keys. */
@@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void);
 struct fscrypt_mode {
        const char *friendly_name;
        const char *cipher_str;
-       int keysize;
-       int ivsize;
+       int keysize;            /* key size in bytes */
+       int security_strength;  /* security strength in bytes */
+       int ivsize;             /* IV size in bytes */
        int logged_impl_name;
        enum blk_crypto_mode_num blk_crypto_mode;
 };
index e0ec210..7607d18 100644 (file)
 
 /*
  * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses
- * SHA-512 because it is reasonably secure and efficient; and since it produces
- * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of
- * entropy from the master key and requires only one iteration of HKDF-Expand.
+ * SHA-512 because it is well-established, secure, and reasonably efficient.
+ *
+ * HKDF-SHA256 was also considered, as its 256-bit security strength would be
+ * sufficient here.  A 512-bit security strength is "nice to have", though.
+ * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256.  In the
+ * common case of deriving an AES-256-XTS key (512 bits), that can result in
+ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
+ * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
  */
 #define HKDF_HMAC_ALG          "hmac(sha512)"
 #define HKDF_HASHLEN           SHA512_DIGEST_SIZE
index bca9c66..eede186 100644 (file)
@@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = {
                .friendly_name = "AES-256-XTS",
                .cipher_str = "xts(aes)",
                .keysize = 64,
+               .security_strength = 32,
                .ivsize = 16,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
        },
@@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = {
                .friendly_name = "AES-256-CTS-CBC",
                .cipher_str = "cts(cbc(aes))",
                .keysize = 32,
+               .security_strength = 32,
                .ivsize = 16,
        },
        [FSCRYPT_MODE_AES_128_CBC] = {
                .friendly_name = "AES-128-CBC-ESSIV",
                .cipher_str = "essiv(cbc(aes),sha256)",
                .keysize = 16,
+               .security_strength = 16,
                .ivsize = 16,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
        },
@@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = {
                .friendly_name = "AES-128-CTS-CBC",
                .cipher_str = "cts(cbc(aes))",
                .keysize = 16,
+               .security_strength = 16,
                .ivsize = 16,
        },
        [FSCRYPT_MODE_ADIANTUM] = {
                .friendly_name = "Adiantum",
                .cipher_str = "adiantum(xchacha12,aes)",
                .keysize = 32,
+               .security_strength = 32,
                .ivsize = 32,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
        },
@@ -117,8 +122,9 @@ err_free_tfm:
 
 /*
  * Prepare the crypto transform object or blk-crypto key in @prep_key, given the
- * raw key, encryption mode, and flag indicating which encryption implementation
- * (fs-layer or blk-crypto) will be used.
+ * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption
+ * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt),
+ * and IV generation method (@ci->ci_policy.flags).
  */
 int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
                        const u8 *raw_key, const struct fscrypt_info *ci)
@@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
 }
 
 /*
+ * Check whether the size of the given master key (@mk) is appropriate for the
+ * encryption settings which a particular file will use (@ci).
+ *
+ * If the file uses a v1 encryption policy, then the master key must be at least
+ * as long as the derived key, as this is a requirement of the v1 KDF.
+ *
+ * Otherwise, the KDF can accept any size key, so we enforce a slightly looser
+ * requirement: we require that the size of the master key be at least the
+ * maximum security strength of any algorithm whose key will be derived from it
+ * (but in practice we only need to consider @ci->ci_mode, since any other
+ * possible subkeys such as DIRHASH and INODE_HASH will never increase the
+ * required key size over @ci->ci_mode).  This allows AES-256-XTS keys to be
+ * derived from a 256-bit master key, which is cryptographically sufficient,
+ * rather than requiring a 512-bit master key which is unnecessarily long.  (We
+ * still allow 512-bit master keys if the user chooses to use them, though.)
+ */
+static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
+                                         const struct fscrypt_info *ci)
+{
+       unsigned int min_keysize;
+
+       if (ci->ci_policy.version == FSCRYPT_POLICY_V1)
+               min_keysize = ci->ci_mode->keysize;
+       else
+               min_keysize = ci->ci_mode->security_strength;
+
+       if (mk->mk_secret.size < min_keysize) {
+               fscrypt_warn(NULL,
+                            "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+                            master_key_spec_type(&mk->mk_spec),
+                            master_key_spec_len(&mk->mk_spec),
+                            (u8 *)&mk->mk_spec.u,
+                            mk->mk_secret.size, min_keysize);
+               return false;
+       }
+       return true;
+}
+
+/*
  * Find the master key, then set up the inode's actual encryption key.
  *
  * If the master key is found in the filesystem-level keyring, then the
@@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
                goto out_release_key;
        }
 
-       /*
-        * Require that the master key be at least as long as the derived key.
-        * Otherwise, the derived key cannot possibly contain as much entropy as
-        * that required by the encryption mode it will be used for.  For v1
-        * policies it's also required for the KDF to work at all.
-        */
-       if (mk->mk_secret.size < ci->ci_mode->keysize) {
-               fscrypt_warn(NULL,
-                            "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
-                            master_key_spec_type(&mk_spec),
-                            master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
-                            mk->mk_secret.size, ci->ci_mode->keysize);
+       if (!fscrypt_valid_master_key_size(mk, ci)) {
                err = -ENOKEY;
                goto out_release_key;
        }
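
Concretely, with the mode table above: AES-256-XTS has keysize 64 but
security_strength 32, so under a v2 policy a 32-byte master key is now
accepted, while a v1 policy still requires the full 64 bytes (and longer,
e.g. 512-bit, master keys remain valid either way).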
index b2e86e7..6544435 100644 (file)
@@ -119,7 +119,6 @@ struct dio {
        int flags;                      /* doesn't change */
        int op;
        int op_flags;
-       blk_qc_t bio_cookie;
        struct gendisk *bio_disk;
        struct inode *inode;
        loff_t i_size;                  /* i_size when submitted */
@@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 
                if (ret > 0 && dio->op == REQ_OP_WRITE)
                        ret = generic_write_sync(dio->iocb, ret);
-               dio->iocb->ki_complete(dio->iocb, ret, 0);
+               dio->iocb->ki_complete(dio->iocb, ret);
        }
 
        kmem_cache_free(dio_cache, dio);
@@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 
        dio->bio_disk = bio->bi_bdev->bd_disk;
 
-       if (sdio->submit_io) {
+       if (sdio->submit_io)
                sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
-               dio->bio_cookie = BLK_QC_T_NONE;
-       } else
-               dio->bio_cookie = submit_bio(bio);
+       else
+               submit_bio(bio);
 
        sdio->bio = NULL;
        sdio->boundary = 0;
@@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio)
                __set_current_state(TASK_UNINTERRUPTIBLE);
                dio->waiter = current;
                spin_unlock_irqrestore(&dio->bio_lock, flags);
-               if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
-                       blk_io_schedule();
+               blk_io_schedule();
                /* wake up sets us TASK_RUNNING */
                spin_lock_irqsave(&dio->bio_lock, flags);
                dio->waiter = NULL;
@@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
        } else {
                dio->op = REQ_OP_READ;
        }
-       if (iocb->ki_flags & IOCB_HIPRI)
-               dio->op_flags |= REQ_HIPRI;
 
        /*
         * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
index 14b7470..f57255a 100644 (file)
@@ -6,16 +6,22 @@ config EROFS_FS
        select FS_IOMAP
        select LIBCRC32C
        help
-         EROFS (Enhanced Read-Only File System) is a lightweight
-         read-only file system with modern designs (eg. page-sized
-         blocks, inline xattrs/data, etc.) for scenarios which need
-         high-performance read-only requirements, e.g. Android OS
-         for mobile phones and LIVECDs.
+         EROFS (Enhanced Read-Only File System) is a lightweight read-only
+         file system with modern designs (e.g. no buffer heads, inline
+         xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+         scenarios that need high-performance read-only solutions, e.g.
+         smartphones with Android OS, LiveCDs and high-density hosts with
+         numerous containers.
 
-         It also provides fixed-sized output compression support,
-         which improves storage density, keeps relatively higher
-         compression ratios, which is more useful to achieve high
-         performance for embedded devices with limited memory.
+         It also provides fixed-sized output compression support, which
+         improves storage density while keeping relatively high compression
+         ratios, and implements in-place decompression that temporarily
+         reuses the file page for compressed data, which helps guarantee
+         end-to-end runtime decompression performance under extreme
+         memory pressure at no extra cost.
+
+         See the documentation at <file:Documentation/filesystems/erofs.rst>
+         for more details.
 
          If unsure, say N.
 
@@ -76,3 +82,19 @@ config EROFS_FS_ZIP
          Enable fixed-sized output compression for EROFS.
 
          If you don't want to enable compression feature, say N.
+
+config EROFS_FS_ZIP_LZMA
+       bool "EROFS LZMA compressed data support"
+       depends on EROFS_FS_ZIP
+       select XZ_DEC
+       select XZ_DEC_MICROLZMA
+       help
+         Saying Y here includes support for reading EROFS file systems
+         containing LZMA compressed data, specifically called MicroLZMA. It
+         gives better compression ratios than the LZ4 algorithm, at the
+         expense of more CPU overhead.
+
+         LZMA support is an experimental feature for now, so most existing
+         file systems will remain readable without selecting this option.
+
+         If unsure, say N.
index 1f9aced..756fe2d 100644 (file)
@@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
index 3701c72..5794065 100644 (file)
@@ -8,11 +8,6 @@
 
 #include "internal.h"
 
-enum {
-       Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
-       Z_EROFS_COMPRESSION_RUNTIME_MAX
-};
-
 struct z_erofs_decompress_req {
        struct super_block *sb;
        struct page **in, **out;
@@ -25,6 +20,12 @@ struct z_erofs_decompress_req {
        bool inplace_io, partial_decoding;
 };
 
+struct z_erofs_decompressor {
+       int (*decompress)(struct z_erofs_decompress_req *rq,
+                         struct page **pagepool);
+       char *name;
+};
+
 /* some special page->private (unsigned long, see below) */
 #define Z_EROFS_SHORTLIVED_PAGE                (-1UL << 2)
 #define Z_EROFS_PREALLOCATED_PAGE      (-2UL << 2)
@@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page)
        return true;
 }
 
-static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
+static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
                                              struct page *page)
 {
        if (!z_erofs_is_shortlived_page(page))
@@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
                put_page(page);
        } else {
                /* follow the pcluster rule above. */
-               set_page_private(page, 0);
-               list_add(&page->lru, pagepool);
+               erofs_pagepool_add(pagepool, page);
        }
        return true;
 }
 
+#define MNGD_MAPPING(sbi)      ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+                                        struct page *page)
+{
+       return page->mapping == MNGD_MAPPING(sbi);
+}
+
 int z_erofs_decompress(struct z_erofs_decompress_req *rq,
-                      struct list_head *pagepool);
+                      struct page **pagepool);
 
+/* prototypes for specific algorithms */
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+                           struct page **pagepool);
 #endif
index 9db8297..808234d 100644 (file)
@@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode,
        erofs_off_t pos;
        int err = 0;
 
+       map->m_deviceid = 0;
        if (map->m_la >= inode->i_size) {
                /* leave out-of-bound access unmapped */
                map->m_flags = 0;
@@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode,
                map->m_flags = 0;
                break;
        default:
-               /* only one device is supported for now */
-               if (idx->device_id) {
-                       erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
-                                 le16_to_cpu(idx->device_id),
-                                 chunknr, vi->nid);
-                       err = -EFSCORRUPTED;
-                       goto out_unlock;
-               }
+               map->m_deviceid = le16_to_cpu(idx->device_id) &
+                       EROFS_SB(sb)->device_id_mask;
                map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
                map->m_flags = EROFS_MAP_MAPPED;
                break;
@@ -155,11 +150,55 @@ out:
        return err;
 }
 
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
+{
+       struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
+       struct erofs_device_info *dif;
+       int id;
+
+       /* primary device by default */
+       map->m_bdev = sb->s_bdev;
+       map->m_daxdev = EROFS_SB(sb)->dax_dev;
+
+       if (map->m_deviceid) {
+               down_read(&devs->rwsem);
+               dif = idr_find(&devs->tree, map->m_deviceid - 1);
+               if (!dif) {
+                       up_read(&devs->rwsem);
+                       return -ENODEV;
+               }
+               map->m_bdev = dif->bdev;
+               map->m_daxdev = dif->dax_dev;
+               up_read(&devs->rwsem);
+       } else if (devs->extra_devices) {
+               down_read(&devs->rwsem);
+               idr_for_each_entry(&devs->tree, dif, id) {
+                       erofs_off_t startoff, length;
+
+                       if (!dif->mapped_blkaddr)
+                               continue;
+                       startoff = blknr_to_addr(dif->mapped_blkaddr);
+                       length = blknr_to_addr(dif->blocks);
+
+                       if (map->m_pa >= startoff &&
+                           map->m_pa < startoff + length) {
+                               map->m_pa -= startoff;
+                               map->m_bdev = dif->bdev;
+                               map->m_daxdev = dif->dax_dev;
+                               break;
+                       }
+               }
+               up_read(&devs->rwsem);
+       }
+       return 0;
+}
+
 static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
 {
        int ret;
        struct erofs_map_blocks map;
+       struct erofs_map_dev mdev;
 
        map.m_la = offset;
        map.m_llen = length;
@@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
        if (ret < 0)
                return ret;
 
-       iomap->bdev = inode->i_sb->s_bdev;
-       iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
+       mdev = (struct erofs_map_dev) {
+               .m_deviceid = map.m_deviceid,
+               .m_pa = map.m_pa,
+       };
+       ret = erofs_map_dev(inode->i_sb, &mdev);
+       if (ret)
+               return ret;
+
+       iomap->bdev = mdev.m_bdev;
+       iomap->dax_dev = mdev.m_daxdev;
        iomap->offset = map.m_la;
        iomap->length = map.m_llen;
        iomap->flags = 0;
@@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 
                iomap->type = IOMAP_INLINE;
                ipage = erofs_get_meta_page(inode->i_sb,
-                                           erofs_blknr(map.m_pa));
+                                           erofs_blknr(mdev.m_pa));
                if (IS_ERR(ipage))
                        return PTR_ERR(ipage);
                iomap->inline_data = page_address(ipage) +
-                                       erofs_blkoff(map.m_pa);
+                                       erofs_blkoff(mdev.m_pa);
                iomap->private = ipage;
        } else {
                iomap->type = IOMAP_MAPPED;
-               iomap->addr = map.m_pa;
+               iomap->addr = mdev.m_pa;
        }
        return 0;
 }
index a5bc4b1..bf37fc7 100644 (file)
 #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize)  (((srcsize) >> 8) + 32)
 #endif
 
-struct z_erofs_decompressor {
-       /*
-        * if destpages have sparsed pages, fill them with bounce pages.
-        * it also check whether destpages indicate continuous physical memory.
-        */
-       int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
-                                struct list_head *pagepool);
-       int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
-       char *name;
-};
-
 int z_erofs_load_lz4_config(struct super_block *sb,
                            struct erofs_super_block *dsb,
                            struct z_erofs_lz4_cfgs *lz4, int size)
@@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb,
        return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
 }
 
-static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
-                                        struct list_head *pagepool)
+/*
+ * Fill all gaps with bounce pages if it's a sparse page list. Also check
+ * whether all physical pages are contiguous, as can happen for moderate CRs.
+ */
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
+                                       struct page **pagepool)
 {
        const unsigned int nr =
                PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -119,7 +112,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
        return kaddr ? 1 : 0;
 }
 
-static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq,
                        void *inpage, unsigned int *inputmargin, int *maptype,
                        bool support_0padding)
 {
@@ -189,7 +182,8 @@ docopy:
        return src;
 }
 
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
+                                     u8 *out)
 {
        unsigned int inputmargin;
        u8 *headpage, *src;
@@ -216,8 +210,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
        }
 
        rq->inputsize -= inputmargin;
-       src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
-                                       support_0padding);
+       src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin,
+                                           &maptype, support_0padding);
        if (IS_ERR(src))
                return PTR_ERR(src);
 
@@ -233,7 +227,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
                erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
                          ret, rq->inputsize, inputmargin, rq->outputsize);
 
-               WARN_ON(1);
                print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
                               16, 1, src + inputmargin, rq->inputsize, true);
                print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
@@ -242,6 +235,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
                if (ret >= 0)
                        memset(out + ret, 0, rq->outputsize - ret);
                ret = -EIO;
+       } else {
+               ret = 0;
        }
 
        if (maptype == 0) {
@@ -257,86 +252,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
        return ret;
 }
 
-static struct z_erofs_decompressor decompressors[] = {
-       [Z_EROFS_COMPRESSION_SHIFTED] = {
-               .name = "shifted"
-       },
-       [Z_EROFS_COMPRESSION_LZ4] = {
-               .prepare_destpages = z_erofs_lz4_prepare_destpages,
-               .decompress = z_erofs_lz4_decompress,
-               .name = "lz4"
-       },
-};
-
-static void copy_from_pcpubuf(struct page **out, const char *dst,
-                             unsigned short pageofs_out,
-                             unsigned int outputsize)
-{
-       const char *end = dst + outputsize;
-       const unsigned int righthalf = PAGE_SIZE - pageofs_out;
-       const char *cur = dst - pageofs_out;
-
-       while (cur < end) {
-               struct page *const page = *out++;
-
-               if (page) {
-                       char *buf = kmap_atomic(page);
-
-                       if (cur >= dst) {
-                               memcpy(buf, cur, min_t(uint, PAGE_SIZE,
-                                                      end - cur));
-                       } else {
-                               memcpy(buf + pageofs_out, cur + pageofs_out,
-                                      min_t(uint, righthalf, end - cur));
-                       }
-                       kunmap_atomic(buf);
-               }
-               cur += PAGE_SIZE;
-       }
-}
-
-static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
-                                     struct list_head *pagepool)
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+                                 struct page **pagepool)
 {
        const unsigned int nrpages_out =
                PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
-       const struct z_erofs_decompressor *alg = decompressors + rq->alg;
        unsigned int dst_maptype;
        void *dst;
        int ret;
 
-       /* two optimized fast paths only for non bigpcluster cases yet */
-       if (rq->inputsize <= PAGE_SIZE) {
-               if (nrpages_out == 1 && !rq->inplace_io) {
-                       DBG_BUGON(!*rq->out);
-                       dst = kmap_atomic(*rq->out);
-                       dst_maptype = 0;
-                       goto dstmap_out;
-               }
-
-               /*
-                * For the case of small output size (especially much less
-                * than PAGE_SIZE), memcpy the decompressed data rather than
-                * compressed data is preferred.
-                */
-               if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
-                       dst = erofs_get_pcpubuf(1);
-                       if (IS_ERR(dst))
-                               return PTR_ERR(dst);
-
-                       rq->inplace_io = false;
-                       ret = alg->decompress(rq, dst);
-                       if (!ret)
-                               copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
-                                                 rq->outputsize);
-
-                       erofs_put_pcpubuf(dst);
-                       return ret;
-               }
+       /* one optimized fast path only for non bigpcluster cases yet */
+       if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) {
+               DBG_BUGON(!*rq->out);
+               dst = kmap_atomic(*rq->out);
+               dst_maptype = 0;
+               goto dstmap_out;
        }
 
        /* general decoding path which can be used for all cases */
-       ret = alg->prepare_destpages(rq, pagepool);
+       ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
        if (ret < 0)
                return ret;
        if (ret) {
@@ -351,7 +285,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
        dst_maptype = 2;
 
 dstmap_out:
-       ret = alg->decompress(rq, dst + rq->pageofs_out);
+       ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out);
 
        if (!dst_maptype)
                kunmap_atomic(dst);
@@ -360,8 +294,8 @@ dstmap_out:
        return ret;
 }
 
-static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
-                                    struct list_head *pagepool)
+static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
+                                    struct page **pagepool)
 {
        const unsigned int nrpages_out =
                PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -399,10 +333,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
        return 0;
 }
 
+static struct z_erofs_decompressor decompressors[] = {
+       [Z_EROFS_COMPRESSION_SHIFTED] = {
+               .decompress = z_erofs_shifted_transform,
+               .name = "shifted"
+       },
+       [Z_EROFS_COMPRESSION_LZ4] = {
+               .decompress = z_erofs_lz4_decompress,
+               .name = "lz4"
+       },
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+       [Z_EROFS_COMPRESSION_LZMA] = {
+               .decompress = z_erofs_lzma_decompress,
+               .name = "lzma"
+       },
+#endif
+};
+
 int z_erofs_decompress(struct z_erofs_decompress_req *rq,
-                      struct list_head *pagepool)
+                      struct page **pagepool)
 {
-       if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
-               return z_erofs_shifted_transform(rq, pagepool);
-       return z_erofs_decompress_generic(rq, pagepool);
+       return decompressors[rq->alg].decompress(rq, pagepool);
 }
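
With the prepare_destpages split removed, z_erofs_decompress() now dispatches purely through the decompressors[] table indexed by the on-disk algorithm id. A standalone userspace illustration of that table-dispatch pattern follows (hypothetical names, not kernel code and not part of this merge):

#include <stdio.h>

/* Standalone illustration of the table-dispatch pattern used above. */
enum { ALG_SHIFTED, ALG_LZ4, ALG_MAX };

struct decompressor {
        int (*decompress)(const char *in, char *out);
        const char *name;
};

static int shifted_copy(const char *in, char *out)
{
        /* "shifted" data is stored uncompressed; just copy it */
        while ((*out++ = *in++))
                ;
        return 0;
}

static int lz4_decode(const char *in, char *out)
{
        /* placeholder for a real LZ4 decoder */
        return shifted_copy(in, out);
}

static const struct decompressor decompressors[] = {
        [ALG_SHIFTED] = { .decompress = shifted_copy, .name = "shifted" },
        [ALG_LZ4]     = { .decompress = lz4_decode,   .name = "lz4" },
};

int main(void)
{
        char out[32];
        int alg = ALG_SHIFTED;

        /* dispatch purely by algorithm id, mirroring z_erofs_decompress() */
        decompressors[alg].decompress("hello", out);
        printf("%s via %s\n", out, decompressors[alg].name);
        return 0;
}
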
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
new file mode 100644 (file)
index 0000000..5004551
--- /dev/null
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/xz.h>
+#include <linux/module.h>
+#include "compress.h"
+
+struct z_erofs_lzma {
+       struct z_erofs_lzma *next;
+       struct xz_dec_microlzma *state;
+       struct xz_buf buf;
+       u8 bounce[PAGE_SIZE];
+};
+
+/* given LZMA's performance, there is no need for a lockless list for now */
+static DEFINE_SPINLOCK(z_erofs_lzma_lock);
+static unsigned int z_erofs_lzma_max_dictsize;
+static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
+static struct z_erofs_lzma *z_erofs_lzma_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
+
+module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
+
+void z_erofs_lzma_exit(void)
+{
+       /* there should be no running fs instance */
+       while (z_erofs_lzma_avail_strms) {
+               struct z_erofs_lzma *strm;
+
+               spin_lock(&z_erofs_lzma_lock);
+               strm = z_erofs_lzma_head;
+               if (!strm) {
+                       spin_unlock(&z_erofs_lzma_lock);
+                       DBG_BUGON(1);
+                       return;
+               }
+               z_erofs_lzma_head = NULL;
+               spin_unlock(&z_erofs_lzma_lock);
+
+               while (strm) {
+                       struct z_erofs_lzma *n = strm->next;
+
+                       if (strm->state)
+                               xz_dec_microlzma_end(strm->state);
+                       kfree(strm);
+                       --z_erofs_lzma_avail_strms;
+                       strm = n;
+               }
+       }
+}
+
+int z_erofs_lzma_init(void)
+{
+       unsigned int i;
+
+       /* by default, use # of possible CPUs instead */
+       if (!z_erofs_lzma_nstrms)
+               z_erofs_lzma_nstrms = num_possible_cpus();
+
+       for (i = 0; i < z_erofs_lzma_nstrms; ++i) {
+               struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+
+               if (!strm) {
+                       z_erofs_lzma_exit();
+                       return -ENOMEM;
+               }
+               spin_lock(&z_erofs_lzma_lock);
+               strm->next = z_erofs_lzma_head;
+               z_erofs_lzma_head = strm;
+               spin_unlock(&z_erofs_lzma_lock);
+               ++z_erofs_lzma_avail_strms;
+       }
+       return 0;
+}
+
+int z_erofs_load_lzma_config(struct super_block *sb,
+                            struct erofs_super_block *dsb,
+                            struct z_erofs_lzma_cfgs *lzma, int size)
+{
+       static DEFINE_MUTEX(lzma_resize_mutex);
+       unsigned int dict_size, i;
+       struct z_erofs_lzma *strm, *head = NULL;
+       int err;
+
+       if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
+               erofs_err(sb, "invalid lzma cfgs, size=%u", size);
+               return -EINVAL;
+       }
+       if (lzma->format) {
+               erofs_err(sb, "unidentified lzma format %x, please check kernel version",
+                         le16_to_cpu(lzma->format));
+               return -EINVAL;
+       }
+       dict_size = le32_to_cpu(lzma->dict_size);
+       if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
+               erofs_err(sb, "unsupported lzma dictionary size %u",
+                         dict_size);
+               return -EINVAL;
+       }
+
+       erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
+
+       /* serialize concurrent z_erofs_load_lzma_config() calls to avoid deadlock */
+       mutex_lock(&lzma_resize_mutex);
+
+       if (z_erofs_lzma_max_dictsize >= dict_size) {
+               mutex_unlock(&lzma_resize_mutex);
+               return 0;
+       }
+
+       /* 1. collect/isolate all streams for the following check */
+       for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
+               struct z_erofs_lzma *last;
+
+again:
+               spin_lock(&z_erofs_lzma_lock);
+               strm = z_erofs_lzma_head;
+               if (!strm) {
+                       spin_unlock(&z_erofs_lzma_lock);
+                       wait_event(z_erofs_lzma_wq,
+                                  READ_ONCE(z_erofs_lzma_head));
+                       goto again;
+               }
+               z_erofs_lzma_head = NULL;
+               spin_unlock(&z_erofs_lzma_lock);
+
+               for (last = strm; last->next; last = last->next)
+                       ++i;
+               last->next = head;
+               head = strm;
+       }
+
+       err = 0;
+       /* 2. walk each isolated stream and grow max dict_size if needed */
+       for (strm = head; strm; strm = strm->next) {
+               if (strm->state)
+                       xz_dec_microlzma_end(strm->state);
+               strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
+               if (!strm->state)
+                       err = -ENOMEM;
+       }
+
+       /* 3. push back all to the global list and update max dict_size */
+       spin_lock(&z_erofs_lzma_lock);
+       DBG_BUGON(z_erofs_lzma_head);
+       z_erofs_lzma_head = head;
+       spin_unlock(&z_erofs_lzma_lock);
+
+       z_erofs_lzma_max_dictsize = dict_size;
+       mutex_unlock(&lzma_resize_mutex);
+       return err;
+}
+
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+                           struct page **pagepool)
+{
+       const unsigned int nrpages_out =
+               PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+       const unsigned int nrpages_in =
+               PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+       unsigned int inputmargin, inlen, outlen, pageofs;
+       struct z_erofs_lzma *strm;
+       u8 *kin;
+       bool bounced = false;
+       int no, ni, j, err = 0;
+
+       /* 1. get the exact LZMA compressed size */
+       kin = kmap(*rq->in);
+       inputmargin = 0;
+       while (!kin[inputmargin & ~PAGE_MASK])
+               if (!(++inputmargin & ~PAGE_MASK))
+                       break;
+
+       if (inputmargin >= PAGE_SIZE) {
+               kunmap(*rq->in);
+               return -EFSCORRUPTED;
+       }
+       rq->inputsize -= inputmargin;
+
+       /* 2. get an available lzma context */
+again:
+       spin_lock(&z_erofs_lzma_lock);
+       strm = z_erofs_lzma_head;
+       if (!strm) {
+               spin_unlock(&z_erofs_lzma_lock);
+               wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
+               goto again;
+       }
+       z_erofs_lzma_head = strm->next;
+       spin_unlock(&z_erofs_lzma_lock);
+
+       /* 3. multi-call decompress */
+       inlen = rq->inputsize;
+       outlen = rq->outputsize;
+       xz_dec_microlzma_reset(strm->state, inlen, outlen,
+                              !rq->partial_decoding);
+       pageofs = rq->pageofs_out;
+       strm->buf.in = kin + inputmargin;
+       strm->buf.in_pos = 0;
+       strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin);
+       inlen -= strm->buf.in_size;
+       strm->buf.out = NULL;
+       strm->buf.out_pos = 0;
+       strm->buf.out_size = 0;
+
+       for (ni = 0, no = -1;;) {
+               enum xz_ret xz_err;
+
+               if (strm->buf.out_pos == strm->buf.out_size) {
+                       if (strm->buf.out) {
+                               kunmap(rq->out[no]);
+                               strm->buf.out = NULL;
+                       }
+
+                       if (++no >= nrpages_out || !outlen) {
+                               erofs_err(rq->sb, "decompressed buf out of bound");
+                               err = -EFSCORRUPTED;
+                               break;
+                       }
+                       strm->buf.out_pos = 0;
+                       strm->buf.out_size = min_t(u32, outlen,
+                                                  PAGE_SIZE - pageofs);
+                       outlen -= strm->buf.out_size;
+                       if (rq->out[no])
+                               strm->buf.out = kmap(rq->out[no]) + pageofs;
+                       pageofs = 0;
+               } else if (strm->buf.in_pos == strm->buf.in_size) {
+                       kunmap(rq->in[ni]);
+
+                       if (++ni >= nrpages_in || !inlen) {
+                               erofs_err(rq->sb, "compressed buf out of bound");
+                               err = -EFSCORRUPTED;
+                               break;
+                       }
+                       strm->buf.in_pos = 0;
+                       strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
+                       inlen -= strm->buf.in_size;
+                       kin = kmap(rq->in[ni]);
+                       strm->buf.in = kin;
+                       bounced = false;
+               }
+
+               /*
+                * Handle overlapping: use the bounce buffer if the compressed
+                * data is currently being processed; otherwise, use short-lived
+                * pages from the on-stack pagepool when pages are shared within
+                * the same request.
+                */
+               if (!bounced && rq->out[no] == rq->in[ni]) {
+                       memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
+                       strm->buf.in = strm->bounce;
+                       bounced = true;
+               }
+               for (j = ni + 1; j < nrpages_in; ++j) {
+                       struct page *tmppage;
+
+                       if (rq->out[no] != rq->in[j])
+                               continue;
+
+                       DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
+                                                       rq->in[j]));
+                       tmppage = erofs_allocpage(pagepool,
+                                                 GFP_KERNEL | __GFP_NOFAIL);
+                       set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+                       copy_highpage(tmppage, rq->in[j]);
+                       rq->in[j] = tmppage;
+               }
+               xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
+               DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
+               DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+
+               if (xz_err != XZ_OK) {
+                       if (xz_err == XZ_STREAM_END && !outlen)
+                               break;
+                       erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+                                 xz_err, rq->inputsize, rq->outputsize);
+                       err = -EFSCORRUPTED;
+                       break;
+               }
+       }
+       if (no < nrpages_out && strm->buf.out)
+               kunmap(rq->out[no]);
+       if (ni < nrpages_in)
+               kunmap(rq->in[ni]);
+       /* 4. push back LZMA stream context to the global list */
+       spin_lock(&z_erofs_lzma_lock);
+       strm->next = z_erofs_lzma_head;
+       z_erofs_lzma_head = strm;
+       spin_unlock(&z_erofs_lzma_lock);
+       wake_up(&z_erofs_lzma_wq);
+       return err;
+}
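
The new decompressor is driven by the MicroLZMA multi-call interface from <linux/xz.h>: a stream is preallocated with xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size), rearmed per request with xz_dec_microlzma_reset(), and fed chunks through xz_dec_microlzma_run() until XZ_STREAM_END. Below is a minimal single-buffer sketch of the same call sequence; buffer names are hypothetical and error handling is trimmed, so treat it as an illustration rather than API documentation:

/* Minimal sketch of the MicroLZMA multi-call sequence used above.
 * comp/comp_len/out/out_len are hypothetical caller-provided buffers. */
static int microlzma_decode_once(const u8 *comp, size_t comp_len,
                                 u8 *out, size_t out_len)
{
        struct xz_dec_microlzma *s;
        struct xz_buf b = {
                .in = comp,  .in_pos = 0,  .in_size = comp_len,
                .out = out,  .out_pos = 0, .out_size = out_len,
        };
        enum xz_ret ret;

        s = xz_dec_microlzma_alloc(XZ_PREALLOC, 1 << 20);      /* 1 MiB dict */
        if (!s)
                return -ENOMEM;

        /* sizes are given up front; "true": the uncompressed size is exact */
        xz_dec_microlzma_reset(s, comp_len, out_len, true);
        ret = xz_dec_microlzma_run(s, &b);
        xz_dec_microlzma_end(s);

        return (ret == XZ_STREAM_END && b.out_pos == out_len) ? 0 : -EIO;
}
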
index b0b23f4..083997a 100644 (file)
 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS      0x00000002
 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER    0x00000002
 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE    0x00000004
+#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE    0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2     0x00000008
 #define EROFS_ALL_FEATURE_INCOMPAT             \
        (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
         EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
         EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
-        EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
+        EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+        EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
+        EROFS_FEATURE_INCOMPAT_COMPR_HEAD2)
 
 #define EROFS_SB_EXTSLOT_SIZE  16
 
+struct erofs_deviceslot {
+       union {
+               u8 uuid[16];            /* used for device manager later */
+               u8 userdata[64];        /* digest(sha256), etc. */
+       } u;
+       __le32 blocks;                  /* total fs blocks of this device */
+       __le32 mapped_blkaddr;          /* map starting at mapped_blkaddr */
+       u8 reserved[56];
+};
+#define EROFS_DEVT_SLOT_SIZE   sizeof(struct erofs_deviceslot)
+
 /* erofs on-disk super block (currently 128 bytes) */
 struct erofs_super_block {
        __le32 magic;           /* file system magic number */
@@ -54,7 +69,9 @@ struct erofs_super_block {
                /* customized sliding window size instead of 64k by default */
                __le16 lz4_max_distance;
        } __packed u1;
-       __u8 reserved2[42];
+       __le16 extra_devices;   /* # of devices besides the primary device */
+       __le16 devt_slotoff;    /* startoff = devt_slotoff * devt_slotsize */
+       __u8 reserved2[38];
 };
 
 /*
@@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
 /* 8-byte inode chunk indexes */
 struct erofs_inode_chunk_index {
        __le16 advise;          /* always 0, don't care for now */
-       __le16 device_id;       /* back-end storage id, always 0 for now */
+       __le16 device_id;       /* back-end storage id (with bits masked) */
        __le32 blkaddr;         /* start block address of this inode chunk */
 };
 
@@ -247,10 +264,11 @@ struct erofs_inode_chunk_index {
 
 /* available compression algorithm types (for h_algorithmtype) */
 enum {
-       Z_EROFS_COMPRESSION_LZ4 = 0,
+       Z_EROFS_COMPRESSION_LZ4         = 0,
+       Z_EROFS_COMPRESSION_LZMA        = 1,
        Z_EROFS_COMPRESSION_MAX
 };
-#define Z_EROFS_ALL_COMPR_ALGS         (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+#define Z_EROFS_ALL_COMPR_ALGS         ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
 
 /* 14 bytes (+ length field = 16 bytes) */
 struct z_erofs_lz4_cfgs {
@@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs {
        u8 reserved[10];
 } __packed;
 
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lzma_cfgs {
+       __le32 dict_size;
+       __le16 format;
+       u8 reserved[8];
+} __packed;
+
+#define Z_EROFS_LZMA_MAX_DICT_SIZE     (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+
 /*
  * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
  *  e.g. for 4k logical cluster size,      4B        if compacted 2B is off;
@@ -288,35 +315,34 @@ struct z_erofs_map_header {
 #define Z_EROFS_VLE_LEGACY_HEADER_PADDING       8
 
 /*
- * Fixed-sized output compression ondisk Logical Extent cluster type:
- *    0 - literal (uncompressed) cluster
- *    1 - compressed cluster (for the head logical cluster)
- *    2 - compressed cluster (for the other logical clusters)
+ * Fixed-sized output compression on-disk logical cluster type:
+ *    0   - literal (uncompressed) lcluster
+ *    1,3 - compressed lcluster (for HEAD lclusters)
+ *    2   - compressed lcluster (for NONHEAD lclusters)
  *
  * In detail,
- *    0 - literal (uncompressed) cluster,
+ *    0 - literal (uncompressed) lcluster,
  *        di_advise = 0
- *        di_clusterofs = the literal data offset of the cluster
- *        di_blkaddr = the blkaddr of the literal cluster
+ *        di_clusterofs = the literal data offset of the lcluster
+ *        di_blkaddr = the blkaddr of the literal pcluster
  *
- *    1 - compressed cluster (for the head logical cluster)
- *        di_advise = 1
- *        di_clusterofs = the decompressed data offset of the cluster
- *        di_blkaddr = the blkaddr of the compressed cluster
+ *    1,3 - compressed lcluster (for HEAD lclusters)
+ *        di_advise = 1 or 3
+ *        di_clusterofs = the decompressed data offset of the lcluster
+ *        di_blkaddr = the blkaddr of the compressed pcluster
  *
- *    2 - compressed cluster (for the other logical clusters)
+ *    2 - compressed lcluster (for NONHEAD lclusters)
  *        di_advise = 2
  *        di_clusterofs =
- *           the decompressed data offset in its own head cluster
- *        di_u.delta[0] = distance to its corresponding head cluster
- *        di_u.delta[1] = distance to its corresponding tail cluster
- *                (di_advise could be 0, 1 or 2)
+ *           the decompressed data offset in its own HEAD lcluster
+ *        di_u.delta[0] = distance to this HEAD lcluster
+ *        di_u.delta[1] = distance to the next HEAD lcluster
  */
 enum {
        Z_EROFS_VLE_CLUSTER_TYPE_PLAIN          = 0,
-       Z_EROFS_VLE_CLUSTER_TYPE_HEAD           = 1,
+       Z_EROFS_VLE_CLUSTER_TYPE_HEAD1          = 1,
        Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD        = 2,
-       Z_EROFS_VLE_CLUSTER_TYPE_RESERVED       = 3,
+       Z_EROFS_VLE_CLUSTER_TYPE_HEAD2          = 3,
        Z_EROFS_VLE_CLUSTER_TYPE_MAX
 };
 
@@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
        /* keep in sync between 2 index structures for better extendibility */
        BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
                     sizeof(struct z_erofs_vle_decompressed_index));
+       BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
 
        BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
                     Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
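
The chunk index's device_id is now interpreted through a per-superblock mask derived from the number of extra devices (see erofs_init_devices() and erofs_map_blocks() in this series), which leaves the upper bits of the on-disk field free. A small worked example with hypothetical values:

#include <stdio.h>
#include <stdint.h>

/* Worked example (hypothetical values) of the device id masking added in
 * this series: with 3 extra devices the mask covers ids 0..3. */
static uint32_t roundup_pow_of_two(uint32_t n)  /* stand-in for the kernel helper */
{
        uint32_t p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned int extra_devices = 3;                 /* from dsb->extra_devices */
        uint16_t device_id_mask = roundup_pow_of_two(extra_devices + 1) - 1;
        uint16_t ondisk_device_id = 0xf002;             /* hypothetical raw value */

        /* as in erofs_map_blocks(): only the masked bits select a device */
        printf("mask=%#x -> device %u\n", device_id_mask,
               ondisk_device_id & device_id_mask);      /* prints mask=0x3 -> device 2 */
        return 0;
}
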
index a552399..2345f1d 100644 (file)
@@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode,
        inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
 
        inode->i_flags &= ~S_DAX;
-       if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+       if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
            vi->datalayout == EROFS_INODE_FLAT_PLAIN)
                inode->i_flags |= S_DAX;
        if (!nblks)
index 9524e15..3265688 100644 (file)
@@ -47,7 +47,16 @@ typedef u64 erofs_off_t;
 /* data type for filesystem-wide blocks number */
 typedef u32 erofs_blk_t;
 
-struct erofs_fs_context {
+struct erofs_device_info {
+       char *path;
+       struct block_device *bdev;
+       struct dax_device *dax_dev;
+
+       u32 blocks;
+       u32 mapped_blkaddr;
+};
+
+struct erofs_mount_opts {
 #ifdef CONFIG_EROFS_FS_ZIP
        /* current strategy of how to use managed cache */
        unsigned char cache_strategy;
@@ -60,6 +69,18 @@ struct erofs_fs_context {
        unsigned int mount_opt;
 };
 
+struct erofs_dev_context {
+       struct idr tree;
+       struct rw_semaphore rwsem;
+
+       unsigned int extra_devices;
+};
+
+struct erofs_fs_context {
+       struct erofs_mount_opts opt;
+       struct erofs_dev_context *devs;
+};
+
 /* all filesystem-wide lz4 configurations */
 struct erofs_sb_lz4_info {
        /* # of pages needed for EROFS lz4 rolling decompression */
@@ -69,6 +90,7 @@ struct erofs_sb_lz4_info {
 };
 
 struct erofs_sb_info {
+       struct erofs_mount_opts opt;    /* options */
 #ifdef CONFIG_EROFS_FS_ZIP
        /* list for all registered superblocks, mainly for shrinker */
        struct list_head list;
@@ -85,12 +107,16 @@ struct erofs_sb_info {
 
        struct erofs_sb_lz4_info lz4;
 #endif /* CONFIG_EROFS_FS_ZIP */
+       struct erofs_dev_context *devs;
        struct dax_device *dax_dev;
-       u32 blocks;
+       u64 total_blocks;
+       u32 primarydevice_blocks;
+
        u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
        u32 xattr_blkaddr;
 #endif
+       u16 device_id_mask;     /* valid bits of device id to be used */
 
        /* inode slot unit size in bit shift */
        unsigned char islotbits;
@@ -108,8 +134,6 @@ struct erofs_sb_info {
        u8 volume_name[16];             /* volume name */
        u32 feature_compat;
        u32 feature_incompat;
-
-       struct erofs_fs_context ctx;    /* options */
 };
 
 #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -121,9 +145,9 @@ struct erofs_sb_info {
 #define EROFS_MOUNT_DAX_ALWAYS         0x00000040
 #define EROFS_MOUNT_DAX_NEVER          0x00000080
 
-#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
-#define set_opt(ctx, option)   ((ctx)->mount_opt |= EROFS_MOUNT_##option)
-#define test_opt(ctx, option)  ((ctx)->mount_opt & EROFS_MOUNT_##option)
+#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(opt, option)   ((opt)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(opt, option)  ((opt)->mount_opt & EROFS_MOUNT_##option)
 
 enum {
        EROFS_ZIP_CACHE_DISABLED,
@@ -237,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
 EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
 EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
 EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
 EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
 
 /* atomic flag definitions */
@@ -307,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
                              EROFS_I_DATALAYOUT_BITS);
 }
 
+/*
+ * Different from grab_cache_page_nowait(), reclaiming is never triggered
+ * when allocating new pages.
+ */
+static inline
+struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
+                                         pgoff_t index)
+{
+       return pagecache_get_page(mapping, index,
+                       FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+                       readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+}
+
 extern const struct super_operations erofs_sops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
@@ -338,7 +376,7 @@ extern const struct address_space_operations z_erofs_aops;
  * of the corresponding uncompressed data in the file.
  */
 enum {
-       BH_Zipped = BH_PrivateStart,
+       BH_Encoded = BH_PrivateStart,
        BH_FullMapped,
 };
 
@@ -346,8 +384,8 @@ enum {
 #define EROFS_MAP_MAPPED       (1 << BH_Mapped)
 /* Located in metadata (could be copied from bd_inode) */
 #define EROFS_MAP_META         (1 << BH_Meta)
-/* The extent has been compressed */
-#define EROFS_MAP_ZIPPED       (1 << BH_Zipped)
+/* The extent is encoded */
+#define EROFS_MAP_ENCODED      (1 << BH_Encoded)
 /* The length of extent is full */
 #define EROFS_MAP_FULL_MAPPED  (1 << BH_FullMapped)
 
@@ -355,6 +393,8 @@ struct erofs_map_blocks {
        erofs_off_t m_pa, m_la;
        u64 m_plen, m_llen;
 
+       unsigned short m_deviceid;
+       char m_algorithmformat;
        unsigned int m_flags;
 
        struct page *mpage;
@@ -367,6 +407,13 @@ struct erofs_map_blocks {
  * approach instead if possible since it's more metadata lightweight.)
  */
 #define EROFS_GET_BLOCKS_FIEMAP        0x0002
+/* Used to map the whole extent if non-negligible data is requested for LZMA */
+#define EROFS_GET_BLOCKS_READMORE      0x0004
+
+enum {
+       Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+       Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
 
 /* zmap.c */
 extern const struct iomap_ops z_erofs_iomap_report_ops;
@@ -386,9 +433,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
 }
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
+struct erofs_map_dev {
+       struct block_device *m_bdev;
+       struct dax_device *m_daxdev;
+
+       erofs_off_t m_pa;
+       unsigned int m_deviceid;
+};
+
 /* data.c */
 extern const struct file_operations erofs_file_fops;
 struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
 int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 u64 start, u64 len);
 
@@ -443,7 +499,14 @@ void erofs_pcpubuf_init(void);
 void erofs_pcpubuf_exit(void);
 
 /* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+static inline void erofs_pagepool_add(struct page **pagepool,
+               struct page *page)
+{
+       set_page_private(page, (unsigned long)*pagepool);
+       *pagepool = page;
+}
+void erofs_release_pages(struct page **pagepool);
 
 #ifdef CONFIG_EROFS_FS_ZIP
 int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -483,6 +546,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
 }
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+int z_erofs_lzma_init(void);
+void z_erofs_lzma_exit(void);
+int z_erofs_load_lzma_config(struct super_block *sb,
+                            struct erofs_super_block *dsb,
+                            struct z_erofs_lzma_cfgs *lzma, int size);
+#else
+static inline int z_erofs_lzma_init(void) { return 0; }
+static inline void z_erofs_lzma_exit(void) {}
+static inline int z_erofs_load_lzma_config(struct super_block *sb,
+                            struct erofs_super_block *dsb,
+                            struct z_erofs_lzma_cfgs *lzma, int size) {
+       if (lzma) {
+               erofs_err(sb, "lzma algorithm isn't enabled");
+               return -EINVAL;
+       }
+       return 0;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
+
 #define EFSCORRUPTED    EUCLEAN         /* Filesystem is corrupted */
 
 #endif /* __EROFS_INTERNAL_H */
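
Temporary pages are now chained through page_private() instead of being parked on a list_head via page->lru, so a pool is simply a struct page * head. A short kernel-style usage sketch of the new helpers (illustrative only, not code from this merge):

/* Sketch of how a caller drives the page-chained pool: pages borrowed
 * during decompression are parked on an on-stack head and released in
 * one go afterwards. */
static void pagepool_usage_sketch(void)
{
        struct page *pagepool = NULL;   /* an empty pool is just a NULL head */
        struct page *page;

        /* grab a page, preferring one already parked in the pool */
        page = erofs_allocpage(&pagepool, GFP_KERNEL);
        if (!page)
                return;

        /* ... use the page as a short-lived bounce buffer ... */

        /* park it for reuse; page_private() links it to the previous head */
        erofs_pagepool_add(&pagepool, page);

        /* free whatever is still chained once the whole request is done */
        erofs_release_pages(&pagepool);
}
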
index 6c88557..a2efd83 100644 (file)
@@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
 {
        static DEFINE_MUTEX(pcb_resize_mutex);
        static unsigned int pcb_nrpages;
-       LIST_HEAD(pagepool);
+       struct page *pagepool = NULL;
        int delta, cpu, ret, i;
 
        mutex_lock(&pcb_resize_mutex);
@@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
                        vunmap(old_ptr);
 free_pagearray:
                while (i)
-                       list_add(&oldpages[--i]->lru, &pagepool);
+                       erofs_pagepool_add(&pagepool, oldpages[--i]);
                kfree(oldpages);
                if (ret)
                        break;
        }
        pcb_nrpages = nrpages;
-       put_pages_list(&pagepool);
+       erofs_release_pages(&pagepool);
 out:
        mutex_unlock(&pcb_resize_mutex);
        return ret;
index 11b8855..6a969b1 100644 (file)
@@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
                case Z_EROFS_COMPRESSION_LZ4:
                        ret = z_erofs_load_lz4_config(sb, dsb, data, size);
                        break;
+               case Z_EROFS_COMPRESSION_LZMA:
+                       ret = z_erofs_load_lzma_config(sb, dsb, data, size);
+                       break;
                default:
                        DBG_BUGON(1);
                        ret = -EFAULT;
@@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
 }
 #endif
 
+static int erofs_init_devices(struct super_block *sb,
+                             struct erofs_super_block *dsb)
+{
+       struct erofs_sb_info *sbi = EROFS_SB(sb);
+       unsigned int ondisk_extradevs;
+       erofs_off_t pos;
+       struct page *page = NULL;
+       struct erofs_device_info *dif;
+       struct erofs_deviceslot *dis;
+       void *ptr;
+       int id, err = 0;
+
+       sbi->total_blocks = sbi->primarydevice_blocks;
+       if (!erofs_sb_has_device_table(sbi))
+               ondisk_extradevs = 0;
+       else
+               ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+
+       if (ondisk_extradevs != sbi->devs->extra_devices) {
+               erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
+                         ondisk_extradevs, sbi->devs->extra_devices);
+               return -EINVAL;
+       }
+       if (!ondisk_extradevs)
+               return 0;
+
+       sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
+       pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
+       down_read(&sbi->devs->rwsem);
+       idr_for_each_entry(&sbi->devs->tree, dif, id) {
+               erofs_blk_t blk = erofs_blknr(pos);
+               struct block_device *bdev;
+
+               if (!page || page->index != blk) {
+                       if (page) {
+                               kunmap(page);
+                               unlock_page(page);
+                               put_page(page);
+                       }
+
+                       page = erofs_get_meta_page(sb, blk);
+                       if (IS_ERR(page)) {
+                               up_read(&sbi->devs->rwsem);
+                               return PTR_ERR(page);
+                       }
+                       ptr = kmap(page);
+               }
+               dis = ptr + erofs_blkoff(pos);
+
+               bdev = blkdev_get_by_path(dif->path,
+                                         FMODE_READ | FMODE_EXCL,
+                                         sb->s_type);
+               if (IS_ERR(bdev)) {
+                       err = PTR_ERR(bdev);
+                       goto err_out;
+               }
+               dif->bdev = bdev;
+               dif->dax_dev = fs_dax_get_by_bdev(bdev);
+               dif->blocks = le32_to_cpu(dis->blocks);
+               dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+               sbi->total_blocks += dif->blocks;
+               pos += EROFS_DEVT_SLOT_SIZE;
+       }
+err_out:
+       up_read(&sbi->devs->rwsem);
+       if (page) {
+               kunmap(page);
+               unlock_page(page);
+               put_page(page);
+       }
+       return err;
+}
+
 static int erofs_read_superblock(struct super_block *sb)
 {
        struct erofs_sb_info *sbi;
@@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb)
                          sbi->sb_size);
                goto out;
        }
-       sbi->blocks = le32_to_cpu(dsb->blocks);
+       sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
        sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
 #ifdef CONFIG_EROFS_FS_XATTR
        sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb)
                ret = erofs_load_compr_cfgs(sb, dsb);
        else
                ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+       if (ret < 0)
+               goto out;
+
+       /* handle multiple devices */
+       ret = erofs_init_devices(sb, dsb);
 out:
        kunmap(page);
        put_page(page);
@@ -340,15 +421,15 @@ out:
 static void erofs_default_options(struct erofs_fs_context *ctx)
 {
 #ifdef CONFIG_EROFS_FS_ZIP
-       ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
-       ctx->max_sync_decompress_pages = 3;
-       ctx->readahead_sync_decompress = false;
+       ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+       ctx->opt.max_sync_decompress_pages = 3;
+       ctx->opt.readahead_sync_decompress = false;
 #endif
 #ifdef CONFIG_EROFS_FS_XATTR
-       set_opt(ctx, XATTR_USER);
+       set_opt(&ctx->opt, XATTR_USER);
 #endif
 #ifdef CONFIG_EROFS_FS_POSIX_ACL
-       set_opt(ctx, POSIX_ACL);
+       set_opt(&ctx->opt, POSIX_ACL);
 #endif
 }
 
@@ -358,6 +439,7 @@ enum {
        Opt_cache_strategy,
        Opt_dax,
        Opt_dax_enum,
+       Opt_device,
        Opt_err
 };
 
@@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
                     erofs_param_cache_strategy),
        fsparam_flag("dax",             Opt_dax),
        fsparam_enum("dax",             Opt_dax_enum, erofs_dax_param_enums),
+       fsparam_string("device",        Opt_device),
        {}
 };
 
@@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
        switch (mode) {
        case EROFS_MOUNT_DAX_ALWAYS:
                warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-               set_opt(ctx, DAX_ALWAYS);
-               clear_opt(ctx, DAX_NEVER);
+               set_opt(&ctx->opt, DAX_ALWAYS);
+               clear_opt(&ctx->opt, DAX_NEVER);
                return true;
        case EROFS_MOUNT_DAX_NEVER:
-               set_opt(ctx, DAX_NEVER);
-               clear_opt(ctx, DAX_ALWAYS);
+               set_opt(&ctx->opt, DAX_NEVER);
+               clear_opt(&ctx->opt, DAX_ALWAYS);
                return true;
        default:
                DBG_BUGON(1);
@@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
 static int erofs_fc_parse_param(struct fs_context *fc,
                                struct fs_parameter *param)
 {
-       struct erofs_fs_context *ctx __maybe_unused = fc->fs_private;
+       struct erofs_fs_context *ctx = fc->fs_private;
        struct fs_parse_result result;
-       int opt;
+       struct erofs_device_info *dif;
+       int opt, ret;
 
        opt = fs_parse(fc, erofs_fs_parameters, param, &result);
        if (opt < 0)
@@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
        case Opt_user_xattr:
 #ifdef CONFIG_EROFS_FS_XATTR
                if (result.boolean)
-                       set_opt(ctx, XATTR_USER);
+                       set_opt(&ctx->opt, XATTR_USER);
                else
-                       clear_opt(ctx, XATTR_USER);
+                       clear_opt(&ctx->opt, XATTR_USER);
 #else
                errorfc(fc, "{,no}user_xattr options not supported");
 #endif
@@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
        case Opt_acl:
 #ifdef CONFIG_EROFS_FS_POSIX_ACL
                if (result.boolean)
-                       set_opt(ctx, POSIX_ACL);
+                       set_opt(&ctx->opt, POSIX_ACL);
                else
-                       clear_opt(ctx, POSIX_ACL);
+                       clear_opt(&ctx->opt, POSIX_ACL);
 #else
                errorfc(fc, "{,no}acl options not supported");
 #endif
                break;
        case Opt_cache_strategy:
 #ifdef CONFIG_EROFS_FS_ZIP
-               ctx->cache_strategy = result.uint_32;
+               ctx->opt.cache_strategy = result.uint_32;
 #else
                errorfc(fc, "compression not supported, cache_strategy ignored");
 #endif
@@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
                if (!erofs_fc_set_dax_mode(fc, result.uint_32))
                        return -EINVAL;
                break;
+       case Opt_device:
+               dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+               if (!dif)
+                       return -ENOMEM;
+               dif->path = kstrdup(param->string, GFP_KERNEL);
+               if (!dif->path) {
+                       kfree(dif);
+                       return -ENOMEM;
+               }
+               down_write(&ctx->devs->rwsem);
+               ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
+               up_write(&ctx->devs->rwsem);
+               if (ret < 0) {
+                       kfree(dif->path);
+                       kfree(dif);
+                       return ret;
+               }
+               ++ctx->devs->extra_devices;
+               break;
        default:
                return -ENOPARAM;
        }
@@ -540,15 +643,19 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
                return -ENOMEM;
 
        sb->s_fs_info = sbi;
+       sbi->opt = ctx->opt;
        sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
+       sbi->devs = ctx->devs;
+       ctx->devs = NULL;
+
        err = erofs_read_superblock(sb);
        if (err)
                return err;
 
-       if (test_opt(ctx, DAX_ALWAYS) &&
+       if (test_opt(&sbi->opt, DAX_ALWAYS) &&
            !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
                errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
-               clear_opt(ctx, DAX_ALWAYS);
+               clear_opt(&sbi->opt, DAX_ALWAYS);
        }
        sb->s_flags |= SB_RDONLY | SB_NOATIME;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
        sb->s_op = &erofs_sops;
        sb->s_xattr = erofs_xattr_handlers;
 
-       if (test_opt(ctx, POSIX_ACL))
+       if (test_opt(&sbi->opt, POSIX_ACL))
                sb->s_flags |= SB_POSIXACL;
        else
                sb->s_flags &= ~SB_POSIXACL;
 
-       sbi->ctx = *ctx;
-
 #ifdef CONFIG_EROFS_FS_ZIP
        xa_init(&sbi->managed_pslots);
 #endif
@@ -607,20 +712,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
 
        DBG_BUGON(!sb_rdonly(sb));
 
-       if (test_opt(ctx, POSIX_ACL))
+       if (test_opt(&ctx->opt, POSIX_ACL))
                fc->sb_flags |= SB_POSIXACL;
        else
                fc->sb_flags &= ~SB_POSIXACL;
 
-       sbi->ctx = *ctx;
+       sbi->opt = ctx->opt;
 
        fc->sb_flags |= SB_RDONLY;
        return 0;
 }
 
+static int erofs_release_device_info(int id, void *ptr, void *data)
+{
+       struct erofs_device_info *dif = ptr;
+
+       fs_put_dax(dif->dax_dev);
+       if (dif->bdev)
+               blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+       kfree(dif->path);
+       kfree(dif);
+       return 0;
+}
+
+static void erofs_free_dev_context(struct erofs_dev_context *devs)
+{
+       if (!devs)
+               return;
+       idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+       idr_destroy(&devs->tree);
+       kfree(devs);
+}
+
 static void erofs_fc_free(struct fs_context *fc)
 {
-       kfree(fc->fs_private);
+       struct erofs_fs_context *ctx = fc->fs_private;
+
+       erofs_free_dev_context(ctx->devs);
+       kfree(ctx);
 }
 
 static const struct fs_context_operations erofs_context_ops = {
@@ -632,15 +761,21 @@ static const struct fs_context_operations erofs_context_ops = {
 
 static int erofs_init_fs_context(struct fs_context *fc)
 {
-       fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL);
-       if (!fc->fs_private)
-               return -ENOMEM;
+       struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 
-       /* set default mount options */
-       erofs_default_options(fc->fs_private);
+       if (!ctx)
+               return -ENOMEM;
+       ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+       if (!ctx->devs) {
+               kfree(ctx);
+               return -ENOMEM;
+       }
+       fc->fs_private = ctx;
 
+       idr_init(&ctx->devs->tree);
+       init_rwsem(&ctx->devs->rwsem);
+       erofs_default_options(ctx);
        fc->ops = &erofs_context_ops;
-
        return 0;
 }
 
@@ -659,6 +794,8 @@ static void erofs_kill_sb(struct super_block *sb)
        sbi = EROFS_SB(sb);
        if (!sbi)
                return;
+
+       erofs_free_dev_context(sbi->devs);
        fs_put_dax(sbi->dax_dev);
        kfree(sbi);
        sb->s_fs_info = NULL;
@@ -706,6 +843,10 @@ static int __init erofs_module_init(void)
        if (err)
                goto shrinker_err;
 
+       err = z_erofs_lzma_init();
+       if (err)
+               goto lzma_err;
+
        erofs_pcpubuf_init();
        err = z_erofs_init_zip_subsystem();
        if (err)
@@ -720,6 +861,8 @@ static int __init erofs_module_init(void)
 fs_err:
        z_erofs_exit_zip_subsystem();
 zip_err:
+       z_erofs_lzma_exit();
+lzma_err:
        erofs_exit_shrinker();
 shrinker_err:
        kmem_cache_destroy(erofs_inode_cachep);
@@ -730,11 +873,13 @@ icache_err:
 static void __exit erofs_module_exit(void)
 {
        unregister_filesystem(&erofs_fs_type);
-       z_erofs_exit_zip_subsystem();
-       erofs_exit_shrinker();
 
-       /* Ensure all RCU free inodes are safe before cache is destroyed. */
+       /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
        rcu_barrier();
+
+       z_erofs_exit_zip_subsystem();
+       z_erofs_lzma_exit();
+       erofs_exit_shrinker();
        kmem_cache_destroy(erofs_inode_cachep);
        erofs_pcpubuf_exit();
 }
@@ -748,7 +893,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
        buf->f_type = sb->s_magic;
        buf->f_bsize = EROFS_BLKSIZ;
-       buf->f_blocks = sbi->blocks;
+       buf->f_blocks = sbi->total_blocks;
        buf->f_bfree = buf->f_bavail = 0;
 
        buf->f_files = ULLONG_MAX;
@@ -763,31 +908,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 {
        struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
-       struct erofs_fs_context *ctx = &sbi->ctx;
+       struct erofs_mount_opts *opt = &sbi->opt;
 
 #ifdef CONFIG_EROFS_FS_XATTR
-       if (test_opt(ctx, XATTR_USER))
+       if (test_opt(opt, XATTR_USER))
                seq_puts(seq, ",user_xattr");
        else
                seq_puts(seq, ",nouser_xattr");
 #endif
 #ifdef CONFIG_EROFS_FS_POSIX_ACL
-       if (test_opt(ctx, POSIX_ACL))
+       if (test_opt(opt, POSIX_ACL))
                seq_puts(seq, ",acl");
        else
                seq_puts(seq, ",noacl");
 #endif
 #ifdef CONFIG_EROFS_FS_ZIP
-       if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
+       if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
                seq_puts(seq, ",cache_strategy=disabled");
-       else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
+       else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
                seq_puts(seq, ",cache_strategy=readahead");
-       else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
+       else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
                seq_puts(seq, ",cache_strategy=readaround");
 #endif
-       if (test_opt(ctx, DAX_ALWAYS))
+       if (test_opt(opt, DAX_ALWAYS))
                seq_puts(seq, ",dax=always");
-       if (test_opt(ctx, DAX_NEVER))
+       if (test_opt(opt, DAX_NEVER))
                seq_puts(seq, ",dax=never");
        return 0;
 }
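
A standalone model of the bit-flag option helpers used above, which now operate on the superblock's own copy of the options (sbi->opt) rather than a saved fs_context; the field and flag names below are stand-ins, not quoted kernel code.

#include <stdio.h>

struct mount_opts {
        unsigned int mount_opt;
};

#define OPT_POSIX_ACL   0x1u
#define OPT_DAX_ALWAYS  0x2u

#define test_opt(o, f)  ((o)->mount_opt & (f))
#define clear_opt(o, f) ((o)->mount_opt &= ~(f))

int main(void)
{
        struct mount_opts opt = { .mount_opt = OPT_POSIX_ACL | OPT_DAX_ALWAYS };

        /* mirror the fill_super logic above: drop DAX if the bdev cannot do it */
        if (test_opt(&opt, OPT_DAX_ALWAYS))
                clear_opt(&opt, OPT_DAX_ALWAYS);

        printf("acl=%d dax=%d\n",
               !!test_opt(&opt, OPT_POSIX_ACL),
               !!test_opt(&opt, OPT_DAX_ALWAYS));      /* acl=1 dax=0 */
        return 0;
}
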
index bd86067..84da2c2 100644
@@ -6,20 +6,29 @@
 #include "internal.h"
 #include <linux/pagevec.h>
 
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
 {
-       struct page *page;
+       struct page *page = *pagepool;
 
-       if (!list_empty(pool)) {
-               page = lru_to_page(pool);
+       if (page) {
                DBG_BUGON(page_ref_count(page) != 1);
-               list_del(&page->lru);
+               *pagepool = (struct page *)page_private(page);
        } else {
                page = alloc_page(gfp);
        }
        return page;
 }
 
+void erofs_release_pages(struct page **pagepool)
+{
+       while (*pagepool) {
+               struct page *page = *pagepool;
+
+               *pagepool = (struct page *)page_private(page);
+               put_page(page);
+       }
+}
+
 #ifdef CONFIG_EROFS_FS_ZIP
 /* global shrink count (for all mounted EROFS instances) */
 static atomic_long_t erofs_global_shrink_cnt;
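
erofs_allocpage() and erofs_release_pages() above chain free pages through page_private(): *pagepool points at the head page and each page's private word stores the next one. A standalone user-space model of that scheme, including a sketch of erofs_pagepool_add() whose body does not appear in this hunk (stand-in types, not the kernel's struct page API):

#include <stdio.h>
#include <stdlib.h>

struct fake_page {
        unsigned long private;  /* models page_private(page) */
        int refcount;
};

static void pagepool_add(struct fake_page **pagepool, struct fake_page *page)
{
        page->private = (unsigned long)*pagepool;       /* chain the old head */
        *pagepool = page;
}

static struct fake_page *pagepool_alloc(struct fake_page **pagepool)
{
        struct fake_page *page = *pagepool;

        if (page) {
                *pagepool = (struct fake_page *)page->private;  /* pop head */
                return page;
        }
        page = calloc(1, sizeof(*page));        /* models alloc_page(gfp) */
        if (page)
                page->refcount = 1;
        return page;
}

static void pagepool_release(struct fake_page **pagepool)
{
        while (*pagepool) {
                struct fake_page *page = *pagepool;

                *pagepool = (struct fake_page *)page->private;
                free(page);     /* models put_page() */
        }
}

int main(void)
{
        struct fake_page *pool = NULL;
        struct fake_page *a = pagepool_alloc(&pool);
        struct fake_page *b = pagepool_alloc(&pool);

        pagepool_add(&pool, a);         /* return pages to the pool */
        pagepool_add(&pool, b);
        printf("head of pool: %p\n", (void *)pool);     /* b is the new head */
        pagepool_release(&pool);
        return 0;
}
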
index 778f2c5..01c581e 100644
@@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
 
 static bool erofs_xattr_user_list(struct dentry *dentry)
 {
-       return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER);
+       return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
 }
 
 static bool erofs_xattr_trusted_list(struct dentry *dentry)
@@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
 
        switch (handler->flags) {
        case EROFS_XATTR_INDEX_USER:
-               if (!test_opt(&sbi->ctx, XATTR_USER))
+               if (!test_opt(&sbi->opt, XATTR_USER))
                        return -EOPNOTSUPP;
                break;
        case EROFS_XATTR_INDEX_TRUSTED:
index 11c7a1a..bcb1b91 100644
@@ -236,7 +236,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
 static void preload_compressed_pages(struct z_erofs_collector *clt,
                                     struct address_space *mc,
                                     enum z_erofs_cache_alloctype type,
-                                    struct list_head *pagepool)
+                                    struct page **pagepool)
 {
        struct z_erofs_pcluster *pcl = clt->pcl;
        bool standalone = true;
@@ -287,12 +287,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
                if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
                        continue;
 
-               if (page) {
+               if (page)
                        put_page(page);
-               } else if (newpage) {
-                       set_page_private(newpage, 0);
-                       list_add(&newpage->lru, pagepool);
-               }
+               else if (newpage)
+                       erofs_pagepool_add(pagepool, newpage);
        }
 
        /*
@@ -476,6 +474,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
        struct erofs_workgroup *grp;
        int err;
 
+       if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+               DBG_BUGON(1);
+               return -EFSCORRUPTED;
+       }
+
        /* no available pcluster, let's allocate one */
        pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
        if (IS_ERR(pcl))
@@ -483,16 +486,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
 
        atomic_set(&pcl->obj.refcount, 1);
        pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
+       pcl->algorithmformat = map->m_algorithmformat;
        pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
                (map->m_flags & EROFS_MAP_FULL_MAPPED ?
                        Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
 
-       if (map->m_flags & EROFS_MAP_ZIPPED)
-               pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
-       else
-               pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
-
        /* new pclusters should be claimed as type 1, primary and followed */
        pcl->next = clt->owned_head;
        clt->mode = COLLECT_PRIMARY_FOLLOWED;
@@ -643,7 +641,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
 }
 
 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
-                               struct page *page, struct list_head *pagepool)
+                               struct page *page, struct page **pagepool)
 {
        struct inode *const inode = fe->inode;
        struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -695,7 +693,7 @@ restart_now:
                goto err_out;
 
        /* preload all compressed pages (maybe downgrade role if necessary) */
-       if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
+       if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la))
                cache_strategy = TRYALLOC;
        else
                cache_strategy = DONTALLOC;
@@ -796,7 +794,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
        /* Use workqueue and sync decompression for atomic contexts only */
        if (in_atomic() || irqs_disabled()) {
                queue_work(z_erofs_workqueue, &io->u.work);
-               sbi->ctx.readahead_sync_decompress = true;
+               sbi->opt.readahead_sync_decompress = true;
                return;
        }
        z_erofs_decompressqueue_work(&io->u.work);
@@ -836,7 +834,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
 
 static int z_erofs_decompress_pcluster(struct super_block *sb,
                                       struct z_erofs_pcluster *pcl,
-                                      struct list_head *pagepool)
+                                      struct page **pagepool)
 {
        struct erofs_sb_info *const sbi = EROFS_SB(sb);
        struct z_erofs_pagevec_ctor ctor;
@@ -1036,7 +1034,7 @@ out:
 }
 
 static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
-                                    struct list_head *pagepool)
+                                    struct page **pagepool)
 {
        z_erofs_next_pcluster_t owned = io->head;
 
@@ -1060,18 +1058,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
 {
        struct z_erofs_decompressqueue *bgq =
                container_of(work, struct z_erofs_decompressqueue, u.work);
-       LIST_HEAD(pagepool);
+       struct page *pagepool = NULL;
 
        DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
        z_erofs_decompress_queue(bgq, &pagepool);
 
-       put_pages_list(&pagepool);
+       erofs_release_pages(&pagepool);
        kvfree(bgq);
 }
 
 static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
                                               unsigned int nr,
-                                              struct list_head *pagepool,
+                                              struct page **pagepool,
                                               struct address_space *mc,
                                               gfp_t gfp)
 {
@@ -1173,7 +1171,7 @@ repeat:
 out_allocpage:
        page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
        if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
-               list_add(&page->lru, pagepool);
+               erofs_pagepool_add(pagepool, page);
                cond_resched();
                goto repeat;
        }
@@ -1257,7 +1255,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
 
 static void z_erofs_submit_queue(struct super_block *sb,
                                 struct z_erofs_decompress_frontend *f,
-                                struct list_head *pagepool,
+                                struct page **pagepool,
                                 struct z_erofs_decompressqueue *fgq,
                                 bool *force_fg)
 {
@@ -1266,8 +1264,9 @@ static void z_erofs_submit_queue(struct super_block *sb,
        struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
        void *bi_private;
        z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
-       /* since bio will be NULL, no need to initialize last_index */
+       /* bio is NULL initially, so no need to initialize last_{index,bdev} */
        pgoff_t last_index;
+       struct block_device *last_bdev;
        unsigned int nr_bios = 0;
        struct bio *bio = NULL;
 
@@ -1279,6 +1278,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
        q[JQ_SUBMIT]->head = owned_head;
 
        do {
+               struct erofs_map_dev mdev;
                struct z_erofs_pcluster *pcl;
                pgoff_t cur, end;
                unsigned int i = 0;
@@ -1290,7 +1290,13 @@ static void z_erofs_submit_queue(struct super_block *sb,
 
                pcl = container_of(owned_head, struct z_erofs_pcluster, next);
 
-               cur = pcl->obj.index;
+               /* no device id here, thus it will always succeed */
+               mdev = (struct erofs_map_dev) {
+                       .m_pa = blknr_to_addr(pcl->obj.index),
+               };
+               (void)erofs_map_dev(sb, &mdev);
+
+               cur = erofs_blknr(mdev.m_pa);
                end = cur + pcl->pclusterpages;
 
                /* close the main owned chain at first */
@@ -1306,7 +1312,8 @@ static void z_erofs_submit_queue(struct super_block *sb,
                        if (!page)
                                continue;
 
-                       if (bio && cur != last_index + 1) {
+                       if (bio && (cur != last_index + 1 ||
+                                   last_bdev != mdev.m_bdev)) {
 submit_bio_retry:
                                submit_bio(bio);
                                bio = NULL;
@@ -1314,9 +1321,10 @@ submit_bio_retry:
 
                        if (!bio) {
                                bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
-
                                bio->bi_end_io = z_erofs_decompressqueue_endio;
-                               bio_set_dev(bio, sb->s_bdev);
+
+                               bio_set_dev(bio, mdev.m_bdev);
+                               last_bdev = mdev.m_bdev;
                                bio->bi_iter.bi_sector = (sector_t)cur <<
                                        LOG_SECTORS_PER_BLOCK;
                                bio->bi_private = bi_private;
@@ -1355,7 +1363,7 @@ submit_bio_retry:
 
 static void z_erofs_runqueue(struct super_block *sb,
                             struct z_erofs_decompress_frontend *f,
-                            struct list_head *pagepool, bool force_fg)
+                            struct page **pagepool, bool force_fg)
 {
        struct z_erofs_decompressqueue io[NR_JOBQUEUES];
 
@@ -1377,18 +1385,87 @@ static void z_erofs_runqueue(struct super_block *sb,
        z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
 }
 
+/*
+ * Since partial uptodate is still unimplemented for now, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+                                     struct readahead_control *rac,
+                                     erofs_off_t end,
+                                     struct page **pagepool,
+                                     bool backmost)
+{
+       struct inode *inode = f->inode;
+       struct erofs_map_blocks *map = &f->map;
+       erofs_off_t cur;
+       int err;
+
+       if (backmost) {
+               map->m_la = end;
+               err = z_erofs_map_blocks_iter(inode, map,
+                                             EROFS_GET_BLOCKS_READMORE);
+               if (err)
+                       return;
+
+               /* expand ra for the trailing edge if readahead */
+               if (rac) {
+                       loff_t newstart = readahead_pos(rac);
+
+                       cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+                       readahead_expand(rac, newstart, cur - newstart);
+                       return;
+               }
+               end = round_up(end, PAGE_SIZE);
+       } else {
+               end = round_up(map->m_la, PAGE_SIZE);
+
+               if (!map->m_llen)
+                       return;
+       }
+
+       cur = map->m_la + map->m_llen - 1;
+       while (cur >= end) {
+               pgoff_t index = cur >> PAGE_SHIFT;
+               struct page *page;
+
+               page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+               if (!page)
+                       goto skip;
+
+               if (PageUptodate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto skip;
+               }
+
+               err = z_erofs_do_read_page(f, page, pagepool);
+               if (err)
+                       erofs_err(inode->i_sb,
+                                 "readmore error at page %lu @ nid %llu",
+                                 index, EROFS_I(inode)->nid);
+               put_page(page);
+skip:
+               if (cur < PAGE_SIZE)
+                       break;
+               cur = (index << PAGE_SHIFT) - 1;
+       }
+}
+
 static int z_erofs_readpage(struct file *file, struct page *page)
 {
        struct inode *const inode = page->mapping->host;
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+       struct page *pagepool = NULL;
        int err;
-       LIST_HEAD(pagepool);
 
        trace_erofs_readpage(page, false);
-
        f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
 
+       z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
+                                 &pagepool, true);
        err = z_erofs_do_read_page(&f, page, &pagepool);
+       z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+
        (void)z_erofs_collector_end(&f.clt);
 
        /* if some compressed cluster ready, need submit them anyway */
@@ -1400,8 +1477,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
        if (f.map.mpage)
                put_page(f.map.mpage);
 
-       /* clean up the remaining free pages */
-       put_pages_list(&pagepool);
+       erofs_release_pages(&pagepool);
        return err;
 }
 
@@ -1409,29 +1485,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
 {
        struct inode *const inode = rac->mapping->host;
        struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
-       unsigned int nr_pages = readahead_count(rac);
-       bool sync = (sbi->ctx.readahead_sync_decompress &&
-                       nr_pages <= sbi->ctx.max_sync_decompress_pages);
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
-       struct page *page, *head = NULL;
-       LIST_HEAD(pagepool);
-
-       trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+       struct page *pagepool = NULL, *head = NULL, *page;
+       unsigned int nr_pages;
 
        f.readahead = true;
        f.headoffset = readahead_pos(rac);
 
-       while ((page = readahead_page(rac))) {
-               prefetchw(&page->flags);
-
-               /*
-                * A pure asynchronous readahead is indicated if
-                * a PG_readahead marked page is hitted at first.
-                * Let's also do asynchronous decompression for this case.
-                */
-               sync &= !(PageReadahead(page) && !head);
+       z_erofs_pcluster_readmore(&f, rac, f.headoffset +
+                                 readahead_length(rac) - 1, &pagepool, true);
+       nr_pages = readahead_count(rac);
+       trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
 
+       while ((page = readahead_page(rac))) {
                set_page_private(page, (unsigned long)head);
                head = page;
        }
@@ -1450,16 +1516,15 @@ static void z_erofs_readahead(struct readahead_control *rac)
                                  page->index, EROFS_I(inode)->nid);
                put_page(page);
        }
-
+       z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
        (void)z_erofs_collector_end(&f.clt);
 
-       z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
-
+       z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+                        sbi->opt.readahead_sync_decompress &&
+                        nr_pages <= sbi->opt.max_sync_decompress_pages);
        if (f.map.mpage)
                put_page(f.map.mpage);
-
-       /* clean up the remaining free pages */
-       put_pages_list(&pagepool);
+       erofs_release_pages(&pagepool);
 }
 
 const struct address_space_operations z_erofs_aops = {
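
z_erofs_pcluster_readmore() above walks backwards one page at a time from the last byte of the mapped extent toward a page-aligned stop offset, grabbing and decompress-queuing each page it finds. A standalone model of just that index arithmetic (no I/O, stand-in constants):

#include <stdio.h>

#define PAGE_SIZE  4096ULL
#define PAGE_SHIFT 12

static unsigned long long round_up_ull(unsigned long long x, unsigned long long a)
{
        return (x + a - 1) / a * a;
}

int main(void)
{
        unsigned long long m_la = 10000, m_llen = 20000;        /* mapped extent */
        unsigned long long end = round_up_ull(m_la, PAGE_SIZE); /* 12288 */
        unsigned long long cur = m_la + m_llen - 1;             /* 29999 */

        while (cur >= end) {
                unsigned long long index = cur >> PAGE_SHIFT;

                printf("touch page index %llu\n", index);       /* 7, 6, 5, 4, 3 */
                if (cur < PAGE_SIZE)
                        break;
                cur = (index << PAGE_SHIFT) - 1;        /* step one page back */
        }
        return 0;
}
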
index 3a008f1..879df53 100644
@@ -94,13 +94,6 @@ struct z_erofs_decompressqueue {
        } u;
 };
 
-#define MNGD_MAPPING(sbi)      ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
-                                        struct page *page)
-{
-       return page->mapping == MNGD_MAPPING(sbi);
-}
-
 #define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
 #define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
 #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
index 7a6df35..660489a 100644
@@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
 {
        struct erofs_inode *const vi = EROFS_I(inode);
        struct super_block *const sb = inode->i_sb;
-       int err;
+       int err, headnr;
        erofs_off_t pos;
        struct page *page;
        void *kaddr;
@@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
        vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
        vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
 
-       if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
-               erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
-                         vi->z_algorithmtype[0], vi->nid);
+       headnr = 0;
+       if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+           vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+               erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+                         headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
                err = -EOPNOTSUPP;
                goto unmap_done;
        }
@@ -111,7 +113,7 @@ struct z_erofs_maprecorder {
 
        unsigned long lcn;
        /* compression extent information gathered */
-       u8  type;
+       u8  type, headtype;
        u16 clusterofs;
        u16 delta[2];
        erofs_blk_t pblk, compressedlcs;
@@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
                m->clusterofs = 1 << vi->z_logical_clusterbits;
                m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
                if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
-                       if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+                       if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+                                       Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
                                DBG_BUGON(1);
                                return -EFSCORRUPTED;
                        }
@@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
                m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
                break;
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
-       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
                m->clusterofs = le16_to_cpu(di->di_clusterofs);
                m->pblk = le32_to_cpu(di->di_u.blkaddr);
                break;
@@ -446,9 +450,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
                }
                return z_erofs_extent_lookback(m, m->delta[0]);
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
-               map->m_flags &= ~EROFS_MAP_ZIPPED;
-               fallthrough;
-       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+               m->headtype = m->type;
                map->m_la = (lcn << lclusterbits) | m->clusterofs;
                break;
        default:
@@ -471,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
        int err;
 
        DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
-                 m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
-       if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
-           !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+                 m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
+                 m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
+       DBG_BUGON(m->type != m->headtype);
+
+       if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+           ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
+            !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
+           ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
+            !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
                map->m_plen = 1 << lclusterbits;
                return 0;
        }
-
        lcn = m->lcn + 1;
        if (m->compressedlcs)
                goto out;
@@ -499,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
 
        switch (m->type) {
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
-       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
                /*
                 * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
                 * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
@@ -554,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
                        DBG_BUGON(!m->delta[1] &&
                                  m->clusterofs != 1 << lclusterbits);
                } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
-                          m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
+                          m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
+                          m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
                        /* go on until the next HEAD lcluster */
                        if (lcn != headlcn)
                                break;
@@ -609,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode,
        if (err)
                goto unmap_out;
 
-       map->m_flags = EROFS_MAP_ZIPPED;        /* by default, compressed */
+       map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
        end = (m.lcn + 1ULL) << lclusterbits;
 
        switch (m.type) {
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
-               if (endoff >= m.clusterofs)
-                       map->m_flags &= ~EROFS_MAP_ZIPPED;
-               fallthrough;
-       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+       case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
                if (endoff >= m.clusterofs) {
+                       m.headtype = m.type;
                        map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
                        break;
                }
@@ -650,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode,
 
        map->m_llen = end - map->m_la;
        map->m_pa = blknr_to_addr(m.pblk);
-       map->m_flags |= EROFS_MAP_MAPPED;
 
        err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
        if (err)
                goto out;
 
-       if (flags & EROFS_GET_BLOCKS_FIEMAP) {
+       if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
+               map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+       else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+               map->m_algorithmformat = vi->z_algorithmtype[1];
+       else
+               map->m_algorithmformat = vi->z_algorithmtype[0];
+
+       if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+           ((flags & EROFS_GET_BLOCKS_READMORE) &&
+            map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
+            map->m_llen >= EROFS_BLKSIZ)) {
                err = z_erofs_get_extent_decompressedlen(&m);
                if (!err)
                        map->m_flags |= EROFS_MAP_FULL_MAPPED;
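
A standalone model of the HEAD1/HEAD2 algorithm selection introduced above: PLAIN lclusters decode as the "shifted" (uncompressed) format, HEAD2 lclusters use the inode's second per-inode algorithm slot, HEAD1 the first. The enum values below are stand-ins:

#include <stdio.h>

enum headtype { TYPE_PLAIN, TYPE_HEAD1, TYPE_HEAD2 };
enum algfmt   { ALG_SHIFTED, ALG_LZ4, ALG_LZMA };

static enum algfmt pick_algorithm(enum headtype t, const enum algfmt alg[2])
{
        if (t == TYPE_PLAIN)
                return ALG_SHIFTED;     /* stored uncompressed */
        return (t == TYPE_HEAD2) ? alg[1] : alg[0];
}

int main(void)
{
        const enum algfmt per_inode[2] = { ALG_LZ4, ALG_LZMA };

        printf("%d %d %d\n",
               pick_algorithm(TYPE_PLAIN, per_inode),   /* 0: shifted */
               pick_algorithm(TYPE_HEAD1, per_inode),   /* 1: lz4 */
               pick_algorithm(TYPE_HEAD2, per_inode));  /* 2: lzma */
        return 0;
}
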
index ca37d43..1c7aa1e 100644
@@ -604,7 +604,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
        exfat_save_attr(inode, info->attr);
 
        inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
-               ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+               ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits;
        inode->i_mtime = info->mtime;
        inode->i_ctime = info->mtime;
        ei->i_crtime = info->crtime;
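
The exfat hunk adds a (loff_t) cast so the round-up mask is built at 64-bit width; without it, the complement of the 32-bit cluster mask zero-extends and silently truncates sizes above 4 GiB. A standalone demonstration (loff_t modeled as long long):

#include <stdio.h>

int main(void)
{
        unsigned int cluster_size = 4096;
        long long size = 5LL << 30;             /* 5 GiB file */

        long long bad  = (size + (cluster_size - 1)) & ~(cluster_size - 1);
        long long good = (size + (cluster_size - 1)) &
                         ~((long long)cluster_size - 1);

        printf("without cast: %lld\n", bad);    /* 1073741824: high bits lost */
        printf("with cast:    %lld\n", good);   /* 5368709120: rounded up correctly */
        return 0;
}
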
index ac0e11b..9c5559f 100644
@@ -915,7 +915,7 @@ const struct file_operations ext4_file_operations = {
        .llseek         = ext4_llseek,
        .read_iter      = ext4_file_read_iter,
        .write_iter     = ext4_file_write_iter,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
index 88d5d27..79b6a0c 100644
@@ -1572,7 +1572,6 @@ static const struct fscrypt_operations ext4_cryptops = {
        .set_context            = ext4_set_context,
        .get_dummy_policy       = ext4_get_dummy_policy,
        .empty_dir              = ext4_empty_dir,
-       .max_namelen            = EXT4_NAME_LEN,
        .has_stable_inodes      = ext4_has_stable_inodes,
        .get_ino_and_lblk_bits  = ext4_get_ino_and_lblk_bits,
 };
@@ -4474,7 +4473,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto cantfind_ext4;
 
        /* check blocks count against device size */
-       blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+       blocks_count = sb_bdev_nr_blocks(sb);
        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
                       "exceeds size of device (%llu blocks)",
index c1bf9ad..20a083d 100644
@@ -7,6 +7,7 @@
 
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
+#include <linux/moduleparam.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/lzo.h>
index 78ebc30..cf049a0 100644
@@ -2976,7 +2976,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
        .set_context            = f2fs_set_context,
        .get_dummy_policy       = f2fs_get_dummy_policy,
        .empty_dir              = f2fs_empty_dir,
-       .max_namelen            = F2FS_NAME_LEN,
        .has_stable_inodes      = f2fs_has_stable_inodes,
        .get_ino_and_lblk_bits  = f2fs_get_ino_and_lblk_bits,
        .get_num_devices        = f2fs_get_num_devices,
index de0c9b0..a6f1c6d 100644
@@ -1536,14 +1536,11 @@ static int fat_read_static_bpb(struct super_block *sb,
        struct fat_bios_param_block *bpb)
 {
        static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
-
+       sector_t bd_sects = bdev_nr_sectors(sb->s_bdev);
        struct fat_floppy_defaults *fdefaults = NULL;
        int error = -EINVAL;
-       sector_t bd_sects;
        unsigned i;
 
-       bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
-
        /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
        if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
                if (!silent)
@@ -1943,10 +1940,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
                ret = writeback_inode(i1);
        if (!ret && i2)
                ret = writeback_inode(i2);
-       if (!ret) {
-               struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
-               ret = filemap_flush(mapping);
-       }
+       if (!ret)
+               ret = sync_blockdev_nowait(sb->s_bdev);
        return ret;
 }
 EXPORT_SYMBOL_GPL(fat_flush_inodes);
index 81ec192..4124a89 100644
@@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb,
                         * unplug, so get our IOs out the door before we
                         * give up the CPU.
                         */
-                       blk_flush_plug(current);
+                       if (current->plug)
+                               blk_flush_plug(current->plug, false);
                        cond_resched();
                }
 
@@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
         * If we are expecting writeback progress we must submit plugged IO.
         */
        if (blk_needs_flush_plug(current))
-               blk_schedule_flush_plug(current);
+               blk_flush_plug(current->plug, true);
 
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
index 11404f8..e6039f2 100644
@@ -687,7 +687,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
                        spin_unlock(&fi->lock);
                }
 
-               io->iocb->ki_complete(io->iocb, res, 0);
+               io->iocb->ki_complete(io->iocb, res);
        }
 
        kref_put(&io->refcnt, fuse_io_release);
index c559827..5436a68 100644
@@ -1338,8 +1338,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
-       if (fl->fl_type & LOCK_MAND)
-               return -EOPNOTSUPP;
 
        if (fl->fl_type == F_UNLCK) {
                do_unflock(file, fl);
@@ -1353,7 +1351,7 @@ const struct file_operations gfs2_file_fops = {
        .llseek         = gfs2_llseek,
        .read_iter      = gfs2_file_read_iter,
        .write_iter     = gfs2_file_write_iter,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .unlocked_ioctl = gfs2_ioctl,
        .compat_ioctl   = gfs2_compat_ioctl,
        .mmap           = gfs2_mmap,
@@ -1386,7 +1384,7 @@ const struct file_operations gfs2_file_fops_nolock = {
        .llseek         = gfs2_llseek,
        .read_iter      = gfs2_file_read_iter,
        .write_iter     = gfs2_file_write_iter,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .unlocked_ioctl = gfs2_ioctl,
        .compat_ioctl   = gfs2_compat_ioctl,
        .mmap           = gfs2_mmap,
index cdf0ede..5beb826 100644
@@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb,
 
        /* default values */
        *start = 0;
-       *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+       *size = bdev_nr_sectors(sb->s_bdev);
 
        if (HFS_SB(sb)->session >= 0) {
                struct cdrom_tocentry te;
index 0350dc7..51ae6f1 100644
@@ -131,7 +131,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 
        /* default values */
        *start = 0;
-       *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+       *size = bdev_nr_sectors(sb->s_bdev);
 
        if (HFSPLUS_SB(sb)->session >= 0) {
                struct cdrom_tocentry te;
index 3cd065c..cdd83d4 100644
@@ -23,22 +23,11 @@ struct pipe_inode_info;
 #ifdef CONFIG_BLOCK
 extern void __init bdev_cache_init(void);
 
-extern int __sync_blockdev(struct block_device *bdev, int wait);
-void iterate_bdevs(void (*)(struct block_device *, void *), void *);
 void emergency_thaw_bdev(struct super_block *sb);
 #else
 static inline void bdev_cache_init(void)
 {
 }
-
-static inline int __sync_blockdev(struct block_device *bdev, int wait)
-{
-       return 0;
-}
-static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
-               void *arg)
-{
-}
 static inline int emergency_thaw_bdev(struct super_block *sb)
 {
        return 0;
index 422a7ed..38b33ad 100644
@@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker);
 static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
                                        struct io_wqe_acct *acct,
                                        struct io_cb_cancel_data *match);
+static void create_worker_cb(struct callback_head *cb);
 
 static bool io_worker_get(struct io_worker *worker)
 {
@@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq)
                complete(&wq->worker_done);
 }
 
+static void io_worker_cancel_cb(struct io_worker *worker)
+{
+       struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+       struct io_wqe *wqe = worker->wqe;
+       struct io_wq *wq = wqe->wq;
+
+       atomic_dec(&acct->nr_running);
+       raw_spin_lock(&worker->wqe->lock);
+       acct->nr_workers--;
+       raw_spin_unlock(&worker->wqe->lock);
+       io_worker_ref_put(wq);
+       clear_bit_unlock(0, &worker->create_state);
+       io_worker_release(worker);
+}
+
+static bool io_task_worker_match(struct callback_head *cb, void *data)
+{
+       struct io_worker *worker;
+
+       if (cb->func != create_worker_cb)
+               return false;
+       worker = container_of(cb, struct io_worker, create_work);
+       return worker == data;
+}
+
 static void io_worker_exit(struct io_worker *worker)
 {
        struct io_wqe *wqe = worker->wqe;
+       struct io_wq *wq = wqe->wq;
 
-       if (refcount_dec_and_test(&worker->ref))
-               complete(&worker->ref_done);
+       while (1) {
+               struct callback_head *cb = task_work_cancel_match(wq->task,
+                                               io_task_worker_match, worker);
+
+               if (!cb)
+                       break;
+               io_worker_cancel_cb(worker);
+       }
+
+       io_worker_release(worker);
        wait_for_completion(&worker->ref_done);
 
        raw_spin_lock(&wqe->lock);
@@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker,
 
        init_task_work(&worker->create_work, func);
        worker->create_index = acct->index;
-       if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+       if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
+               clear_bit_unlock(0, &worker->create_state);
                return true;
+       }
        clear_bit_unlock(0, &worker->create_state);
 fail_release:
        io_worker_release(worker);
@@ -716,11 +753,8 @@ static void io_workqueue_create(struct work_struct *work)
        struct io_worker *worker = container_of(work, struct io_worker, work);
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 
-       if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
-               clear_bit_unlock(0, &worker->create_state);
-               io_worker_release(worker);
+       if (!io_queue_worker_create(worker, acct, create_worker_cont))
                kfree(worker);
-       }
 }
 
 static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq)
 
        while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
                struct io_worker *worker;
-               struct io_wqe_acct *acct;
 
                worker = container_of(cb, struct io_worker, create_work);
-               acct = io_wqe_get_acct(worker);
-               atomic_dec(&acct->nr_running);
-               raw_spin_lock(&worker->wqe->lock);
-               acct->nr_workers--;
-               raw_spin_unlock(&worker->wqe->lock);
-               io_worker_ref_put(wq);
-               clear_bit_unlock(0, &worker->create_state);
-               io_worker_release(worker);
+               io_worker_cancel_cb(worker);
        }
 
        rcu_read_lock();
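
io_worker_exit() above now drains any still-pending create_worker_cb task_work for the exiting worker by cancelling matching entries before dropping its reference. A standalone model of that cancel-by-match pattern (stand-in types; task_work_cancel_match() itself is the kernel API referenced above):

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *);
};

static void create_worker_cb(struct callback_head *cb) { (void)cb; }

/* pop the first pending callback that satisfies match(cb, data), or NULL */
static struct callback_head *cancel_match(struct callback_head **head,
                bool (*match)(struct callback_head *, void *), void *data)
{
        for (struct callback_head **p = head; *p; p = &(*p)->next) {
                if (match(*p, data)) {
                        struct callback_head *cb = *p;

                        *p = cb->next;          /* unlink from the pending list */
                        return cb;
                }
        }
        return NULL;
}

static bool match_create_cb(struct callback_head *cb, void *data)
{
        (void)data;
        return cb->func == create_worker_cb;
}

int main(void)
{
        struct callback_head a = { .func = create_worker_cb };
        struct callback_head b = { .func = NULL };
        struct callback_head *pending = &a;
        struct callback_head *cb;

        a.next = &b;
        b.next = NULL;

        while ((cb = cancel_match(&pending, match_create_cb, NULL)))
                printf("cancelled one pending create callback\n");      /* once */
        return 0;
}
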
index bf5c4c5..41bf376 100644
@@ -29,6 +29,17 @@ struct io_wq_work_list {
        struct io_wq_work_node *last;
 };
 
+#define wq_list_for_each(pos, prv, head)                       \
+       for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_for_each_resume(pos, prv)                      \
+       for (; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list)    (READ_ONCE((list)->first) == NULL)
+#define INIT_WQ_LIST(list)     do {                            \
+       (list)->first = NULL;                                   \
+} while (0)
+
 static inline void wq_list_add_after(struct io_wq_work_node *node,
                                     struct io_wq_work_node *pos,
                                     struct io_wq_work_list *list)
@@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
        }
 }
 
+static inline void wq_list_add_head(struct io_wq_work_node *node,
+                                   struct io_wq_work_list *list)
+{
+       node->next = list->first;
+       if (!node->next)
+               list->last = node;
+       WRITE_ONCE(list->first, node);
+}
+
 static inline void wq_list_cut(struct io_wq_work_list *list,
                               struct io_wq_work_node *last,
                               struct io_wq_work_node *prev)
@@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
        last->next = NULL;
 }
 
+static inline void __wq_list_splice(struct io_wq_work_list *list,
+                                   struct io_wq_work_node *to)
+{
+       list->last->next = to->next;
+       to->next = list->first;
+       INIT_WQ_LIST(list);
+}
+
+static inline bool wq_list_splice(struct io_wq_work_list *list,
+                                 struct io_wq_work_node *to)
+{
+       if (!wq_list_empty(list)) {
+               __wq_list_splice(list, to);
+               return true;
+       }
+       return false;
+}
+
+static inline void wq_stack_add_head(struct io_wq_work_node *node,
+                                    struct io_wq_work_node *stack)
+{
+       node->next = stack->next;
+       stack->next = node;
+}
+
 static inline void wq_list_del(struct io_wq_work_list *list,
                               struct io_wq_work_node *node,
                               struct io_wq_work_node *prev)
@@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
        wq_list_cut(list, node, prev);
 }
 
-#define wq_list_for_each(pos, prv, head)                       \
-       for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+static inline
+struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
+{
+       struct io_wq_work_node *node = stack->next;
 
-#define wq_list_empty(list)    (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list)     do {                            \
-       (list)->first = NULL;                                   \
-       (list)->last = NULL;                                    \
-} while (0)
+       stack->next = node->next;
+       return node;
+}
 
 struct io_wq_work {
        struct io_wq_work_node list;
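
The new wq_stack_add_head()/wq_stack_extract() helpers above form an intrusive LIFO threaded through a node embedded in each object, later used as the request cache free list. A standalone model with open-coded container_of arithmetic (stand-in types):

#include <stdio.h>
#include <stddef.h>

struct work_node { struct work_node *next; };

struct request {
        int id;
        struct work_node comp_list;     /* models io_kiocb->comp_list */
};

static void stack_add_head(struct work_node *node, struct work_node *stack)
{
        node->next = stack->next;
        stack->next = node;
}

static struct work_node *stack_extract(struct work_node *stack)
{
        struct work_node *node = stack->next;

        stack->next = node->next;       /* caller must check non-empty first */
        return node;
}

int main(void)
{
        struct work_node free_list = { .next = NULL };
        struct request a = { .id = 1 }, b = { .id = 2 };

        stack_add_head(&a.comp_list, &free_list);
        stack_add_head(&b.comp_list, &free_list);

        while (free_list.next) {
                struct work_node *n = stack_extract(&free_list);
                struct request *req = (struct request *)
                        ((char *)n - offsetof(struct request, comp_list));

                printf("popped request %d\n", req->id); /* 2 then 1 */
        }
        return 0;
}
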
index bc18af5..3a4af97 100644
 
 #define IORING_MAX_REG_BUFFERS (1U << 14)
 
-#define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
-                               IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
-                               IOSQE_BUFFER_SELECT)
+#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
+                         IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+
+#define SQE_VALID_FLAGS        (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
+
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
-                               REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+                               REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
+                               REQ_F_ASYNC_DATA)
 
 #define IO_TCTX_REFS_CACHE_NR  (1U << 10)
 
@@ -195,8 +198,10 @@ struct io_rings {
 };
 
 enum io_uring_cmd_flags {
-       IO_URING_F_NONBLOCK             = 1,
-       IO_URING_F_COMPLETE_DEFER       = 2,
+       IO_URING_F_COMPLETE_DEFER       = 1,
+       IO_URING_F_UNLOCKED             = 2,
+       /* int's last bit, sign checks are usually faster than a bit test */
+       IO_URING_F_NONBLOCK             = INT_MIN,
 };
 
 struct io_mapped_ubuf {
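
A standalone model of the INT_MIN trick above: placing IO_URING_F_NONBLOCK in int's sign bit lets the flag test compile down to a sign check. The names below are shortened stand-ins:

#include <stdio.h>
#include <limits.h>

enum cmd_flags {
        F_COMPLETE_DEFER = 1,
        F_UNLOCKED       = 2,
        F_NONBLOCK       = INT_MIN,     /* int's last bit */
};

static const char *mode(int issue_flags)
{
        /* equivalent to (issue_flags & F_NONBLOCK) != 0 for this layout */
        return issue_flags < 0 ? "nonblocking" : "blocking";
}

int main(void)
{
        printf("%s\n", mode(F_COMPLETE_DEFER));         /* blocking */
        printf("%s\n", mode(F_NONBLOCK | F_UNLOCKED));  /* nonblocking */
        return 0;
}
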
@@ -305,26 +310,16 @@ struct io_submit_link {
 };
 
 struct io_submit_state {
-       struct blk_plug         plug;
+       /* inline/task_work completion list, under ->uring_lock */
+       struct io_wq_work_node  free_list;
+       /* batch completion logic */
+       struct io_wq_work_list  compl_reqs;
        struct io_submit_link   link;
 
-       /*
-        * io_kiocb alloc cache
-        */
-       void                    *reqs[IO_REQ_CACHE_SIZE];
-       unsigned int            free_reqs;
-
        bool                    plug_started;
-
-       /*
-        * Batch completion logic
-        */
-       struct io_kiocb         *compl_reqs[IO_COMPL_BATCH];
-       unsigned int            compl_nr;
-       /* inline/task_work completion list, under ->uring_lock */
-       struct list_head        free_list;
-
-       unsigned int            ios_left;
+       bool                    need_plug;
+       unsigned short          submit_nr;
+       struct blk_plug         plug;
 };
 
 struct io_ring_ctx {
@@ -368,6 +363,7 @@ struct io_ring_ctx {
                 * uring_lock, and updated through io_uring_register(2)
                 */
                struct io_rsrc_node     *rsrc_node;
+               int                     rsrc_cached_refs;
                struct io_file_table    file_table;
                unsigned                nr_user_files;
                unsigned                nr_user_bufs;
@@ -384,7 +380,7 @@ struct io_ring_ctx {
        } ____cacheline_aligned_in_smp;
 
        /* IRQ completion list, under ->completion_lock */
-       struct list_head        locked_free_list;
+       struct io_wq_work_list  locked_free_list;
        unsigned int            locked_free_nr;
 
        const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
@@ -399,7 +395,6 @@ struct io_ring_ctx {
                unsigned                cached_cq_tail;
                unsigned                cq_entries;
                struct eventfd_ctx      *cq_ev_fd;
-               struct wait_queue_head  poll_wait;
                struct wait_queue_head  cq_wait;
                unsigned                cq_extra;
                atomic_t                cq_timeouts;
@@ -417,7 +412,7 @@ struct io_ring_ctx {
                 * For SQPOLL, only the single threaded io_sq_thread() will
                 * manipulate the list, hence no extra locking is needed there.
                 */
-               struct list_head        iopoll_list;
+               struct io_wq_work_list  iopoll_list;
                struct hlist_head       *cancel_hash;
                unsigned                cancel_hash_bits;
                bool                    poll_multi_queue;
@@ -580,7 +575,6 @@ struct io_sr_msg {
        int                             msg_flags;
        int                             bgid;
        size_t                          len;
-       struct io_buffer                *kbuf;
 };
 
 struct io_open {
@@ -692,11 +686,6 @@ struct io_hardlink {
        int                             flags;
 };
 
-struct io_completion {
-       struct file                     *file;
-       u32                             cflags;
-};
-
 struct io_async_connect {
        struct sockaddr_storage         address;
 };
@@ -710,11 +699,15 @@ struct io_async_msghdr {
        struct sockaddr_storage         addr;
 };
 
-struct io_async_rw {
-       struct iovec                    fast_iov[UIO_FASTIOV];
-       const struct iovec              *free_iovec;
+struct io_rw_state {
        struct iov_iter                 iter;
        struct iov_iter_state           iter_state;
+       struct iovec                    fast_iov[UIO_FASTIOV];
+};
+
+struct io_async_rw {
+       struct io_rw_state              s;
+       const struct iovec              *free_iovec;
        size_t                          bytes_done;
        struct wait_page_queue          wpq;
 };
@@ -741,9 +734,9 @@ enum {
        REQ_F_CREDS_BIT,
        REQ_F_REFCOUNT_BIT,
        REQ_F_ARM_LTIMEOUT_BIT,
+       REQ_F_ASYNC_DATA_BIT,
        /* keep async read/write and isreg together and in order */
-       REQ_F_NOWAIT_READ_BIT,
-       REQ_F_NOWAIT_WRITE_BIT,
+       REQ_F_SUPPORT_NOWAIT_BIT,
        REQ_F_ISREG_BIT,
 
        /* not a real bit, just to check we're not overflowing the space */
@@ -784,10 +777,8 @@ enum {
        REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
        /* caller should reissue async */
        REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
-       /* supports async reads */
-       REQ_F_NOWAIT_READ       = BIT(REQ_F_NOWAIT_READ_BIT),
-       /* supports async writes */
-       REQ_F_NOWAIT_WRITE      = BIT(REQ_F_NOWAIT_WRITE_BIT),
+       /* supports async reads/writes */
+       REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
        /* regular file */
        REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
        /* has creds assigned */
@@ -796,6 +787,8 @@ enum {
        REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
        /* there is a linked timeout that has to be armed */
        REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+       /* ->async_data allocated */
+       REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
 };
 
 struct async_poll {
@@ -852,39 +845,41 @@ struct io_kiocb {
                struct io_mkdir         mkdir;
                struct io_symlink       symlink;
                struct io_hardlink      hardlink;
-               /* use only after cleaning per-op data, see io_clean_op() */
-               struct io_completion    compl;
        };
 
-       /* opcode allocated if it needs to store data for async defer */
-       void                            *async_data;
        u8                              opcode;
        /* polled IO has completed */
        u8                              iopoll_completed;
-
        u16                             buf_index;
+       unsigned int                    flags;
+
+       u64                             user_data;
        u32                             result;
+       u32                             cflags;
 
        struct io_ring_ctx              *ctx;
-       unsigned int                    flags;
-       atomic_t                        refs;
        struct task_struct              *task;
-       u64                             user_data;
 
-       struct io_kiocb                 *link;
        struct percpu_ref               *fixed_rsrc_refs;
+       /* store used ubuf, so we can prevent reloading */
+       struct io_mapped_ubuf           *imu;
 
-       /* used with ctx->iopoll_list with reads/writes */
-       struct list_head                inflight_entry;
+       /* used by request caches, completion batching and iopoll */
+       struct io_wq_work_node          comp_list;
+       atomic_t                        refs;
+       struct io_kiocb                 *link;
        struct io_task_work             io_task_work;
        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
        struct hlist_node               hash_node;
+       /* internal polling, see IORING_FEAT_FAST_POLL */
        struct async_poll               *apoll;
+       /* opcode allocated if it needs to store data for async defer */
+       void                            *async_data;
        struct io_wq_work               work;
+       /* custom credentials, valid IFF REQ_F_CREDS is set */
        const struct cred               *creds;
-
-       /* store used ubuf, so we can prevent reloading */
-       struct io_mapped_ubuf           *imu;
+       /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+       struct io_buffer                *kbuf;
 };
 
 struct io_tctx_node {
@@ -902,12 +897,12 @@ struct io_defer_entry {
 struct io_op_def {
        /* needs req->file assigned */
        unsigned                needs_file : 1;
+       /* should block plug */
+       unsigned                plug : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
        unsigned                unbound_nonreg_file : 1;
-       /* opcode is not supported by this kernel */
-       unsigned                not_supported : 1;
        /* set if opcode supports polled "wait" */
        unsigned                pollin : 1;
        unsigned                pollout : 1;
@@ -915,8 +910,8 @@ struct io_op_def {
        unsigned                buffer_select : 1;
        /* do prep async if is going to be punted */
        unsigned                needs_async_setup : 1;
-       /* should block plug */
-       unsigned                plug : 1;
+       /* opcode is not supported by this kernel */
+       unsigned                not_supported : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
 };
@@ -1080,7 +1075,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 
 static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-                                long res, unsigned int cflags);
+                                s32 res, u32 cflags);
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req);
 static void io_dismantle_req(struct io_kiocb *req);
@@ -1095,7 +1090,7 @@ static void __io_queue_sqe(struct io_kiocb *req);
 static void io_rsrc_put_work(struct work_struct *work);
 
 static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_ring_ctx *ctx);
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 static int io_req_prep_async(struct io_kiocb *req);
 
 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
@@ -1167,6 +1162,12 @@ static inline void req_ref_get(struct io_kiocb *req)
        atomic_inc(&req->refs);
 }
 
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+       if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+               __io_submit_flush_completions(ctx);
+}
+
 static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
 {
        if (!(req->flags & REQ_F_REFCOUNT)) {
@@ -1180,13 +1181,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
        __io_req_set_refcount(req, 1);
 }
 
-static inline void io_req_set_rsrc_node(struct io_kiocb *req)
+#define IO_RSRC_REF_BATCH      100
+
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+                                         struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
 {
-       struct io_ring_ctx *ctx = req->ctx;
+       struct percpu_ref *ref = req->fixed_rsrc_refs;
+
+       if (ref) {
+               if (ref == &ctx->rsrc_node->refs)
+                       ctx->rsrc_cached_refs++;
+               else
+                       percpu_ref_put(ref);
+       }
+}
+
+static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+       if (req->fixed_rsrc_refs)
+               percpu_ref_put(req->fixed_rsrc_refs);
+}
+
+static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
+{
+       if (ctx->rsrc_cached_refs) {
+               percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+               ctx->rsrc_cached_refs = 0;
+       }
+}
+
+static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
+{
+       ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+       percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
 
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+                                       struct io_ring_ctx *ctx)
+{
        if (!req->fixed_rsrc_refs) {
                req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
-               percpu_ref_get(req->fixed_rsrc_refs);
+               ctx->rsrc_cached_refs--;
+               if (unlikely(ctx->rsrc_cached_refs < 0))
+                       io_rsrc_refs_refill(ctx);
        }
 }
 
@@ -1219,6 +1259,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
        return false;
 }
 
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+       return req->flags & REQ_F_ASYNC_DATA;
+}
+
 static inline void req_set_fail(struct io_kiocb *req)
 {
        req->flags |= REQ_F_FAIL;
@@ -1230,7 +1275,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
        req->result = res;
 }
 
-static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
 {
        struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 
@@ -1242,7 +1287,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
        return !req->timeout.off;
 }
 
-static void io_fallback_req_func(struct work_struct *work)
+static __cold void io_fallback_req_func(struct work_struct *work)
 {
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
                                                fallback_work.work);
@@ -1255,15 +1300,13 @@ static void io_fallback_req_func(struct work_struct *work)
                req->io_task_work.func(req, &locked);
 
        if (locked) {
-               if (ctx->submit_state.compl_nr)
-                       io_submit_flush_completions(ctx);
+               io_submit_flush_completions(ctx);
                mutex_unlock(&ctx->uring_lock);
        }
        percpu_ref_put(&ctx->refs);
-
 }
 
-static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
        struct io_ring_ctx *ctx;
        int hash_bits;
@@ -1300,7 +1343,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        ctx->flags = p->flags;
        init_waitqueue_head(&ctx->sqo_sq_wait);
        INIT_LIST_HEAD(&ctx->sqd_list);
-       init_waitqueue_head(&ctx->poll_wait);
        INIT_LIST_HEAD(&ctx->cq_overflow_list);
        init_completion(&ctx->ref_comp);
        xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
@@ -1309,7 +1351,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        init_waitqueue_head(&ctx->cq_wait);
        spin_lock_init(&ctx->completion_lock);
        spin_lock_init(&ctx->timeout_lock);
-       INIT_LIST_HEAD(&ctx->iopoll_list);
+       INIT_WQ_LIST(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
        INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1318,9 +1360,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
        init_llist_head(&ctx->rsrc_put_llist);
        INIT_LIST_HEAD(&ctx->tctx_list);
-       INIT_LIST_HEAD(&ctx->submit_state.free_list);
-       INIT_LIST_HEAD(&ctx->locked_free_list);
+       ctx->submit_state.free_list.next = NULL;
+       INIT_WQ_LIST(&ctx->locked_free_list);
        INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
+       INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
        return ctx;
 err:
        kfree(ctx->dummy_ubuf);
@@ -1348,21 +1391,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
        return false;
 }
 
-#define FFS_ASYNC_READ         0x1UL
-#define FFS_ASYNC_WRITE                0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG              0x4UL
-#else
-#define FFS_ISREG              0x0UL
-#endif
-#define FFS_MASK               ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+#define FFS_NOWAIT             0x1UL
+#define FFS_ISREG              0x2UL
+#define FFS_MASK               ~(FFS_NOWAIT|FFS_ISREG)
 
 static inline bool io_req_ffs_set(struct io_kiocb *req)
 {
-       return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
+       return req->flags & REQ_F_FIXED_FILE;
 }
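
With only two FFS bits left they fit in the spare low bits of a 4-byte-aligned struct file pointer even on 32-bit, which is presumably why the CONFIG_64BIT special-casing above could be dropped; FFS_MASK strips the tags before the pointer is used. A standalone sketch of that pointer-tagging idea with made-up names (assumes the pointed-to object is at least 4-byte aligned):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_NOWAIT      0x1UL
#define TAG_ISREG       0x2UL
#define TAG_MASK        (~(TAG_NOWAIT | TAG_ISREG))

struct file { int dummy; };             /* placeholder, 4-byte aligned */

/* stash per-file flag bits in the low bits of the pointer itself */
static uintptr_t pack(struct file *f, unsigned long flags)
{
        return (uintptr_t)f | flags;
}

static struct file *unpack(uintptr_t slot, unsigned long *flags)
{
        *flags = slot & ~TAG_MASK;
        return (struct file *)(slot & TAG_MASK);
}

int main(void)
{
        static struct file f;
        unsigned long flags;
        uintptr_t slot = pack(&f, TAG_NOWAIT | TAG_ISREG);

        assert(unpack(slot, &flags) == &f);
        assert(flags == (TAG_NOWAIT | TAG_ISREG));
        printf("flags=%#lx\n", flags);
        return 0;
}
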
 
-static void io_req_track_inflight(struct io_kiocb *req)
+static inline void io_req_track_inflight(struct io_kiocb *req)
 {
        if (!(req->flags & REQ_F_INFLIGHT)) {
                req->flags |= REQ_F_INFLIGHT;
@@ -1440,15 +1478,19 @@ static void io_prep_async_link(struct io_kiocb *req)
        }
 }
 
-static void io_queue_async_work(struct io_kiocb *req, bool *locked)
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+       struct io_submit_state *state = &req->ctx->submit_state;
+
+       wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
+static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
 {
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link = io_prep_linked_timeout(req);
        struct io_uring_task *tctx = req->task->io_uring;
 
-       /* must not take the lock, NULL it as a precaution */
-       locked = NULL;
-
        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);
 
@@ -1489,7 +1531,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
        }
 }
 
-static void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 {
        while (!list_empty(&ctx->defer_list)) {
                struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
@@ -1503,7 +1545,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
        }
 }
 
-static void io_flush_timeouts(struct io_ring_ctx *ctx)
+static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
        __must_hold(&ctx->completion_lock)
 {
        u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
@@ -1536,7 +1578,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
        spin_unlock_irq(&ctx->timeout_lock);
 }
 
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 {
        if (ctx->off_timeout_used)
                io_flush_timeouts(ctx);
@@ -1606,12 +1648,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
         */
        if (wq_has_sleeper(&ctx->cq_wait))
                wake_up_all(&ctx->cq_wait);
-       if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
-               wake_up(&ctx->sq_data->wait);
        if (io_should_trigger_evfd(ctx))
                eventfd_signal(ctx->cq_ev_fd, 1);
-       if (waitqueue_active(&ctx->poll_wait))
-               wake_up_interruptible(&ctx->poll_wait);
 }
 
 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1625,8 +1663,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
        }
        if (io_should_trigger_evfd(ctx))
                eventfd_signal(ctx->cq_ev_fd, 1);
-       if (waitqueue_active(&ctx->poll_wait))
-               wake_up_interruptible(&ctx->poll_wait);
 }
 
 /* Returns true if there are no backlogged entries after the flush */
@@ -1722,7 +1758,7 @@ static inline void io_get_task_refs(int nr)
 }
 
 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
-                                    long res, unsigned int cflags)
+                                    s32 res, u32 cflags)
 {
        struct io_overflow_cqe *ocqe;
 
@@ -1750,7 +1786,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 }
 
 static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-                                         long res, unsigned int cflags)
+                                         s32 res, u32 cflags)
 {
        struct io_uring_cqe *cqe;
 
@@ -1773,13 +1809,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
 
 /* not hot enough to justify bloating with inlining */
 static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-                                         long res, unsigned int cflags)
+                                         s32 res, u32 cflags)
 {
        return __io_cqring_fill_event(ctx, user_data, res, cflags);
 }
 
-static void io_req_complete_post(struct io_kiocb *req, long res,
-                                unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, s32 res,
+                                u32 cflags)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
@@ -1798,40 +1834,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
                                req->link = NULL;
                        }
                }
+               io_req_put_rsrc(req, ctx);
                io_dismantle_req(req);
                io_put_task(req->task, 1);
-               list_add(&req->inflight_entry, &ctx->locked_free_list);
+               wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
                ctx->locked_free_nr++;
-       } else {
-               if (!percpu_ref_tryget(&ctx->refs))
-                       req = NULL;
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
-
-       if (req) {
-               io_cqring_ev_posted(ctx);
-               percpu_ref_put(&ctx->refs);
-       }
-}
-
-static inline bool io_req_needs_clean(struct io_kiocb *req)
-{
-       return req->flags & IO_REQ_CLEAN_FLAGS;
+       io_cqring_ev_posted(ctx);
 }
 
-static void io_req_complete_state(struct io_kiocb *req, long res,
-                                 unsigned int cflags)
+static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
+                                        u32 cflags)
 {
-       if (io_req_needs_clean(req))
-               io_clean_op(req);
        req->result = res;
-       req->compl.cflags = cflags;
+       req->cflags = cflags;
        req->flags |= REQ_F_COMPLETE_INLINE;
 }
 
 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
-                                    long res, unsigned cflags)
+                                    s32 res, u32 cflags)
 {
        if (issue_flags & IO_URING_F_COMPLETE_DEFER)
                io_req_complete_state(req, res, cflags);
@@ -1839,12 +1862,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
                io_req_complete_post(req, res, cflags);
 }
 
-static inline void io_req_complete(struct io_kiocb *req, long res)
+static inline void io_req_complete(struct io_kiocb *req, s32 res)
 {
        __io_req_complete(req, 0, res, 0);
 }
 
-static void io_req_complete_failed(struct io_kiocb *req, long res)
+static void io_req_complete_failed(struct io_kiocb *req, s32 res)
 {
        req_set_fail(req);
        io_req_complete_post(req, res, 0);
@@ -1878,7 +1901,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
                                        struct io_submit_state *state)
 {
        spin_lock(&ctx->completion_lock);
-       list_splice_init(&ctx->locked_free_list, &state->free_list);
+       wq_list_splice(&ctx->locked_free_list, &state->free_list);
        ctx->locked_free_nr = 0;
        spin_unlock(&ctx->completion_lock);
 }
@@ -1887,7 +1910,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 {
        struct io_submit_state *state = &ctx->submit_state;
-       int nr;
 
        /*
         * If we have more than a batch's worth of requests in our IRQ side
@@ -1896,20 +1918,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
         */
        if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
                io_flush_cached_locked_reqs(ctx, state);
-
-       nr = state->free_reqs;
-       while (!list_empty(&state->free_list)) {
-               struct io_kiocb *req = list_first_entry(&state->free_list,
-                                       struct io_kiocb, inflight_entry);
-
-               list_del(&req->inflight_entry);
-               state->reqs[nr++] = req;
-               if (nr == ARRAY_SIZE(state->reqs))
-                       break;
-       }
-
-       state->free_reqs = nr;
-       return nr != 0;
+       return !!state->free_list.next;
 }
 
 /*
@@ -1918,38 +1927,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
  * Because of that, io_alloc_req() should be called only under ->uring_lock
  * and with extra caution to not get a request that is still worked on.
  */
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
 {
        struct io_submit_state *state = &ctx->submit_state;
        gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+       void *reqs[IO_REQ_ALLOC_BATCH];
+       struct io_kiocb *req;
        int ret, i;
 
-       BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
-
-       if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
-               goto got_req;
+       if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
+               return true;
 
-       ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
-                                   state->reqs);
+       ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
 
        /*
         * Bulk alloc is all-or-nothing. If we fail to get a batch,
         * retry single alloc to be on the safe side.
         */
        if (unlikely(ret <= 0)) {
-               state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
-               if (!state->reqs[0])
-                       return NULL;
+               reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+               if (!reqs[0])
+                       return false;
                ret = 1;
        }
 
-       for (i = 0; i < ret; i++)
-               io_preinit_req(state->reqs[i], ctx);
-       state->free_reqs = ret;
-got_req:
-       state->free_reqs--;
-       return state->reqs[state->free_reqs];
+       percpu_ref_get_many(&ctx->refs, ret);
+       for (i = 0; i < ret; i++) {
+               req = reqs[i];
+
+               io_preinit_req(req, ctx);
+               wq_stack_add_head(&req->comp_list, &state->free_list);
+       }
+       return true;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+       if (unlikely(!ctx->submit_state.free_list.next))
+               return __io_alloc_req_refill(ctx);
+       return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+       struct io_wq_work_node *node;
+
+       node = wq_stack_extract(&ctx->submit_state.free_list);
+       return container_of(node, struct io_kiocb, comp_list);
 }
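
The refill path above swaps the old fixed reqs[] array for an intrusive free-list stack: kmem_cache_alloc_bulk() fills the cache, a single allocation is the fallback, and io_alloc_req() just pops the head. A rough userspace analogue using malloc() and a singly linked LIFO (simplified: no all-or-nothing bulk allocator, no slab cache):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define ALLOC_BATCH 8

struct req {
        struct req *next;       /* intrusive link, like ->comp_list */
        int data;
};

static struct req *free_list;   /* LIFO cache of ready-to-use requests */

/* refill the cache: grab up to a batch, settle for whatever we got */
static bool req_cache_refill(void)
{
        int got = 0;

        for (int i = 0; i < ALLOC_BATCH; i++) {
                struct req *r = malloc(sizeof(*r));

                if (!r)
                        break;
                r->next = free_list;
                free_list = r;
                got++;
        }
        return got != 0;
}

static struct req *req_alloc(void)
{
        struct req *r;

        if (!free_list && !req_cache_refill())
                return NULL;
        r = free_list;
        free_list = r->next;
        return r;
}

/* completed requests go back on the cache instead of being free()d */
static void req_free(struct req *r)
{
        r->next = free_list;
        free_list = r;
}

int main(void)
{
        struct req *a = req_alloc();

        if (!a)
                return 1;
        a->data = 42;
        req_free(a);
        printf("%s\n", req_alloc() == a ? "recycled" : "fresh");
        return 0;
}
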
 
 static inline void io_put_file(struct file *file)
@@ -1958,35 +1983,28 @@ static inline void io_put_file(struct file *file)
                fput(file);
 }
 
-static void io_dismantle_req(struct io_kiocb *req)
+static inline void io_dismantle_req(struct io_kiocb *req)
 {
        unsigned int flags = req->flags;
 
-       if (io_req_needs_clean(req))
+       if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
                io_clean_op(req);
        if (!(flags & REQ_F_FIXED_FILE))
                io_put_file(req->file);
-       if (req->fixed_rsrc_refs)
-               percpu_ref_put(req->fixed_rsrc_refs);
-       if (req->async_data) {
-               kfree(req->async_data);
-               req->async_data = NULL;
-       }
 }
 
-static void __io_free_req(struct io_kiocb *req)
+static __cold void __io_free_req(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
+       io_req_put_rsrc(req, ctx);
        io_dismantle_req(req);
        io_put_task(req->task, 1);
 
        spin_lock(&ctx->completion_lock);
-       list_add(&req->inflight_entry, &ctx->locked_free_list);
+       wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
        ctx->locked_free_nr++;
        spin_unlock(&ctx->completion_lock);
-
-       percpu_ref_put(&ctx->refs);
 }
 
 static inline void io_remove_next_linked(struct io_kiocb *req)
@@ -2072,47 +2090,45 @@ static bool io_disarm_next(struct io_kiocb *req)
        return posted;
 }
 
-static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+static void __io_req_find_next_prep(struct io_kiocb *req)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+       bool posted;
+
+       spin_lock(&ctx->completion_lock);
+       posted = io_disarm_next(req);
+       if (posted)
+               io_commit_cqring(req->ctx);
+       spin_unlock(&ctx->completion_lock);
+       if (posted)
+               io_cqring_ev_posted(ctx);
+}
+
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 {
        struct io_kiocb *nxt;
 
+       if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
+               return NULL;
        /*
         * If LINK is set, we have dependent requests in this chain. If we
         * didn't fail this request, queue the first one up, moving any other
         * dependencies to the next request. In case of failure, fail the rest
         * of the chain.
         */
-       if (req->flags & IO_DISARM_MASK) {
-               struct io_ring_ctx *ctx = req->ctx;
-               bool posted;
-
-               spin_lock(&ctx->completion_lock);
-               posted = io_disarm_next(req);
-               if (posted)
-                       io_commit_cqring(req->ctx);
-               spin_unlock(&ctx->completion_lock);
-               if (posted)
-                       io_cqring_ev_posted(ctx);
-       }
+       if (unlikely(req->flags & IO_DISARM_MASK))
+               __io_req_find_next_prep(req);
        nxt = req->link;
        req->link = NULL;
        return nxt;
 }
 
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
-       if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
-               return NULL;
-       return __io_req_find_next(req);
-}
-
 static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 {
        if (!ctx)
                return;
        if (*locked) {
-               if (ctx->submit_state.compl_nr)
-                       io_submit_flush_completions(ctx);
+               io_submit_flush_completions(ctx);
                mutex_unlock(&ctx->uring_lock);
                *locked = false;
        }
@@ -2129,7 +2145,7 @@ static void tctx_task_work(struct callback_head *cb)
        while (1) {
                struct io_wq_work_node *node;
 
-               if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+               if (!tctx->task_list.first && locked)
                        io_submit_flush_completions(ctx);
 
                spin_lock_irq(&tctx->task_lock);
@@ -2192,8 +2208,9 @@ static void io_req_task_work_add(struct io_kiocb *req)
         * will do the job.
         */
        notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
-       if (!task_work_add(tsk, &tctx->task_work, notify)) {
-               wake_up_process(tsk);
+       if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
+               if (notify == TWA_NONE)
+                       wake_up_process(tsk);
                return;
        }
 
@@ -2271,77 +2288,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
        io_free_req(req);
 }
 
-struct req_batch {
-       struct task_struct      *task;
-       int                     task_refs;
-       int                     ctx_refs;
-};
-
-static inline void io_init_req_batch(struct req_batch *rb)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+                               struct io_wq_work_node *node)
+       __must_hold(&ctx->uring_lock)
 {
-       rb->task_refs = 0;
-       rb->ctx_refs = 0;
-       rb->task = NULL;
-}
+       struct task_struct *task = NULL;
+       int task_refs = 0;
 
-static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
-                                    struct req_batch *rb)
-{
-       if (rb->ctx_refs)
-               percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
-       if (rb->task)
-               io_put_task(rb->task, rb->task_refs);
-}
+       do {
+               struct io_kiocb *req = container_of(node, struct io_kiocb,
+                                                   comp_list);
 
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
-                             struct io_submit_state *state)
-{
-       io_queue_next(req);
-       io_dismantle_req(req);
+               if (unlikely(req->flags & REQ_F_REFCOUNT)) {
+                       node = req->comp_list.next;
+                       if (!req_ref_put_and_test(req))
+                               continue;
+               }
 
-       if (req->task != rb->task) {
-               if (rb->task)
-                       io_put_task(rb->task, rb->task_refs);
-               rb->task = req->task;
-               rb->task_refs = 0;
-       }
-       rb->task_refs++;
-       rb->ctx_refs++;
+               io_req_put_rsrc_locked(req, ctx);
+               io_queue_next(req);
+               io_dismantle_req(req);
 
-       if (state->free_reqs != ARRAY_SIZE(state->reqs))
-               state->reqs[state->free_reqs++] = req;
-       else
-               list_add(&req->inflight_entry, &state->free_list);
+               if (req->task != task) {
+                       if (task)
+                               io_put_task(task, task_refs);
+                       task = req->task;
+                       task_refs = 0;
+               }
+               task_refs++;
+               node = req->comp_list.next;
+               wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+       } while (node);
+
+       if (task)
+               io_put_task(task, task_refs);
 }
 
-static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
 {
+       struct io_wq_work_node *node, *prev;
        struct io_submit_state *state = &ctx->submit_state;
-       int i, nr = state->compl_nr;
-       struct req_batch rb;
 
        spin_lock(&ctx->completion_lock);
-       for (i = 0; i < nr; i++) {
-               struct io_kiocb *req = state->compl_reqs[i];
+       wq_list_for_each(node, prev, &state->compl_reqs) {
+               struct io_kiocb *req = container_of(node, struct io_kiocb,
+                                                   comp_list);
 
                __io_cqring_fill_event(ctx, req->user_data, req->result,
-                                       req->compl.cflags);
+                                       req->cflags);
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        io_cqring_ev_posted(ctx);
 
-       io_init_req_batch(&rb);
-       for (i = 0; i < nr; i++) {
-               struct io_kiocb *req = state->compl_reqs[i];
-
-               if (req_ref_put_and_test(req))
-                       io_req_free_batch(&rb, req, &ctx->submit_state);
-       }
-
-       io_req_free_batch_finish(ctx, &rb);
-       state->compl_nr = 0;
+       io_free_batch_list(ctx, state->compl_reqs.first);
+       INIT_WQ_LIST(&state->compl_reqs);
 }
 
 /*
@@ -2401,12 +2403,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
 
 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
 {
-       struct io_buffer *kbuf;
-
        if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
                return 0;
-       kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
-       return io_put_kbuf(req, kbuf);
+       return io_put_kbuf(req, req->kbuf);
 }
 
 static inline bool io_run_task_work(void)
@@ -2420,50 +2419,22 @@ static inline bool io_run_task_work(void)
        return false;
 }
 
-/*
- * Find and free completed poll iocbs
- */
-static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
-                              struct list_head *done)
-{
-       struct req_batch rb;
-       struct io_kiocb *req;
-
-       /* order with ->result store in io_complete_rw_iopoll() */
-       smp_rmb();
-
-       io_init_req_batch(&rb);
-       while (!list_empty(done)) {
-               req = list_first_entry(done, struct io_kiocb, inflight_entry);
-               list_del(&req->inflight_entry);
-
-               __io_cqring_fill_event(ctx, req->user_data, req->result,
-                                       io_put_rw_kbuf(req));
-               (*nr_events)++;
-
-               if (req_ref_put_and_test(req))
-                       io_req_free_batch(&rb, req, &ctx->submit_state);
-       }
-
-       io_commit_cqring(ctx);
-       io_cqring_ev_posted_iopoll(ctx);
-       io_req_free_batch_finish(ctx, &rb);
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
-                       long min)
+static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
-       struct io_kiocb *req, *tmp;
-       LIST_HEAD(done);
-       bool spin;
+       struct io_wq_work_node *pos, *start, *prev;
+       unsigned int poll_flags = BLK_POLL_NOSLEEP;
+       DEFINE_IO_COMP_BATCH(iob);
+       int nr_events = 0;
 
        /*
         * Only spin for completions if we don't have multiple devices hanging
-        * off our complete list, and we're under the requested amount.
+        * off our complete list.
         */
-       spin = !ctx->poll_multi_queue && *nr_events < min;
+       if (ctx->poll_multi_queue || force_nonspin)
+               poll_flags |= BLK_POLL_ONESHOT;
 
-       list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+       wq_list_for_each(pos, start, &ctx->iopoll_list) {
+               struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
                struct kiocb *kiocb = &req->rw.kiocb;
                int ret;
 
@@ -2472,47 +2443,62 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                 * If we find a request that requires polling, break out
                 * and complete those lists first, if we have entries there.
                 */
-               if (READ_ONCE(req->iopoll_completed)) {
-                       list_move_tail(&req->inflight_entry, &done);
-                       continue;
-               }
-               if (!list_empty(&done))
+               if (READ_ONCE(req->iopoll_completed))
                        break;
 
-               ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+               ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
                if (unlikely(ret < 0))
                        return ret;
                else if (ret)
-                       spin = false;
+                       poll_flags |= BLK_POLL_ONESHOT;
 
                /* iopoll may have completed current req */
-               if (READ_ONCE(req->iopoll_completed))
-                       list_move_tail(&req->inflight_entry, &done);
+               if (!rq_list_empty(iob.req_list) ||
+                   READ_ONCE(req->iopoll_completed))
+                       break;
        }
 
-       if (!list_empty(&done))
-               io_iopoll_complete(ctx, nr_events, &done);
+       if (!rq_list_empty(iob.req_list))
+               iob.complete(&iob);
+       else if (!pos)
+               return 0;
+
+       prev = start;
+       wq_list_for_each_resume(pos, prev) {
+               struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
 
-       return 0;
+               /* order with io_complete_rw_iopoll(), e.g. ->result updates */
+               if (!smp_load_acquire(&req->iopoll_completed))
+                       break;
+               __io_cqring_fill_event(ctx, req->user_data, req->result,
+                                       io_put_rw_kbuf(req));
+               nr_events++;
+       }
+
+       if (unlikely(!nr_events))
+               return 0;
+
+       io_commit_cqring(ctx);
+       io_cqring_ev_posted_iopoll(ctx);
+       pos = start ? start->next : ctx->iopoll_list.first;
+       wq_list_cut(&ctx->iopoll_list, prev, start);
+       io_free_batch_list(ctx, pos);
+       return nr_events;
 }
 
 /*
  * We can't just wait for polled events to come to us, we have to actively
  * find and complete them.
  */
-static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
+static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 {
        if (!(ctx->flags & IORING_SETUP_IOPOLL))
                return;
 
        mutex_lock(&ctx->uring_lock);
-       while (!list_empty(&ctx->iopoll_list)) {
-               unsigned int nr_events = 0;
-
-               io_do_iopoll(ctx, &nr_events, 0);
-
+       while (!wq_list_empty(&ctx->iopoll_list)) {
                /* let it sleep and repeat later if we can't complete a request */
-               if (nr_events == 0)
+               if (io_do_iopoll(ctx, true) == 0)
                        break;
                /*
                 * Ensure we allow local-to-the-cpu processing to take place,
@@ -2559,7 +2545,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
                 * forever, while the workqueue is stuck trying to acquire the
                 * very same mutex.
                 */
-               if (list_empty(&ctx->iopoll_list)) {
+               if (wq_list_empty(&ctx->iopoll_list)) {
                        u32 tail = ctx->cached_cq_tail;
 
                        mutex_unlock(&ctx->uring_lock);
@@ -2568,11 +2554,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 
                        /* some requests don't go through iopoll_list */
                        if (tail != ctx->cached_cq_tail ||
-                           list_empty(&ctx->iopoll_list))
+                           wq_list_empty(&ctx->iopoll_list))
                                break;
                }
-               ret = io_do_iopoll(ctx, &nr_events, min);
-       } while (!ret && nr_events < min && !need_resched());
+               ret = io_do_iopoll(ctx, !min);
+               if (ret < 0)
+                       break;
+               nr_events += ret;
+               ret = 0;
+       } while (nr_events < min && !need_resched());
 out:
        mutex_unlock(&ctx->uring_lock);
        return ret;
@@ -2597,9 +2587,9 @@ static bool io_resubmit_prep(struct io_kiocb *req)
 {
        struct io_async_rw *rw = req->async_data;
 
-       if (!rw)
+       if (!req_has_async_data(req))
                return !io_req_prep_async(req);
-       iov_iter_restore(&rw->iter, &rw->iter_state);
+       iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
        return true;
 }
 
@@ -2643,7 +2633,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 {
        if (req->rw.kiocb.ki_flags & IOCB_WRITE)
                kiocb_end_write(req);
-       if (res != req->result) {
+       if (unlikely(res != req->result)) {
                if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
                    io_rw_should_reissue(req)) {
                        req->flags |= REQ_F_REISSUE;
@@ -2658,16 +2648,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 static void io_req_task_complete(struct io_kiocb *req, bool *locked)
 {
        unsigned int cflags = io_put_rw_kbuf(req);
-       long res = req->result;
+       int res = req->result;
 
        if (*locked) {
-               struct io_ring_ctx *ctx = req->ctx;
-               struct io_submit_state *state = &ctx->submit_state;
-
                io_req_complete_state(req, res, cflags);
-               state->compl_reqs[state->compl_nr++] = req;
-               if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
-                       io_submit_flush_completions(ctx);
+               io_req_add_compl_list(req);
        } else {
                io_req_complete_post(req, res, cflags);
        }
@@ -2681,7 +2666,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
        __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
 }
 
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw(struct kiocb *kiocb, long res)
 {
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
@@ -2692,7 +2677,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
        io_req_task_work_add(req);
 }
 
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
 {
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
@@ -2703,12 +2688,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
                        req->flags |= REQ_F_REISSUE;
                        return;
                }
+               req->result = res;
        }
 
-       WRITE_ONCE(req->result, res);
-       /* order with io_iopoll_complete() checking ->result */
-       smp_wmb();
-       WRITE_ONCE(req->iopoll_completed, 1);
+       /* order with io_iopoll_complete() checking ->iopoll_completed */
+       smp_store_release(&req->iopoll_completed, 1);
 }
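
The smp_store_release() here pairs with the smp_load_acquire() of ->iopoll_completed earlier in io_do_iopoll(), replacing the old smp_wmb()/smp_rmb() pair: once the reaper sees the flag set, the preceding ->result store is guaranteed to be visible. A small userspace illustration of that release/acquire pairing using C11 atomics and pthreads (not the kernel primitives); build with -pthread:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct req {
        int result;             /* plain payload, written before the flag */
        atomic_int completed;
};

static struct req r;

static void *completer(void *arg)
{
        (void)arg;
        r.result = 42;
        /* publish: everything written above is ordered before this store */
        atomic_store_explicit(&r.completed, 1, memory_order_release);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, completer, NULL);

        /* acquire pairs with the release: seeing 1 implies seeing result=42 */
        while (!atomic_load_explicit(&r.completed, memory_order_acquire))
                ;
        printf("result=%d\n", r.result);
        pthread_join(t, NULL);
        return 0;
}
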
 
 /*
@@ -2717,13 +2701,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  * find it from a io_do_iopoll() thread before the issuer is done
  * accessing the kiocb cookie.
  */
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_ring_ctx *ctx = req->ctx;
-       const bool in_async = io_wq_current_is_worker();
+       const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
        /* workqueue context doesn't hold uring_lock, grab it now */
-       if (unlikely(in_async))
+       if (unlikely(needs_lock))
                mutex_lock(&ctx->uring_lock);
 
        /*
@@ -2731,23 +2715,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
         * how we do polling eventually, not spinning if we're on potentially
         * different devices.
         */
-       if (list_empty(&ctx->iopoll_list)) {
+       if (wq_list_empty(&ctx->iopoll_list)) {
                ctx->poll_multi_queue = false;
        } else if (!ctx->poll_multi_queue) {
                struct io_kiocb *list_req;
-               unsigned int queue_num0, queue_num1;
 
-               list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
-                                               inflight_entry);
-
-               if (list_req->file != req->file) {
+               list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
+                                       comp_list);
+               if (list_req->file != req->file)
                        ctx->poll_multi_queue = true;
-               } else {
-                       queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
-                       queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
-                       if (queue_num0 != queue_num1)
-                               ctx->poll_multi_queue = true;
-               }
        }
 
        /*
@@ -2755,11 +2731,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
         * it to the front so we find it first.
         */
        if (READ_ONCE(req->iopoll_completed))
-               list_add(&req->inflight_entry, &ctx->iopoll_list);
+               wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
        else
-               list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+               wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
 
-       if (unlikely(in_async)) {
+       if (unlikely(needs_lock)) {
                /*
                 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
                 * in sq thread task context or in io worker task context. If
@@ -2784,10 +2760,8 @@ static bool io_bdev_nowait(struct block_device *bdev)
  * any file. For now, just ensure that anything potentially problematic is done
  * inline.
  */
-static bool __io_file_supports_nowait(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, umode_t mode)
 {
-       umode_t mode = file_inode(file)->i_mode;
-
        if (S_ISBLK(mode)) {
                if (IS_ENABLED(CONFIG_BLOCK) &&
                    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
@@ -2807,28 +2781,32 @@ static bool __io_file_supports_nowait(struct file *file, int rw)
        /* any ->read/write should understand O_NONBLOCK */
        if (file->f_flags & O_NONBLOCK)
                return true;
+       return file->f_mode & FMODE_NOWAIT;
+}
 
-       if (!(file->f_mode & FMODE_NOWAIT))
-               return false;
-
-       if (rw == READ)
-               return file->f_op->read_iter != NULL;
+/*
+ * Compute the FFS_* flag bits (FFS_ISREG, FFS_NOWAIT) for a file once, so
+ * they can be cached in the fixed file table or in the request flags instead
+ * of being rechecked for every request.
+ */
+static unsigned int io_file_get_flags(struct file *file)
+{
+       umode_t mode = file_inode(file)->i_mode;
+       unsigned int res = 0;
 
-       return file->f_op->write_iter != NULL;
+       if (S_ISREG(mode))
+               res |= FFS_ISREG;
+       if (__io_file_supports_nowait(file, mode))
+               res |= FFS_NOWAIT;
+       return res;
 }
 
-static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
 {
-       if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
-               return true;
-       else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
-               return true;
-
-       return __io_file_supports_nowait(req->file, rw);
+       return req->flags & REQ_F_SUPPORT_NOWAIT;
 }
 
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-                     int rw)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_ring_ctx *ctx = req->ctx;
        struct kiocb *kiocb = &req->rw.kiocb;
@@ -2836,16 +2814,15 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        unsigned ioprio;
        int ret;
 
-       if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
-               req->flags |= REQ_F_ISREG;
+       if (!io_req_ffs_set(req))
+               req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
 
        kiocb->ki_pos = READ_ONCE(sqe->off);
        if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
                req->flags |= REQ_F_CUR_POS;
                kiocb->ki_pos = file->f_pos;
        }
-       kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
-       kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+       kiocb->ki_flags = iocb_flags(file);
        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
        if (unlikely(ret))
                return ret;
@@ -2856,22 +2833,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
         * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
         */
        if ((kiocb->ki_flags & IOCB_NOWAIT) ||
-           ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
+           ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
                req->flags |= REQ_F_NOWAIT;
 
-       ioprio = READ_ONCE(sqe->ioprio);
-       if (ioprio) {
-               ret = ioprio_check_cap(ioprio);
-               if (ret)
-                       return ret;
-
-               kiocb->ki_ioprio = ioprio;
-       } else
-               kiocb->ki_ioprio = get_current_ioprio();
-
        if (ctx->flags & IORING_SETUP_IOPOLL) {
-               if (!(kiocb->ki_flags & IOCB_DIRECT) ||
-                   !kiocb->ki_filp->f_op->iopoll)
+               if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
                        return -EOPNOTSUPP;
 
                kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
@@ -2883,12 +2849,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                kiocb->ki_complete = io_complete_rw;
        }
 
-       if (req->opcode == IORING_OP_READ_FIXED ||
-           req->opcode == IORING_OP_WRITE_FIXED) {
-               req->imu = NULL;
-               io_req_set_rsrc_node(req);
+       ioprio = READ_ONCE(sqe->ioprio);
+       if (ioprio) {
+               ret = ioprio_check_cap(ioprio);
+               if (ret)
+                       return ret;
+
+               kiocb->ki_ioprio = ioprio;
+       } else {
+               kiocb->ki_ioprio = get_current_ioprio();
        }
 
+       req->imu = NULL;
        req->rw.addr = READ_ONCE(sqe->addr);
        req->rw.len = READ_ONCE(sqe->len);
        req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2912,7 +2884,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
                ret = -EINTR;
                fallthrough;
        default:
-               kiocb->ki_complete(kiocb, ret, 0);
+               kiocb->ki_complete(kiocb, ret);
        }
 }
 
@@ -2923,7 +2895,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
        struct io_async_rw *io = req->async_data;
 
        /* add previously done IO, if any */
-       if (io && io->bytes_done > 0) {
+       if (req_has_async_data(req) && io->bytes_done > 0) {
                if (ret < 0)
                        ret = io->bytes_done;
                else
@@ -2946,7 +2918,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
                        struct io_ring_ctx *ctx = req->ctx;
 
                        req_set_fail(req);
-                       if (!(issue_flags & IO_URING_F_NONBLOCK)) {
+                       if (issue_flags & IO_URING_F_UNLOCKED) {
                                mutex_lock(&ctx->uring_lock);
                                __io_req_complete(req, issue_flags, ret, cflags);
                                mutex_unlock(&ctx->uring_lock);
@@ -3017,13 +2989,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
 
 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 {
-       struct io_ring_ctx *ctx = req->ctx;
        struct io_mapped_ubuf *imu = req->imu;
        u16 index, buf_index = req->buf_index;
 
        if (likely(!imu)) {
+               struct io_ring_ctx *ctx = req->ctx;
+
                if (unlikely(buf_index >= ctx->nr_user_bufs))
                        return -EFAULT;
+               io_req_set_rsrc_node(req, ctx);
                index = array_index_nospec(buf_index, ctx->nr_user_bufs);
                imu = READ_ONCE(ctx->user_bufs[index]);
                req->imu = imu;
@@ -3050,10 +3024,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
 }
 
 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
-                                         int bgid, struct io_buffer *kbuf,
-                                         bool needs_lock)
+                                         int bgid, unsigned int issue_flags)
 {
+       struct io_buffer *kbuf = req->kbuf;
        struct io_buffer *head;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
        if (req->flags & REQ_F_BUFFER_SELECTED)
                return kbuf;
@@ -3074,34 +3049,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
                }
                if (*len > kbuf->len)
                        *len = kbuf->len;
+               req->flags |= REQ_F_BUFFER_SELECTED;
+               req->kbuf = kbuf;
        } else {
                kbuf = ERR_PTR(-ENOBUFS);
        }
 
        io_ring_submit_unlock(req->ctx, needs_lock);
-
        return kbuf;
 }
 
 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
-                                       bool needs_lock)
+                                       unsigned int issue_flags)
 {
        struct io_buffer *kbuf;
        u16 bgid;
 
-       kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
        bgid = req->buf_index;
-       kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+       kbuf = io_buffer_select(req, len, bgid, issue_flags);
        if (IS_ERR(kbuf))
                return kbuf;
-       req->rw.addr = (u64) (unsigned long) kbuf;
-       req->flags |= REQ_F_BUFFER_SELECTED;
        return u64_to_user_ptr(kbuf->addr);
 }
 
 #ifdef CONFIG_COMPAT
 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
-                               bool needs_lock)
+                               unsigned int issue_flags)
 {
        struct compat_iovec __user *uiov;
        compat_ssize_t clen;
@@ -3117,7 +3090,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
                return -EINVAL;
 
        len = clen;
-       buf = io_rw_buffer_select(req, &len, needs_lock);
+       buf = io_rw_buffer_select(req, &len, issue_flags);
        if (IS_ERR(buf))
                return PTR_ERR(buf);
        iov[0].iov_base = buf;
@@ -3127,7 +3100,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 #endif
 
 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
-                                     bool needs_lock)
+                                     unsigned int issue_flags)
 {
        struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
        void __user *buf;
@@ -3139,7 +3112,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
        len = iov[0].iov_len;
        if (len < 0)
                return -EINVAL;
-       buf = io_rw_buffer_select(req, &len, needs_lock);
+       buf = io_rw_buffer_select(req, &len, issue_flags);
        if (IS_ERR(buf))
                return PTR_ERR(buf);
        iov[0].iov_base = buf;
@@ -3148,12 +3121,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 }
 
 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
-                                   bool needs_lock)
+                                   unsigned int issue_flags)
 {
        if (req->flags & REQ_F_BUFFER_SELECTED) {
-               struct io_buffer *kbuf;
+               struct io_buffer *kbuf = req->kbuf;
 
-               kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
                iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
                iov[0].iov_len = kbuf->len;
                return 0;
@@ -3163,52 +3135,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 
 #ifdef CONFIG_COMPAT
        if (req->ctx->compat)
-               return io_compat_import(req, iov, needs_lock);
+               return io_compat_import(req, iov, issue_flags);
 #endif
 
-       return __io_iov_buffer_select(req, iov, needs_lock);
+       return __io_iov_buffer_select(req, iov, issue_flags);
 }
 
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
-                          struct iov_iter *iter, bool needs_lock)
+static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
+                                      struct io_rw_state *s,
+                                      unsigned int issue_flags)
 {
-       void __user *buf = u64_to_user_ptr(req->rw.addr);
-       size_t sqe_len = req->rw.len;
+       struct iov_iter *iter = &s->iter;
        u8 opcode = req->opcode;
+       struct iovec *iovec;
+       void __user *buf;
+       size_t sqe_len;
        ssize_t ret;
 
-       if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-               *iovec = NULL;
-               return io_import_fixed(req, rw, iter);
-       }
+       BUILD_BUG_ON(ERR_PTR(0) != NULL);
+
+       if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
+               return ERR_PTR(io_import_fixed(req, rw, iter));
 
        /* buffer index only valid with fixed read/write, or buffer select  */
-       if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
-               return -EINVAL;
+       if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
+               return ERR_PTR(-EINVAL);
+
+       buf = u64_to_user_ptr(req->rw.addr);
+       sqe_len = req->rw.len;
 
        if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
                if (req->flags & REQ_F_BUFFER_SELECT) {
-                       buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+                       buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
                        if (IS_ERR(buf))
-                               return PTR_ERR(buf);
+                               return ERR_CAST(buf);
                        req->rw.len = sqe_len;
                }
 
-               ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
-               *iovec = NULL;
-               return ret;
+               ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
+               return ERR_PTR(ret);
        }
 
+       iovec = s->fast_iov;
        if (req->flags & REQ_F_BUFFER_SELECT) {
-               ret = io_iov_buffer_select(req, *iovec, needs_lock);
+               ret = io_iov_buffer_select(req, iovec, issue_flags);
                if (!ret)
-                       iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
-               *iovec = NULL;
-               return ret;
+                       iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
+               return ERR_PTR(ret);
        }
 
-       return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+       ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
                              req->ctx->compat);
+       if (unlikely(ret < 0))
+               return ERR_PTR(ret);
+       return iovec;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+                                 struct iovec **iovec, struct io_rw_state *s,
+                                 unsigned int issue_flags)
+{
+       *iovec = __io_import_iovec(rw, req, s, issue_flags);
+       if (unlikely(IS_ERR(*iovec)))
+               return PTR_ERR(*iovec);
+
+       iov_iter_save_state(&s->iter, &s->iter_state);
+       return 0;
 }
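
__io_import_iovec() now reports failure through the returned pointer itself via ERR_PTR()/ERR_CAST(), and the BUILD_BUG_ON records the assumption that ERR_PTR(0) is NULL. A self-contained sketch of that errno-in-pointer convention, with simplified stand-ins for the kernel helpers:

#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* encode a small negative errno value in the pointer itself */
static void *err_ptr(long err)
{
        return (void *)(intptr_t)err;
}

static long ptr_err(const void *p)
{
        return (long)(intptr_t)p;
}

/* the topmost page of the address space is reserved for error codes */
static int is_err(const void *p)
{
        return (uintptr_t)p >= (uintptr_t)-4095;
}

static void *lookup(int fail)
{
        static int object = 7;

        if (fail)
                return err_ptr(-EINVAL);        /* error travels in the pointer */
        return &object;
}

int main(void)
{
        void *ok = lookup(0);
        void *bad = lookup(1);

        assert(!is_err(ok) && *(int *)ok == 7);
        assert(is_err(bad) && ptr_err(bad) == -EINVAL);
        assert(err_ptr(0) == NULL);     /* the property the BUILD_BUG_ON checks */
        printf("bad=%ld\n", ptr_err(bad));
        return 0;
}
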
 
 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
@@ -3233,7 +3225,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
         */
        if (kiocb->ki_flags & IOCB_HIPRI)
                return -EOPNOTSUPP;
-       if (kiocb->ki_flags & IOCB_NOWAIT)
+       if ((kiocb->ki_flags & IOCB_NOWAIT) &&
+           !(kiocb->ki_filp->f_flags & O_NONBLOCK))
                return -EAGAIN;
 
        while (iov_iter_count(iter)) {
@@ -3279,7 +3272,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 {
        struct io_async_rw *rw = req->async_data;
 
-       memcpy(&rw->iter, iter, sizeof(*iter));
+       memcpy(&rw->s.iter, iter, sizeof(*iter));
        rw->free_iovec = iovec;
        rw->bytes_done = 0;
        /* can only be fixed buffers, no need to do anything */
@@ -3288,33 +3281,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
        if (!iovec) {
                unsigned iov_off = 0;
 
-               rw->iter.iov = rw->fast_iov;
+               rw->s.iter.iov = rw->s.fast_iov;
                if (iter->iov != fast_iov) {
                        iov_off = iter->iov - fast_iov;
-                       rw->iter.iov += iov_off;
+                       rw->s.iter.iov += iov_off;
                }
-               if (rw->fast_iov != fast_iov)
-                       memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
+               if (rw->s.fast_iov != fast_iov)
+                       memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
                               sizeof(struct iovec) * iter->nr_segs);
        } else {
                req->flags |= REQ_F_NEED_CLEANUP;
        }
 }
 
-static inline int io_alloc_async_data(struct io_kiocb *req)
+static inline bool io_alloc_async_data(struct io_kiocb *req)
 {
        WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
        req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
-       return req->async_data == NULL;
+       if (req->async_data) {
+               req->flags |= REQ_F_ASYNC_DATA;
+               return false;
+       }
+       return true;
 }
 
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
-                            const struct iovec *fast_iov,
-                            struct iov_iter *iter, bool force)
+                            struct io_rw_state *s, bool force)
 {
        if (!force && !io_op_defs[req->opcode].needs_async_setup)
                return 0;
-       if (!req->async_data) {
+       if (!req_has_async_data(req)) {
                struct io_async_rw *iorw;
 
                if (io_alloc_async_data(req)) {
@@ -3322,10 +3318,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                        return -ENOMEM;
                }
 
-               io_req_map_rw(req, iovec, fast_iov, iter);
+               io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
                iorw = req->async_data;
                /* we've copied and mapped the iter, ensure state is saved */
-               iov_iter_save_state(&iorw->iter, &iorw->iter_state);
+               iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
        }
        return 0;
 }
@@ -3333,10 +3329,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 {
        struct io_async_rw *iorw = req->async_data;
-       struct iovec *iov = iorw->fast_iov;
+       struct iovec *iov;
        int ret;
 
-       ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
+       /* submission path, ->uring_lock should already be taken */
+       ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
        if (unlikely(ret < 0))
                return ret;
 
@@ -3344,7 +3341,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
        iorw->free_iovec = iov;
        if (iov)
                req->flags |= REQ_F_NEED_CLEANUP;
-       iov_iter_save_state(&iorw->iter, &iorw->iter_state);
        return 0;
 }
 
@@ -3352,11 +3348,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        if (unlikely(!(req->file->f_mode & FMODE_READ)))
                return -EBADF;
-       return io_prep_rw(req, sqe, READ);
+       return io_prep_rw(req, sqe);
 }
 
 /*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through __folio_lock_async()
  * when we initially tried to do the IO with the iocb and armed our waitqueue.
  * This gets called when the page is unlocked, and we generally expect that to
  * happen when the page IO is completed and the page is now uptodate. This will
@@ -3428,7 +3424,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 
 static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 {
-       if (req->file->f_op->read_iter)
+       if (likely(req->file->f_op->read_iter))
                return call_read_iter(req->file, &req->rw.kiocb, iter);
        else if (req->file->f_op->read)
                return loop_rw_iter(READ, req, iter);
@@ -3444,43 +3440,40 @@ static bool need_read_all(struct io_kiocb *req)
 
 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
-       struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+       struct io_rw_state __s, *s = &__s;
+       struct iovec *iovec;
        struct kiocb *kiocb = &req->rw.kiocb;
-       struct iov_iter __iter, *iter = &__iter;
-       struct io_async_rw *rw = req->async_data;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-       struct iov_iter_state __state, *state;
+       struct io_async_rw *rw;
        ssize_t ret, ret2;
 
-       if (rw) {
-               iter = &rw->iter;
-               state = &rw->iter_state;
+       if (!req_has_async_data(req)) {
+               ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+               if (unlikely(ret < 0))
+                       return ret;
+       } else {
+               rw = req->async_data;
+               s = &rw->s;
                /*
                 * We come here from an earlier attempt, restore our state to
                 * match in case it doesn't. It's cheap enough that we don't
                 * need to make this conditional.
                 */
-               iov_iter_restore(iter, state);
+               iov_iter_restore(&s->iter, &s->iter_state);
                iovec = NULL;
-       } else {
-               ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
-               if (ret < 0)
-                       return ret;
-               state = &__state;
-               iov_iter_save_state(iter, state);
        }
-       req->result = iov_iter_count(iter);
+       req->result = iov_iter_count(&s->iter);
 
-       /* Ensure we clear previously set non-block flag */
-       if (!force_nonblock)
-               kiocb->ki_flags &= ~IOCB_NOWAIT;
-       else
+       if (force_nonblock) {
+               /* If the file doesn't support async, just async punt */
+               if (unlikely(!io_file_supports_nowait(req))) {
+                       ret = io_setup_async_rw(req, iovec, s, true);
+                       return ret ?: -EAGAIN;
+               }
                kiocb->ki_flags |= IOCB_NOWAIT;
-
-       /* If the file doesn't support async, just async punt */
-       if (force_nonblock && !io_file_supports_nowait(req, READ)) {
-               ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-               return ret ?: -EAGAIN;
+       } else {
+               /* Ensure we clear previously set non-block flag */
+               kiocb->ki_flags &= ~IOCB_NOWAIT;
        }
 
        ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
@@ -3489,7 +3482,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                return ret;
        }
 
-       ret = io_iter_do_read(req, iter);
+       ret = io_iter_do_read(req, &s->iter);
 
        if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                req->flags &= ~REQ_F_REISSUE;
@@ -3502,7 +3495,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                ret = 0;
        } else if (ret == -EIOCBQUEUED) {
                goto out_free;
-       } else if (ret <= 0 || ret == req->result || !force_nonblock ||
+       } else if (ret == req->result || ret <= 0 || !force_nonblock ||
                   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
                /* read all, failed, already did sync or don't want to retry */
                goto done;
@@ -3513,22 +3506,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
         * untouched in case of error. Restore it and we'll advance it
         * manually if we need to.
         */
-       iov_iter_restore(iter, state);
+       iov_iter_restore(&s->iter, &s->iter_state);
 
-       ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+       ret2 = io_setup_async_rw(req, iovec, s, true);
        if (ret2)
                return ret2;
 
        iovec = NULL;
        rw = req->async_data;
+       s = &rw->s;
        /*
         * Now use our persistent iterator and state, if we aren't already.
         * We've restored and mapped the iter to match.
         */
-       if (iter != &rw->iter) {
-               iter = &rw->iter;
-               state = &rw->iter_state;
-       }
 
        do {
                /*
@@ -3536,11 +3526,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                 * above or inside this loop. Advance the iter by the bytes
                 * that were consumed.
                 */
-               iov_iter_advance(iter, ret);
-               if (!iov_iter_count(iter))
+               iov_iter_advance(&s->iter, ret);
+               if (!iov_iter_count(&s->iter))
                        break;
                rw->bytes_done += ret;
-               iov_iter_save_state(iter, state);
+               iov_iter_save_state(&s->iter, &s->iter_state);
 
                /* if we can retry, do so with the callbacks armed */
                if (!io_rw_should_retry(req)) {
@@ -3554,12 +3544,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                 * desired page gets unlocked. We can also get a partial read
                 * here, and if we do, then just retry at the new offset.
                 */
-               ret = io_iter_do_read(req, iter);
+               ret = io_iter_do_read(req, &s->iter);
                if (ret == -EIOCBQUEUED)
                        return 0;
                /* we got some bytes, but not all. retry. */
                kiocb->ki_flags &= ~IOCB_WAITQ;
-               iov_iter_restore(iter, state);
+               iov_iter_restore(&s->iter, &s->iter_state);
        } while (ret > 0);
 done:
        kiocb_done(kiocb, ret, issue_flags);
@@ -3574,47 +3564,46 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
                return -EBADF;
-       return io_prep_rw(req, sqe, WRITE);
+       req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
+       return io_prep_rw(req, sqe);
 }
 
 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 {
-       struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+       struct io_rw_state __s, *s = &__s;
+       struct iovec *iovec;
        struct kiocb *kiocb = &req->rw.kiocb;
-       struct iov_iter __iter, *iter = &__iter;
-       struct io_async_rw *rw = req->async_data;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-       struct iov_iter_state __state, *state;
        ssize_t ret, ret2;
 
-       if (rw) {
-               iter = &rw->iter;
-               state = &rw->iter_state;
-               iov_iter_restore(iter, state);
+       if (!req_has_async_data(req)) {
+               ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+               if (unlikely(ret < 0))
+                       return ret;
+       } else {
+               struct io_async_rw *rw = req->async_data;
+
+               s = &rw->s;
+               iov_iter_restore(&s->iter, &s->iter_state);
                iovec = NULL;
-       } else {
-               ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
-               if (ret < 0)
-                       return ret;
-               state = &__state;
-               iov_iter_save_state(iter, state);
        }
-       req->result = iov_iter_count(iter);
+       req->result = iov_iter_count(&s->iter);
 
-       /* Ensure we clear previously set non-block flag */
-       if (!force_nonblock)
-               kiocb->ki_flags &= ~IOCB_NOWAIT;
-       else
-               kiocb->ki_flags |= IOCB_NOWAIT;
+       if (force_nonblock) {
+               /* If the file doesn't support async, just async punt */
+               if (unlikely(!io_file_supports_nowait(req)))
+                       goto copy_iov;
 
-       /* If the file doesn't support async, just async punt */
-       if (force_nonblock && !io_file_supports_nowait(req, WRITE))
-               goto copy_iov;
+               /* file path doesn't support NOWAIT for non-direct_IO */
+               if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+                   (req->flags & REQ_F_ISREG))
+                       goto copy_iov;
 
-       /* file path doesn't support NOWAIT for non-direct_IO */
-       if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
-           (req->flags & REQ_F_ISREG))
-               goto copy_iov;
+               kiocb->ki_flags |= IOCB_NOWAIT;
+       } else {
+               /* Ensure we clear previously set non-block flag */
+               kiocb->ki_flags &= ~IOCB_NOWAIT;
+       }
 
        ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
        if (unlikely(ret))
@@ -3634,10 +3623,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
        }
        kiocb->ki_flags |= IOCB_WRITE;
 
-       if (req->file->f_op->write_iter)
-               ret2 = call_write_iter(req->file, kiocb, iter);
+       if (likely(req->file->f_op->write_iter))
+               ret2 = call_write_iter(req->file, kiocb, &s->iter);
        else if (req->file->f_op->write)
-               ret2 = loop_rw_iter(WRITE, req, iter);
+               ret2 = loop_rw_iter(WRITE, req, &s->iter);
        else
                ret2 = -EINVAL;
 
@@ -3657,14 +3646,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
                goto done;
        if (!force_nonblock || ret2 != -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
-               if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+               if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
                        goto copy_iov;
 done:
                kiocb_done(kiocb, ret2, issue_flags);
        } else {
 copy_iov:
-               iov_iter_restore(iter, state);
-               ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+               iov_iter_restore(&s->iter, &s->iter_state);
+               ret = io_setup_async_rw(req, iovec, s, false);
                return ret ?: -EAGAIN;
        }
 out_free:
@@ -3800,7 +3789,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
        return 0;
 }
 
-static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_mkdir *mkd = &req->mkdir;
        int ret;
@@ -3849,7 +3838,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
        return 0;
 }
 
-static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_symlink *sl = &req->symlink;
        int ret;
@@ -3899,7 +3888,7 @@ static int io_linkat_prep(struct io_kiocb *req,
        return 0;
 }
 
-static int io_linkat(struct io_kiocb *req, int issue_flags)
+static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_hardlink *lnk = &req->hardlink;
        int ret;
@@ -4318,9 +4307,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer *head;
        int ret = 0;
-       bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
-       io_ring_submit_lock(ctx, !force_nonblock);
+       io_ring_submit_lock(ctx, needs_lock);
 
        lockdep_assert_held(&ctx->uring_lock);
 
@@ -4333,7 +4322,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 
        /* complete before unlock, IOPOLL may need the lock */
        __io_req_complete(req, issue_flags, ret, 0);
-       io_ring_submit_unlock(ctx, !force_nonblock);
+       io_ring_submit_unlock(ctx, needs_lock);
        return 0;
 }
 
@@ -4405,9 +4394,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer *head, *list;
        int ret = 0;
-       bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
-       io_ring_submit_lock(ctx, !force_nonblock);
+       io_ring_submit_lock(ctx, needs_lock);
 
        lockdep_assert_held(&ctx->uring_lock);
 
@@ -4423,7 +4412,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
                req_set_fail(req);
        /* complete before unlock, IOPOLL may need the lock */
        __io_req_complete(req, issue_flags, ret, 0);
-       io_ring_submit_unlock(ctx, !force_nonblock);
+       io_ring_submit_unlock(ctx, needs_lock);
        return 0;
 }
 
@@ -4756,8 +4745,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
        if (unlikely(!sock))
                return -ENOTSOCK;
 
-       kmsg = req->async_data;
-       if (!kmsg) {
+       if (req_has_async_data(req)) {
+               kmsg = req->async_data;
+       } else {
                ret = io_sendmsg_copy_hdr(req, &iomsg);
                if (ret)
                        return ret;
@@ -4916,23 +4906,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 }
 
 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
-                                              bool needs_lock)
+                                              unsigned int issue_flags)
 {
        struct io_sr_msg *sr = &req->sr_msg;
-       struct io_buffer *kbuf;
-
-       kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
-       if (IS_ERR(kbuf))
-               return kbuf;
 
-       sr->kbuf = kbuf;
-       req->flags |= REQ_F_BUFFER_SELECTED;
-       return kbuf;
+       return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
 }
 
 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
 {
-       return io_put_kbuf(req, req->sr_msg.kbuf);
+       return io_put_kbuf(req, req->kbuf);
 }
 
 static int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -4980,8 +4963,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
        if (unlikely(!sock))
                return -ENOTSOCK;
 
-       kmsg = req->async_data;
-       if (!kmsg) {
+       if (req_has_async_data(req)) {
+               kmsg = req->async_data;
+       } else {
                ret = io_recvmsg_copy_hdr(req, &iomsg);
                if (ret)
                        return ret;
@@ -4989,7 +4973,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
        }
 
        if (req->flags & REQ_F_BUFFER_SELECT) {
-               kbuf = io_recv_buffer_select(req, !force_nonblock);
+               kbuf = io_recv_buffer_select(req, issue_flags);
                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -5041,7 +5025,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
                return -ENOTSOCK;
 
        if (req->flags & REQ_F_BUFFER_SELECT) {
-               kbuf = io_recv_buffer_select(req, !force_nonblock);
+               kbuf = io_recv_buffer_select(req, issue_flags);
                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                buf = u64_to_user_ptr(kbuf->addr);
@@ -5172,7 +5156,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
        int ret;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
-       if (req->async_data) {
+       if (req_has_async_data(req)) {
                io = req->async_data;
        } else {
                ret = move_addr_to_kernel(req->connect.addr,
@@ -5188,7 +5172,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
        ret = __sys_connect_file(req->file, &io->address,
                                        req->connect.addr_len, file_flags);
        if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
-               if (req->async_data)
+               if (req_has_async_data(req))
                        return -EAGAIN;
                if (io_alloc_async_data(req)) {
                        ret = -ENOMEM;
@@ -5348,16 +5332,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
        return !(flags & IORING_CQE_F_MORE);
 }
 
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
-       __must_hold(&req->ctx->completion_lock)
-{
-       bool done;
-
-       done = __io_poll_complete(req, mask);
-       io_commit_cqring(req->ctx);
-       return done;
-}
-
 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
        struct io_ring_ctx *ctx = req->ctx;
@@ -5479,7 +5453,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
                req_ref_get(req);
                poll->wait.private = req;
+
                *poll_ptr = poll;
+               if (req->opcode == IORING_OP_POLL_ADD)
+                       req->flags |= REQ_F_ASYNC_DATA;
        }
 
        pt->nr_entries++;
@@ -5603,17 +5580,13 @@ static int io_arm_poll_handler(struct io_kiocb *req)
        struct async_poll *apoll;
        struct io_poll_table ipt;
        __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
-       int rw;
 
-       if (!req->file || !file_can_poll(req->file))
-               return IO_APOLL_ABORTED;
-       if (req->flags & REQ_F_POLLED)
-               return IO_APOLL_ABORTED;
        if (!def->pollin && !def->pollout)
                return IO_APOLL_ABORTED;
+       if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+               return IO_APOLL_ABORTED;
 
        if (def->pollin) {
-               rw = READ;
                mask |= POLLIN | POLLRDNORM;
 
                /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
@@ -5621,14 +5594,9 @@ static int io_arm_poll_handler(struct io_kiocb *req)
                    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
                        mask &= ~POLLIN;
        } else {
-               rw = WRITE;
                mask |= POLLOUT | POLLWRNORM;
        }
 
-       /* if we can't nonblock try, then no point in arming a poll handler */
-       if (!io_file_supports_nowait(req, rw))
-               return IO_APOLL_ABORTED;
-
        apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
        if (unlikely(!apoll))
                return IO_APOLL_ABORTED;
@@ -5689,8 +5657,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 /*
  * Returns true if we found and killed one or more poll requests
  */
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
-                              bool cancel_all)
+static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
+                                     struct task_struct *tsk, bool cancel_all)
 {
        struct hlist_node *tmp;
        struct io_kiocb *req;
@@ -5844,7 +5812,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
        if (mask) { /* no async, we'd stolen it */
                ipt.error = 0;
-               done = io_poll_complete(req, mask);
+               done = __io_poll_complete(req, mask);
+               io_commit_cqring(req->ctx);
        }
        spin_unlock(&ctx->completion_lock);
 
@@ -5920,7 +5889,10 @@ err:
 
 static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
 {
-       req_set_fail(req);
+       struct io_timeout_data *data = req->async_data;
+
+       if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
+               req_set_fail(req);
        io_req_complete_post(req, -ETIME, 0);
 }
 
@@ -6126,7 +6098,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        if (off && is_timeout_link)
                return -EINVAL;
        flags = READ_ONCE(sqe->timeout_flags);
-       if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+       if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
+                     IORING_TIMEOUT_ETIME_SUCCESS))
                return -EINVAL;
        /* more than one clock specified is invalid, obviously */
        if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -6137,7 +6110,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        if (unlikely(off && !req->ctx->off_timeout_used))
                req->ctx->off_timeout_used = true;
 
-       if (!req->async_data && io_alloc_async_data(req))
+       if (WARN_ON_ONCE(req_has_async_data(req)))
+               return -EFAULT;
+       if (io_alloc_async_data(req))
                return -ENOMEM;
 
        data = req->async_data;
@@ -6294,6 +6269,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_ring_ctx *ctx = req->ctx;
        u64 sqe_addr = req->cancel.addr;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
        struct io_tctx_node *node;
        int ret;
 
@@ -6302,7 +6278,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
                goto done;
 
        /* slow path, try all io-wq's */
-       io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_lock(ctx, needs_lock);
        ret = -ENOENT;
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *tctx = node->task->io_uring;
@@ -6311,7 +6287,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
                if (ret != -ENOENT)
                        break;
        }
-       io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_unlock(ctx, needs_lock);
 done:
        if (ret < 0)
                req_set_fail(req);
@@ -6338,6 +6314,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 {
        struct io_ring_ctx *ctx = req->ctx;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
        struct io_uring_rsrc_update2 up;
        int ret;
 
@@ -6347,10 +6324,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
        up.tags = 0;
        up.resv = 0;
 
-       io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_lock(ctx, needs_lock);
        ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
                                        &up, req->rsrc_update.nr_args);
-       io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_unlock(ctx, needs_lock);
 
        if (ret < 0)
                req_set_fail(req);
@@ -6446,7 +6423,7 @@ static int io_req_prep_async(struct io_kiocb *req)
 {
        if (!io_op_defs[req->opcode].needs_async_setup)
                return 0;
-       if (WARN_ON_ONCE(req->async_data))
+       if (WARN_ON_ONCE(req_has_async_data(req)))
                return -EFAULT;
        if (io_alloc_async_data(req))
                return -EAGAIN;
@@ -6478,68 +6455,39 @@ static u32 io_get_sequence(struct io_kiocb *req)
        return seq;
 }
 
-static bool io_drain_req(struct io_kiocb *req)
+static __cold void io_drain_req(struct io_kiocb *req)
 {
-       struct io_kiocb *pos;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_defer_entry *de;
        int ret;
-       u32 seq;
-
-       if (req->flags & REQ_F_FAIL) {
-               io_req_complete_fail_submit(req);
-               return true;
-       }
-
-       /*
-        * If we need to drain a request in the middle of a link, drain the
-        * head request and the next request/link after the current link.
-        * Considering sequential execution of links, IOSQE_IO_DRAIN will be
-        * maintained for every request of our link.
-        */
-       if (ctx->drain_next) {
-               req->flags |= REQ_F_IO_DRAIN;
-               ctx->drain_next = false;
-       }
-       /* not interested in head, start from the first linked */
-       io_for_each_link(pos, req->link) {
-               if (pos->flags & REQ_F_IO_DRAIN) {
-                       ctx->drain_next = true;
-                       req->flags |= REQ_F_IO_DRAIN;
-                       break;
-               }
-       }
+       u32 seq = io_get_sequence(req);
 
        /* Still need defer if there is pending req in defer list. */
-       if (likely(list_empty_careful(&ctx->defer_list) &&
-               !(req->flags & REQ_F_IO_DRAIN))) {
+       if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
+queue:
                ctx->drain_active = false;
-               return false;
+               io_req_task_queue(req);
+               return;
        }
 
-       seq = io_get_sequence(req);
-       /* Still a chance to pass the sequence check */
-       if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
-               return false;
-
        ret = io_req_prep_async(req);
-       if (ret)
-               goto fail;
+       if (ret) {
+fail:
+               io_req_complete_failed(req, ret);
+               return;
+       }
        io_prep_async_link(req);
        de = kmalloc(sizeof(*de), GFP_KERNEL);
        if (!de) {
                ret = -ENOMEM;
-fail:
-               io_req_complete_failed(req, ret);
-               return true;
+               goto fail;
        }
 
        spin_lock(&ctx->completion_lock);
        if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
                spin_unlock(&ctx->completion_lock);
                kfree(de);
-               io_queue_async_work(req, NULL);
-               return true;
+               goto queue;
        }
 
        trace_io_uring_defer(ctx, req, req->user_data);
@@ -6547,23 +6495,13 @@ fail:
        de->seq = seq;
        list_add_tail(&de->list, &ctx->defer_list);
        spin_unlock(&ctx->completion_lock);
-       return true;
 }
 
 static void io_clean_op(struct io_kiocb *req)
 {
        if (req->flags & REQ_F_BUFFER_SELECTED) {
-               switch (req->opcode) {
-               case IORING_OP_READV:
-               case IORING_OP_READ_FIXED:
-               case IORING_OP_READ:
-                       kfree((void *)(unsigned long)req->rw.addr);
-                       break;
-               case IORING_OP_RECVMSG:
-               case IORING_OP_RECV:
-                       kfree(req->sr_msg.kbuf);
-                       break;
-               }
+               kfree(req->kbuf);
+               req->kbuf = NULL;
        }
 
        if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6628,17 +6566,19 @@ static void io_clean_op(struct io_kiocb *req)
        }
        if (req->flags & REQ_F_CREDS)
                put_cred(req->creds);
-
+       if (req->flags & REQ_F_ASYNC_DATA) {
+               kfree(req->async_data);
+               req->async_data = NULL;
+       }
        req->flags &= ~IO_REQ_CLEAN_FLAGS;
 }
 
 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 {
-       struct io_ring_ctx *ctx = req->ctx;
        const struct cred *creds = NULL;
        int ret;
 
-       if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+       if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
                creds = override_creds(req->creds);
 
        switch (req->opcode) {
@@ -6761,8 +6701,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
        if (ret)
                return ret;
        /* If the op doesn't have a file, we're not polling for it */
-       if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
-               io_iopoll_req_issued(req);
+       if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+               io_iopoll_req_issued(req, issue_flags);
 
        return 0;
 }
@@ -6778,6 +6718,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 static void io_wq_submit_work(struct io_wq_work *work)
 {
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+       unsigned int issue_flags = IO_URING_F_UNLOCKED;
+       bool needs_poll = false;
        struct io_kiocb *timeout;
        int ret = 0;
 
@@ -6792,23 +6734,42 @@ static void io_wq_submit_work(struct io_wq_work *work)
                io_queue_linked_timeout(timeout);
 
        /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
-       if (work->flags & IO_WQ_WORK_CANCEL)
-               ret = -ECANCELED;
+       if (work->flags & IO_WQ_WORK_CANCEL) {
+               io_req_task_queue_fail(req, -ECANCELED);
+               return;
+       }
 
-       if (!ret) {
-               do {
-                       ret = io_issue_sqe(req, 0);
-                       /*
-                        * We can get EAGAIN for polled IO even though we're
-                        * forcing a sync submission from here, since we can't
-                        * wait for request slots on the block side.
-                        */
-                       if (ret != -EAGAIN)
-                               break;
-                       cond_resched();
-               } while (1);
+       if (req->flags & REQ_F_FORCE_ASYNC) {
+               const struct io_op_def *def = &io_op_defs[req->opcode];
+               bool opcode_poll = def->pollin || def->pollout;
+
+               if (opcode_poll && file_can_poll(req->file)) {
+                       needs_poll = true;
+                       issue_flags |= IO_URING_F_NONBLOCK;
+               }
        }
 
+       do {
+               ret = io_issue_sqe(req, issue_flags);
+               if (ret != -EAGAIN)
+                       break;
+               /*
+                * We can get EAGAIN for iopolled IO even though we're
+                * forcing a sync submission from here, since we can't
+                * wait for request slots on the block side.
+                */
+               if (!needs_poll) {
+                       cond_resched();
+                       continue;
+               }
+
+               if (io_arm_poll_handler(req) == IO_APOLL_OK)
+                       return;
+               /* aborted or ready, in either case retry blocking */
+               needs_poll = false;
+               issue_flags &= ~IO_URING_F_NONBLOCK;
+       } while (1);
+
        /* avoid locking problems by failing it from a clean context */
        if (ret)
                io_req_task_queue_fail(req, ret);
@@ -6832,12 +6793,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
 {
        unsigned long file_ptr = (unsigned long) file;
 
-       if (__io_file_supports_nowait(file, READ))
-               file_ptr |= FFS_ASYNC_READ;
-       if (__io_file_supports_nowait(file, WRITE))
-               file_ptr |= FFS_ASYNC_WRITE;
-       if (S_ISREG(file_inode(file)->i_mode))
-               file_ptr |= FFS_ISREG;
+       file_ptr |= io_file_get_flags(file);
        file_slot->file_ptr = file_ptr;
 }
 
@@ -6854,8 +6810,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
        file = (struct file *) (file_ptr & FFS_MASK);
        file_ptr &= ~FFS_MASK;
        /* mask in overlapping REQ_F and FFS bits */
-       req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
-       io_req_set_rsrc_node(req);
+       req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+       io_req_set_rsrc_node(req, ctx);
        return file;
 }
 
@@ -6947,67 +6903,66 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
        io_put_req(req);
 }
 
-static void __io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+       __must_hold(&req->ctx->uring_lock)
+{
+       struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+
+       switch (io_arm_poll_handler(req)) {
+       case IO_APOLL_READY:
+               if (linked_timeout) {
+                       io_queue_linked_timeout(linked_timeout);
+                       linked_timeout = NULL;
+               }
+               io_req_task_queue(req);
+               break;
+       case IO_APOLL_ABORTED:
+               /*
+                * Queued up for async execution, worker will release
+                * submit reference when the iocb is actually submitted.
+                */
+               io_queue_async_work(req, NULL);
+               break;
+       }
+
+       if (linked_timeout)
+               io_queue_linked_timeout(linked_timeout);
+}
+
+static inline void __io_queue_sqe(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
 {
        struct io_kiocb *linked_timeout;
        int ret;
 
-issue_sqe:
        ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 
+       if (req->flags & REQ_F_COMPLETE_INLINE) {
+               io_req_add_compl_list(req);
+               return;
+       }
        /*
         * We async punt it if the file wasn't marked NOWAIT, or if the file
         * doesn't support non-blocking read/write attempts
         */
        if (likely(!ret)) {
-               if (req->flags & REQ_F_COMPLETE_INLINE) {
-                       struct io_ring_ctx *ctx = req->ctx;
-                       struct io_submit_state *state = &ctx->submit_state;
-
-                       state->compl_reqs[state->compl_nr++] = req;
-                       if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
-                               io_submit_flush_completions(ctx);
-                       return;
-               }
-
                linked_timeout = io_prep_linked_timeout(req);
                if (linked_timeout)
                        io_queue_linked_timeout(linked_timeout);
        } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
-               linked_timeout = io_prep_linked_timeout(req);
-
-               switch (io_arm_poll_handler(req)) {
-               case IO_APOLL_READY:
-                       if (linked_timeout)
-                               io_queue_linked_timeout(linked_timeout);
-                       goto issue_sqe;
-               case IO_APOLL_ABORTED:
-                       /*
-                        * Queued up for async execution, worker will release
-                        * submit reference when the iocb is actually submitted.
-                        */
-                       io_queue_async_work(req, NULL);
-                       break;
-               }
-
-               if (linked_timeout)
-                       io_queue_linked_timeout(linked_timeout);
+               io_queue_sqe_arm_apoll(req);
        } else {
                io_req_complete_failed(req, ret);
        }
 }
 
-static inline void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_fallback(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
 {
-       if (unlikely(req->ctx->drain_active) && io_drain_req(req))
-               return;
-
-       if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
-               __io_queue_sqe(req);
-       } else if (req->flags & REQ_F_FAIL) {
+       if (req->flags & REQ_F_FAIL) {
                io_req_complete_fail_submit(req);
+       } else if (unlikely(req->ctx->drain_active)) {
+               io_drain_req(req);
        } else {
                int ret = io_req_prep_async(req);
 
@@ -7018,6 +6973,15 @@ static inline void io_queue_sqe(struct io_kiocb *req)
        }
 }
 
+static inline void io_queue_sqe(struct io_kiocb *req)
+       __must_hold(&req->ctx->uring_lock)
+{
+       if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
+               __io_queue_sqe(req);
+       else
+               io_queue_sqe_fallback(req);
+}
+
 /*
  * Check SQE restrictions (opcode and flags).
  *
@@ -7027,9 +6991,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
                                        struct io_kiocb *req,
                                        unsigned int sqe_flags)
 {
-       if (likely(!ctx->restricted))
-               return true;
-
        if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
                return false;
 
@@ -7044,16 +7005,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
        return true;
 }
 
+static void io_init_req_drain(struct io_kiocb *req)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+       struct io_kiocb *head = ctx->submit_state.link.head;
+
+       ctx->drain_active = true;
+       if (head) {
+               /*
+                * If we need to drain a request in the middle of a link, drain
+                * the head request and the next request/link after the current
+                * link. Considering sequential execution of links,
+                * IOSQE_IO_DRAIN will be maintained for every request of our
+                * link.
+                */
+               head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+               ctx->drain_next = true;
+       }
+}
+
 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
                       const struct io_uring_sqe *sqe)
        __must_hold(&ctx->uring_lock)
 {
-       struct io_submit_state *state;
        unsigned int sqe_flags;
-       int personality, ret = 0;
+       int personality;
+       u8 opcode;
 
        /* req is partially pre-initialised, see io_preinit_req() */
-       req->opcode = READ_ONCE(sqe->opcode);
+       req->opcode = opcode = READ_ONCE(sqe->opcode);
        /* same numerical values with corresponding REQ_F_*, safe to copy */
        req->flags = sqe_flags = READ_ONCE(sqe->flags);
        req->user_data = READ_ONCE(sqe->user_data);
@@ -7061,19 +7041,52 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        req->fixed_rsrc_refs = NULL;
        req->task = current;
 
-       /* enforce forwards compatibility on users */
-       if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+       if (unlikely(opcode >= IORING_OP_LAST)) {
+               req->opcode = 0;
                return -EINVAL;
-       if (unlikely(req->opcode >= IORING_OP_LAST))
-               return -EINVAL;
-       if (!io_check_restriction(ctx, req, sqe_flags))
-               return -EACCES;
+       }
+       if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
+               /* enforce forwards compatibility on users */
+               if (sqe_flags & ~SQE_VALID_FLAGS)
+                       return -EINVAL;
+               if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+                   !io_op_defs[opcode].buffer_select)
+                       return -EOPNOTSUPP;
+               if (sqe_flags & IOSQE_IO_DRAIN)
+                       io_init_req_drain(req);
+       }
+       if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+               if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
+                       return -EACCES;
+               /* knock it to the slow queue path, will be drained there */
+               if (ctx->drain_active)
+                       req->flags |= REQ_F_FORCE_ASYNC;
+               /* if there is no link, we're at "next" request and need to drain */
+               if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
+                       ctx->drain_next = false;
+                       ctx->drain_active = true;
+                       req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+               }
+       }
 
-       if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
-           !io_op_defs[req->opcode].buffer_select)
-               return -EOPNOTSUPP;
-       if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
-               ctx->drain_active = true;
+       if (io_op_defs[opcode].needs_file) {
+               struct io_submit_state *state = &ctx->submit_state;
+
+               /*
+                * Plug now if we have more than 2 IO left after this, and the
+                * target is potentially a read/write to block based storage.
+                */
+               if (state->need_plug && io_op_defs[opcode].plug) {
+                       state->plug_started = true;
+                       state->need_plug = false;
+                       blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+               }
+
+               req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
+                                       (sqe_flags & IOSQE_FIXED_FILE));
+               if (unlikely(!req->file))
+                       return -EBADF;
+       }
 
        personality = READ_ONCE(sqe->personality);
        if (personality) {
@@ -7083,27 +7096,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
                get_cred(req->creds);
                req->flags |= REQ_F_CREDS;
        }
-       state = &ctx->submit_state;
-
-       /*
-        * Plug now if we have more than 1 IO left after this, and the target
-        * is potentially a read/write to block based storage.
-        */
-       if (!state->plug_started && state->ios_left > 1 &&
-           io_op_defs[req->opcode].plug) {
-               blk_start_plug(&state->plug);
-               state->plug_started = true;
-       }
-
-       if (io_op_defs[req->opcode].needs_file) {
-               req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
-                                       (sqe_flags & IOSQE_FIXED_FILE));
-               if (unlikely(!req->file))
-                       ret = -EBADF;
-       }
 
-       state->ios_left--;
-       return ret;
+       return io_req_prep(req, sqe);
 }
 
 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -7115,7 +7109,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
        ret = io_init_req(ctx, req, sqe);
        if (unlikely(ret)) {
-fail_req:
+               trace_io_uring_req_failed(sqe, ret);
+
                /* fail even hard links since we don't submit */
                if (link->head) {
                        /*
@@ -7138,10 +7133,6 @@ fail_req:
                        return ret;
                }
                req_fail_link_node(req, ret);
-       } else {
-               ret = io_req_prep(req, sqe);
-               if (unlikely(ret))
-                       goto fail_req;
        }
 
        /* don't need @sqe from now on */
@@ -7171,33 +7162,32 @@ fail_req:
                link->last->link = req;
                link->last = req;
 
+               if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+                       return 0;
                /* last request of a link, enqueue the link */
-               if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
-                       link->head = NULL;
-                       io_queue_sqe(head);
-               }
-       } else {
-               if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
-                       link->head = req;
-                       link->last = req;
-               } else {
-                       io_queue_sqe(req);
-               }
+               link->head = NULL;
+               req = head;
+       } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+               link->head = req;
+               link->last = req;
+               return 0;
        }
 
+       io_queue_sqe(req);
        return 0;
 }
 
 /*
  * Batched submission is done, ensure local IO is flushed out.
  */
-static void io_submit_state_end(struct io_submit_state *state,
-                               struct io_ring_ctx *ctx)
+static void io_submit_state_end(struct io_ring_ctx *ctx)
 {
+       struct io_submit_state *state = &ctx->submit_state;
+
        if (state->link.head)
                io_queue_sqe(state->link.head);
-       if (state->compl_nr)
-               io_submit_flush_completions(ctx);
+       /* flush only after queuing links as they can generate completions */
+       io_submit_flush_completions(ctx);
        if (state->plug_started)
                blk_finish_plug(&state->plug);
 }
@@ -7209,7 +7199,8 @@ static void io_submit_state_start(struct io_submit_state *state,
                                  unsigned int max_ios)
 {
        state->plug_started = false;
-       state->ios_left = max_ios;
+       state->need_plug = max_ios > 2;
+       state->submit_nr = max_ios;
        /* set only head, no need to init link_last in advance */
        state->link.head = NULL;
 }
@@ -7261,45 +7252,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
        __must_hold(&ctx->uring_lock)
 {
+       unsigned int entries = io_sqring_entries(ctx);
        int submitted = 0;
 
+       if (unlikely(!entries))
+               return 0;
        /* make sure SQ entry isn't read before tail */
-       nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
-       if (!percpu_ref_tryget_many(&ctx->refs, nr))
-               return -EAGAIN;
+       nr = min3(nr, ctx->sq_entries, entries);
        io_get_task_refs(nr);
 
        io_submit_state_start(&ctx->submit_state, nr);
-       while (submitted < nr) {
+       do {
                const struct io_uring_sqe *sqe;
                struct io_kiocb *req;
 
-               req = io_alloc_req(ctx);
-               if (unlikely(!req)) {
+               if (unlikely(!io_alloc_req_refill(ctx))) {
                        if (!submitted)
                                submitted = -EAGAIN;
                        break;
                }
+               req = io_alloc_req(ctx);
                sqe = io_get_sqe(ctx);
                if (unlikely(!sqe)) {
-                       list_add(&req->inflight_entry, &ctx->submit_state.free_list);
+                       wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
                        break;
                }
                /* will complete beyond this point, count as submitted */
                submitted++;
                if (io_submit_sqe(ctx, req, sqe))
                        break;
-       }
+       } while (submitted < nr);
 
        if (unlikely(submitted != nr)) {
                int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
                int unused = nr - ref_used;
 
                current->io_uring->cached_refs += unused;
-               percpu_ref_put_many(&ctx->refs, unused);
        }
 
-       io_submit_state_end(&ctx->submit_state, ctx);
+       io_submit_state_end(ctx);
         /* Commit SQ ring head once we've consumed and submitted all SQEs */
        io_commit_sqring(ctx);
 
@@ -7338,16 +7329,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
        if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
                to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
 
-       if (!list_empty(&ctx->iopoll_list) || to_submit) {
-               unsigned nr_events = 0;
+       if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
                const struct cred *creds = NULL;
 
                if (ctx->sq_creds != current_cred())
                        creds = override_creds(ctx->sq_creds);
 
                mutex_lock(&ctx->uring_lock);
-               if (!list_empty(&ctx->iopoll_list))
-                       io_do_iopoll(ctx, &nr_events, 0);
+               if (!wq_list_empty(&ctx->iopoll_list))
+                       io_do_iopoll(ctx, true);
 
                /*
                 * Don't submit if refs are dying, good for io_uring_register(),
@@ -7367,7 +7357,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
        return ret;
 }
 
-static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 {
        struct io_ring_ctx *ctx;
        unsigned sq_thread_idle = 0;
@@ -7424,7 +7414,7 @@ static int io_sq_thread(void *data)
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        int ret = __io_sq_thread(ctx, cap_entries);
 
-                       if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+                       if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
                }
                if (io_run_task_work())
@@ -7445,7 +7435,7 @@ static int io_sq_thread(void *data)
                                io_ring_set_wakeup_flag(ctx);
 
                                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
-                                   !list_empty_careful(&ctx->iopoll_list)) {
+                                   !wq_list_empty(&ctx->iopoll_list)) {
                                        needs_sched = false;
                                        break;
                                }
@@ -7621,7 +7611,7 @@ static void io_free_page_table(void **table, size_t size)
        kfree(table);
 }
 
-static void **io_alloc_page_table(size_t size)
+static __cold void **io_alloc_page_table(size_t size)
 {
        unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
        size_t init_size = size;
@@ -7650,7 +7640,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
        kfree(ref_node);
 }
 
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 {
        struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
        struct io_ring_ctx *ctx = node->rsrc_data->ctx;
@@ -7696,10 +7686,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 
 static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
                                struct io_rsrc_data *data_to_kill)
+       __must_hold(&ctx->uring_lock)
 {
        WARN_ON_ONCE(!ctx->rsrc_backup_node);
        WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
 
+       io_rsrc_refs_drop(ctx);
+
        if (data_to_kill) {
                struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
 
@@ -7727,7 +7720,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
        return ctx->rsrc_backup_node ? 0 : -ENOMEM;
 }
 
-static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
+static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+                                     struct io_ring_ctx *ctx)
 {
        int ret;
 
@@ -7783,9 +7777,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
        kfree(data);
 }
 
-static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
-                             u64 __user *utags, unsigned nr,
-                             struct io_rsrc_data **pdata)
+static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+                                    u64 __user *utags, unsigned nr,
+                                    struct io_rsrc_data **pdata)
 {
        struct io_rsrc_data *data;
        int ret = -ENOMEM;
@@ -8353,12 +8347,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
                                 unsigned int issue_flags, u32 slot_index)
 {
        struct io_ring_ctx *ctx = req->ctx;
-       bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
        bool needs_switch = false;
        struct io_fixed_file *file_slot;
        int ret = -EBADF;
 
-       io_ring_submit_lock(ctx, !force_nonblock);
+       io_ring_submit_lock(ctx, needs_lock);
        if (file->f_op == &io_uring_fops)
                goto err;
        ret = -ENXIO;
@@ -8399,7 +8393,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 err:
        if (needs_switch)
                io_rsrc_node_switch(ctx, ctx->file_data);
-       io_ring_submit_unlock(ctx, !force_nonblock);
+       io_ring_submit_unlock(ctx, needs_lock);
        if (ret)
                fput(file);
        return ret;
@@ -8409,11 +8403,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
 {
        unsigned int offset = req->close.file_slot - 1;
        struct io_ring_ctx *ctx = req->ctx;
+       bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
        struct io_fixed_file *file_slot;
        struct file *file;
        int ret, i;
 
-       io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_lock(ctx, needs_lock);
        ret = -ENXIO;
        if (unlikely(!ctx->file_data))
                goto out;
@@ -8439,7 +8434,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
        io_rsrc_node_switch(ctx, ctx->file_data);
        ret = 0;
 out:
-       io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+       io_ring_submit_unlock(ctx, needs_lock);
        return ret;
 }
 
@@ -8555,8 +8550,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
        return io_wq_create(concurrency, &data);
 }
 
-static int io_uring_alloc_task_context(struct task_struct *task,
-                                      struct io_ring_ctx *ctx)
+static __cold int io_uring_alloc_task_context(struct task_struct *task,
+                                             struct io_ring_ctx *ctx)
 {
        struct io_uring_task *tctx;
        int ret;
@@ -8603,8 +8598,8 @@ void __io_uring_free(struct task_struct *tsk)
        tsk->io_uring = NULL;
 }
 
-static int io_sq_offload_create(struct io_ring_ctx *ctx,
-                               struct io_uring_params *p)
+static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+                                      struct io_uring_params *p)
 {
        int ret;
 
@@ -9215,29 +9210,25 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
        }
 }
 
-static void io_req_cache_free(struct list_head *list)
-{
-       struct io_kiocb *req, *nxt;
-
-       list_for_each_entry_safe(req, nxt, list, inflight_entry) {
-               list_del(&req->inflight_entry);
-               kmem_cache_free(req_cachep, req);
-       }
-}
-
 static void io_req_caches_free(struct io_ring_ctx *ctx)
 {
        struct io_submit_state *state = &ctx->submit_state;
+       int nr = 0;
 
        mutex_lock(&ctx->uring_lock);
+       io_flush_cached_locked_reqs(ctx, state);
 
-       if (state->free_reqs) {
-               kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
-               state->free_reqs = 0;
-       }
+       while (state->free_list.next) {
+               struct io_wq_work_node *node;
+               struct io_kiocb *req;
 
-       io_flush_cached_locked_reqs(ctx, state);
-       io_req_cache_free(&state->free_list);
+               node = wq_stack_extract(&state->free_list);
+               req = container_of(node, struct io_kiocb, comp_list);
+               kmem_cache_free(req_cachep, req);
+               nr++;
+       }
+       if (nr)
+               percpu_ref_put_many(&ctx->refs, nr);
        mutex_unlock(&ctx->uring_lock);
 }
 
@@ -9247,7 +9238,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
                wait_for_completion(&data->done);
 }
 
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
        io_sq_thread_finish(ctx);
 
@@ -9256,6 +9247,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
                ctx->mm_account = NULL;
        }
 
+       io_rsrc_refs_drop(ctx);
        /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
        io_wait_rsrc_data(ctx->buf_data);
        io_wait_rsrc_data(ctx->file_data);
@@ -9279,6 +9271,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
        if (ctx->rsrc_backup_node)
                io_rsrc_node_destroy(ctx->rsrc_backup_node);
        flush_delayed_work(&ctx->rsrc_put_work);
+       flush_delayed_work(&ctx->fallback_work);
 
        WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
        WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
@@ -9309,7 +9302,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
        struct io_ring_ctx *ctx = file->private_data;
        __poll_t mask = 0;
 
-       poll_wait(file, &ctx->poll_wait, wait);
+       poll_wait(file, &ctx->cq_wait, wait);
        /*
         * synchronizes with barrier from wq_has_sleeper call in
         * io_commit_cqring
@@ -9356,7 +9349,7 @@ struct io_tctx_exit {
        struct io_ring_ctx              *ctx;
 };
 
-static void io_tctx_exit_cb(struct callback_head *cb)
+static __cold void io_tctx_exit_cb(struct callback_head *cb)
 {
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_exit *work;
@@ -9371,14 +9364,14 @@ static void io_tctx_exit_cb(struct callback_head *cb)
        complete(&work->completion);
 }
 
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
 {
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 
        return req->ctx == data;
 }
 
-static void io_ring_exit_work(struct work_struct *work)
+static __cold void io_ring_exit_work(struct work_struct *work)
 {
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
        unsigned long timeout = jiffies + HZ * 60 * 5;
@@ -9407,6 +9400,8 @@ static void io_ring_exit_work(struct work_struct *work)
                        io_sq_thread_unpark(sqd);
                }
 
+               io_req_caches_free(ctx);
+
                if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
                        /* there is little hope left, don't run it too often */
                        interval = HZ * 60;
@@ -9433,7 +9428,6 @@ static void io_ring_exit_work(struct work_struct *work)
                ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                if (WARN_ON_ONCE(ret))
                        continue;
-               wake_up_process(node->task);
 
                mutex_unlock(&ctx->uring_lock);
                wait_for_completion(&exit.completion);
@@ -9447,8 +9441,8 @@ static void io_ring_exit_work(struct work_struct *work)
 }
 
 /* Returns true if we found and killed one or more timeouts */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
-                            bool cancel_all)
+static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
+                                   struct task_struct *tsk, bool cancel_all)
 {
        struct io_kiocb *req, *tmp;
        int canceled = 0;
@@ -9470,7 +9464,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
        return canceled != 0;
 }
 
-static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
        unsigned long index;
        struct creds *creds;
@@ -9532,8 +9526,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
        return ret;
 }
 
-static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
-                                 struct task_struct *task, bool cancel_all)
+static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
+                                        struct task_struct *task,
+                                        bool cancel_all)
 {
        struct io_defer_entry *de;
        LIST_HEAD(list);
@@ -9558,7 +9553,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
        return true;
 }
 
-static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 {
        struct io_tctx_node *node;
        enum io_wq_cancel cret;
@@ -9582,9 +9577,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
        return ret;
 }
 
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
-                                        struct task_struct *task,
-                                        bool cancel_all)
+static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+                                               struct task_struct *task,
+                                               bool cancel_all)
 {
        struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
        struct io_uring_task *tctx = task ? task->io_uring : NULL;
@@ -9608,7 +9603,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                /* SQPOLL thread does its own polling */
                if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
                    (ctx->sq_data && ctx->sq_data->thread == current)) {
-                       while (!list_empty_careful(&ctx->iopoll_list)) {
+                       while (!wq_list_empty(&ctx->iopoll_list)) {
                                io_iopoll_try_reap_events(ctx);
                                ret = true;
                        }
@@ -9683,7 +9678,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 /*
  * Remove this io_uring_file -> task mapping.
  */
-static void io_uring_del_tctx_node(unsigned long index)
+static __cold void io_uring_del_tctx_node(unsigned long index)
 {
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;
@@ -9706,7 +9701,7 @@ static void io_uring_del_tctx_node(unsigned long index)
        kfree(node);
 }
 
-static void io_uring_clean_tctx(struct io_uring_task *tctx)
+static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
 {
        struct io_wq *wq = tctx->io_wq;
        struct io_tctx_node *node;
@@ -9733,7 +9728,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
        return percpu_counter_sum(&tctx->inflight);
 }
 
-static void io_uring_drop_tctx_refs(struct task_struct *task)
+static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 {
        struct io_uring_task *tctx = task->io_uring;
        unsigned int refs = tctx->cached_refs;
@@ -9749,7 +9744,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
  * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be non-NULL IFF it's an SQPOLL thread cancellation.
  */
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+static __cold void io_uring_cancel_generic(bool cancel_all,
+                                          struct io_sq_data *sqd)
 {
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx;
@@ -9842,7 +9838,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
 
 #ifdef CONFIG_MMU
 
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
        size_t sz = vma->vm_end - vma->vm_start;
        unsigned long pfn;
@@ -10027,7 +10023,7 @@ out_fput:
 }
 
 #ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
                const struct cred *cred)
 {
        struct user_namespace *uns = seq_user_ns(m);
@@ -10059,11 +10055,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
        return 0;
 }
 
-static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+                                         struct seq_file *m)
 {
        struct io_sq_data *sq = NULL;
+       struct io_overflow_cqe *ocqe;
+       struct io_rings *r = ctx->rings;
+       unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+       unsigned int sq_head = READ_ONCE(r->sq.head);
+       unsigned int sq_tail = READ_ONCE(r->sq.tail);
+       unsigned int cq_head = READ_ONCE(r->cq.head);
+       unsigned int cq_tail = READ_ONCE(r->cq.tail);
+       unsigned int sq_entries, cq_entries;
        bool has_lock;
-       int i;
+       unsigned int i;
+
+       /*
+        * We may get imprecise sqe and cqe info if the ring is actively
+        * running: cached_sq_head and cached_cq_tail are read without
+        * uring_lock, and sq_tail and cq_head are updated by userspace.
+        * That's fine, as this information is usually only looked at when
+        * the ring is stuck.
+        */
+       seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
+       seq_printf(m, "SqHead:\t%u\n", sq_head);
+       seq_printf(m, "SqTail:\t%u\n", sq_tail);
+       seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
+       seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
+       seq_printf(m, "CqHead:\t%u\n", cq_head);
+       seq_printf(m, "CqTail:\t%u\n", cq_tail);
+       seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
+       seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+       sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+       for (i = 0; i < sq_entries; i++) {
+               unsigned int entry = i + sq_head;
+               unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+               struct io_uring_sqe *sqe;
+
+               if (sq_idx > sq_mask)
+                       continue;
+               sqe = &ctx->sq_sqes[sq_idx];
+               seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
+                          sq_idx, sqe->opcode, sqe->fd, sqe->flags,
+                          sqe->user_data);
+       }
+       seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
+       cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
+       for (i = 0; i < cq_entries; i++) {
+               unsigned int entry = i + cq_head;
+               struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+
+               seq_printf(m, "%5u: user_data:%llu, res:%d, flags:%x\n",
+                          entry & cq_mask, cqe->user_data, cqe->res,
+                          cqe->flags);
+       }
 
        /*
         * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
@@ -10105,7 +10149,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
                xa_for_each(&ctx->personalities, index, cred)
                        io_uring_show_cred(m, index, cred);
        }
-       seq_printf(m, "PollList:\n");
+       if (has_lock)
+               mutex_unlock(&ctx->uring_lock);
+
+       seq_puts(m, "PollList:\n");
        spin_lock(&ctx->completion_lock);
        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
                struct hlist_head *list = &ctx->cancel_hash[i];
@@ -10115,12 +10162,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
                        seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
                                        req->task->task_works != NULL);
        }
+
+       seq_puts(m, "CqOverflowList:\n");
+       list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+               struct io_uring_cqe *cqe = &ocqe->cqe;
+
+               seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
+                          cqe->user_data, cqe->res, cqe->flags);
+       }
+
        spin_unlock(&ctx->completion_lock);
-       if (has_lock)
-               mutex_unlock(&ctx->uring_lock);
 }
 
-static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 {
        struct io_ring_ctx *ctx = f->private_data;
 
@@ -10144,8 +10199,8 @@ static const struct file_operations io_uring_fops = {
 #endif
 };
 
-static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
-                                 struct io_uring_params *p)
+static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+                                        struct io_uring_params *p)
 {
        struct io_rings *rings;
        size_t size, sq_array_offset;
@@ -10234,8 +10289,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
        return file;
 }
 
-static int io_uring_create(unsigned entries, struct io_uring_params *p,
-                          struct io_uring_params __user *params)
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+                                 struct io_uring_params __user *params)
 {
        struct io_ring_ctx *ctx;
        struct file *file;
@@ -10393,7 +10448,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
        return io_uring_setup(entries, params);
 }
 
-static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+                          unsigned nr_args)
 {
        struct io_uring_probe *p;
        size_t size;
@@ -10449,8 +10505,8 @@ static int io_register_personality(struct io_ring_ctx *ctx)
        return id;
 }
 
-static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
-                                   unsigned int nr_args)
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+                                          void __user *arg, unsigned int nr_args)
 {
        struct io_uring_restriction *res;
        size_t size;
@@ -10584,7 +10640,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
        return __io_register_rsrc_update(ctx, type, &up, up.nr);
 }
 
-static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int size, unsigned int type)
 {
        struct io_uring_rsrc_register rr;
@@ -10610,8 +10666,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
        return -EINVAL;
 }
 
-static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
-                               unsigned len)
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+                                      void __user *arg, unsigned len)
 {
        struct io_uring_task *tctx = current->io_uring;
        cpumask_var_t new_mask;
@@ -10637,7 +10693,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
        return ret;
 }
 
-static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
 {
        struct io_uring_task *tctx = current->io_uring;
 
@@ -10647,8 +10703,8 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
        return io_wq_cpu_affinity(tctx->io_wq, NULL);
 }
 
-static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
-                                       void __user *arg)
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+                                              void __user *arg)
        __must_hold(&ctx->uring_lock)
 {
        struct io_tctx_node *node;
@@ -10753,7 +10809,7 @@ static bool io_register_op_must_quiesce(int op)
        }
 }
 
-static int io_ctx_quiesce(struct io_ring_ctx *ctx)
+static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
 {
        long ret;
 
@@ -10768,10 +10824,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx)
         */
        mutex_unlock(&ctx->uring_lock);
        do {
-               ret = wait_for_completion_interruptible(&ctx->ref_comp);
-               if (!ret)
+               ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
+               if (ret) {
+                       ret = min(0L, ret);
                        break;
+               }
+
                ret = io_run_task_work_sig();
+               io_req_caches_free(ctx);
        } while (ret >= 0);
        mutex_lock(&ctx->uring_lock);
 
@@ -11002,6 +11062,8 @@ static int __init io_uring_init(void)
 
        /* should fit into one byte */
        BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+       BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
+       BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
 
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
        BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
index 4ecd255..811c898 100644 (file)
@@ -38,8 +38,7 @@ struct iomap_dio {
                struct {
                        struct iov_iter         *iter;
                        struct task_struct      *waiter;
-                       struct request_queue    *last_queue;
-                       blk_qc_t                cookie;
+                       struct bio              *poll_bio;
                } submit;
 
                /* used for aio completion: */
@@ -49,29 +48,20 @@ struct iomap_dio {
        };
 };
 
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
-       struct request_queue *q = READ_ONCE(kiocb->private);
-
-       if (!q)
-               return 0;
-       return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
 static void iomap_dio_submit_bio(const struct iomap_iter *iter,
                struct iomap_dio *dio, struct bio *bio, loff_t pos)
 {
        atomic_inc(&dio->ref);
 
-       if (dio->iocb->ki_flags & IOCB_HIPRI)
+       if (dio->iocb->ki_flags & IOCB_HIPRI) {
                bio_set_polled(bio, dio->iocb);
+               dio->submit.poll_bio = bio;
+       }
 
-       dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev);
        if (dio->dops && dio->dops->submit_io)
-               dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
+               dio->dops->submit_io(iter, bio, pos);
        else
-               dio->submit.cookie = submit_bio(bio);
+               submit_bio(bio);
 }
 
 ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -135,7 +125,7 @@ static void iomap_dio_complete_work(struct work_struct *work)
        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
        struct kiocb *iocb = dio->iocb;
 
-       iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+       iocb->ki_complete(iocb, iomap_dio_complete(dio));
 }
 
 /*
@@ -164,9 +154,11 @@ static void iomap_dio_bio_end_io(struct bio *bio)
                } else if (dio->flags & IOMAP_DIO_WRITE) {
                        struct inode *inode = file_inode(dio->iocb->ki_filp);
 
+                       WRITE_ONCE(dio->iocb->private, NULL);
                        INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
                        queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
                } else {
+                       WRITE_ONCE(dio->iocb->private, NULL);
                        iomap_dio_complete_work(&dio->aio.work);
                }
        }
@@ -282,6 +274,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
        if (!iov_iter_count(dio->submit.iter))
                goto out;
 
+       /*
+        * We can only poll for single bio I/Os.
+        */
+       if (need_zeroout ||
+           ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+               dio->iocb->ki_flags &= ~IOCB_HIPRI;
+
        if (need_zeroout) {
                /* zero out from the start of the block to the write offset */
                pad = pos & (fs_block_size - 1);
@@ -339,6 +338,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 
                nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
                                                 BIO_MAX_VECS);
+               /*
+                * We can only poll for single bio I/Os.
+                */
+               if (nr_pages)
+                       dio->iocb->ki_flags &= ~IOCB_HIPRI;
                iomap_dio_submit_bio(iter, dio, bio, pos);
                pos += n;
        } while (nr_pages);
@@ -485,8 +489,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 
        dio->submit.iter = iter;
        dio->submit.waiter = current;
-       dio->submit.cookie = BLK_QC_T_NONE;
-       dio->submit.last_queue = NULL;
+       dio->submit.poll_bio = NULL;
 
        if (iov_iter_rw(iter) == READ) {
                if (iomi.pos >= dio->i_size)
@@ -565,8 +568,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        inode_dio_begin(inode);
 
        blk_start_plug(&plug);
-       while ((ret = iomap_iter(&iomi, ops)) > 0)
+       while ((ret = iomap_iter(&iomi, ops)) > 0) {
                iomi.processed = iomap_dio_iter(&iomi, dio);
+
+               /*
+                * We can only poll for single bio I/Os.
+                */
+               iocb->ki_flags &= ~IOCB_HIPRI;
+       }
+
        blk_finish_plug(&plug);
 
        /*
@@ -592,8 +602,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        if (dio->flags & IOMAP_DIO_WRITE_FUA)
                dio->flags &= ~IOMAP_DIO_NEED_SYNC;
 
-       WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
-       WRITE_ONCE(iocb->private, dio->submit.last_queue);
+       WRITE_ONCE(iocb->private, dio->submit.poll_bio);
 
        /*
         * We are about to drop our additional submission reference, which
@@ -620,10 +629,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                        if (!READ_ONCE(dio->submit.waiter))
                                break;
 
-                       if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                           !dio->submit.last_queue ||
-                           !blk_poll(dio->submit.last_queue,
-                                        dio->submit.cookie, true))
+                       if (!dio->submit.poll_bio ||
+                           !bio_poll(dio->submit.poll_bio, NULL, 0))
                                blk_io_schedule();
                }
                __set_current_state(TASK_RUNNING);
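
With the submission cookie and last_queue gone, the only state iomap keeps for polled direct I/O is the bio itself, stashed in iocb->private and polled through bio_poll(). Filesystems no longer need iomap_dio_iopoll(); they point ->iopoll at the block layer's iocb_bio_iopoll() helper instead, as the xfs and zonefs hunks further down do. A minimal sketch of that wiring, with hypothetical myfs_* names:

    /* Illustrative kernel-context sketch; "myfs" is a hypothetical filesystem. */
    #include <linux/blkdev.h>
    #include <linux/fs.h>

    static const struct file_operations myfs_file_operations = {
        .read_iter      = generic_file_read_iter,
        .write_iter     = generic_file_write_iter,
        /*
         * Polled direct I/O: iocb_bio_iopoll() picks up the bio that
         * __iomap_dio_rw() stored in iocb->private and hands it to bio_poll().
         */
        .iopoll         = iocb_bio_iopoll,
    };
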
index 176580f..104ae69 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mempool.h>
 #include <linux/seq_file.h>
+#include <linux/writeback.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_filsys.h"
index bde787c..8b9a72a 100644 (file)
@@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
                goto out;
        }
 
-       VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits;
-
+       VolumeSize = sb_bdev_nr_blocks(sb);
        if (VolumeSize) {
                if (newLVSize > VolumeSize) {
                        printk(KERN_WARNING "jfs_extendfs: invalid size\n");
@@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
        txQuiesce(sb);
 
        /* Reset size of direct inode */
-       sbi->direct_inode->i_size =  i_size_read(sb->s_bdev->bd_inode);
+       sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev);
 
        if (sbi->mntflag & JFS_INLINELOG) {
                /*
index 9030aea..24cbc99 100644 (file)
@@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
                }
                case Opt_resize_nosize:
                {
-                       *newLVSize = i_size_read(sb->s_bdev->bd_inode) >>
-                               sb->s_blocksize_bits;
+                       *newLVSize = sb_bdev_nr_blocks(sb);
                        if (*newLVSize == 0)
                                pr_err("JFS: Cannot determine volume size\n");
                        break;
@@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
                ret = -ENOMEM;
                goto out_unload;
        }
-       inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+       inode->i_size = bdev_nr_bytes(sb->s_bdev);
        inode->i_mapping->a_ops = &jfs_metapage_aops;
        inode_fake_hash(inode);
        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
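
The jfs conversions above follow a pattern repeated across this series (nilfs, ntfs, reiserfs, squashfs, udf and others): open-coded reads of the block device inode size are replaced by bdev_nr_bytes() and sb_bdev_nr_blocks(). Judging only from the expressions they replace, the helpers are equivalent to the sketch below; the real definitions live in the block headers and are not shown in this excerpt.

    /*
     * Illustrative kernel-context sketch of the helper semantics implied by
     * the conversions in this series; the sketch_* names are not real kernel
     * symbols.
     */
    #include <linux/blkdev.h>
    #include <linux/fs.h>

    static inline loff_t sketch_bdev_nr_bytes(struct block_device *bdev)
    {
        /* replaces open-coded i_size_read(bdev->bd_inode) */
        return i_size_read(bdev->bd_inode);
    }

    static inline u64 sketch_sb_bdev_nr_blocks(struct super_block *sb)
    {
        /* replaces i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits */
        return sketch_bdev_nr_bytes(sb->s_bdev) >> sb->s_blocksize_bits;
    }
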
index 3d6fb4a..0fca9d6 100644 (file)
 /*
  *  linux/fs/locks.c
  *
- *  Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
- *  Doug Evans (dje@spiff.uucp), August 07, 1992
+ * We implement four types of file locks: BSD locks, posix locks, open
+ * file description locks, and leases.  For details about BSD locks,
+ * see the flock(2) man page; for details about the other three, see
+ * fcntl(2).
  *
- *  Deadlock detection added.
- *  FIXME: one thing isn't handled yet:
- *     - mandatory locks (requires lots of changes elsewhere)
- *  Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
- *
- *  Miscellaneous edits, and a total rewrite of posix_lock_file() code.
- *  Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *
- *  Converted file_lock_table to a linked list from an array, which eliminates
- *  the limits on how many active file locks are open.
- *  Chad Page (pageone@netcom.com), November 27, 1994
- *
- *  Removed dependency on file descriptors. dup()'ed file descriptors now
- *  get the same locks as the original file descriptors, and a close() on
- *  any file descriptor removes ALL the locks on the file for the current
- *  process. Since locks still depend on the process id, locks are inherited
- *  after an exec() but not after a fork(). This agrees with POSIX, and both
- *  BSD and SVR4 practice.
- *  Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
- *
- *  Scrapped free list which is redundant now that we allocate locks
- *  dynamically with kmalloc()/kfree().
- *  Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
- *
- *  Implemented two lock personalities - FL_FLOCK and FL_POSIX.
- *
- *  FL_POSIX locks are created with calls to fcntl() and lockf() through the
- *  fcntl() system call. They have the semantics described above.
- *
- *  FL_FLOCK locks are created with calls to flock(), through the flock()
- *  system call, which is new. Old C libraries implement flock() via fcntl()
- *  and will continue to use the old, broken implementation.
- *
- *  FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
- *  with a file pointer (filp). As a result they can be shared by a parent
- *  process and its children after a fork(). They are removed when the last
- *  file descriptor referring to the file pointer is closed (unless explicitly
- *  unlocked).
- *
- *  FL_FLOCK locks never deadlock, an existing lock is always removed before
- *  upgrading from shared to exclusive (or vice versa). When this happens
- *  any processes blocked by the current lock are woken up and allowed to
- *  run before the new lock is applied.
- *  Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
- *
- *  Removed some race conditions in flock_lock_file(), marked other possible
- *  races. Just grep for FIXME to see them.
- *  Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
- *
- *  Addressed Dmitry's concerns. Deadlock checking no longer recursive.
- *  Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
- *  once we've checked for blocking and deadlocking.
- *  Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
- *
- *  Initial implementation of mandatory locks. SunOS turned out to be
- *  a rotten model, so I implemented the "obvious" semantics.
- *  See 'Documentation/filesystems/mandatory-locking.rst' for details.
- *  Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
- *
- *  Don't allow mandatory locks on mmap()'ed files. Added simple functions to
- *  check if a file has mandatory locks, used by mmap(), open() and creat() to
- *  see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
- *  Manual, Section 2.
- *  Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
- *
- *  Tidied up block list handling. Added '/proc/locks' interface.
- *  Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
- *
- *  Fixed deadlock condition for pathological code that mixes calls to
- *  flock() and fcntl().
- *  Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
- *
- *  Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
- *  for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
- *  guarantee sensible behaviour in the case where file system modules might
- *  be compiled with different options than the kernel itself.
- *  Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- *  Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
- *  (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
- *  Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- *  Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
- *  locks. Changed process synchronisation to avoid dereferencing locks that
- *  have already been freed.
- *  Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
- *
- *  Made the block list a circular list to minimise searching in the list.
- *  Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
- *
- *  Made mandatory locking a mount option. Default is not to allow mandatory
- *  locking.
- *  Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
- *
- *  Some adaptations for NFS support.
- *  Olaf Kirch (okir@monad.swb.de), Dec 1996,
- *
- *  Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
- *  Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
- *
- *  Use slab allocator instead of kmalloc/kfree.
- *  Use generic list implementation from <linux/list.h>.
- *  Sped up posix_locks_deadlock by only considering blocked locks.
- *  Matthew Wilcox <willy@debian.org>, March, 2000.
- *
- *  Leases and LOCK_MAND
- *  Matthew Wilcox <willy@debian.org>, June, 2000.
- *  Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
  *
  * Locking conflicts and dependencies:
  * If multiple threads attempt to lock the same byte (or flock the same file)
@@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
 }
 
 static inline int flock_translate_cmd(int cmd) {
-       if (cmd & LOCK_MAND)
-               return cmd & (LOCK_MAND | LOCK_RW);
        switch (cmd) {
        case LOCK_SH:
                return F_RDLCK;
@@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl,
         */
        if (caller_fl->fl_file == sys_fl->fl_file)
                return false;
-       if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
-               return false;
 
        return locks_conflict(caller_fl, sys_fl);
 }
@@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
  *     - %LOCK_SH -- a shared lock.
  *     - %LOCK_EX -- an exclusive lock.
  *     - %LOCK_UN -- remove an existing lock.
- *     - %LOCK_MAND -- a 'mandatory' flock.
- *       This exists to emulate Windows Share Modes.
+ *     - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
  *
- *     %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
- *     processes read and write access respectively.
+ *     %LOCK_MAND support has been removed from the kernel.
  */
 SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 {
@@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
        cmd &= ~LOCK_NB;
        unlock = (cmd == LOCK_UN);
 
-       if (!unlock && !(cmd & LOCK_MAND) &&
-           !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+       if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+               goto out_putf;
+
+       /*
+        * LOCK_MAND locks were broken for a long time in that they never
+        * conflicted with one another and didn't prevent any sort of open,
+        * read or write activity.
+        *
+        * Just ignore these requests now, to preserve legacy behavior, but
+        * throw a warning to let people know that they don't actually work.
+        */
+       if (cmd & LOCK_MAND) {
+               pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+               error = 0;
                goto out_putf;
+       }
 
        lock = flock_make_lock(f.file, cmd, NULL);
        if (IS_ERR(lock)) {
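
As the hunk above shows, a LOCK_MAND request via flock(2) is now accepted, warned about once, and otherwise ignored rather than installing a lock that never conflicted with anything. A small userspace check (the temporary file path is arbitrary):

    /* Illustrative userspace sketch, not from this series. */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/file.h>
    #include <unistd.h>

    /* Values from include/uapi/asm-generic/fcntl.h, in case libc hides them. */
    #ifndef LOCK_MAND
    #define LOCK_MAND 32
    #define LOCK_READ 64
    #endif

    int main(void)
    {
        int fd = open("/tmp/lock-mand-test", O_CREAT | O_RDWR, 0600);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /*
         * Previously this installed a "mandatory" flock that never conflicted
         * with anything; now the kernel returns 0, prints a one-time warning
         * and takes no lock at all.
         */
        if (flock(fd, LOCK_MAND | LOCK_READ) == 0)
            printf("LOCK_MAND accepted (and ignored by the kernel)\n");
        else
            printf("LOCK_MAND rejected: errno=%d\n", errno);
        close(fd);
        return 0;
    }
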
@@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
        struct inode *inode = NULL;
        unsigned int fl_pid;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+       int type;
 
        fl_pid = locks_translate_pid(fl, proc_pidns);
        /*
@@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
                seq_printf(f, " %s ",
                             (inode == NULL) ? "*NOINODE*" : "ADVISORY ");
        } else if (IS_FLOCK(fl)) {
-               if (fl->fl_type & LOCK_MAND) {
-                       seq_puts(f, "FLOCK  MSNFS     ");
-               } else {
-                       seq_puts(f, "FLOCK  ADVISORY  ");
-               }
+               seq_puts(f, "FLOCK  ADVISORY  ");
        } else if (IS_LEASE(fl)) {
                if (fl->fl_flags & FL_DELEG)
                        seq_puts(f, "DELEG  ");
@@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
        } else {
                seq_puts(f, "UNKNOWN UNKNOWN  ");
        }
-       if (fl->fl_type & LOCK_MAND) {
-               seq_printf(f, "%s ",
-                              (fl->fl_type & LOCK_READ)
-                              ? (fl->fl_type & LOCK_WRITE) ? "RW   " : "READ "
-                              : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
-       } else {
-               int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+       type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
 
-               seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
-                                    (type == F_RDLCK) ? "READ" : "UNLCK");
-       }
+       seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+                            (type == F_RDLCK) ? "READ" : "UNLCK");
        if (inode) {
                /* userspace relies on this representation of dev_t */
                seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
index 1946d96..1f9d218 100644 (file)
@@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
        int error = get_write_access(inode);
        if (error)
                return error;
-       /*
-        * Refuse to truncate files with mandatory locks held on them.
-        */
+
        error = security_path_truncate(path);
        if (!error) {
                error = do_truncate(mnt_userns, path->dentry, 0,
index acb1d22..5e56da7 100644 (file)
@@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
        d->bdev = bdev;
 
 
-       d->len = i_size_read(d->bdev->bd_inode);
+       d->len = bdev_nr_bytes(d->bdev);
        d->map = bl_map_simple;
 
        printk(KERN_INFO "pNFS: using block device %s\n",
@@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
                return PTR_ERR(bdev);
        d->bdev = bdev;
 
-       d->len = i_size_read(d->bdev->bd_inode);
+       d->len = bdev_nr_bytes(d->bdev);
        d->map = bl_map_simple;
        d->pr_key = v->scsi.pr_key;
 
index 2e894fe..7a5f287 100644 (file)
@@ -275,7 +275,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
                        res = (long) dreq->count;
                        WARN_ON_ONCE(dreq->count < 0);
                }
-               dreq->iocb->ki_complete(dreq->iocb, res, 0);
+               dreq->iocb->ki_complete(dreq->iocb, res);
        }
 
        complete(&dreq->completion);
index aa353fd..24e7dcc 100644 (file)
@@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
 
-       /*
-        * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
-        * any standard. In principle we might be able to support LOCK_MAND
-        * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
-        * NFS code is not set up for it.
-        */
-       if (fl->fl_type & LOCK_MAND)
-               return -EINVAL;
-
        if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
                is_local = 1;
 
index 6e9ea4e..3d1d172 100644 (file)
@@ -109,7 +109,6 @@ config NFSD_SCSILAYOUT
        depends on NFSD_V4 && BLOCK
        select NFSD_PNFS
        select EXPORTFS_BLOCK_OPS
-       select SCSI_COMMON
        help
          This option enables support for the exporting pNFS SCSI layouts
          in the kernel's NFS server. The pNFS SCSI layout enables NFS
index c99dee9..e5c0982 100644 (file)
@@ -9,9 +9,6 @@
 #include <linux/pr.h>
 
 #include <linux/nfsd/debug.h>
-#include <scsi/scsi_proto.h>
-#include <scsi/scsi_common.h>
-#include <scsi/scsi_request.h>
 
 #include "blocklayoutxdr.h"
 #include "pnfs.h"
@@ -211,109 +208,6 @@ const struct nfsd4_layout_ops bl_layout_ops = {
 #endif /* CONFIG_NFSD_BLOCKLAYOUT */
 
 #ifdef CONFIG_NFSD_SCSILAYOUT
-static int nfsd4_scsi_identify_device(struct block_device *bdev,
-               struct pnfs_block_volume *b)
-{
-       struct request_queue *q = bdev->bd_disk->queue;
-       struct request *rq;
-       struct scsi_request *req;
-       /*
-        * The allocation length (passed in bytes 3 and 4 of the INQUIRY
-        * command descriptor block) specifies the number of bytes that have
-        * been allocated for the data-in buffer.
-        * 252 is the highest one-byte value that is a multiple of 4.
-        * 65532 is the highest two-byte value that is a multiple of 4.
-        */
-       size_t bufflen = 252, maxlen = 65532, len, id_len;
-       u8 *buf, *d, type, assoc;
-       int retries = 1, error;
-
-       if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q)))
-               return -EINVAL;
-
-again:
-       buf = kzalloc(bufflen, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-
-       rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
-       if (IS_ERR(rq)) {
-               error = -ENOMEM;
-               goto out_free_buf;
-       }
-       req = scsi_req(rq);
-
-       error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
-       if (error)
-               goto out_put_request;
-
-       req->cmd[0] = INQUIRY;
-       req->cmd[1] = 1;
-       req->cmd[2] = 0x83;
-       req->cmd[3] = bufflen >> 8;
-       req->cmd[4] = bufflen & 0xff;
-       req->cmd_len = COMMAND_SIZE(INQUIRY);
-
-       blk_execute_rq(NULL, rq, 1);
-       if (req->result) {
-               pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
-                       req->result);
-               error = -EIO;
-               goto out_put_request;
-       }
-
-       len = (buf[2] << 8) + buf[3] + 4;
-       if (len > bufflen) {
-               if (len <= maxlen && retries--) {
-                       blk_put_request(rq);
-                       kfree(buf);
-                       bufflen = len;
-                       goto again;
-               }
-               pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
-                       len);
-               goto out_put_request;
-       }
-
-       d = buf + 4;
-       for (d = buf + 4; d < buf + len; d += id_len + 4) {
-               id_len = d[3];
-               type = d[1] & 0xf;
-               assoc = (d[1] >> 4) & 0x3;
-
-               /*
-                * We only care about a EUI-64 and NAA designator types
-                * with LU association.
-                */
-               if (assoc != 0x00)
-                       continue;
-               if (type != 0x02 && type != 0x03)
-                       continue;
-               if (id_len != 8 && id_len != 12 && id_len != 16)
-                       continue;
-
-               b->scsi.code_set = PS_CODE_SET_BINARY;
-               b->scsi.designator_type = type == 0x02 ?
-                       PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
-               b->scsi.designator_len = id_len;
-               memcpy(b->scsi.designator, d + 4, id_len);
-
-               /*
-                * If we found a 8 or 12 byte descriptor continue on to
-                * see if a 16 byte one is available.  If we find a
-                * 16 byte descriptor we're done.
-                */
-               if (id_len == 16)
-                       break;
-       }
-
-out_put_request:
-       blk_put_request(rq);
-out_free_buf:
-       kfree(buf);
-       return error;
-}
-
 #define NFSD_MDS_PR_KEY                0x0100000000000000ULL
 
 /*
@@ -325,6 +219,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
        return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
 }
 
+static const u8 designator_types[] = {
+       PS_DESIGNATOR_EUI64,
+       PS_DESIGNATOR_NAA,
+};
+
+static int
+nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b)
+{
+       int ret, i;
+
+       for (i = 0; i < ARRAY_SIZE(designator_types); i++) {
+               u8 type = designator_types[i];
+
+               ret = disk->fops->get_unique_id(disk, b->scsi.designator, type);
+               if (ret > 0) {
+                       b->scsi.code_set = PS_CODE_SET_BINARY;
+                       b->scsi.designator_type = type;
+                       b->scsi.designator_len = ret;
+                       return 0;
+               }
+       }
+
+       return -EINVAL;
+}
+
 static int
 nfsd4_block_get_device_info_scsi(struct super_block *sb,
                struct nfs4_client *clp,
@@ -333,7 +252,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
        struct pnfs_block_deviceaddr *dev;
        struct pnfs_block_volume *b;
        const struct pr_ops *ops;
-       int error;
+       int ret;
 
        dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
                      sizeof(struct pnfs_block_volume), GFP_KERNEL);
@@ -347,33 +266,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
        b->type = PNFS_BLOCK_VOLUME_SCSI;
        b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
 
-       error = nfsd4_scsi_identify_device(sb->s_bdev, b);
-       if (error)
-               return error;
+       ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b);
+       if (ret < 0)
+               goto out_free_dev;
 
+       ret = -EINVAL;
        ops = sb->s_bdev->bd_disk->fops->pr_ops;
        if (!ops) {
                pr_err("pNFS: device %s does not support PRs.\n",
                        sb->s_id);
-               return -EINVAL;
+               goto out_free_dev;
        }
 
-       error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
-       if (error) {
+       ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+       if (ret) {
                pr_err("pNFS: failed to register key for device %s.\n",
                        sb->s_id);
-               return -EINVAL;
+               goto out_free_dev;
        }
 
-       error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+       ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
                        PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
-       if (error) {
+       if (ret) {
                pr_err("pNFS: failed to reserve device %s.\n",
                        sb->s_id);
-               return -EINVAL;
+               goto out_free_dev;
        }
 
        return 0;
+
+out_free_dev:
+       kfree(dev);
+       return ret;
 }
 
 static __be32
index a97873f..6d1b5bb 100644 (file)
@@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 #ifdef CONFIG_NFSD_SCSILAYOUT
        if (sb->s_export_op->map_blocks &&
            sb->s_export_op->commit_blocks &&
-           sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops &&
-               blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue))
+           sb->s_bdev &&
+           sb->s_bdev->bd_disk->fops->pr_ops &&
+           sb->s_bdev->bd_disk->fops->get_unique_id)
                exp->ex_layout_types |= 1 << LAYOUT_SCSI;
 #endif
 }
index 640ac8f..1d0583c 100644 (file)
@@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
                goto out;
 
        ret = -ERANGE;
-       if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+       if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev))
                goto out;
 
        segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
index f6b2d28..3134c0e 100644 (file)
@@ -403,7 +403,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
        int ret;
 
        ret = -ERANGE;
-       devsize = i_size_read(sb->s_bdev->bd_inode);
+       devsize = bdev_nr_bytes(sb->s_bdev);
        if (newsize > devsize)
                goto out;
 
index c8bfc01..1bfcb5d 100644 (file)
@@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 {
        struct nilfs_super_block **sbp = nilfs->ns_sbp;
        struct buffer_head **sbh = nilfs->ns_sbh;
-       u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+       u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev));
        int valid[2], swp = 0;
 
        sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
index ab4f336..373dbb6 100644 (file)
@@ -5,6 +5,7 @@
  * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
  */
 
+#include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
index 0d7e948..5ae8de0 100644 (file)
@@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
        ntfs_debug("Set device block size to %i bytes (block size bits %i).",
                        blocksize, sb->s_blocksize_bits);
        /* Determine the size of the device in units of block_size bytes. */
-       if (!i_size_read(sb->s_bdev->bd_inode)) {
+       vol->nr_blocks = sb_bdev_nr_blocks(sb);
+       if (!vol->nr_blocks) {
                if (!silent)
                        ntfs_error(sb, "Unable to determine device size.");
                goto err_out_now;
        }
-       vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
-                       sb->s_blocksize_bits;
        /* Read the boot sector and return unlocked buffer head to it. */
        if (!(bh = read_ntfs_boot_sector(sb, silent))) {
                if (!silent)
@@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
                        goto err_out_now;
                }
                BUG_ON(blocksize != sb->s_blocksize);
-               vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
-                               sb->s_blocksize_bits;
+               vol->nr_blocks = sb_bdev_nr_blocks(sb);
                ntfs_debug("Changed device block size to %i bytes (block size "
                                "bits %i) to match volume sector size.",
                                blocksize, sb->s_blocksize_bits);
index 43b1451..a3cd3c3 100644 (file)
@@ -8,6 +8,7 @@
  */
 
 #include <linux/backing-dev.h>
+#include <linux/blkdev.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <linux/falloc.h>
index 859951d..a87ab3a 100644 (file)
@@ -1046,7 +1046,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
        if (!ret && i2)
                ret = writeback_inode(i2);
        if (!ret)
-               ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping);
+               ret = sync_blockdev_nowait(sb->s_bdev);
        return ret;
 }
 
index d41d769..2981320 100644 (file)
@@ -921,7 +921,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
        /* Parse boot. */
        err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
-                                 bdev->bd_inode->i_size);
+                                 bdev_nr_bytes(bdev));
        if (err)
                goto out;
 
index 8521942..481017e 100644 (file)
@@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 {
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
        struct journal_head *jh;
-       int ret;
+       int ret = 1;
 
        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
                return 0;
@@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
        if (!buffer_jbd(bg_bh))
                return 1;
 
-       jh = bh2jh(bg_bh);
-       spin_lock(&jh->b_state_lock);
-       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
-       if (bg)
-               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
-       else
-               ret = 1;
-       spin_unlock(&jh->b_state_lock);
+       jbd_lock_bh_journal_head(bg_bh);
+       if (buffer_jbd(bg_bh)) {
+               jh = bh2jh(bg_bh);
+               spin_lock(&jh->b_state_lock);
+               bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+               if (bg)
+                       ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+               else
+                       ret = 1;
+               spin_unlock(&jh->b_state_lock);
+       }
+       jbd_unlock_bh_journal_head(bg_bh);
 
        return ret;
 }
index c1bb4c4..e5e3e50 100644 (file)
@@ -10,7 +10,7 @@
  *  Linux VFS inode operations.
  */
 
-#include <linux/bvec.h>
+#include <linux/blkdev.h>
 #include <linux/fileattr.h>
 #include "protocol.h"
 #include "orangefs-kernel.h"
index 2f2e430..8bb0a53 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/parser.h>
 #include <linux/hashtable.h>
+#include <linux/seq_file.h>
 
 /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
 static struct kmem_cache *orangefs_inode_cache;
index c88ac57..ac461a4 100644 (file)
@@ -272,14 +272,14 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
        kmem_cache_free(ovl_aio_request_cachep, aio_req);
 }
 
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
 {
        struct ovl_aio_req *aio_req = container_of(iocb,
                                                   struct ovl_aio_req, iocb);
        struct kiocb *orig_iocb = aio_req->orig_iocb;
 
        ovl_aio_cleanup_handler(aio_req);
-       orig_iocb->ki_complete(orig_iocb, res, res2);
+       orig_iocb->ki_complete(orig_iocb, res);
 }
 
 static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
index 04ce58c..5d1fbaf 100644 (file)
@@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
 static int __register_pstore_blk(struct pstore_device_info *dev,
                                 const char *devpath)
 {
-       struct inode *inode;
        int ret = -ENODEV;
 
        lockdep_assert_held(&pstore_blk_lock);
@@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev,
                goto err;
        }
 
-       inode = file_inode(psblk_file);
-       if (!S_ISBLK(inode->i_mode)) {
+       if (!S_ISBLK(file_inode(psblk_file)->i_mode)) {
                pr_err("'%s' is not block device!\n", devpath);
                goto err_fput;
        }
 
-       inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode;
-       dev->zone.total_size = i_size_read(inode);
+       dev->zone.total_size =
+               bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host));
 
        ret = __register_pstore_device(dev);
        if (ret)
index 2bcc9a6..052f143 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/namei.h>
 #include <linux/slab.h>
 #include <asm/current.h>
+#include <linux/blkdev.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/security.h>
index 65e7e56..e230234 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/uaccess.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 struct ramfs_mount_opts {
index af057c5..0074afa 100644 (file)
@@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
        if (unlikely((ssize_t) count < 0))
                return -EINVAL;
 
-       /*
-        * ranged mandatory locking does not apply to streams - it makes sense
-        * only for files where position has a meaning.
-        */
        if (ppos) {
                loff_t pos = *ppos;
 
index 58481f8..076f9ab 100644 (file)
@@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s,
 
                        if (!strcmp(arg, "auto")) {
                                /* From JFS code, to auto-get the size. */
-                               *blocks =
-                                   i_size_read(s->s_bdev->bd_inode) >> s->
-                                   s_blocksize_bits;
+                               *blocks = sb_bdev_nr_blocks(s);
                        } else {
                                *blocks = simple_strtoul(arg, &p, 0);
                                if (*p != '\0') {
@@ -1986,9 +1984,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
         * smaller than the filesystem. If the check fails then abort and
         * scream, because bad stuff will happen otherwise.
         */
-       if (s->s_bdev && s->s_bdev->bd_inode
-           && i_size_read(s->s_bdev->bd_inode) <
-           sb_block_count(rs) * sb_blocksize(rs)) {
+       if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
                SWARN(silent, s, "", "Filesystem cannot be "
                      "mounted because it is bigger than the device");
                SWARN(silent, s, "", "You may need to run fsck "
index 60d6951..bb44ff4 100644 (file)
@@ -16,6 +16,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
@@ -179,8 +180,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
        /* Check the filesystem does not extend beyond the end of the
           block device */
        msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
-       if (msblk->bytes_used < 0 || msblk->bytes_used >
-                       i_size_read(sb->s_bdev->bd_inode))
+       if (msblk->bytes_used < 0 ||
+           msblk->bytes_used > bdev_nr_bytes(sb->s_bdev))
                goto failed_mount;
 
        /* Check block size for sanity */
index 1373a61..3ce8e21 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -3,6 +3,7 @@
  * High-level sync()-related operations
  */
 
+#include <linux/blkdev.h>
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
                        SYNC_FILE_RANGE_WAIT_AFTER)
 
 /*
- * Do the filesystem syncing work. For simple filesystems
- * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
- * submit IO for these buffers via __sync_blockdev(). This also speeds up the
- * wait == 1 case since in that case write_inode() functions do
- * sync_dirty_buffer() and thus effectively write one block at a time.
- */
-static int __sync_filesystem(struct super_block *sb, int wait)
-{
-       if (wait)
-               sync_inodes_sb(sb);
-       else
-               writeback_inodes_sb(sb, WB_REASON_SYNC);
-
-       if (sb->s_op->sync_fs)
-               sb->s_op->sync_fs(sb, wait);
-       return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
  * Write out and wait upon all dirty data associated with this
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
@@ -61,10 +43,25 @@ int sync_filesystem(struct super_block *sb)
        if (sb_rdonly(sb))
                return 0;
 
-       ret = __sync_filesystem(sb, 0);
+       /*
+        * Do the filesystem syncing work.  For simple filesystems
+        * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
+        * to submit I/O for these buffers via sync_blockdev().  This also
+        * speeds up the wait == 1 case since in that case write_inode()
+        * methods call sync_dirty_buffer() and thus effectively write one block
+        * at a time.
+        */
+       writeback_inodes_sb(sb, WB_REASON_SYNC);
+       if (sb->s_op->sync_fs)
+               sb->s_op->sync_fs(sb, 0);
+       ret = sync_blockdev_nowait(sb->s_bdev);
        if (ret < 0)
                return ret;
-       return __sync_filesystem(sb, 1);
+
+       sync_inodes_sb(sb);
+       if (sb->s_op->sync_fs)
+               sb->s_op->sync_fs(sb, 1);
+       return sync_blockdev(sb->s_bdev);
 }
 EXPORT_SYMBOL(sync_filesystem);
 
@@ -81,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
                sb->s_op->sync_fs(sb, *(int *)arg);
 }
 
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
-       filemap_fdatawrite(bdev->bd_inode->i_mapping);
-}
-
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
-{
-       /*
-        * We keep the error status of individual mapping so that
-        * applications can catch the writeback error using fsync(2).
-        * See filemap_fdatawait_keep_errors() for details.
-        */
-       filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
-}
-
 /*
  * Sync everything. We start by waking flusher threads so that most of
  * writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -114,8 +96,8 @@ void ksys_sync(void)
        iterate_supers(sync_inodes_one_sb, NULL);
        iterate_supers(sync_fs_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &wait);
-       iterate_bdevs(fdatawrite_one_bdev, NULL);
-       iterate_bdevs(fdatawait_one_bdev, NULL);
+       sync_bdevs(false);
+       sync_bdevs(true);
        if (unlikely(laptop_mode))
                laptop_sync_completion();
 }
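
The sync path changes above fold __sync_filesystem() into sync_filesystem(), which now runs the non-blocking pass (writeback_inodes_sb(), ->sync_fs(sb, 0), sync_blockdev_nowait()) before the waiting pass, while ksys_sync() swaps the per-bdev iteration for sync_bdevs(). Userspace entry points are unchanged; syncfs(2), for instance, still funnels into sync_filesystem() for a single superblock:

    /* Illustrative userspace sketch, not from this series. */
    #define _GNU_SOURCE        /* for syncfs() */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        const char *path = argc > 1 ? argv[1] : ".";
        int fd = open(path, O_RDONLY);

        if (fd < 0 || syncfs(fd) < 0) {
            perror(path);
            return 1;
        }
        /* Both passes of sync_filesystem() have completed by the time we return. */
        close(fd);
        return 0;
    }
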
@@ -136,10 +118,10 @@ static void do_sync_work(struct work_struct *work)
         */
        iterate_supers(sync_inodes_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &nowait);
-       iterate_bdevs(fdatawrite_one_bdev, NULL);
+       sync_bdevs(false);
        iterate_supers(sync_inodes_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &nowait);
-       iterate_bdevs(fdatawrite_one_bdev, NULL);
+       sync_bdevs(false);
        printk("Emergency Sync complete\n");
        kfree(work);
 }
index 22be7ae..c57b46a 100644 (file)
@@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = {
        .get_context            = ubifs_crypt_get_context,
        .set_context            = ubifs_crypt_set_context,
        .empty_dir              = ubifs_crypt_empty_dir,
-       .max_namelen            = UBIFS_MAX_NLEN,
 };
index f1094cd..46d6971 100644 (file)
@@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
 
 unsigned long udf_get_last_block(struct super_block *sb)
 {
-       struct block_device *bdev = sb->s_bdev;
-       struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk);
+       struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);
        unsigned long lblock = 0;
 
        /*
@@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb)
         * Try using the device size...
         */
        if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0)
-               lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits;
+               lblock = sb_bdev_nr_blocks(sb);
 
        if (lblock)
                return lblock - 1;
index b2d7c57..34247fb 100644 (file)
@@ -1175,8 +1175,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
        struct udf_inode_info *vati;
        uint32_t pos;
        struct virtualAllocationTable20 *vat20;
-       sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >>
-                         sb->s_blocksize_bits;
+       sector_t blocks = sb_bdev_nr_blocks(sb);
 
        udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
        if (!sbi->s_vat_inode &&
@@ -1838,8 +1837,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
        int ret;
 
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
-           udf_fixed_to_variable(block) >=
-           i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits)
+           udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb))
                return -EAGAIN;
 
        bh = udf_read_tagged(sb, block, block, &ident);
@@ -1901,8 +1899,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
                last[last_count++] = *lastblock - 152;
 
        for (i = 0; i < last_count; i++) {
-               if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >>
-                               sb->s_blocksize_bits)
+               if (last[i] >= sb_bdev_nr_blocks(sb))
                        continue;
                ret = udf_check_anchor_block(sb, last[i], fileset);
                if (ret != -EAGAIN) {
index 7aa943e..62e7fbe 100644 (file)
@@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = {
        .write_iter     = xfs_file_write_iter,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
        .unlocked_ioctl = xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl,
index ddc346a..3ce5f47 100644 (file)
@@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = {
        .write_iter     = zonefs_file_write_iter,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
-       .iopoll         = iomap_dio_iopoll,
+       .iopoll         = iocb_bio_iopoll,
 };
 
 static struct kmem_cache *zonefs_inode_cachep;
index 4a674db..fedc0df 100644 (file)
@@ -49,9 +49,15 @@ static inline void flush_cache_page(struct vm_area_struct *vma,
 static inline void flush_dcache_page(struct page *page)
 {
 }
+
+static inline void flush_dcache_folio(struct folio *folio) { }
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
 #endif
 
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+void flush_dcache_folio(struct folio *folio);
+#endif
 
 #ifndef flush_dcache_mmap_lock
 static inline void flush_dcache_mmap_lock(struct address_space *mapping)
index 1b44f40..199e47e 100644 (file)
@@ -329,6 +329,7 @@ enum {
        ATA_LOG_SECURITY          = 0x06,
        ATA_LOG_SATA_SETTINGS     = 0x08,
        ATA_LOG_ZONED_INFORMATION = 0x09,
+       ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47,
 
        /* Identify device SATA settings log:*/
        ATA_LOG_DEVSLP_OFFSET     = 0x30,
index ac7f231..9c14f0a 100644 (file)
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/blkdev.h>
 #include <linux/device.h>
 #include <linux/writeback.h>
-#include <linux/blk-cgroup.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/slab.h>
 
+struct blkcg;
+
 static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 {
        kref_get(&bdi->refcnt);
@@ -64,7 +64,7 @@ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
        return atomic_long_read(&bdi->tot_write_bandwidth);
 }
 
-static inline void __add_wb_stat(struct bdi_writeback *wb,
+static inline void wb_stat_mod(struct bdi_writeback *wb,
                                 enum wb_stat_item item, s64 amount)
 {
        percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
@@ -72,12 +72,12 @@ static inline void __add_wb_stat(struct bdi_writeback *wb,
 
 static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-       __add_wb_stat(wb, item, 1);
+       wb_stat_mod(wb, item, 1);
 }
 
 static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-       __add_wb_stat(wb, item, -1);
+       wb_stat_mod(wb, item, -1);
 }
 
 static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
@@ -133,20 +133,7 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb)
        return test_bit(WB_writeback_running, &wb->state);
 }
 
-static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
-       struct super_block *sb;
-
-       if (!inode)
-               return &noop_backing_dev_info;
-
-       sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
-       if (sb_is_blkdev_sb(sb))
-               return I_BDEV(inode)->bd_disk->bdi;
-#endif
-       return sb->s_bdi;
-}
+struct backing_dev_info *inode_to_bdi(struct inode *inode);
 
 static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 {
index 00952e9..fe6bdfb 100644 (file)
@@ -6,19 +6,10 @@
 #define __LINUX_BIO_H
 
 #include <linux/mempool.h>
-#include <linux/ioprio.h>
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
 #include <linux/blk_types.h>
 #include <linux/uio.h>
 
-#define BIO_DEBUG
-
-#ifdef BIO_DEBUG
-#define BIO_BUG_ON     BUG_ON
-#else
-#define BIO_BUG_ON
-#endif
-
 #define BIO_MAX_VECS           256U
 
 static inline unsigned int bio_max_segs(unsigned int nr_segs)
@@ -78,22 +69,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
               bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
-static inline bool bio_mergeable(struct bio *bio)
-{
-       if (bio->bi_opf & REQ_NOMERGE_FLAGS)
-               return false;
-
-       return true;
-}
-
-static inline unsigned int bio_cur_bytes(struct bio *bio)
-{
-       if (bio_has_data(bio))
-               return bio_iovec(bio).bv_len;
-       else /* dataless requests such as discard */
-               return bio->bi_iter.bi_size;
-}
-
 static inline void *bio_data(struct bio *bio)
 {
        if (bio_has_data(bio))
@@ -102,25 +77,6 @@ static inline void *bio_data(struct bio *bio)
        return NULL;
 }
 
-/**
- * bio_full - check if the bio is full
- * @bio:       bio to check
- * @len:       length of one segment to be added
- *
- * Return true if @bio is full and one segment with @len bytes can't be
- * added to the bio, otherwise return false
- */
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
-       if (bio->bi_vcnt >= bio->bi_max_vecs)
-               return true;
-
-       if (bio->bi_iter.bi_size > UINT_MAX - len)
-               return true;
-
-       return false;
-}
-
 static inline bool bio_next_segment(const struct bio *bio,
                                    struct bvec_iter_all *iter)
 {
@@ -163,6 +119,28 @@ static inline void bio_advance_iter_single(const struct bio *bio,
                bvec_iter_advance_single(bio->bi_io_vec, iter, bytes);
 }
 
+void __bio_advance(struct bio *, unsigned bytes);
+
+/**
+ * bio_advance - increment/complete a bio by some number of bytes
+ * @bio:       bio to advance
+ * @bytes:     number of bytes to complete
+ *
+ * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+ * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+ * be updated on the last bvec as well.
+ *
+ * @bio will then represent the remaining, uncompleted portion of the io.
+ */
+static inline void bio_advance(struct bio *bio, unsigned int nbytes)
+{
+       if (nbytes == bio->bi_iter.bi_size) {
+               bio->bi_iter.bi_size = 0;
+               return;
+       }
+       __bio_advance(bio, nbytes);
+}
+
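/*
 * Editorial sketch, not part of this patch: one way a caller that owns a
 * bio might consume it in fixed-size steps with bio_advance().  The chunk
 * size and the per-chunk work are hypothetical.
 */
static void consume_bio_in_chunks(struct bio *bio, unsigned int chunk)
{
	while (bio->bi_iter.bi_size) {
		unsigned int step = chunk;

		if (step > bio->bi_iter.bi_size)
			step = bio->bi_iter.bi_size;
		/* ... process 'step' bytes of the bio here ... */
		bio_advance(bio, step);
	}
}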
 #define __bio_for_each_segment(bvl, bio, iter, start)                  \
        for (iter = (start);                                            \
             (iter).bi_size &&                                          \
@@ -265,37 +243,6 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
        bio->bi_flags &= ~(1U << bit);
 }
 
-static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
-{
-       *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-}
-
-static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
-{
-       struct bvec_iter iter = bio->bi_iter;
-       int idx;
-
-       bio_get_first_bvec(bio, bv);
-       if (bv->bv_len == bio->bi_iter.bi_size)
-               return;         /* this bio only has a single bvec */
-
-       bio_advance_iter(bio, &iter, iter.bi_size);
-
-       if (!iter.bi_bvec_done)
-               idx = iter.bi_idx - 1;
-       else    /* in the middle of bvec */
-               idx = iter.bi_idx;
-
-       *bv = bio->bi_io_vec[idx];
-
-       /*
-        * iter.bi_bvec_done records actual length of the last bvec
-        * if this bio ends in the middle of one io vector
-        */
-       if (iter.bi_bvec_done)
-               bv->bv_len = iter.bi_bvec_done;
-}
-
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
        WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
@@ -424,7 +371,7 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs)
        return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
 }
 
-extern blk_qc_t submit_bio(struct bio *);
+void submit_bio(struct bio *bio);
 
 extern void bio_endio(struct bio *);
 
@@ -456,8 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 struct request_queue;
 
 extern int submit_bio_wait(struct bio *bio);
-extern void bio_advance(struct bio *, unsigned);
-
 extern void bio_init(struct bio *bio, struct bio_vec *table,
                     unsigned short max_vecs);
 extern void bio_uninit(struct bio *);
@@ -469,12 +414,11 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
                           unsigned int, unsigned int);
 int bio_add_zone_append_page(struct bio *bio, struct page *page,
                             unsigned int len, unsigned int offset);
-bool __bio_try_merge_page(struct bio *bio, struct page *page,
-               unsigned int len, unsigned int off, bool *same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-void bio_release_pages(struct bio *bio, bool mark_dirty);
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
+void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
@@ -482,27 +426,16 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                               struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern void bio_free_pages(struct bio *bio);
-void bio_truncate(struct bio *bio, unsigned new_size);
 void guard_bio_eod(struct bio *bio);
 void zero_fill_bio(struct bio *bio);
 
-extern const char *bio_devname(struct bio *bio, char *buffer);
+static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
+{
+       if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+               __bio_release_pages(bio, mark_dirty);
+}
 
-#define bio_set_dev(bio, bdev)                                 \
-do {                                                   \
-       bio_clear_flag(bio, BIO_REMAPPED);              \
-       if ((bio)->bi_bdev != (bdev))                   \
-               bio_clear_flag(bio, BIO_THROTTLED);     \
-       (bio)->bi_bdev = (bdev);                        \
-       bio_associate_blkg(bio);                        \
-} while (0)
-
-#define bio_copy_dev(dst, src)                 \
-do {                                           \
-       bio_clear_flag(dst, BIO_REMAPPED);              \
-       (dst)->bi_bdev = (src)->bi_bdev;        \
-       bio_clone_blkg_association(dst, src);   \
-} while (0)
+extern const char *bio_devname(struct bio *bio, char *buffer);
 
 #define bio_dev(bio) \
        disk_devt((bio)->bi_bdev->bd_disk)
@@ -521,6 +454,22 @@ static inline void bio_clone_blkg_association(struct bio *dst,
                                              struct bio *src) { }
 #endif /* CONFIG_BLK_CGROUP */
 
+static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+       bio_clear_flag(bio, BIO_REMAPPED);
+       if (bio->bi_bdev != bdev)
+               bio_clear_flag(bio, BIO_THROTTLED);
+       bio->bi_bdev = bdev;
+       bio_associate_blkg(bio);
+}
+
+static inline void bio_copy_dev(struct bio *dst, struct bio *src)
+{
+       bio_clear_flag(dst, BIO_REMAPPED);
+       dst->bi_bdev = src->bi_bdev;
+       bio_clone_blkg_association(dst, src);
+}
+
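/*
 * Editorial sketch, not part of this patch: minimal write submission using
 * the inline helpers above.  Assumes the caller owns @bdev and @page and
 * handles completion elsewhere; error handling is omitted.
 */
static void submit_one_page_write(struct block_device *bdev,
				  struct page *page, sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio_set_dev(bio, bdev);		/* clears BIO_REMAPPED, associates blkg */
	bio->bi_opf = REQ_OP_WRITE;
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);		/* note: returns void after this series */
}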
 /*
  * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
  *
@@ -784,7 +733,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
  */
 static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
 {
-       bio->bi_opf |= REQ_HIPRI;
+       bio->bi_opf |= REQ_POLLED;
        if (!is_sync_kiocb(kiocb))
                bio->bi_opf |= REQ_NOWAIT;
 }
diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h
new file mode 100644 (file)
index 0000000..bbab65b
--- /dev/null
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef __LINUX_BLK_CRYPTO_PROFILE_H
+#define __LINUX_BLK_CRYPTO_PROFILE_H
+
+#include <linux/bio.h>
+#include <linux/blk-crypto.h>
+
+struct blk_crypto_profile;
+
+/**
+ * struct blk_crypto_ll_ops - functions to control inline encryption hardware
+ *
+ * Low-level operations for controlling inline encryption hardware.  This
+ * interface must be implemented by storage drivers that support inline
+ * encryption.  All functions may sleep, are serialized by profile->lock, and
+ * are never called while profile->dev (if set) is runtime-suspended.
+ */
+struct blk_crypto_ll_ops {
+
+       /**
+        * @keyslot_program: Program a key into the inline encryption hardware.
+        *
+        * Program @key into the specified @slot in the inline encryption
+        * hardware, overwriting any key that the keyslot may already contain.
+        * The keyslot is guaranteed to not be in-use by any I/O.
+        *
+        * This is required if the device has keyslots.  Otherwise (i.e. if the
+        * device is a layered device, or if the device is real hardware that
+        * simply doesn't have the concept of keyslots) it is never called.
+        *
+        * Must return 0 on success, or -errno on failure.
+        */
+       int (*keyslot_program)(struct blk_crypto_profile *profile,
+                              const struct blk_crypto_key *key,
+                              unsigned int slot);
+
+       /**
+        * @keyslot_evict: Evict a key from the inline encryption hardware.
+        *
+        * If the device has keyslots, this function must evict the key from the
+        * specified @slot.  The slot will contain @key, but there should be no
+        * need for the @key argument to be used as @slot should be sufficient.
+        * The keyslot is guaranteed to not be in-use by any I/O.
+        *
+        * If the device doesn't have keyslots itself, this function must evict
+        * @key from any underlying devices.  @slot won't be valid in this case.
+        *
+        * If there are no keyslots and no underlying devices, this function
+        * isn't required.
+        *
+        * Must return 0 on success, or -errno on failure.
+        */
+       int (*keyslot_evict)(struct blk_crypto_profile *profile,
+                            const struct blk_crypto_key *key,
+                            unsigned int slot);
+};
+
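/*
 * Editorial sketch, not part of this patch: the shape of a driver's
 * blk_crypto_ll_ops.  The commented-out foo_hw_* calls are hypothetical
 * placeholders for real keyslot register programming.
 */
static int foo_keyslot_program(struct blk_crypto_profile *profile,
			       const struct blk_crypto_key *key,
			       unsigned int slot)
{
	/* foo_hw_program_key(profile->dev, key, slot); */
	return 0;
}

static int foo_keyslot_evict(struct blk_crypto_profile *profile,
			     const struct blk_crypto_key *key,
			     unsigned int slot)
{
	/* foo_hw_clear_keyslot(profile->dev, slot); */
	return 0;
}

static const struct blk_crypto_ll_ops foo_crypto_ops = {
	.keyslot_program	= foo_keyslot_program,
	.keyslot_evict		= foo_keyslot_evict,
};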
+/**
+ * struct blk_crypto_profile - inline encryption profile for a device
+ *
+ * This struct contains a storage device's inline encryption capabilities (e.g.
+ * the supported crypto algorithms), driver-provided functions to control the
+ * inline encryption hardware (e.g. programming and evicting keys), and optional
+ * device-independent keyslot management data.
+ */
+struct blk_crypto_profile {
+
+       /* public: Drivers must initialize the following fields. */
+
+       /**
+        * @ll_ops: Driver-provided functions to control the inline encryption
+        * hardware, e.g. program and evict keys.
+        */
+       struct blk_crypto_ll_ops ll_ops;
+
+       /**
+        * @max_dun_bytes_supported: The maximum number of bytes supported for
+        * specifying the data unit number (DUN).  Specifically, the range of
+        * supported DUNs is 0 through (1 << (8 * max_dun_bytes_supported)) - 1.
+        */
+       unsigned int max_dun_bytes_supported;
+
+       /**
+        * @modes_supported: Array of bitmasks that specifies whether each
+        * combination of crypto mode and data unit size is supported.
+        * Specifically, the i'th bit of modes_supported[crypto_mode] is set if
+        * crypto_mode can be used with a data unit size of (1 << i).  Note that
+        * only data unit sizes that are powers of 2 can be supported.
+        */
+       unsigned int modes_supported[BLK_ENCRYPTION_MODE_MAX];
+
+       /**
+        * @dev: An optional device for runtime power management.  If the driver
+        * provides this device, it will be runtime-resumed before any function
+        * in @ll_ops is called and will remain resumed during the call.
+        */
+       struct device *dev;
+
+       /* private: The following fields shouldn't be accessed by drivers. */
+
+       /* Number of keyslots, or 0 if not applicable */
+       unsigned int num_slots;
+
+       /*
+        * Serializes all calls to functions in @ll_ops as well as all changes
+        * to @slot_hashtable.  This can also be taken in read mode to look up
+        * keyslots while ensuring that they can't be changed concurrently.
+        */
+       struct rw_semaphore lock;
+
+       /* List of idle slots, with least recently used slot at front */
+       wait_queue_head_t idle_slots_wait_queue;
+       struct list_head idle_slots;
+       spinlock_t idle_slots_lock;
+
+       /*
+        * Hash table which maps blk_crypto_key pointers to keyslots, so that we
+        * can find a key's keyslot in O(1) time rather than O(num_slots).
+        * Protected by 'lock'.
+        */
+       struct hlist_head *slot_hashtable;
+       unsigned int log_slot_ht_size;
+
+       /* Per-keyslot data */
+       struct blk_crypto_keyslot *slots;
+};
+
+int blk_crypto_profile_init(struct blk_crypto_profile *profile,
+                           unsigned int num_slots);
+
+int devm_blk_crypto_profile_init(struct device *dev,
+                                struct blk_crypto_profile *profile,
+                                unsigned int num_slots);
+
+unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot);
+
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+                                   const struct blk_crypto_key *key,
+                                   struct blk_crypto_keyslot **slot_ptr);
+
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
+
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+                               const struct blk_crypto_config *cfg);
+
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+                          const struct blk_crypto_key *key);
+
+void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile);
+
+void blk_crypto_profile_destroy(struct blk_crypto_profile *profile);
+
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+                                      const struct blk_crypto_profile *child);
+
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+                                const struct blk_crypto_profile *reference);
+
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+                                   const struct blk_crypto_profile *src);
+
+#endif /* __LINUX_BLK_CRYPTO_PROFILE_H */
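/*
 * Editorial sketch, not part of this patch: initializing a profile for a
 * hypothetical device with 32 keyslots, a 64-bit DUN and AES-256-XTS
 * support at a 4K data unit size.  foo_crypto_ops is the driver's ll_ops
 * from the sketch above.
 */
static int foo_init_crypto_profile(struct device *dev,
				   struct blk_crypto_profile *profile)
{
	int err = devm_blk_crypto_profile_init(dev, profile, 32);

	if (err)
		return err;

	profile->ll_ops = foo_crypto_ops;
	profile->dev = dev;
	profile->max_dun_bytes_supported = 8;
	/* bit i set <=> data unit size (1 << i) supported */
	profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] = 1 << 12;
	return 0;
}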
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
new file mode 100644 (file)
index 0000000..8a038ea
--- /dev/null
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BLK_INTEGRITY_H
+#define _LINUX_BLK_INTEGRITY_H
+
+#include <linux/blk-mq.h>
+
+struct request;
+
+enum blk_integrity_flags {
+       BLK_INTEGRITY_VERIFY            = 1 << 0,
+       BLK_INTEGRITY_GENERATE          = 1 << 1,
+       BLK_INTEGRITY_DEVICE_CAPABLE    = 1 << 2,
+       BLK_INTEGRITY_IP_CHECKSUM       = 1 << 3,
+};
+
+struct blk_integrity_iter {
+       void                    *prot_buf;
+       void                    *data_buf;
+       sector_t                seed;
+       unsigned int            data_size;
+       unsigned short          interval;
+       const char              *disk_name;
+};
+
+typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
+typedef void (integrity_prepare_fn) (struct request *);
+typedef void (integrity_complete_fn) (struct request *, unsigned int);
+
+struct blk_integrity_profile {
+       integrity_processing_fn         *generate_fn;
+       integrity_processing_fn         *verify_fn;
+       integrity_prepare_fn            *prepare_fn;
+       integrity_complete_fn           *complete_fn;
+       const char                      *name;
+};
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+void blk_integrity_register(struct gendisk *, struct blk_integrity *);
+void blk_integrity_unregister(struct gendisk *);
+int blk_integrity_compare(struct gendisk *, struct gendisk *);
+int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
+                                  struct scatterlist *);
+int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+       struct blk_integrity *bi = &disk->queue->integrity;
+
+       if (!bi->profile)
+               return NULL;
+
+       return bi;
+}
+
+static inline struct blk_integrity *
+bdev_get_integrity(struct block_device *bdev)
+{
+       return blk_get_integrity(bdev->bd_disk);
+}
+
+static inline bool
+blk_integrity_queue_supports_integrity(struct request_queue *q)
+{
+       return q->integrity.profile;
+}
+
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+                                                   unsigned int segs)
+{
+       q->limits.max_integrity_segments = segs;
+}
+
+static inline unsigned short
+queue_max_integrity_segments(const struct request_queue *q)
+{
+       return q->limits.max_integrity_segments;
+}
+
+/**
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
+ * @bi:                blk_integrity profile for device
+ * @sectors:   Size of the bio in 512-byte sectors
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device.  Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
+ */
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+                                                  unsigned int sectors)
+{
+       return sectors >> (bi->interval_exp - 9);
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+                                              unsigned int sectors)
+{
+       return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+}
+
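/*
 * Editorial worked example, not part of this patch: for a device with a
 * 4096-byte protection interval (interval_exp = 12) and 8-byte tuples,
 * a 16 KiB bio spans 32 512-byte sectors, so
 *   bio_integrity_intervals(bi, 32) = 32 >> (12 - 9) = 4 intervals
 *   bio_integrity_bytes(bi, 32)     = 4 * 8          = 32 bytes of PI
 */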
+static inline bool blk_integrity_rq(struct request *rq)
+{
+       return rq->cmd_flags & REQ_INTEGRITY;
+}
+
+/*
+ * Return the first bvec that contains integrity data.  Only drivers that are
+ * limited to a single integrity segment should use this helper.
+ */
+static inline struct bio_vec *rq_integrity_vec(struct request *rq)
+{
+       if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
+               return NULL;
+       return rq->bio->bi_integrity->bip_vec;
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline int blk_rq_count_integrity_sg(struct request_queue *q,
+                                           struct bio *b)
+{
+       return 0;
+}
+static inline int blk_rq_map_integrity_sg(struct request_queue *q,
+                                         struct bio *b,
+                                         struct scatterlist *s)
+{
+       return 0;
+}
+static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
+{
+       return NULL;
+}
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+       return NULL;
+}
+static inline bool
+blk_integrity_queue_supports_integrity(struct request_queue *q)
+{
+       return false;
+}
+static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
+{
+       return 0;
+}
+static inline void blk_integrity_register(struct gendisk *d,
+                                        struct blk_integrity *b)
+{
+}
+static inline void blk_integrity_unregister(struct gendisk *d)
+{
+}
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+                                                   unsigned int segs)
+{
+}
+static inline unsigned short
+queue_max_integrity_segments(const struct request_queue *q)
+{
+       return 0;
+}
+
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+                                                  unsigned int sectors)
+{
+       return 0;
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+                                              unsigned int sectors)
+{
+       return 0;
+}
+static inline int blk_integrity_rq(struct request *rq)
+{
+       return 0;
+}
+
+static inline struct bio_vec *rq_integrity_vec(struct request *rq)
+{
+       return NULL;
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+#endif /* _LINUX_BLK_INTEGRITY_H */
index 13ba186..8682663 100644 (file)
 #include <linux/sbitmap.h>
 #include <linux/srcu.h>
 #include <linux/lockdep.h>
+#include <linux/scatterlist.h>
+#include <linux/prefetch.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
 
+#define BLKDEV_MIN_RQ  4
+#define BLKDEV_DEFAULT_RQ      128
+
+typedef void (rq_end_io_fn)(struct request *, blk_status_t);
+
+/*
+ * request flags */
+typedef __u32 __bitwise req_flags_t;
+
+/* drive already may have started this one */
+#define RQF_STARTED            ((__force req_flags_t)(1 << 1))
+/* may not be passed by ioscheduler */
+#define RQF_SOFTBARRIER                ((__force req_flags_t)(1 << 3))
+/* request for flush sequence */
+#define RQF_FLUSH_SEQ          ((__force req_flags_t)(1 << 4))
+/* merge of different types, fail separately */
+#define RQF_MIXED_MERGE                ((__force req_flags_t)(1 << 5))
+/* track inflight for MQ */
+#define RQF_MQ_INFLIGHT                ((__force req_flags_t)(1 << 6))
+/* don't call prep for this one */
+#define RQF_DONTPREP           ((__force req_flags_t)(1 << 7))
+/* vaguely specified driver internal error.  Ignored by the block layer */
+#define RQF_FAILED             ((__force req_flags_t)(1 << 10))
+/* don't warn about errors */
+#define RQF_QUIET              ((__force req_flags_t)(1 << 11))
+/* elevator private data attached */
+#define RQF_ELVPRIV            ((__force req_flags_t)(1 << 12))
+/* account into disk and partition IO statistics */
+#define RQF_IO_STAT            ((__force req_flags_t)(1 << 13))
+/* runtime pm request */
+#define RQF_PM                 ((__force req_flags_t)(1 << 15))
+/* on IO scheduler merge hash */
+#define RQF_HASHED             ((__force req_flags_t)(1 << 16))
+/* track IO completion time */
+#define RQF_STATS              ((__force req_flags_t)(1 << 17))
+/* Look at ->special_vec for the actual data payload instead of the
+   bio chain. */
+#define RQF_SPECIAL_PAYLOAD    ((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
+/* already slept for hybrid poll */
+#define RQF_MQ_POLL_SLEPT      ((__force req_flags_t)(1 << 20))
+/* ->timeout has been called, don't expire again */
+#define RQF_TIMED_OUT          ((__force req_flags_t)(1 << 21))
+/* queue has elevator attached */
+#define RQF_ELV                        ((__force req_flags_t)(1 << 22))
+
+/* flags that prevent us from merging requests: */
+#define RQF_NOMERGE_FLAGS \
+       (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+
+enum mq_rq_state {
+       MQ_RQ_IDLE              = 0,
+       MQ_RQ_IN_FLIGHT         = 1,
+       MQ_RQ_COMPLETE          = 2,
+};
+
+/*
+ * Try to put the fields that are referenced together in the same cacheline.
+ *
+ * If you modify this structure, make sure to update blk_rq_init() and
+ * especially blk_mq_rq_ctx_init() to take care of the added fields.
+ */
+struct request {
+       struct request_queue *q;
+       struct blk_mq_ctx *mq_ctx;
+       struct blk_mq_hw_ctx *mq_hctx;
+
+       unsigned int cmd_flags;         /* op and common flags */
+       req_flags_t rq_flags;
+
+       int tag;
+       int internal_tag;
+
+       unsigned int timeout;
+
+       /* the following two fields are internal, NEVER access directly */
+       unsigned int __data_len;        /* total data len */
+       sector_t __sector;              /* sector cursor */
+
+       struct bio *bio;
+       struct bio *biotail;
+
+       union {
+               struct list_head queuelist;
+               struct request *rq_next;
+       };
+
+       struct gendisk *rq_disk;
+       struct block_device *part;
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+       /* Time that the first bio started allocating this request. */
+       u64 alloc_time_ns;
+#endif
+       /* Time that this request was allocated for this IO. */
+       u64 start_time_ns;
+       /* Time that I/O was submitted to the device. */
+       u64 io_start_time_ns;
+
+#ifdef CONFIG_BLK_WBT
+       unsigned short wbt_flags;
+#endif
+       /*
+        * rq sectors used for blk stats. It has the same value
+        * rq sectors used for blk stats. It has the same value as
+        * blk_rq_sectors(rq), except that it is never zeroed
+        */
+       unsigned short stats_sectors;
+
+       /*
+        * Number of scatter-gather DMA addr+len pairs after
+        * physical address coalescing is performed.
+        */
+       unsigned short nr_phys_segments;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+       unsigned short nr_integrity_segments;
+#endif
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+       struct bio_crypt_ctx *crypt_ctx;
+       struct blk_crypto_keyslot *crypt_keyslot;
+#endif
+
+       unsigned short write_hint;
+       unsigned short ioprio;
+
+       enum mq_rq_state state;
+       refcount_t ref;
+
+       unsigned long deadline;
+
+       /*
+        * The hash is used inside the scheduler, and killed once the
+        * request reaches the dispatch list. The ipi_list is only used
+        * to queue the request for softirq completion, which is long
+        * after the request has been unhashed (and even removed from
+        * the dispatch list).
+        */
+       union {
+               struct hlist_node hash; /* merge hash */
+               struct llist_node ipi_list;
+       };
+
+       /*
+        * The rb_node is only used inside the io scheduler, requests
+        * are pruned when moved to the dispatch queue. So let the
+        * completion_data share space with the rb_node.
+        */
+       union {
+               struct rb_node rb_node; /* sort/lookup */
+               struct bio_vec special_vec;
+               void *completion_data;
+               int error_count; /* for legacy drivers, don't use */
+       };
+
+
+       /*
+        * Three pointers are available for the IO schedulers, if they need
+        * more they have to dynamically allocate it.  Flush requests are
+        * never put on the IO scheduler. So let the flush fields share
+        * space with the elevator data.
+        */
+       union {
+               struct {
+                       struct io_cq            *icq;
+                       void                    *priv[2];
+               } elv;
+
+               struct {
+                       unsigned int            seq;
+                       struct list_head        list;
+                       rq_end_io_fn            *saved_end_io;
+               } flush;
+       };
+
+       union {
+               struct __call_single_data csd;
+               u64 fifo_time;
+       };
+
+       /*
+        * completion callback.
+        */
+       rq_end_io_fn *end_io;
+       void *end_io_data;
+};
+
+#define req_op(req) \
+       ((req)->cmd_flags & REQ_OP_MASK)
+
+static inline bool blk_rq_is_passthrough(struct request *rq)
+{
+       return blk_op_is_passthrough(req_op(rq));
+}
+
+static inline unsigned short req_get_ioprio(struct request *req)
+{
+       return req->ioprio;
+}
+
+#define rq_data_dir(rq)                (op_is_write(req_op(rq)) ? WRITE : READ)
+
+#define rq_dma_dir(rq) \
+       (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
+
+enum blk_eh_timer_return {
+       BLK_EH_DONE,            /* driver has completed the command */
+       BLK_EH_RESET_TIMER,     /* reset timer and try again */
+};
+
+#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
+#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
+
 /**
  * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
  * block device
@@ -126,9 +342,6 @@ struct blk_mq_hw_ctx {
        unsigned long           queued;
        /** @run: Number of dispatched requests. */
        unsigned long           run;
-#define BLK_MQ_MAX_DISPATCH_ORDER      7
-       /** @dispatched: Number of dispatch requests by queue. */
-       unsigned long           dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
        /** @numa_node: NUMA node the storage adapter has been connected to. */
        unsigned int            numa_node;
@@ -148,13 +361,6 @@ struct blk_mq_hw_ctx {
        /** @kobj: Kernel object for sysfs. */
        struct kobject          kobj;
 
-       /** @poll_considered: Count times blk_poll() was called. */
-       unsigned long           poll_considered;
-       /** @poll_invoked: Count how many requests blk_poll() polled. */
-       unsigned long           poll_invoked;
-       /** @poll_success: Count how many polled requests were completed. */
-       unsigned long           poll_success;
-
 #ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @debugfs_dir: debugfs directory for this hardware queue. Named
@@ -232,13 +438,11 @@ enum hctx_type {
  * @flags:        Zero or more BLK_MQ_F_* flags.
  * @driver_data:   Pointer to data owned by the block driver that created this
  *                tag set.
- * @active_queues_shared_sbitmap:
- *                number of active request queues per tag set.
- * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
- * @__breserved_tags:
- *                A shared reserved tags sbitmap, used over all hctx's
  * @tags:         Tag sets. One tag set per hardware queue. Has @nr_hw_queues
  *                elements.
+ * @shared_tags:
+ *                Shared set of tags. Has @nr_hw_queues elements. If set,
+ *                shared by all @tags.
  * @tag_list_lock: Serializes tag_list accesses.
  * @tag_list:     List of the request queues that use this tag set. See also
  *                request_queue.tag_set_list.
@@ -255,12 +459,11 @@ struct blk_mq_tag_set {
        unsigned int            timeout;
        unsigned int            flags;
        void                    *driver_data;
-       atomic_t                active_queues_shared_sbitmap;
 
-       struct sbitmap_queue    __bitmap_tags;
-       struct sbitmap_queue    __breserved_tags;
        struct blk_mq_tags      **tags;
 
+       struct blk_mq_tags      *shared_tags;
+
        struct mutex            tag_list_lock;
        struct list_head        tag_list;
 };
@@ -330,7 +533,7 @@ struct blk_mq_ops {
        /**
         * @poll: Called to poll for completion of a specific tag.
         */
-       int (*poll)(struct blk_mq_hw_ctx *);
+       int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);
 
        /**
         * @complete: Mark the request as complete.
@@ -364,11 +567,6 @@ struct blk_mq_ops {
                             unsigned int);
 
        /**
-        * @initialize_rq_fn: Called from inside blk_get_request().
-        */
-       void (*initialize_rq_fn)(struct request *rq);
-
-       /**
         * @cleanup_rq: Called before freeing one request which isn't completed
         * yet, and usually for freeing the driver private data.
         */
@@ -432,6 +630,8 @@ enum {
        ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
                << BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
+#define BLK_MQ_NO_HCTX_IDX     (-1U)
+
 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
                struct lock_class_key *lkclass);
 #define blk_mq_alloc_disk(set, queuedata)                              \
@@ -451,8 +651,6 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
                unsigned int set_flags);
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
-void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
-
 void blk_mq_free_request(struct request *rq);
 
 bool blk_mq_queue_inflight(struct request_queue *q);
@@ -471,7 +669,40 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                unsigned int op, blk_mq_req_flags_t flags,
                unsigned int hctx_idx);
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+       unsigned int nr_tags;
+       unsigned int nr_reserved_tags;
+
+       atomic_t active_queues;
+
+       struct sbitmap_queue bitmap_tags;
+       struct sbitmap_queue breserved_tags;
+
+       struct request **rqs;
+       struct request **static_rqs;
+       struct list_head page_list;
+
+       /*
+        * used to clear request reference in rqs[] before freeing one
+        * request pool
+        */
+       spinlock_t lock;
+};
+
+static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
+                                              unsigned int tag)
+{
+       if (tag < tags->nr_tags) {
+               prefetch(tags->rqs[tag]);
+               return tags->rqs[tag];
+       }
+
+       return NULL;
+}
 
 enum {
        BLK_MQ_UNIQUE_TAG_BITS = 16,
@@ -524,6 +755,35 @@ static inline void blk_mq_set_request_complete(struct request *rq)
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
+void blk_mq_end_request_batch(struct io_comp_batch *ib);
+
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
+{
+       return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
+}
+
+/*
+ * Batched completions only work when there is no I/O error and no special
+ * ->end_io handler.
+ */
+static inline bool blk_mq_add_to_batch(struct request *req,
+                                      struct io_comp_batch *iob, int ioerror,
+                                      void (*complete)(struct io_comp_batch *))
+{
+       if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
+               return false;
+       if (!iob->complete)
+               iob->complete = complete;
+       else if (iob->complete != complete)
+               return false;
+       iob->need_ts |= blk_mq_need_time_stamp(req);
+       rq_list_add(&iob->req_list, req);
+       return true;
+}
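/*
 * Editorial sketch, not part of this patch: how a driver's ->poll handler
 * might use the batching helpers above.  foo_poll_one_completion() and the
 * per-request teardown are hypothetical stand-ins for driver specifics.
 */
static struct request *foo_poll_one_completion(struct blk_mq_hw_ctx *hctx)
{
	/* A real driver would consume its hardware completion queue here. */
	return NULL;
}

static void foo_complete_batch(struct io_comp_batch *iob)
{
	/* driver-private per-request teardown would go here, then: */
	blk_mq_end_request_batch(iob);
}

static int foo_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct request *req;
	int found = 0;

	while ((req = foo_poll_one_completion(hctx)) != NULL) {
		found++;
		/* Fall back to one-by-one completion when batching is refused. */
		if (!blk_mq_add_to_batch(req, iob, 0, foo_complete_batch))
			blk_mq_end_request(req, BLK_STS_OK);
	}
	return found;
}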
 
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
@@ -605,16 +865,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
        for ((i) = 0; (i) < (hctx)->nr_ctx &&                           \
             ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
 
-static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
-               struct request *rq)
-{
-       if (rq->tag != -1)
-               return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
-
-       return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
-                       BLK_QC_T_INTERNAL;
-}
-
 static inline void blk_mq_cleanup_rq(struct request *rq)
 {
        if (rq->q->mq_ops->cleanup_rq)
@@ -633,8 +883,265 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
                rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
-blk_qc_t blk_mq_submit_bio(struct bio *bio);
 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
                struct lock_class_key *key);
 
+static inline bool rq_is_sync(struct request *rq)
+{
+       return op_is_sync(rq->cmd_flags);
+}
+
+void blk_rq_init(struct request_queue *q, struct request *rq);
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+               struct bio_set *bs, gfp_t gfp_mask,
+               int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
+void blk_rq_unprep_clone(struct request *rq);
+blk_status_t blk_insert_cloned_request(struct request_queue *q,
+               struct request *rq);
+
+struct rq_map_data {
+       struct page **pages;
+       int page_order;
+       int nr_entries;
+       unsigned long offset;
+       int null_mapped;
+       int from_user;
+};
+
+int blk_rq_map_user(struct request_queue *, struct request *,
+               struct rq_map_data *, void __user *, unsigned long, gfp_t);
+int blk_rq_map_user_iov(struct request_queue *, struct request *,
+               struct rq_map_data *, const struct iov_iter *, gfp_t);
+int blk_rq_unmap_user(struct bio *);
+int blk_rq_map_kern(struct request_queue *, struct request *, void *,
+               unsigned int, gfp_t);
+int blk_rq_append_bio(struct request *rq, struct bio *bio);
+void blk_execute_rq_nowait(struct gendisk *, struct request *, int,
+               rq_end_io_fn *);
+blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
+               int at_head);
+
+struct req_iterator {
+       struct bvec_iter iter;
+       struct bio *bio;
+};
+
+#define __rq_for_each_bio(_bio, rq)    \
+       if ((rq->bio))                  \
+               for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
+
+#define rq_for_each_segment(bvl, _rq, _iter)                   \
+       __rq_for_each_bio(_iter.bio, _rq)                       \
+               bio_for_each_segment(bvl, _iter.bio, _iter.iter)
+
+#define rq_for_each_bvec(bvl, _rq, _iter)                      \
+       __rq_for_each_bio(_iter.bio, _rq)                       \
+               bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
+
+#define rq_iter_last(bvec, _iter)                              \
+               (_iter.bio->bi_next == NULL &&                  \
+                bio_iter_last(bvec, _iter.iter))
+
+/*
+ * blk_rq_pos()                        : the current sector
+ * blk_rq_bytes()              : bytes left in the entire request
+ * blk_rq_cur_bytes()          : bytes left in the current segment
+ * blk_rq_err_bytes()          : bytes left till the next error boundary
+ * blk_rq_sectors()            : sectors left in the entire request
+ * blk_rq_cur_sectors()                : sectors left in the current segment
+ * blk_rq_stats_sectors()      : sectors of the entire request used for stats
+ */
+static inline sector_t blk_rq_pos(const struct request *rq)
+{
+       return rq->__sector;
+}
+
+static inline unsigned int blk_rq_bytes(const struct request *rq)
+{
+       return rq->__data_len;
+}
+
+static inline int blk_rq_cur_bytes(const struct request *rq)
+{
+       if (!rq->bio)
+               return 0;
+       if (!bio_has_data(rq->bio))     /* dataless requests such as discard */
+               return rq->bio->bi_iter.bi_size;
+       return bio_iovec(rq->bio).bv_len;
+}
+
+unsigned int blk_rq_err_bytes(const struct request *rq);
+
+static inline unsigned int blk_rq_sectors(const struct request *rq)
+{
+       return blk_rq_bytes(rq) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
+{
+       return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
+{
+       return rq->stats_sectors;
+}
+
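/*
 * Editorial sketch, not part of this patch: typical use of the accessors
 * above when a driver translates a request into a hardware command.  The
 * foo_cmd layout is hypothetical.
 */
struct foo_cmd {
	u64 lba;
	u32 nr_sectors;
};

static void foo_setup_rw_cmd(struct request *rq, struct foo_cmd *cmd)
{
	cmd->lba = blk_rq_pos(rq);		/* current 512-byte sector */
	cmd->nr_sectors = blk_rq_sectors(rq);	/* sectors left in the request */
}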
+/*
+ * Some commands like WRITE SAME have a payload or data transfer size which
+ * is different from the size of the request.  Any driver that supports such
+ * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
+ * calculate the data transfer size.
+ */
+static inline unsigned int blk_rq_payload_bytes(struct request *rq)
+{
+       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+               return rq->special_vec.bv_len;
+       return blk_rq_bytes(rq);
+}
+
+/*
+ * Return the first full biovec in the request.  The caller needs to check that
+ * there are any bvecs before calling this helper.
+ */
+static inline struct bio_vec req_bvec(struct request *rq)
+{
+       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+               return rq->special_vec;
+       return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
+}
+
+static inline unsigned int blk_rq_count_bios(struct request *rq)
+{
+       unsigned int nr_bios = 0;
+       struct bio *bio;
+
+       __rq_for_each_bio(bio, rq)
+               nr_bios++;
+
+       return nr_bios;
+}
+
+void blk_steal_bios(struct bio_list *list, struct request *rq);
+
+/*
+ * Request completion related functions.
+ *
+ * blk_update_request() completes the given number of bytes and updates
+ * the request without completing it.
+ */
+bool blk_update_request(struct request *rq, blk_status_t error,
+                              unsigned int nr_bytes);
+void blk_abort_request(struct request *);
+
+/*
+ * Number of physical segments as sent to the device.
+ *
+ * Normally this is the number of discontiguous data segments sent by the
+ * submitter.  But for a data-less command like discard we might have no
+ * actual data segments submitted, yet the driver might have to add its
+ * own special payload.  In that case we still return 1 here so that this
+ * special payload will be mapped.
+ */
+static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
+{
+       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+               return 1;
+       return rq->nr_phys_segments;
+}
+
+/*
+ * Number of discard segments (or ranges) the driver needs to fill in.
+ * Each discard bio merged into a request is counted as one segment.
+ */
+static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
+{
+       return max_t(unsigned short, rq->nr_phys_segments, 1);
+}
+
+int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
+               struct scatterlist *sglist, struct scatterlist **last_sg);
+static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
+               struct scatterlist *sglist)
+{
+       struct scatterlist *last_sg = NULL;
+
+       return __blk_rq_map_sg(q, rq, sglist, &last_sg);
+}
+void blk_dump_rq_flags(struct request *, char *);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline unsigned int blk_rq_zone_no(struct request *rq)
+{
+       return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+}
+
+static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+{
+       return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+}
+
+bool blk_req_needs_zone_write_lock(struct request *rq);
+bool blk_req_zone_write_trylock(struct request *rq);
+void __blk_req_zone_write_lock(struct request *rq);
+void __blk_req_zone_write_unlock(struct request *rq);
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+       if (blk_req_needs_zone_write_lock(rq))
+               __blk_req_zone_write_lock(rq);
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+       if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+               __blk_req_zone_write_unlock(rq);
+}
+
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+       return rq->q->seq_zones_wlock &&
+               test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+       if (!blk_req_needs_zone_write_lock(rq))
+               return true;
+       return !blk_req_zone_is_write_locked(rq);
+}
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+       return false;
+}
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+}
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+       return false;
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+       return true;
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
+
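/*
 * Editorial sketch, not part of this patch: how a dispatch/completion pair
 * might use the zone write lock helpers above to keep writes to a
 * sequential zone ordered.  Hardware submission is elided.
 */
static blk_status_t foo_queue_rq(struct request *rq)
{
	/* Let the scheduler retry later if the target zone is write-locked. */
	if (!blk_req_can_dispatch_to_zone(rq))
		return BLK_STS_RESOURCE;

	blk_req_zone_write_lock(rq);
	/* issue rq to hardware here */
	return BLK_STS_OK;
}

static void foo_complete_rq(struct request *rq)
{
	blk_req_zone_write_unlock(rq);
	blk_mq_end_request(rq, BLK_STS_OK);
}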
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+# error        "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
 #endif
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void rq_flush_dcache_pages(struct request *rq);
+#else
+static inline void rq_flush_dcache_pages(struct request *rq)
+{
+}
+#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
+#endif /* BLK_MQ_H */
index be622b5..fe065c3 100644 (file)
@@ -20,8 +20,26 @@ struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
 struct bio_crypt_ctx;
 
+/*
+ * The basic unit of block I/O is a sector. It is used in a number of contexts
+ * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9
+ * bytes. Variables of type sector_t represent an offset or size that is a
+ * multiple of 512 bytes. Hence these two constants.
+ */
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+#ifndef SECTOR_SIZE
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+#endif
+
+#define PAGE_SECTORS_SHIFT     (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_MASK            (PAGE_SECTORS - 1)
+
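/*
 * Editorial worked example, not part of this patch: with SECTOR_SHIFT = 9
 * and 4 KiB pages, PAGE_SECTORS_SHIFT = 12 - 9 = 3, PAGE_SECTORS = 8 and
 * SECTOR_MASK = 7; a 1 MiB transfer therefore covers
 * 1048576 >> SECTOR_SHIFT = 2048 sectors.
 */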
 struct block_device {
        sector_t                bd_start_sect;
+       sector_t                bd_nr_sectors;
        struct disk_stats __percpu *bd_stats;
        unsigned long           bd_stamp;
        bool                    bd_read_only;   /* read-only policy */
@@ -38,6 +56,7 @@ struct block_device {
        u8                      bd_partno;
        spinlock_t              bd_size_lock; /* for bd_inode->i_size updates */
        struct gendisk *        bd_disk;
+       struct request_queue *  bd_queue;
 
        /* The counter of freeze processes */
        int                     bd_fsfreeze_count;
@@ -208,6 +227,9 @@ static inline void bio_issue_init(struct bio_issue *issue,
                        ((u64)size << BIO_ISSUE_SIZE_SHIFT));
 }
 
+typedef unsigned int blk_qc_t;
+#define BLK_QC_T_NONE          -1U
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -227,8 +249,8 @@ struct bio {
 
        struct bvec_iter        bi_iter;
 
+       blk_qc_t                bi_cookie;
        bio_end_io_t            *bi_end_io;
-
        void                    *bi_private;
 #ifdef CONFIG_BLK_CGROUP
        /*
@@ -384,7 +406,7 @@ enum req_flag_bits {
        /* command specific flags for REQ_OP_WRITE_ZEROES: */
        __REQ_NOUNMAP,          /* do not free blocks when zeroing */
 
-       __REQ_HIPRI,
+       __REQ_POLLED,           /* caller polls for completion using bio_poll */
 
        /* for driver use */
        __REQ_DRV,
@@ -409,7 +431,7 @@ enum req_flag_bits {
 #define REQ_CGROUP_PUNT                (1ULL << __REQ_CGROUP_PUNT)
 
 #define REQ_NOUNMAP            (1ULL << __REQ_NOUNMAP)
-#define REQ_HIPRI              (1ULL << __REQ_HIPRI)
+#define REQ_POLLED             (1ULL << __REQ_POLLED)
 
 #define REQ_DRV                        (1ULL << __REQ_DRV)
 #define REQ_SWAP               (1ULL << __REQ_SWAP)
@@ -431,8 +453,6 @@ enum stat_group {
 
 #define bio_op(bio) \
        ((bio)->bi_opf & REQ_OP_MASK)
-#define req_op(req) \
-       ((req)->cmd_flags & REQ_OP_MASK)
 
 /* obsolete, don't use in new code */
 static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
@@ -497,31 +517,6 @@ static inline int op_stat_group(unsigned int op)
        return op_is_write(op);
 }
 
-typedef unsigned int blk_qc_t;
-#define BLK_QC_T_NONE          -1U
-#define BLK_QC_T_SHIFT         16
-#define BLK_QC_T_INTERNAL      (1U << 31)
-
-static inline bool blk_qc_t_valid(blk_qc_t cookie)
-{
-       return cookie != BLK_QC_T_NONE;
-}
-
-static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
-{
-       return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
-}
-
-static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
-{
-       return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
-}
-
-static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
-{
-       return (cookie & BLK_QC_T_INTERNAL) != 0;
-}
-
 struct blk_rq_stat {
        u64 mean;
        u64 min;
index 12b9dbc..bd4370b 100644 (file)
@@ -3,8 +3,6 @@
 #define _LINUX_BLKDEV_H
 
 #include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/major.h>
 #include <linux/genhd.h>
 #include <linux/list.h>
 #include <linux/llist.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/wait.h>
-#include <linux/mempool.h>
-#include <linux/pfn.h>
 #include <linux/bio.h>
-#include <linux/stringify.h>
 #include <linux/gfp.h>
-#include <linux/smp.h>
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
-#include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
-#include <linux/pm.h>
 #include <linux/sbitmap.h>
 
 struct module;
@@ -33,14 +25,12 @@ struct request;
 struct sg_io_hdr;
 struct blkcg_gq;
 struct blk_flush_queue;
+struct kiocb;
 struct pr_ops;
 struct rq_qos;
 struct blk_queue_stats;
 struct blk_stat_callback;
-struct blk_keyslot_manager;
-
-#define BLKDEV_MIN_RQ  4
-#define BLKDEV_MAX_RQ  128     /* Default maximum */
+struct blk_crypto_profile;
 
 /* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
@@ -54,186 +44,13 @@ struct blk_keyslot_manager;
  */
 #define BLKCG_MAX_POLS         6
 
-typedef void (rq_end_io_fn)(struct request *, blk_status_t);
-
-/*
- * request flags */
-typedef __u32 __bitwise req_flags_t;
-
-/* drive already may have started this one */
-#define RQF_STARTED            ((__force req_flags_t)(1 << 1))
-/* may not be passed by ioscheduler */
-#define RQF_SOFTBARRIER                ((__force req_flags_t)(1 << 3))
-/* request for flush sequence */
-#define RQF_FLUSH_SEQ          ((__force req_flags_t)(1 << 4))
-/* merge of different types, fail separately */
-#define RQF_MIXED_MERGE                ((__force req_flags_t)(1 << 5))
-/* track inflight for MQ */
-#define RQF_MQ_INFLIGHT                ((__force req_flags_t)(1 << 6))
-/* don't call prep for this one */
-#define RQF_DONTPREP           ((__force req_flags_t)(1 << 7))
-/* vaguely specified driver internal error.  Ignored by the block layer */
-#define RQF_FAILED             ((__force req_flags_t)(1 << 10))
-/* don't warn about errors */
-#define RQF_QUIET              ((__force req_flags_t)(1 << 11))
-/* elevator private data attached */
-#define RQF_ELVPRIV            ((__force req_flags_t)(1 << 12))
-/* account into disk and partition IO statistics */
-#define RQF_IO_STAT            ((__force req_flags_t)(1 << 13))
-/* runtime pm request */
-#define RQF_PM                 ((__force req_flags_t)(1 << 15))
-/* on IO scheduler merge hash */
-#define RQF_HASHED             ((__force req_flags_t)(1 << 16))
-/* track IO completion time */
-#define RQF_STATS              ((__force req_flags_t)(1 << 17))
-/* Look at ->special_vec for the actual data payload instead of the
-   bio chain. */
-#define RQF_SPECIAL_PAYLOAD    ((__force req_flags_t)(1 << 18))
-/* The per-zone write lock is held for this request */
-#define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT      ((__force req_flags_t)(1 << 20))
-/* ->timeout has been called, don't expire again */
-#define RQF_TIMED_OUT          ((__force req_flags_t)(1 << 21))
-
-/* flags that prevent us from merging requests: */
-#define RQF_NOMERGE_FLAGS \
-       (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
-
-/*
- * Request state for blk-mq.
- */
-enum mq_rq_state {
-       MQ_RQ_IDLE              = 0,
-       MQ_RQ_IN_FLIGHT         = 1,
-       MQ_RQ_COMPLETE          = 2,
-};
-
-/*
- * Try to put the fields that are referenced together in the same cacheline.
- *
- * If you modify this structure, make sure to update blk_rq_init() and
- * especially blk_mq_rq_ctx_init() to take care of the added fields.
- */
-struct request {
-       struct request_queue *q;
-       struct blk_mq_ctx *mq_ctx;
-       struct blk_mq_hw_ctx *mq_hctx;
-
-       unsigned int cmd_flags;         /* op and common flags */
-       req_flags_t rq_flags;
-
-       int tag;
-       int internal_tag;
-
-       /* the following two fields are internal, NEVER access directly */
-       unsigned int __data_len;        /* total data len */
-       sector_t __sector;              /* sector cursor */
-
-       struct bio *bio;
-       struct bio *biotail;
-
-       struct list_head queuelist;
-
-       /*
-        * The hash is used inside the scheduler, and killed once the
-        * request reaches the dispatch list. The ipi_list is only used
-        * to queue the request for softirq completion, which is long
-        * after the request has been unhashed (and even removed from
-        * the dispatch list).
-        */
-       union {
-               struct hlist_node hash; /* merge hash */
-               struct llist_node ipi_list;
-       };
-
-       /*
-        * The rb_node is only used inside the io scheduler, requests
-        * are pruned when moved to the dispatch queue. So let the
-        * completion_data share space with the rb_node.
-        */
-       union {
-               struct rb_node rb_node; /* sort/lookup */
-               struct bio_vec special_vec;
-               void *completion_data;
-               int error_count; /* for legacy drivers, don't use */
-       };
-
-       /*
-        * Three pointers are available for the IO schedulers, if they need
-        * more they have to dynamically allocate it.  Flush requests are
-        * never put on the IO scheduler. So let the flush fields share
-        * space with the elevator data.
-        */
-       union {
-               struct {
-                       struct io_cq            *icq;
-                       void                    *priv[2];
-               } elv;
-
-               struct {
-                       unsigned int            seq;
-                       struct list_head        list;
-                       rq_end_io_fn            *saved_end_io;
-               } flush;
-       };
-
-       struct gendisk *rq_disk;
-       struct block_device *part;
-#ifdef CONFIG_BLK_RQ_ALLOC_TIME
-       /* Time that the first bio started allocating this request. */
-       u64 alloc_time_ns;
-#endif
-       /* Time that this request was allocated for this IO. */
-       u64 start_time_ns;
-       /* Time that I/O was submitted to the device. */
-       u64 io_start_time_ns;
-
-#ifdef CONFIG_BLK_WBT
-       unsigned short wbt_flags;
-#endif
-       /*
-        * rq sectors used for blk stats. It has the same value
-        * with blk_rq_sectors(rq), except that it never be zeroed
-        * by completion.
-        */
-       unsigned short stats_sectors;
-
-       /*
-        * Number of scatter-gather DMA addr+len pairs after
-        * physical address coalescing is performed.
-        */
-       unsigned short nr_phys_segments;
-
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-       unsigned short nr_integrity_segments;
-#endif
-
-#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-       struct bio_crypt_ctx *crypt_ctx;
-       struct blk_ksm_keyslot *crypt_keyslot;
-#endif
-
-       unsigned short write_hint;
-       unsigned short ioprio;
-
-       enum mq_rq_state state;
-       refcount_t ref;
-
-       unsigned int timeout;
-       unsigned long deadline;
-
-       union {
-               struct __call_single_data csd;
-               u64 fifo_time;
-       };
+static inline int blk_validate_block_size(unsigned int bsize)
+{
+       if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
+               return -EINVAL;
 
-       /*
-        * completion callback.
-        */
-       rq_end_io_fn *end_io;
-       void *end_io_data;
-};
+       return 0;
+}
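
blk_validate_block_size() gives drivers one place to reject block sizes that are below 512 bytes, above PAGE_SIZE, or not a power of two. A minimal sketch of how a driver might use it when accepting a user-configured block size; struct my_dev and my_set_block_size() are hypothetical names, only the two blk_* calls are real kernel API:

struct my_dev {                                 /* hypothetical driver state */
        struct gendisk *disk;
};

static int my_set_block_size(struct my_dev *dev, unsigned int bsize)
{
        int err;

        /* rejects bsize < 512, bsize > PAGE_SIZE and non-power-of-two sizes */
        err = blk_validate_block_size(bsize);
        if (err)
                return err;

        blk_queue_logical_block_size(dev->disk->queue, bsize);
        return 0;
}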
 
 static inline bool blk_op_is_passthrough(unsigned int op)
 {
@@ -241,35 +58,6 @@ static inline bool blk_op_is_passthrough(unsigned int op)
        return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
 }
 
-static inline bool blk_rq_is_passthrough(struct request *rq)
-{
-       return blk_op_is_passthrough(req_op(rq));
-}
-
-static inline unsigned short req_get_ioprio(struct request *req)
-{
-       return req->ioprio;
-}
-
-#include <linux/elevator.h>
-
-struct blk_queue_ctx;
-
-struct bio_vec;
-
-enum blk_eh_timer_return {
-       BLK_EH_DONE,            /* drivers has completed the command */
-       BLK_EH_RESET_TIMER,     /* reset timer and try again */
-};
-
-enum blk_queue_state {
-       Queue_down,
-       Queue_up,
-};
-
-#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
-#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
-
 /*
  * Zoned block device models (zoned limit).
  *
@@ -370,6 +158,34 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
 
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+/*
+ * Independent access ranges: struct blk_independent_access_range describes
+ * a range of contiguous sectors that can be accessed using device command
+ * execution resources that are independent from the resources used for
+ * other access ranges. This is typically found with single-LUN multi-actuator
+ * HDDs where each access range is served by a different set of heads.
+ * The set of independent ranges supported by the device is defined using
+ * struct blk_independent_access_ranges. The independent ranges must not overlap
+ * and must include all sectors within the disk capacity (no sector holes
+ * allowed).
+ * For a device with multiple ranges, requests targeting sectors in different
+ * ranges can be executed in parallel. A request can straddle an access range
+ * boundary.
+ */
+struct blk_independent_access_range {
+       struct kobject          kobj;
+       struct request_queue    *queue;
+       sector_t                sector;
+       sector_t                nr_sectors;
+};
+
+struct blk_independent_access_ranges {
+       struct kobject                          kobj;
+       bool                                    sysfs_registered;
+       unsigned int                            nr_ia_ranges;
+       struct blk_independent_access_range     ia_range[];
+};
+
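
Independent access ranges are exposed to drivers through disk_alloc_independent_access_ranges() and disk_set_independent_access_ranges(), declared later in this header. A sketch of a hypothetical dual-actuator driver splitting its capacity into two non-overlapping ranges; my_register_access_ranges() is a made-up name:

/* Sketch: advertise two non-overlapping ranges covering the whole disk. */
static int my_register_access_ranges(struct gendisk *disk, sector_t capacity)
{
        struct blk_independent_access_ranges *iars;

        iars = disk_alloc_independent_access_ranges(disk, 2);
        if (!iars)
                return -ENOMEM;

        iars->ia_range[0].sector = 0;
        iars->ia_range[0].nr_sectors = capacity / 2;
        iars->ia_range[1].sector = capacity / 2;
        iars->ia_range[1].nr_sectors = capacity - capacity / 2;

        disk_set_independent_access_ranges(disk, iars);
        return 0;
}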
 struct request_queue {
        struct request          *last_merge;
        struct elevator_queue   *elevator;
@@ -444,8 +260,7 @@ struct request_queue {
        unsigned int            dma_alignment;
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-       /* Inline crypto capabilities */
-       struct blk_keyslot_manager *ksm;
+       struct blk_crypto_profile *crypto_profile;
 #endif
 
        unsigned int            rq_timeout;
@@ -457,10 +272,9 @@ struct request_queue {
        struct timer_list       timeout;
        struct work_struct      timeout_work;
 
-       atomic_t                nr_active_requests_shared_sbitmap;
+       atomic_t                nr_active_requests_shared_tags;
 
-       struct sbitmap_queue    sched_bitmap_tags;
-       struct sbitmap_queue    sched_breserved_tags;
+       struct blk_mq_tags      *sched_shared_tags;
 
        struct list_head        icq_list;
 #ifdef CONFIG_BLK_CGROUP
@@ -536,6 +350,8 @@ struct request_queue {
         */
        struct mutex            mq_freeze_lock;
 
+       int                     quiesce_depth;
+
        struct blk_mq_tag_set   *tag_set;
        struct list_head        tag_set_list;
        struct bio_set          bio_split;
@@ -549,10 +365,14 @@ struct request_queue {
 
        bool                    mq_sysfs_init_done;
 
-       size_t                  cmd_size;
-
 #define BLK_MAX_WRITE_HINTS    5
        u64                     write_hints[BLK_MAX_WRITE_HINTS];
+
+       /*
+        * Independent sector access ranges. This is always NULL for
+        * devices that do not have multiple independent access ranges.
+        */
+       struct blk_independent_access_ranges *ia_ranges;
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
@@ -579,7 +399,6 @@ struct request_queue {
 #define QUEUE_FLAG_STATS       20      /* track IO start and completion times */
 #define QUEUE_FLAG_POLL_STATS  21      /* collecting stats for hybrid polling */
 #define QUEUE_FLAG_REGISTERED  22      /* queue has been registered to a disk */
-#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */
 #define QUEUE_FLAG_QUIESCED    24      /* queue has been quiesced */
 #define QUEUE_FLAG_PCI_P2PDMA  25      /* device supports PCI p2p requests */
 #define QUEUE_FLAG_ZONE_RESETALL 26    /* supports Zone Reset All */
@@ -613,8 +432,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_secure_erase(q) \
        (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
 #define blk_queue_dax(q)       test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
-#define blk_queue_scsi_passthrough(q)  \
-       test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
 #define blk_queue_pci_p2pdma(q)        \
        test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
@@ -638,11 +455,6 @@ extern void blk_clear_pm_only(struct request_queue *q);
 
 #define list_entry_rq(ptr)     list_entry((ptr), struct request, queuelist)
 
-#define rq_data_dir(rq)                (op_is_write(req_op(rq)) ? WRITE : READ)
-
-#define rq_dma_dir(rq) \
-       (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
-
 #define dma_map_bvec(dev, bv, dir, attrs) \
        dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
        (dir), (attrs))
@@ -758,42 +570,6 @@ static inline unsigned int queue_max_active_zones(const struct request_queue *q)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-static inline bool rq_is_sync(struct request *rq)
-{
-       return op_is_sync(rq->cmd_flags);
-}
-
-static inline bool rq_mergeable(struct request *rq)
-{
-       if (blk_rq_is_passthrough(rq))
-               return false;
-
-       if (req_op(rq) == REQ_OP_FLUSH)
-               return false;
-
-       if (req_op(rq) == REQ_OP_WRITE_ZEROES)
-               return false;
-
-       if (req_op(rq) == REQ_OP_ZONE_APPEND)
-               return false;
-
-       if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
-               return false;
-       if (rq->rq_flags & RQF_NOMERGE_FLAGS)
-               return false;
-
-       return true;
-}
-
-static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
-{
-       if (bio_page(a) == bio_page(b) &&
-           bio_offset(a) == bio_offset(b))
-               return true;
-
-       return false;
-}
-
 static inline unsigned int blk_queue_depth(struct request_queue *q)
 {
        if (q->queue_depth)
@@ -808,83 +584,20 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ)
 #define BLK_MIN_SG_TIMEOUT     (7 * HZ)
 
-struct rq_map_data {
-       struct page **pages;
-       int page_order;
-       int nr_entries;
-       unsigned long offset;
-       int null_mapped;
-       int from_user;
-};
-
-struct req_iterator {
-       struct bvec_iter iter;
-       struct bio *bio;
-};
-
 /* This should not be used directly - use rq_for_each_segment */
 #define for_each_bio(_bio)             \
        for (; _bio; _bio = _bio->bi_next)
-#define __rq_for_each_bio(_bio, rq)    \
-       if ((rq->bio))                  \
-               for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
 
-#define rq_for_each_segment(bvl, _rq, _iter)                   \
-       __rq_for_each_bio(_iter.bio, _rq)                       \
-               bio_for_each_segment(bvl, _iter.bio, _iter.iter)
-
-#define rq_for_each_bvec(bvl, _rq, _iter)                      \
-       __rq_for_each_bio(_iter.bio, _rq)                       \
-               bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
-
-#define rq_iter_last(bvec, _iter)                              \
-               (_iter.bio->bi_next == NULL &&                  \
-                bio_iter_last(bvec, _iter.iter))
-
-#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-# error        "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
-#endif
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-extern void rq_flush_dcache_pages(struct request *rq);
-#else
-static inline void rq_flush_dcache_pages(struct request *rq)
-{
-}
-#endif
 
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
-blk_qc_t submit_bio_noacct(struct bio *bio);
-extern void blk_rq_init(struct request_queue *q, struct request *rq);
-extern void blk_put_request(struct request *);
-extern struct request *blk_get_request(struct request_queue *, unsigned int op,
-                                      blk_mq_req_flags_t flags);
+void submit_bio_noacct(struct bio *bio);
+
 extern int blk_lld_busy(struct request_queue *q);
-extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-                            struct bio_set *bs, gfp_t gfp_mask,
-                            int (*bio_ctr)(struct bio *, struct bio *, void *),
-                            void *data);
-extern void blk_rq_unprep_clone(struct request *rq);
-extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
-                                    struct request *rq);
-int blk_rq_append_bio(struct request *rq, struct bio *bio);
 extern void blk_queue_split(struct bio **);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
-extern int blk_rq_map_user(struct request_queue *, struct request *,
-                          struct rq_map_data *, void __user *, unsigned long,
-                          gfp_t);
-extern int blk_rq_unmap_user(struct bio *);
-extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
-extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
-                              struct rq_map_data *, const struct iov_iter *,
-                              gfp_t);
-extern void blk_execute_rq_nowait(struct gendisk *,
-                                 struct request *, int, rq_end_io_fn *);
-
-blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
-                           int at_head);
 
 /* Helper to convert REQ_OP_XXX to its string format XXX */
 extern const char *blk_op_str(unsigned int op);
@@ -892,69 +605,17 @@ extern const char *blk_op_str(unsigned int op);
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
+/* only poll the hardware once, don't continue until a completion was found */
+#define BLK_POLL_ONESHOT               (1 << 0)
+/* do not sleep to wait for the expected completion time */
+#define BLK_POLL_NOSLEEP               (1 << 1)
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+                       unsigned int flags);
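
bio_poll() and iocb_bio_iopoll() replace the old cookie-based blk_poll() interface: polling is now keyed off the bio (or kiocb) itself and can hand completions back through an optional io_comp_batch. A rough sketch of a synchronous caller busy-polling for its own completion; my_wait_for_bio() and the done flag (assumed to be set by the bio's ->bi_end_io) are hypothetical, and real users would also manage task state and sleep when polling finds nothing:

/* 'done' is assumed to be set to true by the bio's ->bi_end_io handler. */
static void my_wait_for_bio(struct bio *bio, bool *done)
{
        while (!READ_ONCE(*done)) {
                /* poll the hardware once; otherwise just relax the CPU */
                if (!bio_poll(bio, NULL, BLK_POLL_ONESHOT))
                        cpu_relax();
        }
}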
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
-       return bdev->bd_disk->queue;    /* this is never NULL */
-}
-
-/*
- * The basic unit of block I/O is a sector. It is used in a number of contexts
- * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9
- * bytes. Variables of type sector_t represent an offset or size that is a
- * multiple of 512 bytes. Hence these two constants.
- */
-#ifndef SECTOR_SHIFT
-#define SECTOR_SHIFT 9
-#endif
-#ifndef SECTOR_SIZE
-#define SECTOR_SIZE (1 << SECTOR_SHIFT)
-#endif
-
-#define PAGE_SECTORS_SHIFT     (PAGE_SHIFT - SECTOR_SHIFT)
-#define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
-#define SECTOR_MASK            (PAGE_SECTORS - 1)
-
-/*
- * blk_rq_pos()                        : the current sector
- * blk_rq_bytes()              : bytes left in the entire request
- * blk_rq_cur_bytes()          : bytes left in the current segment
- * blk_rq_err_bytes()          : bytes left till the next error boundary
- * blk_rq_sectors()            : sectors left in the entire request
- * blk_rq_cur_sectors()                : sectors left in the current segment
- * blk_rq_stats_sectors()      : sectors of the entire request used for stats
- */
-static inline sector_t blk_rq_pos(const struct request *rq)
-{
-       return rq->__sector;
-}
-
-static inline unsigned int blk_rq_bytes(const struct request *rq)
-{
-       return rq->__data_len;
-}
-
-static inline int blk_rq_cur_bytes(const struct request *rq)
-{
-       return rq->bio ? bio_cur_bytes(rq->bio) : 0;
-}
-
-extern unsigned int blk_rq_err_bytes(const struct request *rq);
-
-static inline unsigned int blk_rq_sectors(const struct request *rq)
-{
-       return blk_rq_bytes(rq) >> SECTOR_SHIFT;
-}
-
-static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
-{
-       return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
-}
-
-static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
-{
-       return rq->stats_sectors;
+       return bdev->bd_queue;  /* this is never NULL */
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -973,42 +634,8 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio)
        return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
                                     bio->bi_iter.bi_sector);
 }
-
-static inline unsigned int blk_rq_zone_no(struct request *rq)
-{
-       return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
-}
-
-static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
-{
-       return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
-}
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-/*
- * Some commands like WRITE SAME have a payload or data transfer size which
- * is different from the size of the request.  Any driver that supports such
- * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
- * calculate the data transfer size.
- */
-static inline unsigned int blk_rq_payload_bytes(struct request *rq)
-{
-       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-               return rq->special_vec.bv_len;
-       return blk_rq_bytes(rq);
-}
-
-/*
- * Return the first full biovec in the request.  The caller needs to check that
- * there are any bvecs before calling this helper.
- */
-static inline struct bio_vec req_bvec(struct request *rq)
-{
-       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-               return rq->special_vec;
-       return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
-}
-
 static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
                                                     int op)
 {
@@ -1048,47 +675,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q,
        return min(q->limits.max_sectors, chunk_sectors);
 }
 
-static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
-                                                 sector_t offset)
-{
-       struct request_queue *q = rq->q;
-
-       if (blk_rq_is_passthrough(rq))
-               return q->limits.max_hw_sectors;
-
-       if (!q->limits.chunk_sectors ||
-           req_op(rq) == REQ_OP_DISCARD ||
-           req_op(rq) == REQ_OP_SECURE_ERASE)
-               return blk_queue_get_max_sectors(q, req_op(rq));
-
-       return min(blk_max_size_offset(q, offset, 0),
-                       blk_queue_get_max_sectors(q, req_op(rq)));
-}
-
-static inline unsigned int blk_rq_count_bios(struct request *rq)
-{
-       unsigned int nr_bios = 0;
-       struct bio *bio;
-
-       __rq_for_each_bio(bio, rq)
-               nr_bios++;
-
-       return nr_bios;
-}
-
-void blk_steal_bios(struct bio_list *list, struct request *rq);
-
-/*
- * Request completion related functions.
- *
- * blk_update_request() completes given number of bytes and updates
- * the request without completing it.
- */
-extern bool blk_update_request(struct request *rq, blk_status_t error,
-                              unsigned int nr_bytes);
-
-extern void blk_abort_request(struct request *);
-
 /*
  * Access functions for manipulating queue properties
  */
@@ -1133,46 +719,24 @@ extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
-extern void blk_queue_required_elevator_features(struct request_queue *q,
-                                                unsigned int features);
-extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
-                                             struct device *dev);
 
-/*
- * Number of physical segments as sent to the device.
- *
- * Normally this is the number of discontiguous data segments sent by the
- * submitter.  But for data-less command like discard we might have no
- * actual data segments submitted, but the driver might have to add it's
- * own special payload.  In that case we still return 1 here so that this
- * special payload will be mapped.
- */
-static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
-{
-       if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
-               return 1;
-       return rq->nr_phys_segments;
-}
+struct blk_independent_access_ranges *
+disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
+void disk_set_independent_access_ranges(struct gendisk *disk,
+                               struct blk_independent_access_ranges *iars);
 
 /*
- * Number of discard segments (or ranges) the driver needs to fill in.
- * Each discard bio merged into a request is counted as one segment.
+ * Elevator features for blk_queue_required_elevator_features:
  */
-static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
-{
-       return max_t(unsigned short, rq->nr_phys_segments, 1);
-}
-
-int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
-               struct scatterlist *sglist, struct scatterlist **last_sg);
-static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
-               struct scatterlist *sglist)
-{
-       struct scatterlist *last_sg = NULL;
+/* Supports zoned block devices sequential write constraint */
+#define ELEVATOR_F_ZBD_SEQ_WRITE       (1U << 0)
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE            (1U << 1)
 
-       return __blk_rq_map_sg(q, rq, sglist, &last_sg);
-}
-extern void blk_dump_rq_flags(struct request *, char *);
+extern void blk_queue_required_elevator_features(struct request_queue *q,
+                                                unsigned int features);
+extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
+                                             struct device *dev);
 
 bool __must_check blk_get_queue(struct request_queue *);
 extern void blk_put_queue(struct request_queue *);
@@ -1187,19 +751,24 @@ extern void blk_set_queue_dying(struct request_queue *);
  * as the lock contention for request_queue lock is reduced.
  *
  * It is ok not to disable preemption when adding the request to the plug list
- * or when attempting a merge, because blk_schedule_flush_list() will only flush
- * the plug list when the task sleeps by itself. For details, please see
- * schedule() where blk_schedule_flush_plug() is called.
+ * or when attempting a merge. For details, please see schedule() where
+ * blk_flush_plug() is called.
  */
 struct blk_plug {
-       struct list_head mq_list; /* blk-mq requests */
-       struct list_head cb_list; /* md requires an unplug callback */
+       struct request *mq_list; /* blk-mq requests */
+
+       /* if ios_left is > 1, we can batch tag/rq allocations */
+       struct request *cached_rq;
+       unsigned short nr_ios;
+
        unsigned short rq_count;
+
        bool multiple_queues;
+       bool has_elevator;
        bool nowait;
+
+       struct list_head cb_list; /* md requires an unplug callback */
 };
-#define BLK_MAX_REQUEST_COUNT 16
-#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
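
struct blk_plug now chains blk-mq requests through a singly linked mq_list and can cache pre-allocated requests (cached_rq/nr_ios). The plugging API is unchanged for ordinary callers; a minimal sketch of batching several bio submissions under one plug, where my_submit_all() is a made-up helper:

/* Submit a batch of bios under a single plug so they can be merged
 * and dispatched together once the plug is finished. */
static void my_submit_all(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++)
                submit_bio(bios[i]);
        blk_finish_plug(&plug);
}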
 
 struct blk_plug_cb;
 typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool);
@@ -1211,32 +780,17 @@ struct blk_plug_cb {
 extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
                                             void *data, int size);
 extern void blk_start_plug(struct blk_plug *);
+extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
 extern void blk_finish_plug(struct blk_plug *);
-extern void blk_flush_plug_list(struct blk_plug *, bool);
-
-static inline void blk_flush_plug(struct task_struct *tsk)
-{
-       struct blk_plug *plug = tsk->plug;
 
-       if (plug)
-               blk_flush_plug_list(plug, false);
-}
-
-static inline void blk_schedule_flush_plug(struct task_struct *tsk)
-{
-       struct blk_plug *plug = tsk->plug;
-
-       if (plug)
-               blk_flush_plug_list(plug, true);
-}
+void blk_flush_plug(struct blk_plug *plug, bool from_schedule);
 
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
        struct blk_plug *plug = tsk->plug;
 
        return plug &&
-                (!list_empty(&plug->mq_list) ||
-                !list_empty(&plug->cb_list));
+                (plug->mq_list || !list_empty(&plug->cb_list));
 }
 
 int blkdev_issue_flush(struct block_device *bdev);
@@ -1245,23 +799,23 @@ long nr_blockdev_pages(void);
 struct blk_plug {
 };
 
-static inline void blk_start_plug(struct blk_plug *plug)
+static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
+                                        unsigned short nr_ios)
 {
 }
 
-static inline void blk_finish_plug(struct blk_plug *plug)
+static inline void blk_start_plug(struct blk_plug *plug)
 {
 }
 
-static inline void blk_flush_plug(struct task_struct *task)
+static inline void blk_finish_plug(struct blk_plug *plug)
 {
 }
 
-static inline void blk_schedule_flush_plug(struct task_struct *task)
+static inline void blk_flush_plug(struct blk_plug *plug, bool async)
 {
 }
 
-
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
        return false;
@@ -1499,22 +1053,6 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
        return offset << SECTOR_SHIFT;
 }
 
-/*
- * Two cases of handling DISCARD merge:
- * If max_discard_segments > 1, the driver takes every bio
- * as a range and send them to controller together. The ranges
- * needn't to be contiguous.
- * Otherwise, the bios/requests will be handled as same as
- * others which should be contiguous.
- */
-static inline bool blk_discard_mergable(struct request *req)
-{
-       if (req_op(req) == REQ_OP_DISCARD &&
-           queue_max_discard_segments(req->q) > 1)
-               return true;
-       return false;
-}
-
 static inline int bdev_discard_alignment(struct block_device *bdev)
 {
        struct request_queue *q = bdev_get_queue(bdev);
@@ -1628,210 +1166,36 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
        MODULE_ALIAS("block-major-" __stringify(major) "-*")
 
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-enum blk_integrity_flags {
-       BLK_INTEGRITY_VERIFY            = 1 << 0,
-       BLK_INTEGRITY_GENERATE          = 1 << 1,
-       BLK_INTEGRITY_DEVICE_CAPABLE    = 1 << 2,
-       BLK_INTEGRITY_IP_CHECKSUM       = 1 << 3,
-};
-
-struct blk_integrity_iter {
-       void                    *prot_buf;
-       void                    *data_buf;
-       sector_t                seed;
-       unsigned int            data_size;
-       unsigned short          interval;
-       const char              *disk_name;
-};
-
-typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
-typedef void (integrity_prepare_fn) (struct request *);
-typedef void (integrity_complete_fn) (struct request *, unsigned int);
-
-struct blk_integrity_profile {
-       integrity_processing_fn         *generate_fn;
-       integrity_processing_fn         *verify_fn;
-       integrity_prepare_fn            *prepare_fn;
-       integrity_complete_fn           *complete_fn;
-       const char                      *name;
-};
-
-extern void blk_integrity_register(struct gendisk *, struct blk_integrity *);
-extern void blk_integrity_unregister(struct gendisk *);
-extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
-extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
-                                  struct scatterlist *);
-extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
-
-static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
-{
-       struct blk_integrity *bi = &disk->queue->integrity;
-
-       if (!bi->profile)
-               return NULL;
-
-       return bi;
-}
-
-static inline
-struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
-{
-       return blk_get_integrity(bdev->bd_disk);
-}
-
-static inline bool
-blk_integrity_queue_supports_integrity(struct request_queue *q)
-{
-       return q->integrity.profile;
-}
-
-static inline bool blk_integrity_rq(struct request *rq)
-{
-       return rq->cmd_flags & REQ_INTEGRITY;
-}
-
-static inline void blk_queue_max_integrity_segments(struct request_queue *q,
-                                                   unsigned int segs)
-{
-       q->limits.max_integrity_segments = segs;
-}
-
-static inline unsigned short
-queue_max_integrity_segments(const struct request_queue *q)
-{
-       return q->limits.max_integrity_segments;
-}
-
-/**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi:                blk_integrity profile for device
- * @sectors:   Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device.  Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-                                                  unsigned int sectors)
-{
-       return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-                                              unsigned int sectors)
-{
-       return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
-/*
- * Return the first bvec that contains integrity data.  Only drivers that are
- * limited to a single integrity segment should use this helper.
- */
-static inline struct bio_vec *rq_integrity_vec(struct request *rq)
-{
-       if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
-               return NULL;
-       return rq->bio->bi_integrity->bip_vec;
-}
-
-#else /* CONFIG_BLK_DEV_INTEGRITY */
-
-struct bio;
-struct block_device;
-struct gendisk;
-struct blk_integrity;
-
-static inline int blk_integrity_rq(struct request *rq)
-{
-       return 0;
-}
-static inline int blk_rq_count_integrity_sg(struct request_queue *q,
-                                           struct bio *b)
-{
-       return 0;
-}
-static inline int blk_rq_map_integrity_sg(struct request_queue *q,
-                                         struct bio *b,
-                                         struct scatterlist *s)
-{
-       return 0;
-}
-static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
-{
-       return NULL;
-}
-static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
-{
-       return NULL;
-}
-static inline bool
-blk_integrity_queue_supports_integrity(struct request_queue *q)
-{
-       return false;
-}
-static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
-{
-       return 0;
-}
-static inline void blk_integrity_register(struct gendisk *d,
-                                        struct blk_integrity *b)
-{
-}
-static inline void blk_integrity_unregister(struct gendisk *d)
-{
-}
-static inline void blk_queue_max_integrity_segments(struct request_queue *q,
-                                                   unsigned int segs)
-{
-}
-static inline unsigned short queue_max_integrity_segments(const struct request_queue *q)
-{
-       return 0;
-}
-
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-                                                  unsigned int sectors)
-{
-       return 0;
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-                                              unsigned int sectors)
-{
-       return 0;
-}
-
-static inline struct bio_vec *rq_integrity_vec(struct request *rq)
-{
-       return NULL;
-}
-
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 
-bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q);
+bool blk_crypto_register(struct blk_crypto_profile *profile,
+                        struct request_queue *q);
 
-void blk_ksm_unregister(struct request_queue *q);
+void blk_crypto_unregister(struct request_queue *q);
 
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
 
-static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm,
-                                   struct request_queue *q)
+static inline bool blk_crypto_register(struct blk_crypto_profile *profile,
+                                      struct request_queue *q)
 {
        return true;
 }
 
-static inline void blk_ksm_unregister(struct request_queue *q) { }
+static inline void blk_crypto_unregister(struct request_queue *q) { }
 
 #endif /* CONFIG_BLK_INLINE_ENCRYPTION */
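
The keyslot-manager API is renamed to the blk-crypto-profile API; blk_ksm_register()/blk_ksm_unregister() become blk_crypto_register()/blk_crypto_unregister(). A sketch of a driver publishing its inline-encryption capabilities, assuming a hypothetical my_crypto_dev that embeds an already initialized profile:

struct my_crypto_dev {                          /* hypothetical driver state */
        struct blk_crypto_profile profile;      /* initialized elsewhere */
        struct request_queue *queue;
};

static int my_enable_inline_crypto(struct my_crypto_dev *dev)
{
        /* expose the device's inline encryption capabilities on the queue */
        if (!blk_crypto_register(&dev->profile, dev->queue))
                return -EINVAL;
        return 0;
}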
 
+enum blk_unique_id {
+       /* these match the Designator Types specified in SPC */
+       BLK_UID_T10     = 1,
+       BLK_UID_EUI64   = 2,
+       BLK_UID_NAA     = 3,
+};
+
+#define NFL4_UFLG_MASK                 0x0000003F
 
 struct block_device_operations {
-       blk_qc_t (*submit_bio) (struct bio *bio);
+       void (*submit_bio)(struct bio *bio);
        int (*open) (struct block_device *, fmode_t);
        void (*release) (struct gendisk *, fmode_t);
        int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
@@ -1847,6 +1211,9 @@ struct block_device_operations {
        int (*report_zones)(struct gendisk *, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data);
        char *(*devnode)(struct gendisk *disk, umode_t *mode);
+       /* returns the length of the identifier or a negative errno: */
+       int (*get_unique_id)(struct gendisk *disk, u8 id[16],
+                       enum blk_unique_id id_type);
        struct module *owner;
        const struct pr_ops *pr_ops;
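
With bio-based polling, ->submit_bio no longer returns a blk_qc_t cookie, and the new ->get_unique_id hook lets a driver report an SPC-style designator. A sketch of how a simple bio-based driver might fill in the updated operations; every my_* name is hypothetical and the submit path is reduced to an immediate completion for illustration only:

static void my_submit_bio(struct bio *bio)
{
        /* process the I/O, then complete it; no cookie is returned anymore */
        bio_endio(bio);
}

static int my_get_unique_id(struct gendisk *disk, u8 id[16],
                            enum blk_unique_id id_type)
{
        if (id_type != BLK_UID_T10)
                return -EINVAL;
        memcpy(id, "MYDISK0000000001", 16);     /* fixed 16-byte T10 id */
        return 16;                              /* length of the identifier */
}

static const struct block_device_operations my_fops = {
        .owner          = THIS_MODULE,
        .submit_bio     = my_submit_bio,
        .get_unique_id  = my_get_unique_id,
};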
 
@@ -1869,60 +1236,6 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
 
-#ifdef CONFIG_BLK_DEV_ZONED
-bool blk_req_needs_zone_write_lock(struct request *rq);
-bool blk_req_zone_write_trylock(struct request *rq);
-void __blk_req_zone_write_lock(struct request *rq);
-void __blk_req_zone_write_unlock(struct request *rq);
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-       if (blk_req_needs_zone_write_lock(rq))
-               __blk_req_zone_write_lock(rq);
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-       if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
-               __blk_req_zone_write_unlock(rq);
-}
-
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
-       return rq->q->seq_zones_wlock &&
-               test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
-       if (!blk_req_needs_zone_write_lock(rq))
-               return true;
-       return !blk_req_zone_is_write_locked(rq);
-}
-#else
-static inline bool blk_req_needs_zone_write_lock(struct request *rq)
-{
-       return false;
-}
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-}
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
-       return false;
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
-       return true;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
-
 static inline void blk_wake_io_task(struct task_struct *waiter)
 {
        /*
@@ -1991,6 +1304,8 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
 #ifdef CONFIG_BLOCK
 void invalidate_bdev(struct block_device *bdev);
 int sync_blockdev(struct block_device *bdev);
+int sync_blockdev_nowait(struct block_device *bdev);
+void sync_bdevs(bool wait);
 #else
 static inline void invalidate_bdev(struct block_device *bdev)
 {
@@ -1999,10 +1314,54 @@ static inline int sync_blockdev(struct block_device *bdev)
 {
        return 0;
 }
+static inline int sync_blockdev_nowait(struct block_device *bdev)
+{
+       return 0;
+}
+static inline void sync_bdevs(bool wait)
+{
+}
 #endif
 int fsync_bdev(struct block_device *bdev);
 
 int freeze_bdev(struct block_device *bdev);
 int thaw_bdev(struct block_device *bdev);
 
+struct io_comp_batch {
+       struct request *req_list;
+       bool need_ts;
+       void (*complete)(struct io_comp_batch *);
+};
+
+#define DEFINE_IO_COMP_BATCH(name)     struct io_comp_batch name = { }
+
+#define rq_list_add(listptr, rq)       do {            \
+       (rq)->rq_next = *(listptr);                     \
+       *(listptr) = rq;                                \
+} while (0)
+
+#define rq_list_pop(listptr)                           \
+({                                                     \
+       struct request *__req = NULL;                   \
+       if ((listptr) && *(listptr))    {               \
+               __req = *(listptr);                     \
+               *(listptr) = __req->rq_next;            \
+       }                                               \
+       __req;                                          \
+})
+
+#define rq_list_peek(listptr)                          \
+({                                                     \
+       struct request *__req = NULL;                   \
+       if ((listptr) && *(listptr))                    \
+               __req = *(listptr);                     \
+       __req;                                          \
+})
+
+#define rq_list_for_each(listptr, pos)                 \
+       for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \
+
+#define rq_list_next(rq)       (rq)->rq_next
+#define rq_list_empty(list)    ((list) == (struct request *) NULL)
+
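
The rq_list_* macros operate on a singly linked list threaded through rq->rq_next, which is also how struct io_comp_batch carries completed requests. A sketch of draining such a list, assuming completion code has already populated it with rq_list_add(); my_drain() is a made-up name:

static void my_drain(struct request **pending)
{
        struct request *rq;

        /* rq_list_add() pushes to the front, so this completes newest-first */
        while ((rq = rq_list_pop(pending)) != NULL)
                blk_mq_end_request(rq, BLK_STS_OK);
}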
 #endif /* _LINUX_BLKDEV_H */
index a083e15..22501a2 100644
@@ -2,7 +2,7 @@
 #ifndef BLKTRACE_H
 #define BLKTRACE_H
 
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/relay.h>
 #include <linux/compat.h>
 #include <uapi/linux/blktrace_api.h>
index 020a7d5..3db6f6c 100644
@@ -929,8 +929,11 @@ struct bpf_array_aux {
         * stored in the map to make sure that all callers and callees have
         * the same prog type and JITed flag.
         */
-       enum bpf_prog_type type;
-       bool jited;
+       struct {
+               spinlock_t lock;
+               enum bpf_prog_type type;
+               bool jited;
+       } owner;
        /* Programs with direct jumps into programs part of this array. */
        struct list_head poke_progs;
        struct bpf_map *map;
index 9c81724..bbe1eef 100644
@@ -101,14 +101,14 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
-#ifdef CONFIG_NET
-BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 #ifdef CONFIG_BPF_LSM
 BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
+#ifdef CONFIG_NET
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
index 0e9bdd4..35c25df 100644
@@ -44,7 +44,7 @@ struct bvec_iter {
 
        unsigned int            bi_bvec_done;   /* number of bytes completed in
                                                   current bvec */
-};
+} __packed;
 
 struct bvec_iter_all {
        struct bio_vec  bv;
index c4fef00..0a89f11 100644
@@ -64,6 +64,7 @@ struct cdrom_device_info {
        int for_data;
        int (*exit)(struct cdrom_device_info *);
        int mrw_mode_page;
+       __s64 last_media_change_ms;
 };
 
 struct cdrom_device_ops {
index 114553b..a7df155 100644
@@ -576,9 +576,9 @@ struct dm_table *dm_swap_table(struct mapped_device *md,
                               struct dm_table *t);
 
 /*
- * Table keyslot manager functions
+ * Table blk_crypto_profile functions
  */
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm);
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile);
 
 /*-----------------------------------------------------------------
  * Macros.
index 4a93c12..ef03ff3 100644
@@ -1051,6 +1051,7 @@ extern int bpf_jit_enable;
 extern int bpf_jit_harden;
 extern int bpf_jit_kallsyms;
 extern long bpf_jit_limit;
+extern long bpf_jit_limit_max;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
index c12df59..3e378b1 100644
@@ -83,9 +83,10 @@ struct fprop_local_percpu {
 
 int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp);
 void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
-void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
-void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
-                           int max_frac);
+void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl,
+               long nr);
+void __fprop_add_percpu_max(struct fprop_global *p,
+               struct fprop_local_percpu *pl, int max_frac, long nr);
 void fprop_fraction_percpu(struct fprop_global *p,
        struct fprop_local_percpu *pl, unsigned long *numerator,
        unsigned long *denominator);
@@ -96,7 +97,7 @@ void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
        unsigned long flags;
 
        local_irq_save(flags);
-       __fprop_inc_percpu(p, pl);
+       __fprop_add_percpu(p, pl, 1);
        local_irq_restore(flags);
 }
 
index 56eba72..f3cfca5 100644
@@ -48,6 +48,7 @@
 struct backing_dev_info;
 struct bdi_writeback;
 struct bio;
+struct io_comp_batch;
 struct export_operations;
 struct fiemap_extent_info;
 struct hd_geometry;
@@ -329,16 +330,12 @@ struct kiocb {
        randomized_struct_fields_start
 
        loff_t                  ki_pos;
-       void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+       void (*ki_complete)(struct kiocb *iocb, long ret);
        void                    *private;
        int                     ki_flags;
        u16                     ki_hint;
        u16                     ki_ioprio; /* See linux/ioprio.h */
-       union {
-               unsigned int            ki_cookie; /* for ->iopoll */
-               struct wait_page_queue  *ki_waitq; /* for async buffered IO */
-       };
-
+       struct wait_page_queue  *ki_waitq; /* for async buffered IO */
        randomized_struct_fields_end
 };
 
@@ -2075,7 +2072,8 @@ struct file_operations {
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
-       int (*iopoll)(struct kiocb *kiocb, bool spin);
+       int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
+                       unsigned int flags);
        int (*iterate) (struct file *, struct dir_context *);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
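
->iopoll now takes an io_comp_batch and flags instead of the old spin boolean. For a file backed directly by a bio-based submission path, a sketch of the hook is little more than a forward to iocb_bio_iopoll(); my_file_iopoll() is a hypothetical name:

static int my_file_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
                          unsigned int flags)
{
        /* let the block layer poll the bio(s) behind this kiocb */
        return iocb_bio_iopoll(kiocb, iob, flags);
}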
index e912ed9..91ea947 100644
@@ -118,9 +118,6 @@ struct fscrypt_operations {
         */
        bool (*empty_dir)(struct inode *inode);
 
-       /* The filesystem's maximum ciphertext filename length, in bytes */
-       unsigned int max_namelen;
-
        /*
         * Check whether the filesystem's inode numbers and UUID are stable,
         * meaning that they will never be changed even by offline operations
index 0f5315c..59eabbc 100644
 
 #include <linux/types.h>
 #include <linux/kdev_t.h>
-#include <linux/rcupdate.h>
-#include <linux/slab.h>
-#include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
 #include <linux/blk_types.h>
-#include <asm/local.h>
+#include <linux/device.h>
+#include <linux/xarray.h>
 
 extern const struct device_type disk_type;
 extern struct device_type part_type;
@@ -26,14 +24,6 @@ extern struct class block_class;
 #define DISK_MAX_PARTS                 256
 #define DISK_NAME_LEN                  32
 
-#include <linux/major.h>
-#include <linux/device.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/workqueue.h>
-#include <linux/xarray.h>
-
 #define PARTITION_META_INFO_VOLNAMELTH 64
 /*
  * Enough for the string representation of any kind of UUID plus NULL.
@@ -223,6 +213,8 @@ static inline int add_disk(struct gendisk *disk)
 }
 extern void del_gendisk(struct gendisk *gp);
 
+void invalidate_disk(struct gendisk *disk);
+
 void set_disk_ro(struct gendisk *disk, bool read_only);
 
 static inline int get_disk_ro(struct gendisk *disk)
@@ -231,6 +223,11 @@ static inline int get_disk_ro(struct gendisk *disk)
                test_bit(GD_READ_ONLY, &disk->state);
 }
 
+static inline int bdev_read_only(struct block_device *bdev)
+{
+       return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
+}
+
 extern void disk_block_events(struct gendisk *disk);
 extern void disk_unblock_events(struct gendisk *disk);
 extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
@@ -248,7 +245,12 @@ static inline sector_t get_start_sect(struct block_device *bdev)
 
 static inline sector_t bdev_nr_sectors(struct block_device *bdev)
 {
-       return i_size_read(bdev->bd_inode) >> 9;
+       return bdev->bd_nr_sectors;
+}
+
+static inline loff_t bdev_nr_bytes(struct block_device *bdev)
+{
+       return bdev_nr_sectors(bdev) << SECTOR_SHIFT;
 }
 
 static inline sector_t get_capacity(struct gendisk *disk)
@@ -256,6 +258,12 @@ static inline sector_t get_capacity(struct gendisk *disk)
        return bdev_nr_sectors(disk->part0);
 }
 
+static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
+{
+       return bdev_nr_sectors(sb->s_bdev) >>
+               (sb->s_blocksize_bits - SECTOR_SHIFT);
+}
+
 int bdev_disk_changed(struct gendisk *disk, bool invalidate);
 void blk_drop_partitions(struct gendisk *disk);
 
@@ -291,10 +299,6 @@ bool bdev_check_media_change(struct block_device *bdev);
 int __invalidate_device(struct block_device *bdev, bool kill_dirty);
 void set_capacity(struct gendisk *disk, sector_t size);
 
-/* for drivers/char/raw.c: */
-int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
-long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
-
 #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
index 55b2ec1..3745efd 100644
@@ -520,15 +520,11 @@ static inline void arch_free_page(struct page *page, int order) { }
 #ifndef HAVE_ARCH_ALLOC_PAGE
 static inline void arch_alloc_page(struct page *page, int order) { }
 #endif
-#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-static inline int arch_make_page_accessible(struct page *page)
-{
-       return 0;
-}
-#endif
 
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
+struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+               nodemask_t *nodemask);
 
 unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                                nodemask_t *nodemask, int nr_pages,
@@ -570,6 +566,15 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
        return __alloc_pages(gfp_mask, order, nid, NULL);
 }
 
+static inline
+struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid)
+{
+       VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+       VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid));
+
+       return __folio_alloc(gfp, order, nid, NULL);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
  * prefer the current CPU's closest node. Otherwise node must be valid and
@@ -586,6 +591,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 
 #ifdef CONFIG_NUMA
 struct page *alloc_pages(gfp_t gfp, unsigned int order);
+struct folio *folio_alloc(gfp_t gfp, unsigned order);
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                        struct vm_area_struct *vma, unsigned long addr,
                        int node, bool hugepage);
@@ -596,6 +602,10 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
        return alloc_pages_node(numa_node_id(), gfp_mask, order);
 }
+static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
+{
+       return __folio_alloc_node(gfp, order, numa_node_id());
+}
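
folio_alloc()/__folio_alloc_node() are the folio counterparts of alloc_pages(), and pair naturally with kmap_local_folio() added further down. A sketch that allocates an order-0 folio and zeroes it through a temporary mapping; my_alloc_zeroed_folio() is a made-up name:

static struct folio *my_alloc_zeroed_folio(gfp_t gfp)
{
        struct folio *folio = folio_alloc(gfp, 0);      /* single-page folio */
        void *addr;

        if (!folio)
                return NULL;

        addr = kmap_local_folio(folio, 0);
        memset(addr, 0, PAGE_SIZE);
        kunmap_local(addr);
        return folio;
}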
 #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
        alloc_pages(gfp_mask, order)
 #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
index 4aa1031..0a0b2b0 100644
@@ -73,6 +73,12 @@ static inline void *kmap_local_page(struct page *page)
        return __kmap_local_page_prot(page, kmap_prot);
 }
 
+static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+{
+       struct page *page = folio_page(folio, offset / PAGE_SIZE);
+       return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE;
+}
+
 static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
 {
        return __kmap_local_page_prot(page, prot);
@@ -171,6 +177,11 @@ static inline void *kmap_local_page(struct page *page)
        return page_address(page);
 }
 
+static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+{
+       return page_address(&folio->page) + offset;
+}
+
 static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
 {
        return kmap_local_page(page);
index b4c49f9..27cdd71 100644
@@ -97,6 +97,43 @@ static inline void kmap_flush_unused(void);
 static inline void *kmap_local_page(struct page *page);
 
 /**
+ * kmap_local_folio - Map a page in this folio for temporary usage
+ * @folio: The folio containing the page.
+ * @offset: The byte offset within the folio which identifies the page.
+ *
+ * Requires careful handling when nesting multiple mappings because the map
+ * management is stack based. The unmap has to be in the reverse order of
+ * the map operation::
+ *
+ *   addr1 = kmap_local_folio(folio1, offset1);
+ *   addr2 = kmap_local_folio(folio2, offset2);
+ *   ...
+ *   kunmap_local(addr2);
+ *   kunmap_local(addr1);
+ *
+ * Unmapping addr1 before addr2 is invalid and causes malfunction.
+ *
+ * Contrary to kmap() mappings the mapping is only valid in the context of
+ * the caller and cannot be handed to other contexts.
+ *
+ * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
+ * virtual address of the direct mapping. Only real highmem pages are
+ * temporarily mapped.
+ *
+ * While it is significantly faster than kmap() for the highmem case it
+ * comes with restrictions about the pointer validity. Only use when really
+ * necessary.
+ *
+ * On HIGHMEM enabled systems mapping a highmem page has the side effect of
+ * disabling migration in order to keep the virtual address stable across
+ * preemption. No caller of kmap_local_folio() can rely on this side effect.
+ *
+ * Context: Can be invoked from any context.
+ * Return: The virtual address of @offset.
+ */
+static inline void *kmap_local_folio(struct folio *folio, size_t offset);
+
+/**
  * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
  * @page:      Pointer to the page to be mapped
  *
index f123e15..f280f33 100644
@@ -251,15 +251,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 }
 
 /**
- * thp_head - Head page of a transparent huge page.
- * @page: Any page (tail, head or regular) found in the page cache.
- */
-static inline struct page *thp_head(struct page *page)
-{
-       return compound_head(page);
-}
-
-/**
  * thp_order - Order of a transparent huge page.
  * @page: Head page of a transparent huge page.
  */
@@ -336,12 +327,6 @@ static inline struct list_head *page_deferred_list(struct page *page)
 #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
 #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
 
-static inline struct page *thp_head(struct page *page)
-{
-       VM_BUG_ON_PGFLAGS(PageTail(page), page);
-       return page;
-}
-
 static inline unsigned int thp_order(struct page *page)
 {
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
index 24f8489..63f4ea4 100644
@@ -313,8 +313,8 @@ int iomap_writepages(struct address_space *mapping,
 struct iomap_dio_ops {
        int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
                      unsigned flags);
-       blk_qc_t (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
-                             loff_t file_offset);
+       void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
+                         loff_t file_offset);
 };
 
 /*
@@ -337,7 +337,6 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                unsigned int dio_flags);
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
 
 #ifdef CONFIG_SWAP
 struct file;
diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h
deleted file mode 100644
index a27605e..0000000
+++ /dev/null
@@ -1,120 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2019 Google LLC
- */
-
-#ifndef __LINUX_KEYSLOT_MANAGER_H
-#define __LINUX_KEYSLOT_MANAGER_H
-
-#include <linux/bio.h>
-#include <linux/blk-crypto.h>
-
-struct blk_keyslot_manager;
-
-/**
- * struct blk_ksm_ll_ops - functions to manage keyslots in hardware
- * @keyslot_program:   Program the specified key into the specified slot in the
- *                     inline encryption hardware.
- * @keyslot_evict:     Evict key from the specified keyslot in the hardware.
- *                     The key is provided so that e.g. dm layers can evict
- *                     keys from the devices that they map over.
- *                     Returns 0 on success, -errno otherwise.
- *
- * This structure should be provided by storage device drivers when they set up
- * a keyslot manager - this structure holds the function ptrs that the keyslot
- * manager will use to manipulate keyslots in the hardware.
- */
-struct blk_ksm_ll_ops {
-       int (*keyslot_program)(struct blk_keyslot_manager *ksm,
-                              const struct blk_crypto_key *key,
-                              unsigned int slot);
-       int (*keyslot_evict)(struct blk_keyslot_manager *ksm,
-                            const struct blk_crypto_key *key,
-                            unsigned int slot);
-};
-
-struct blk_keyslot_manager {
-       /*
-        * The struct blk_ksm_ll_ops that this keyslot manager will use
-        * to perform operations like programming and evicting keys on the
-        * device
-        */
-       struct blk_ksm_ll_ops ksm_ll_ops;
-
-       /*
-        * The maximum number of bytes supported for specifying the data unit
-        * number.
-        */
-       unsigned int max_dun_bytes_supported;
-
-       /*
-        * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents
-        * whether a crypto mode and data unit size are supported. The i'th
-        * bit of crypto_mode_supported[crypto_mode] is set iff a data unit
-        * size of (1 << i) is supported. We only support data unit sizes
-        * that are powers of 2.
-        */
-       unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX];
-
-       /* Device for runtime power management (NULL if none) */
-       struct device *dev;
-
-       /* Here onwards are *private* fields for internal keyslot manager use */
-
-       unsigned int num_slots;
-
-       /* Protects programming and evicting keys from the device */
-       struct rw_semaphore lock;
-
-       /* List of idle slots, with least recently used slot at front */
-       wait_queue_head_t idle_slots_wait_queue;
-       struct list_head idle_slots;
-       spinlock_t idle_slots_lock;
-
-       /*
-        * Hash table which maps struct *blk_crypto_key to keyslots, so that we
-        * can find a key's keyslot in O(1) time rather than O(num_slots).
-        * Protected by 'lock'.
-        */
-       struct hlist_head *slot_hashtable;
-       unsigned int log_slot_ht_size;
-
-       /* Per-keyslot data */
-       struct blk_ksm_keyslot *slots;
-};
-
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots);
-
-int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
-                     unsigned int num_slots);
-
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
-                                     const struct blk_crypto_key *key,
-                                     struct blk_ksm_keyslot **slot_ptr);
-
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot);
-
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot);
-
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
-                                 const struct blk_crypto_config *cfg);
-
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
-                     const struct blk_crypto_key *key);
-
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
-
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
-
-void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
-                            const struct blk_keyslot_manager *child);
-
-void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
-
-bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
-                        struct blk_keyslot_manager *ksm_subset);
-
-void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
-                                struct blk_keyslot_manager *reference_ksm);
-
-#endif /* __LINUX_KEYSLOT_MANAGER_H */
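For illustration, this is roughly how a driver consumed the interface declared above before the keyslot-manager to blk-crypto-profile rename: a minimal sketch using only declarations from the deleted header. The my_hw_* callbacks, the slot count and the 4 KiB data-unit size are hypothetical, not taken from any in-tree driver.

/* Sketch of the removed API: program/evict callbacks plus capability setup. */
static int my_hw_program_key(struct blk_keyslot_manager *ksm,
			     const struct blk_crypto_key *key,
			     unsigned int slot)
{
	/* hypothetical: write the key material into hardware keyslot @slot */
	return 0;
}

static int my_hw_evict_key(struct blk_keyslot_manager *ksm,
			   const struct blk_crypto_key *key,
			   unsigned int slot)
{
	/* hypothetical: clear hardware keyslot @slot */
	return 0;
}

static int my_driver_init_ksm(struct device *dev, struct blk_keyslot_manager *ksm)
{
	int err = devm_blk_ksm_init(dev, ksm, 32);

	if (err)
		return err;
	ksm->ksm_ll_ops.keyslot_program = my_hw_program_key;
	ksm->ksm_ll_ops.keyslot_evict = my_hw_evict_key;
	ksm->max_dun_bytes_supported = 8;
	/* bit 12 set: only a 4096-byte data unit size is supported */
	ksm->crypto_modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] = 4096;
	return 0;
}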
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 161e816..a38a5bc 100644 (file)
@@ -52,7 +52,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
                        struct vm_area_struct *vma, unsigned long address);
 
 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
-void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
 
 #else  /* !CONFIG_KSM */
 
@@ -83,7 +83,7 @@ static inline void rmap_walk_ksm(struct page *page,
 {
 }
 
-static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old)
 {
 }
 #endif /* CONFIG_MMU */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c0c64f0..236ec68 100644 (file)
@@ -676,6 +676,18 @@ struct ata_ering {
        struct ata_ering_entry  ring[ATA_ERING_SIZE];
 };
 
+struct ata_cpr {
+       u8                      num;
+       u8                      num_storage_elements;
+       u64                     start_lba;
+       u64                     num_lbas;
+};
+
+struct ata_cpr_log {
+       u8                      nr_cpr;
+       struct ata_cpr          cpr[];
+};
+
 struct ata_device {
        struct ata_link         *link;
        unsigned int            devno;          /* 0 or 1 */
@@ -735,6 +747,9 @@ struct ata_device {
        u32                     zac_zones_optimal_nonseq;
        u32                     zac_zones_max_open;
 
+       /* Concurrent positioning ranges */
+       struct ata_cpr_log      *cpr_log;
+
        /* error history */
        int                     spdn_cnt;
        /* ering is CLEAR_END, read comment above CLEAR_END */
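The new cpr_log ends in a flexible array member, so it has to be sized with the range count at allocation time. A minimal sketch, assuming a hypothetical alloc_cpr_log() helper (the actual allocation in this series lives elsewhere in libata):

#include <linux/overflow.h>
#include <linux/slab.h>

/* Sketch: allocate an ata_cpr_log holding @nr concurrent positioning ranges. */
static struct ata_cpr_log *alloc_cpr_log(u8 nr)
{
	struct ata_cpr_log *log;

	log = kzalloc(struct_size(log, cpr, nr), GFP_KERNEL);
	if (!log)
		return NULL;
	log->nr_cpr = nr;
	return log;
}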
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3096c9a..e34bf0c 100644 (file)
@@ -369,7 +369,7 @@ enum page_memcg_data_flags {
 
 #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
 
-static inline bool PageMemcgKmem(struct page *page);
+static inline bool folio_memcg_kmem(struct folio *folio);
 
 /*
  * After the initialization objcg->memcg is always pointing at
@@ -384,89 +384,95 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
 }
 
 /*
- * __page_memcg - get the memory cgroup associated with a non-kmem page
- * @page: a pointer to the page struct
+ * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the memory cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages or
- * kmem pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios or
+ * kmem folios.
  */
-static inline struct mem_cgroup *__page_memcg(struct page *page)
+static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
 {
-       unsigned long memcg_data = page->memcg_data;
+       unsigned long memcg_data = folio->memcg_data;
 
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-       VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
+       VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
+       VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+       VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
 
        return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
 /*
- * __page_objcg - get the object cgroup associated with a kmem page
- * @page: a pointer to the page struct
+ * __folio_objcg - get the object cgroup associated with a kmem folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the object cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the object cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper object cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages or
- * LRU pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios or
+ * LRU folios.
  */
-static inline struct obj_cgroup *__page_objcg(struct page *page)
+static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
 {
-       unsigned long memcg_data = page->memcg_data;
+       unsigned long memcg_data = folio->memcg_data;
 
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-       VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+       VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
+       VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+       VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
 
        return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
 /*
- * page_memcg - get the memory cgroup associated with a page
- * @page: a pointer to the page struct
+ * folio_memcg - Get the memory cgroup associated with a folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * Returns a pointer to the memory cgroup associated with the folio,
+ * or NULL. This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios.
  *
- * For a non-kmem page any of the following ensures page and memcg binding
+ * For a non-kmem folio any of the following ensures folio and memcg binding
  * stability:
  *
- * - the page lock
+ * - the folio lock
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
  *
- * For a kmem page a caller should hold an rcu read lock to protect memcg
- * associated with a kmem page from being released.
+ * For a kmem folio a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem folio from being released.
  */
+static inline struct mem_cgroup *folio_memcg(struct folio *folio)
+{
+       if (folio_memcg_kmem(folio))
+               return obj_cgroup_memcg(__folio_objcg(folio));
+       return __folio_memcg(folio);
+}
+
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
-       if (PageMemcgKmem(page))
-               return obj_cgroup_memcg(__page_objcg(page));
-       else
-               return __page_memcg(page);
+       return folio_memcg(page_folio(page));
 }
 
-/*
- * page_memcg_rcu - locklessly get the memory cgroup associated with a page
- * @page: a pointer to the page struct
+/**
+ * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
+ * @folio: Pointer to the folio.
  *
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function assumes that the page is known to have a
+ * This function assumes that the folio is known to have a
  * proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages.
+ * against some type of folios, e.g. slab folios or ex-slab folios.
+ *
+ * Return: A pointer to the memory cgroup associated with the folio,
+ * or NULL.
  */
-static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 {
-       unsigned long memcg_data = READ_ONCE(page->memcg_data);
+       unsigned long memcg_data = READ_ONCE(folio->memcg_data);
 
-       VM_BUG_ON_PAGE(PageSlab(page), page);
+       VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        WARN_ON_ONCE(!rcu_read_lock_held());
 
        if (memcg_data & MEMCG_DATA_KMEM) {
@@ -523,17 +529,18 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 
 #ifdef CONFIG_MEMCG_KMEM
 /*
- * PageMemcgKmem - check if the page has MemcgKmem flag set
- * @page: a pointer to the page struct
+ * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
+ * @folio: Pointer to the folio.
  *
- * Checks if the page has MemcgKmem flag set. The caller must ensure that
- * the page has an associated memory cgroup. It's not safe to call this function
- * against some types of pages, e.g. slab pages.
+ * Checks if the folio has MemcgKmem flag set. The caller must ensure
+ * that the folio has an associated memory cgroup. It's not safe to call
+ * this function against some types of folios, e.g. slab folios.
  */
-static inline bool PageMemcgKmem(struct page *page)
+static inline bool folio_memcg_kmem(struct folio *folio)
 {
-       VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page);
-       return page->memcg_data & MEMCG_DATA_KMEM;
+       VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
+       VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio);
+       return folio->memcg_data & MEMCG_DATA_KMEM;
 }
 
 /*
@@ -577,7 +584,7 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 }
 
 #else
-static inline bool PageMemcgKmem(struct page *page)
+static inline bool folio_memcg_kmem(struct folio *folio)
 {
        return false;
 }
@@ -593,6 +600,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 }
 #endif
 
+static inline bool PageMemcgKmem(struct page *page)
+{
+       return folio_memcg_kmem(page_folio(page));
+}
+
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
        return (memcg == root_mem_cgroup);
@@ -684,26 +696,47 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
                page_counter_read(&memcg->memory);
 }
 
-int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-                       gfp_t gfp_mask);
-static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-                                   gfp_t gfp_mask)
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
+
+/**
+ * mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
+ * @folio: Folio to charge.
+ * @mm: mm context of the allocating task.
+ * @gfp: Reclaim mode.
+ *
+ * Try to charge @folio to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp if necessary.  If @mm is NULL, try to
+ * charge to the active memcg.
+ *
+ * Do not use this for folios allocated for swapin.
+ *
+ * Return: 0 on success. Otherwise, an error code is returned.
+ */
+static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
+                                   gfp_t gfp)
 {
        if (mem_cgroup_disabled())
                return 0;
-       return __mem_cgroup_charge(page, mm, gfp_mask);
+       return __mem_cgroup_charge(folio, mm, gfp);
 }
 
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
 
-void __mem_cgroup_uncharge(struct page *page);
-static inline void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct folio *folio);
+
+/**
+ * mem_cgroup_uncharge - Uncharge a folio.
+ * @folio: Folio to uncharge.
+ *
+ * Uncharge a folio previously charged with mem_cgroup_charge().
+ */
+static inline void mem_cgroup_uncharge(struct folio *folio)
 {
        if (mem_cgroup_disabled())
                return;
-       __mem_cgroup_uncharge(page);
+       __mem_cgroup_uncharge(folio);
 }
 
 void __mem_cgroup_uncharge_list(struct list_head *page_list);
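As a usage sketch of the charge/uncharge pair documented above: charge a newly allocated folio before exposing it, and uncharge if the subsequent step fails. The surrounding function and my_insert_into_cache() are hypothetical.

/* Sketch: pair mem_cgroup_charge() with mem_cgroup_uncharge() on failure. */
static int my_add_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
{
	int err = mem_cgroup_charge(folio, mm, gfp);

	if (err)
		return err;

	err = my_insert_into_cache(folio);	/* hypothetical next step */
	if (err)
		mem_cgroup_uncharge(folio);
	return err;
}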
@@ -714,7 +747,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
        __mem_cgroup_uncharge_list(page_list);
 }
 
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
+void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
  * mem_cgroup_lruvec - get the lru list vector for a memcg & node
@@ -753,33 +786,33 @@ out:
 }
 
 /**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
- * @page: the page
+ * folio_lruvec - return lruvec for isolating/putting an LRU folio
+ * @folio: Pointer to the folio.
  *
- * This function relies on page->mem_cgroup being stable.
+ * This function relies on folio->mem_cgroup being stable.
  */
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec(struct folio *folio)
 {
-       pg_data_t *pgdat = page_pgdat(page);
-       struct mem_cgroup *memcg = page_memcg(page);
+       struct mem_cgroup *memcg = folio_memcg(folio);
 
-       VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
-       return mem_cgroup_lruvec(memcg, pgdat);
+       VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio);
+       return mem_cgroup_lruvec(memcg, folio_pgdat(folio));
 }
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
-struct lruvec *lock_page_lruvec(struct page *page);
-struct lruvec *lock_page_lruvec_irq(struct page *page);
-struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+struct lruvec *folio_lruvec_lock(struct folio *folio);
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                                                unsigned long *flags);
 
 #ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
 #else
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
 }
 #endif
@@ -947,6 +980,8 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
 extern bool cgroup_memory_noswap;
 #endif
 
+void folio_memcg_lock(struct folio *folio);
+void folio_memcg_unlock(struct folio *folio);
 void lock_page_memcg(struct page *page);
 void unlock_page_memcg(struct page *page);
 
@@ -1115,12 +1150,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 #define MEM_CGROUP_ID_SHIFT    0
 #define MEM_CGROUP_ID_MAX      0
 
+static inline struct mem_cgroup *folio_memcg(struct folio *folio)
+{
+       return NULL;
+}
+
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
        return NULL;
 }
 
-static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 {
        WARN_ON_ONCE(!rcu_read_lock_held());
        return NULL;
@@ -1131,6 +1171,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
        return NULL;
 }
 
+static inline bool folio_memcg_kmem(struct folio *folio)
+{
+       return false;
+}
+
 static inline bool PageMemcgKmem(struct page *page)
 {
        return false;
@@ -1179,8 +1224,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
        return false;
 }
 
-static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-                                   gfp_t gfp_mask)
+static inline int mem_cgroup_charge(struct folio *folio,
+               struct mm_struct *mm, gfp_t gfp)
 {
        return 0;
 }
@@ -1195,7 +1240,7 @@ static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
 {
 }
 
-static inline void mem_cgroup_uncharge(struct page *page)
+static inline void mem_cgroup_uncharge(struct folio *folio)
 {
 }
 
@@ -1203,7 +1248,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
-static inline void mem_cgroup_migrate(struct page *old, struct page *new)
+static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
 
@@ -1213,14 +1258,14 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
        return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec(struct folio *folio)
 {
-       pg_data_t *pgdat = page_pgdat(page);
-
+       struct pglist_data *pgdat = folio_pgdat(folio);
        return &pgdat->__lruvec;
 }
 
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
 }
 
@@ -1250,26 +1295,26 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 
-static inline struct lruvec *lock_page_lruvec(struct page *page)
+static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
 {
-       struct pglist_data *pgdat = page_pgdat(page);
+       struct pglist_data *pgdat = folio_pgdat(folio);
 
        spin_lock(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
 {
-       struct pglist_data *pgdat = page_pgdat(page);
+       struct pglist_data *pgdat = folio_pgdat(folio);
 
        spin_lock_irq(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                unsigned long *flagsp)
 {
-       struct pglist_data *pgdat = page_pgdat(page);
+       struct pglist_data *pgdat = folio_pgdat(folio);
 
        spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
        return &pgdat->__lruvec;
@@ -1356,6 +1401,14 @@ static inline void unlock_page_memcg(struct page *page)
 {
 }
 
+static inline void folio_memcg_lock(struct folio *folio)
+{
+}
+
+static inline void folio_memcg_unlock(struct folio *folio)
+{
+}
+
 static inline void mem_cgroup_handle_over_high(void)
 {
 }
@@ -1517,38 +1570,39 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
 }
 
 /* Test requires a stable page->memcg binding, see page_memcg() */
-static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec)
+static inline bool folio_matches_lruvec(struct folio *folio,
+               struct lruvec *lruvec)
 {
-       return lruvec_pgdat(lruvec) == page_pgdat(page) &&
-              lruvec_memcg(lruvec) == page_memcg(page);
+       return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
+              lruvec_memcg(lruvec) == folio_memcg(folio);
 }
 
 /* Don't lock again iff page's lruvec locked */
-static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
+static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
                struct lruvec *locked_lruvec)
 {
        if (locked_lruvec) {
-               if (page_matches_lruvec(page, locked_lruvec))
+               if (folio_matches_lruvec(folio, locked_lruvec))
                        return locked_lruvec;
 
                unlock_page_lruvec_irq(locked_lruvec);
        }
 
-       return lock_page_lruvec_irq(page);
+       return folio_lruvec_lock_irq(folio);
 }
 
 /* Don't lock again iff page's lruvec locked */
-static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
+static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio,
                struct lruvec *locked_lruvec, unsigned long *flags)
 {
        if (locked_lruvec) {
-               if (page_matches_lruvec(page, locked_lruvec))
+               if (folio_matches_lruvec(folio, locked_lruvec))
                        return locked_lruvec;
 
                unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
        }
 
-       return lock_page_lruvec_irqsave(page, flags);
+       return folio_lruvec_lock_irqsave(folio, flags);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -1558,17 +1612,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback);
 
-void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb);
 
-static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
 {
        if (mem_cgroup_disabled())
                return;
 
-       if (unlikely(&page_memcg(page)->css != wb->memcg_css))
-               mem_cgroup_track_foreign_dirty_slowpath(page, wb);
+       if (unlikely(&folio_memcg(folio)->css != wb->memcg_css))
+               mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
 }
 
 void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
@@ -1588,7 +1642,7 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 {
 }
 
-static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
 {
 }
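The relock helpers above exist to amortise lru_lock acquisition while walking a list of folios: the lock is only dropped and retaken when the next folio belongs to a different lruvec. A minimal sketch of that pattern; my_walk_folios() is a hypothetical caller.

/* Sketch: keep the lru lock across folios that share a lruvec. */
static void my_walk_folios(struct list_head *folios)
{
	struct lruvec *lruvec = NULL;
	unsigned long flags;
	struct folio *folio;

	list_for_each_entry(folio, folios, lru) {
		lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
		/* ... operate on @folio under its lru lock ... */
	}
	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);
}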
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c8077e9..0d2aeb9 100644 (file)
@@ -57,6 +57,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
                                  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page, int extra_count);
+void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
+void folio_migrate_copy(struct folio *newfolio, struct folio *folio);
+int folio_migrate_mapping(struct address_space *mapping,
+               struct folio *newfolio, struct folio *folio, int extra_count);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 73a52ab..40ff114 100644 (file)
 struct mempolicy;
 struct anon_vma;
 struct anon_vma_chain;
-struct file_ra_state;
 struct user_struct;
-struct writeback_control;
-struct bdi_writeback;
 struct pt_regs;
 
 extern int sysctl_page_lock_unfairness;
@@ -216,13 +213,6 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
 int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
-/*
- * Any attempt to mark this function as static leads to build failure
- * when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked()
- * is referred to by BPF code. This must be visible for error injection.
- */
-int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-               pgoff_t index, gfp_t gfp, void **shadowp);
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
@@ -748,13 +738,18 @@ static inline int put_page_testzero(struct page *page)
        return page_ref_dec_and_test(page);
 }
 
+static inline int folio_put_testzero(struct folio *folio)
+{
+       return put_page_testzero(&folio->page);
+}
+
 /*
  * Try to grab a ref unless the page has a refcount of zero, return false if
  * that is the case.
  * This can be called when MMU is off so it must not access
  * any of the virtual mappings.
  */
-static inline int get_page_unless_zero(struct page *page)
+static inline bool get_page_unless_zero(struct page *page)
 {
        return page_ref_add_unless(page, 1, 0);
 }
@@ -907,7 +902,7 @@ void __put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
-void copy_huge_page(struct page *dst, struct page *src);
+void folio_copy(struct folio *dst, struct folio *src);
 
 /*
  * Compound pages have a destructor function.  Provide a
@@ -950,6 +945,20 @@ static inline unsigned int compound_order(struct page *page)
        return page[1].compound_order;
 }
 
+/**
+ * folio_order - The allocation order of a folio.
+ * @folio: The folio.
+ *
+ * A folio is composed of 2^order pages.  See get_order() for the definition
+ * of order.
+ *
+ * Return: The order of the folio.
+ */
+static inline unsigned int folio_order(struct folio *folio)
+{
+       return compound_order(&folio->page);
+}
+
 static inline bool hpage_pincount_available(struct page *page)
 {
        /*
@@ -1131,6 +1140,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
+static inline enum zone_type folio_zonenum(const struct folio *folio)
+{
+       return page_zonenum(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 static inline bool is_zone_device_page(const struct page *page)
 {
@@ -1200,18 +1214,26 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 }
 
 /* 127: arbitrary random number, small enough to assemble well */
-#define page_ref_zero_or_close_to_overflow(page) \
-       ((unsigned int) page_ref_count(page) + 127u <= 127u)
+#define folio_ref_zero_or_close_to_overflow(folio) \
+       ((unsigned int) folio_ref_count(folio) + 127u <= 127u)
+
+/**
+ * folio_get - Increment the reference count on a folio.
+ * @folio: The folio.
+ *
+ * Context: May be called in any context, as long as you know that
+ * you have a refcount on the folio.  If you do not already have one,
+ * folio_try_get() may be the right interface for you to use.
+ */
+static inline void folio_get(struct folio *folio)
+{
+       VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
+       folio_ref_inc(folio);
+}
 
 static inline void get_page(struct page *page)
 {
-       page = compound_head(page);
-       /*
-        * Getting a normal page or the head of a compound page
-        * requires to already have an elevated page->_refcount.
-        */
-       VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
-       page_ref_inc(page);
+       folio_get(page_folio(page));
 }
 
 bool __must_check try_grab_page(struct page *page, unsigned int flags);
@@ -1228,9 +1250,28 @@ static inline __must_check bool try_get_page(struct page *page)
        return true;
 }
 
+/**
+ * folio_put - Decrement the reference count on a folio.
+ * @folio: The folio.
+ *
+ * If the folio's reference count reaches zero, the memory will be
+ * released back to the page allocator and may be used by another
+ * allocation immediately.  Do not access the memory or the struct folio
+ * after calling folio_put() unless you can be sure that it wasn't the
+ * last reference.
+ *
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context.  May be called while holding a spinlock.
+ */
+static inline void folio_put(struct folio *folio)
+{
+       if (folio_put_testzero(folio))
+               __put_page(&folio->page);
+}
+
 static inline void put_page(struct page *page)
 {
-       page = compound_head(page);
+       struct folio *folio = page_folio(page);
 
        /*
         * For devmap managed pages we need to catch refcount transition from
@@ -1238,13 +1279,12 @@ static inline void put_page(struct page *page)
         * need to inform the device driver through callback. See
         * include/linux/memremap.h and HMM for details.
         */
-       if (page_is_devmap_managed(page)) {
-               put_devmap_managed_page(page);
+       if (page_is_devmap_managed(&folio->page)) {
+               put_devmap_managed_page(&folio->page);
                return;
        }
 
-       if (put_page_testzero(page))
-               __put_page(page);
+       folio_put(folio);
 }
 
 /*
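A short usage note on the folio_get()/folio_put() pair added above; the surrounding function is hypothetical.

/* Sketch: hold a folio reference across an operation, then drop it. */
static void my_process_folio(struct folio *folio)
{
	folio_get(folio);	/* caller must already hold a reference */
	/* ... may sleep or hand the folio to another context ... */
	folio_put(folio);	/* may free the folio; do not touch it afterwards */
}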
@@ -1379,6 +1419,11 @@ static inline int page_to_nid(const struct page *page)
 }
 #endif
 
+static inline int folio_nid(const struct folio *folio)
+{
+       return page_to_nid(&folio->page);
+}
+
 #ifdef CONFIG_NUMA_BALANCING
 static inline int cpu_pid_to_cpupid(int cpu, int pid)
 {
@@ -1546,6 +1591,16 @@ static inline pg_data_t *page_pgdat(const struct page *page)
        return NODE_DATA(page_to_nid(page));
 }
 
+static inline struct zone *folio_zone(const struct folio *folio)
+{
+       return page_zone(&folio->page);
+}
+
+static inline pg_data_t *folio_pgdat(const struct folio *folio)
+{
+       return page_pgdat(&folio->page);
+}
+
 #ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
@@ -1559,6 +1614,20 @@ static inline unsigned long page_to_section(const struct page *page)
 }
 #endif
 
+/**
+ * folio_pfn - Return the Page Frame Number of a folio.
+ * @folio: The folio.
+ *
+ * A folio may contain multiple pages.  The pages have consecutive
+ * Page Frame Numbers.
+ *
+ * Return: The Page Frame Number of the first page in the folio.
+ */
+static inline unsigned long folio_pfn(struct folio *folio)
+{
+       return page_to_pfn(&folio->page);
+}
+
 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
 #ifdef CONFIG_MIGRATION
 static inline bool is_pinnable_page(struct page *page)
@@ -1595,6 +1664,89 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
 #endif
 }
 
+/**
+ * folio_nr_pages - The number of pages in the folio.
+ * @folio: The folio.
+ *
+ * Return: A positive power of two.
+ */
+static inline long folio_nr_pages(struct folio *folio)
+{
+       return compound_nr(&folio->page);
+}
+
+/**
+ * folio_next - Move to the next physical folio.
+ * @folio: The folio we're currently operating on.
+ *
+ * If you have physically contiguous memory which may span more than
+ * one folio (eg a &struct bio_vec), use this function to move from one
+ * folio to the next.  Do not use it if the memory is only virtually
+ * contiguous as the folios are almost certainly not adjacent to each
+ * other.  This is the folio equivalent to writing ``page++``.
+ *
+ * Context: We assume that the folios are refcounted and/or locked at a
+ * higher level and do not adjust the reference counts.
+ * Return: The next struct folio.
+ */
+static inline struct folio *folio_next(struct folio *folio)
+{
+       return (struct folio *)folio_page(folio, folio_nr_pages(folio));
+}
+
+/**
+ * folio_shift - The size of the memory described by this folio.
+ * @folio: The folio.
+ *
+ * A folio represents a number of bytes which is a power-of-two in size.
+ * This function tells you which power-of-two the folio is.  See also
+ * folio_size() and folio_order().
+ *
+ * Context: The caller should have a reference on the folio to prevent
+ * it from being split.  It is not necessary for the folio to be locked.
+ * Return: The base-2 logarithm of the size of this folio.
+ */
+static inline unsigned int folio_shift(struct folio *folio)
+{
+       return PAGE_SHIFT + folio_order(folio);
+}
+
+/**
+ * folio_size - The number of bytes in a folio.
+ * @folio: The folio.
+ *
+ * Context: The caller should have a reference on the folio to prevent
+ * it from being split.  It is not necessary for the folio to be locked.
+ * Return: The number of bytes in this folio.
+ */
+static inline size_t folio_size(struct folio *folio)
+{
+       return PAGE_SIZE << folio_order(folio);
+}
+
+#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
+static inline int arch_make_page_accessible(struct page *page)
+{
+       return 0;
+}
+#endif
+
+#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
+static inline int arch_make_folio_accessible(struct folio *folio)
+{
+       int ret;
+       long i, nr = folio_nr_pages(folio);
+
+       for (i = 0; i < nr; i++) {
+               ret = arch_make_page_accessible(folio_page(folio, i));
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+#endif
+
 /*
  * Some inline functions in vmstat.h depend on page_zone()
  */
@@ -1635,19 +1787,6 @@ void page_address_init(void);
 
 extern void *page_rmapping(struct page *page);
 extern struct anon_vma *page_anon_vma(struct page *page);
-extern struct address_space *page_mapping(struct page *page);
-
-extern struct address_space *__page_file_mapping(struct page *);
-
-static inline
-struct address_space *page_file_mapping(struct page *page)
-{
-       if (unlikely(PageSwapCache(page)))
-               return __page_file_mapping(page);
-
-       return page->mapping;
-}
-
 extern pgoff_t __page_file_index(struct page *page);
 
 /*
@@ -1662,7 +1801,7 @@ static inline pgoff_t page_index(struct page *page)
 }
 
 bool page_mapped(struct page *page);
-struct address_space *page_mapping(struct page *page);
+bool folio_mapped(struct folio *folio);
 
 /*
  * Return true only if the page has been allocated with
@@ -1700,6 +1839,7 @@ extern void pagefault_out_of_memory(void);
 
 #define offset_in_page(p)      ((unsigned long)(p) & ~PAGE_MASK)
 #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
+#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
 
 /*
  * Flags passed to show_mem() and show_free_areas() to suppress output in
@@ -1854,20 +1994,9 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned int offset,
                              unsigned int length);
 
-int redirty_page_for_writepage(struct writeback_control *wbc,
-                               struct page *page);
-void account_page_cleaned(struct page *page, struct address_space *mapping,
-                         struct bdi_writeback *wb);
-int set_page_dirty(struct page *page);
+bool folio_mark_dirty(struct folio *folio);
+bool set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
-void __cancel_dirty_page(struct page *page);
-static inline void cancel_dirty_page(struct page *page)
-{
-       /* Avoid atomic ops, locking, etc. when not actually needed. */
-       if (PageDirty(page))
-               __cancel_dirty_page(page);
-}
-int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 
@@ -2659,10 +2788,6 @@ extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
 
-/* mm/page-writeback.c */
-int __must_check write_one_page(struct page *page);
-void task_dirty_inc(struct task_struct *tsk);
-
 extern unsigned long stack_guard_gap;
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
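To make the geometry helpers added above concrete (folio_order(), folio_shift(), folio_size(), folio_nr_pages(), folio_next()), a sketch that walks physically contiguous folios and sums the bytes they cover; the function name and the caller-supplied count are hypothetical.

/* Sketch: total bytes covered by @nr_folios physically contiguous folios. */
static size_t my_span_bytes(struct folio *first, unsigned long nr_folios)
{
	struct folio *folio = first;
	size_t bytes = 0;
	unsigned long i;

	for (i = 0; i < nr_folios; i++) {
		bytes += folio_size(folio);	/* PAGE_SIZE << folio_order() */
		folio = folio_next(folio);	/* valid only for contiguous memory */
	}
	return bytes;
}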
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 355ea1e..e2ec68b 100644 (file)
@@ -6,27 +6,33 @@
 #include <linux/swap.h>
 
 /**
- * page_is_file_lru - should the page be on a file LRU or anon LRU?
- * @page: the page to test
- *
- * Returns 1 if @page is a regular filesystem backed page cache page or a lazily
- * freed anonymous page (e.g. via MADV_FREE).  Returns 0 if @page is a normal
- * anonymous page, a tmpfs page or otherwise ram or swap backed page.  Used by
- * functions that manipulate the LRU lists, to sort a page onto the right LRU
- * list.
+ * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
+ * @folio: The folio to test.
  *
  * We would like to get this info without a page flag, but the state
- * needs to survive until the page is last deleted from the LRU, which
+ * needs to survive until the folio is last deleted from the LRU, which
  * could be as far down as __page_cache_release.
+ *
+ * Return: An integer (not a boolean!) used to sort a folio onto the
+ * right LRU list and to account folios correctly.
+ * 1 if @folio is a regular filesystem backed page cache folio
+ * or a lazily freed anonymous folio (e.g. via MADV_FREE).
+ * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
+ * ram or swap backed folio.
  */
+static inline int folio_is_file_lru(struct folio *folio)
+{
+       return !folio_test_swapbacked(folio);
+}
+
 static inline int page_is_file_lru(struct page *page)
 {
-       return !PageSwapBacked(page);
+       return folio_is_file_lru(page_folio(page));
 }
 
 static __always_inline void update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
-                               int nr_pages)
+                               long nr_pages)
 {
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
@@ -39,69 +45,94 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 }
 
 /**
- * __clear_page_lru_flags - clear page lru flags before releasing a page
- * @page: the page that was on lru and now has a zero reference
+ * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
+ * @folio: The folio that was on lru and now has a zero reference.
  */
-static __always_inline void __clear_page_lru_flags(struct page *page)
+static __always_inline void __folio_clear_lru_flags(struct folio *folio)
 {
-       VM_BUG_ON_PAGE(!PageLRU(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);
 
-       __ClearPageLRU(page);
+       __folio_clear_lru(folio);
 
        /* this shouldn't happen, so leave the flags to bad_page() */
-       if (PageActive(page) && PageUnevictable(page))
+       if (folio_test_active(folio) && folio_test_unevictable(folio))
                return;
 
-       __ClearPageActive(page);
-       __ClearPageUnevictable(page);
+       __folio_clear_active(folio);
+       __folio_clear_unevictable(folio);
+}
+
+static __always_inline void __clear_page_lru_flags(struct page *page)
+{
+       __folio_clear_lru_flags(page_folio(page));
 }
 
 /**
- * page_lru - which LRU list should a page be on?
- * @page: the page to test
+ * folio_lru_list - Which LRU list should a folio be on?
+ * @folio: The folio to test.
  *
- * Returns the LRU list a page should be on, as an index
+ * Return: The LRU list a folio should be on, as an index
  * into the array of LRU lists.
  */
-static __always_inline enum lru_list page_lru(struct page *page)
+static __always_inline enum lru_list folio_lru_list(struct folio *folio)
 {
        enum lru_list lru;
 
-       VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+       VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
 
-       if (PageUnevictable(page))
+       if (folio_test_unevictable(folio))
                return LRU_UNEVICTABLE;
 
-       lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
-       if (PageActive(page))
+       lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+       if (folio_test_active(folio))
                lru += LRU_ACTIVE;
 
        return lru;
 }
 
+static __always_inline
+void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
+{
+       enum lru_list lru = folio_lru_list(folio);
+
+       update_lru_size(lruvec, lru, folio_zonenum(folio),
+                       folio_nr_pages(folio));
+       list_add(&folio->lru, &lruvec->lists[lru]);
+}
+
 static __always_inline void add_page_to_lru_list(struct page *page,
                                struct lruvec *lruvec)
 {
-       enum lru_list lru = page_lru(page);
+       lruvec_add_folio(lruvec, page_folio(page));
+}
 
-       update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-       list_add(&page->lru, &lruvec->lists[lru]);
+static __always_inline
+void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
+{
+       enum lru_list lru = folio_lru_list(folio);
+
+       update_lru_size(lruvec, lru, folio_zonenum(folio),
+                       folio_nr_pages(folio));
+       list_add_tail(&folio->lru, &lruvec->lists[lru]);
 }
 
 static __always_inline void add_page_to_lru_list_tail(struct page *page,
                                struct lruvec *lruvec)
 {
-       enum lru_list lru = page_lru(page);
+       lruvec_add_folio_tail(lruvec, page_folio(page));
+}
 
-       update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-       list_add_tail(&page->lru, &lruvec->lists[lru]);
+static __always_inline
+void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
+{
+       list_del(&folio->lru);
+       update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio),
+                       -folio_nr_pages(folio));
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
                                struct lruvec *lruvec)
 {
-       list_del(&page->lru);
-       update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-                       -thp_nr_pages(page));
+       lruvec_del_folio(lruvec, page_folio(page));
 }
 #endif
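The lruvec helpers above bundle list manipulation with LRU size accounting, so moving a folio between positions is a del/add pair. A minimal sketch, assuming the caller holds the lruvec lock and the folio is on an LRU list; my_rotate_folio() is hypothetical.

/* Sketch: rotate a folio to the tail of its LRU list (lruvec lock held). */
static void my_rotate_folio(struct lruvec *lruvec, struct folio *folio)
{
	lruvec_del_folio(lruvec, folio);
	lruvec_add_folio_tail(lruvec, folio);
}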
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09..82dab23 100644 (file)
@@ -239,6 +239,72 @@ struct page {
 #endif
 } _struct_page_alignment;
 
+/**
+ * struct folio - Represents a contiguous set of bytes.
+ * @flags: Identical to the page flags.
+ * @lru: Least Recently Used list; tracks how recently this folio was used.
+ * @mapping: The file this page belongs to, or refers to the anon_vma for
+ *    anonymous memory.
+ * @index: Offset within the file, in units of pages.  For anonymous memory,
+ *    this is the index from the beginning of the mmap.
+ * @private: Filesystem per-folio data (see folio_attach_private()).
+ *    Used for swp_entry_t if folio_test_swapcache().
+ * @_mapcount: Do not access this member directly.  Use folio_mapcount() to
+ *    find out how many times this folio is mapped by userspace.
+ * @_refcount: Do not access this member directly.  Use folio_ref_count()
+ *    to find how many references there are to this folio.
+ * @memcg_data: Memory Control Group data.
+ *
+ * A folio is a physically, virtually and logically contiguous set
+ * of bytes.  It is a power-of-two in size, and it is aligned to that
+ * same power-of-two.  It is at least as large as %PAGE_SIZE.  If it is
+ * in the page cache, it is at a file offset which is a multiple of that
+ * power-of-two.  It may be mapped into userspace at an address which is
+ * at an arbitrary page offset, but its kernel virtual address is aligned
+ * to its size.
+ */
+struct folio {
+       /* private: don't document the anon union */
+       union {
+               struct {
+       /* public: */
+                       unsigned long flags;
+                       struct list_head lru;
+                       struct address_space *mapping;
+                       pgoff_t index;
+                       void *private;
+                       atomic_t _mapcount;
+                       atomic_t _refcount;
+#ifdef CONFIG_MEMCG
+                       unsigned long memcg_data;
+#endif
+       /* private: the union with struct page is transitional */
+               };
+               struct page page;
+       };
+};
+
+static_assert(sizeof(struct page) == sizeof(struct folio));
+#define FOLIO_MATCH(pg, fl)                                            \
+       static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl))
+FOLIO_MATCH(flags, flags);
+FOLIO_MATCH(lru, lru);
+FOLIO_MATCH(compound_head, lru);
+FOLIO_MATCH(index, index);
+FOLIO_MATCH(private, private);
+FOLIO_MATCH(_mapcount, _mapcount);
+FOLIO_MATCH(_refcount, _refcount);
+#ifdef CONFIG_MEMCG
+FOLIO_MATCH(memcg_data, memcg_data);
+#endif
+#undef FOLIO_MATCH
+
+static inline atomic_t *folio_mapcount_ptr(struct folio *folio)
+{
+       struct page *tail = &folio->page + 1;
+       return &tail->compound_mapcount;
+}
+
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
        return &page[1].compound_mapcount;
@@ -257,6 +323,12 @@ static inline atomic_t *compound_pincount_ptr(struct page *page)
 #define PAGE_FRAG_CACHE_MAX_SIZE       __ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER      get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 
+/*
+ * page_private can be used on tail pages.  However, PagePrivate is only
+ * checked by the VM on the head page.  So page_private on the tail pages
+ * should be used for data that's ancillary to the head page (eg attaching
+ * buffer heads to tail pages after attaching buffer heads to the head page)
+ */
 #define page_private(page)             ((page)->private)
 
 static inline void set_page_private(struct page *page, unsigned long private)
@@ -264,6 +336,11 @@ static inline void set_page_private(struct page *page, unsigned long private)
        page->private = private;
 }
 
+static inline void *folio_get_private(struct folio *folio)
+{
+       return folio->private;
+}
+
 struct page_frag_cache {
        void * va;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
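Because struct folio overlays struct page (enforced by the FOLIO_MATCH asserts above), its named fields can be used directly where page-based code went through page_private() and page->index. A small sketch; my_folio_fs_data() is hypothetical.

/* Sketch: per-folio filesystem data plus the folio's byte offset in its file. */
static void *my_folio_fs_data(struct folio *folio, loff_t *pos)
{
	*pos = (loff_t)folio->index << PAGE_SHIFT;	/* @index is in pages */
	return folio_get_private(folio);
}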
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0c0c9a0..52eae8c 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/mmc/card.h>
 #include <linux/mmc/pm.h>
 #include <linux/dma-direction.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 
 struct mmc_ios {
        unsigned int    clock;                  /* clock rate */
@@ -492,7 +492,7 @@ struct mmc_host {
 
        /* Inline encryption support */
 #ifdef CONFIG_MMC_CRYPTO
-       struct blk_keyslot_manager ksm;
+       struct blk_crypto_profile crypto_profile;
 #endif
 
        /* Host Software Queue support */
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 1935d4c..d7285f8 100644 (file)
@@ -22,6 +22,13 @@ void dump_mm(const struct mm_struct *mm);
                        BUG();                                          \
                }                                                       \
        } while (0)
+#define VM_BUG_ON_FOLIO(cond, folio)                                   \
+       do {                                                            \
+               if (unlikely(cond)) {                                   \
+                       dump_page(&folio->page, "VM_BUG_ON_FOLIO(" __stringify(cond)")");\
+                       BUG();                                          \
+               }                                                       \
+       } while (0)
 #define VM_BUG_ON_VMA(cond, vma)                                       \
        do {                                                            \
                if (unlikely(cond)) {                                   \
@@ -47,6 +54,17 @@ void dump_mm(const struct mm_struct *mm);
        }                                                               \
        unlikely(__ret_warn_once);                                      \
 })
+#define VM_WARN_ON_ONCE_FOLIO(cond, folio)     ({                      \
+       static bool __section(".data.once") __warned;                   \
+       int __ret_warn_once = !!(cond);                                 \
+                                                                       \
+       if (unlikely(__ret_warn_once && !__warned)) {                   \
+               dump_page(&folio->page, "VM_WARN_ON_ONCE_FOLIO(" __stringify(cond)")");\
+               __warned = true;                                        \
+               WARN_ON(1);                                             \
+       }                                                               \
+       unlikely(__ret_warn_once);                                      \
+})
 
 #define VM_WARN_ON(cond) (void)WARN_ON(cond)
 #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
@@ -55,11 +73,13 @@ void dump_mm(const struct mm_struct *mm);
 #else
 #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
+#define VM_BUG_ON_FOLIO(cond, folio) VM_BUG_ON(cond)
 #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
 #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_PAGE(cond, page)  BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #endif
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 5d6a415..12c4177 100644 (file)
@@ -22,6 +22,7 @@
  * Overload PG_private_2 to give us PG_fscache - this is used to indicate that
  * a page is currently backed by a local disk cache
  */
+#define folio_test_fscache(folio)      folio_test_private_2(folio)
 #define PageFsCache(page)              PagePrivate2((page))
 #define SetPageFsCache(page)           SetPagePrivate2((page))
 #define ClearPageFsCache(page)         ClearPagePrivate2((page))
 #define TestClearPageFsCache(page)     TestClearPagePrivate2((page))
 
 /**
- * set_page_fscache - Set PG_fscache on a page and take a ref
- * @page: The page.
+ * folio_start_fscache - Start an fscache write on a folio.
+ * @folio: The folio.
  *
- * Set the PG_fscache (PG_private_2) flag on a page and take the reference
- * needed for the VM to handle its lifetime correctly.  This sets the flag and
- * takes the reference unconditionally, so care must be taken not to set the
- * flag again if it's already set.
+ * Call this function before writing a folio to a local cache.  Starting a
+ * second write before the first one finishes is not allowed.
  */
-static inline void set_page_fscache(struct page *page)
+static inline void folio_start_fscache(struct folio *folio)
 {
-       set_page_private_2(page);
+       VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio);
+       folio_get(folio);
+       folio_set_private_2(folio);
 }
 
 /**
- * end_page_fscache - Clear PG_fscache and release any waiters
- * @page: The page
- *
- * Clear the PG_fscache (PG_private_2) bit on a page and wake up any sleepers
- * waiting for this.  The page ref held for PG_private_2 being set is released.
+ * folio_end_fscache - End an fscache write on a folio.
+ * @folio: The folio.
  *
- * This is, for example, used when a netfs page is being written to a local
- * disk cache, thereby allowing writes to the cache for the same page to be
- * serialised.
+ * Call this function after the folio has been written to the local cache.
+ * This will wake any sleepers waiting on this folio.
  */
-static inline void end_page_fscache(struct page *page)
+static inline void folio_end_fscache(struct folio *folio)
 {
-       end_page_private_2(page);
+       folio_end_private_2(folio);
 }
 
 /**
- * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_fscache - Wait for an fscache write on this folio to end.
+ * @folio: The folio.
  *
- * Wait for PG_fscache (aka PG_private_2) to be cleared on a page.
+ * If this folio is currently being written to a local cache, wait for
+ * the write to finish.  Another write may start after this one finishes,
+ * unless the caller holds the folio lock.
  */
-static inline void wait_on_page_fscache(struct page *page)
+static inline void folio_wait_fscache(struct folio *folio)
 {
-       wait_on_page_private_2(page);
+       folio_wait_private_2(folio);
 }
 
 /**
- * wait_on_page_fscache_killable - Wait for PG_fscache to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_fscache_killable - Wait for an fscache write on this folio to end.
+ * @folio: The folio.
  *
- * Wait for PG_fscache (aka PG_private_2) to be cleared on a page or until a
- * fatal signal is received by the calling task.
+ * If this folio is currently being written to a local cache, wait
+ * for the write to finish or for a fatal signal to be received.
+ * Another write may start after this one finishes, unless the caller
+ * holds the folio lock.
  *
  * Return:
  * - 0 if successful.
  * - -EINTR if a fatal signal was encountered.
  */
+static inline int folio_wait_fscache_killable(struct folio *folio)
+{
+       return folio_wait_private_2_killable(folio);
+}
+
+static inline void set_page_fscache(struct page *page)
+{
+       folio_start_fscache(page_folio(page));
+}
+
+static inline void end_page_fscache(struct page *page)
+{
+       folio_end_private_2(page_folio(page));
+}
+
+static inline void wait_on_page_fscache(struct page *page)
+{
+       folio_wait_private_2(page_folio(page));
+}
+
 static inline int wait_on_page_fscache_killable(struct page *page)
 {
-       return wait_on_page_private_2_killable(page);
+       return folio_wait_private_2_killable(page_folio(page));
 }
 
 enum netfs_read_source {
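A sketch of how a network filesystem brackets a cache write with the folio fscache helpers documented above; the submission and completion functions are hypothetical stand-ins for the filesystem's own I/O path.

/* Sketch: mark a folio as being written to the local cache, clear on completion. */
static void my_write_folio_to_cache(struct folio *folio)
{
	folio_start_fscache(folio);	/* takes a ref and sets PG_private_2 */
	my_submit_cache_write(folio);	/* hypothetical async write submission */
}

static void my_cache_write_done(struct folio *folio)
{
	folio_end_fscache(folio);	/* releases the ref and wakes any waiters */
}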
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 2a38f2b..cb909ed 100644 (file)
@@ -7,6 +7,7 @@
 #define _NVME_FC_DRIVER_H 1
 
 #include <linux/scatterlist.h>
+#include <linux/blk-mq.h>
 
 
 /*
@@ -497,6 +498,8 @@ struct nvme_fc_port_template {
        int     (*xmt_ls_rsp)(struct nvme_fc_local_port *localport,
                                struct nvme_fc_remote_port *rport,
                                struct nvmefc_ls_rsp *ls_rsp);
+       void    (*map_queues)(struct nvme_fc_local_port *localport,
+                             struct blk_mq_queue_map *map);
 
        u32     max_hw_queues;
        u16     max_sgl_segments;
@@ -779,6 +782,10 @@ struct nvmet_fc_target_port {
  *       LS received.
  *       Entrypoint is Mandatory.
  *
+ * @map_queues: This function lets the driver expose the queue mapping
+ *      to the block layer.
+ *       Entrypoint is Optional.
+ *
  * @fcp_op:  Called to perform a data transfer or transmit a response.
  *       The nvmefc_tgt_fcp_req structure is the same LLDD-supplied
  *       exchange structure specified in the nvmet_fc_rcv_fcp_req() call
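A sketch of an LLDD filling in the new optional map_queues entrypoint; with no device-specific IRQ affinity to report, deferring to the block layer's default mapping is a reasonable choice. The template and function names are hypothetical.

/* Sketch: default queue mapping for an FC LLDD. */
static void my_fc_map_queues(struct nvme_fc_local_port *localport,
			     struct blk_mq_queue_map *map)
{
	blk_mq_map_queues(map);		/* fall back to the generic mapping */
}

static struct nvme_fc_port_template my_fc_port_template = {
	/* ... mandatory entrypoints elided ... */
	.map_queues	= my_fc_map_queues,
};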
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index 3ec8e50..4dd7e6f 100644 (file)
@@ -6,6 +6,8 @@
 #ifndef _LINUX_NVME_RDMA_H
 #define _LINUX_NVME_RDMA_H
 
+#define NVME_RDMA_MAX_QUEUE_SIZE       128
+
 enum nvme_rdma_cm_fmt {
        NVME_RDMA_CM_FMT_1_0 = 0x0,
 };
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b7c4c41..855dd9b 100644 (file)
 #define NVME_NSID_ALL          0xffffffff
 
 enum nvme_subsys_type {
-       NVME_NQN_DISC   = 1,            /* Discovery type target subsystem */
-       NVME_NQN_NVME   = 2,            /* NVME type target subsystem */
+       /* Referral to another discovery type target subsystem */
+       NVME_NQN_DISC   = 1,
+
+       /* NVME type target subsystem */
+       NVME_NQN_NVME   = 2,
+
+       /* Current discovery type target subsystem */
+       NVME_NQN_CURR   = 3,
+};
+
+enum nvme_ctrl_type {
+       NVME_CTRL_IO    = 1,            /* I/O controller */
+       NVME_CTRL_DISC  = 2,            /* Discovery controller */
+       NVME_CTRL_ADMIN = 3,            /* Administrative controller */
 };
 
 /* Address Family codes for Discovery Log Page entry ADRFAM field */
@@ -244,7 +256,9 @@ struct nvme_id_ctrl {
        __le32                  rtd3e;
        __le32                  oaes;
        __le32                  ctratt;
-       __u8                    rsvd100[28];
+       __u8                    rsvd100[11];
+       __u8                    cntrltype;
+       __u8                    fguid[16];
        __le16                  crdt1;
        __le16                  crdt2;
        __le16                  crdt3;
@@ -312,6 +326,7 @@ struct nvme_id_ctrl {
 };
 
 enum {
+       NVME_CTRL_CMIC_MULTI_PORT               = 1 << 0,
        NVME_CTRL_CMIC_MULTI_CTRL               = 1 << 1,
        NVME_CTRL_CMIC_ANA                      = 1 << 3,
        NVME_CTRL_ONCS_COMPARE                  = 1 << 0,
@@ -1303,6 +1318,12 @@ struct nvmf_common_command {
 
 #define MAX_DISC_LOGS  255
 
+/* Discovery log page entry flags (EFLAGS): */
+enum {
+       NVME_DISC_EFLAGS_EPCSD          = (1 << 1),
+       NVME_DISC_EFLAGS_DUPRETINFO     = (1 << 0),
+};
+
 /* Discovery log page entry */
 struct nvmf_disc_rsp_page_entry {
        __u8            trtype;
@@ -1312,7 +1333,8 @@ struct nvmf_disc_rsp_page_entry {
        __le16          portid;
        __le16          cntlid;
        __le16          asqsz;
-       __u8            resv8[22];
+       __le16          eflags;
+       __u8            resv10[20];
        char            trsvcid[NVMF_TRSVCID_SIZE];
        __u8            resv64[192];
        char            subnqn[NVMF_NQN_FIELD_LEN];
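A sketch of consuming the new fields: testing an EFLAGS bit in a discovery log entry and the controller type reported in the Identify Controller data. The helper names are hypothetical.

/* Sketch: test the EPCSD flag of a discovery log page entry. */
static bool my_entry_has_epcsd(const struct nvmf_disc_rsp_page_entry *e)
{
	return le16_to_cpu(e->eflags) & NVME_DISC_EFLAGS_EPCSD;
}

/* Sketch: is this controller a discovery controller? */
static bool my_ctrl_is_discovery(const struct nvme_id_ctrl *id)
{
	return id->cntrltype == NVME_CTRL_DISC;
}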
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a558d67..d8623d6 100644 (file)
@@ -143,6 +143,8 @@ enum pageflags {
 #endif
        __NR_PAGEFLAGS,
 
+       PG_readahead = PG_reclaim,
+
        /* Filesystems */
        PG_checked = PG_owner_priv_1,
 
@@ -171,6 +173,15 @@ enum pageflags {
        /* Compound pages. Stored in first tail page's flags */
        PG_double_map = PG_workingset,
 
+#ifdef CONFIG_MEMORY_FAILURE
+       /*
+        * Compound pages. Stored in first tail page's flags.
+        * Indicates that at least one subpage is hwpoisoned in the
+        * THP.
+        */
+       PG_has_hwpoisoned = PG_mappedtodisk,
+#endif
+
        /* non-lru isolated movable page */
        PG_isolated = PG_reclaim,
 
@@ -193,6 +204,34 @@ static inline unsigned long _compound_head(const struct page *page)
 
 #define compound_head(page)    ((typeof(page))_compound_head(page))
 
+/**
+ * page_folio - Converts from page to folio.
+ * @p: The page.
+ *
+ * Every page is part of a folio.  This function cannot be called on a
+ * NULL pointer.
+ *
+ * Context: Neither a reference nor a lock is required on @p.  If the caller
+ * does not hold a reference, this call may race with a folio split, so
+ * it should re-check the folio still contains this page after gaining
+ * a reference on the folio.
+ * Return: The folio which contains this page.
+ */
+#define page_folio(p)          (_Generic((p),                          \
+       const struct page *:    (const struct folio *)_compound_head(p), \
+       struct page *:          (struct folio *)_compound_head(p)))
+
+/**
+ * folio_page - Return a page from a folio.
+ * @folio: The folio.
+ * @n: The page number to return.
+ *
+ * @n is relative to the start of the folio.  This function does not
+ * check that the page number lies within @folio; the caller is presumed
+ * to have a reference to the page.
+ */
+#define folio_page(folio, n)   nth_page(&(folio)->page, n)
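A minimal sketch of the lookup-then-recheck pattern that the page_folio() kernel-doc above calls for; the helper name is hypothetical, folio_try_get() is added further down in page_ref.h, and folio_put() lives in <linux/mm.h>.

#include <linux/mm.h>

/* Hypothetical: pin the folio that currently contains @page. */
static struct folio *example_pin_folio(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (!folio_try_get(folio))
                return NULL;

        /* The folio may have been split after we looked it up. */
        if (unlikely(page_folio(page) != folio)) {
                folio_put(folio);
                return NULL;
        }
        return folio;
}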
+
 static __always_inline int PageTail(struct page *page)
 {
        return READ_ONCE(page->compound_head) & 1;
@@ -217,6 +256,15 @@ static inline void page_init_poison(struct page *page, size_t size)
 }
 #endif
 
+static unsigned long *folio_flags(struct folio *folio, unsigned n)
+{
+       struct page *page = &folio->page;
+
+       VM_BUG_ON_PGFLAGS(PageTail(page), page);
+       VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
+       return &page[n].flags;
+}
+
 /*
  * Page flags policies wrt compound pages
  *
@@ -261,36 +309,64 @@ static inline void page_init_poison(struct page *page, size_t size)
                VM_BUG_ON_PGFLAGS(!PageHead(page), page);               \
                PF_POISONED_CHECK(&page[1]); })
 
+/* Which page is the flag stored in */
+#define FOLIO_PF_ANY           0
+#define FOLIO_PF_HEAD          0
+#define FOLIO_PF_ONLY_HEAD     0
+#define FOLIO_PF_NO_TAIL       0
+#define FOLIO_PF_NO_COMPOUND   0
+#define FOLIO_PF_SECOND                1
+
 /*
  * Macros to create function definitions for page flags
  */
 #define TESTPAGEFLAG(uname, lname, policy)                             \
+static __always_inline bool folio_test_##lname(struct folio *folio)    \
+{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }   \
 static __always_inline int Page##uname(struct page *page)              \
-       { return test_bit(PG_##lname, &policy(page, 0)->flags); }
+{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
 
 #define SETPAGEFLAG(uname, lname, policy)                              \
+static __always_inline                                                 \
+void folio_set_##lname(struct folio *folio)                            \
+{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }           \
 static __always_inline void SetPage##uname(struct page *page)          \
-       { set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define CLEARPAGEFLAG(uname, lname, policy)                            \
+static __always_inline                                                 \
+void folio_clear_##lname(struct folio *folio)                          \
+{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }         \
 static __always_inline void ClearPage##uname(struct page *page)                \
-       { clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define __SETPAGEFLAG(uname, lname, policy)                            \
+static __always_inline                                                 \
+void __folio_set_##lname(struct folio *folio)                          \
+{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }         \
 static __always_inline void __SetPage##uname(struct page *page)                \
-       { __set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define __CLEARPAGEFLAG(uname, lname, policy)                          \
+static __always_inline                                                 \
+void __folio_clear_##lname(struct folio *folio)                                \
+{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }       \
 static __always_inline void __ClearPage##uname(struct page *page)      \
-       { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define TESTSETFLAG(uname, lname, policy)                              \
+static __always_inline                                                 \
+bool folio_test_set_##lname(struct folio *folio)                       \
+{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
 static __always_inline int TestSetPage##uname(struct page *page)       \
-       { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
 
 #define TESTCLEARFLAG(uname, lname, policy)                            \
+static __always_inline                                                 \
+bool folio_test_clear_##lname(struct folio *folio)                     \
+{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
 static __always_inline int TestClearPage##uname(struct page *page)     \
-       { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
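For orientation, roughly what one invocation of these generators produces, using the existing Dirty flag (TESTPAGEFLAG(Dirty, dirty, PF_HEAD)); the expansion below is paraphrased rather than copied from preprocessor output.

static __always_inline bool folio_test_dirty(struct folio *folio)
{
        /* FOLIO_PF_HEAD == 0: the bit lives in the folio's first page */
        return test_bit(PG_dirty, folio_flags(folio, FOLIO_PF_HEAD));
}

static __always_inline int PageDirty(struct page *page)
{
        /* PF_HEAD() redirects a tail page to its head page first */
        return test_bit(PG_dirty, &PF_HEAD(page, 0)->flags);
}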
 
 #define PAGEFLAG(uname, lname, policy)                                 \
        TESTPAGEFLAG(uname, lname, policy)                              \
@@ -306,29 +382,37 @@ static __always_inline int TestClearPage##uname(struct page *page)        \
        TESTSETFLAG(uname, lname, policy)                               \
        TESTCLEARFLAG(uname, lname, policy)
 
-#define TESTPAGEFLAG_FALSE(uname)                                      \
+#define TESTPAGEFLAG_FALSE(uname, lname)                               \
+static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \
 static inline int Page##uname(const struct page *page) { return 0; }
 
-#define SETPAGEFLAG_NOOP(uname)                                                \
+#define SETPAGEFLAG_NOOP(uname, lname)                                 \
+static inline void folio_set_##lname(struct folio *folio) { }          \
 static inline void SetPage##uname(struct page *page) {  }
 
-#define CLEARPAGEFLAG_NOOP(uname)                                      \
+#define CLEARPAGEFLAG_NOOP(uname, lname)                               \
+static inline void folio_clear_##lname(struct folio *folio) { }                \
 static inline void ClearPage##uname(struct page *page) {  }
 
-#define __CLEARPAGEFLAG_NOOP(uname)                                    \
+#define __CLEARPAGEFLAG_NOOP(uname, lname)                             \
+static inline void __folio_clear_##lname(struct folio *folio) { }      \
 static inline void __ClearPage##uname(struct page *page) {  }
 
-#define TESTSETFLAG_FALSE(uname)                                       \
+#define TESTSETFLAG_FALSE(uname, lname)                                        \
+static inline bool folio_test_set_##lname(struct folio *folio)         \
+{ return 0; }                                                          \
 static inline int TestSetPage##uname(struct page *page) { return 0; }
 
-#define TESTCLEARFLAG_FALSE(uname)                                     \
+#define TESTCLEARFLAG_FALSE(uname, lname)                              \
+static inline bool folio_test_clear_##lname(struct folio *folio)       \
+{ return 0; }                                                          \
 static inline int TestClearPage##uname(struct page *page) { return 0; }
 
-#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname)                        \
-       SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
+#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname)  \
+       SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname)
 
-#define TESTSCFLAG_FALSE(uname)                                                \
-       TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
+#define TESTSCFLAG_FALSE(uname, lname)                                 \
+       TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
 PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
@@ -384,8 +468,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
 PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
        TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
-PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
-       TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND)
+       TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND)
 
 #ifdef CONFIG_HIGHMEM
 /*
@@ -394,22 +478,25 @@ PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
  */
 #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
 #else
-PAGEFLAG_FALSE(HighMem)
+PAGEFLAG_FALSE(HighMem, highmem)
 #endif
 
 #ifdef CONFIG_SWAP
-static __always_inline int PageSwapCache(struct page *page)
+static __always_inline bool folio_test_swapcache(struct folio *folio)
 {
-#ifdef CONFIG_THP_SWAP
-       page = compound_head(page);
-#endif
-       return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags);
+       return folio_test_swapbacked(folio) &&
+                       test_bit(PG_swapcache, folio_flags(folio, 0));
+}
 
+static __always_inline bool PageSwapCache(struct page *page)
+{
+       return folio_test_swapcache(page_folio(page));
 }
+
 SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
 CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
 #else
-PAGEFLAG_FALSE(SwapCache)
+PAGEFLAG_FALSE(SwapCache, swapcache)
 #endif
 
 PAGEFLAG(Unevictable, unevictable, PF_HEAD)
@@ -421,14 +508,14 @@ PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
 #else
-PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
-       TESTSCFLAG_FALSE(Mlocked)
+PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked)
+       TESTSCFLAG_FALSE(Mlocked, mlocked)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
 PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
 #else
-PAGEFLAG_FALSE(Uncached)
+PAGEFLAG_FALSE(Uncached, uncached)
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
@@ -437,7 +524,7 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define __PG_HWPOISON (1UL << PG_hwpoison)
 extern bool take_page_off_buddy(struct page *page);
 #else
-PAGEFLAG_FALSE(HWPoison)
+PAGEFLAG_FALSE(HWPoison, hwpoison)
 #define __PG_HWPOISON 0
 #endif
 
@@ -451,7 +538,7 @@ PAGEFLAG(Idle, idle, PF_ANY)
 #ifdef CONFIG_KASAN_HW_TAGS
 PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
 #else
-PAGEFLAG_FALSE(SkipKASanPoison)
+PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
 #endif
 
 /*
@@ -489,10 +576,14 @@ static __always_inline int PageMappingFlags(struct page *page)
        return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
 }
 
-static __always_inline int PageAnon(struct page *page)
+static __always_inline bool folio_test_anon(struct folio *folio)
+{
+       return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
+static __always_inline bool PageAnon(struct page *page)
 {
-       page = compound_head(page);
-       return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+       return folio_test_anon(page_folio(page));
 }
 
 static __always_inline int __PageMovable(struct page *page)
@@ -508,30 +599,32 @@ static __always_inline int __PageMovable(struct page *page)
  * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
  * anon_vma, but to that page's node of the stable tree.
  */
-static __always_inline int PageKsm(struct page *page)
+static __always_inline bool folio_test_ksm(struct folio *folio)
 {
-       page = compound_head(page);
-       return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
+       return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
                                PAGE_MAPPING_KSM;
 }
+
+static __always_inline bool PageKsm(struct page *page)
+{
+       return folio_test_ksm(page_folio(page));
+}
 #else
-TESTPAGEFLAG_FALSE(Ksm)
+TESTPAGEFLAG_FALSE(Ksm, ksm)
 #endif
 
 u64 stable_page_flags(struct page *page);
 
-static inline int PageUptodate(struct page *page)
+static inline bool folio_test_uptodate(struct folio *folio)
 {
-       int ret;
-       page = compound_head(page);
-       ret = test_bit(PG_uptodate, &(page)->flags);
+       bool ret = test_bit(PG_uptodate, folio_flags(folio, 0));
        /*
-        * Must ensure that the data we read out of the page is loaded
-        * _after_ we've loaded page->flags to check for PageUptodate.
-        * We can skip the barrier if the page is not uptodate, because
+        * Must ensure that the data we read out of the folio is loaded
+        * _after_ we've loaded folio->flags to check the uptodate bit.
+        * We can skip the barrier if the folio is not uptodate, because
         * we wouldn't be reading anything from it.
         *
-        * See SetPageUptodate() for the other side of the story.
+        * See folio_mark_uptodate() for the other side of the story.
         */
        if (ret)
                smp_rmb();
@@ -539,47 +632,71 @@ static inline int PageUptodate(struct page *page)
        return ret;
 }
 
-static __always_inline void __SetPageUptodate(struct page *page)
+static inline int PageUptodate(struct page *page)
+{
+       return folio_test_uptodate(page_folio(page));
+}
+
+static __always_inline void __folio_mark_uptodate(struct folio *folio)
 {
-       VM_BUG_ON_PAGE(PageTail(page), page);
        smp_wmb();
-       __set_bit(PG_uptodate, &page->flags);
+       __set_bit(PG_uptodate, folio_flags(folio, 0));
 }
 
-static __always_inline void SetPageUptodate(struct page *page)
+static __always_inline void folio_mark_uptodate(struct folio *folio)
 {
-       VM_BUG_ON_PAGE(PageTail(page), page);
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
-        * so that all previous stores issued in order to bring the page
-        * uptodate are actually visible before PageUptodate becomes true.
+        * so that all previous stores issued in order to bring the folio
+        * uptodate are actually visible before folio_test_uptodate becomes true.
         */
        smp_wmb();
-       set_bit(PG_uptodate, &page->flags);
+       set_bit(PG_uptodate, folio_flags(folio, 0));
+}
+
+static __always_inline void __SetPageUptodate(struct page *page)
+{
+       __folio_mark_uptodate((struct folio *)page);
+}
+
+static __always_inline void SetPageUptodate(struct page *page)
+{
+       folio_mark_uptodate((struct folio *)page);
 }
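A sketch of the write-then-publish ordering these barriers enforce, as it might appear in a read-completion path; the handler name is hypothetical, while folio_mark_uptodate() and folio_unlock() are the helpers introduced in these headers.

/* Hypothetical completion handler for a freshly filled folio. */
static void example_read_folio_done(struct folio *folio, int err)
{
        if (!err) {
                /*
                 * All stores that filled the folio are ordered before the
                 * smp_wmb() in folio_mark_uptodate(), so any reader that
                 * sees the uptodate bit also sees the data.
                 */
                folio_mark_uptodate(folio);
        }
        folio_unlock(folio);
}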
 
 CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
 
-int test_clear_page_writeback(struct page *page);
-int __test_set_page_writeback(struct page *page, bool keep_write);
+bool __folio_start_writeback(struct folio *folio, bool keep_write);
+bool set_page_writeback(struct page *page);
 
-#define test_set_page_writeback(page)                  \
-       __test_set_page_writeback(page, false)
-#define test_set_page_writeback_keepwrite(page)        \
-       __test_set_page_writeback(page, true)
+#define folio_start_writeback(folio)                   \
+       __folio_start_writeback(folio, false)
+#define folio_start_writeback_keepwrite(folio) \
+       __folio_start_writeback(folio, true)
 
-static inline void set_page_writeback(struct page *page)
+static inline void set_page_writeback_keepwrite(struct page *page)
 {
-       test_set_page_writeback(page);
+       folio_start_writeback_keepwrite(page_folio(page));
 }
 
-static inline void set_page_writeback_keepwrite(struct page *page)
+static inline bool test_set_page_writeback(struct page *page)
 {
-       test_set_page_writeback_keepwrite(page);
+       return set_page_writeback(page);
 }
 
 __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
 
+/* Whether there are one or multiple pages in a folio */
+static inline bool folio_test_single(struct folio *folio)
+{
+       return !folio_test_head(folio);
+}
+
+static inline bool folio_test_multi(struct folio *folio)
+{
+       return folio_test_head(folio);
+}
+
 static __always_inline void set_compound_head(struct page *page, struct page *head)
 {
        WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
@@ -603,12 +720,15 @@ static inline void ClearPageCompound(struct page *page)
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
 int PageHeadHuge(struct page *page);
+static inline bool folio_test_hugetlb(struct folio *folio)
+{
+       return PageHeadHuge(&folio->page);
+}
 #else
-TESTPAGEFLAG_FALSE(Huge)
-TESTPAGEFLAG_FALSE(HeadHuge)
+TESTPAGEFLAG_FALSE(Huge, hugetlb)
+TESTPAGEFLAG_FALSE(HeadHuge, headhuge)
 #endif
 
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * PageHuge() only returns true for hugetlbfs pages, but not for
@@ -624,6 +744,11 @@ static inline int PageTransHuge(struct page *page)
        return PageHead(page);
 }
 
+static inline bool folio_test_transhuge(struct folio *folio)
+{
+       return folio_test_head(folio);
+}
+
 /*
  * PageTransCompound returns true for both transparent huge pages
  * and hugetlbfs pages, so it should only be called when it's known
@@ -660,12 +785,26 @@ static inline int PageTransTail(struct page *page)
 PAGEFLAG(DoubleMap, double_map, PF_SECOND)
        TESTSCFLAG(DoubleMap, double_map, PF_SECOND)
 #else
-TESTPAGEFLAG_FALSE(TransHuge)
-TESTPAGEFLAG_FALSE(TransCompound)
-TESTPAGEFLAG_FALSE(TransCompoundMap)
-TESTPAGEFLAG_FALSE(TransTail)
-PAGEFLAG_FALSE(DoubleMap)
-       TESTSCFLAG_FALSE(DoubleMap)
+TESTPAGEFLAG_FALSE(TransHuge, transhuge)
+TESTPAGEFLAG_FALSE(TransCompound, transcompound)
+TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap)
+TESTPAGEFLAG_FALSE(TransTail, transtail)
+PAGEFLAG_FALSE(DoubleMap, double_map)
+       TESTSCFLAG_FALSE(DoubleMap, double_map)
+#endif
+
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+/*
+ * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the
+ * compound page.
+ *
+ * This flag is set by hwpoison handler.  Cleared by THP split or free page.
+ */
+PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
+       TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
+#else
+PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
+	TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
 #endif
 
 /*
@@ -849,6 +988,11 @@ static inline int page_has_private(struct page *page)
        return !!(page->flags & PAGE_FLAGS_PRIVATE);
 }
 
+static inline bool folio_has_private(struct folio *folio)
+{
+       return page_has_private(&folio->page);
+}
+
 #undef PF_ANY
 #undef PF_HEAD
 #undef PF_ONLY_HEAD
index d8a6aec..83abf95 100644 (file)
@@ -8,46 +8,16 @@
 
 #ifdef CONFIG_PAGE_IDLE_FLAG
 
-#ifdef CONFIG_64BIT
-static inline bool page_is_young(struct page *page)
-{
-       return PageYoung(page);
-}
-
-static inline void set_page_young(struct page *page)
-{
-       SetPageYoung(page);
-}
-
-static inline bool test_and_clear_page_young(struct page *page)
-{
-       return TestClearPageYoung(page);
-}
-
-static inline bool page_is_idle(struct page *page)
-{
-       return PageIdle(page);
-}
-
-static inline void set_page_idle(struct page *page)
-{
-       SetPageIdle(page);
-}
-
-static inline void clear_page_idle(struct page *page)
-{
-       ClearPageIdle(page);
-}
-#else /* !CONFIG_64BIT */
+#ifndef CONFIG_64BIT
 /*
  * If there is not enough space to store Idle and Young bits in page flags, use
  * page ext flags instead.
  */
 extern struct page_ext_operations page_idle_ops;
 
-static inline bool page_is_young(struct page *page)
+static inline bool folio_test_young(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return false;
@@ -55,9 +25,9 @@ static inline bool page_is_young(struct page *page)
        return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline void set_page_young(struct page *page)
+static inline void folio_set_young(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return;
@@ -65,9 +35,9 @@ static inline void set_page_young(struct page *page)
        set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline bool test_and_clear_page_young(struct page *page)
+static inline bool folio_test_clear_young(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return false;
@@ -75,9 +45,9 @@ static inline bool test_and_clear_page_young(struct page *page)
        return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
-static inline bool page_is_idle(struct page *page)
+static inline bool folio_test_idle(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return false;
@@ -85,9 +55,9 @@ static inline bool page_is_idle(struct page *page)
        return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 
-static inline void set_page_idle(struct page *page)
+static inline void folio_set_idle(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return;
@@ -95,46 +65,75 @@ static inline void set_page_idle(struct page *page)
        set_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 
-static inline void clear_page_idle(struct page *page)
+static inline void folio_clear_idle(struct folio *folio)
 {
-       struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_ext *page_ext = lookup_page_ext(&folio->page);
 
        if (unlikely(!page_ext))
                return;
 
        clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
-#endif /* CONFIG_64BIT */
+#endif /* !CONFIG_64BIT */
 
 #else /* !CONFIG_PAGE_IDLE_FLAG */
 
-static inline bool page_is_young(struct page *page)
+static inline bool folio_test_young(struct folio *folio)
 {
        return false;
 }
 
-static inline void set_page_young(struct page *page)
+static inline void folio_set_young(struct folio *folio)
 {
 }
 
-static inline bool test_and_clear_page_young(struct page *page)
+static inline bool folio_test_clear_young(struct folio *folio)
 {
        return false;
 }
 
-static inline bool page_is_idle(struct page *page)
+static inline bool folio_test_idle(struct folio *folio)
 {
        return false;
 }
 
-static inline void set_page_idle(struct page *page)
+static inline void folio_set_idle(struct folio *folio)
 {
 }
 
-static inline void clear_page_idle(struct page *page)
+static inline void folio_clear_idle(struct folio *folio)
 {
 }
 
 #endif /* CONFIG_PAGE_IDLE_FLAG */
 
+static inline bool page_is_young(struct page *page)
+{
+       return folio_test_young(page_folio(page));
+}
+
+static inline void set_page_young(struct page *page)
+{
+       folio_set_young(page_folio(page));
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return folio_test_clear_young(page_folio(page));
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return folio_test_idle(page_folio(page));
+}
+
+static inline void set_page_idle(struct page *page)
+{
+       folio_set_idle(page_folio(page));
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+       folio_clear_idle(page_folio(page));
+}
 #endif /* _LINUX_MM_PAGE_IDLE_H */
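A hedged sketch of the two-pass usage these helpers are designed for; the exact conditions under which the young and idle bits flip depend on the rmap and reclaim code, so this is illustrative only and both function names are hypothetical.

#include <linux/page_idle.h>

/* Pass one (hypothetical): mark the folio idle. */
static void example_mark_idle(struct folio *folio)
{
        folio_set_idle(folio);
}

/* Pass two (hypothetical): was it touched since example_mark_idle()? */
static bool example_was_accessed(struct folio *folio)
{
        return folio_test_clear_young(folio) || !folio_test_idle(folio);
}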
index 719bfe5..43c638c 100644 (file)
@@ -12,7 +12,7 @@ extern void __reset_page_owner(struct page *page, unsigned int order);
 extern void __set_page_owner(struct page *page,
                        unsigned int order, gfp_t gfp_mask);
 extern void __split_page_owner(struct page *page, unsigned int nr);
-extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
+extern void __folio_copy_owner(struct folio *newfolio, struct folio *old);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
 extern void __dump_page_owner(const struct page *page);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -36,10 +36,10 @@ static inline void split_page_owner(struct page *page, unsigned int nr)
        if (static_branch_unlikely(&page_owner_inited))
                __split_page_owner(page, nr);
 }
-static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+static inline void folio_copy_owner(struct folio *newfolio, struct folio *old)
 {
        if (static_branch_unlikely(&page_owner_inited))
-               __copy_page_owner(oldpage, newpage);
+               __folio_copy_owner(newfolio, old);
 }
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 {
@@ -63,7 +63,7 @@ static inline void split_page_owner(struct page *page,
                        unsigned int order)
 {
 }
-static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio)
 {
 }
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
index 7ad46f4..2e677e6 100644 (file)
@@ -67,9 +67,31 @@ static inline int page_ref_count(const struct page *page)
        return atomic_read(&page->_refcount);
 }
 
+/**
+ * folio_ref_count - The reference count on this folio.
+ * @folio: The folio.
+ *
+ * The refcount is usually incremented by calls to folio_get() and
+ * decremented by calls to folio_put().  Some typical users of the
+ * folio refcount:
+ *
+ * - Each reference from a page table
+ * - The page cache
+ * - Filesystem private data
+ * - The LRU list
+ * - Pipes
+ * - Direct IO which references this page in the process address space
+ *
+ * Return: The number of references to this folio.
+ */
+static inline int folio_ref_count(const struct folio *folio)
+{
+       return page_ref_count(&folio->page);
+}
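As a usage note, a small sketch pairing a temporary reference with the new counter accessor; the function is hypothetical, and folio_get()/folio_put() come from <linux/mm.h> rather than this header.

#include <linux/mm.h>

/* Hypothetical: briefly pin a folio while inspecting it. */
static void example_inspect_folio(struct folio *folio)
{
        folio_get(folio);                               /* +1 reference */
        VM_BUG_ON_FOLIO(folio_ref_count(folio) <= 0, folio);
        /* ... read-only inspection ... */
        folio_put(folio);                               /* drop our reference */
}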
+
 static inline int page_count(const struct page *page)
 {
-       return atomic_read(&compound_head(page)->_refcount);
+       return folio_ref_count(page_folio(page));
 }
 
 static inline void set_page_count(struct page *page, int v)
@@ -79,6 +101,11 @@ static inline void set_page_count(struct page *page, int v)
                __page_ref_set(page, v);
 }
 
+static inline void folio_set_count(struct folio *folio, int v)
+{
+       set_page_count(&folio->page, v);
+}
+
 /*
  * Setup the page count before being freed into the page allocator for
  * the first time (boot or memory hotplug)
@@ -95,6 +122,11 @@ static inline void page_ref_add(struct page *page, int nr)
                __page_ref_mod(page, nr);
 }
 
+static inline void folio_ref_add(struct folio *folio, int nr)
+{
+       page_ref_add(&folio->page, nr);
+}
+
 static inline void page_ref_sub(struct page *page, int nr)
 {
        atomic_sub(nr, &page->_refcount);
@@ -102,6 +134,11 @@ static inline void page_ref_sub(struct page *page, int nr)
                __page_ref_mod(page, -nr);
 }
 
+static inline void folio_ref_sub(struct folio *folio, int nr)
+{
+       page_ref_sub(&folio->page, nr);
+}
+
 static inline int page_ref_sub_return(struct page *page, int nr)
 {
        int ret = atomic_sub_return(nr, &page->_refcount);
@@ -111,6 +148,11 @@ static inline int page_ref_sub_return(struct page *page, int nr)
        return ret;
 }
 
+static inline int folio_ref_sub_return(struct folio *folio, int nr)
+{
+       return page_ref_sub_return(&folio->page, nr);
+}
+
 static inline void page_ref_inc(struct page *page)
 {
        atomic_inc(&page->_refcount);
@@ -118,6 +160,11 @@ static inline void page_ref_inc(struct page *page)
                __page_ref_mod(page, 1);
 }
 
+static inline void folio_ref_inc(struct folio *folio)
+{
+       page_ref_inc(&folio->page);
+}
+
 static inline void page_ref_dec(struct page *page)
 {
        atomic_dec(&page->_refcount);
@@ -125,6 +172,11 @@ static inline void page_ref_dec(struct page *page)
                __page_ref_mod(page, -1);
 }
 
+static inline void folio_ref_dec(struct folio *folio)
+{
+       page_ref_dec(&folio->page);
+}
+
 static inline int page_ref_sub_and_test(struct page *page, int nr)
 {
        int ret = atomic_sub_and_test(nr, &page->_refcount);
@@ -134,6 +186,11 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
        return ret;
 }
 
+static inline int folio_ref_sub_and_test(struct folio *folio, int nr)
+{
+       return page_ref_sub_and_test(&folio->page, nr);
+}
+
 static inline int page_ref_inc_return(struct page *page)
 {
        int ret = atomic_inc_return(&page->_refcount);
@@ -143,6 +200,11 @@ static inline int page_ref_inc_return(struct page *page)
        return ret;
 }
 
+static inline int folio_ref_inc_return(struct folio *folio)
+{
+       return page_ref_inc_return(&folio->page);
+}
+
 static inline int page_ref_dec_and_test(struct page *page)
 {
        int ret = atomic_dec_and_test(&page->_refcount);
@@ -152,6 +214,11 @@ static inline int page_ref_dec_and_test(struct page *page)
        return ret;
 }
 
+static inline int folio_ref_dec_and_test(struct folio *folio)
+{
+       return page_ref_dec_and_test(&folio->page);
+}
+
 static inline int page_ref_dec_return(struct page *page)
 {
        int ret = atomic_dec_return(&page->_refcount);
@@ -161,15 +228,91 @@ static inline int page_ref_dec_return(struct page *page)
        return ret;
 }
 
-static inline int page_ref_add_unless(struct page *page, int nr, int u)
+static inline int folio_ref_dec_return(struct folio *folio)
+{
+       return page_ref_dec_return(&folio->page);
+}
+
+static inline bool page_ref_add_unless(struct page *page, int nr, int u)
 {
-       int ret = atomic_add_unless(&page->_refcount, nr, u);
+       bool ret = atomic_add_unless(&page->_refcount, nr, u);
 
        if (page_ref_tracepoint_active(page_ref_mod_unless))
                __page_ref_mod_unless(page, nr, ret);
        return ret;
 }
 
+static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
+{
+       return page_ref_add_unless(&folio->page, nr, u);
+}
+
+/**
+ * folio_try_get - Attempt to increase the refcount on a folio.
+ * @folio: The folio.
+ *
+ * If you do not already have a reference to a folio, you can attempt to
+ * get one using this function.  It may fail if, for example, the folio
+ * has been freed since you found a pointer to it, or it is frozen for
+ * the purposes of splitting or migration.
+ *
+ * Return: True if the reference count was successfully incremented.
+ */
+static inline bool folio_try_get(struct folio *folio)
+{
+       return folio_ref_add_unless(folio, 1, 0);
+}
+
+static inline bool folio_ref_try_add_rcu(struct folio *folio, int count)
+{
+#ifdef CONFIG_TINY_RCU
+       /*
+        * The caller guarantees the folio will not be freed from interrupt
+        * context, so (on !SMP) we only need preemption to be disabled
+        * and TINY_RCU does that for us.
+        */
+# ifdef CONFIG_PREEMPT_COUNT
+       VM_BUG_ON(!in_atomic() && !irqs_disabled());
+# endif
+       VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
+       folio_ref_add(folio, count);
+#else
+       if (unlikely(!folio_ref_add_unless(folio, count, 0))) {
+               /* Either the folio has been freed, or will be freed. */
+               return false;
+       }
+#endif
+       return true;
+}
+
+/**
+ * folio_try_get_rcu - Attempt to increase the refcount on a folio.
+ * @folio: The folio.
+ *
+ * This is a version of folio_try_get() optimised for non-SMP kernels.
+ * If you are still holding the rcu_read_lock() after looking up the
+ * page and know that the page cannot have its refcount decreased to
+ * zero in interrupt context, you can use this instead of folio_try_get().
+ *
+ * Example users include get_user_pages_fast() (as pages are not unmapped
+ * from interrupt context) and the page cache lookups (as pages are not
+ * truncated from interrupt context).  We also know that pages are not
+ * frozen in interrupt context for the purposes of splitting or migration.
+ *
+ * You can also use this function if you're holding a lock that prevents
+ * pages being frozen & removed; eg the i_pages lock for the page cache
+ * or the mmap_lock or page table lock for page tables.  In this case,
+ * it will always succeed, and you could have used a plain folio_get(),
+ * but it's sometimes more convenient to have a common function called
+ * from both locked and RCU-protected contexts.
+ *
+ * Return: True if the reference count was successfully incremented.
+ */
+static inline bool folio_try_get_rcu(struct folio *folio)
+{
+       return folio_ref_try_add_rcu(folio, 1);
+}
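A sketch of the lockless lookup loop that folio_try_get_rcu() is intended for, in the spirit of the comment above; the function name is hypothetical, error handling is trimmed, and the xas_*() helpers come from <linux/xarray.h>.

#include <linux/pagemap.h>
#include <linux/xarray.h>

/* Hypothetical: find and pin the folio cached at @index, or return NULL. */
static struct folio *example_get_folio_rcu(struct address_space *mapping,
                                           pgoff_t index)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
repeat:
        xas_reset(&xas);
        folio = xas_load(&xas);
        if (xa_is_value(folio))
                folio = NULL;           /* shadow/DAX entry, not a folio */
        if (!folio)
                goto out;

        if (!folio_try_get_rcu(folio))
                goto repeat;            /* being freed; look again */

        /* The folio may have been removed or replaced after the load. */
        if (unlikely(folio != xas_reload(&xas))) {
                folio_put(folio);
                goto repeat;
        }
out:
        rcu_read_unlock();
        return folio;
}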
+
 static inline int page_ref_freeze(struct page *page, int count)
 {
        int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
@@ -179,6 +322,11 @@ static inline int page_ref_freeze(struct page *page, int count)
        return ret;
 }
 
+static inline int folio_ref_freeze(struct folio *folio, int count)
+{
+       return page_ref_freeze(&folio->page, count);
+}
+
 static inline void page_ref_unfreeze(struct page *page, int count)
 {
        VM_BUG_ON_PAGE(page_count(page) != 0, page);
@@ -189,4 +337,8 @@ static inline void page_ref_unfreeze(struct page *page, int count)
                __page_ref_unfreeze(page, count);
 }
 
+static inline void folio_ref_unfreeze(struct folio *folio, int count)
+{
+       page_ref_unfreeze(&folio->page, count);
+}
 #endif
index 62db6b0..013cdc9 100644 (file)
@@ -162,149 +162,119 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
 
 void release_pages(struct page **pages, int nr);
 
-/*
- * For file cache pages, return the address_space, otherwise return NULL
+struct address_space *page_mapping(struct page *);
+struct address_space *folio_mapping(struct folio *);
+struct address_space *swapcache_mapping(struct folio *);
+
+/**
+ * folio_file_mapping - Find the mapping this folio belongs to.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to.  Folios in the swap cache return the mapping of the
+ * swap file or swap device where the data is stored.  This is different
+ * from the mapping returned by folio_mapping().  The only reason to
+ * use it is if, like NFS, you return 0 from ->swap_activate.
+ *
+ * Do not call this for folios which aren't in the page cache or swap cache.
  */
-static inline struct address_space *page_mapping_file(struct page *page)
+static inline struct address_space *folio_file_mapping(struct folio *folio)
 {
-       if (unlikely(PageSwapCache(page)))
-               return NULL;
-       return page_mapping(page);
+       if (unlikely(folio_test_swapcache(folio)))
+               return swapcache_mapping(folio);
+
+       return folio->mapping;
+}
+
+static inline struct address_space *page_file_mapping(struct page *page)
+{
+       return folio_file_mapping(page_folio(page));
 }
 
 /*
- * speculatively take a reference to a page.
- * If the page is free (_refcount == 0), then _refcount is untouched, and 0
- * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
- *
- * This function must be called inside the same rcu_read_lock() section as has
- * been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
- *
- * Unless an RCU grace period has passed, the count of all pages coming out
- * of the allocator must be considered unstable. page_count may return higher
- * than expected, and put_page must be able to do the right thing when the
- * page has been finished with, no matter what it is subsequently allocated
- * for (because put_page is what is used here to drop an invalid speculative
- * reference).
- *
- * This is the interesting part of the lockless pagecache (and lockless
- * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
- * has the following pattern:
- * 1. find page in radix tree
- * 2. conditionally increment refcount
- * 3. check the page is still in pagecache (if no, goto 1)
- *
- * Remove-side that cares about stability of _refcount (eg. reclaim) has the
- * following (with the i_pages lock held):
- * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
- * B. remove page from pagecache
- * C. free the page
- *
- * There are 2 critical interleavings that matter:
- * - 2 runs before A: in this case, A sees elevated refcount and bails out
- * - A runs before 2: in this case, 2 sees zero refcount and retries;
- *   subsequently, B will complete and 1 will find no page, causing the
- *   lookup to return NULL.
- *
- * It is possible that between 1 and 2, the page is removed then the exact same
- * page is inserted into the same position in pagecache. That's OK: the
- * old find_get_page using a lock could equally have run before or after
- * such a re-insertion, depending on order that locks are granted.
- *
- * Lookups racing against pagecache insertion isn't a big problem: either 1
- * will find the page or it will not. Likewise, the old find_get_page could run
- * either before the insertion or afterwards, depending on timing.
+ * For file cache pages, return the address_space, otherwise return NULL
  */
-static inline int __page_cache_add_speculative(struct page *page, int count)
+static inline struct address_space *page_mapping_file(struct page *page)
 {
-#ifdef CONFIG_TINY_RCU
-# ifdef CONFIG_PREEMPT_COUNT
-       VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
-       /*
-        * Preempt must be disabled here - we rely on rcu_read_lock doing
-        * this for us.
-        *
-        * Pagecache won't be truncated from interrupt context, so if we have
-        * found a page in the radix tree here, we have pinned its refcount by
-        * disabling preempt, and hence no need for the "speculative get" that
-        * SMP requires.
-        */
-       VM_BUG_ON_PAGE(page_count(page) == 0, page);
-       page_ref_add(page, count);
+       struct folio *folio = page_folio(page);
 
-#else
-       if (unlikely(!page_ref_add_unless(page, count, 0))) {
-               /*
-                * Either the page has been freed, or will be freed.
-                * In either case, retry here and the caller should
-                * do the right thing (see comments above).
-                */
-               return 0;
-       }
-#endif
-       VM_BUG_ON_PAGE(PageTail(page), page);
-
-       return 1;
+       if (unlikely(folio_test_swapcache(folio)))
+               return NULL;
+       return folio_mapping(folio);
 }
 
-static inline int page_cache_get_speculative(struct page *page)
+static inline bool page_cache_add_speculative(struct page *page, int count)
 {
-       return __page_cache_add_speculative(page, 1);
+       VM_BUG_ON_PAGE(PageTail(page), page);
+       return folio_ref_try_add_rcu((struct folio *)page, count);
 }
 
-static inline int page_cache_add_speculative(struct page *page, int count)
+static inline bool page_cache_get_speculative(struct page *page)
 {
-       return __page_cache_add_speculative(page, count);
+       return page_cache_add_speculative(page, 1);
 }
 
 /**
- * attach_page_private - Attach private data to a page.
- * @page: Page to attach data to.
- * @data: Data to attach to page.
+ * folio_attach_private - Attach private data to a folio.
+ * @folio: Folio to attach data to.
+ * @data: Data to attach to folio.
  *
- * Attaching private data to a page increments the page's reference count.
- * The data must be detached before the page will be freed.
+ * Attaching private data to a folio increments the folio's reference count.
+ * The data must be detached before the folio will be freed.
  */
-static inline void attach_page_private(struct page *page, void *data)
+static inline void folio_attach_private(struct folio *folio, void *data)
 {
-       get_page(page);
-       set_page_private(page, (unsigned long)data);
-       SetPagePrivate(page);
+       folio_get(folio);
+       folio->private = data;
+       folio_set_private(folio);
 }
 
 /**
- * detach_page_private - Detach private data from a page.
- * @page: Page to detach data from.
+ * folio_detach_private - Detach private data from a folio.
+ * @folio: Folio to detach data from.
  *
- * Removes the data that was previously attached to the page and decrements
+ * Removes the data that was previously attached to the folio and decrements
  * the refcount on the page.
  *
- * Return: Data that was attached to the page.
+ * Return: Data that was attached to the folio.
  */
-static inline void *detach_page_private(struct page *page)
+static inline void *folio_detach_private(struct folio *folio)
 {
-       void *data = (void *)page_private(page);
+       void *data = folio_get_private(folio);
 
-       if (!PagePrivate(page))
+       if (!folio_test_private(folio))
                return NULL;
-       ClearPagePrivate(page);
-       set_page_private(page, 0);
-       put_page(page);
+       folio_clear_private(folio);
+       folio->private = NULL;
+       folio_put(folio);
 
        return data;
 }
 
+static inline void attach_page_private(struct page *page, void *data)
+{
+       folio_attach_private(page_folio(page), data);
+}
+
+static inline void *detach_page_private(struct page *page)
+{
+       return folio_detach_private(page_folio(page));
+}
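A minimal sketch of the attach/detach pairing for per-folio filesystem state; the structure and both functions are hypothetical.

#include <linux/pagemap.h>
#include <linux/slab.h>

/* Hypothetical per-folio state a filesystem might track. */
struct example_folio_state {
        unsigned long   io_cookie;
};

static int example_init_folio(struct folio *folio)
{
        struct example_folio_state *state;

        state = kzalloc(sizeof(*state), GFP_KERNEL);
        if (!state)
                return -ENOMEM;
        folio_attach_private(folio, state);     /* takes a folio reference */
        return 0;
}

static void example_release_folio(struct folio *folio)
{
        /* Drops the reference taken at attach time and clears the flag. */
        kfree(folio_detach_private(folio));
}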
+
 #ifdef CONFIG_NUMA
-extern struct page *__page_cache_alloc(gfp_t gfp);
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
-static inline struct page *__page_cache_alloc(gfp_t gfp)
+static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
 {
-       return alloc_pages(gfp, 0);
+       return folio_alloc(gfp, order);
 }
 #endif
 
+static inline struct page *__page_cache_alloc(gfp_t gfp)
+{
+       return &filemap_alloc_folio(gfp, 0)->page;
+}
+
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
        return __page_cache_alloc(mapping_gfp_mask(x));
@@ -331,9 +301,28 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_FOR_MMAP           0x00000040
 #define FGP_HEAD               0x00000080
 #define FGP_ENTRY              0x00000100
+#define FGP_STABLE             0x00000200
 
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
-               int fgp_flags, gfp_t cache_gfp_mask);
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+               int fgp_flags, gfp_t gfp);
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+               int fgp_flags, gfp_t gfp);
+
+/**
+ * filemap_get_folio - Find and get a folio.
+ * @mapping: The address_space to search.
+ * @index: The page index.
+ *
+ * Looks up the page cache entry at @mapping & @index.  If a folio is
+ * present, it is returned with an increased refcount.
+ *
+ * Otherwise, %NULL is returned.
+ */
+static inline struct folio *filemap_get_folio(struct address_space *mapping,
+                                       pgoff_t index)
+{
+       return __filemap_get_folio(mapping, index, 0, 0);
+}
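A short usage sketch for the new lookup helper; the function is hypothetical and folio_put() comes from <linux/mm.h>.

#include <linux/pagemap.h>

/* Hypothetical: is the folio cached at @index already uptodate? */
static bool example_index_uptodate(struct address_space *mapping, pgoff_t index)
{
        struct folio *folio = filemap_get_folio(mapping, index);
        bool ret;

        if (!folio)
                return false;
        ret = folio_test_uptodate(folio);
        folio_put(folio);               /* drop the lookup reference */
        return ret;
}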
 
 /**
  * find_get_page - find and get a page reference
@@ -377,25 +366,6 @@ static inline struct page *find_lock_page(struct address_space *mapping,
 }
 
 /**
- * find_lock_head - Locate, pin and lock a pagecache page.
- * @mapping: The address_space to search.
- * @index: The page index.
- *
- * Looks up the page cache entry at @mapping & @index.  If there is a
- * page cache page, its head page is returned locked and with an increased
- * refcount.
- *
- * Context: May sleep.
- * Return: A struct page which is !PageTail, or %NULL if there is no page
- * in the cache for this index.
- */
-static inline struct page *find_lock_head(struct address_space *mapping,
-                                       pgoff_t index)
-{
-       return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
-}
-
-/**
  * find_or_create_page - locate or add a pagecache page
  * @mapping: the page's address_space
  * @index: the page's index into the mapping
@@ -452,6 +422,73 @@ static inline bool thp_contains(struct page *head, pgoff_t index)
        return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL));
 }
 
+#define swapcache_index(folio) __page_file_index(&(folio)->page)
+
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to.  If you know
+ * the page is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+        if (unlikely(folio_test_swapcache(folio)))
+                return swapcache_index(folio);
+        return folio->index;
+}
+
+/**
+ * folio_next_index - Get the index of the next folio.
+ * @folio: The current folio.
+ *
+ * Return: The index of the folio which follows this folio in the file.
+ */
+static inline pgoff_t folio_next_index(struct folio *folio)
+{
+       return folio->index + folio_nr_pages(folio);
+}
+
+/**
+ * folio_file_page - The page for a particular index.
+ * @folio: The folio which contains this index.
+ * @index: The index we want to look up.
+ *
+ * Sometimes after looking up a folio in the page cache, we need to
+ * obtain the specific page for an index (eg a page fault).
+ *
+ * Return: The page containing the file data for this index.
+ */
+static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
+{
+       /* HugeTLBfs indexes the page cache in units of hpage_size */
+       if (folio_test_hugetlb(folio))
+               return &folio->page;
+       return folio_page(folio, index & (folio_nr_pages(folio) - 1));
+}
+
+/**
+ * folio_contains - Does this folio contain this index?
+ * @folio: The folio.
+ * @index: The page index within the file.
+ *
+ * Context: The caller should have the page locked in order to prevent
+ * (eg) shmem from moving the page between the page cache and swap cache
+ * and changing its index in the middle of the operation.
+ * Return: true or false.
+ */
+static inline bool folio_contains(struct folio *folio, pgoff_t index)
+{
+       /* HugeTLBfs indexes the page cache in units of hpage_size */
+       if (folio_test_hugetlb(folio))
+               return folio->index == index;
+       return index - folio_index(folio) < folio_nr_pages(folio);
+}
+
 /*
  * Given the page we found in the page cache, return the page corresponding
  * to this index in the file
@@ -560,6 +597,27 @@ static inline loff_t page_file_offset(struct page *page)
        return ((loff_t)page_index(page)) << PAGE_SHIFT;
 }
 
+/**
+ * folio_pos - Returns the byte position of this folio in its file.
+ * @folio: The folio.
+ */
+static inline loff_t folio_pos(struct folio *folio)
+{
+       return page_offset(&folio->page);
+}
+
+/**
+ * folio_file_pos - Returns the byte position of this folio in its file.
+ * @folio: The folio.
+ *
+ * This differs from folio_pos() for folios which belong to a swap file.
+ * NFS is the only filesystem today which needs to use folio_file_pos().
+ */
+static inline loff_t folio_file_pos(struct folio *folio)
+{
+       return page_file_offset(&folio->page);
+}
+
 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
                                     unsigned long address);
 
@@ -575,13 +633,13 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 }
 
 struct wait_page_key {
-       struct page *page;
+       struct folio *folio;
        int bit_nr;
        int page_match;
 };
 
 struct wait_page_queue {
-       struct page *page;
+       struct folio *folio;
        int bit_nr;
        wait_queue_entry_t wait;
 };
@@ -589,7 +647,7 @@ struct wait_page_queue {
 static inline bool wake_page_match(struct wait_page_queue *wait_page,
                                  struct wait_page_key *key)
 {
-       if (wait_page->page != key->page)
+       if (wait_page->folio != key->folio)
               return false;
        key->page_match = 1;
 
@@ -599,20 +657,31 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
        return true;
 }
 
-extern void __lock_page(struct page *page);
-extern int __lock_page_killable(struct page *page);
-extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
-extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+void __folio_lock(struct folio *folio);
+int __folio_lock_killable(struct folio *folio);
+bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
                                unsigned int flags);
-extern void unlock_page(struct page *page);
+void unlock_page(struct page *page);
+void folio_unlock(struct folio *folio);
+
+static inline bool folio_trylock(struct folio *folio)
+{
+       return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0)));
+}
 
 /*
  * Return true if the page was successfully locked
  */
 static inline int trylock_page(struct page *page)
 {
-       page = compound_head(page);
-       return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
+       return folio_trylock(page_folio(page));
+}
+
+static inline void folio_lock(struct folio *folio)
+{
+       might_sleep();
+       if (!folio_trylock(folio))
+               __folio_lock(folio);
 }
 
 /*
@@ -620,38 +689,30 @@ static inline int trylock_page(struct page *page)
  */
 static inline void lock_page(struct page *page)
 {
+       struct folio *folio;
        might_sleep();
-       if (!trylock_page(page))
-               __lock_page(page);
+
+       folio = page_folio(page);
+       if (!folio_trylock(folio))
+               __folio_lock(folio);
 }
 
-/*
- * lock_page_killable is like lock_page but can be interrupted by fatal
- * signals.  It returns 0 if it locked the page and -EINTR if it was
- * killed while waiting.
- */
-static inline int lock_page_killable(struct page *page)
+static inline int folio_lock_killable(struct folio *folio)
 {
        might_sleep();
-       if (!trylock_page(page))
-               return __lock_page_killable(page);
+       if (!folio_trylock(folio))
+               return __folio_lock_killable(folio);
        return 0;
 }
 
 /*
- * lock_page_async - Lock the page, unless this would block. If the page
- * is already locked, then queue a callback when the page becomes unlocked.
- * This callback can then retry the operation.
- *
- * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page
- * was already locked and the callback defined in 'wait' was queued.
+ * lock_page_killable is like lock_page but can be interrupted by fatal
+ * signals.  It returns 0 if it locked the page and -EINTR if it was
+ * killed while waiting.
  */
-static inline int lock_page_async(struct page *page,
-                                 struct wait_page_queue *wait)
+static inline int lock_page_killable(struct page *page)
 {
-       if (!trylock_page(page))
-               return __lock_page_async(page, wait);
-       return 0;
+       return folio_lock_killable(page_folio(page));
 }
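A sketch of the basic locking discipline with the folio variants; the caller is hypothetical, is assumed to already hold a reference on the folio, and folio_mark_dirty() is declared in <linux/mm.h>.

#include <linux/pagemap.h>

/* Hypothetical: dirty a folio while serialized against truncation. */
static int example_dirty_folio(struct folio *folio)
{
        int err;

        err = folio_lock_killable(folio);       /* -EINTR on fatal signal */
        if (err)
                return err;

        if (folio->mapping)                     /* still in the page cache? */
                folio_mark_dirty(folio);

        folio_unlock(folio);
        return 0;
}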
 
 /*
@@ -659,78 +720,108 @@ static inline int lock_page_async(struct page *page,
  * caller indicated that it can handle a retry.
  *
  * Return value and mmap_lock implications depend on flags; see
- * __lock_page_or_retry().
+ * __folio_lock_or_retry().
  */
-static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
+static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                     unsigned int flags)
 {
+       struct folio *folio;
        might_sleep();
-       return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
+
+       folio = page_folio(page);
+       return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags);
 }
 
 /*
- * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
+ * This is exported only for folio_wait_locked/folio_wait_writeback, etc.,
  * and should not be used directly.
  */
-extern void wait_on_page_bit(struct page *page, int bit_nr);
-extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
+void folio_wait_bit(struct folio *folio, int bit_nr);
+int folio_wait_bit_killable(struct folio *folio, int bit_nr);
 
 /* 
- * Wait for a page to be unlocked.
+ * Wait for a folio to be unlocked.
  *
- * This must be called with the caller "holding" the page,
- * ie with increased "page->count" so that the page won't
+ * This must be called with the caller "holding" the folio,
+ * i.e. with an elevated refcount so that the folio won't
  * go away during the wait.
  */
+static inline void folio_wait_locked(struct folio *folio)
+{
+       if (folio_test_locked(folio))
+               folio_wait_bit(folio, PG_locked);
+}
+
+static inline int folio_wait_locked_killable(struct folio *folio)
+{
+       if (!folio_test_locked(folio))
+               return 0;
+       return folio_wait_bit_killable(folio, PG_locked);
+}
+
 static inline void wait_on_page_locked(struct page *page)
 {
-       if (PageLocked(page))
-               wait_on_page_bit(compound_head(page), PG_locked);
+       folio_wait_locked(page_folio(page));
 }
 
 static inline int wait_on_page_locked_killable(struct page *page)
 {
-       if (!PageLocked(page))
-               return 0;
-       return wait_on_page_bit_killable(compound_head(page), PG_locked);
+       return folio_wait_locked_killable(page_folio(page));
 }
 
 int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
-int wait_on_page_writeback_killable(struct page *page);
-extern void end_page_writeback(struct page *page);
+void folio_wait_writeback(struct folio *folio);
+int folio_wait_writeback_killable(struct folio *folio);
+void end_page_writeback(struct page *page);
+void folio_end_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
+void folio_wait_stable(struct folio *folio);
+void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
+static inline void __set_page_dirty(struct page *page,
+               struct address_space *mapping, int warn)
+{
+       __folio_mark_dirty(page_folio(page), mapping, warn);
+}
+void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
+                         struct bdi_writeback *wb);
+static inline void account_page_cleaned(struct page *page,
+               struct address_space *mapping, struct bdi_writeback *wb)
+{
+       return folio_account_cleaned(page_folio(page), mapping, wb);
+}
+void __folio_cancel_dirty(struct folio *folio);
+static inline void folio_cancel_dirty(struct folio *folio)
+{
+       /* Avoid atomic ops, locking, etc. when not actually needed. */
+       if (folio_test_dirty(folio))
+               __folio_cancel_dirty(folio);
+}
+static inline void cancel_dirty_page(struct page *page)
+{
+       folio_cancel_dirty(page_folio(page));
+}
+bool folio_clear_dirty_for_io(struct folio *folio);
+bool clear_page_dirty_for_io(struct page *page);
+int __must_check folio_write_one(struct folio *folio);
+static inline int __must_check write_one_page(struct page *page)
+{
+       return folio_write_one(page_folio(page));
+}
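To show how the renamed writeback helpers chain together, a hedged sketch of starting I/O on a locked, dirty folio; the function is hypothetical and the actual submission step is left as a comment.

#include <linux/pagemap.h>

/* Hypothetical: begin writeback for a locked, dirty folio. */
static void example_start_folio_io(struct folio *folio)
{
        if (!folio_clear_dirty_for_io(folio)) {
                /* Someone else cleaned it; nothing to write. */
                folio_unlock(folio);
                return;
        }

        folio_start_writeback(folio);   /* sets the writeback flag */
        folio_unlock(folio);
        /*
         * ... submit the I/O here; the completion path then calls
         * folio_end_writeback(folio) once the write finishes.
         */
}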
 
-void __set_page_dirty(struct page *, struct address_space *, int warn);
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 
 void page_endio(struct page *page, bool is_write, int err);
 
-/**
- * set_page_private_2 - Set PG_private_2 on a page and take a ref
- * @page: The page.
- *
- * Set the PG_private_2 flag on a page and take the reference needed for the VM
- * to handle its lifetime correctly.  This sets the flag and takes the
- * reference unconditionally, so care must be taken not to set the flag again
- * if it's already set.
- */
-static inline void set_page_private_2(struct page *page)
-{
-       page = compound_head(page);
-       get_page(page);
-       SetPagePrivate2(page);
-}
-
-void end_page_private_2(struct page *page);
-void wait_on_page_private_2(struct page *page);
-int wait_on_page_private_2_killable(struct page *page);
+void folio_end_private_2(struct folio *folio);
+void folio_wait_private_2(struct folio *folio);
+int folio_wait_private_2_killable(struct folio *folio);
 
 /*
  * Add an arbitrary waiter to a page's wait queue
  */
-extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
+void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter);
 
 /*
  * Fault everything in given userspace address range in.
@@ -790,9 +881,11 @@ static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
 }
 
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-                               pgoff_t index, gfp_t gfp_mask);
+               pgoff_t index, gfp_t gfp);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-                               pgoff_t index, gfp_t gfp_mask);
+               pgoff_t index, gfp_t gfp);
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+               pgoff_t index, gfp_t gfp);
 extern void delete_from_page_cache(struct page *page);
 extern void __delete_from_page_cache(struct page *page, void *shadow);
 void replace_page_cache_page(struct page *old, struct page *new);
@@ -817,6 +910,10 @@ static inline int add_to_page_cache(struct page *page,
        return error;
 }
 
+/* Must be non-static for BPF error injection */
+int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
+               pgoff_t index, gfp_t gfp, void **shadowp);
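/*
 * A hedged editorial sketch (not part of the patch) of how a filesystem might
 * use the new filemap_add_folio() where it previously used
 * add_to_page_cache_lru(): allocate a folio, insert it at @index, and leave
 * it locked for the caller to fill. my_grab_folio() is a hypothetical helper.
 */
static struct folio *my_grab_folio(struct address_space *mapping, pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping);
	struct folio *folio = folio_alloc(gfp, 0);	/* order-0 folio */
	int err;

	if (!folio)
		return ERR_PTR(-ENOMEM);

	/* Adds the folio to the page cache and the LRU; it stays locked on success. */
	err = filemap_add_folio(mapping, folio, index, gfp);
	if (err) {
		folio_put(folio);
		return ERR_PTR(err);
	}
	return folio;
}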
+
 /**
  * struct readahead_control - Describes a readahead request.
  *
@@ -906,33 +1003,57 @@ void page_cache_async_readahead(struct address_space *mapping,
        page_cache_async_ra(&ractl, page, req_count);
 }
 
+static inline struct folio *__readahead_folio(struct readahead_control *ractl)
+{
+       struct folio *folio;
+
+       BUG_ON(ractl->_batch_count > ractl->_nr_pages);
+       ractl->_nr_pages -= ractl->_batch_count;
+       ractl->_index += ractl->_batch_count;
+
+       if (!ractl->_nr_pages) {
+               ractl->_batch_count = 0;
+               return NULL;
+       }
+
+       folio = xa_load(&ractl->mapping->i_pages, ractl->_index);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       ractl->_batch_count = folio_nr_pages(folio);
+
+       return folio;
+}
+
 /**
  * readahead_page - Get the next page to read.
- * @rac: The current readahead request.
+ * @ractl: The current readahead request.
  *
  * Context: The page is locked and has an elevated refcount.  The caller
  * should decrease the refcount once the page has been submitted for I/O
  * and unlock the page once all I/O to that page has completed.
  * Return: A pointer to the next page, or %NULL if we are done.
  */
-static inline struct page *readahead_page(struct readahead_control *rac)
+static inline struct page *readahead_page(struct readahead_control *ractl)
 {
-       struct page *page;
-
-       BUG_ON(rac->_batch_count > rac->_nr_pages);
-       rac->_nr_pages -= rac->_batch_count;
-       rac->_index += rac->_batch_count;
+       struct folio *folio = __readahead_folio(ractl);
 
-       if (!rac->_nr_pages) {
-               rac->_batch_count = 0;
-               return NULL;
-       }
+       return &folio->page;
+}
 
-       page = xa_load(&rac->mapping->i_pages, rac->_index);
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       rac->_batch_count = thp_nr_pages(page);
+/**
+ * readahead_folio - Get the next folio to read.
+ * @ractl: The current readahead request.
+ *
+ * Context: The folio is locked.  The caller should unlock the folio once
+ * all I/O to that folio has completed.
+ * Return: A pointer to the next folio, or %NULL if we are done.
+ */
+static inline struct folio *readahead_folio(struct readahead_control *ractl)
+{
+       struct folio *folio = __readahead_folio(ractl);
 
-       return page;
+       if (folio)
+               folio_put(folio);
+       return folio;
 }
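/*
 * An editorial usage sketch (not part of the patch): the shape of a
 * filesystem's ->readahead() method built on readahead_folio() rather than
 * readahead_page(). The names myfs_readahead() and myfs_read_folio_async()
 * are hypothetical.
 */
static void myfs_readahead(struct readahead_control *ractl)
{
	struct folio *folio;

	/*
	 * readahead_folio() advances the iterator by folio_nr_pages() and
	 * drops the reference for us; each folio comes back locked and must
	 * be unlocked once its I/O has completed.
	 */
	while ((folio = readahead_folio(ractl)) != NULL)
		myfs_read_folio_async(ractl->mapping->host, folio);
}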
 
 static inline unsigned int __readahead_batch(struct readahead_control *rac,
@@ -1040,6 +1161,34 @@ static inline unsigned long dir_pages(struct inode *inode)
 }
 
 /**
+ * folio_mkwrite_check_truncate - check if folio was truncated
+ * @folio: the folio to check
+ * @inode: the inode to check the folio against
+ *
+ * Return: the number of bytes in the folio up to EOF,
+ * or -EFAULT if the folio was truncated.
+ */
+static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
+                                             struct inode *inode)
+{
+       loff_t size = i_size_read(inode);
+       pgoff_t index = size >> PAGE_SHIFT;
+       size_t offset = offset_in_folio(folio, size);
+
+       if (!folio->mapping)
+               return -EFAULT;
+
+       /* folio is wholly inside EOF */
+       if (folio_next_index(folio) - 1 < index)
+               return folio_size(folio);
+       /* folio is wholly past EOF */
+       if (folio->index > index || !offset)
+               return -EFAULT;
+       /* folio is partially inside EOF */
+       return offset;
+}
+
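/*
 * A hedged editorial sketch (not part of the patch) of one way a
 * ->page_mkwrite() handler might use folio_mkwrite_check_truncate().
 * myfs_page_mkwrite() is hypothetical and error handling is abbreviated.
 */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct inode *inode = file_inode(vmf->vma->vm_file);
	ssize_t len;

	folio_lock(folio);
	len = folio_mkwrite_check_truncate(folio, inode);
	if (len < 0) {
		/* The folio was truncated away while the fault was taken. */
		folio_unlock(folio);
		return VM_FAULT_NOPAGE;
	}

	/* ... dirty the first 'len' bytes of the folio here ... */
	folio_mark_dirty(folio);
	folio_wait_stable(folio);

	/* Keep the folio locked; the fault code unlocks it. */
	return VM_FAULT_LOCKED;
}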
+/**
  * page_mkwrite_check_truncate - check if page was truncated
  * @page: the page to check
  * @inode: the inode to check the page against
@@ -1068,19 +1217,25 @@ static inline int page_mkwrite_check_truncate(struct page *page,
 }
 
 /**
- * i_blocks_per_page - How many blocks fit in this page.
+ * i_blocks_per_folio - How many blocks fit in this folio.
  * @inode: The inode which contains the blocks.
- * @page: The page (head page if the page is a THP).
+ * @folio: The folio.
  *
- * If the block size is larger than the size of this page, return zero.
+ * If the block size is larger than the size of this folio, return zero.
  *
- * Context: The caller should hold a refcount on the page to prevent it
+ * Context: The caller should hold a refcount on the folio to prevent it
  * from being split.
- * Return: The number of filesystem blocks covered by this page.
+ * Return: The number of filesystem blocks covered by this folio.
  */
 static inline
+unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio)
+{
+       return folio_size(folio) >> inode->i_blkbits;
+}
+
+static inline
 unsigned int i_blocks_per_page(struct inode *inode, struct page *page)
 {
-       return thp_size(page) >> inode->i_blkbits;
+       return i_blocks_per_folio(inode, page_folio(page));
 }
 #endif /* _LINUX_PAGEMAP_H */
index d255812..6f7949b 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_PART_STAT_H
 
 #include <linux/genhd.h>
+#include <asm/local.h>
 
 struct disk_stats {
        u64 nsecs[NR_STAT_GROUPS];
index ae16a98..b31d3f3 100644 (file)
@@ -267,6 +267,28 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 }
 
 /**
+ * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the
+ * caller is responsible for taking RCU.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref)
+{
+       unsigned long __percpu *percpu_count;
+       bool ret = false;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (likely(__ref_is_percpu(ref, &percpu_count))) {
+               this_cpu_inc(*percpu_count);
+               ret = true;
+       } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
+               ret = atomic_long_inc_not_zero(&ref->data->count);
+       }
+       return ret;
+}
+
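/*
 * An editorial sketch (not part of the patch) of the case this helper is
 * meant for: the caller already holds rcu_read_lock() for its own lookup, so
 * the internal RCU section taken by percpu_ref_tryget_live() would be
 * redundant. 'struct my_obj' and my_lookup_and_get() are hypothetical.
 */
struct my_obj {
	struct percpu_ref ref;
};

static struct my_obj *my_lookup_and_get(struct xarray *xa, unsigned long id)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = xa_load(xa, id);
	if (obj && !percpu_ref_tryget_live_rcu(&obj->ref))
		obj = NULL;	/* ref is dying or dead: treat as not found */
	rcu_read_unlock();

	return obj;
}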
+/**
  * percpu_ref_tryget_live - try to increment a live percpu refcount
  * @ref: percpu_ref to try-get
  *
@@ -283,20 +305,11 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
  */
 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
 {
-       unsigned long __percpu *percpu_count;
        bool ret = false;
 
        rcu_read_lock();
-
-       if (__ref_is_percpu(ref, &percpu_count)) {
-               this_cpu_inc(*percpu_count);
-               ret = true;
-       } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
-               ret = atomic_long_inc_not_zero(&ref->data->count);
-       }
-
+       ret = percpu_ref_tryget_live_rcu(ref);
        rcu_read_unlock();
-
        return ret;
 }
 
index c976cc6..e704b1a 100644 (file)
@@ -235,7 +235,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
  *
  * returns the number of cleaned PTEs.
  */
-int page_mkclean(struct page *);
+int folio_mkclean(struct folio *);
 
 /*
  * called in munlock()/munmap() path to check for other vmas holding
@@ -295,12 +295,14 @@ static inline void try_to_unmap(struct page *page, enum ttu_flags flags)
 {
 }
 
-static inline int page_mkclean(struct page *page)
+static inline int folio_mkclean(struct folio *folio)
 {
        return 0;
 }
-
-
 #endif /* CONFIG_MMU */
 
+static inline int page_mkclean(struct page *page)
+{
+       return folio_mkclean(page_folio(page));
+}
 #endif /* _LINUX_RMAP_H */
index 2713e68..4a6ff27 100644 (file)
@@ -427,6 +427,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
 int __sbitmap_queue_get(struct sbitmap_queue *sbq);
 
 /**
+ * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits
+ * @sbq: Bitmap queue to allocate from.
+ * @nr_tags: number of tags requested
+ * @offset: offset to add to returned bits
+ *
+ * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is
+ * a bit in the mask returned, and the caller must add @offset to the value to
+ * get the absolute tag value.
+ */
+unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
+                                       unsigned int *offset);
+
+/**
  * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
  * sbitmap_queue, limiting the depth used from each word, with preemption
  * already disabled.
@@ -515,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu);
 
+/**
+ * sbitmap_queue_clear_batch() - Free a batch of allocated bits from a
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @offset: offset for each tag in array
+ * @tags: array of tags
+ * @nr_tags: number of tags in array
+ */
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+                               int *tags, int nr_tags);
+
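/*
 * A hedged editorial sketch (not part of the patch) pairing the two new batch
 * helpers: each set bit in the returned mask, plus @offset, is one allocated
 * tag, and the whole batch can be freed in a single call. The my_*() helpers
 * are hypothetical.
 */
static int my_get_tags(struct sbitmap_queue *sbq, int *tags, int nr_wanted)
{
	unsigned int offset;
	unsigned long mask;
	int bit, nr = 0;

	mask = __sbitmap_queue_get_batch(sbq, nr_wanted, &offset);
	for_each_set_bit(bit, &mask, BITS_PER_LONG)
		tags[nr++] = offset + bit;
	return nr;		/* may be fewer than nr_wanted, possibly 0 */
}

static void my_put_tags(struct sbitmap_queue *sbq, int *tags, int nr)
{
	/* Tags above are absolute bit numbers, so no extra offset to subtract. */
	sbitmap_queue_clear_batch(sbq, 0, tags, nr);
}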
 static inline int sbq_index_inc(int index)
 {
        return (index + 1) & (SBQ_WAIT_QUEUES - 1);
index c1a927d..e0454e6 100644 (file)
@@ -1160,10 +1160,8 @@ struct task_struct {
        /* Stacked block device info: */
        struct bio_list                 *bio_list;
 
-#ifdef CONFIG_BLOCK
        /* Stack plugging: */
        struct blk_plug                 *plug;
-#endif
 
        /* VM state: */
        struct reclaim_state            *reclaim_state;
index 14ab0c0..1ce9a9e 100644 (file)
@@ -128,6 +128,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
                             struct sk_msg *msg, u32 bytes);
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
                   int len, int flags);
+bool sk_msg_is_readable(struct sock *sk);
 
 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
 {
index ba52f3a..cdf0957 100644 (file)
@@ -320,11 +320,17 @@ struct vma_swap_readahead {
 #endif
 };
 
+static inline swp_entry_t folio_swap_entry(struct folio *folio)
+{
+       swp_entry_t entry = { .val = page_private(&folio->page) };
+       return entry;
+}
+
 /* linux/mm/workingset.c */
 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
 void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
-void workingset_refault(struct page *page, void *shadow);
-void workingset_activation(struct page *page);
+void workingset_refault(struct folio *folio, void *shadow);
+void workingset_activation(struct folio *folio);
 
 /* Only track the nodes of mappings with shadow entries */
 void workingset_update_node(struct xa_node *node);
@@ -344,9 +350,11 @@ extern unsigned long nr_free_buffer_pages(void);
 /* linux/mm/swap.c */
 extern void lru_note_cost(struct lruvec *lruvec, bool file,
                          unsigned int nr_pages);
-extern void lru_note_cost_page(struct page *);
+extern void lru_note_cost_folio(struct folio *);
+extern void folio_add_lru(struct folio *);
 extern void lru_cache_add(struct page *);
-extern void mark_page_accessed(struct page *);
+void mark_page_accessed(struct page *);
+void folio_mark_accessed(struct folio *);
 
 extern atomic_t lru_disable_count;
 
@@ -365,7 +373,6 @@ extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
 extern void lru_add_drain_all(void);
-extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
 extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
index 96305a6..c635c2e 100644 (file)
@@ -3,7 +3,7 @@
 #define _LINUX_T10_PI_H
 
 #include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 /*
  * A T10 PI-capable target device can be formatted with different
index aa11fe3..12d8277 100644 (file)
@@ -269,6 +269,7 @@ enum tpm2_cc_attrs {
 #define TPM_VID_INTEL    0x8086
 #define TPM_VID_WINBOND  0x1050
 #define TPM_VID_STM      0x104A
+#define TPM_VID_ATML     0x1114
 
 enum tpm_chip_flags {
        TPM_CHIP_FLAG_TPM2              = BIT(1),
index d6a6cf5..bfe3886 100644 (file)
@@ -415,6 +415,78 @@ static inline void drain_zonestat(struct zone *zone,
                        struct per_cpu_zonestat *pzstats) { }
 #endif         /* CONFIG_SMP */
 
+static inline void __zone_stat_mod_folio(struct folio *folio,
+               enum zone_stat_item item, long nr)
+{
+       __mod_zone_page_state(folio_zone(folio), item, nr);
+}
+
+static inline void __zone_stat_add_folio(struct folio *folio,
+               enum zone_stat_item item)
+{
+       __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio));
+}
+
+static inline void __zone_stat_sub_folio(struct folio *folio,
+               enum zone_stat_item item)
+{
+       __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void zone_stat_mod_folio(struct folio *folio,
+               enum zone_stat_item item, long nr)
+{
+       mod_zone_page_state(folio_zone(folio), item, nr);
+}
+
+static inline void zone_stat_add_folio(struct folio *folio,
+               enum zone_stat_item item)
+{
+       mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio));
+}
+
+static inline void zone_stat_sub_folio(struct folio *folio,
+               enum zone_stat_item item)
+{
+       mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void __node_stat_mod_folio(struct folio *folio,
+               enum node_stat_item item, long nr)
+{
+       __mod_node_page_state(folio_pgdat(folio), item, nr);
+}
+
+static inline void __node_stat_add_folio(struct folio *folio,
+               enum node_stat_item item)
+{
+       __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio));
+}
+
+static inline void __node_stat_sub_folio(struct folio *folio,
+               enum node_stat_item item)
+{
+       __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
+}
+
+static inline void node_stat_mod_folio(struct folio *folio,
+               enum node_stat_item item, long nr)
+{
+       mod_node_page_state(folio_pgdat(folio), item, nr);
+}
+
+static inline void node_stat_add_folio(struct folio *folio,
+               enum node_stat_item item)
+{
+       mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio));
+}
+
+static inline void node_stat_sub_folio(struct folio *folio,
+               enum node_stat_item item)
+{
+       mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
+}
+
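/*
 * An editorial note in code form (not part of the patch): these wrappers
 * derive the counter delta from the folio itself, so multi-page folios are
 * accounted correctly without open-coding the page count. my_account_dirty()
 * is a hypothetical example.
 */
static inline void my_account_dirty(struct folio *folio)
{
	/* Same as mod_node_page_state(folio_pgdat(folio), NR_FILE_DIRTY, folio_nr_pages(folio)) */
	node_stat_add_folio(folio, NR_FILE_DIRTY);
}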
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
                                             int migratetype)
 {
@@ -525,12 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void inc_lruvec_state(struct lruvec *lruvec,
-                                   enum node_stat_item idx)
-{
-       mod_lruvec_state(lruvec, idx, 1);
-}
-
 static inline void __inc_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx)
 {
@@ -543,6 +609,24 @@ static inline void __dec_lruvec_page_state(struct page *page,
        __mod_lruvec_page_state(page, idx, -1);
 }
 
+static inline void __lruvec_stat_mod_folio(struct folio *folio,
+                                          enum node_stat_item idx, int val)
+{
+       __mod_lruvec_page_state(&folio->page, idx, val);
+}
+
+static inline void __lruvec_stat_add_folio(struct folio *folio,
+                                          enum node_stat_item idx)
+{
+       __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
+}
+
+static inline void __lruvec_stat_sub_folio(struct folio *folio,
+                                          enum node_stat_item idx)
+{
+       __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
+}
+
 static inline void inc_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx)
 {
@@ -555,4 +639,21 @@ static inline void dec_lruvec_page_state(struct page *page,
        mod_lruvec_page_state(page, idx, -1);
 }
 
+static inline void lruvec_stat_mod_folio(struct folio *folio,
+                                        enum node_stat_item idx, int val)
+{
+       mod_lruvec_page_state(&folio->page, idx, val);
+}
+
+static inline void lruvec_stat_add_folio(struct folio *folio,
+                                        enum node_stat_item idx)
+{
+       lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
+}
+
+static inline void lruvec_stat_sub_folio(struct folio *folio,
+                                        enum node_stat_item idx)
+{
+       lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
+}
 #endif /* _LINUX_VMSTAT_H */
index d1f65ad..3bfd487 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/flex_proportions.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/blk_types.h>
-#include <linux/blk-cgroup.h>
 
 struct bio;
 
@@ -109,15 +108,12 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
        return flags;
 }
 
-static inline struct cgroup_subsys_state *
-wbc_blkcg_css(struct writeback_control *wbc)
-{
 #ifdef CONFIG_CGROUP_WRITEBACK
-       if (wbc->wb)
-               return wbc->wb->blkcg_css;
-#endif
-       return blkcg_root_css;
-}
+#define wbc_blkcg_css(wbc) \
+       ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css)
+#else
+#define wbc_blkcg_css(wbc)             (blkcg_root_css)
+#endif /* CONFIG_CGROUP_WRITEBACK */
 
 /*
  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
@@ -393,7 +389,14 @@ void writeback_set_ratelimit(void);
 void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end);
 
-void account_page_redirty(struct page *page);
+bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio);
+void folio_account_redirty(struct folio *folio);
+static inline void account_page_redirty(struct page *page)
+{
+       folio_account_redirty(page_folio(page));
+}
+bool folio_redirty_for_writepage(struct writeback_control *, struct folio *);
+bool redirty_page_for_writepage(struct writeback_control *, struct page *);
 
 void sb_mark_inode_writeback(struct inode *inode);
 void sb_clear_inode_writeback(struct inode *inode);
index 9884c84..7285ca5 100644 (file)
@@ -234,6 +234,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s);
 XZ_EXTERN void xz_dec_end(struct xz_dec *s);
 
 /*
+ * Decompressor for MicroLZMA, an LZMA variant with a very minimal header.
+ * See xz_dec_microlzma_alloc() below for details.
+ *
+ * These functions aren't used or available in preboot code and thus aren't
+ * marked with XZ_EXTERN. This avoids warnings about static functions that
+ * are never defined.
+ */
+/**
+ * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state
+ */
+struct xz_dec_microlzma;
+
+/**
+ * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder
+ * @mode        XZ_SINGLE or XZ_PREALLOC
+ * @dict_size   LZMA dictionary size. This must be at least 4 KiB and
+ *              at most 3 GiB.
+ *
+ * In contrast to xz_dec_init(), this function only allocates the memory
+ * and remembers the dictionary size. xz_dec_microlzma_reset() must be used
+ * before calling xz_dec_microlzma_run().
+ *
+ * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE.
+ * With XZ_PREALLOC, a dictionary buffer of dict_size bytes is also allocated.
+ *
+ * On success, xz_dec_microlzma_alloc() returns a pointer to
+ * struct xz_dec_microlzma. If memory allocation fails or
+ * dict_size is invalid, NULL is returned.
+ *
+ * The compressed format supported by this decoder is a raw LZMA stream
+ * whose first byte (always 0x00) has been replaced with bitwise-negation
+ * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
+ * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
+ * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
+ * marker must not be used. The unused values are reserved for future use.
+ * This MicroLZMA header format was created for use in EROFS but may be used
+ * by others too.
+ */
+extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+                                                      uint32_t dict_size);
+
+/**
+ * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state
+ * @s           Decoder state allocated using xz_dec_microlzma_alloc()
+ * @comp_size   Compressed size of the input stream
+ * @uncomp_size Uncompressed size of the input stream. A value smaller
+ *              than the real uncompressed size of the input stream can
+ *              be specified if uncomp_size_is_exact is set to false.
+ *              uncomp_size can never be set to a value larger than the
+ *              expected real uncompressed size because it would eventually
+ *              result in XZ_DATA_ERROR.
+ * @uncomp_size_is_exact  This is an int instead of bool to avoid
+ *              requiring stdbool.h. This should normally be set to true.
+ *              When this is set to false, error detection is weaker.
+ */
+extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
+                                  uint32_t comp_size, uint32_t uncomp_size,
+                                  int uncomp_size_is_exact);
+
+/**
+ * xz_dec_microlzma_run() - Run the MicroLZMA decoder
+ * @s           Decoder state initialized using xz_dec_microlzma_reset()
+ * @b:          Input and output buffers
+ *
+ * This works similarly to xz_dec_run() with a few important differences.
+ * Only the differences are documented here.
+ *
+ * The only possible return values are XZ_OK, XZ_STREAM_END, and
+ * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress
+ * is possible due to lack of input data or output space, this function will
+ * keep returning XZ_OK. Thus, the calling code must be written so that it
+ * will eventually provide input and output space matching (or exceeding)
+ * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset().
+ * If the caller cannot do this (for example, if the input file is truncated
+ * or otherwise corrupt), the caller must detect this error by itself to
+ * avoid an infinite loop.
+ *
+ * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned.
+ * This can happen also when incorrect dictionary, uncompressed, or
+ * compressed sizes have been specified.
+ *
+ * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over
+ * uncompressed data. This way the caller doesn't need to provide a temporary
+ * output buffer for the bytes that will be ignored.
+ *
+ * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK
+ * is also possible and thus XZ_SINGLE is actually a limited multi-call mode.
+ * After XZ_OK the bytes decoded so far may be read from the output buffer.
+ * It is possible to continue decoding but the variables b->out and b->out_pos
+ * MUST NOT be changed by the caller. Increasing the value of b->out_size is
+ * allowed to make more output space available; one doesn't need to provide
+ * space for the whole uncompressed data on the first call. The input buffer
+ * may be changed normally like with XZ_PREALLOC. This way input data can be
+ * provided from non-contiguous memory.
+ */
+extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s,
+                                       struct xz_buf *b);
+
+/**
+ * xz_dec_microlzma_end() - Free the memory allocated for the decoder state
+ * @s:          Decoder state allocated using xz_dec_microlzma_alloc().
+ *              If s is NULL, this function does nothing.
+ */
+extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
+
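/*
 * A hedged editorial sketch (not part of the patch) of the call sequence the
 * kernel-doc above describes: allocate once, reset per stream, run until
 * XZ_STREAM_END, then free. The function name, buffer handling, and the
 * 64 KiB dictionary size (which must match what the encoder used) are
 * illustrative only.
 */
static int my_microlzma_decompress(const u8 *in, size_t in_len,
				   u8 *out, size_t out_len)
{
	struct xz_dec_microlzma *s;
	struct xz_buf b = {
		.in = in, .in_size = in_len, .in_pos = 0,
		.out = out, .out_size = out_len, .out_pos = 0,
	};
	enum xz_ret ret;

	s = xz_dec_microlzma_alloc(XZ_SINGLE, 64 * 1024);
	if (!s)
		return -ENOMEM;

	/* Both sizes are known exactly here, so uncomp_size_is_exact is true. */
	xz_dec_microlzma_reset(s, in_len, out_len, true);
	ret = xz_dec_microlzma_run(s, &b);
	xz_dec_microlzma_end(s);

	return ret == XZ_STREAM_END ? 0 : -EINVAL;
}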
+/*
  * Standalone build (userspace build or in-kernel build for boot time use)
  * needs a CRC32 implementation. For normal in-kernel use, kernel's own
  * CRC32 module is used instead, and users of this module don't need to
index 62dd842..27336fc 100644 (file)
@@ -5376,7 +5376,6 @@ static inline void wiphy_unlock(struct wiphy *wiphy)
  *     netdev and may otherwise be used by driver read-only, will be updated
  *     by cfg80211 on change_interface
  * @mgmt_registrations: list of registrations for management frames
- * @mgmt_registrations_lock: lock for the list
  * @mgmt_registrations_need_update: mgmt registrations were updated,
  *     need to propagate the update to the driver
  * @mtx: mutex used to lock data in this struct, may be used by drivers
@@ -5423,7 +5422,6 @@ struct wireless_dev {
        u32 identifier;
 
        struct list_head mgmt_registrations;
-       spinlock_t mgmt_registrations_lock;
        u8 mgmt_registrations_need_update:1;
 
        struct mutex mtx;
index 6026bbe..3214848 100644 (file)
@@ -69,6 +69,10 @@ struct mptcp_out_options {
                struct {
                        u64 sndr_key;
                        u64 rcvr_key;
+                       u64 data_seq;
+                       u32 subflow_seq;
+                       u16 data_len;
+                       __sum16 csum;
                };
                struct {
                        struct mptcp_addr_info addr;
index ea6fbc8..463f390 100644 (file)
@@ -1208,7 +1208,7 @@ struct proto {
 #endif
 
        bool                    (*stream_memory_free)(const struct sock *sk, int wake);
-       bool                    (*stream_memory_read)(const struct sock *sk);
+       bool                    (*sock_is_readable)(struct sock *sk);
        /* Memory pressure */
        void                    (*enter_memory_pressure)(struct sock *sk);
        void                    (*leave_memory_pressure)(struct sock *sk);
@@ -2820,4 +2820,10 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs);
 
 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
 
+static inline bool sk_is_readable(struct sock *sk)
+{
+       if (sk->sk_prot->sock_is_readable)
+               return sk->sk_prot->sock_is_readable(sk);
+       return false;
+}
 #endif /* _SOCK_H */
index be4b3e1..1fffb20 100644 (file)
@@ -358,6 +358,7 @@ int tls_sk_query(struct sock *sk, int optname, char __user *optval,
                int __user *optlen);
 int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
                  unsigned int optlen);
+void tls_err_abort(struct sock *sk, int err);
 
 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
 void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
@@ -375,7 +376,7 @@ void tls_sw_release_resources_rx(struct sock *sk);
 void tls_sw_free_ctx_rx(struct tls_context *tls_ctx);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                   int nonblock, int flags, int *addr_len);
-bool tls_sw_stream_read(const struct sock *sk);
+bool tls_sw_sock_is_readable(struct sock *sk);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
                           struct pipe_inode_info *pipe,
                           size_t len, unsigned int flags);
@@ -466,12 +467,6 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk)
 #endif
 }
 
-static inline void tls_err_abort(struct sock *sk, int err)
-{
-       sk->sk_err = err;
-       sk_error_report(sk);
-}
-
 static inline bool tls_bigint_increment(unsigned char *seq, int len)
 {
        int i;
@@ -512,7 +507,7 @@ static inline void tls_advance_record_sn(struct sock *sk,
                                         struct cipher_context *ctx)
 {
        if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size))
-               tls_err_abort(sk, EBADMSG);
+               tls_err_abort(sk, -EBADMSG);
 
        if (prot->version != TLS_1_3_VERSION &&
            prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305)
index 360df45..909ecf4 100644 (file)
@@ -494,8 +494,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
         * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial
         * packets in udp_gro_complete_segment. As does UDP GSO, verified by
         * udp_send_skb. But when those packets are looped in dev_loopback_xmit
-        * their ip_summed is set to CHECKSUM_UNNECESSARY. Reset in this
-        * specific case, where PARTIAL is both correct and required.
+        * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY.
+        * Reset in this specific case, where PARTIAL is both correct and
+        * required.
         */
        if (skb->pkt_type == PACKET_LOOPBACK)
                skb->ip_summed = CHECKSUM_PARTIAL;
index eaf04c9..3107806 100644 (file)
@@ -396,4 +396,7 @@ static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd)
 extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc,
                             u8 key, u8 asc, u8 ascq);
 
+struct request *scsi_alloc_request(struct request_queue *q,
+               unsigned int op, blk_mq_req_flags_t flags);
+
 #endif /* _SCSI_SCSI_CMND_H */
index b97e142..430b73b 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <scsi/scsi.h>
 #include <linux/atomic.h>
 #include <linux/sbitmap.h>
index cc5ab96..a95daa4 100644 (file)
@@ -114,7 +114,7 @@ TRACE_EVENT(block_rq_requeue,
  */
 TRACE_EVENT(block_rq_complete,
 
-       TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),
+       TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),
 
        TP_ARGS(rq, error, nr_bytes),
 
@@ -122,7 +122,7 @@ TRACE_EVENT(block_rq_complete,
                __field(  dev_t,        dev                     )
                __field(  sector_t,     sector                  )
                __field(  unsigned int, nr_sector               )
-               __field(  int,          error                   )
+               __field(  int   ,       error                   )
                __array(  char,         rwbs,   RWBS_LEN        )
                __dynamic_array( char,  cmd,    1               )
        ),
@@ -131,7 +131,7 @@ TRACE_EVENT(block_rq_complete,
                __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_pos(rq);
                __entry->nr_sector = nr_bytes >> 9;
-               __entry->error     = error;
+               __entry->error     = blk_status_to_errno(error);
 
                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
                __get_str(cmd)[0] = '\0';
index db4f2ce..16ae7b6 100644 (file)
@@ -24,7 +24,7 @@ struct erofs_map_blocks;
 #define show_mflags(flags) __print_flags(flags, "",    \
        { EROFS_MAP_MAPPED,     "M" },                  \
        { EROFS_MAP_META,       "I" },                  \
-       { EROFS_MAP_ZIPPED,     "Z" })
+       { EROFS_MAP_ENCODED,    "E" })
 
 TRACE_EVENT(erofs_lookup,
 
index 0dd30de..7346f01 100644 (file)
@@ -6,6 +6,7 @@
 #define _TRACE_IO_URING_H
 
 #include <linux/tracepoint.h>
+#include <uapi/linux/io_uring.h>
 
 struct io_wq_work;
 
@@ -497,6 +498,66 @@ TRACE_EVENT(io_uring_task_run,
                  (unsigned long long) __entry->user_data)
 );
 
+/*
+ * io_uring_req_failed - called when an sqe is errored during submission
+ *
+ * @sqe:               pointer to the io_uring_sqe that failed
+ * @error:             error it failed with
+ *
+ * Allows easier diagnosing of malformed requests in production systems.
+ */
+TRACE_EVENT(io_uring_req_failed,
+
+       TP_PROTO(const struct io_uring_sqe *sqe, int error),
+
+       TP_ARGS(sqe, error),
+
+       TP_STRUCT__entry (
+               __field(  u8,   opcode )
+               __field(  u8,   flags )
+               __field(  u8,   ioprio )
+               __field( u64,   off )
+               __field( u64,   addr )
+               __field( u32,   len )
+               __field( u32,   op_flags )
+               __field( u64,   user_data )
+               __field( u16,   buf_index )
+               __field( u16,   personality )
+               __field( u32,   file_index )
+               __field( u64,   pad1 )
+               __field( u64,   pad2 )
+               __field( int,   error )
+       ),
+
+       TP_fast_assign(
+               __entry->opcode         = sqe->opcode;
+               __entry->flags          = sqe->flags;
+               __entry->ioprio         = sqe->ioprio;
+               __entry->off            = sqe->off;
+               __entry->addr           = sqe->addr;
+               __entry->len            = sqe->len;
+               __entry->op_flags       = sqe->rw_flags;
+               __entry->user_data      = sqe->user_data;
+               __entry->buf_index      = sqe->buf_index;
+               __entry->personality    = sqe->personality;
+               __entry->file_index     = sqe->file_index;
+               __entry->pad1           = sqe->__pad2[0];
+               __entry->pad2           = sqe->__pad2[1];
+               __entry->error          = error;
+       ),
+
+       TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, "
+                 "len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, "
+                 "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
+                 __entry->opcode, __entry->flags, __entry->ioprio,
+                 (unsigned long long)__entry->off,
+                 (unsigned long long) __entry->addr, __entry->len,
+                 __entry->op_flags, (unsigned long long) __entry->user_data,
+                 __entry->buf_index, __entry->personality, __entry->file_index,
+                 (unsigned long long) __entry->pad1,
+                 (unsigned long long) __entry->pad2, __entry->error)
+);
+
 #endif /* _TRACE_IO_URING_H */
 
 /* This part must be outside protection */
index 1d28431..171524d 100644 (file)
 #define PAGEMAP_MAPPEDDISK     0x0020u
 #define PAGEMAP_BUFFERS                0x0040u
 
-#define trace_pagemap_flags(page) ( \
-       (PageAnon(page)         ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
-       (page_mapped(page)      ? PAGEMAP_MAPPED     : 0) | \
-       (PageSwapCache(page)    ? PAGEMAP_SWAPCACHE  : 0) | \
-       (PageSwapBacked(page)   ? PAGEMAP_SWAPBACKED : 0) | \
-       (PageMappedToDisk(page) ? PAGEMAP_MAPPEDDISK : 0) | \
-       (page_has_private(page) ? PAGEMAP_BUFFERS    : 0) \
+#define trace_pagemap_flags(folio) ( \
+       (folio_test_anon(folio)         ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
+       (folio_mapped(folio)            ? PAGEMAP_MAPPED     : 0) | \
+       (folio_test_swapcache(folio)    ? PAGEMAP_SWAPCACHE  : 0) | \
+       (folio_test_swapbacked(folio)   ? PAGEMAP_SWAPBACKED : 0) | \
+       (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \
+       (folio_test_private(folio)      ? PAGEMAP_BUFFERS    : 0) \
        )
 
 TRACE_EVENT(mm_lru_insertion,
 
-       TP_PROTO(struct page *page),
+       TP_PROTO(struct folio *folio),
 
-       TP_ARGS(page),
+       TP_ARGS(folio),
 
        TP_STRUCT__entry(
-               __field(struct page *,  page    )
+               __field(struct folio *, folio   )
                __field(unsigned long,  pfn     )
                __field(enum lru_list,  lru     )
                __field(unsigned long,  flags   )
        ),
 
        TP_fast_assign(
-               __entry->page   = page;
-               __entry->pfn    = page_to_pfn(page);
-               __entry->lru    = page_lru(page);
-               __entry->flags  = trace_pagemap_flags(page);
+               __entry->folio  = folio;
+               __entry->pfn    = folio_pfn(folio);
+               __entry->lru    = folio_lru_list(folio);
+               __entry->flags  = trace_pagemap_flags(folio);
        ),
 
        /* Flag format is based on page-types.c formatting for pagemap */
-       TP_printk("page=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s",
-                       __entry->page,
+       TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s",
+                       __entry->folio,
                        __entry->pfn,
                        __entry->lru,
                        __entry->flags & PAGEMAP_MAPPED         ? "M" : " ",
@@ -60,23 +60,21 @@ TRACE_EVENT(mm_lru_insertion,
 
 TRACE_EVENT(mm_lru_activate,
 
-       TP_PROTO(struct page *page),
+       TP_PROTO(struct folio *folio),
 
-       TP_ARGS(page),
+       TP_ARGS(folio),
 
        TP_STRUCT__entry(
-               __field(struct page *,  page    )
+               __field(struct folio *, folio   )
                __field(unsigned long,  pfn     )
        ),
 
        TP_fast_assign(
-               __entry->page   = page;
-               __entry->pfn    = page_to_pfn(page);
+               __entry->folio  = folio;
+               __entry->pfn    = folio_pfn(folio);
        ),
 
-       /* Flag format is based on page-types.c formatting for pagemap */
-       TP_printk("page=%p pfn=0x%lx", __entry->page, __entry->pfn)
-
+       TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
 );
 
 #endif /* _TRACE_PAGEMAP_H */
index 840d1ba..7dccb66 100644 (file)
@@ -52,11 +52,11 @@ WB_WORK_REASON
 
 struct wb_writeback_work;
 
-DECLARE_EVENT_CLASS(writeback_page_template,
+DECLARE_EVENT_CLASS(writeback_folio_template,
 
-       TP_PROTO(struct page *page, struct address_space *mapping),
+       TP_PROTO(struct folio *folio, struct address_space *mapping),
 
-       TP_ARGS(page, mapping),
+       TP_ARGS(folio, mapping),
 
        TP_STRUCT__entry (
                __array(char, name, 32)
@@ -69,7 +69,7 @@ DECLARE_EVENT_CLASS(writeback_page_template,
                            bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
                                         NULL), 32);
                __entry->ino = mapping ? mapping->host->i_ino : 0;
-               __entry->index = page->index;
+               __entry->index = folio->index;
        ),
 
        TP_printk("bdi %s: ino=%lu index=%lu",
@@ -79,18 +79,18 @@ DECLARE_EVENT_CLASS(writeback_page_template,
        )
 );
 
-DEFINE_EVENT(writeback_page_template, writeback_dirty_page,
+DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio,
 
-       TP_PROTO(struct page *page, struct address_space *mapping),
+       TP_PROTO(struct folio *folio, struct address_space *mapping),
 
-       TP_ARGS(page, mapping)
+       TP_ARGS(folio, mapping)
 );
 
-DEFINE_EVENT(writeback_page_template, wait_on_page_writeback,
+DEFINE_EVENT(writeback_folio_template, folio_wait_writeback,
 
-       TP_PROTO(struct page *page, struct address_space *mapping),
+       TP_PROTO(struct folio *folio, struct address_space *mapping),
 
-       TP_ARGS(page, mapping)
+       TP_ARGS(folio, mapping)
 );
 
 DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
@@ -236,9 +236,9 @@ TRACE_EVENT(inode_switch_wbs,
 
 TRACE_EVENT(track_foreign_dirty,
 
-       TP_PROTO(struct page *page, struct bdi_writeback *wb),
+       TP_PROTO(struct folio *folio, struct bdi_writeback *wb),
 
-       TP_ARGS(page, wb),
+       TP_ARGS(folio, wb),
 
        TP_STRUCT__entry(
                __array(char,           name, 32)
@@ -250,7 +250,7 @@ TRACE_EVENT(track_foreign_dirty,
        ),
 
        TP_fast_assign(
-               struct address_space *mapping = page_mapping(page);
+               struct address_space *mapping = folio_mapping(folio);
                struct inode *inode = mapping ? mapping->host : NULL;
 
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
@@ -258,7 +258,7 @@ TRACE_EVENT(track_foreign_dirty,
                __entry->ino            = inode ? inode->i_ino : 0;
                __entry->memcg_id       = wb->memcg_css->id;
                __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
-               __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup);
+               __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
        ),
 
        TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
index 9dc0bf0..ecd0f5b 100644 (file)
@@ -181,6 +181,10 @@ struct f_owner_ex {
                                   blocking */
 #define LOCK_UN                8       /* remove lock */
 
+/*
+ * LOCK_MAND support has been removed from the kernel. We leave the symbols
+ * here to not break legacy builds, but these should not be used in new code.
+ */
 #define LOCK_MAND      32      /* This is a mandatory flock ... */
 #define LOCK_READ      64      /* which allows concurrent read operations */
 #define LOCK_WRITE     128     /* which allows concurrent write operations */
index 6c34f6e..804ff8d 100644 (file)
 #define CDROM_NEXT_WRITABLE    0x5394  /* get next writable block */
 #define CDROM_LAST_WRITTEN     0x5395  /* get last block written on disc */
 
+#define CDROM_TIMED_MEDIA_CHANGE   0x5396  /* get the timestamp of the last media change */
+
 /*******************************************************
  * CDROM IOCTL structures
  *******************************************************/
@@ -295,6 +297,23 @@ struct cdrom_generic_command
        };
 };
 
+/* This struct is used by CDROM_TIMED_MEDIA_CHANGE */
+struct cdrom_timed_media_change_info {
+       __s64   last_media_change;      /* Timestamp of the last detected media
+                                        * change in ms. May be set by caller,
+                                        * updated upon successful return of
+                                        * ioctl.
+                                        */
+       __u64   media_flags;            /* Flags returned by ioctl to indicate
+                                        * media status.
+                                        */
+};
+#define MEDIA_CHANGED_FLAG     0x1     /* Last detected media change was more
+                                        * recent than last_media_change set by
+                                        * caller.
+                                        */
+/* other bits of media_flags available for future use */
+
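/*
 * A hedged editorial sketch (not part of the patch) of how userspace might
 * call the new ioctl. The device path and error handling are illustrative.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cdrom.h>

int main(void)
{
	struct cdrom_timed_media_change_info info = {
		.last_media_change = 0,	/* compare against "never" */
	};
	int fd = open("/dev/sr0", O_RDONLY | O_NONBLOCK);

	if (fd < 0 || ioctl(fd, CDROM_TIMED_MEDIA_CHANGE, &info) < 0) {
		perror("CDROM_TIMED_MEDIA_CHANGE");
		return 1;
	}
	printf("last media change: %lld ms; changed since: %s\n",
	       (long long)info.last_media_change,
	       (info.media_flags & MEDIA_CHANGED_FLAG) ? "yes" : "no");
	close(fd);
	return 0;
}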
 /*
  * A CD-ROM physical sector size is 2048, 2052, 2056, 2324, 2332, 2336, 
  * 2340, or 2352 bytes long.  
index b270a07..c45b5e9 100644 (file)
@@ -158,6 +158,7 @@ enum {
 #define IORING_TIMEOUT_BOOTTIME                (1U << 2)
 #define IORING_TIMEOUT_REALTIME                (1U << 3)
 #define IORING_LINK_TIMEOUT_UPDATE     (1U << 4)
+#define IORING_TIMEOUT_ETIME_SUCCESS   (1U << 5)
 #define IORING_TIMEOUT_CLOCK_MASK      (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
 #define IORING_TIMEOUT_UPDATE_MASK     (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
 /*
index 3c4054a..4162d7f 100644 (file)
@@ -83,7 +83,6 @@
 #include <linux/ptrace.h>
 #include <linux/pti.h>
 #include <linux/blkdev.h>
-#include <linux/elevator.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
index 23a7ab8..3df53cf 100644 (file)
@@ -60,7 +60,6 @@
 #include <linux/sched/cputime.h>
 
 #include <asm/div64.h>
-#include <linux/blkdev.h> /* sector_div */
 #include <linux/pid_namespace.h>
 #include <linux/fs_pin.h>
 
index cebd4fb..447def5 100644 (file)
@@ -1072,6 +1072,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
        INIT_WORK(&aux->work, prog_array_map_clear_deferred);
        INIT_LIST_HEAD(&aux->poke_progs);
        mutex_init(&aux->poke_mutex);
+       spin_lock_init(&aux->owner.lock);
 
        map = array_map_alloc(attr);
        if (IS_ERR(map)) {
index d6b7dfd..6e3ae90 100644 (file)
@@ -524,6 +524,7 @@ int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
 int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
 int bpf_jit_harden   __read_mostly;
 long bpf_jit_limit   __read_mostly;
+long bpf_jit_limit_max __read_mostly;
 
 static void
 bpf_prog_ksym_set_addr(struct bpf_prog *prog)
@@ -817,7 +818,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
 static int __init bpf_jit_charge_init(void)
 {
        /* Only used as heuristic here to derive limit. */
-       bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+       bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+       bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
                                            PAGE_SIZE), LONG_MAX);
        return 0;
 }
@@ -1821,20 +1823,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
 bool bpf_prog_array_compatible(struct bpf_array *array,
                               const struct bpf_prog *fp)
 {
+       bool ret;
+
        if (fp->kprobe_override)
                return false;
 
-       if (!array->aux->type) {
+       spin_lock(&array->aux->owner.lock);
+
+       if (!array->aux->owner.type) {
                /* There's no owner yet where we could check for
                 * compatibility.
                 */
-               array->aux->type  = fp->type;
-               array->aux->jited = fp->jited;
-               return true;
+               array->aux->owner.type  = fp->type;
+               array->aux->owner.jited = fp->jited;
+               ret = true;
+       } else {
+               ret = array->aux->owner.type  == fp->type &&
+                     array->aux->owner.jited == fp->jited;
        }
-
-       return array->aux->type  == fp->type &&
-              array->aux->jited == fp->jited;
+       spin_unlock(&array->aux->owner.lock);
+       return ret;
 }
 
 static int bpf_check_tail_call(const struct bpf_prog *fp)
index 4e50c0b..1cad697 100644 (file)
@@ -543,8 +543,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 
        if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
                array = container_of(map, struct bpf_array, map);
-               type  = array->aux->type;
-               jited = array->aux->jited;
+               spin_lock(&array->aux->owner.lock);
+               type  = array->aux->owner.type;
+               jited = array->aux->owner.jited;
+               spin_unlock(&array->aux->owner.lock);
        }
 
        seq_printf(m,
@@ -1337,12 +1339,11 @@ int generic_map_update_batch(struct bpf_map *map,
        void __user *values = u64_to_user_ptr(attr->batch.values);
        void __user *keys = u64_to_user_ptr(attr->batch.keys);
        u32 value_size, cp, max_count;
-       int ufd = attr->map_fd;
+       int ufd = attr->batch.map_fd;
        void *key, *value;
        struct fd f;
        int err = 0;
 
-       f = fdget(ufd);
        if (attr->batch.elem_flags & ~BPF_F_LOCK)
                return -EINVAL;
 
@@ -1367,6 +1368,7 @@ int generic_map_update_batch(struct bpf_map *map,
                return -ENOMEM;
        }
 
+       f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
        for (cp = 0; cp < max_count; cp++) {
                err = -EFAULT;
                if (copy_from_user(key, keys + cp * map->key_size,
@@ -1386,6 +1388,7 @@ int generic_map_update_batch(struct bpf_map *map,
 
        kvfree(value);
        kvfree(key);
+       fdput(f);
        return err;
 }
 
index e76b559..de00655 100644 (file)
@@ -13319,7 +13319,7 @@ BTF_SET_START(btf_non_sleepable_error_inject)
 /* Three functions below can be called from sleepable and non-sleepable context.
  * Assume non-sleepable from bpf safety point of view.
  */
-BTF_ID(func, __add_to_page_cache_locked)
+BTF_ID(func, __filemap_add_folio)
 BTF_ID(func, should_fail_alloc_page)
 BTF_ID(func, should_failslab)
 BTF_SET_END(btf_non_sleepable_error_inject)
index 570b0c9..ea08f01 100644 (file)
@@ -2187,8 +2187,10 @@ static void cgroup_kill_sb(struct super_block *sb)
         * And don't kill the default root.
         */
        if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
-           !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+           !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+               cgroup_bpf_offline(&root->cgrp);
                percpu_ref_kill(&root->cgrp.self.refcnt);
+       }
        cgroup_put(&root->cgrp);
        kernfs_kill_sb(sb);
 }
index af24dc3..6357c35 100644 (file)
@@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
                                addr + PAGE_SIZE);
 
        if (new_page) {
-               err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
+               err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
+                                       GFP_KERNEL);
                if (err)
                        return err;
        }
index 91a43e5..a53863d 100644 (file)
@@ -48,7 +48,6 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
-#include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
index 38681ad..67679e3 100644 (file)
@@ -76,7 +76,6 @@
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
 #include <linux/tty.h>
-#include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
 #include <linux/perf_event.h>
index f21714e..59bea52 100644 (file)
@@ -13,7 +13,7 @@
 #include "sched.h"
 
 #include <linux/nospec.h>
-
+#include <linux/blkdev.h>
 #include <linux/kcov.h>
 #include <linux/scs.h>
 
@@ -6343,7 +6343,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
         * make sure to submit it to avoid deadlocks.
         */
        if (blk_needs_flush_plug(tsk))
-               blk_schedule_flush_plug(tsk);
+               blk_flush_plug(tsk->plug, true);
 }
 
 static void sched_update_worker(struct task_struct *tsk)
@@ -8354,7 +8354,8 @@ int io_schedule_prepare(void)
        int old_iowait = current->in_iowait;
 
        current->in_iowait = 1;
-       blk_schedule_flush_plug(current);
+       if (current->plug)
+               blk_flush_plug(current->plug, true);
 
        return old_iowait;
 }
index 3d3e579..66128df 100644 (file)
@@ -37,7 +37,6 @@
 
 #include <linux/binfmts.h>
 #include <linux/bitops.h>
-#include <linux/blkdev.h>
 #include <linux/compat.h>
 #include <linux/context_tracking.h>
 #include <linux/cpufreq.h>
index fa91f39..1183c88 100644 (file)
@@ -816,7 +816,7 @@ blk_trace_request_get_cgid(struct request *rq)
  *     Records an action against a request. Will log the bio offset + size.
  *
  **/
-static void blk_add_trace_rq(struct request *rq, int error,
+static void blk_add_trace_rq(struct request *rq, blk_status_t error,
                             unsigned int nr_bytes, u32 what, u64 cgid)
 {
        struct blk_trace *bt;
@@ -834,7 +834,8 @@ static void blk_add_trace_rq(struct request *rq, int error,
                what |= BLK_TC_ACT(BLK_TC_FS);
 
        __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
-                       rq->cmd_flags, what, error, 0, NULL, cgid);
+                       rq->cmd_flags, what, blk_status_to_errno(error), 0,
+                       NULL, cgid);
        rcu_read_unlock();
 }
 
@@ -863,7 +864,7 @@ static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
 }
 
 static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
-                       int error, unsigned int nr_bytes)
+                       blk_status_t error, unsigned int nr_bytes)
 {
        blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
                         blk_trace_request_get_cgid(rq));
index 635fbdc..feebf57 100644 (file)
@@ -2208,7 +2208,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 }
 
 /**
- * ftrace_update_record, set a record that now is tracing or not
+ * ftrace_update_record - set a record that now is tracing or not
  * @rec: the record to update
  * @enable: set to true if the record is tracing, false to force disable
  *
@@ -2221,7 +2221,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
 }
 
 /**
- * ftrace_test_record, check if the record has been enabled or not
+ * ftrace_test_record - check if the record has been enabled or not
  * @rec: the record to test
  * @enable: set to true to check if enabled, false if it is disabled
  *
@@ -2574,7 +2574,7 @@ struct ftrace_rec_iter {
 };
 
 /**
- * ftrace_rec_iter_start, start up iterating over traced functions
+ * ftrace_rec_iter_start - start up iterating over traced functions
  *
  * Returns an iterator handle that is used to iterate over all
  * the records that represent address locations where functions
@@ -2605,7 +2605,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
 }
 
 /**
- * ftrace_rec_iter_next, get the next record to process.
+ * ftrace_rec_iter_next - get the next record to process.
  * @iter: The handle to the iterator.
  *
  * Returns the next iterator after the given iterator @iter.
@@ -2630,7 +2630,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
 }
 
 /**
- * ftrace_rec_iter_record, get the record at the iterator location
+ * ftrace_rec_iter_record - get the record at the iterator location
  * @iter: The current iterator location
  *
  * Returns the record that the current @iter is at.
@@ -2733,7 +2733,7 @@ static int __ftrace_modify_code(void *data)
 }
 
 /**
- * ftrace_run_stop_machine, go back to the stop machine method
+ * ftrace_run_stop_machine - go back to the stop machine method
  * @command: The command to tell ftrace what to do
  *
  * If an arch needs to fall back to the stop machine method, the
@@ -2745,7 +2745,7 @@ void ftrace_run_stop_machine(int command)
 }
 
 /**
- * arch_ftrace_update_code, modify the code to trace or not trace
+ * arch_ftrace_update_code - modify the code to trace or not trace
  * @command: The command that needs to be done
  *
  * Archs can override this function if it does not need to
@@ -7525,7 +7525,9 @@ void ftrace_kill(void)
 }
 
 /**
- * Test if ftrace is dead or not.
+ * ftrace_is_dead - Test if ftrace is dead or not.
+ *
+ * Returns 1 if ftrace is "dead", zero otherwise.
  */
 int ftrace_is_dead(void)
 {
index c4a15ae..928867f 100644 (file)
@@ -904,8 +904,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
 
        if (IS_ERR(ep)) {
                ret = PTR_ERR(ep);
-               /* This must return -ENOMEM, else there is a bug */
-               WARN_ON_ONCE(ret != -ENOMEM);
+               /* This must return -ENOMEM or -ENODEV (missing event), else there is a bug */
+               WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
                ep = NULL;
                goto error;
        }
index a2f38e2..9f4262e 100644 (file)
@@ -20,8 +20,8 @@
  *
  * The worst case for in-place decompression is that the beginning of
  * the file is compressed extremely well, and the rest of the file is
- * uncompressible. Thus, we must look for worst-case expansion when the
- * compressor is encoding uncompressible data.
+ * incompressible. Thus, we must look for worst-case expansion when the
+ * compressor is encoding incompressible data.
  *
  * The structure of the .xz file in case of a compressed kernel is as follows.
  * Sizes (as bytes) of the fields are in parenthesis.
@@ -58,7 +58,7 @@
  * uncompressed size of the payload is in practice never less than the
  * payload size itself. The LZMA2 format would allow uncompressed size
  * to be less than the payload size, but no sane compressor creates such
- * files. LZMA2 supports storing uncompressible data in uncompressed form,
+ * files. LZMA2 supports storing incompressible data in uncompressed form,
  * so there's never a need to create payloads whose uncompressed size is
  * smaller than the compressed size.
  *
  * memeq and memzero are not used much and any remotely sane implementation
  * is fast enough. memcpy/memmove speed matters in multi-call mode, but
  * the kernel image is decompressed in single-call mode, in which only
- * memcpy speed can matter and only if there is a lot of uncompressible data
- * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the
+ * memmove speed can matter and only if there is a lot of incompressible data
+ * (LZMA2 stores incompressible chunks in uncompressed form). Thus, the
  * functions below should just be kept small; it's probably not worth
  * optimizing for speed.
  */
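
These comment fixes track the memcpy() to memmove() switch made in the LZMA2 decoder later in this series: with malformed input, in-place decompression can end up with overlapping source and destination buffers, which is undefined behaviour for memcpy() but well defined for memmove(). A minimal userspace sketch of the difference (illustrative only, not kernel code):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16] = "abcdefgh";

	/*
	 * Overlapping copy: shift the first six bytes right by two.
	 * memcpy() would be undefined behaviour here; memmove() copies
	 * as if through a temporary buffer and is always safe.
	 */
	memmove(buf + 2, buf, 6);
	printf("%s\n", buf);	/* prints "ababcdef" */
	return 0;
}
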
index 4515439..53e7eb1 100644 (file)
@@ -217,11 +217,12 @@ static void fprop_reflect_period_percpu(struct fprop_global *p,
 }
 
 /* Event of type pl happened */
-void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl,
+               long nr)
 {
        fprop_reflect_period_percpu(p, pl);
-       percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
-       percpu_counter_add(&p->events, 1);
+       percpu_counter_add_batch(&pl->events, nr, PROP_BATCH);
+       percpu_counter_add(&p->events, nr);
 }
 
 void fprop_fraction_percpu(struct fprop_global *p,
@@ -253,20 +254,29 @@ void fprop_fraction_percpu(struct fprop_global *p,
 }
 
 /*
- * Like __fprop_inc_percpu() except that event is counted only if the given
+ * Like __fprop_add_percpu() except that event is counted only if the given
  * type has fraction smaller than @max_frac/FPROP_FRAC_BASE
  */
-void __fprop_inc_percpu_max(struct fprop_global *p,
-                           struct fprop_local_percpu *pl, int max_frac)
+void __fprop_add_percpu_max(struct fprop_global *p,
+               struct fprop_local_percpu *pl, int max_frac, long nr)
 {
        if (unlikely(max_frac < FPROP_FRAC_BASE)) {
                unsigned long numerator, denominator;
+               s64 tmp;
 
                fprop_fraction_percpu(p, pl, &numerator, &denominator);
-               if (numerator >
-                   (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT)
+               /* Adding 'nr' to fraction exceeds max_frac/FPROP_FRAC_BASE? */
+               tmp = (u64)denominator * max_frac -
+                                       ((u64)numerator << FPROP_FRAC_SHIFT);
+               if (tmp < 0) {
+                       /* Maximum fraction already exceeded? */
                        return;
+               } else if (tmp < nr * (FPROP_FRAC_BASE - max_frac)) {
+                       /* Add just enough for the fraction to saturate */
+                       nr = div_u64(tmp + FPROP_FRAC_BASE - max_frac - 1,
+                                       FPROP_FRAC_BASE - max_frac);
+               }
        }
 
-       __fprop_inc_percpu(p, pl);
+       __fprop_add_percpu(p, pl, nr);
 }
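
The new clamp works out, in integer arithmetic, how far nr can go before (numerator + nr) / (denominator + nr) reaches max_frac / FPROP_FRAC_BASE: rearranging that inequality gives nr * (FPROP_FRAC_BASE - max_frac) <= denominator * max_frac - (numerator << FPROP_FRAC_SHIFT), which is exactly the tmp comparison above. A standalone sketch of the same algebra with made-up numbers (a 50% cap, FPROP_FRAC_BASE taken as 1024 purely for illustration):

#include <stdio.h>

int main(void)
{
	long long numerator = 100, denominator = 1000;	/* current counters */
	long long frac_base = 1024, max_frac = 512;	/* 50% cap */
	long long nr = 100000;				/* requested increment */
	long long tmp;

	tmp = denominator * max_frac - numerator * frac_base;
	if (tmp < 0)
		nr = 0;		/* fraction already over the cap: add nothing */
	else if (tmp < nr * (frac_base - max_frac))
		/* round up so the fraction just saturates */
		nr = (tmp + frac_base - max_frac - 1) / (frac_base - max_frac);

	/* prints 800: (100 + 800) / (1000 + 800) is exactly 50% */
	printf("clamped nr = %lld\n", nr);
	return 0;
}
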
index 4d0e05e..a57a0e1 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include <trace/events/random.h>
 
index b25db9b..2709ab8 100644 (file)
@@ -489,6 +489,57 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
 
+unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
+                                       unsigned int *offset)
+{
+       struct sbitmap *sb = &sbq->sb;
+       unsigned int hint, depth;
+       unsigned long index, nr;
+       int i;
+
+       if (unlikely(sb->round_robin))
+               return 0;
+
+       depth = READ_ONCE(sb->depth);
+       hint = update_alloc_hint_before_get(sb, depth);
+
+       index = SB_NR_TO_INDEX(sb, hint);
+
+       for (i = 0; i < sb->map_nr; i++) {
+               struct sbitmap_word *map = &sb->map[index];
+               unsigned long get_mask;
+
+               sbitmap_deferred_clear(map);
+               if (map->word == (1UL << (map->depth - 1)) - 1)
+                       continue;
+
+               nr = find_first_zero_bit(&map->word, map->depth);
+               if (nr + nr_tags <= map->depth) {
+                       atomic_long_t *ptr = (atomic_long_t *) &map->word;
+                       int map_tags = min_t(int, nr_tags, map->depth);
+                       unsigned long val, ret;
+
+                       get_mask = ((1UL << map_tags) - 1) << nr;
+                       do {
+                               val = READ_ONCE(map->word);
+                               ret = atomic_long_cmpxchg(ptr, val, get_mask | val);
+                       } while (ret != val);
+                       get_mask = (get_mask & ~ret) >> nr;
+                       if (get_mask) {
+                               *offset = nr + (index << sb->shift);
+                               update_alloc_hint_after_get(sb, depth, hint,
+                                                       *offset + map_tags - 1);
+                               return get_mask;
+                       }
+               }
+               /* Jump to next index. */
+               if (++index >= sb->map_nr)
+                       index = 0;
+       }
+
+       return 0;
+}
+
 int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
                                unsigned int shallow_depth)
 {
@@ -577,6 +628,46 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
+static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
+{
+       if (likely(!sb->round_robin && tag < sb->depth))
+               data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag);
+}
+
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+                               int *tags, int nr_tags)
+{
+       struct sbitmap *sb = &sbq->sb;
+       unsigned long *addr = NULL;
+       unsigned long mask = 0;
+       int i;
+
+       smp_mb__before_atomic();
+       for (i = 0; i < nr_tags; i++) {
+               const int tag = tags[i] - offset;
+               unsigned long *this_addr;
+
+               /* since we're clearing a batch, skip the deferred map */
+               this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
+               if (!addr) {
+                       addr = this_addr;
+               } else if (addr != this_addr) {
+                       atomic_long_andnot(mask, (atomic_long_t *) addr);
+                       mask = 0;
+                       addr = this_addr;
+               }
+               mask |= (1UL << SB_NR_TO_BIT(sb, tag));
+       }
+
+       if (mask)
+               atomic_long_andnot(mask, (atomic_long_t *) addr);
+
+       smp_mb__after_atomic();
+       sbitmap_queue_wake_up(sbq);
+       sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
+                                       tags[nr_tags - 1] - offset);
+}
+
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu)
 {
@@ -601,9 +692,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
         */
        smp_mb__after_atomic();
        sbitmap_queue_wake_up(sbq);
-
-       if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
-               *per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
+       sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
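
__sbitmap_queue_get_batch() above claims a whole run of tags with one cmpxchg on the bitmap word and then keeps only the bits that were actually free in the value it raced against. A minimal userspace sketch of that claim-and-filter pattern using C11 atomics (illustrative only, none of the sbitmap internals):

#include <stdatomic.h>
#include <stdio.h>

static unsigned long claim_bits(_Atomic unsigned long *word,
				unsigned int first, unsigned int count)
{
	unsigned long mask = ((1UL << count) - 1) << first;
	unsigned long old = atomic_load(word);

	/* Set the whole run in one go; 'old' is refreshed on failure. */
	while (!atomic_compare_exchange_weak(word, &old, old | mask))
		;

	/* Bits that were already set in 'old' belong to someone else. */
	return (mask & ~old) >> first;
}

int main(void)
{
	_Atomic unsigned long word = 0x5UL;	/* bits 0 and 2 already taken */
	unsigned long got = claim_bits(&word, 0, 4);

	/* prints 0xa: the caller owns bits 1 and 3 of the requested batch */
	printf("claimed mask within the batch: %#lx\n", got);
	return 0;
}
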
 
index 5cb5024..adce22a 100644 (file)
@@ -39,6 +39,19 @@ config XZ_DEC_SPARC
        default y
        select XZ_DEC_BCJ
 
+config XZ_DEC_MICROLZMA
+       bool "MicroLZMA decoder"
+       default n
+       help
+         MicroLZMA is a header format variant where the first byte
+         of a raw LZMA stream (without the end of stream marker) has
+         been replaced with a bitwise-negation of the lc/lp/pb
+         properties byte. MicroLZMA was created to be used in EROFS
+         but can be used elsewhere too, wherever wasting as little
+         space as possible on headers is important.
+
+         Unless you know that you need this, say N.
+
 endif
 
 config XZ_DEC_BCJ
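
Per the help text above, the only on-disk difference from a raw LZMA stream is that first byte: the classic LZMA properties byte (conventionally encoded as (pb * 5 + lp) * 9 + lc) is stored bitwise-negated. A tiny sketch assuming that classic encoding (the values are just the common defaults):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t lc = 3, lp = 0, pb = 2;		/* common LZMA defaults */
	uint8_t props = (pb * 5 + lp) * 9 + lc;	/* classic props byte: 0x5d */
	uint8_t first_byte = (uint8_t)~props;	/* what MicroLZMA stores */

	printf("props=%#x, MicroLZMA first byte=%#x\n", props, first_byte);

	/*
	 * A decoder recovers the properties by negating the byte again,
	 * which is what the ~b->in[b->in_pos] in the decoder added later
	 * in this series does.
	 */
	return 0;
}
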
index 7a6781e..27ce345 100644 (file)
@@ -248,6 +248,10 @@ struct lzma2_dec {
         * before the first LZMA chunk.
         */
        bool need_props;
+
+#ifdef XZ_DEC_MICROLZMA
+       bool pedantic_microlzma;
+#endif
 };
 
 struct xz_dec_lzma2 {
@@ -387,7 +391,14 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
 
                *left -= copy_size;
 
-               memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size);
+               /*
+                * If doing in-place decompression in single-call mode and the
+                * uncompressed size of the file is larger than the caller
+                * thought (i.e. it is invalid input!), the buffers below may
+                * overlap and cause undefined behavior with memcpy().
+                * With valid inputs memcpy() would be fine here.
+                */
+               memmove(dict->buf + dict->pos, b->in + b->in_pos, copy_size);
                dict->pos += copy_size;
 
                if (dict->full < dict->pos)
@@ -397,7 +408,11 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
                        if (dict->pos == dict->end)
                                dict->pos = 0;
 
-                       memcpy(b->out + b->out_pos, b->in + b->in_pos,
+                       /*
+                        * Like above but for multi-call mode: use memmove()
+                        * to avoid undefined behavior with invalid input.
+                        */
+                       memmove(b->out + b->out_pos, b->in + b->in_pos,
                                        copy_size);
                }
 
@@ -408,6 +423,12 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
        }
 }
 
+#ifdef XZ_DEC_MICROLZMA
+#      define DICT_FLUSH_SUPPORTS_SKIPPING true
+#else
+#      define DICT_FLUSH_SUPPORTS_SKIPPING false
+#endif
+
 /*
  * Flush pending data from dictionary to b->out. It is assumed that there is
  * enough space in b->out. This is guaranteed because caller uses dict_limit()
@@ -421,8 +442,19 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b)
                if (dict->pos == dict->end)
                        dict->pos = 0;
 
-               memcpy(b->out + b->out_pos, dict->buf + dict->start,
-                               copy_size);
+               /*
+                * These buffers cannot overlap even if doing in-place
+                * decompression because in multi-call mode dict->buf
+                * has been allocated by us in this file; it's not
+                * provided by the caller like in single-call mode.
+                *
+                * With MicroLZMA, b->out can be NULL to skip bytes that
+                * the caller doesn't need. This cannot be done with XZ
+                * because it would break BCJ filters.
+                */
+               if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL)
+                       memcpy(b->out + b->out_pos, dict->buf + dict->start,
+                                       copy_size);
        }
 
        dict->start = dict->pos;
@@ -488,7 +520,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc)
  * functions so that the compiler is supposed to be able to more easily avoid
  * an extra branch. In this particular version of the LZMA decoder, this
  * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3
- * on x86). Using a non-splitted version results in nicer looking code too.
+ * on x86). Using a non-split version results in nicer looking code too.
  *
  * NOTE: This must return an int. Do not make it return a bool or the speed
  * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care,
@@ -774,6 +806,7 @@ static void lzma_reset(struct xz_dec_lzma2 *s)
        s->lzma.rep1 = 0;
        s->lzma.rep2 = 0;
        s->lzma.rep3 = 0;
+       s->lzma.len = 0;
 
        /*
         * All probabilities are initialized to the same value. This hack
@@ -1157,8 +1190,6 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props)
                }
        }
 
-       s->lzma.len = 0;
-
        s->lzma2.sequence = SEQ_CONTROL;
        s->lzma2.need_dict_reset = true;
 
@@ -1174,3 +1205,140 @@ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s)
 
        kfree(s);
 }
+
+#ifdef XZ_DEC_MICROLZMA
+/* This is a wrapper struct to have a nice struct name in the public API. */
+struct xz_dec_microlzma {
+       struct xz_dec_lzma2 s;
+};
+
+enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr,
+                                struct xz_buf *b)
+{
+       struct xz_dec_lzma2 *s = &s_ptr->s;
+
+       /*
+        * sequence is SEQ_PROPERTIES before the first input byte,
+        * SEQ_LZMA_PREPARE until a total of five bytes have been read,
+        * and SEQ_LZMA_RUN for the rest of the input stream.
+        */
+       if (s->lzma2.sequence != SEQ_LZMA_RUN) {
+               if (s->lzma2.sequence == SEQ_PROPERTIES) {
+                       /* One byte is needed for the props. */
+                       if (b->in_pos >= b->in_size)
+                               return XZ_OK;
+
+                       /*
+                        * Don't increment b->in_pos here. The same byte is
+                        * also passed to rc_read_init() which will ignore it.
+                        */
+                       if (!lzma_props(s, ~b->in[b->in_pos]))
+                               return XZ_DATA_ERROR;
+
+                       s->lzma2.sequence = SEQ_LZMA_PREPARE;
+               }
+
+               /*
+                * xz_dec_microlzma_reset() doesn't validate the compressed
+                * size so we do it here. We have to limit the maximum size
+                * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice
+                * round number and much more than users of this code should
+                * ever need.
+                */
+               if (s->lzma2.compressed < RC_INIT_BYTES
+                               || s->lzma2.compressed > (3U << 30))
+                       return XZ_DATA_ERROR;
+
+               if (!rc_read_init(&s->rc, b))
+                       return XZ_OK;
+
+               s->lzma2.compressed -= RC_INIT_BYTES;
+               s->lzma2.sequence = SEQ_LZMA_RUN;
+
+               dict_reset(&s->dict, b);
+       }
+
+       /* This is to allow increasing b->out_size between calls. */
+       if (DEC_IS_SINGLE(s->dict.mode))
+               s->dict.end = b->out_size - b->out_pos;
+
+       while (true) {
+               dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos,
+                                          s->lzma2.uncompressed));
+
+               if (!lzma2_lzma(s, b))
+                       return XZ_DATA_ERROR;
+
+               s->lzma2.uncompressed -= dict_flush(&s->dict, b);
+
+               if (s->lzma2.uncompressed == 0) {
+                       if (s->lzma2.pedantic_microlzma) {
+                               if (s->lzma2.compressed > 0 || s->lzma.len > 0
+                                               || !rc_is_finished(&s->rc))
+                                       return XZ_DATA_ERROR;
+                       }
+
+                       return XZ_STREAM_END;
+               }
+
+               if (b->out_pos == b->out_size)
+                       return XZ_OK;
+
+               if (b->in_pos == b->in_size
+                               && s->temp.size < s->lzma2.compressed)
+                       return XZ_OK;
+       }
+}
+
+struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+                                               uint32_t dict_size)
+{
+       struct xz_dec_microlzma *s;
+
+       /* Restrict dict_size to the same range as in the LZMA2 code. */
+       if (dict_size < 4096 || dict_size > (3U << 30))
+               return NULL;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (s == NULL)
+               return NULL;
+
+       s->s.dict.mode = mode;
+       s->s.dict.size = dict_size;
+
+       if (DEC_IS_MULTI(mode)) {
+               s->s.dict.end = dict_size;
+
+               s->s.dict.buf = vmalloc(dict_size);
+               if (s->s.dict.buf == NULL) {
+                       kfree(s);
+                       return NULL;
+               }
+       }
+
+       return s;
+}
+
+void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size,
+                           uint32_t uncomp_size, int uncomp_size_is_exact)
+{
+       /*
+        * comp_size is validated in xz_dec_microlzma_run().
+        * uncomp_size can safely be anything.
+        */
+       s->s.lzma2.compressed = comp_size;
+       s->s.lzma2.uncompressed = uncomp_size;
+       s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact;
+
+       s->s.lzma2.sequence = SEQ_PROPERTIES;
+       s->s.temp.size = 0;
+}
+
+void xz_dec_microlzma_end(struct xz_dec_microlzma *s)
+{
+       if (DEC_IS_MULTI(s->s.dict.mode))
+               vfree(s->s.dict.buf);
+
+       kfree(s);
+}
+#endif
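
Putting the new entry points together, a rough multi-call usage sketch (error handling trimmed; the demo_decompress() name, buffer handling and dictionary size are hypothetical, while the xz_dec_microlzma_* calls, enum xz_mode and struct xz_buf fields come from the code above and the xz.h declarations):

/* Assumes <linux/xz.h> for the declarations and <linux/errno.h>. */
static int demo_decompress(const u8 *in, size_t in_len,
			   u8 *out, size_t out_len)
{
	struct xz_dec_microlzma *s;
	struct xz_buf b = {
		.in = in, .in_pos = 0, .in_size = in_len,
		.out = out, .out_pos = 0, .out_size = out_len,
	};
	enum xz_ret ret;

	/* Multi-call mode; pick a dictionary large enough for the data. */
	s = xz_dec_microlzma_alloc(XZ_PREALLOC, 1 << 18);
	if (!s)
		return -ENOMEM;

	/*
	 * comp_size is taken as the exact compressed size here; with the
	 * last argument false, uncomp_size may be an upper bound instead.
	 */
	xz_dec_microlzma_reset(s, in_len, out_len, true);

	do {
		ret = xz_dec_microlzma_run(s, &b);
	} while (ret == XZ_OK && b.out_pos < b.out_size);

	xz_dec_microlzma_end(s);
	return ret == XZ_STREAM_END ? 0 : -EINVAL;
}
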
index fea86de..683570b 100644 (file)
@@ -402,12 +402,12 @@ static enum xz_ret dec_stream_header(struct xz_dec *s)
         * we will accept other check types too, but then the check won't
         * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given.
         */
+       if (s->temp.buf[HEADER_MAGIC_SIZE + 1] > XZ_CHECK_MAX)
+               return XZ_OPTIONS_ERROR;
+
        s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1];
 
 #ifdef XZ_DEC_ANY_CHECK
-       if (s->check_type > XZ_CHECK_MAX)
-               return XZ_OPTIONS_ERROR;
-
        if (s->check_type > XZ_CHECK_CRC32)
                return XZ_UNSUPPORTED_CHECK;
 #else
index 32eb3c0..61098c6 100644 (file)
@@ -15,8 +15,15 @@ EXPORT_SYMBOL(xz_dec_reset);
 EXPORT_SYMBOL(xz_dec_run);
 EXPORT_SYMBOL(xz_dec_end);
 
+#ifdef CONFIG_XZ_DEC_MICROLZMA
+EXPORT_SYMBOL(xz_dec_microlzma_alloc);
+EXPORT_SYMBOL(xz_dec_microlzma_reset);
+EXPORT_SYMBOL(xz_dec_microlzma_run);
+EXPORT_SYMBOL(xz_dec_microlzma_end);
+#endif
+
 MODULE_DESCRIPTION("XZ decompressor");
-MODULE_VERSION("1.0");
+MODULE_VERSION("1.1");
 MODULE_AUTHOR("Lasse Collin <lasse.collin@tukaani.org> and Igor Pavlov");
 
 /*
index 09360eb..bf1e94e 100644 (file)
@@ -37,6 +37,9 @@
 #              ifdef CONFIG_XZ_DEC_SPARC
 #                      define XZ_DEC_SPARC
 #              endif
+#              ifdef CONFIG_XZ_DEC_MICROLZMA
+#                      define XZ_DEC_MICROLZMA
+#              endif
 #              define memeq(a, b, size) (memcmp(a, b, size) == 0)
 #              define memzero(buf, size) memset(buf, 0, size)
 #      endif
index fc60a40..d6c0042 100644 (file)
@@ -46,7 +46,7 @@ mmu-$(CONFIG_MMU)     += process_vm_access.o
 endif
 
 obj-y                  := filemap.o mempool.o oom_kill.o fadvise.o \
-                          maccess.o page-writeback.o \
+                          maccess.o page-writeback.o folio-compat.o \
                           readahead.o swap.o truncate.o vmscan.o shmem.o \
                           util.o mmzone.o vmstat.o backing-dev.o \
                           mm_init.o percpu.o slab_common.o \
index 4a9d4e2..c878d99 100644 (file)
@@ -2,8 +2,9 @@
 
 #include <linux/wait.h>
 #include <linux/rbtree.h>
-#include <linux/backing-dev.h>
 #include <linux/kthread.h>
+#include <linux/backing-dev.h>
+#include <linux/blk-cgroup.h>
 #include <linux/freezer.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -977,6 +978,22 @@ void bdi_put(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_put);
 
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+       struct super_block *sb;
+
+       if (!inode)
+               return &noop_backing_dev_info;
+
+       sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+       if (sb_is_blkdev_sb(sb))
+               return I_BDEV(inode)->bd_disk->bdi;
+#endif
+       return sb->s_bdi;
+}
+EXPORT_SYMBOL(inode_to_bdi);
+
 const char *bdi_dev_name(struct backing_dev_info *bdi)
 {
        if (!bdi || !bdi->dev)
index bfc93da..fbc60f9 100644 (file)
@@ -1022,7 +1022,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                if (!TestClearPageLRU(page))
                        goto isolate_fail_put;
 
-               lruvec = mem_cgroup_page_lruvec(page);
+               lruvec = folio_lruvec(page_folio(page));
 
                /* If we already hold the lock, we can skip some rechecking */
                if (lruvec != locked) {
@@ -1032,7 +1032,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
                        locked = lruvec;
 
-                       lruvec_memcg_debug(lruvec, page);
+                       lruvec_memcg_debug(lruvec, page_folio(page));
 
                        /* Try get exclusive access under lock */
                        if (!skip_updated) {
index c938a9c..7008c37 100644 (file)
@@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test)
        r = damon_new_region(0, 22);
        damon_add_region(r, t);
        damon_split_regions_of(c, t, 2);
-       KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u);
+       KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
        damon_free_target(t);
 
        t = damon_new_target(42);
        r = damon_new_region(0, 220);
        damon_add_region(r, t);
        damon_split_regions_of(c, t, 4);
-       KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u);
+       KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
        damon_free_target(t);
        damon_destroy_ctx(c);
 }
index dae4812..5e206a4 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
-#include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/cpuset.h>
 #include <linux/hugetlb.h>
@@ -835,6 +834,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  */
 void replace_page_cache_page(struct page *old, struct page *new)
 {
+       struct folio *fold = page_folio(old);
+       struct folio *fnew = page_folio(new);
        struct address_space *mapping = old->mapping;
        void (*freepage)(struct page *) = mapping->a_ops->freepage;
        pgoff_t offset = old->index;
@@ -848,7 +849,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
        new->mapping = mapping;
        new->index = offset;
 
-       mem_cgroup_migrate(old, new);
+       mem_cgroup_migrate(fold, fnew);
 
        xas_lock_irq(&xas);
        xas_store(&xas, new);
@@ -870,26 +871,25 @@ void replace_page_cache_page(struct page *old, struct page *new)
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-noinline int __add_to_page_cache_locked(struct page *page,
-                                       struct address_space *mapping,
-                                       pgoff_t offset, gfp_t gfp,
-                                       void **shadowp)
+noinline int __filemap_add_folio(struct address_space *mapping,
+               struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 {
-       XA_STATE(xas, &mapping->i_pages, offset);
-       int huge = PageHuge(page);
+       XA_STATE(xas, &mapping->i_pages, index);
+       int huge = folio_test_hugetlb(folio);
        int error;
        bool charged = false;
 
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
        mapping_set_update(&xas, mapping);
 
-       get_page(page);
-       page->mapping = mapping;
-       page->index = offset;
+       folio_get(folio);
+       folio->mapping = mapping;
+       folio->index = index;
 
        if (!huge) {
-               error = mem_cgroup_charge(page, NULL, gfp);
+               error = mem_cgroup_charge(folio, NULL, gfp);
+               VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
                if (error)
                        goto error;
                charged = true;
@@ -901,7 +901,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
                unsigned int order = xa_get_order(xas.xa, xas.xa_index);
                void *entry, *old = NULL;
 
-               if (order > thp_order(page))
+               if (order > folio_order(folio))
                        xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
                                        order, gfp);
                xas_lock_irq(&xas);
@@ -918,13 +918,13 @@ noinline int __add_to_page_cache_locked(struct page *page,
                                *shadowp = old;
                        /* entry may have been split before we acquired lock */
                        order = xa_get_order(xas.xa, xas.xa_index);
-                       if (order > thp_order(page)) {
+                       if (order > folio_order(folio)) {
                                xas_split(&xas, old, order);
                                xas_reset(&xas);
                        }
                }
 
-               xas_store(&xas, page);
+               xas_store(&xas, folio);
                if (xas_error(&xas))
                        goto unlock;
 
@@ -932,7 +932,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
 
                /* hugetlb pages do not participate in page cache accounting */
                if (!huge)
-                       __inc_lruvec_page_state(page, NR_FILE_PAGES);
+                       __lruvec_stat_add_folio(folio, NR_FILE_PAGES);
 unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));
@@ -940,19 +940,19 @@ unlock:
        if (xas_error(&xas)) {
                error = xas_error(&xas);
                if (charged)
-                       mem_cgroup_uncharge(page);
+                       mem_cgroup_uncharge(folio);
                goto error;
        }
 
-       trace_mm_filemap_add_to_page_cache(page);
+       trace_mm_filemap_add_to_page_cache(&folio->page);
        return 0;
 error:
-       page->mapping = NULL;
+       folio->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
-       put_page(page);
+       folio_put(folio);
        return error;
 }
-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
+ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
 
 /**
  * add_to_page_cache_locked - add a locked page to the pagecache
@@ -969,59 +969,58 @@ ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                pgoff_t offset, gfp_t gfp_mask)
 {
-       return __add_to_page_cache_locked(page, mapping, offset,
+       return __filemap_add_folio(mapping, page_folio(page), offset,
                                          gfp_mask, NULL);
 }
 EXPORT_SYMBOL(add_to_page_cache_locked);
 
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-                               pgoff_t offset, gfp_t gfp_mask)
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+                               pgoff_t index, gfp_t gfp)
 {
        void *shadow = NULL;
        int ret;
 
-       __SetPageLocked(page);
-       ret = __add_to_page_cache_locked(page, mapping, offset,
-                                        gfp_mask, &shadow);
+       __folio_set_locked(folio);
+       ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
        if (unlikely(ret))
-               __ClearPageLocked(page);
+               __folio_clear_locked(folio);
        else {
                /*
-                * The page might have been evicted from cache only
+                * The folio might have been evicted from cache only
                 * recently, in which case it should be activated like
-                * any other repeatedly accessed page.
-                * The exception is pages getting rewritten; evicting other
+                * any other repeatedly accessed folio.
+                * The exception is folios getting rewritten; evicting other
                 * data from the working set, only to cache data that will
                 * get overwritten with something else, is a waste of memory.
                 */
-               WARN_ON_ONCE(PageActive(page));
-               if (!(gfp_mask & __GFP_WRITE) && shadow)
-                       workingset_refault(page, shadow);
-               lru_cache_add(page);
+               WARN_ON_ONCE(folio_test_active(folio));
+               if (!(gfp & __GFP_WRITE) && shadow)
+                       workingset_refault(folio, shadow);
+               folio_add_lru(folio);
        }
        return ret;
 }
-EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
+EXPORT_SYMBOL_GPL(filemap_add_folio);
 
 #ifdef CONFIG_NUMA
-struct page *__page_cache_alloc(gfp_t gfp)
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
 {
        int n;
-       struct page *page;
+       struct folio *folio;
 
        if (cpuset_do_page_mem_spread()) {
                unsigned int cpuset_mems_cookie;
                do {
                        cpuset_mems_cookie = read_mems_allowed_begin();
                        n = cpuset_mem_spread_node();
-                       page = __alloc_pages_node(n, gfp, 0);
-               } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+                       folio = __folio_alloc_node(gfp, order, n);
+               } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
 
-               return page;
+               return folio;
        }
-       return alloc_pages(gfp, 0);
+       return folio_alloc(gfp, order);
 }
-EXPORT_SYMBOL(__page_cache_alloc);
+EXPORT_SYMBOL(filemap_alloc_folio);
 #endif
 
 /*
@@ -1074,11 +1073,11 @@ EXPORT_SYMBOL(filemap_invalidate_unlock_two);
  */
 #define PAGE_WAIT_TABLE_BITS 8
 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
 
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static wait_queue_head_t *folio_waitqueue(struct folio *folio)
 {
-       return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+       return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
 }
 
 void __init pagecache_init(void)
@@ -1086,7 +1085,7 @@ void __init pagecache_init(void)
        int i;
 
        for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
-               init_waitqueue_head(&page_wait_table[i]);
+               init_waitqueue_head(&folio_wait_table[i]);
 
        page_writeback_init();
 }
@@ -1141,10 +1140,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
         */
        flags = wait->flags;
        if (flags & WQ_FLAG_EXCLUSIVE) {
-               if (test_bit(key->bit_nr, &key->page->flags))
+               if (test_bit(key->bit_nr, &key->folio->flags))
                        return -1;
                if (flags & WQ_FLAG_CUSTOM) {
-                       if (test_and_set_bit(key->bit_nr, &key->page->flags))
+                       if (test_and_set_bit(key->bit_nr, &key->folio->flags))
                                return -1;
                        flags |= WQ_FLAG_DONE;
                }
@@ -1157,7 +1156,7 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
         *
         * So update the flags atomically, and wake up the waiter
         * afterwards to avoid any races. This store-release pairs
-        * with the load-acquire in wait_on_page_bit_common().
+        * with the load-acquire in folio_wait_bit_common().
         */
        smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
        wake_up_state(wait->private, mode);
@@ -1176,14 +1175,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
        return (flags & WQ_FLAG_EXCLUSIVE) != 0;
 }
 
-static void wake_up_page_bit(struct page *page, int bit_nr)
+static void folio_wake_bit(struct folio *folio, int bit_nr)
 {
-       wait_queue_head_t *q = page_waitqueue(page);
+       wait_queue_head_t *q = folio_waitqueue(folio);
        struct wait_page_key key;
        unsigned long flags;
        wait_queue_entry_t bookmark;
 
-       key.page = page;
+       key.folio = folio;
        key.bit_nr = bit_nr;
        key.page_match = 0;
 
@@ -1218,7 +1217,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
         * page waiters.
         */
        if (!waitqueue_active(q) || !key.page_match) {
-               ClearPageWaiters(page);
+               folio_clear_waiters(folio);
                /*
                 * It's possible to miss clearing Waiters here, when we woke
                 * our page waiters, but the hashed waitqueue has waiters for
@@ -1230,19 +1229,19 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
        spin_unlock_irqrestore(&q->lock, flags);
 }
 
-static void wake_up_page(struct page *page, int bit)
+static void folio_wake(struct folio *folio, int bit)
 {
-       if (!PageWaiters(page))
+       if (!folio_test_waiters(folio))
                return;
-       wake_up_page_bit(page, bit);
+       folio_wake_bit(folio, bit);
 }
 
 /*
- * A choice of three behaviors for wait_on_page_bit_common():
+ * A choice of three behaviors for folio_wait_bit_common():
  */
 enum behavior {
        EXCLUSIVE,      /* Hold ref to page and take the bit when woken, like
-                        * __lock_page() waiting on then setting PG_locked.
+                        * __folio_lock() waiting on then setting PG_locked.
                         */
        SHARED,         /* Hold ref to page and check the bit when woken, like
                         * wait_on_page_writeback() waiting on PG_writeback.
@@ -1253,16 +1252,16 @@ enum behavior {
 };
 
 /*
- * Attempt to check (or get) the page bit, and mark us done
+ * Attempt to check (or get) the folio flag, and mark us done
  * if successful.
  */
-static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
                                        struct wait_queue_entry *wait)
 {
        if (wait->flags & WQ_FLAG_EXCLUSIVE) {
-               if (test_and_set_bit(bit_nr, &page->flags))
+               if (test_and_set_bit(bit_nr, &folio->flags))
                        return false;
-       } else if (test_bit(bit_nr, &page->flags))
+       } else if (test_bit(bit_nr, &folio->flags))
                return false;
 
        wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1272,9 +1271,10 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
 /* How many times do we accept lock stealing from under a waiter? */
 int sysctl_page_lock_unfairness = 5;
 
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
-       struct page *page, int bit_nr, int state, enum behavior behavior)
+static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
+               int state, enum behavior behavior)
 {
+       wait_queue_head_t *q = folio_waitqueue(folio);
        int unfairness = sysctl_page_lock_unfairness;
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
@@ -1283,8 +1283,8 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
        unsigned long pflags;
 
        if (bit_nr == PG_locked &&
-           !PageUptodate(page) && PageWorkingset(page)) {
-               if (!PageSwapBacked(page)) {
+           !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
+               if (!folio_test_swapbacked(folio)) {
                        delayacct_thrashing_start();
                        delayacct = true;
                }
@@ -1294,7 +1294,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 
        init_wait(wait);
        wait->func = wake_page_function;
-       wait_page.page = page;
+       wait_page.folio = folio;
        wait_page.bit_nr = bit_nr;
 
 repeat:
@@ -1309,7 +1309,7 @@ repeat:
         * Do one last check whether we can get the
         * page bit synchronously.
         *
-        * Do the SetPageWaiters() marking before that
+        * Do the folio_set_waiters() marking before that
         * to let any waker we _just_ missed know they
         * need to wake us up (otherwise they'll never
         * even go to the slow case that looks at the
@@ -1320,8 +1320,8 @@ repeat:
         * lock to avoid races.
         */
        spin_lock_irq(&q->lock);
-       SetPageWaiters(page);
-       if (!trylock_page_bit_common(page, bit_nr, wait))
+       folio_set_waiters(folio);
+       if (!folio_trylock_flag(folio, bit_nr, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);
 
@@ -1331,10 +1331,10 @@ repeat:
         * see whether the page bit testing has already
         * been done by the wake function.
         *
-        * We can drop our reference to the page.
+        * We can drop our reference to the folio.
         */
        if (behavior == DROP)
-               put_page(page);
+               folio_put(folio);
 
        /*
         * Note that until the "finish_wait()", or until
@@ -1371,7 +1371,7 @@ repeat:
                 *
                 * And if that fails, we'll have to retry this all.
                 */
-               if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+               if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
                        goto repeat;
 
                wait->flags |= WQ_FLAG_DONE;
@@ -1380,7 +1380,7 @@ repeat:
 
        /*
         * If a signal happened, this 'finish_wait()' may remove the last
-        * waiter from the wait-queues, but the PageWaiters bit will remain
+        * waiter from the wait-queues, but the folio waiters bit will remain
         * set. That's ok. The next wakeup will take care of it, and trying
         * to do it here would be difficult and prone to races.
         */
@@ -1411,19 +1411,17 @@ repeat:
        return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
 }
 
-void wait_on_page_bit(struct page *page, int bit_nr)
+void folio_wait_bit(struct folio *folio, int bit_nr)
 {
-       wait_queue_head_t *q = page_waitqueue(page);
-       wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
+       folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
 }
-EXPORT_SYMBOL(wait_on_page_bit);
+EXPORT_SYMBOL(folio_wait_bit);
 
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+int folio_wait_bit_killable(struct folio *folio, int bit_nr)
 {
-       wait_queue_head_t *q = page_waitqueue(page);
-       return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
+       return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
 }
-EXPORT_SYMBOL(wait_on_page_bit_killable);
+EXPORT_SYMBOL(folio_wait_bit_killable);
 
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
@@ -1440,31 +1438,28 @@ EXPORT_SYMBOL(wait_on_page_bit_killable);
  */
 int put_and_wait_on_page_locked(struct page *page, int state)
 {
-       wait_queue_head_t *q;
-
-       page = compound_head(page);
-       q = page_waitqueue(page);
-       return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
+       return folio_wait_bit_common(page_folio(page), PG_locked, state,
+                       DROP);
 }
 
 /**
- * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page: Page defining the wait queue of interest
+ * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
+ * @folio: Folio defining the wait queue of interest
  * @waiter: Waiter to add to the queue
  *
- * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ * Add an arbitrary @waiter to the wait queue for the nominated @folio.
  */
-void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
+void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
 {
-       wait_queue_head_t *q = page_waitqueue(page);
+       wait_queue_head_t *q = folio_waitqueue(folio);
        unsigned long flags;
 
        spin_lock_irqsave(&q->lock, flags);
        __add_wait_queue_entry_tail(q, waiter);
-       SetPageWaiters(page);
+       folio_set_waiters(folio);
        spin_unlock_irqrestore(&q->lock, flags);
 }
-EXPORT_SYMBOL_GPL(add_page_wait_queue);
+EXPORT_SYMBOL_GPL(folio_add_wait_queue);
 
 #ifndef clear_bit_unlock_is_negative_byte
 
@@ -1490,124 +1485,116 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
 #endif
 
 /**
- * unlock_page - unlock a locked page
- * @page: the page
+ * folio_unlock - Unlock a locked folio.
+ * @folio: The folio.
  *
- * Unlocks the page and wakes up sleepers in wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechanism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ * Unlocks the folio and wakes up any thread sleeping on the page lock.
  *
- * Note that this depends on PG_waiters being the sign bit in the byte
- * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
- * clear the PG_locked bit and test PG_waiters at the same time fairly
- * portably (architectures that do LL/SC can test any bit, while x86 can
- * test the sign bit).
+ * Context: May be called from interrupt or process context.  May not be
+ * called from NMI context.
  */
-void unlock_page(struct page *page)
+void folio_unlock(struct folio *folio)
 {
+       /* Bit 7 allows x86 to check the byte's sign bit */
        BUILD_BUG_ON(PG_waiters != 7);
-       page = compound_head(page);
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
-               wake_up_page_bit(page, PG_locked);
+       BUILD_BUG_ON(PG_locked > 7);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
+               folio_wake_bit(folio, PG_locked);
 }
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(folio_unlock);
 
 /**
- * end_page_private_2 - Clear PG_private_2 and release any waiters
- * @page: The page
+ * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
+ * @folio: The folio.
  *
- * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
- * this.  The page ref held for PG_private_2 being set is released.
+ * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
+ * it.  The folio reference held for PG_private_2 being set is released.
  *
- * This is, for example, used when a netfs page is being written to a local
- * disk cache, thereby allowing writes to the cache for the same page to be
+ * This is, for example, used when a netfs folio is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same folio to be
  * serialised.
  */
-void end_page_private_2(struct page *page)
+void folio_end_private_2(struct folio *folio)
 {
-       page = compound_head(page);
-       VM_BUG_ON_PAGE(!PagePrivate2(page), page);
-       clear_bit_unlock(PG_private_2, &page->flags);
-       wake_up_page_bit(page, PG_private_2);
-       put_page(page);
+       VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
+       clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
+       folio_wake_bit(folio, PG_private_2);
+       folio_put(folio);
 }
-EXPORT_SYMBOL(end_page_private_2);
+EXPORT_SYMBOL(folio_end_private_2);
 
 /**
- * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
  *
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
  */
-void wait_on_page_private_2(struct page *page)
+void folio_wait_private_2(struct folio *folio)
 {
-       page = compound_head(page);
-       while (PagePrivate2(page))
-               wait_on_page_bit(page, PG_private_2);
+       while (folio_test_private_2(folio))
+               folio_wait_bit(folio, PG_private_2);
 }
-EXPORT_SYMBOL(wait_on_page_private_2);
+EXPORT_SYMBOL(folio_wait_private_2);
 
 /**
- * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
  *
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
  * fatal signal is received by the calling task.
  *
  * Return:
  * - 0 if successful.
  * - -EINTR if a fatal signal was encountered.
  */
-int wait_on_page_private_2_killable(struct page *page)
+int folio_wait_private_2_killable(struct folio *folio)
 {
        int ret = 0;
 
-       page = compound_head(page);
-       while (PagePrivate2(page)) {
-               ret = wait_on_page_bit_killable(page, PG_private_2);
+       while (folio_test_private_2(folio)) {
+               ret = folio_wait_bit_killable(folio, PG_private_2);
                if (ret < 0)
                        break;
        }
 
        return ret;
 }
-EXPORT_SYMBOL(wait_on_page_private_2_killable);
+EXPORT_SYMBOL(folio_wait_private_2_killable);
 
 /**
- * end_page_writeback - end writeback against a page
- * @page: the page
+ * folio_end_writeback - End writeback against a folio.
+ * @folio: The folio.
  */
-void end_page_writeback(struct page *page)
+void folio_end_writeback(struct folio *folio)
 {
        /*
-        * TestClearPageReclaim could be used here but it is an atomic
-        * operation and overkill in this particular case. Failing to
-        * shuffle a page marked for immediate reclaim is too mild to
-        * justify taking an atomic operation penalty at the end of
-        * ever page writeback.
+        * folio_test_clear_reclaim() could be used here but it is an
+        * atomic operation and overkill in this particular case. Failing
+        * to shuffle a folio marked for immediate reclaim is too mild
+        * a gain to justify taking an atomic operation penalty at the
+        * end of every folio writeback.
         */
-       if (PageReclaim(page)) {
-               ClearPageReclaim(page);
-               rotate_reclaimable_page(page);
+       if (folio_test_reclaim(folio)) {
+               folio_clear_reclaim(folio);
+               folio_rotate_reclaimable(folio);
        }
 
        /*
-        * Writeback does not hold a page reference of its own, relying
+        * Writeback does not hold a folio reference of its own, relying
         * on truncation to wait for the clearing of PG_writeback.
-        * But here we must make sure that the page is not freed and
-        * reused before the wake_up_page().
+        * But here we must make sure that the folio is not freed and
+        * reused before the folio_wake().
         */
-       get_page(page);
-       if (!test_clear_page_writeback(page))
+       folio_get(folio);
+       if (!__folio_end_writeback(folio))
                BUG();
 
        smp_mb__after_atomic();
-       wake_up_page(page, PG_writeback);
-       put_page(page);
+       folio_wake(folio, PG_writeback);
+       folio_put(folio);
 }
-EXPORT_SYMBOL(end_page_writeback);
+EXPORT_SYMBOL(folio_end_writeback);
 
 /*
  * After completing I/O on a page, call this routine to update the page
@@ -1638,39 +1625,35 @@ void page_endio(struct page *page, bool is_write, int err)
 EXPORT_SYMBOL_GPL(page_endio);
 
 /**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @__page: the page to lock
+ * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
+ * @folio: The folio to lock
  */
-void __lock_page(struct page *__page)
+void __folio_lock(struct folio *folio)
 {
-       struct page *page = compound_head(__page);
-       wait_queue_head_t *q = page_waitqueue(page);
-       wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+       folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
                                EXCLUSIVE);
 }
-EXPORT_SYMBOL(__lock_page);
+EXPORT_SYMBOL(__folio_lock);
 
-int __lock_page_killable(struct page *__page)
+int __folio_lock_killable(struct folio *folio)
 {
-       struct page *page = compound_head(__page);
-       wait_queue_head_t *q = page_waitqueue(page);
-       return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+       return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
                                        EXCLUSIVE);
 }
-EXPORT_SYMBOL_GPL(__lock_page_killable);
+EXPORT_SYMBOL_GPL(__folio_lock_killable);
 
-int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
 {
-       struct wait_queue_head *q = page_waitqueue(page);
+       struct wait_queue_head *q = folio_waitqueue(folio);
        int ret = 0;
 
-       wait->page = page;
+       wait->folio = folio;
        wait->bit_nr = PG_locked;
 
        spin_lock_irq(&q->lock);
        __add_wait_queue_entry_tail(q, &wait->wait);
-       SetPageWaiters(page);
-       ret = !trylock_page(page);
+       folio_set_waiters(folio);
+       ret = !folio_trylock(folio);
        /*
         * If we were successful now, we know we're still on the
         * waitqueue as we're still under the lock. This means it's
@@ -1687,16 +1670,16 @@ int __lock_page_async(struct page *page, struct wait_page_queue *wait)
 
 /*
  * Return values:
- * 1 - page is locked; mmap_lock is still held.
- * 0 - page is not locked.
+ * true - folio is locked; mmap_lock is still held.
+ * false - folio is not locked.
  *     mmap_lock has been released (mmap_read_unlock(), unless flags had both
  *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
  *     which case mmap_lock is still held.
  *
- * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
- * with the page locked and the mmap_lock unperturbed.
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
+ * with the folio locked and the mmap_lock unperturbed.
  */
-int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
                         unsigned int flags)
 {
        if (fault_flag_allow_retry_first(flags)) {
@@ -1705,28 +1688,28 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                 * even though return 0.
                 */
                if (flags & FAULT_FLAG_RETRY_NOWAIT)
-                       return 0;
+                       return false;
 
                mmap_read_unlock(mm);
                if (flags & FAULT_FLAG_KILLABLE)
-                       wait_on_page_locked_killable(page);
+                       folio_wait_locked_killable(folio);
                else
-                       wait_on_page_locked(page);
-               return 0;
+                       folio_wait_locked(folio);
+               return false;
        }
        if (flags & FAULT_FLAG_KILLABLE) {
-               int ret;
+               bool ret;
 
-               ret = __lock_page_killable(page);
+               ret = __folio_lock_killable(folio);
                if (ret) {
                        mmap_read_unlock(mm);
-                       return 0;
+                       return false;
                }
        } else {
-               __lock_page(page);
+               __folio_lock(folio);
        }
-       return 1;
 
+       return true;
 }
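
The updated comment above is the whole contract of __folio_lock_or_retry(); a hedged sketch of how a fault path is expected to honour it (the demo function and its simplified error handling are illustrative, not an existing caller):

static vm_fault_t demo_fault_lock(struct folio *folio, struct vm_fault *vmf)
{
	if (folio_trylock(folio))
		return 0;			/* fast path: got the lock */

	if (!__folio_lock_or_retry(folio, vmf->vma->vm_mm, vmf->flags)) {
		/*
		 * Not locked.  Unless FAULT_FLAG_RETRY_NOWAIT was set
		 * together with FAULT_FLAG_ALLOW_RETRY, mmap_lock has
		 * already been dropped for us, so ask the caller to
		 * retry the fault from scratch.
		 */
		return VM_FAULT_RETRY;
	}

	/* Locked, and mmap_lock is still held. */
	return 0;
}
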
 
 /**
@@ -1802,143 +1785,155 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 EXPORT_SYMBOL(page_cache_prev_miss);
 
 /*
+ * Lockless page cache protocol:
+ * On the lookup side:
+ * 1. Load the folio from i_pages
+ * 2. Increment the refcount if it's not zero
+ * 3. If the folio is not found by xas_reload(), put the refcount and retry
+ *
+ * On the removal side:
+ * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
+ * B. Remove the page from i_pages
+ * C. Return the page to the page allocator
+ *
+ * This means that any page may have its reference count temporarily
+ * increased by a speculative page cache (or fast GUP) lookup as it can
+ * be allocated by another user before the RCU grace period expires.
+ * Because the refcount temporarily acquired here may end up being the
+ * last refcount on the page, any page allocation must be freeable by
+ * folio_put().
+ */
+
+/*
  * mapping_get_entry - Get a page cache entry.
  * @mapping: the address_space to search
  * @index: The page cache index.
  *
- * Looks up the page cache slot at @mapping & @index.  If there is a
- * page cache page, the head page is returned with an increased refcount.
+ * Looks up the page cache entry at @mapping & @index.  If it is a folio,
+ * it is returned with an increased refcount.  If it is a shadow entry
+ * of a previously evicted folio, or a swap entry from shmem/tmpfs,
+ * it is returned without further action.
  *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Return: The head page or shadow entry, %NULL if nothing is found.
+ * Return: The folio, swap or shadow entry, %NULL if nothing is found.
  */
-static struct page *mapping_get_entry(struct address_space *mapping,
-               pgoff_t index)
+static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
 {
        XA_STATE(xas, &mapping->i_pages, index);
-       struct page *page;
+       struct folio *folio;
 
        rcu_read_lock();
 repeat:
        xas_reset(&xas);
-       page = xas_load(&xas);
-       if (xas_retry(&xas, page))
+       folio = xas_load(&xas);
+       if (xas_retry(&xas, folio))
                goto repeat;
        /*
         * A shadow entry of a recently evicted page, or a swap entry from
         * shmem/tmpfs.  Return it without attempting to raise page count.
         */
-       if (!page || xa_is_value(page))
+       if (!folio || xa_is_value(folio))
                goto out;
 
-       if (!page_cache_get_speculative(page))
+       if (!folio_try_get_rcu(folio))
                goto repeat;
 
-       /*
-        * Has the page moved or been split?
-        * This is part of the lockless pagecache protocol. See
-        * include/linux/pagemap.h for details.
-        */
-       if (unlikely(page != xas_reload(&xas))) {
-               put_page(page);
+       if (unlikely(folio != xas_reload(&xas))) {
+               folio_put(folio);
                goto repeat;
        }
 out:
        rcu_read_unlock();
 
-       return page;
+       return folio;
 }
 
 /**
- * pagecache_get_page - Find and get a reference to a page.
+ * __filemap_get_folio - Find and get a reference to a folio.
  * @mapping: The address_space to search.
  * @index: The page index.
- * @fgp_flags: %FGP flags modify how the page is returned.
- * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
+ * @fgp_flags: %FGP flags modify how the folio is returned.
+ * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
  *
  * Looks up the page cache entry at @mapping & @index.
  *
  * @fgp_flags can be zero or more of these flags:
  *
- * * %FGP_ACCESSED - The page will be marked accessed.
- * * %FGP_LOCK - The page is returned locked.
- * * %FGP_HEAD - If the page is present and a THP, return the head page
- *   rather than the exact page specified by the index.
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
  * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- *   instead of allocating a new page to replace it.
+ *   instead of allocating a new folio to replace it.
  * * %FGP_CREAT - If no page is present then a new page is allocated using
- *   @gfp_mask and added to the page cache and the VM's LRU list.
+ *   @gfp and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
  * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
  *   page is already in cache.  If the page was allocated, unlock it before
  *   returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
- * * %FGP_NOWAIT - Don't get blocked by page lock
+ * * %FGP_WRITE - The page will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't get blocked by page lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
  *
  * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
  * if the %GFP flags specified for %FGP_CREAT are atomic.
  *
  * If there is a page cache page, it is returned with an increased refcount.
  *
- * Return: The found page or %NULL otherwise.
+ * Return: The found folio or %NULL otherwise.
  */
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-               int fgp_flags, gfp_t gfp_mask)
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+               int fgp_flags, gfp_t gfp)
 {
-       struct page *page;
+       struct folio *folio;
 
 repeat:
-       page = mapping_get_entry(mapping, index);
-       if (xa_is_value(page)) {
+       folio = mapping_get_entry(mapping, index);
+       if (xa_is_value(folio)) {
                if (fgp_flags & FGP_ENTRY)
-                       return page;
-               page = NULL;
+                       return folio;
+               folio = NULL;
        }
-       if (!page)
+       if (!folio)
                goto no_page;
 
        if (fgp_flags & FGP_LOCK) {
                if (fgp_flags & FGP_NOWAIT) {
-                       if (!trylock_page(page)) {
-                               put_page(page);
+                       if (!folio_trylock(folio)) {
+                               folio_put(folio);
                                return NULL;
                        }
                } else {
-                       lock_page(page);
+                       folio_lock(folio);
                }
 
                /* Has the page been truncated? */
-               if (unlikely(page->mapping != mapping)) {
-                       unlock_page(page);
-                       put_page(page);
+               if (unlikely(folio->mapping != mapping)) {
+                       folio_unlock(folio);
+                       folio_put(folio);
                        goto repeat;
                }
-               VM_BUG_ON_PAGE(!thp_contains(page, index), page);
+               VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
        }
 
        if (fgp_flags & FGP_ACCESSED)
-               mark_page_accessed(page);
+               folio_mark_accessed(folio);
        else if (fgp_flags & FGP_WRITE) {
                /* Clear idle flag for buffer write */
-               if (page_is_idle(page))
-                       clear_page_idle(page);
+               if (folio_test_idle(folio))
+                       folio_clear_idle(folio);
        }
-       if (!(fgp_flags & FGP_HEAD))
-               page = find_subpage(page, index);
 
+       if (fgp_flags & FGP_STABLE)
+               folio_wait_stable(folio);
 no_page:
-       if (!page && (fgp_flags & FGP_CREAT)) {
+       if (!folio && (fgp_flags & FGP_CREAT)) {
                int err;
                if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
-                       gfp_mask |= __GFP_WRITE;
+                       gfp |= __GFP_WRITE;
                if (fgp_flags & FGP_NOFS)
-                       gfp_mask &= ~__GFP_FS;
+                       gfp &= ~__GFP_FS;
 
-               page = __page_cache_alloc(gfp_mask);
-               if (!page)
+               folio = filemap_alloc_folio(gfp, 0);
+               if (!folio)
                        return NULL;
 
                if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
@@ -1946,27 +1941,27 @@ no_page:
 
                /* Init accessed so avoid atomic mark_page_accessed later */
                if (fgp_flags & FGP_ACCESSED)
-                       __SetPageReferenced(page);
+                       __folio_set_referenced(folio);
 
-               err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+               err = filemap_add_folio(mapping, folio, index, gfp);
                if (unlikely(err)) {
-                       put_page(page);
-                       page = NULL;
+                       folio_put(folio);
+                       folio = NULL;
                        if (err == -EEXIST)
                                goto repeat;
                }
 
                /*
-                * add_to_page_cache_lru locks the page, and for mmap we expect
-                * an unlocked page.
+                * filemap_add_folio locks the page, and for mmap
+                * we expect an unlocked page.
                 */
-               if (page && (fgp_flags & FGP_FOR_MMAP))
-                       unlock_page(page);
+               if (folio && (fgp_flags & FGP_FOR_MMAP))
+                       folio_unlock(folio);
        }
 
-       return page;
+       return folio;
 }
-EXPORT_SYMBOL(pagecache_get_page);
+EXPORT_SYMBOL(__filemap_get_folio);
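
A hedged usage sketch of the new entry point (not part of this patch; the helper name and the exact flag combination are illustrative assumptions):

/*
 * Hypothetical buffered-write style caller: find or create the folio at
 * @index, locked and stable, dirty it, and release it.
 */
static int example_touch_index(struct address_space *mapping, pgoff_t index)
{
        struct folio *folio;

        folio = __filemap_get_folio(mapping, index,
                                    FGP_LOCK | FGP_CREAT | FGP_STABLE,
                                    mapping_gfp_mask(mapping));
        if (!folio)
                return -ENOMEM;

        /* ... modify the locked folio here ... */

        folio_mark_dirty(folio);
        folio_unlock(folio);
        folio_put(folio);
        return 0;
}
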
 
 static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
                xa_mark_t mark)
@@ -2421,6 +2416,7 @@ static int filemap_update_page(struct kiocb *iocb,
                struct address_space *mapping, struct iov_iter *iter,
                struct page *page)
 {
+       struct folio *folio = page_folio(page);
        int error;
 
        if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -2430,40 +2426,40 @@ static int filemap_update_page(struct kiocb *iocb,
                filemap_invalidate_lock_shared(mapping);
        }
 
-       if (!trylock_page(page)) {
+       if (!folio_trylock(folio)) {
                error = -EAGAIN;
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
                        goto unlock_mapping;
                if (!(iocb->ki_flags & IOCB_WAITQ)) {
                        filemap_invalidate_unlock_shared(mapping);
-                       put_and_wait_on_page_locked(page, TASK_KILLABLE);
+                       put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE);
                        return AOP_TRUNCATED_PAGE;
                }
-               error = __lock_page_async(page, iocb->ki_waitq);
+               error = __folio_lock_async(folio, iocb->ki_waitq);
                if (error)
                        goto unlock_mapping;
        }
 
        error = AOP_TRUNCATED_PAGE;
-       if (!page->mapping)
+       if (!folio->mapping)
                goto unlock;
 
        error = 0;
-       if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
+       if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page))
                goto unlock;
 
        error = -EAGAIN;
        if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
                goto unlock;
 
-       error = filemap_read_page(iocb->ki_filp, mapping, page);
+       error = filemap_read_page(iocb->ki_filp, mapping, &folio->page);
        goto unlock_mapping;
 unlock:
-       unlock_page(page);
+       folio_unlock(folio);
 unlock_mapping:
        filemap_invalidate_unlock_shared(mapping);
        if (error == AOP_TRUNCATED_PAGE)
-               put_page(page);
+               folio_put(folio);
        return error;
 }
 
@@ -2900,7 +2896,9 @@ unlock:
 static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
                                     struct file **fpin)
 {
-       if (trylock_page(page))
+       struct folio *folio = page_folio(page);
+
+       if (folio_trylock(folio))
                return 1;
 
        /*
@@ -2913,7 +2911,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
 
        *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
        if (vmf->flags & FAULT_FLAG_KILLABLE) {
-               if (__lock_page_killable(page)) {
+               if (__folio_lock_killable(folio)) {
                        /*
                         * We didn't have the right flags to drop the mmap_lock,
                         * but all fault_handlers only check for fatal signals
@@ -2925,11 +2923,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
                        return 0;
                }
        } else
-               __lock_page(page);
+               __folio_lock(folio);
+
        return 1;
 }
 
-
 /*
  * Synchronous readahead happens when we don't even find a page in the page
  * cache at all.  We don't want to perform IO under the mmap sem, so if we have
@@ -3708,28 +3706,6 @@ out:
 }
 EXPORT_SYMBOL(generic_file_direct_write);
 
-/*
- * Find or create a page at the given pagecache position. Return the locked
- * page. This function is specifically for buffered writes.
- */
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
-                                       pgoff_t index, unsigned flags)
-{
-       struct page *page;
-       int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
-
-       if (flags & AOP_FLAG_NOFS)
-               fgp_flags |= FGP_NOFS;
-
-       page = pagecache_get_page(mapping, index, fgp_flags,
-                       mapping_gfp_mask(mapping));
-       if (page)
-               wait_for_stable_page(page);
-
-       return page;
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
-
 ssize_t generic_perform_write(struct file *file,
                                struct iov_iter *i, loff_t pos)
 {
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
new file mode 100644 (file)
index 0000000..5b6ae1d
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ * Compatibility functions which bloat the callers too much to make inline.
+ * All of the callers of these functions should be converted to use folios
+ * eventually.
+ */
+
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+struct address_space *page_mapping(struct page *page)
+{
+       return folio_mapping(page_folio(page));
+}
+EXPORT_SYMBOL(page_mapping);
+
+void unlock_page(struct page *page)
+{
+       return folio_unlock(page_folio(page));
+}
+EXPORT_SYMBOL(unlock_page);
+
+void end_page_writeback(struct page *page)
+{
+       return folio_end_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(end_page_writeback);
+
+void wait_on_page_writeback(struct page *page)
+{
+       return folio_wait_writeback(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
+void wait_for_stable_page(struct page *page)
+{
+       return folio_wait_stable(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_for_stable_page);
+
+bool page_mapped(struct page *page)
+{
+       return folio_mapped(page_folio(page));
+}
+EXPORT_SYMBOL(page_mapped);
+
+void mark_page_accessed(struct page *page)
+{
+       folio_mark_accessed(page_folio(page));
+}
+EXPORT_SYMBOL(mark_page_accessed);
+
+#ifdef CONFIG_MIGRATION
+int migrate_page_move_mapping(struct address_space *mapping,
+               struct page *newpage, struct page *page, int extra_count)
+{
+       return folio_migrate_mapping(mapping, page_folio(newpage),
+                                       page_folio(page), extra_count);
+}
+EXPORT_SYMBOL(migrate_page_move_mapping);
+
+void migrate_page_states(struct page *newpage, struct page *page)
+{
+       folio_migrate_flags(page_folio(newpage), page_folio(page));
+}
+EXPORT_SYMBOL(migrate_page_states);
+
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+       folio_migrate_copy(page_folio(newpage), page_folio(page));
+}
+EXPORT_SYMBOL(migrate_page_copy);
+#endif
+
+bool set_page_writeback(struct page *page)
+{
+       return folio_start_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_writeback);
+
+bool set_page_dirty(struct page *page)
+{
+       return folio_mark_dirty(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+int __set_page_dirty_nobuffers(struct page *page)
+{
+       return filemap_dirty_folio(page_mapping(page), page_folio(page));
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+bool clear_page_dirty_for_io(struct page *page)
+{
+       return folio_clear_dirty_for_io(page_folio(page));
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+bool redirty_page_for_writepage(struct writeback_control *wbc,
+               struct page *page)
+{
+       return folio_redirty_for_writepage(wbc, page_folio(page));
+}
+EXPORT_SYMBOL(redirty_page_for_writepage);
+
+void lru_cache_add(struct page *page)
+{
+       folio_add_lru(page_folio(page));
+}
+EXPORT_SYMBOL(lru_cache_add);
+
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+               pgoff_t index, gfp_t gfp)
+{
+       return filemap_add_folio(mapping, page_folio(page), index, gfp);
+}
+EXPORT_SYMBOL(add_to_page_cache_lru);
+
+noinline
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+               int fgp_flags, gfp_t gfp)
+{
+       struct folio *folio;
+
+       folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
+       if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio))
+               return &folio->page;
+       return folio_file_page(folio, index);
+}
+EXPORT_SYMBOL(pagecache_get_page);
+
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+                                       pgoff_t index, unsigned flags)
+{
+       unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+
+       if (flags & AOP_FLAG_NOFS)
+               fgp_flags |= FGP_NOFS;
+       return pagecache_get_page(mapping, index, fgp_flags,
+                       mapping_gfp_mask(mapping));
+}
+EXPORT_SYMBOL(grab_cache_page_write_begin);
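
To show why these shims exist, here is a hedged sketch of a legacy struct-page caller that keeps working unchanged; every call resolves to the folio implementation above. The helper name is hypothetical and the snippet is not part of this patch.

/* Hypothetical legacy caller, still written against struct page. */
static int example_legacy_dirty(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        page = grab_cache_page_write_begin(mapping, index, 0);
        if (!page)
                return -ENOMEM;
        set_page_dirty(page);           /* -> folio_mark_dirty() */
        unlock_page(page);              /* -> folio_unlock() */
        put_page(page);
        return 0;
}
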
index 4212ad0..471d977 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
-#include <linux/blkdev.h>
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
index 92192cb..e548334 100644 (file)
@@ -603,7 +603,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+       if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                count_vm_event(THP_FAULT_FALLBACK_CHARGE);
@@ -2405,7 +2405,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
 static void __split_huge_page(struct page *page, struct list_head *list,
                pgoff_t end)
 {
-       struct page *head = compound_head(page);
+       struct folio *folio = page_folio(page);
+       struct page *head = &folio->page;
        struct lruvec *lruvec;
        struct address_space *swap_cache = NULL;
        unsigned long offset = 0;
@@ -2424,7 +2425,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        }
 
        /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
-       lruvec = lock_page_lruvec(head);
+       lruvec = folio_lruvec_lock(folio);
+
+       ClearPageHasHWPoisoned(head);
 
        for (i = nr - 1; i >= 1; i--) {
                __split_huge_page_tail(head, i, lruvec, list);
index 95dc7b8..6378c10 100644 (file)
@@ -5302,7 +5302,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                        *pagep = NULL;
                        goto out;
                }
-               copy_huge_page(page, *pagep);
+               folio_copy(page_folio(page), page_folio(*pagep));
                put_page(*pagep);
                *pagep = NULL;
        }
index cf3cb93..b1001eb 100644 (file)
 
 void page_writeback_init(void);
 
+static inline void *folio_raw_mapping(struct folio *folio)
+{
+       unsigned long mapping = (unsigned long)folio->mapping;
+
+       return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
+}
+
 vm_fault_t do_swap_page(struct vm_fault *vmf);
+void folio_rotate_reclaimable(struct folio *folio);
+bool __folio_end_writeback(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
@@ -63,17 +72,28 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
                pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
 
 /**
- * page_evictable - test whether a page is evictable
- * @page: the page to test
+ * folio_evictable - Test whether a folio is evictable.
+ * @folio: The folio to test.
  *
- * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list.
- *
- * Reasons page might not be evictable:
- * (1) page's mapping marked unevictable
- * (2) page is part of an mlocked VMA
+ * Test whether @folio is evictable -- i.e., should be placed on
+ * active/inactive lists vs unevictable list.
  *
+ * Reasons a folio might not be evictable:
+ * 1. The folio's mapping is marked unevictable.
+ * 2. One of the pages in the folio is part of an mlocked VMA.
  */
+static inline bool folio_evictable(struct folio *folio)
+{
+       bool ret;
+
+       /* Prevent address_space of inode and swap cache from being freed */
+       rcu_read_lock();
+       ret = !mapping_unevictable(folio_mapping(folio)) &&
+                       !folio_test_mlocked(folio);
+       rcu_read_unlock();
+       return ret;
+}
+
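
A hedged sketch of how a reclaim-style caller might combine the new predicate with other folio state (illustrative only, not part of this patch; the skip-writeback policy and the helper name are assumptions):

/* Hypothetical check: only evictable folios that are not under writeback. */
static inline bool example_may_reclaim(struct folio *folio)
{
        return folio_evictable(folio) && !folio_test_writeback(folio);
}
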
 static inline bool page_evictable(struct page *page)
 {
        bool ret;
index 045cc57..5f02fda 100644 (file)
@@ -445,22 +445,25 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
        if (!transhuge_vma_enabled(vma, vm_flags))
                return false;
 
+       if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
+                               vma->vm_pgoff, HPAGE_PMD_NR))
+               return false;
+
        /* Enabled via shmem mount options or sysfs settings. */
-       if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
-               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-                               HPAGE_PMD_NR);
-       }
+       if (shmem_file(vma->vm_file))
+               return shmem_huge_enabled(vma);
 
        /* THP settings require madvise. */
        if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
                return false;
 
-       /* Read-only file mappings need to be aligned for THP to work. */
+       /* Only regular files are valid */
        if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
-           !inode_is_open_for_write(vma->vm_file->f_inode) &&
            (vm_flags & VM_EXEC)) {
-               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-                               HPAGE_PMD_NR);
+               struct inode *inode = vma->vm_file->f_inode;
+
+               return !inode_is_open_for_write(inode) &&
+                       S_ISREG(inode->i_mode);
        }
 
        if (!vma->anon_vma || vma->vm_ops)
@@ -1087,7 +1090,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out_nolock;
        }
 
-       if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
+       if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
                result = SCAN_CGROUP_CHARGE_FAIL;
                goto out_nolock;
        }
@@ -1211,7 +1214,7 @@ out_up_write:
        mmap_write_unlock(mm);
 out_nolock:
        if (!IS_ERR_OR_NULL(*hpage))
-               mem_cgroup_uncharge(*hpage);
+               mem_cgroup_uncharge(page_folio(*hpage));
        trace_mm_collapse_huge_page(mm, isolated, result);
        return;
 }
@@ -1658,7 +1661,7 @@ static void collapse_file(struct mm_struct *mm,
                goto out;
        }
 
-       if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
+       if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
                result = SCAN_CGROUP_CHARGE_FAIL;
                goto out;
        }
@@ -1763,6 +1766,10 @@ static void collapse_file(struct mm_struct *mm,
                                filemap_flush(mapping);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
+                       } else if (PageWriteback(page)) {
+                               xas_unlock_irq(&xas);
+                               result = SCAN_FAIL;
+                               goto xa_unlocked;
                        } else if (trylock_page(page)) {
                                get_page(page);
                                xas_unlock_irq(&xas);
@@ -1798,7 +1805,8 @@ static void collapse_file(struct mm_struct *mm,
                        goto out_unlock;
                }
 
-               if (!is_shmem && PageDirty(page)) {
+               if (!is_shmem && (PageDirty(page) ||
+                                 PageWriteback(page))) {
                        /*
                         * khugepaged only works on read-only fd, so this
                         * page is dirty because it hasn't been flushed
@@ -1975,7 +1983,7 @@ xa_unlocked:
 out:
        VM_BUG_ON(!list_empty(&pagelist));
        if (!IS_ERR_OR_NULL(*hpage))
-               mem_cgroup_uncharge(*hpage);
+               mem_cgroup_uncharge(page_folio(*hpage));
        /* TODO: tracepoints */
 }
 
index a5716fd..0662093 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -751,7 +751,7 @@ stale:
        /*
         * We come here from above when page->mapping or !PageSwapCache
         * suggests that the node is stale; but it might be under migration.
-        * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+        * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
         * before checking whether node->kpfn has been changed.
         */
        smp_rmb();
@@ -852,9 +852,14 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
        return err;
 }
 
+static inline struct stable_node *folio_stable_node(struct folio *folio)
+{
+       return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
+}
+
 static inline struct stable_node *page_stable_node(struct page *page)
 {
-       return PageKsm(page) ? page_rmapping(page) : NULL;
+       return folio_stable_node(page_folio(page));
 }
 
 static inline void set_page_stable_node(struct page *page,
@@ -2578,7 +2583,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
                return page;            /* let do_swap_page report the error */
 
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-       if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
+       if (new_page &&
+           mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
                put_page(new_page);
                new_page = NULL;
        }
@@ -2658,26 +2664,26 @@ again:
 }
 
 #ifdef CONFIG_MIGRATION
-void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
 {
        struct stable_node *stable_node;
 
-       VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
+       VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
 
-       stable_node = page_stable_node(newpage);
+       stable_node = folio_stable_node(folio);
        if (stable_node) {
-               VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
-               stable_node->kpfn = page_to_pfn(newpage);
+               VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
+               stable_node->kpfn = folio_pfn(newfolio);
                /*
-                * newpage->mapping was set in advance; now we need smp_wmb()
+                * newfolio->mapping was set in advance; now we need smp_wmb()
                 * to make sure that the new stable_node->kpfn is visible
-                * to get_ksm_page() before it can see that oldpage->mapping
-                * has gone stale (or that PageSwapCache has been cleared).
+                * to get_ksm_page() before it can see that folio->mapping
+                * has gone stale (or that folio_test_swapcache has been cleared).
                 */
                smp_wmb();
-               set_page_stable_node(oldpage, NULL);
+               set_page_stable_node(&folio->page, NULL);
        }
 }
 #endif /* CONFIG_MIGRATION */
index 6da5020..8dab23a 100644 (file)
@@ -456,28 +456,6 @@ ino_t page_cgroup_ino(struct page *page)
        return ino;
 }
 
-static struct mem_cgroup_per_node *
-mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
-{
-       int nid = page_to_nid(page);
-
-       return memcg->nodeinfo[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_node(int nid)
-{
-       return soft_limit_tree.rb_tree_per_node[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_from_page(struct page *page)
-{
-       int nid = page_to_nid(page);
-
-       return soft_limit_tree.rb_tree_per_node[nid];
-}
-
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
                                         struct mem_cgroup_tree_per_node *mctz,
                                         unsigned long new_usage_in_excess)
@@ -548,13 +526,13 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
        return excess;
 }
 
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
 {
        unsigned long excess;
        struct mem_cgroup_per_node *mz;
        struct mem_cgroup_tree_per_node *mctz;
 
-       mctz = soft_limit_tree_from_page(page);
+       mctz = soft_limit_tree.rb_tree_per_node[nid];
        if (!mctz)
                return;
        /*
@@ -562,7 +540,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
         * because their event counter is not touched.
         */
        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-               mz = mem_cgroup_page_nodeinfo(memcg, page);
+               mz = memcg->nodeinfo[nid];
                excess = soft_limit_excess(memcg);
                /*
                 * We have to update the tree if mz is on RB-tree or
@@ -593,7 +571,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 
        for_each_node(nid) {
                mz = memcg->nodeinfo[nid];
-               mctz = soft_limit_tree_node(nid);
+               mctz = soft_limit_tree.rb_tree_per_node[nid];
                if (mctz)
                        mem_cgroup_remove_exceeded(mz, mctz);
        }
@@ -799,7 +777,6 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
-                                        struct page *page,
                                         int nr_pages)
 {
        /* pagein of a big page is an event. So, ignore page size */
@@ -842,7 +819,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  * Check events in order.
  *
  */
-static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, int nid)
 {
        /* threshold event is triggered in finer grain than soft limit */
        if (unlikely(mem_cgroup_event_ratelimit(memcg,
@@ -853,7 +830,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
                                                MEM_CGROUP_TARGET_SOFTLIMIT);
                mem_cgroup_threshold(memcg);
                if (unlikely(do_softlimit))
-                       mem_cgroup_update_tree(memcg, page);
+                       mem_cgroup_update_tree(memcg, nid);
        }
 }
 
@@ -1149,64 +1126,88 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 }
 
 #ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
        struct mem_cgroup *memcg;
 
        if (mem_cgroup_disabled())
                return;
 
-       memcg = page_memcg(page);
+       memcg = folio_memcg(folio);
 
        if (!memcg)
-               VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+               VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
        else
-               VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
+               VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
 }
 #endif
 
 /**
- * lock_page_lruvec - lock and return lruvec for a given page.
- * @page: the page
+ * folio_lruvec_lock - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
  *
  * These functions are safe to use under any of the following conditions:
- * - page locked
- * - PageLRU cleared
- * - lock_page_memcg()
- * - page->_refcount is zero
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held.
  */
-struct lruvec *lock_page_lruvec(struct page *page)
+struct lruvec *folio_lruvec_lock(struct folio *folio)
 {
-       struct lruvec *lruvec;
+       struct lruvec *lruvec = folio_lruvec(folio);
 
-       lruvec = mem_cgroup_page_lruvec(page);
        spin_lock(&lruvec->lru_lock);
-
-       lruvec_memcg_debug(lruvec, page);
+       lruvec_memcg_debug(lruvec, folio);
 
        return lruvec;
 }
 
-struct lruvec *lock_page_lruvec_irq(struct page *page)
+/**
+ * folio_lruvec_lock_irq - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
 {
-       struct lruvec *lruvec;
+       struct lruvec *lruvec = folio_lruvec(folio);
 
-       lruvec = mem_cgroup_page_lruvec(page);
        spin_lock_irq(&lruvec->lru_lock);
-
-       lruvec_memcg_debug(lruvec, page);
+       lruvec_memcg_debug(lruvec, folio);
 
        return lruvec;
 }
 
-struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+/**
+ * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ * @flags: Pointer to irqsave flags.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
+               unsigned long *flags)
 {
-       struct lruvec *lruvec;
+       struct lruvec *lruvec = folio_lruvec(folio);
 
-       lruvec = mem_cgroup_page_lruvec(page);
        spin_lock_irqsave(&lruvec->lru_lock, *flags);
-
-       lruvec_memcg_debug(lruvec, page);
+       lruvec_memcg_debug(lruvec, folio);
 
        return lruvec;
 }
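
A hedged sketch of the lock/unlock pairing for the irqsave variant (not part of this patch; the helper name is made up, and real callers would normally use the existing unlock_page_lruvec_*() helpers rather than the raw spinlock call shown here to mirror the lock taken above):

/* Hypothetical caller: peek at LRU state under the lruvec lock. */
static long example_peek_lru(struct folio *folio)
{
        unsigned long flags;
        long nr = 0;
        struct lruvec *lruvec;

        lruvec = folio_lruvec_lock_irqsave(folio, &flags);
        if (folio_test_lru(folio))
                nr = folio_nr_pages(folio);
        spin_unlock_irqrestore(&lruvec->lru_lock, flags);
        return nr;
}
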
@@ -1956,18 +1957,17 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
 }
 
 /**
- * lock_page_memcg - lock a page and memcg binding
- * @page: the page
+ * folio_memcg_lock - Bind a folio to its memcg.
+ * @folio: The folio.
  *
- * This function protects unlocked LRU pages from being moved to
+ * This function prevents unlocked LRU folios from being moved to
  * another cgroup.
  *
- * It ensures lifetime of the locked memcg. Caller is responsible
- * for the lifetime of the page.
+ * It ensures lifetime of the bound memcg.  The caller is responsible
+ * for the lifetime of the folio.
  */
-void lock_page_memcg(struct page *page)
+void folio_memcg_lock(struct folio *folio)
 {
-       struct page *head = compound_head(page); /* rmap on tail pages */
        struct mem_cgroup *memcg;
        unsigned long flags;
 
@@ -1981,7 +1981,7 @@ void lock_page_memcg(struct page *page)
        if (mem_cgroup_disabled())
                return;
 again:
-       memcg = page_memcg(head);
+       memcg = folio_memcg(folio);
        if (unlikely(!memcg))
                return;
 
@@ -1995,7 +1995,7 @@ again:
                return;
 
        spin_lock_irqsave(&memcg->move_lock, flags);
-       if (memcg != page_memcg(head)) {
+       if (memcg != folio_memcg(folio)) {
                spin_unlock_irqrestore(&memcg->move_lock, flags);
                goto again;
        }
@@ -2009,9 +2009,15 @@ again:
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
 }
+EXPORT_SYMBOL(folio_memcg_lock);
+
+void lock_page_memcg(struct page *page)
+{
+       folio_memcg_lock(page_folio(page));
+}
 EXPORT_SYMBOL(lock_page_memcg);
 
-static void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __folio_memcg_unlock(struct mem_cgroup *memcg)
 {
        if (memcg && memcg->move_lock_task == current) {
                unsigned long flags = memcg->move_lock_flags;
@@ -2026,14 +2032,22 @@ static void __unlock_page_memcg(struct mem_cgroup *memcg)
 }
 
 /**
- * unlock_page_memcg - unlock a page and memcg binding
- * @page: the page
+ * folio_memcg_unlock - Release the binding between a folio and its memcg.
+ * @folio: The folio.
+ *
+ * This releases the binding created by folio_memcg_lock().  This does
+ * not change the accounting of this folio to its memcg, but it does
+ * permit others to change it.
  */
-void unlock_page_memcg(struct page *page)
+void folio_memcg_unlock(struct folio *folio)
 {
-       struct page *head = compound_head(page);
+       __folio_memcg_unlock(folio_memcg(folio));
+}
+EXPORT_SYMBOL(folio_memcg_unlock);
 
-       __unlock_page_memcg(page_memcg(head));
+void unlock_page_memcg(struct page *page)
+{
+       folio_memcg_unlock(page_folio(page));
 }
 EXPORT_SYMBOL(unlock_page_memcg);
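
A hedged sketch of the pairing documented above (not part of this patch): the lock pins the folio<->memcg binding so that an accounting update lands in the right memcg. The helper name, the NR_FILE_DIRTY choice, and the use of the page-based __mod_lruvec_page_state() (which predates this series) are illustrative assumptions.

/* Hypothetical caller: dirty a folio and account it under the binding lock. */
static void example_account_dirty(struct folio *folio)
{
        folio_memcg_lock(folio);
        if (!folio_test_set_dirty(folio))
                __mod_lruvec_page_state(&folio->page, NR_FILE_DIRTY,
                                        folio_nr_pages(folio));
        folio_memcg_unlock(folio);
}
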
 
@@ -2734,9 +2748,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 }
 #endif
 
-static void commit_charge(struct page *page, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 {
-       VM_BUG_ON_PAGE(page_memcg(page), page);
+       VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
        /*
         * Any of the following ensures page's memcg stability:
         *
@@ -2745,7 +2759,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
         * - lock_page_memcg()
         * - exclusive reference
         */
-       page->memcg_data = (unsigned long)memcg;
+       folio->memcg_data = (unsigned long)memcg;
 }
 
 static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
@@ -3015,15 +3029,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
  */
 void __memcg_kmem_uncharge_page(struct page *page, int order)
 {
+       struct folio *folio = page_folio(page);
        struct obj_cgroup *objcg;
        unsigned int nr_pages = 1 << order;
 
-       if (!PageMemcgKmem(page))
+       if (!folio_memcg_kmem(folio))
                return;
 
-       objcg = __page_objcg(page);
+       objcg = __folio_objcg(folio);
        obj_cgroup_uncharge_pages(objcg, nr_pages);
-       page->memcg_data = 0;
+       folio->memcg_data = 0;
        obj_cgroup_put(objcg);
 }
 
@@ -3257,17 +3272,18 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
  */
 void split_page_memcg(struct page *head, unsigned int nr)
 {
-       struct mem_cgroup *memcg = page_memcg(head);
+       struct folio *folio = page_folio(head);
+       struct mem_cgroup *memcg = folio_memcg(folio);
        int i;
 
        if (mem_cgroup_disabled() || !memcg)
                return;
 
        for (i = 1; i < nr; i++)
-               head[i].memcg_data = head->memcg_data;
+               folio_page(folio, i)->memcg_data = folio->memcg_data;
 
-       if (PageMemcgKmem(head))
-               obj_cgroup_get_many(__page_objcg(head), nr - 1);
+       if (folio_memcg_kmem(folio))
+               obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
        else
                css_get_many(&memcg->css, nr - 1);
 }
@@ -3381,7 +3397,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
        if (order > 0)
                return 0;
 
-       mctz = soft_limit_tree_node(pgdat->node_id);
+       mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
 
        /*
         * Do not even bother to check the largest node if the root
@@ -4537,17 +4553,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  * As being wrong occasionally doesn't matter, updates and accesses to the
  * records are lockless and racy.
  */
-void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb)
 {
-       struct mem_cgroup *memcg = page_memcg(page);
+       struct mem_cgroup *memcg = folio_memcg(folio);
        struct memcg_cgwb_frn *frn;
        u64 now = get_jiffies_64();
        u64 oldest_at = now;
        int oldest = -1;
        int i;
 
-       trace_track_foreign_dirty(page, wb);
+       trace_track_foreign_dirty(folio, wb);
 
        /*
         * Pick the slot to use.  If there is already a slot for @wb, keep
@@ -5575,38 +5591,39 @@ static int mem_cgroup_move_account(struct page *page,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
 {
+       struct folio *folio = page_folio(page);
        struct lruvec *from_vec, *to_vec;
        struct pglist_data *pgdat;
-       unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
-       int ret;
+       unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
+       int nid, ret;
 
        VM_BUG_ON(from == to);
-       VM_BUG_ON_PAGE(PageLRU(page), page);
-       VM_BUG_ON(compound && !PageTransHuge(page));
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+       VM_BUG_ON(compound && !folio_test_multi(folio));
 
        /*
         * Prevent mem_cgroup_migrate() from looking at
         * page's memory cgroup of its source page while we change it.
         */
        ret = -EBUSY;
-       if (!trylock_page(page))
+       if (!folio_trylock(folio))
                goto out;
 
        ret = -EINVAL;
-       if (page_memcg(page) != from)
+       if (folio_memcg(folio) != from)
                goto out_unlock;
 
-       pgdat = page_pgdat(page);
+       pgdat = folio_pgdat(folio);
        from_vec = mem_cgroup_lruvec(from, pgdat);
        to_vec = mem_cgroup_lruvec(to, pgdat);
 
-       lock_page_memcg(page);
+       folio_memcg_lock(folio);
 
-       if (PageAnon(page)) {
-               if (page_mapped(page)) {
+       if (folio_test_anon(folio)) {
+               if (folio_mapped(folio)) {
                        __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
-                       if (PageTransHuge(page)) {
+                       if (folio_test_transhuge(folio)) {
                                __mod_lruvec_state(from_vec, NR_ANON_THPS,
                                                   -nr_pages);
                                __mod_lruvec_state(to_vec, NR_ANON_THPS,
@@ -5617,18 +5634,18 @@ static int mem_cgroup_move_account(struct page *page,
                __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
                __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
 
-               if (PageSwapBacked(page)) {
+               if (folio_test_swapbacked(folio)) {
                        __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
                }
 
-               if (page_mapped(page)) {
+               if (folio_mapped(folio)) {
                        __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
                }
 
-               if (PageDirty(page)) {
-                       struct address_space *mapping = page_mapping(page);
+               if (folio_test_dirty(folio)) {
+                       struct address_space *mapping = folio_mapping(folio);
 
                        if (mapping_can_writeback(mapping)) {
                                __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
@@ -5639,7 +5656,7 @@ static int mem_cgroup_move_account(struct page *page,
                }
        }
 
-       if (PageWriteback(page)) {
+       if (folio_test_writeback(folio)) {
                __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
                __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
        }
@@ -5662,20 +5679,21 @@ static int mem_cgroup_move_account(struct page *page,
        css_get(&to->css);
        css_put(&from->css);
 
-       page->memcg_data = (unsigned long)to;
+       folio->memcg_data = (unsigned long)to;
 
-       __unlock_page_memcg(from);
+       __folio_memcg_unlock(from);
 
        ret = 0;
+       nid = folio_nid(folio);
 
        local_irq_disable();
-       mem_cgroup_charge_statistics(to, page, nr_pages);
-       memcg_check_events(to, page);
-       mem_cgroup_charge_statistics(from, page, -nr_pages);
-       memcg_check_events(from, page);
+       mem_cgroup_charge_statistics(to, nr_pages);
+       memcg_check_events(to, nid);
+       mem_cgroup_charge_statistics(from, -nr_pages);
+       memcg_check_events(from, nid);
        local_irq_enable();
 out_unlock:
-       unlock_page(page);
+       folio_unlock(folio);
 out:
        return ret;
 }
@@ -6680,9 +6698,10 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                        atomic_long_read(&parent->memory.children_low_usage)));
 }
 
-static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
+static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+                       gfp_t gfp)
 {
-       unsigned int nr_pages = thp_nr_pages(page);
+       long nr_pages = folio_nr_pages(folio);
        int ret;
 
        ret = try_charge(memcg, gfp, nr_pages);
@@ -6690,38 +6709,23 @@ static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
                goto out;
 
        css_get(&memcg->css);
-       commit_charge(page, memcg);
+       commit_charge(folio, memcg);
 
        local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, page, nr_pages);
-       memcg_check_events(memcg, page);
+       mem_cgroup_charge_statistics(memcg, nr_pages);
+       memcg_check_events(memcg, folio_nid(folio));
        local_irq_enable();
 out:
        return ret;
 }
 
-/**
- * __mem_cgroup_charge - charge a newly allocated page to a cgroup
- * @page: page to charge
- * @mm: mm context of the victim
- * @gfp_mask: reclaim mode
- *
- * Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary. if @mm is NULL, try to
- * charge to the active memcg.
- *
- * Do not use this for pages allocated for swapin.
- *
- * Returns 0 on success. Otherwise, an error code is returned.
- */
-int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-                       gfp_t gfp_mask)
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 {
        struct mem_cgroup *memcg;
        int ret;
 
        memcg = get_mem_cgroup_from_mm(mm);
-       ret = charge_memcg(page, memcg, gfp_mask);
+       ret = charge_memcg(folio, memcg, gfp);
        css_put(&memcg->css);
 
        return ret;
@@ -6742,6 +6746,7 @@ int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry)
 {
+       struct folio *folio = page_folio(page);
        struct mem_cgroup *memcg;
        unsigned short id;
        int ret;
@@ -6756,7 +6761,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                memcg = get_mem_cgroup_from_mm(mm);
        rcu_read_unlock();
 
-       ret = charge_memcg(page, memcg, gfp);
+       ret = charge_memcg(folio, memcg, gfp);
 
        css_put(&memcg->css);
        return ret;
@@ -6800,7 +6805,7 @@ struct uncharge_gather {
        unsigned long nr_memory;
        unsigned long pgpgout;
        unsigned long nr_kmem;
-       struct page *dummy_page;
+       int nid;
 };
 
 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
@@ -6824,36 +6829,36 @@ static void uncharge_batch(const struct uncharge_gather *ug)
        local_irq_save(flags);
        __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
        __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-       memcg_check_events(ug->memcg, ug->dummy_page);
+       memcg_check_events(ug->memcg, ug->nid);
        local_irq_restore(flags);
 
-       /* drop reference from uncharge_page */
+       /* drop reference from uncharge_folio */
        css_put(&ug->memcg->css);
 }
 
-static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
 {
-       unsigned long nr_pages;
+       long nr_pages;
        struct mem_cgroup *memcg;
        struct obj_cgroup *objcg;
-       bool use_objcg = PageMemcgKmem(page);
+       bool use_objcg = folio_memcg_kmem(folio);
 
-       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
        /*
         * Nobody should be changing or seriously looking at
-        * page memcg or objcg at this point, we have fully
-        * exclusive access to the page.
+        * folio memcg or objcg at this point, we have fully
+        * exclusive access to the folio.
         */
        if (use_objcg) {
-               objcg = __page_objcg(page);
+               objcg = __folio_objcg(folio);
                /*
                 * This get matches the put at the end of the function and
                 * kmem pages do not hold memcg references anymore.
                 */
                memcg = get_mem_cgroup_from_objcg(objcg);
        } else {
-               memcg = __page_memcg(page);
+               memcg = __folio_memcg(folio);
        }
 
        if (!memcg)
@@ -6865,19 +6870,19 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                        uncharge_gather_clear(ug);
                }
                ug->memcg = memcg;
-               ug->dummy_page = page;
+               ug->nid = folio_nid(folio);
 
                /* pairs with css_put in uncharge_batch */
                css_get(&memcg->css);
        }
 
-       nr_pages = compound_nr(page);
+       nr_pages = folio_nr_pages(folio);
 
        if (use_objcg) {
                ug->nr_memory += nr_pages;
                ug->nr_kmem += nr_pages;
 
-               page->memcg_data = 0;
+               folio->memcg_data = 0;
                obj_cgroup_put(objcg);
        } else {
                /* LRU pages aren't accounted at the root level */
@@ -6885,28 +6890,22 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                        ug->nr_memory += nr_pages;
                ug->pgpgout++;
 
-               page->memcg_data = 0;
+               folio->memcg_data = 0;
        }
 
        css_put(&memcg->css);
 }
 
-/**
- * __mem_cgroup_uncharge - uncharge a page
- * @page: page to uncharge
- *
- * Uncharge a page previously charged with __mem_cgroup_charge().
- */
-void __mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct folio *folio)
 {
        struct uncharge_gather ug;
 
-       /* Don't touch page->lru of any random page, pre-check: */
-       if (!page_memcg(page))
+       /* Don't touch folio->lru of any random page, pre-check: */
+       if (!folio_memcg(folio))
                return;
 
        uncharge_gather_clear(&ug);
-       uncharge_page(page, &ug);
+       uncharge_folio(folio, &ug);
        uncharge_batch(&ug);
 }
 
@@ -6920,52 +6919,49 @@ void __mem_cgroup_uncharge(struct page *page)
 void __mem_cgroup_uncharge_list(struct list_head *page_list)
 {
        struct uncharge_gather ug;
-       struct page *page;
+       struct folio *folio;
 
        uncharge_gather_clear(&ug);
-       list_for_each_entry(page, page_list, lru)
-               uncharge_page(page, &ug);
+       list_for_each_entry(folio, page_list, lru)
+               uncharge_folio(folio, &ug);
        if (ug.memcg)
                uncharge_batch(&ug);
 }
 
 /**
- * mem_cgroup_migrate - charge a page's replacement
- * @oldpage: currently circulating page
- * @newpage: replacement page
+ * mem_cgroup_migrate - Charge a folio's replacement.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
  *
- * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * Charge @new as a replacement folio for @old. @old will
  * be uncharged upon free.
  *
- * Both pages must be locked, @newpage->mapping must be set up.
+ * Both folios must be locked, @new->mapping must be set up.
  */
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
        struct mem_cgroup *memcg;
-       unsigned int nr_pages;
+       long nr_pages = folio_nr_pages(new);
        unsigned long flags;
 
-       VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
-       VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
-                      newpage);
+       VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+       VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+       VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+       VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
 
        if (mem_cgroup_disabled())
                return;
 
-       /* Page cache replacement: new page already charged? */
-       if (page_memcg(newpage))
+       /* Page cache replacement: new folio already charged? */
+       if (folio_memcg(new))
                return;
 
-       memcg = page_memcg(oldpage);
-       VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
+       memcg = folio_memcg(old);
+       VM_WARN_ON_ONCE_FOLIO(!memcg, old);
        if (!memcg)
                return;
 
        /* Force-charge the new page. The old one will be freed soon */
-       nr_pages = thp_nr_pages(newpage);
-
        if (!mem_cgroup_is_root(memcg)) {
                page_counter_charge(&memcg->memory, nr_pages);
                if (do_memsw_account())
@@ -6973,11 +6969,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
        }
 
        css_get(&memcg->css);
-       commit_charge(newpage, memcg);
+       commit_charge(new, memcg);
 
        local_irq_save(flags);
-       mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
-       memcg_check_events(memcg, newpage);
+       mem_cgroup_charge_statistics(memcg, nr_pages);
+       memcg_check_events(memcg, folio_nid(new));
        local_irq_restore(flags);
 }
 
@@ -7204,8 +7200,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
-       mem_cgroup_charge_statistics(memcg, page, -nr_entries);
-       memcg_check_events(memcg, page);
+       mem_cgroup_charge_statistics(memcg, -nr_entries);
+       memcg_check_events(memcg, page_to_nid(page));
 
        css_put(&memcg->css);
 }
index 3e6449f..93078a2 100644 (file)
@@ -762,7 +762,7 @@ static int delete_from_lru_cache(struct page *p)
                 * Poisoned page might never drop its ref count to 0 so we have
                 * to uncharge it manually from its memcg.
                 */
-               mem_cgroup_uncharge(p);
+               mem_cgroup_uncharge(page_folio(p));
 
                /*
                 * drop the page count elevated by isolate_lru_page()
@@ -1147,20 +1147,6 @@ static int __get_hwpoison_page(struct page *page)
        if (!HWPoisonHandlable(head))
                return -EBUSY;
 
-       if (PageTransHuge(head)) {
-               /*
-                * Non anonymous thp exists only in allocation/free time. We
-                * can't handle such a case correctly, so let's give it up.
-                * This should be better than triggering BUG_ON when kernel
-                * tries to touch the "partially handled" page.
-                */
-               if (!PageAnon(head)) {
-                       pr_err("Memory failure: %#lx: non anonymous thp\n",
-                               page_to_pfn(page));
-                       return 0;
-               }
-       }
-
        if (get_page_unless_zero(head)) {
                if (head == compound_head(page))
                        return 1;
@@ -1708,6 +1694,20 @@ try_again:
        }
 
        if (PageTransHuge(hpage)) {
+               /*
+                * The flag must be set after the refcount is bumped,
+                * otherwise it may race with THP split.
+                * The flag can't be set in get_hwpoison_page() because that
+                * is also called by soft offline, and it only runs for the
+                * !MF_COUNT_INCREASED case, so here seems to be the best
+                * place.
+                *
+                * There is no need to care about the above error handling
+                * paths for get_hwpoison_page() since they handle either a
+                * free page or an unhandlable page.  The refcount is bumped
+                * iff the page is a valid handlable page.
+                */
+               SetPageHasHWPoisoned(hpage);
                if (try_to_split_thp_page(p, "Memory Failure") < 0) {
                        action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
                        res = -EBUSY;
index adf9b9e..4b1de80 100644 (file)
@@ -990,7 +990,7 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
        if (!new_page)
                return NULL;
 
-       if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+       if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
                put_page(new_page);
                return NULL;
        }
@@ -3019,7 +3019,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                }
        }
 
-       if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
                goto oom_free_new;
        cgroup_throttle_swaprate(new_page, GFP_KERNEL);
 
@@ -3539,7 +3539,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
-                                       workingset_refault(page, shadow);
+                                       workingset_refault(page_folio(page),
+                                                               shadow);
 
                                lru_cache_add(page);
 
@@ -3769,7 +3770,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        if (!page)
                goto oom;
 
-       if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
                goto oom_free_page;
        cgroup_throttle_swaprate(page, GFP_KERNEL);
 
@@ -3907,6 +3908,15 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
                return ret;
 
        /*
+        * Just back off if any subpage of a THP is corrupted, otherwise
+        * the corrupted page may be mapped by the PMD silently and escape
+        * the check.  This kind of THP can only be PTE mapped.  Access to
+        * the corrupted subpage should trigger SIGBUS as expected.
+        */
+       if (unlikely(PageHasHWPoisoned(page)))
+               return ret;
+
+       /*
         * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
@@ -4193,7 +4203,8 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
        if (!vmf->cow_page)
                return VM_FAULT_OOM;
 
-       if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
+       if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
+                               GFP_KERNEL)) {
                put_page(vmf->cow_page);
                return VM_FAULT_OOM;
        }
@@ -4258,7 +4269,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults).
  * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
  * If mmap_lock is released, vma may become invalid (for example
  * by other thread calling munmap()).
  */
@@ -4499,7 +4510,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
  * concurrent faults).
  *
  * The mmap_lock may have been released depending on flags and our return value.
- * See filemap_fault() and __lock_page_or_retry().
+ * See filemap_fault() and __folio_lock_or_retry().
  */
 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 {
@@ -4603,7 +4614,7 @@ unlock:
  * By the time we get here, we already hold the mm semaphore
  *
  * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
  */
 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
@@ -4759,7 +4770,7 @@ static inline void mm_account_fault(struct pt_regs *regs,
  * By the time we get here, we already hold the mm semaphore
  *
  * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
  */
 vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
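
The mm/memory.c hunks above switch the mem_cgroup_charge() call sites to the
folio-based API by wrapping the page with page_folio() at the call site.  A
minimal, hypothetical sketch of that calling convention (not part of the patch;
alloc_charged_page() is an invented name, only mem_cgroup_charge() and
page_folio() come from the series):

#include <linux/gfp.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>

static struct page *alloc_charged_page(struct mm_struct *mm, gfp_t gfp)
{
        struct page *page = alloc_page(gfp);

        if (!page)
                return NULL;

        /* The memcg charge API is folio-based now; convert at the call site. */
        if (mem_cgroup_charge(page_folio(page), mm, gfp)) {
                put_page(page);
                return NULL;
        }
        return page;
}
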
index d12e060..f4b4be7 100644 (file)
@@ -2196,6 +2196,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(alloc_pages);
 
+struct folio *folio_alloc(gfp_t gfp, unsigned order)
+{
+       struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+       if (page && order > 1)
+               prep_transhuge_page(page);
+       return (struct folio *)page;
+}
+EXPORT_SYMBOL(folio_alloc);
+
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
        struct mempolicy *pol = mpol_dup(vma_policy(src));
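
folio_alloc() above is a thin wrapper that asks for a __GFP_COMP allocation and
returns it as a folio.  A hypothetical usage sketch, assuming ordinary kernel
context (folio_alloc_demo() is invented; folio_nr_pages() and folio_put() are
existing folio helpers):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static int folio_alloc_demo(void)
{
        /* An order-2 folio is four physically contiguous pages. */
        struct folio *folio = folio_alloc(GFP_KERNEL, 2);

        if (!folio)
                return -ENOMEM;

        pr_info("allocated a folio of %ld pages\n", folio_nr_pages(folio));

        folio_put(folio);       /* drop the reference taken by folio_alloc() */
        return 0;
}
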
index 0b8afbe..b933d0f 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/kmemleak.h>
 #include <linux/export.h>
 #include <linux/mempool.h>
-#include <linux/blkdev.h>
 #include <linux/writeback.h>
 #include "slab.h"
 
index ed593bf..5a66a71 100644 (file)
@@ -505,7 +505,7 @@ void free_devmap_managed_page(struct page *page)
 
        __ClearPageWaiters(page);
 
-       mem_cgroup_uncharge(page);
+       mem_cgroup_uncharge(page_folio(page));
 
        /*
         * When a device_private page is freed, the page->mapping field
index 1852d78..efa9941 100644 (file)
@@ -364,7 +364,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
         */
        expected_count += is_device_private_page(page);
        if (mapping)
-               expected_count += thp_nr_pages(page) + page_has_private(page);
+               expected_count += compound_nr(page) + page_has_private(page);
 
        return expected_count;
 }
@@ -377,74 +377,75 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
-int migrate_page_move_mapping(struct address_space *mapping,
-               struct page *newpage, struct page *page, int extra_count)
+int folio_migrate_mapping(struct address_space *mapping,
+               struct folio *newfolio, struct folio *folio, int extra_count)
 {
-       XA_STATE(xas, &mapping->i_pages, page_index(page));
+       XA_STATE(xas, &mapping->i_pages, folio_index(folio));
        struct zone *oldzone, *newzone;
        int dirty;
-       int expected_count = expected_page_refs(mapping, page) + extra_count;
-       int nr = thp_nr_pages(page);
+       int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
+       long nr = folio_nr_pages(folio);
 
        if (!mapping) {
                /* Anonymous page without mapping */
-               if (page_count(page) != expected_count)
+               if (folio_ref_count(folio) != expected_count)
                        return -EAGAIN;
 
                /* No turning back from here */
-               newpage->index = page->index;
-               newpage->mapping = page->mapping;
-               if (PageSwapBacked(page))
-                       __SetPageSwapBacked(newpage);
+               newfolio->index = folio->index;
+               newfolio->mapping = folio->mapping;
+               if (folio_test_swapbacked(folio))
+                       __folio_set_swapbacked(newfolio);
 
                return MIGRATEPAGE_SUCCESS;
        }
 
-       oldzone = page_zone(page);
-       newzone = page_zone(newpage);
+       oldzone = folio_zone(folio);
+       newzone = folio_zone(newfolio);
 
        xas_lock_irq(&xas);
-       if (page_count(page) != expected_count || xas_load(&xas) != page) {
+       if (folio_ref_count(folio) != expected_count ||
+           xas_load(&xas) != folio) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }
 
-       if (!page_ref_freeze(page, expected_count)) {
+       if (!folio_ref_freeze(folio, expected_count)) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }
 
        /*
-        * Now we know that no one else is looking at the page:
+        * Now we know that no one else is looking at the folio:
         * no turning back from here.
         */
-       newpage->index = page->index;
-       newpage->mapping = page->mapping;
-       page_ref_add(newpage, nr); /* add cache reference */
-       if (PageSwapBacked(page)) {
-               __SetPageSwapBacked(newpage);
-               if (PageSwapCache(page)) {
-                       SetPageSwapCache(newpage);
-                       set_page_private(newpage, page_private(page));
+       newfolio->index = folio->index;
+       newfolio->mapping = folio->mapping;
+       folio_ref_add(newfolio, nr); /* add cache reference */
+       if (folio_test_swapbacked(folio)) {
+               __folio_set_swapbacked(newfolio);
+               if (folio_test_swapcache(folio)) {
+                       folio_set_swapcache(newfolio);
+                       newfolio->private = folio_get_private(folio);
                }
        } else {
-               VM_BUG_ON_PAGE(PageSwapCache(page), page);
+               VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
        }
 
        /* Move dirty while page refs frozen and newpage not yet exposed */
-       dirty = PageDirty(page);
+       dirty = folio_test_dirty(folio);
        if (dirty) {
-               ClearPageDirty(page);
-               SetPageDirty(newpage);
+               folio_clear_dirty(folio);
+               folio_set_dirty(newfolio);
        }
 
-       xas_store(&xas, newpage);
-       if (PageTransHuge(page)) {
+       xas_store(&xas, newfolio);
+       if (nr > 1) {
                int i;
 
                for (i = 1; i < nr; i++) {
                        xas_next(&xas);
-                       xas_store(&xas, newpage);
+                       xas_store(&xas, newfolio);
                }
        }
 
@@ -453,7 +454,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
         * to one less reference.
         * We know this isn't the last reference.
         */
-       page_ref_unfreeze(page, expected_count - nr);
+       folio_ref_unfreeze(folio, expected_count - nr);
 
        xas_unlock(&xas);
        /* Leave irq disabled to prevent preemption while updating stats */
@@ -472,18 +473,18 @@ int migrate_page_move_mapping(struct address_space *mapping,
                struct lruvec *old_lruvec, *new_lruvec;
                struct mem_cgroup *memcg;
 
-               memcg = page_memcg(page);
+               memcg = folio_memcg(folio);
                old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
                new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
 
                __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
                __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
-               if (PageSwapBacked(page) && !PageSwapCache(page)) {
+               if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
                        __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
                }
 #ifdef CONFIG_SWAP
-               if (PageSwapCache(page)) {
+               if (folio_test_swapcache(folio)) {
                        __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
                }
@@ -499,11 +500,11 @@ int migrate_page_move_mapping(struct address_space *mapping,
 
        return MIGRATEPAGE_SUCCESS;
 }
-EXPORT_SYMBOL(migrate_page_move_mapping);
+EXPORT_SYMBOL(folio_migrate_mapping);
 
 /*
  * The expected number of remaining references is the same as that
- * of migrate_page_move_mapping().
+ * of folio_migrate_mapping().
  */
 int migrate_huge_page_move_mapping(struct address_space *mapping,
                                   struct page *newpage, struct page *page)
@@ -538,91 +539,87 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 }
 
 /*
- * Copy the page to its new location
+ * Copy the flags and some other ancillary information
  */
-void migrate_page_states(struct page *newpage, struct page *page)
+void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
 {
        int cpupid;
 
-       if (PageError(page))
-               SetPageError(newpage);
-       if (PageReferenced(page))
-               SetPageReferenced(newpage);
-       if (PageUptodate(page))
-               SetPageUptodate(newpage);
-       if (TestClearPageActive(page)) {
-               VM_BUG_ON_PAGE(PageUnevictable(page), page);
-               SetPageActive(newpage);
-       } else if (TestClearPageUnevictable(page))
-               SetPageUnevictable(newpage);
-       if (PageWorkingset(page))
-               SetPageWorkingset(newpage);
-       if (PageChecked(page))
-               SetPageChecked(newpage);
-       if (PageMappedToDisk(page))
-               SetPageMappedToDisk(newpage);
-
-       /* Move dirty on pages not done by migrate_page_move_mapping() */
-       if (PageDirty(page))
-               SetPageDirty(newpage);
-
-       if (page_is_young(page))
-               set_page_young(newpage);
-       if (page_is_idle(page))
-               set_page_idle(newpage);
+       if (folio_test_error(folio))
+               folio_set_error(newfolio);
+       if (folio_test_referenced(folio))
+               folio_set_referenced(newfolio);
+       if (folio_test_uptodate(folio))
+               folio_mark_uptodate(newfolio);
+       if (folio_test_clear_active(folio)) {
+               VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
+               folio_set_active(newfolio);
+       } else if (folio_test_clear_unevictable(folio))
+               folio_set_unevictable(newfolio);
+       if (folio_test_workingset(folio))
+               folio_set_workingset(newfolio);
+       if (folio_test_checked(folio))
+               folio_set_checked(newfolio);
+       if (folio_test_mappedtodisk(folio))
+               folio_set_mappedtodisk(newfolio);
+
+       /* Move dirty on pages not done by folio_migrate_mapping() */
+       if (folio_test_dirty(folio))
+               folio_set_dirty(newfolio);
+
+       if (folio_test_young(folio))
+               folio_set_young(newfolio);
+       if (folio_test_idle(folio))
+               folio_set_idle(newfolio);
 
        /*
         * Copy NUMA information to the new page, to prevent over-eager
         * future migrations of this same page.
         */
-       cpupid = page_cpupid_xchg_last(page, -1);
-       page_cpupid_xchg_last(newpage, cpupid);
+       cpupid = page_cpupid_xchg_last(&folio->page, -1);
+       page_cpupid_xchg_last(&newfolio->page, cpupid);
 
-       ksm_migrate_page(newpage, page);
+       folio_migrate_ksm(newfolio, folio);
        /*
         * Please do not reorder this without considering how mm/ksm.c's
         * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
         */
-       if (PageSwapCache(page))
-               ClearPageSwapCache(page);
-       ClearPagePrivate(page);
+       if (folio_test_swapcache(folio))
+               folio_clear_swapcache(folio);
+       folio_clear_private(folio);
 
        /* page->private contains hugetlb specific flags */
-       if (!PageHuge(page))
-               set_page_private(page, 0);
+       if (!folio_test_hugetlb(folio))
+               folio->private = NULL;
 
        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
-       if (PageWriteback(newpage))
-               end_page_writeback(newpage);
+       if (folio_test_writeback(newfolio))
+               folio_end_writeback(newfolio);
 
        /*
         * PG_readahead shares the same bit with PG_reclaim.  The above
         * end_page_writeback() may clear PG_readahead mistakenly, so set the
         * bit after that.
         */
-       if (PageReadahead(page))
-               SetPageReadahead(newpage);
+       if (folio_test_readahead(folio))
+               folio_set_readahead(newfolio);
 
-       copy_page_owner(page, newpage);
+       folio_copy_owner(newfolio, folio);
 
-       if (!PageHuge(page))
-               mem_cgroup_migrate(page, newpage);
+       if (!folio_test_hugetlb(folio))
+               mem_cgroup_migrate(folio, newfolio);
 }
-EXPORT_SYMBOL(migrate_page_states);
+EXPORT_SYMBOL(folio_migrate_flags);
 
-void migrate_page_copy(struct page *newpage, struct page *page)
+void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
 {
-       if (PageHuge(page) || PageTransHuge(page))
-               copy_huge_page(newpage, page);
-       else
-               copy_highpage(newpage, page);
-
-       migrate_page_states(newpage, page);
+       folio_copy(newfolio, folio);
+       folio_migrate_flags(newfolio, folio);
 }
-EXPORT_SYMBOL(migrate_page_copy);
+EXPORT_SYMBOL(folio_migrate_copy);
 
 /************************************************************
  *                    Migration functions
@@ -638,19 +635,21 @@ int migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
 {
+       struct folio *newfolio = page_folio(newpage);
+       struct folio *folio = page_folio(page);
        int rc;
 
-       BUG_ON(PageWriteback(page));    /* Writeback must be complete */
+       BUG_ON(folio_test_writeback(folio));    /* Writeback must be complete */
 
-       rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+       rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
 
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
 
        if (mode != MIGRATE_SYNC_NO_COPY)
-               migrate_page_copy(newpage, page);
+               folio_migrate_copy(newfolio, folio);
        else
-               migrate_page_states(newpage, page);
+               folio_migrate_flags(newfolio, folio);
        return MIGRATEPAGE_SUCCESS;
 }
 EXPORT_SYMBOL(migrate_page);
@@ -2468,7 +2467,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
  * @page: struct page to check
  *
  * Pinned pages cannot be migrated. This is the same test as in
- * migrate_page_move_mapping(), except that here we allow migration of a
+ * folio_migrate_mapping(), except that here we allow migration of a
  * ZONE_DEVICE page.
  */
 static bool migrate_vma_check_page(struct page *page)
@@ -2846,7 +2845,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 
        if (unlikely(anon_vma_prepare(vma)))
                goto abort;
-       if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
                goto abort;
 
        /*
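
With the conversion above, migrate_page() becomes a thin wrapper over the new
folio helpers.  As a rough sketch of how a simple filesystem's ->migratepage
callback could be built on the same three calls (purely illustrative; "myfs"
and the function name are invented):

#include <linux/fs.h>
#include <linux/migrate.h>
#include <linux/pagemap.h>

static int myfs_migratepage(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
        struct folio *newfolio = page_folio(newpage);
        struct folio *folio = page_folio(page);
        int rc;

        /* Move the page cache entry and transfer the cache reference. */
        rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;

        /* Copy data plus flags, or flags only for MIGRATE_SYNC_NO_COPY. */
        if (mode != MIGRATE_SYNC_NO_COPY)
                folio_migrate_copy(newfolio, folio);
        else
                folio_migrate_flags(newfolio, folio);
        return MIGRATEPAGE_SUCCESS;
}
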
index 16d2ee1..e263d62 100644 (file)
@@ -271,6 +271,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
        /* Phase 1: page isolation */
        for (i = 0; i < nr; i++) {
                struct page *page = pvec->pages[i];
+               struct folio *folio = page_folio(page);
 
                if (TestClearPageMlocked(page)) {
                        /*
@@ -278,7 +279,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
                         * so we can spare the get_page() here.
                         */
                        if (TestClearPageLRU(page)) {
-                               lruvec = relock_page_lruvec_irq(page, lruvec);
+                               lruvec = folio_lruvec_relock_irq(folio, lruvec);
                                del_page_from_lru_list(page, lruvec);
                                continue;
                        } else
index 02d2427..41ef204 100644 (file)
@@ -27,7 +27,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/compiler.h>
 #include <linux/mount.h>
index 831340e..989f35a 100644 (file)
@@ -1150,7 +1150,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
        struct task_struct *task;
        struct task_struct *p;
        unsigned int f_flags;
-       bool reap = true;
+       bool reap = false;
        struct pid *pid;
        long ret = 0;
 
@@ -1177,15 +1177,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
                goto put_task;
        }
 
-       mm = p->mm;
-       mmgrab(mm);
-
-       /* If the work has been done already, just exit with success */
-       if (test_bit(MMF_OOM_SKIP, &mm->flags))
-               reap = false;
-       else if (!task_will_free_mem(p)) {
-               reap = false;
-               ret = -EINVAL;
+       if (mmget_not_zero(p->mm)) {
+               mm = p->mm;
+               if (task_will_free_mem(p))
+                       reap = true;
+               else {
+                       /* Error only if the work has not been done already */
+                       if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+                               ret = -EINVAL;
+               }
        }
        task_unlock(p);
 
@@ -1201,7 +1201,8 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
        mmap_read_unlock(mm);
 
 drop_mm:
-       mmdrop(mm);
+       if (mm)
+               mmput(mm);
 put_task:
        put_task_struct(task);
 put_pid:
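
The process_mrelease() fix above replaces the bare mmgrab() with
mmget_not_zero(), so the syscall now holds a real user reference on the target
mm (paired with mmput()) instead of only pinning the mm_struct allocation.  A
minimal sketch of that pinning pattern, with pin_task_mm() invented purely for
illustration:

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>

static struct mm_struct *pin_task_mm(struct task_struct *p)
{
        struct mm_struct *mm = NULL;

        task_lock(p);
        if (p->mm && mmget_not_zero(p->mm))
                mm = p->mm;             /* caller must mmput(mm) when done */
        task_unlock(p);

        return mm;
}
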
index 4812a17..9c64490 100644 (file)
@@ -562,12 +562,12 @@ static unsigned long wp_next_time(unsigned long cur_time)
        return cur_time;
 }
 
-static void wb_domain_writeout_inc(struct wb_domain *dom,
+static void wb_domain_writeout_add(struct wb_domain *dom,
                                   struct fprop_local_percpu *completions,
-                                  unsigned int max_prop_frac)
+                                  unsigned int max_prop_frac, long nr)
 {
-       __fprop_inc_percpu_max(&dom->completions, completions,
-                              max_prop_frac);
+       __fprop_add_percpu_max(&dom->completions, completions,
+                              max_prop_frac, nr);
        /* First event after period switching was turned off? */
        if (unlikely(!dom->period_time)) {
                /*
@@ -583,20 +583,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom,
 
 /*
  * Increment @wb's writeout completion count and the global writeout
- * completion count. Called from test_clear_page_writeback().
+ * completion count. Called from __folio_end_writeback().
  */
-static inline void __wb_writeout_inc(struct bdi_writeback *wb)
+static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
 {
        struct wb_domain *cgdom;
 
-       inc_wb_stat(wb, WB_WRITTEN);
-       wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
-                              wb->bdi->max_prop_frac);
+       wb_stat_mod(wb, WB_WRITTEN, nr);
+       wb_domain_writeout_add(&global_wb_domain, &wb->completions,
+                              wb->bdi->max_prop_frac, nr);
 
        cgdom = mem_cgroup_wb_domain(wb);
        if (cgdom)
-               wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
-                                      wb->bdi->max_prop_frac);
+               wb_domain_writeout_add(cgdom, wb_memcg_completions(wb),
+                                      wb->bdi->max_prop_frac, nr);
 }
 
 void wb_writeout_inc(struct bdi_writeback *wb)
@@ -604,7 +604,7 @@ void wb_writeout_inc(struct bdi_writeback *wb)
        unsigned long flags;
 
        local_irq_save(flags);
-       __wb_writeout_inc(wb);
+       __wb_writeout_add(wb, 1);
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(wb_writeout_inc);
@@ -1084,7 +1084,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
         * write_bandwidth = ---------------------------------------------------
         *                                          period
         *
-        * @written may have decreased due to account_page_redirty().
+        * @written may have decreased due to folio_account_redirty().
         * Avoid underflowing @bw calculation.
         */
        bw = written - min(written, wb->written_stamp);
@@ -2381,44 +2381,44 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 }
 
 /**
- * write_one_page - write out a single page and wait on I/O
- * @page: the page to write
+ * folio_write_one - write out a single folio and wait on I/O.
+ * @folio: The folio to write.
  *
- * The page must be locked by the caller and will be unlocked upon return.
+ * The folio must be locked by the caller and will be unlocked upon return.
  *
  * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
  * function returns.
  *
  * Return: %0 on success, negative error code otherwise
  */
-int write_one_page(struct page *page)
+int folio_write_one(struct folio *folio)
 {
-       struct address_space *mapping = page->mapping;
+       struct address_space *mapping = folio->mapping;
        int ret = 0;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
-               .nr_to_write = 1,
+               .nr_to_write = folio_nr_pages(folio),
        };
 
-       BUG_ON(!PageLocked(page));
+       BUG_ON(!folio_test_locked(folio));
 
-       wait_on_page_writeback(page);
+       folio_wait_writeback(folio);
 
-       if (clear_page_dirty_for_io(page)) {
-               get_page(page);
-               ret = mapping->a_ops->writepage(page, &wbc);
+       if (folio_clear_dirty_for_io(folio)) {
+               folio_get(folio);
+               ret = mapping->a_ops->writepage(&folio->page, &wbc);
                if (ret == 0)
-                       wait_on_page_writeback(page);
-               put_page(page);
+                       folio_wait_writeback(folio);
+               folio_put(folio);
        } else {
-               unlock_page(page);
+               folio_unlock(folio);
        }
 
        if (!ret)
                ret = filemap_check_errors(mapping);
        return ret;
 }
-EXPORT_SYMBOL(write_one_page);
+EXPORT_SYMBOL(folio_write_one);
 
 /*
  * For address_spaces which do not use buffers nor write back.
@@ -2438,29 +2438,30 @@ EXPORT_SYMBOL(__set_page_dirty_no_writeback);
  *
  * NOTE: This relies on being atomic wrt interrupts.
  */
-static void account_page_dirtied(struct page *page,
+static void folio_account_dirtied(struct folio *folio,
                struct address_space *mapping)
 {
        struct inode *inode = mapping->host;
 
-       trace_writeback_dirty_page(page, mapping);
+       trace_writeback_dirty_folio(folio, mapping);
 
        if (mapping_can_writeback(mapping)) {
                struct bdi_writeback *wb;
+               long nr = folio_nr_pages(folio);
 
-               inode_attach_wb(inode, page);
+               inode_attach_wb(inode, &folio->page);
                wb = inode_to_wb(inode);
 
-               __inc_lruvec_page_state(page, NR_FILE_DIRTY);
-               __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-               __inc_node_page_state(page, NR_DIRTIED);
-               inc_wb_stat(wb, WB_RECLAIMABLE);
-               inc_wb_stat(wb, WB_DIRTIED);
-               task_io_account_write(PAGE_SIZE);
-               current->nr_dirtied++;
-               __this_cpu_inc(bdp_ratelimits);
+               __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+               __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
+               __node_stat_mod_folio(folio, NR_DIRTIED, nr);
+               wb_stat_mod(wb, WB_RECLAIMABLE, nr);
+               wb_stat_mod(wb, WB_DIRTIED, nr);
+               task_io_account_write(nr * PAGE_SIZE);
+               current->nr_dirtied += nr;
+               __this_cpu_add(bdp_ratelimits, nr);
 
-               mem_cgroup_track_foreign_dirty(page, wb);
+               mem_cgroup_track_foreign_dirty(folio, wb);
        }
 }
 
@@ -2469,130 +2470,152 @@ static void account_page_dirtied(struct page *page,
  *
  * Caller must hold lock_page_memcg().
  */
-void account_page_cleaned(struct page *page, struct address_space *mapping,
+void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
                          struct bdi_writeback *wb)
 {
        if (mapping_can_writeback(mapping)) {
-               dec_lruvec_page_state(page, NR_FILE_DIRTY);
-               dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-               dec_wb_stat(wb, WB_RECLAIMABLE);
-               task_io_account_cancelled_write(PAGE_SIZE);
+               long nr = folio_nr_pages(folio);
+               lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+               zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+               wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+               task_io_account_cancelled_write(nr * PAGE_SIZE);
        }
 }
 
 /*
- * Mark the page dirty, and set it dirty in the page cache, and mark the inode
- * dirty.
+ * Mark the folio dirty, and set it dirty in the page cache, and mark
+ * the inode dirty.
  *
- * If warn is true, then emit a warning if the page is not uptodate and has
+ * If warn is true, then emit a warning if the folio is not uptodate and has
  * not been truncated.
  *
  * The caller must hold lock_page_memcg().
  */
-void __set_page_dirty(struct page *page, struct address_space *mapping,
+void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
                             int warn)
 {
        unsigned long flags;
 
        xa_lock_irqsave(&mapping->i_pages, flags);
-       if (page->mapping) {    /* Race with truncate? */
-               WARN_ON_ONCE(warn && !PageUptodate(page));
-               account_page_dirtied(page, mapping);
-               __xa_set_mark(&mapping->i_pages, page_index(page),
+       if (folio->mapping) {   /* Race with truncate? */
+               WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
+               folio_account_dirtied(folio, mapping);
+               __xa_set_mark(&mapping->i_pages, folio_index(folio),
                                PAGECACHE_TAG_DIRTY);
        }
        xa_unlock_irqrestore(&mapping->i_pages, flags);
 }
 
-/*
- * For address_spaces which do not use buffers.  Just tag the page as dirty in
- * the xarray.
+/**
+ * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
+ * @mapping: Address space this folio belongs to.
+ * @folio: Folio to be marked as dirty.
+ *
+ * Filesystems which do not use buffer heads should call this function
+ * from their set_page_dirty address space operation.  It ignores the
+ * contents of folio_get_private(), so if the filesystem marks individual
+ * blocks as dirty, the filesystem should handle that itself.
  *
- * This is also used when a single buffer is being dirtied: we want to set the
- * page dirty in that case, but not all the buffers.  This is a "bottom-up"
- * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ * This is also sometimes used by filesystems which use buffer_heads when
+ * a single buffer is being dirtied: we want to set the folio dirty in
+ * that case, but not all the buffers.  This is a "bottom-up" dirtying,
+ * whereas __set_page_dirty_buffers() is a "top-down" dirtying.
  *
- * The caller must ensure this doesn't race with truncation.  Most will simply
- * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
- * the pte lock held, which also locks out truncation.
+ * The caller must ensure this doesn't race with truncation.  Most will
+ * simply hold the folio lock, but e.g. zap_pte_range() calls with the
+ * folio mapped and the pte lock held, which also locks out truncation.
  */
-int __set_page_dirty_nobuffers(struct page *page)
+bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
 {
-       lock_page_memcg(page);
-       if (!TestSetPageDirty(page)) {
-               struct address_space *mapping = page_mapping(page);
+       folio_memcg_lock(folio);
+       if (folio_test_set_dirty(folio)) {
+               folio_memcg_unlock(folio);
+               return false;
+       }
 
-               if (!mapping) {
-                       unlock_page_memcg(page);
-                       return 1;
-               }
-               __set_page_dirty(page, mapping, !PagePrivate(page));
-               unlock_page_memcg(page);
+       __folio_mark_dirty(folio, mapping, !folio_test_private(folio));
+       folio_memcg_unlock(folio);
 
-               if (mapping->host) {
-                       /* !PageAnon && !swapper_space */
-                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-               }
-               return 1;
+       if (mapping->host) {
+               /* !PageAnon && !swapper_space */
+               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }
-       unlock_page_memcg(page);
-       return 0;
+       return true;
 }
-EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+EXPORT_SYMBOL(filemap_dirty_folio);
 
-/*
- * Call this whenever redirtying a page, to de-account the dirty counters
- * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
- * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
- * systematic errors in balanced_dirty_ratelimit and the dirty pages position
- * control.
+/**
+ * folio_account_redirty - Manually account for redirtying a page.
+ * @folio: The folio which is being redirtied.
+ *
+ * Most filesystems should call folio_redirty_for_writepage() instead
+ * of this function.  If your filesystem is doing writeback outside the
+ * context of a writeback_control(), it can call this when redirtying
+ * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
+ * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
+ * WB_WRITTEN) in the long term.  The mismatches will lead to systematic
+ * errors in balanced_dirty_ratelimit and the dirty pages position control.
  */
-void account_page_redirty(struct page *page)
+void folio_account_redirty(struct folio *folio)
 {
-       struct address_space *mapping = page->mapping;
+       struct address_space *mapping = folio->mapping;
 
        if (mapping && mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
                struct bdi_writeback *wb;
                struct wb_lock_cookie cookie = {};
+               long nr = folio_nr_pages(folio);
 
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
-               current->nr_dirtied--;
-               dec_node_page_state(page, NR_DIRTIED);
-               dec_wb_stat(wb, WB_DIRTIED);
+               current->nr_dirtied -= nr;
+               node_stat_mod_folio(folio, NR_DIRTIED, -nr);
+               wb_stat_mod(wb, WB_DIRTIED, -nr);
                unlocked_inode_to_wb_end(inode, &cookie);
        }
 }
-EXPORT_SYMBOL(account_page_redirty);
+EXPORT_SYMBOL(folio_account_redirty);
 
-/*
- * When a writepage implementation decides that it doesn't want to write this
- * page for some reason, it should redirty the locked page via
- * redirty_page_for_writepage() and it should then unlock the page and return 0
+/**
+ * folio_redirty_for_writepage - Decline to write a dirty folio.
+ * @wbc: The writeback control.
+ * @folio: The folio.
+ *
+ * When a writepage implementation decides that it doesn't want to write
+ * @folio for some reason, it should call this function, unlock @folio and
+ * return 0.
+ *
+ * Return: True if we redirtied the folio.  False if someone else dirtied
+ * it first.
  */
-int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
+bool folio_redirty_for_writepage(struct writeback_control *wbc,
+               struct folio *folio)
 {
-       int ret;
+       bool ret;
+       long nr = folio_nr_pages(folio);
+
+       wbc->pages_skipped += nr;
+       ret = filemap_dirty_folio(folio->mapping, folio);
+       folio_account_redirty(folio);
 
-       wbc->pages_skipped++;
-       ret = __set_page_dirty_nobuffers(page);
-       account_page_redirty(page);
        return ret;
 }
-EXPORT_SYMBOL(redirty_page_for_writepage);
+EXPORT_SYMBOL(folio_redirty_for_writepage);
 
-/*
- * Dirty a page.
+/**
+ * folio_mark_dirty - Mark a folio as being modified.
+ * @folio: The folio.
  *
- * For pages with a mapping this should be done under the page lock for the
- * benefit of asynchronous memory errors who prefer a consistent dirty state.
- * This rule can be broken in some special cases, but should be better not to.
+ * For folios with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory errors, which prefer a
+ * consistent dirty state.  This rule can be broken in some special
+ * cases, but it is better not to.
+ *
+ * Return: True if the folio was newly dirtied, false if it was already dirty.
  */
-int set_page_dirty(struct page *page)
+bool folio_mark_dirty(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(page);
+       struct address_space *mapping = folio_mapping(folio);
 
-       page = compound_head(page);
        if (likely(mapping)) {
                /*
                 * readahead/lru_deactivate_page could remain
@@ -2604,17 +2627,17 @@ int set_page_dirty(struct page *page)
                 * it will confuse readahead and make it restart the size rampup
                 * process. But it's a trivial problem.
                 */
-               if (PageReclaim(page))
-                       ClearPageReclaim(page);
-               return mapping->a_ops->set_page_dirty(page);
+               if (folio_test_reclaim(folio))
+                       folio_clear_reclaim(folio);
+               return mapping->a_ops->set_page_dirty(&folio->page);
        }
-       if (!PageDirty(page)) {
-               if (!TestSetPageDirty(page))
-                       return 1;
+       if (!folio_test_dirty(folio)) {
+               if (!folio_test_set_dirty(folio))
+                       return true;
        }
-       return 0;
+       return false;
 }
-EXPORT_SYMBOL(set_page_dirty);
+EXPORT_SYMBOL(folio_mark_dirty);
 
 /*
  * set_page_dirty() is racy if the caller has no reference against
@@ -2650,49 +2673,49 @@ EXPORT_SYMBOL(set_page_dirty_lock);
  * page without actually doing it through the VM. Can you say "ext3 is
  * horribly ugly"? Thought you could.
  */
-void __cancel_dirty_page(struct page *page)
+void __folio_cancel_dirty(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(page);
+       struct address_space *mapping = folio_mapping(folio);
 
        if (mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
                struct bdi_writeback *wb;
                struct wb_lock_cookie cookie = {};
 
-               lock_page_memcg(page);
+               folio_memcg_lock(folio);
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
 
-               if (TestClearPageDirty(page))
-                       account_page_cleaned(page, mapping, wb);
+               if (folio_test_clear_dirty(folio))
+                       folio_account_cleaned(folio, mapping, wb);
 
                unlocked_inode_to_wb_end(inode, &cookie);
-               unlock_page_memcg(page);
+               folio_memcg_unlock(folio);
        } else {
-               ClearPageDirty(page);
+               folio_clear_dirty(folio);
        }
 }
-EXPORT_SYMBOL(__cancel_dirty_page);
+EXPORT_SYMBOL(__folio_cancel_dirty);
 
 /*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- *
- * This is for preparing to put the page under writeout.  We leave the page
- * tagged as dirty in the xarray so that a concurrent write-for-sync
- * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
- * implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and xarray dirty tag
- * back into sync.
- *
- * This incoherency between the page's dirty flag and xarray tag is
- * unfortunate, but it only exists while the page is locked.
+ * Clear a folio's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the folio was previously dirty.
+ *
+ * This is for preparing to put the folio under writeout.  We leave
+ * the folio tagged as dirty in the xarray so that a concurrent
+ * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
+ * The ->writepage implementation will run either folio_start_writeback()
+ * or folio_mark_dirty(), at which stage we bring the folio's dirty flag
+ * and xarray dirty tag back into sync.
+ *
+ * This incoherency between the folio's dirty flag and xarray tag is
+ * unfortunate, but it only exists while the folio is locked.
  */
-int clear_page_dirty_for_io(struct page *page)
+bool folio_clear_dirty_for_io(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(page);
-       int ret = 0;
+       struct address_space *mapping = folio_mapping(folio);
+       bool ret = false;
 
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
        if (mapping && mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
@@ -2705,48 +2728,49 @@ int clear_page_dirty_for_io(struct page *page)
                 * We use this sequence to make sure that
                 *  (a) we account for dirty stats properly
                 *  (b) we tell the low-level filesystem to
-                *      mark the whole page dirty if it was
+                *      mark the whole folio dirty if it was
                 *      dirty in a pagetable. Only to then
-                *  (c) clean the page again and return 1 to
+                *  (c) clean the folio again and return 1 to
                 *      cause the writeback.
                 *
                 * This way we avoid all nasty races with the
                 * dirty bit in multiple places and clearing
                 * them concurrently from different threads.
                 *
-                * Note! Normally the "set_page_dirty(page)"
+                * Note! Normally the "folio_mark_dirty(folio)"
                 * has no effect on the actual dirty bit - since
                 * that will already usually be set. But we
                 * need the side effects, and it can help us
                 * avoid races.
                 *
-                * We basically use the page "master dirty bit"
+                * We basically use the folio "master dirty bit"
                 * as a serialization point for all the different
                 * threads doing their things.
                 */
-               if (page_mkclean(page))
-                       set_page_dirty(page);
+               if (folio_mkclean(folio))
+                       folio_mark_dirty(folio);
                /*
                 * We carefully synchronise fault handlers against
-                * installing a dirty pte and marking the page dirty
+                * installing a dirty pte and marking the folio dirty
                 * at this point.  We do this by having them hold the
-                * page lock while dirtying the page, and pages are
+                * page lock while dirtying the folio, and folios are
                 * always locked coming in here, so we get the desired
                 * exclusion.
                 */
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
-               if (TestClearPageDirty(page)) {
-                       dec_lruvec_page_state(page, NR_FILE_DIRTY);
-                       dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-                       dec_wb_stat(wb, WB_RECLAIMABLE);
-                       ret = 1;
+               if (folio_test_clear_dirty(folio)) {
+                       long nr = folio_nr_pages(folio);
+                       lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+                       zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+                       wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+                       ret = true;
                }
                unlocked_inode_to_wb_end(inode, &cookie);
                return ret;
        }
-       return TestClearPageDirty(page);
+       return folio_test_clear_dirty(folio);
 }
-EXPORT_SYMBOL(clear_page_dirty_for_io);
+EXPORT_SYMBOL(folio_clear_dirty_for_io);
 
 static void wb_inode_writeback_start(struct bdi_writeback *wb)
 {
@@ -2766,27 +2790,28 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb)
        queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
 }
 
-int test_clear_page_writeback(struct page *page)
+bool __folio_end_writeback(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(page);
-       int ret;
+       long nr = folio_nr_pages(folio);
+       struct address_space *mapping = folio_mapping(folio);
+       bool ret;
 
-       lock_page_memcg(page);
+       folio_memcg_lock(folio);
        if (mapping && mapping_use_writeback_tags(mapping)) {
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
 
                xa_lock_irqsave(&mapping->i_pages, flags);
-               ret = TestClearPageWriteback(page);
+               ret = folio_test_clear_writeback(folio);
                if (ret) {
-                       __xa_clear_mark(&mapping->i_pages, page_index(page),
+                       __xa_clear_mark(&mapping->i_pages, folio_index(folio),
                                                PAGECACHE_TAG_WRITEBACK);
                        if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                                struct bdi_writeback *wb = inode_to_wb(inode);
 
-                               dec_wb_stat(wb, WB_WRITEBACK);
-                               __wb_writeout_inc(wb);
+                               wb_stat_mod(wb, WB_WRITEBACK, -nr);
+                               __wb_writeout_add(wb, nr);
                                if (!mapping_tagged(mapping,
                                                    PAGECACHE_TAG_WRITEBACK))
                                        wb_inode_writeback_end(wb);
@@ -2799,32 +2824,34 @@ int test_clear_page_writeback(struct page *page)
 
                xa_unlock_irqrestore(&mapping->i_pages, flags);
        } else {
-               ret = TestClearPageWriteback(page);
+               ret = folio_test_clear_writeback(folio);
        }
        if (ret) {
-               dec_lruvec_page_state(page, NR_WRITEBACK);
-               dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-               inc_node_page_state(page, NR_WRITTEN);
+               lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
+               zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+               node_stat_mod_folio(folio, NR_WRITTEN, nr);
        }
-       unlock_page_memcg(page);
+       folio_memcg_unlock(folio);
        return ret;
 }
 
-int __test_set_page_writeback(struct page *page, bool keep_write)
+bool __folio_start_writeback(struct folio *folio, bool keep_write)
 {
-       struct address_space *mapping = page_mapping(page);
-       int ret, access_ret;
+       long nr = folio_nr_pages(folio);
+       struct address_space *mapping = folio_mapping(folio);
+       bool ret;
+       int access_ret;
 
-       lock_page_memcg(page);
+       folio_memcg_lock(folio);
        if (mapping && mapping_use_writeback_tags(mapping)) {
-               XA_STATE(xas, &mapping->i_pages, page_index(page));
+               XA_STATE(xas, &mapping->i_pages, folio_index(folio));
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
 
                xas_lock_irqsave(&xas, flags);
                xas_load(&xas);
-               ret = TestSetPageWriteback(page);
+               ret = folio_test_set_writeback(folio);
                if (!ret) {
                        bool on_wblist;
 
@@ -2835,84 +2862,105 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
                        if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                                struct bdi_writeback *wb = inode_to_wb(inode);
 
-                               inc_wb_stat(wb, WB_WRITEBACK);
+                               wb_stat_mod(wb, WB_WRITEBACK, nr);
                                if (!on_wblist)
                                        wb_inode_writeback_start(wb);
                        }
 
                        /*
-                        * We can come through here when swapping anonymous
-                        * pages, so we don't necessarily have an inode to track
-                        * for sync.
+                        * We can come through here when swapping
+                        * anonymous folios, so we don't necessarily
+                        * have an inode to track for sync.
                         */
                        if (mapping->host && !on_wblist)
                                sb_mark_inode_writeback(mapping->host);
                }
-               if (!PageDirty(page))
+               if (!folio_test_dirty(folio))
                        xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
                if (!keep_write)
                        xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
                xas_unlock_irqrestore(&xas, flags);
        } else {
-               ret = TestSetPageWriteback(page);
+               ret = folio_test_set_writeback(folio);
        }
        if (!ret) {
-               inc_lruvec_page_state(page, NR_WRITEBACK);
-               inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+               lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
+               zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
        }
-       unlock_page_memcg(page);
-       access_ret = arch_make_page_accessible(page);
+       folio_memcg_unlock(folio);
+       access_ret = arch_make_folio_accessible(folio);
        /*
         * If writeback has been triggered on a page that cannot be made
         * accessible, it is too late to recover here.
         */
-       VM_BUG_ON_PAGE(access_ret != 0, page);
+       VM_BUG_ON_FOLIO(access_ret != 0, folio);
 
        return ret;
-
 }
-EXPORT_SYMBOL(__test_set_page_writeback);
+EXPORT_SYMBOL(__folio_start_writeback);
 
-/*
- * Wait for a page to complete writeback
+/**
+ * folio_wait_writeback - Wait for a folio to finish writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete.
+ *
+ * Context: Sleeps.  Must be called in process context and with
+ * no spinlocks held.  Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
  */
-void wait_on_page_writeback(struct page *page)
+void folio_wait_writeback(struct folio *folio)
 {
-       while (PageWriteback(page)) {
-               trace_wait_on_page_writeback(page, page_mapping(page));
-               wait_on_page_bit(page, PG_writeback);
+       while (folio_test_writeback(folio)) {
+               trace_folio_wait_writeback(folio, folio_mapping(folio));
+               folio_wait_bit(folio, PG_writeback);
        }
 }
-EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+EXPORT_SYMBOL_GPL(folio_wait_writeback);
 
-/*
- * Wait for a page to complete writeback.  Returns -EINTR if we get a
- * fatal signal while waiting.
+/**
+ * folio_wait_writeback_killable - Wait for a folio to finish writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete or a fatal signal to arrive.
+ *
+ * Context: Sleeps.  Must be called in process context and with
+ * no spinlocks held.  Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
+ * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
  */
-int wait_on_page_writeback_killable(struct page *page)
+int folio_wait_writeback_killable(struct folio *folio)
 {
-       while (PageWriteback(page)) {
-               trace_wait_on_page_writeback(page, page_mapping(page));
-               if (wait_on_page_bit_killable(page, PG_writeback))
+       while (folio_test_writeback(folio)) {
+               trace_folio_wait_writeback(folio, folio_mapping(folio));
+               if (folio_wait_bit_killable(folio, PG_writeback))
                        return -EINTR;
        }
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
+EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
 
 /**
- * wait_for_stable_page() - wait for writeback to finish, if necessary.
- * @page:      The page to wait on.
+ * folio_wait_stable() - wait for writeback to finish, if necessary.
+ * @folio: The folio to wait on.
+ *
+ * This function determines if the given folio is related to a backing
+ * device that requires folio contents to be held stable during writeback.
+ * If so, then it will wait for any pending writeback to complete.
  *
- * This function determines if the given page is related to a backing device
- * that requires page contents to be held stable during writeback.  If so, then
- * it will wait for any pending writeback to complete.
+ * Context: Sleeps.  Must be called in process context and with
+ * no spinlocks held.  Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
  */
-void wait_for_stable_page(struct page *page)
+void folio_wait_stable(struct folio *folio)
 {
-       page = thp_head(page);
-       if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
-               wait_on_page_writeback(page);
+       if (folio->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
+               folio_wait_writeback(folio);
 }
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
+EXPORT_SYMBOL_GPL(folio_wait_stable);
index b37435c..fee18ad 100644 (file)
@@ -724,7 +724,7 @@ static inline void free_the_page(struct page *page, unsigned int order)
 
 void free_compound_page(struct page *page)
 {
-       mem_cgroup_uncharge(page);
+       mem_cgroup_uncharge(page_folio(page));
        free_the_page(page, compound_order(page));
 }
 
@@ -1312,8 +1312,10 @@ static __always_inline bool free_pages_prepare(struct page *page,
 
                VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
 
-               if (compound)
+               if (compound) {
                        ClearPageDoubleMap(page);
+                       ClearPageHasHWPoisoned(page);
+               }
                for (i = 1; i < (1 << order); i++) {
                        if (compound)
                                bad += free_tail_pages_check(page, page + i);
@@ -5223,6 +5225,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
        if (unlikely(page_array && nr_pages - nr_populated == 0))
                goto out;
 
+       /* Bulk allocator does not support memcg accounting. */
+       if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT))
+               goto failed;
+
        /* Use the single page allocator for one page. */
        if (nr_pages - nr_populated == 1)
                goto failed;
@@ -5400,6 +5406,18 @@ out:
 }
 EXPORT_SYMBOL(__alloc_pages);
 
+struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+               nodemask_t *nodemask)
+{
+       struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
+                       preferred_nid, nodemask);
+
+       if (page && order > 1)
+               prep_transhuge_page(page);
+       return (struct folio *)page;
+}
+EXPORT_SYMBOL(__folio_alloc);
+
 /*
  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
  * address cannot represent highmem pages. Use alloc_pages and then kmap if
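
The __alloc_pages_bulk() hunk above makes the bulk allocator decline
__GFP_ACCOUNT requests, since it performs no memcg accounting; such requests
fall through to the regular single-page path.  A small illustrative sketch of
the caller-visible consequence (bulk_fill() and NR_BULK are invented names):

#include <linux/gfp.h>
#include <linux/mm.h>

#define NR_BULK 16

static unsigned long bulk_fill(struct page **pages)
{
        /*
         * pages[] must start out zeroed; already-populated slots are skipped.
         * Plain GFP_KERNEL can be satisfied in one bulk call; adding
         * __GFP_ACCOUNT would instead fall back to the single page allocator.
         */
        return alloc_pages_bulk_array(GFP_KERNEL, NR_BULK, pages);
}
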
index c493ce9..9725c7e 100644 (file)
@@ -38,7 +38,7 @@ void end_swap_bio_write(struct bio *bio)
                 * Also print a dire warning that things will go BAD (tm)
                 * very quickly.
                 *
-                * Also clear PG_reclaim to avoid rotate_reclaimable_page()
+                * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
                 */
                set_page_dirty(page);
                pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
@@ -317,7 +317,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
                         * temporary failure if the system has limited
                         * memory for allocating transmit buffers.
                         * Mark the page dirty and avoid
-                        * rotate_reclaimable_page but rate-limit the
+                        * folio_rotate_reclaimable but rate-limit the
                         * messages but do not flag PageError like
                         * the normal direct-to-bio case as it could
                         * be temporary.
@@ -358,8 +358,6 @@ int swap_readpage(struct page *page, bool synchronous)
        struct bio *bio;
        int ret = 0;
        struct swap_info_struct *sis = page_swap_info(page);
-       blk_qc_t qc;
-       struct gendisk *disk;
        unsigned long pflags;
 
        VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
@@ -409,26 +407,24 @@ int swap_readpage(struct page *page, bool synchronous)
        bio->bi_iter.bi_sector = swap_page_sector(page);
        bio->bi_end_io = end_swap_bio_read;
        bio_add_page(bio, page, thp_size(page), 0);
-
-       disk = bio->bi_bdev->bd_disk;
        /*
         * Keep this task valid during swap readpage because the oom killer may
         * attempt to access it in the page fault retry time check.
         */
        if (synchronous) {
-               bio->bi_opf |= REQ_HIPRI;
+               bio->bi_opf |= REQ_POLLED;
                get_task_struct(current);
                bio->bi_private = current;
        }
        count_vm_event(PSWPIN);
        bio_get(bio);
-       qc = submit_bio(bio);
+       submit_bio(bio);
        while (synchronous) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio->bi_private))
                        break;
 
-               if (!blk_poll(disk->queue, qc, true))
+               if (!bio_poll(bio, NULL, 0))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
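
The swap_readpage() change above moves from the old (queue, cookie) polling
interface to the bio-based one: the bio is marked REQ_POLLED and its completion
is polled with bio_poll(bio, NULL, 0), so no separate cookie needs to be kept.
The waiting loop, isolated as a sketch (the helper name is invented; it
assumes, as in the hunk, that the completion handler clears bio->bi_private and
wakes the submitter):

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched.h>

static void wait_for_polled_bio(struct bio *bio)
{
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio->bi_private))
                        break;          /* completion handler has run */
                if (!bio_poll(bio, NULL, 0))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
}
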
index 62402d2..d24ed22 100644 (file)
@@ -210,10 +210,10 @@ void __split_page_owner(struct page *page, unsigned int nr)
        }
 }
 
-void __copy_page_owner(struct page *oldpage, struct page *newpage)
+void __folio_copy_owner(struct folio *newfolio, struct folio *old)
 {
-       struct page_ext *old_ext = lookup_page_ext(oldpage);
-       struct page_ext *new_ext = lookup_page_ext(newpage);
+       struct page_ext *old_ext = lookup_page_ext(&old->page);
+       struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
        struct page_owner *old_page_owner, *new_page_owner;
 
        if (unlikely(!old_ext || !new_ext))
@@ -231,11 +231,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
        new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
 
        /*
-        * We don't clear the bit on the oldpage as it's going to be freed
+        * We don't clear the bit on the old folio as it's going to be freed
         * after migration. Until then, the info can be useful in case of
         * a bug, and the overall stats will be off a bit only temporarily.
         * Also, migrate_misplaced_transhuge_page() can still fail the
-        * migration and then we want the oldpage to retain the info. But
+        * migration and then we want the old folio to retain the info. But
         * in that case we also don't need to explicitly clear the info from
         * the new page, which will be freed.
         */
index 41b75d7..e71e719 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/dax.h>
 #include <linux/gfp.h>
 #include <linux/export.h>
-#include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
index 6aebd17..3a1059c 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -34,7 +34,7 @@
  *                   mapping->private_lock (in __set_page_dirty_buffers)
  *                     lock_page_memcg move_lock (in __set_page_dirty_buffers)
  *                       i_pages lock (widely used)
- *                         lruvec->lru_lock (in lock_page_lruvec_irq)
+ *                         lruvec->lru_lock (in folio_lruvec_lock_irq)
  *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  *                     sb_lock (within inode_lock in fs/fs-writeback.c)
@@ -981,7 +981,7 @@ static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
        return true;
 }
 
-int page_mkclean(struct page *page)
+int folio_mkclean(struct folio *folio)
 {
        int cleaned = 0;
        struct address_space *mapping;
@@ -991,20 +991,20 @@ int page_mkclean(struct page *page)
                .invalid_vma = invalid_mkclean_vma,
        };
 
-       BUG_ON(!PageLocked(page));
+       BUG_ON(!folio_test_locked(folio));
 
-       if (!page_mapped(page))
+       if (!folio_mapped(folio))
                return 0;
 
-       mapping = page_mapping(page);
+       mapping = folio_mapping(folio);
        if (!mapping)
                return 0;
 
-       rmap_walk(page, &rwc);
+       rmap_walk(&folio->page, &rwc);
 
        return cleaned;
 }
-EXPORT_SYMBOL_GPL(page_mkclean);
+EXPORT_SYMBOL_GPL(folio_mkclean);
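folio_mkclean() keeps page_mkclean()'s contract: the folio must be locked, and the return value is the number of PTEs that were write-protected and cleaned. A minimal caller sketch under that assumption (example_start_writeback is a hypothetical name, not a kernel function):

/* Hypothetical sketch: write-protect all mappings of a locked folio before
 * starting writeback, so later stores re-dirty it through the fault path. */
static bool example_start_writeback(struct folio *folio)
{
        bool cleaned;

        folio_lock(folio);              /* folio_mkclean() asserts this */
        cleaned = folio_mkclean(folio) > 0;
        folio_unlock(folio);

        return cleaned;
}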
 
 /**
  * page_move_anon_rmap - move a page to our anon_vma
index c2dda40..22b310a 100644 (file)
@@ -218,8 +218,8 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
 
        file->f_flags |= O_LARGEFILE;
 
-       fd_install(fd, file);
        atomic_inc(&secretmem_users);
+       fd_install(fd, file);
        return fd;
 
 err_put_fd:
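The one-line swap above is an ordering fix: fd_install() publishes the file to userspace, so the secretmem_users count must already be raised before another thread can close the new fd and drop it. A generic publish-last sketch under that assumption (plain C11 with hypothetical names, not kernel code):

#include <stdatomic.h>

struct object { int value; };

static atomic_long users;                       /* bookkeeping others rely on */
static _Atomic(struct object *) published;      /* how other threads find it */

/* Do all accounting first, make the object reachable last. */
static void publish(struct object *obj)
{
        atomic_fetch_add(&users, 1);
        atomic_store(&published, obj);          /* the fd_install() step */
}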
index b5860f4..17e344e 100644 (file)
@@ -59,7 +59,6 @@ static struct vfsmount *shm_mnt;
 #include <linux/backing-dev.h>
 #include <linux/shmem_fs.h>
 #include <linux/writeback.h>
-#include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/percpu_counter.h>
 #include <linux/falloc.h>
@@ -710,7 +709,7 @@ static int shmem_add_to_page_cache(struct page *page,
        page->index = index;
 
        if (!PageSwapCache(page)) {
-               error = mem_cgroup_charge(page, charge_mm, gfp);
+               error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
                if (error) {
                        if (PageTransHuge(page)) {
                                count_vm_event(THP_FILE_FALLBACK);
@@ -1637,6 +1636,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
                                struct shmem_inode_info *info, pgoff_t index)
 {
        struct page *oldpage, *newpage;
+       struct folio *old, *new;
        struct address_space *swap_mapping;
        swp_entry_t entry;
        pgoff_t swap_index;
@@ -1673,7 +1673,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
        xa_lock_irq(&swap_mapping->i_pages);
        error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
        if (!error) {
-               mem_cgroup_migrate(oldpage, newpage);
+               old = page_folio(oldpage);
+               new = page_folio(newpage);
+               mem_cgroup_migrate(old, new);
                __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
                __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
        }
index af3cad4..8ff9ba7 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -80,10 +80,11 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
 static void __page_cache_release(struct page *page)
 {
        if (PageLRU(page)) {
+               struct folio *folio = page_folio(page);
                struct lruvec *lruvec;
                unsigned long flags;
 
-               lruvec = lock_page_lruvec_irqsave(page, &flags);
+               lruvec = folio_lruvec_lock_irqsave(folio, &flags);
                del_page_from_lru_list(page, lruvec);
                __clear_page_lru_flags(page);
                unlock_page_lruvec_irqrestore(lruvec, flags);
@@ -94,7 +95,7 @@ static void __page_cache_release(struct page *page)
 static void __put_single_page(struct page *page)
 {
        __page_cache_release(page);
-       mem_cgroup_uncharge(page);
+       mem_cgroup_uncharge(page_folio(page));
        free_unref_page(page, 0);
 }
 
@@ -188,12 +189,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
+               struct folio *folio = page_folio(page);
 
                /* block memcg migration during page moving between lru */
                if (!TestClearPageLRU(page))
                        continue;
 
-               lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
+               lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
                (*move_fn)(page, lruvec);
 
                SetPageLRU(page);
@@ -206,11 +208,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 
 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
 {
-       if (!PageUnevictable(page)) {
-               del_page_from_lru_list(page, lruvec);
-               ClearPageActive(page);
-               add_page_to_lru_list_tail(page, lruvec);
-               __count_vm_events(PGROTATED, thp_nr_pages(page));
+       struct folio *folio = page_folio(page);
+
+       if (!folio_test_unevictable(folio)) {
+               lruvec_del_folio(lruvec, folio);
+               folio_clear_active(folio);
+               lruvec_add_folio_tail(lruvec, folio);
+               __count_vm_events(PGROTATED, folio_nr_pages(folio));
        }
 }
 
@@ -227,23 +231,23 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
 }
 
 /*
- * Writeback is about to end against a page which has been marked for immediate
- * reclaim.  If it still appears to be reclaimable, move it to the tail of the
- * inactive list.
+ * Writeback is about to end against a folio which has been marked for
+ * immediate reclaim.  If it still appears to be reclaimable, move it
+ * to the tail of the inactive list.
  *
- * rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
+ * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
  */
-void rotate_reclaimable_page(struct page *page)
+void folio_rotate_reclaimable(struct folio *folio)
 {
-       if (!PageLocked(page) && !PageDirty(page) &&
-           !PageUnevictable(page) && PageLRU(page)) {
+       if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
+           !folio_test_unevictable(folio) && folio_test_lru(folio)) {
                struct pagevec *pvec;
                unsigned long flags;
 
-               get_page(page);
+               folio_get(folio);
                local_lock_irqsave(&lru_rotate.lock, flags);
                pvec = this_cpu_ptr(&lru_rotate.pvec);
-               if (pagevec_add_and_need_flush(pvec, page))
+               if (pagevec_add_and_need_flush(pvec, &folio->page))
                        pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
                local_unlock_irqrestore(&lru_rotate.lock, flags);
        }
@@ -289,21 +293,21 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
        } while ((lruvec = parent_lruvec(lruvec)));
 }
 
-void lru_note_cost_page(struct page *page)
+void lru_note_cost_folio(struct folio *folio)
 {
-       lru_note_cost(mem_cgroup_page_lruvec(page),
-                     page_is_file_lru(page), thp_nr_pages(page));
+       lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
+                       folio_nr_pages(folio));
 }
 
-static void __activate_page(struct page *page, struct lruvec *lruvec)
+static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
 {
-       if (!PageActive(page) && !PageUnevictable(page)) {
-               int nr_pages = thp_nr_pages(page);
+       if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
+               long nr_pages = folio_nr_pages(folio);
 
-               del_page_from_lru_list(page, lruvec);
-               SetPageActive(page);
-               add_page_to_lru_list(page, lruvec);
-               trace_mm_lru_activate(page);
+               lruvec_del_folio(lruvec, folio);
+               folio_set_active(folio);
+               lruvec_add_folio(lruvec, folio);
+               trace_mm_lru_activate(folio);
 
                __count_vm_events(PGACTIVATE, nr_pages);
                __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
@@ -312,6 +316,11 @@ static void __activate_page(struct page *page, struct lruvec *lruvec)
 }
 
 #ifdef CONFIG_SMP
+static void __activate_page(struct page *page, struct lruvec *lruvec)
+{
+       return __folio_activate(page_folio(page), lruvec);
+}
+
 static void activate_page_drain(int cpu)
 {
        struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
@@ -325,16 +334,16 @@ static bool need_activate_page_drain(int cpu)
        return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
 }
 
-static void activate_page(struct page *page)
+static void folio_activate(struct folio *folio)
 {
-       page = compound_head(page);
-       if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+       if (folio_test_lru(folio) && !folio_test_active(folio) &&
+           !folio_test_unevictable(folio)) {
                struct pagevec *pvec;
 
+               folio_get(folio);
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.activate_page);
-               get_page(page);
-               if (pagevec_add_and_need_flush(pvec, page))
+               if (pagevec_add_and_need_flush(pvec, &folio->page))
                        pagevec_lru_move_fn(pvec, __activate_page);
                local_unlock(&lru_pvecs.lock);
        }
@@ -345,21 +354,20 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
-static void activate_page(struct page *page)
+static void folio_activate(struct folio *folio)
 {
        struct lruvec *lruvec;
 
-       page = compound_head(page);
-       if (TestClearPageLRU(page)) {
-               lruvec = lock_page_lruvec_irq(page);
-               __activate_page(page, lruvec);
+       if (folio_test_clear_lru(folio)) {
+               lruvec = folio_lruvec_lock_irq(folio);
+               __folio_activate(folio, lruvec);
                unlock_page_lruvec_irq(lruvec);
-               SetPageLRU(page);
+               folio_set_lru(folio);
        }
 }
 #endif
 
-static void __lru_cache_activate_page(struct page *page)
+static void __lru_cache_activate_folio(struct folio *folio)
 {
        struct pagevec *pvec;
        int i;
@@ -380,8 +388,8 @@ static void __lru_cache_activate_page(struct page *page)
        for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
                struct page *pagevec_page = pvec->pages[i];
 
-               if (pagevec_page == page) {
-                       SetPageActive(page);
+               if (pagevec_page == &folio->page) {
+                       folio_set_active(folio);
                        break;
                }
        }
@@ -399,61 +407,59 @@ static void __lru_cache_activate_page(struct page *page)
  * When a newly allocated page is not yet visible, so safe for non-atomic ops,
  * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
  */
-void mark_page_accessed(struct page *page)
+void folio_mark_accessed(struct folio *folio)
 {
-       page = compound_head(page);
-
-       if (!PageReferenced(page)) {
-               SetPageReferenced(page);
-       } else if (PageUnevictable(page)) {
+       if (!folio_test_referenced(folio)) {
+               folio_set_referenced(folio);
+       } else if (folio_test_unevictable(folio)) {
                /*
                 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
                 * this list is never rotated or maintained, so marking an
                 * evictable page accessed has no effect.
                 */
-       } else if (!PageActive(page)) {
+       } else if (!folio_test_active(folio)) {
                /*
                 * If the page is on the LRU, queue it for activation via
                 * lru_pvecs.activate_page. Otherwise, assume the page is on a
                 * pagevec, mark it active and it'll be moved to the active
                 * LRU on the next drain.
                 */
-               if (PageLRU(page))
-                       activate_page(page);
+               if (folio_test_lru(folio))
+                       folio_activate(folio);
                else
-                       __lru_cache_activate_page(page);
-               ClearPageReferenced(page);
-               workingset_activation(page);
+                       __lru_cache_activate_folio(folio);
+               folio_clear_referenced(folio);
+               workingset_activation(folio);
        }
-       if (page_is_idle(page))
-               clear_page_idle(page);
+       if (folio_test_idle(folio))
+               folio_clear_idle(folio);
 }
-EXPORT_SYMBOL(mark_page_accessed);
+EXPORT_SYMBOL(folio_mark_accessed);
 
 /**
- * lru_cache_add - add a page to a page list
- * @page: the page to be added to the LRU.
+ * folio_add_lru - Add a folio to an LRU list.
+ * @folio: The folio to be added to the LRU.
  *
- * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * Queue the folio for addition to the LRU. The decision on whether
  * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * pagevec is drained. This gives a chance for the caller of folio_add_lru()
+ * to have the folio added to the active list using folio_mark_accessed().
  */
-void lru_cache_add(struct page *page)
+void folio_add_lru(struct folio *folio)
 {
        struct pagevec *pvec;
 
-       VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
-       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
-       get_page(page);
+       folio_get(folio);
        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_add);
-       if (pagevec_add_and_need_flush(pvec, page))
+       if (pagevec_add_and_need_flush(pvec, &folio->page))
                __pagevec_lru_add(pvec);
        local_unlock(&lru_pvecs.lock);
 }
-EXPORT_SYMBOL(lru_cache_add);
+EXPORT_SYMBOL(folio_add_lru);
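Taken together, the conversions above preserve the old lru_cache_add()/mark_page_accessed() flow, just folio-based: folio_add_lru() takes its own reference and defers the LRU insertion through the per-CPU pagevec, and folio_mark_accessed() handles the referenced-then-active promotion. A hedged sketch of how a new, not-yet-visible folio would typically use the pair (illustrative only):

/* Illustrative only: queue a freshly allocated folio for the LRU and mark
 * it accessed so a second touch promotes it to the active list. */
static void example_insert_new_folio(struct folio *folio)
{
        folio_add_lru(folio);           /* takes its own reference, deferred
                                         * via the per-CPU pagevec */
        folio_mark_accessed(folio);     /* sets the referenced flag; another
                                         * call would activate the folio */
}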
 
 /**
  * lru_cache_add_inactive_or_unevictable
@@ -888,11 +894,12 @@ void release_pages(struct page **pages, int nr)
        int i;
        LIST_HEAD(pages_to_free);
        struct lruvec *lruvec = NULL;
-       unsigned long flags;
+       unsigned long flags = 0;
        unsigned int lock_batch;
 
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];
+               struct folio *folio = page_folio(page);
 
                /*
                 * Make sure the IRQ-safe lock-holding time does not get
@@ -904,7 +911,7 @@ void release_pages(struct page **pages, int nr)
                        lruvec = NULL;
                }
 
-               page = compound_head(page);
+               page = &folio->page;
                if (is_huge_zero_page(page))
                        continue;
 
@@ -943,7 +950,7 @@ void release_pages(struct page **pages, int nr)
                if (PageLRU(page)) {
                        struct lruvec *prev_lruvec = lruvec;
 
-                       lruvec = relock_page_lruvec_irqsave(page, lruvec,
+                       lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
                                                                        &flags);
                        if (prev_lruvec != lruvec)
                                lock_batch = 0;
@@ -985,17 +992,18 @@ void __pagevec_release(struct pagevec *pvec)
 }
 EXPORT_SYMBOL(__pagevec_release);
 
-static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
+static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
 {
-       int was_unevictable = TestClearPageUnevictable(page);
-       int nr_pages = thp_nr_pages(page);
+       int was_unevictable = folio_test_clear_unevictable(folio);
+       long nr_pages = folio_nr_pages(folio);
 
-       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
        /*
-        * Page becomes evictable in two ways:
+        * A folio becomes evictable in two ways:
         * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
-        * 2) Before acquiring LRU lock to put the page to correct LRU and then
+        * 2) Before acquiring LRU lock to put the folio on the correct LRU
+        *    and then
         *   a) do PageLRU check with lock [check_move_unevictable_pages]
         *   b) do PageLRU check before lock [clear_page_mlock]
         *
@@ -1004,35 +1012,36 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
         *
         * #0: __pagevec_lru_add_fn             #1: clear_page_mlock
         *
-        * SetPageLRU()                         TestClearPageMlocked()
+        * folio_set_lru()                      folio_test_clear_mlocked()
         * smp_mb() // explicit ordering        // above provides strict
         *                                      // ordering
-        * PageMlocked()                        PageLRU()
+        * folio_test_mlocked()                 folio_test_lru()
         *
         *
-        * if '#1' does not observe setting of PG_lru by '#0' and fails
-        * isolation, the explicit barrier will make sure that page_evictable
-        * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
-        * can be reordered after PageMlocked check and can make '#1' to fail
-        * the isolation of the page whose Mlocked bit is cleared (#0 is also
-        * looking at the same page) and the evictable page will be stranded
-        * in an unevictable LRU.
+        * if '#1' does not observe setting of PG_lru by '#0' and
+        * fails isolation, the explicit barrier will make sure that
+        * folio_evictable check will put the folio on the correct
+        * LRU. Without smp_mb(), folio_set_lru() can be reordered
+        * after folio_test_mlocked() check and can make '#1' fail the
+        * isolation of the folio whose mlocked bit is cleared (#0 is
+        * also looking at the same folio) and the evictable folio will
+        * be stranded on an unevictable LRU.
         */
-       SetPageLRU(page);
+       folio_set_lru(folio);
        smp_mb__after_atomic();
 
-       if (page_evictable(page)) {
+       if (folio_evictable(folio)) {
                if (was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        } else {
-               ClearPageActive(page);
-               SetPageUnevictable(page);
+               folio_clear_active(folio);
+               folio_set_unevictable(folio);
                if (!was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
        }
 
-       add_page_to_lru_list(page, lruvec);
-       trace_mm_lru_insertion(page);
+       lruvec_add_folio(lruvec, folio);
+       trace_mm_lru_insertion(folio);
 }
 
 /*
@@ -1046,10 +1055,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
        unsigned long flags = 0;
 
        for (i = 0; i < pagevec_count(pvec); i++) {
-               struct page *page = pvec->pages[i];
+               struct folio *folio = page_folio(pvec->pages[i]);
 
-               lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
-               __pagevec_lru_add_fn(page, lruvec);
+               lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
+               __pagevec_lru_add_fn(folio, lruvec);
        }
        if (lruvec)
                unlock_page_lruvec_irqrestore(lruvec, flags);
index bc7cee6..8d41042 100644 (file)
@@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
        mem_cgroup_swapin_uncharge_swap(entry);
 
        if (shadow)
-               workingset_refault(page, shadow);
+               workingset_refault(page_folio(page), shadow);
 
        /* Caller will initiate read into locked page */
        lru_cache_add(page);
index 22d10f7..41c9e92 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 #include <linux/shmem_fs.h>
-#include <linux/blkdev.h>
+#include <linux/blk-cgroup.h>
 #include <linux/random.h>
 #include <linux/writeback.h>
 #include <linux/proc_fs.h>
@@ -3534,13 +3534,13 @@ struct swap_info_struct *page_swap_info(struct page *page)
 }
 
 /*
- * out-of-line __page_file_ methods to avoid include hell.
+ * out-of-line methods to avoid include hell.
  */
-struct address_space *__page_file_mapping(struct page *page)
+struct address_space *swapcache_mapping(struct folio *folio)
 {
-       return page_swap_info(page)->swap_file->f_mapping;
+       return page_swap_info(&folio->page)->swap_file->f_mapping;
 }
-EXPORT_SYMBOL_GPL(__page_file_mapping);
+EXPORT_SYMBOL_GPL(swapcache_mapping);
 
 pgoff_t __page_file_index(struct page *page)
 {
index 7a90084..36e5f6a 100644 (file)
@@ -164,7 +164,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
        __SetPageUptodate(page);
 
        ret = -ENOMEM;
-       if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
                goto out_release;
 
        ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
index bacabe4..e58151a 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -654,81 +654,78 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 }
 EXPORT_SYMBOL(kvrealloc);
 
-static inline void *__page_rmapping(struct page *page)
-{
-       unsigned long mapping;
-
-       mapping = (unsigned long)page->mapping;
-       mapping &= ~PAGE_MAPPING_FLAGS;
-
-       return (void *)mapping;
-}
-
 /* Neutral page->mapping pointer to address_space or anon_vma or other */
 void *page_rmapping(struct page *page)
 {
-       page = compound_head(page);
-       return __page_rmapping(page);
+       return folio_raw_mapping(page_folio(page));
 }
 
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
+/**
+ * folio_mapped - Is this folio mapped into userspace?
+ * @folio: The folio.
+ *
+ * Return: True if any page in this folio is referenced by user page tables.
  */
-bool page_mapped(struct page *page)
+bool folio_mapped(struct folio *folio)
 {
-       int i;
+       long i, nr;
 
-       if (likely(!PageCompound(page)))
-               return atomic_read(&page->_mapcount) >= 0;
-       page = compound_head(page);
-       if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+       if (folio_test_single(folio))
+               return atomic_read(&folio->_mapcount) >= 0;
+       if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
                return true;
-       if (PageHuge(page))
+       if (folio_test_hugetlb(folio))
                return false;
-       for (i = 0; i < compound_nr(page); i++) {
-               if (atomic_read(&page[i]._mapcount) >= 0)
+
+       nr = folio_nr_pages(folio);
+       for (i = 0; i < nr; i++) {
+               if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0)
                        return true;
        }
        return false;
 }
-EXPORT_SYMBOL(page_mapped);
+EXPORT_SYMBOL(folio_mapped);
 
 struct anon_vma *page_anon_vma(struct page *page)
 {
-       unsigned long mapping;
+       struct folio *folio = page_folio(page);
+       unsigned long mapping = (unsigned long)folio->mapping;
 
-       page = compound_head(page);
-       mapping = (unsigned long)page->mapping;
        if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                return NULL;
-       return __page_rmapping(page);
+       return (void *)(mapping - PAGE_MAPPING_ANON);
 }
 
-struct address_space *page_mapping(struct page *page)
+/**
+ * folio_mapping - Find the mapping where this folio is stored.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to.  Folios in the swap cache return the swap mapping
+ * this page is stored in (which is different from the mapping for the
+ * swap file or swap device where the data is stored).
+ *
+ * You can call this for folios which aren't in the swap cache or page
+ * cache and it will return NULL.
+ */
+struct address_space *folio_mapping(struct folio *folio)
 {
        struct address_space *mapping;
 
-       page = compound_head(page);
-
        /* This happens if someone calls flush_dcache_page on slab page */
-       if (unlikely(PageSlab(page)))
+       if (unlikely(folio_test_slab(folio)))
                return NULL;
 
-       if (unlikely(PageSwapCache(page))) {
-               swp_entry_t entry;
+       if (unlikely(folio_test_swapcache(folio)))
+               return swap_address_space(folio_swap_entry(folio));
 
-               entry.val = page_private(page);
-               return swap_address_space(entry);
-       }
-
-       mapping = page->mapping;
+       mapping = folio->mapping;
        if ((unsigned long)mapping & PAGE_MAPPING_ANON)
                return NULL;
 
        return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
 }
-EXPORT_SYMBOL(page_mapping);
+EXPORT_SYMBOL(folio_mapping);
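As the new kernel-doc above spells out, folio_mapping() distinguishes page-cache, swap-cache, anonymous and slab folios. A small sketch of a caller using it exactly per that contract (the helper name is hypothetical):

/* Sketch: true only for folios that belong to a regular page-cache mapping;
 * anon and slab folios give NULL, swap-cache folios give the swap
 * address_space rather than a file mapping. */
static bool example_is_pagecache_folio(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        return mapping && !folio_test_swapcache(folio);
}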
 
 /* Slow path of page_mapcount() for compound pages */
 int __page_mapcount(struct page *page)
@@ -750,13 +747,26 @@ int __page_mapcount(struct page *page)
 }
 EXPORT_SYMBOL_GPL(__page_mapcount);
 
-void copy_huge_page(struct page *dst, struct page *src)
+/**
+ * folio_copy - Copy the contents of one folio to another.
+ * @dst: Folio to copy to.
+ * @src: Folio to copy from.
+ *
+ * The bytes in the folio represented by @src are copied to @dst.
+ * Assumes the caller has validated that @dst is at least as large as @src.
+ * Can be called in atomic context for order-0 folios, but if the folio is
+ * larger, it may sleep.
+ */
+void folio_copy(struct folio *dst, struct folio *src)
 {
-       unsigned i, nr = compound_nr(src);
+       long i = 0;
+       long nr = folio_nr_pages(src);
 
-       for (i = 0; i < nr; i++) {
+       for (;;) {
+               copy_highpage(folio_page(dst, i), folio_page(src, i));
+               if (++i == nr)
+                       break;
                cond_resched();
-               copy_highpage(nth_page(dst, i), nth_page(src, i));
        }
 }
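folio_copy() above copies page by page and calls cond_resched() between pages (but not after the last one), so it can sleep for anything larger than order 0; per the kernel-doc, the size check is the caller's job. A minimal, hedged sketch of a migration-style caller:

/* Sketch only: copy the contents of src into an equally sized dst folio. */
static void example_copy_folio(struct folio *dst, struct folio *src)
{
        VM_BUG_ON_FOLIO(folio_nr_pages(dst) < folio_nr_pages(src), dst);

        folio_copy(dst, src);           /* may sleep if src spans many pages */
}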
 
@@ -1079,3 +1089,14 @@ void page_offline_end(void)
        up_write(&page_offline_rwsem);
 }
 EXPORT_SYMBOL(page_offline_end);
+
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+void flush_dcache_folio(struct folio *folio)
+{
+       long i, nr = folio_nr_pages(folio);
+
+       for (i = 0; i < nr; i++)
+               flush_dcache_page(folio_page(folio, i));
+}
+EXPORT_SYMBOL(flush_dcache_folio);
+#endif
index d77830f..e8a807c 100644 (file)
@@ -2816,6 +2816,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                unsigned int order, unsigned int nr_pages, struct page **pages)
 {
        unsigned int nr_allocated = 0;
+       struct page *page;
+       int i;
 
        /*
         * For order-0 pages we make use of bulk allocator, if
@@ -2823,7 +2825,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
         * to fails, fallback to a single page allocator that is
         * more permissive.
         */
-       if (!order) {
+       if (!order && nid != NUMA_NO_NODE) {
                while (nr_allocated < nr_pages) {
                        unsigned int nr, nr_pages_request;
 
@@ -2848,7 +2850,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                        if (nr != nr_pages_request)
                                break;
                }
-       } else
+       } else if (order)
                /*
                 * Compound pages required for remap_vmalloc_page if
                 * high-order pages.
@@ -2856,11 +2858,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                gfp |= __GFP_COMP;
 
        /* High-order pages or fallback path if "bulk" fails. */
-       while (nr_allocated < nr_pages) {
-               struct page *page;
-               int i;
 
-               page = alloc_pages_node(nid, gfp, order);
+       while (nr_allocated < nr_pages) {
+               if (nid == NUMA_NO_NODE)
+                       page = alloc_pages(gfp, order);
+               else
+                       page = alloc_pages_node(nid, gfp, order);
                if (unlikely(!page))
                        break;
 
index 74296c2..306229c 100644 (file)
@@ -2090,6 +2090,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  */
 int isolate_lru_page(struct page *page)
 {
+       struct folio *folio = page_folio(page);
        int ret = -EBUSY;
 
        VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2099,7 +2100,7 @@ int isolate_lru_page(struct page *page)
                struct lruvec *lruvec;
 
                get_page(page);
-               lruvec = lock_page_lruvec_irq(page);
+               lruvec = folio_lruvec_lock_irq(folio);
                del_page_from_lru_list(page, lruvec);
                unlock_page_lruvec_irq(lruvec);
                ret = 0;
@@ -2199,7 +2200,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
                 * All pages were isolated from the same lruvec (and isolation
                 * inhibits memcg migration).
                 */
-               VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
+               VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page);
                add_page_to_lru_list(page, lruvec);
                nr_pages = thp_nr_pages(page);
                nr_moved += nr_pages;
@@ -4665,6 +4666,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 
        for (i = 0; i < pvec->nr; i++) {
                struct page *page = pvec->pages[i];
+               struct folio *folio = page_folio(page);
                int nr_pages;
 
                if (PageTransTail(page))
@@ -4677,7 +4679,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
                if (!TestClearPageLRU(page))
                        continue;
 
-               lruvec = relock_page_lruvec_irq(page, lruvec);
+               lruvec = folio_lruvec_relock_irq(folio, lruvec);
                if (page_evictable(page) && PageUnevictable(page)) {
                        del_page_from_lru_list(page, lruvec);
                        ClearPageUnevictable(page);
index d5b81e4..109ab97 100644 (file)
@@ -273,17 +273,17 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
 }
 
 /**
- * workingset_refault - evaluate the refault of a previously evicted page
- * @page: the freshly allocated replacement page
- * @shadow: shadow entry of the evicted page
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
  *
  * Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the node and the memcg whose memory
+ * evicted folio in the context of the node and the memcg whose memory
  * pressure caused the eviction.
  */
-void workingset_refault(struct page *page, void *shadow)
+void workingset_refault(struct folio *folio, void *shadow)
 {
-       bool file = page_is_file_lru(page);
+       bool file = folio_is_file_lru(folio);
        struct mem_cgroup *eviction_memcg;
        struct lruvec *eviction_lruvec;
        unsigned long refault_distance;
@@ -295,16 +295,17 @@ void workingset_refault(struct page *page, void *shadow)
        unsigned long refault;
        bool workingset;
        int memcgid;
+       long nr;
 
        unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
 
        rcu_read_lock();
        /*
         * Look up the memcg associated with the stored ID. It might
-        * have been deleted since the page's eviction.
+        * have been deleted since the folio's eviction.
         *
         * Note that in rare events the ID could have been recycled
-        * for a new cgroup that refaults a shared page. This is
+        * for a new cgroup that refaults a shared folio. This is
         * impossible to tell from the available data. However, this
         * should be a rare and limited disturbance, and activations
         * are always speculative anyway. Ultimately, it's the aging
@@ -340,17 +341,18 @@ void workingset_refault(struct page *page, void *shadow)
        refault_distance = (refault - eviction) & EVICTION_MASK;
 
        /*
-        * The activation decision for this page is made at the level
+        * The activation decision for this folio is made at the level
         * where the eviction occurred, as that is where the LRU order
-        * during page reclaim is being determined.
+        * during folio reclaim is being determined.
         *
-        * However, the cgroup that will own the page is the one that
+        * However, the cgroup that will own the folio is the one that
         * is actually experiencing the refault event.
         */
-       memcg = page_memcg(page);
+       nr = folio_nr_pages(folio);
+       memcg = folio_memcg(folio);
        lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-       inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
+       mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
 
        mem_cgroup_flush_stats();
        /*
@@ -376,16 +378,16 @@ void workingset_refault(struct page *page, void *shadow)
        if (refault_distance > workingset_size)
                goto out;
 
-       SetPageActive(page);
-       workingset_age_nonresident(lruvec, thp_nr_pages(page));
-       inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
+       folio_set_active(folio);
+       workingset_age_nonresident(lruvec, nr);
+       mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
 
-       /* Page was active prior to eviction */
+       /* Folio was active prior to eviction */
        if (workingset) {
-               SetPageWorkingset(page);
+               folio_set_workingset(folio);
                /* XXX: Move to lru_cache_add() when it supports new vs putback */
-               lru_note_cost_page(page);
-               inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
+               lru_note_cost_folio(folio);
+               mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
        }
 out:
        rcu_read_unlock();
@@ -393,12 +395,11 @@ out:
 
 /**
  * workingset_activation - note a page activation
- * @page: page that is being activated
+ * @folio: Folio that is being activated.
  */
-void workingset_activation(struct page *page)
+void workingset_activation(struct folio *folio)
 {
        struct mem_cgroup *memcg;
-       struct lruvec *lruvec;
 
        rcu_read_lock();
        /*
@@ -408,11 +409,10 @@ void workingset_activation(struct page *page)
         * XXX: See workingset_refault() - this should return
         * root_mem_cgroup even for !CONFIG_MEMCG.
         */
-       memcg = page_memcg_rcu(page);
+       memcg = folio_memcg_rcu(folio);
        if (!mem_cgroup_disabled() && !memcg)
                goto out;
-       lruvec = mem_cgroup_page_lruvec(page);
-       workingset_age_nonresident(lruvec, thp_nr_pages(page));
+       workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
 out:
        rcu_read_unlock();
 }
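The activation test in workingset_refault() is unchanged by the folio conversion: the shadow entry encodes an eviction "timestamp", and the folio is re-activated only if fewer evictions have happened since then than the workingset can hold; what changes is that the lruvec counters are now bumped by folio_nr_pages() instead of assuming a single page. The arithmetic, isolated as an illustrative sketch (EVICTION_MASK and the names are those used in the function above):

/* Illustrative only: the refault-distance test done by workingset_refault(). */
static bool example_should_activate(unsigned long refault,
                                    unsigned long eviction,
                                    unsigned long workingset_size)
{
        unsigned long refault_distance = (refault - eviction) & EVICTION_MASK;

        return refault_distance <= workingset_size;
}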
index 1669744..1768784 100644 (file)
@@ -1560,10 +1560,14 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
                return 0;
 
        bat_priv->bla.claim_hash = batadv_hash_new(128);
-       bat_priv->bla.backbone_hash = batadv_hash_new(32);
+       if (!bat_priv->bla.claim_hash)
+               return -ENOMEM;
 
-       if (!bat_priv->bla.claim_hash || !bat_priv->bla.backbone_hash)
+       bat_priv->bla.backbone_hash = batadv_hash_new(32);
+       if (!bat_priv->bla.backbone_hash) {
+               batadv_hash_destroy(bat_priv->bla.claim_hash);
                return -ENOMEM;
+       }
 
        batadv_hash_set_lock_class(bat_priv->bla.claim_hash,
                                   &batadv_claim_hash_lock_class_key);
index 3ddd66e..5207cd8 100644 (file)
@@ -190,29 +190,41 @@ int batadv_mesh_init(struct net_device *soft_iface)
 
        bat_priv->gw.generation = 0;
 
-       ret = batadv_v_mesh_init(bat_priv);
-       if (ret < 0)
-               goto err;
-
        ret = batadv_originator_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_orig;
+       }
 
        ret = batadv_tt_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_tt;
+       }
+
+       ret = batadv_v_mesh_init(bat_priv);
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_v;
+       }
 
        ret = batadv_bla_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_bla;
+       }
 
        ret = batadv_dat_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_dat;
+       }
 
        ret = batadv_nc_mesh_init(bat_priv);
-       if (ret < 0)
-               goto err;
+       if (ret < 0) {
+               atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+               goto err_nc;
+       }
 
        batadv_gw_init(bat_priv);
        batadv_mcast_init(bat_priv);
@@ -222,8 +234,20 @@ int batadv_mesh_init(struct net_device *soft_iface)
 
        return 0;
 
-err:
-       batadv_mesh_free(soft_iface);
+err_nc:
+       batadv_dat_free(bat_priv);
+err_dat:
+       batadv_bla_free(bat_priv);
+err_bla:
+       batadv_v_mesh_free(bat_priv);
+err_v:
+       batadv_tt_free(bat_priv);
+err_tt:
+       batadv_originator_free(bat_priv);
+err_orig:
+       batadv_purge_outstanding_packets(bat_priv, NULL);
+       atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
+
        return ret;
 }
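batadv_mesh_init() now unwinds with one label per subsystem instead of calling the whole teardown path on any failure, which could touch structures that were never initialised. The general shape of that ladder, as a self-contained sketch with hypothetical init_a/init_b/free_a helpers (stubs included so it compiles; these are not batman-adv symbols):

static int init_a(void) { return 0; }   /* stand-in subsystem initialisers */
static int init_b(void) { return 0; }
static void free_a(void) { }

static int example_init(void)
{
        int ret;

        ret = init_a();
        if (ret)
                goto err_a;

        ret = init_b();
        if (ret)
                goto err_b;

        return 0;

err_b:
        free_a();       /* undo only the steps that already succeeded */
err_a:
        return ret;
}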
 
index 9f06132..0a7f1d3 100644 (file)
@@ -152,8 +152,10 @@ int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
                                   &batadv_nc_coding_hash_lock_class_key);
 
        bat_priv->nc.decoding_hash = batadv_hash_new(128);
-       if (!bat_priv->nc.decoding_hash)
+       if (!bat_priv->nc.decoding_hash) {
+               batadv_hash_destroy(bat_priv->nc.coding_hash);
                goto err;
+       }
 
        batadv_hash_set_lock_class(bat_priv->nc.decoding_hash,
                                   &batadv_nc_decoding_hash_lock_class_key);
index e0b3dac..4b7ad66 100644 (file)
@@ -4162,8 +4162,10 @@ int batadv_tt_init(struct batadv_priv *bat_priv)
                return ret;
 
        ret = batadv_tt_global_init(bat_priv);
-       if (ret < 0)
+       if (ret < 0) {
+               batadv_tt_local_table_free(bat_priv);
                return ret;
+       }
 
        batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1,
                                     batadv_tt_tvlv_unicast_handler_v1,
index 7ee9fec..eb3a366 100644 (file)
@@ -3163,6 +3163,12 @@ static u16 skb_tx_hash(const struct net_device *dev,
 
                qoffset = sb_dev->tc_to_txq[tc].offset;
                qcount = sb_dev->tc_to_txq[tc].count;
+               if (unlikely(!qcount)) {
+                       net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
+                                            sb_dev->name, qoffset, tc);
+                       qoffset = 0;
+                       qcount = dev->real_num_tx_queues;
+               }
        }
 
        if (skb_rx_queue_recorded(skb)) {
@@ -3906,7 +3912,8 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
        skb_reset_mac_header(skb);
        __skb_pull(skb, skb_network_offset(skb));
        skb->pkt_type = PACKET_LOOPBACK;
-       skb->ip_summed = CHECKSUM_UNNECESSARY;
+       if (skb->ip_summed == CHECKSUM_NONE)
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(skb));
        skb_dst_force(skb);
        netif_rx_ni(skb);
index f619777..b2e49eb 100644 (file)
@@ -1973,9 +1973,9 @@ int netdev_register_kobject(struct net_device *ndev)
 int netdev_change_owner(struct net_device *ndev, const struct net *net_old,
                        const struct net *net_new)
 {
+       kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID;
+       kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID;
        struct device *dev = &ndev->dev;
-       kuid_t old_uid, new_uid;
-       kgid_t old_gid, new_gid;
        int error;
 
        net_ns_get_ownership(net_old, &old_uid, &old_gid);
index 2170bea..fe93584 100644 (file)
@@ -80,6 +80,7 @@
 #include <linux/indirect_call_wrapper.h>
 
 #include "datagram.h"
+#include "sock_destructor.h"
 
 struct kmem_cache *skbuff_head_cache __ro_after_init;
 static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
@@ -1804,30 +1805,39 @@ EXPORT_SYMBOL(skb_realloc_headroom);
 struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
 {
        int delta = headroom - skb_headroom(skb);
+       int osize = skb_end_offset(skb);
+       struct sock *sk = skb->sk;
 
        if (WARN_ONCE(delta <= 0,
                      "%s is expecting an increase in the headroom", __func__))
                return skb;
 
-       /* pskb_expand_head() might crash, if skb is shared */
-       if (skb_shared(skb)) {
+       delta = SKB_DATA_ALIGN(delta);
+       /* pskb_expand_head() might crash, if skb is shared. */
+       if (skb_shared(skb) || !is_skb_wmem(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 
-               if (likely(nskb)) {
-                       if (skb->sk)
-                               skb_set_owner_w(nskb, skb->sk);
-                       consume_skb(skb);
-               } else {
-                       kfree_skb(skb);
-               }
+               if (unlikely(!nskb))
+                       goto fail;
+
+               if (sk)
+                       skb_set_owner_w(nskb, sk);
+               consume_skb(skb);
                skb = nskb;
        }
-       if (skb &&
-           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
-               kfree_skb(skb);
-               skb = NULL;
+       if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
+               goto fail;
+
+       if (sk && is_skb_wmem(skb)) {
+               delta = skb_end_offset(skb) - osize;
+               refcount_add(delta, &sk->sk_wmem_alloc);
+               skb->truesize += delta;
        }
        return skb;
+
+fail:
+       kfree_skb(skb);
+       return NULL;
 }
 EXPORT_SYMBOL(skb_expand_head);
 
index 2d6249b..a86ef7e 100644 (file)
@@ -474,6 +474,20 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 }
 EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
 
+bool sk_msg_is_readable(struct sock *sk)
+{
+       struct sk_psock *psock;
+       bool empty = true;
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (likely(psock))
+               empty = list_empty(&psock->ingress_msg);
+       rcu_read_unlock();
+       return !empty;
+}
+EXPORT_SYMBOL_GPL(sk_msg_is_readable);
+
 static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
                                                  struct sk_buff *skb)
 {
diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h
new file mode 100644 (file)
index 0000000..2f396e6
--- /dev/null
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_SOCK_DESTRUCTOR_H
+#define _NET_CORE_SOCK_DESTRUCTOR_H
+#include <net/tcp.h>
+
+static inline bool is_skb_wmem(const struct sk_buff *skb)
+{
+       return skb->destructor == sock_wfree ||
+              skb->destructor == __sock_wfree ||
+              (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);
+}
+#endif
index c8496c1..5f88526 100644 (file)
@@ -419,7 +419,7 @@ static struct ctl_table net_core_table[] = {
                .mode           = 0600,
                .proc_handler   = proc_dolongvec_minmax_bpf_restricted,
                .extra1         = &long_one,
-               .extra2         = &long_max,
+               .extra2         = &bpf_jit_limit_max,
        },
 #endif
        {
index e8b48df..f5c336f 100644 (file)
@@ -486,10 +486,7 @@ static bool tcp_stream_is_readable(struct sock *sk, int target)
 {
        if (tcp_epollin_ready(sk, target))
                return true;
-
-       if (sk->sk_prot->stream_memory_read)
-               return sk->sk_prot->stream_memory_read(sk);
-       return false;
+       return sk_is_readable(sk);
 }
 
 /*
index d3e9386..5f4d6f4 100644 (file)
@@ -150,19 +150,6 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
 EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
 
 #ifdef CONFIG_BPF_SYSCALL
-static bool tcp_bpf_stream_read(const struct sock *sk)
-{
-       struct sk_psock *psock;
-       bool empty = true;
-
-       rcu_read_lock();
-       psock = sk_psock(sk);
-       if (likely(psock))
-               empty = list_empty(&psock->ingress_msg);
-       rcu_read_unlock();
-       return !empty;
-}
-
 static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
                             long timeo)
 {
@@ -232,6 +219,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
        bool cork = false, enospc = sk_msg_full(msg);
        struct sock *sk_redir;
        u32 tosend, delta = 0;
+       u32 eval = __SK_NONE;
        int ret;
 
 more_data:
@@ -275,13 +263,24 @@ more_data:
        case __SK_REDIRECT:
                sk_redir = psock->sk_redir;
                sk_msg_apply_bytes(psock, tosend);
+               if (!psock->apply_bytes) {
+                       /* Clean up before releasing the sock lock. */
+                       eval = psock->eval;
+                       psock->eval = __SK_NONE;
+                       psock->sk_redir = NULL;
+               }
                if (psock->cork) {
                        cork = true;
                        psock->cork = NULL;
                }
                sk_msg_return(sk, msg, tosend);
                release_sock(sk);
+
                ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+
+               if (eval == __SK_REDIRECT)
+                       sock_put(sk_redir);
+
                lock_sock(sk);
                if (unlikely(ret < 0)) {
                        int free = sk_msg_free_nocharge(sk, msg);
@@ -479,7 +478,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
        prot[TCP_BPF_BASE].unhash               = sock_map_unhash;
        prot[TCP_BPF_BASE].close                = sock_map_close;
        prot[TCP_BPF_BASE].recvmsg              = tcp_bpf_recvmsg;
-       prot[TCP_BPF_BASE].stream_memory_read   = tcp_bpf_stream_read;
+       prot[TCP_BPF_BASE].sock_is_readable     = sk_msg_is_readable;
 
        prot[TCP_BPF_TX]                        = prot[TCP_BPF_BASE];
        prot[TCP_BPF_TX].sendmsg                = tcp_bpf_sendmsg;
index 8536b2a..2fffcf2 100644 (file)
@@ -2867,6 +2867,9 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
            !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
                mask &= ~(EPOLLIN | EPOLLRDNORM);
 
+       /* psock ingress_msg queue should not contain any bad checksum frames */
+       if (sk_is_readable(sk))
+               mask |= EPOLLIN | EPOLLRDNORM;
        return mask;
 
 }
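The udp_poll() hunk above relies on the new ->sock_is_readable hook: when a BPF verdict program has parsed data into psock->ingress_msg, sk_is_readable() reports it even though the normal receive queue is empty, so poll()/epoll wake up. Wiring the hook looks the same for every protocol; the udp_bpf hunk below does exactly this, sketched here in isolation:

/* Sketch: a sockmap proto template points ->sock_is_readable at the generic
 * psock helper (sk_msg_is_readable() was added earlier in this series). */
static void example_rebuild_proto(struct proto *prot, const struct proto *base)
{
        *prot = *base;
        prot->sock_is_readable = sk_msg_is_readable;
}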
index 7a1d5f4..bbe6569 100644 (file)
@@ -114,6 +114,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
        *prot        = *base;
        prot->close  = sock_map_close;
        prot->recvmsg = udp_bpf_recvmsg;
+       prot->sock_is_readable = sk_msg_is_readable;
 }
 
 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
index 97095b7..5dcfd53 100644 (file)
@@ -672,7 +672,7 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata,
                                 u8 *ie, u8 ie_len)
 {
        struct ieee80211_supported_band *sband;
-       const u8 *cap;
+       const struct element *cap;
        const struct ieee80211_he_operation *he_oper = NULL;
 
        sband = ieee80211_get_sband(sdata);
@@ -687,9 +687,10 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata,
 
        sdata->vif.bss_conf.he_support = true;
 
-       cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ie, ie_len);
-       if (cap && cap[1] >= ieee80211_he_oper_size(&cap[3]))
-               he_oper = (void *)(cap + 3);
+       cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ie_len);
+       if (cap && cap->datalen >= 1 + sizeof(*he_oper) &&
+           cap->datalen >= 1 + ieee80211_he_oper_size(cap->data + 1))
+               he_oper = (void *)(cap->data + 1);
 
        if (he_oper)
                sdata->vif.bss_conf.he_oper.params =
index c41273c..f0f22eb 100644 (file)
@@ -485,11 +485,11 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                mpext = mptcp_get_ext(skb);
                data_len = mpext ? mpext->data_len : 0;
 
-               /* we will check ext_copy.data_len in mptcp_write_options() to
+               /* we will check opts->data_len in mptcp_write_options() to
                 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
                 * TCPOLEN_MPTCP_MPC_ACK
                 */
-               opts->ext_copy.data_len = data_len;
+               opts->data_len = data_len;
                opts->suboptions = OPTION_MPTCP_MPC_ACK;
                opts->sndr_key = subflow->local_key;
                opts->rcvr_key = subflow->remote_key;
@@ -505,9 +505,9 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                        len = TCPOLEN_MPTCP_MPC_ACK_DATA;
                        if (opts->csum_reqd) {
                                /* we need to propagate more info to csum the pseudo hdr */
-                               opts->ext_copy.data_seq = mpext->data_seq;
-                               opts->ext_copy.subflow_seq = mpext->subflow_seq;
-                               opts->ext_copy.csum = mpext->csum;
+                               opts->data_seq = mpext->data_seq;
+                               opts->subflow_seq = mpext->subflow_seq;
+                               opts->csum = mpext->csum;
                                len += TCPOLEN_MPTCP_DSS_CHECKSUM;
                        }
                        *size = ALIGN(len, 4);
@@ -1227,7 +1227,7 @@ static void mptcp_set_rwin(const struct tcp_sock *tp)
                WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
 }
 
-static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
+static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum16 sum)
 {
        struct csum_pseudo_header header;
        __wsum csum;
@@ -1237,15 +1237,21 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
         * always the 64-bit value, irrespective of what length is used in the
         * DSS option itself.
         */
-       header.data_seq = cpu_to_be64(mpext->data_seq);
-       header.subflow_seq = htonl(mpext->subflow_seq);
-       header.data_len = htons(mpext->data_len);
+       header.data_seq = cpu_to_be64(data_seq);
+       header.subflow_seq = htonl(subflow_seq);
+       header.data_len = htons(data_len);
        header.csum = 0;
 
-       csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum));
+       csum = csum_partial(&header, sizeof(header), ~csum_unfold(sum));
        return (__force u16)csum_fold(csum);
 }
 
+static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
+{
+       return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len,
+                                mpext->csum);
+}
+
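__mptcp_make_csum() above builds the DSS pseudo-header from plain scalars, now that data_seq/subflow_seq/data_len/csum are carried as fields of mptcp_out_options rather than inside an embedded mptcp_ext, and then folds the 32-bit partial sum into a 16-bit one's-complement checksum via csum_fold(). The fold step itself, shown as a standalone illustration (plain C, not the kernel helpers):

/* Illustration only: collapse a 32-bit end-around-carry sum into the 16-bit
 * one's-complement checksum, which is what csum_fold() performs. */
static unsigned short example_csum_fold(unsigned int sum)
{
        sum = (sum & 0xffff) + (sum >> 16);     /* fold the high half in */
        sum = (sum & 0xffff) + (sum >> 16);     /* absorb a possible carry */
        return (unsigned short)~sum;
}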
 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
                         struct mptcp_out_options *opts)
 {
@@ -1337,7 +1343,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
                        len = TCPOLEN_MPTCP_MPC_SYN;
                } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) {
                        len = TCPOLEN_MPTCP_MPC_SYNACK;
-               } else if (opts->ext_copy.data_len) {
+               } else if (opts->data_len) {
                        len = TCPOLEN_MPTCP_MPC_ACK_DATA;
                        if (opts->csum_reqd)
                                len += TCPOLEN_MPTCP_DSS_CHECKSUM;
@@ -1366,14 +1372,17 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
 
                put_unaligned_be64(opts->rcvr_key, ptr);
                ptr += 2;
-               if (!opts->ext_copy.data_len)
+               if (!opts->data_len)
                        goto mp_capable_done;
 
                if (opts->csum_reqd) {
-                       put_unaligned_be32(opts->ext_copy.data_len << 16 |
-                                          mptcp_make_csum(&opts->ext_copy), ptr);
+                       put_unaligned_be32(opts->data_len << 16 |
+                                          __mptcp_make_csum(opts->data_seq,
+                                                            opts->subflow_seq,
+                                                            opts->data_len,
+                                                            opts->csum), ptr);
                } else {
-                       put_unaligned_be32(opts->ext_copy.data_len << 16 |
+                       put_unaligned_be32(opts->data_len << 16 |
                                           TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
                }
                ptr += 1;
index 32df65f..fb3da4d 100644 (file)
@@ -156,6 +156,12 @@ static enum sctp_disposition __sctp_sf_do_9_1_abort(
                                        void *arg,
                                        struct sctp_cmd_seq *commands);
 
+static enum sctp_disposition
+__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
+                          const struct sctp_association *asoc,
+                          const union sctp_subtype type, void *arg,
+                          struct sctp_cmd_seq *commands);
+
 /* Small helper function that checks if the chunk length
  * is of the appropriate length.  The 'required_length' argument
  * is set to be the size of a specific chunk we are testing.
@@ -337,6 +343,14 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net,
        if (!chunk->singleton)
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
+       /* Make sure that the INIT chunk has a valid length.
+        * Normally, this would cause an ABORT with a Protocol Violation
+        * error, but since we don't have an association, we'll
+        * just discard the packet.
+        */
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* If the packet is an OOTB packet which is temporarily on the
         * control endpoint, respond with an ABORT.
         */
@@ -351,14 +365,6 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net,
        if (chunk->sctp_hdr->vtag != 0)
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
 
-       /* Make sure that the INIT chunk has a valid length.
-        * Normally, this would cause an ABORT with a Protocol Violation
-        * error, but since we don't have an association, we'll
-        * just discard the packet.
-        */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
-               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-
        /* If the INIT is coming toward a closing socket, we'll send back
         * an ABORT.  Essentially, this catches the race of INIT being
         * backlogged to the socket at the same time as the user issues close().
@@ -704,6 +710,9 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
        struct sock *sk;
        int error = 0;
 
+       if (asoc && !sctp_vtag_verify(chunk, asoc))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* If the packet is an OOTB packet which is temporarily on the
         * control endpoint, respond with an ABORT.
         */
@@ -718,7 +727,8 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
         * in sctp_unpack_cookie().
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
-               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+                                                 commands);
 
        /* If the endpoint is not listening or if the number of associations
         * on the TCP-style socket exceeds the max backlog, respond with an
@@ -1524,20 +1534,16 @@ static enum sctp_disposition sctp_sf_do_unexpected_init(
        if (!chunk->singleton)
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
+       /* Make sure that the INIT chunk has a valid length. */
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* 3.1 A packet containing an INIT chunk MUST have a zero Verification
         * Tag.
         */
        if (chunk->sctp_hdr->vtag != 0)
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
 
-       /* Make sure that the INIT chunk has a valid length.
-        * In this case, we generate a protocol violation since we have
-        * an association established.
-        */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
-               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-                                                 commands);
-
        if (SCTP_INPUT_CB(chunk->skb)->encap_port != chunk->transport->encap_port)
                return sctp_sf_new_encap_port(net, ep, asoc, type, arg, commands);
 
@@ -1882,9 +1888,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
         * its peer.
        */
        if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) {
-               disposition = sctp_sf_do_9_2_reshutack(net, ep, asoc,
-                               SCTP_ST_CHUNK(chunk->chunk_hdr->type),
-                               chunk, commands);
+               disposition = __sctp_sf_do_9_2_reshutack(net, ep, asoc,
+                                                        SCTP_ST_CHUNK(chunk->chunk_hdr->type),
+                                                        chunk, commands);
                if (SCTP_DISPOSITION_NOMEM == disposition)
                        goto nomem;
 
@@ -2202,9 +2208,11 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook(
         * enough for the chunk header.  Cookie length verification is
         * done later.
         */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
-               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-                                                 commands);
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) {
+               if (!sctp_vtag_verify(chunk, asoc))
+                       asoc = NULL;
+               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands);
+       }
 
        /* "Decode" the chunk.  We have no optional parameters so we
         * are in good shape.
@@ -2341,7 +2349,7 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort(
         */
        if (SCTP_ADDR_DEL ==
                    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        if (!sctp_err_chunk_valid(chunk))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
@@ -2387,7 +2395,7 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort(
         */
        if (SCTP_ADDR_DEL ==
                    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        if (!sctp_err_chunk_valid(chunk))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
@@ -2657,7 +2665,7 @@ enum sctp_disposition sctp_sf_do_9_1_abort(
         */
        if (SCTP_ADDR_DEL ==
                    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        if (!sctp_err_chunk_valid(chunk))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
@@ -2970,13 +2978,11 @@ enum sctp_disposition sctp_sf_do_9_2_shut_ctsn(
  * that belong to this association, it should discard the INIT chunk and
  * retransmit the SHUTDOWN ACK chunk.
  */
-enum sctp_disposition sctp_sf_do_9_2_reshutack(
-                                       struct net *net,
-                                       const struct sctp_endpoint *ep,
-                                       const struct sctp_association *asoc,
-                                       const union sctp_subtype type,
-                                       void *arg,
-                                       struct sctp_cmd_seq *commands)
+static enum sctp_disposition
+__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
+                          const struct sctp_association *asoc,
+                          const union sctp_subtype type, void *arg,
+                          struct sctp_cmd_seq *commands)
 {
        struct sctp_chunk *chunk = arg;
        struct sctp_chunk *reply;
@@ -3010,6 +3016,26 @@ nomem:
        return SCTP_DISPOSITION_NOMEM;
 }
 
+enum sctp_disposition
+sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
+                        const struct sctp_association *asoc,
+                        const union sctp_subtype type, void *arg,
+                        struct sctp_cmd_seq *commands)
+{
+       struct sctp_chunk *chunk = arg;
+
+       if (!chunk->singleton)
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+       if (chunk->sctp_hdr->vtag != 0)
+               return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+
+       return __sctp_sf_do_9_2_reshutack(net, ep, asoc, type, arg, commands);
+}
+
 /*
  * sctp_sf_do_ecn_cwr
  *
@@ -3662,6 +3688,9 @@ enum sctp_disposition sctp_sf_ootb(struct net *net,
 
        SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
 
+       if (asoc && !sctp_vtag_verify(chunk, asoc))
+               asoc = NULL;
+
        ch = (struct sctp_chunkhdr *)chunk->chunk_hdr;
        do {
                /* Report violation if the chunk is less then minimal */
@@ -3777,12 +3806,6 @@ static enum sctp_disposition sctp_sf_shut_8_4_5(
 
        SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
 
-       /* If the chunk length is invalid, we don't want to process
-        * the reset of the packet.
-        */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
-               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-
        /* We need to discard the rest of the packet to prevent
         * potential bombing attacks from additional bundled chunks.
         * This is documented in SCTP Threats ID.
@@ -3810,6 +3833,9 @@ enum sctp_disposition sctp_sf_do_8_5_1_E_sa(struct net *net,
 {
        struct sctp_chunk *chunk = arg;
 
+       if (!sctp_vtag_verify(chunk, asoc))
+               asoc = NULL;
+
        /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3845,6 +3871,11 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net,
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }
 
+       /* Make sure that the ASCONF ADDIP chunk has a valid length.  */
+       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk)))
+               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+                                                 commands);
+
        /* ADD-IP: Section 4.1.1
         * This chunk MUST be sent in an authenticated way by using
         * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
@@ -3853,13 +3884,7 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net,
         */
        if (!asoc->peer.asconf_capable ||
            (!net->sctp.addip_noauth && !chunk->auth))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg,
-                                            commands);
-
-       /* Make sure that the ASCONF ADDIP chunk has a valid length.  */
-       if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk)))
-               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-                                                 commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        hdr = (struct sctp_addiphdr *)chunk->skb->data;
        serial = ntohl(hdr->serial);
@@ -3988,6 +4013,12 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net,
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }
 
+       /* Make sure that the ADDIP chunk has a valid length.  */
+       if (!sctp_chunk_length_valid(asconf_ack,
+                                    sizeof(struct sctp_addip_chunk)))
+               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+                                                 commands);
+
        /* ADD-IP, Section 4.1.2:
         * This chunk MUST be sent in an authenticated way by using
         * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
@@ -3996,14 +4027,7 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net,
         */
        if (!asoc->peer.asconf_capable ||
            (!net->sctp.addip_noauth && !asconf_ack->auth))
-               return sctp_sf_discard_chunk(net, ep, asoc, type, arg,
-                                            commands);
-
-       /* Make sure that the ADDIP chunk has a valid length.  */
-       if (!sctp_chunk_length_valid(asconf_ack,
-                                    sizeof(struct sctp_addip_chunk)))
-               return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-                                                 commands);
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 
        addip_hdr = (struct sctp_addiphdr *)asconf_ack->skb->data;
        rcvd_serial = ntohl(addip_hdr->serial);
@@ -4575,6 +4599,9 @@ enum sctp_disposition sctp_sf_discard_chunk(struct net *net,
 {
        struct sctp_chunk *chunk = arg;
 
+       if (asoc && !sctp_vtag_verify(chunk, asoc))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* Make sure that the chunk has a valid length.
         * Since we don't know the chunk type, we use a general
         * chunkhdr structure to make a comparison.
@@ -4642,6 +4669,9 @@ enum sctp_disposition sctp_sf_violation(struct net *net,
 {
        struct sctp_chunk *chunk = arg;
 
+       if (!sctp_vtag_verify(chunk, asoc))
+               return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
        /* Make sure that the chunk has a valid length. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -6348,6 +6378,7 @@ static struct sctp_packet *sctp_ootb_pkt_new(
                 * yet.
                 */
                switch (chunk->chunk_hdr->type) {
+               case SCTP_CID_INIT:
                case SCTP_CID_INIT_ACK:
                {
                        struct sctp_initack_chunk *initack;
index c038efc..78b663d 100644 (file)
@@ -1057,7 +1057,7 @@ static void smc_connect_work(struct work_struct *work)
        if (smc->clcsock->sk->sk_err) {
                smc->sk.sk_err = smc->clcsock->sk->sk_err;
        } else if ((1 << smc->clcsock->sk->sk_state) &
-                                       (TCPF_SYN_SENT | TCP_SYN_RECV)) {
+                                       (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
                if ((rc == -EPIPE) &&
                    ((1 << smc->clcsock->sk->sk_state) &
index 72f4b72..f1d3234 100644 (file)
@@ -1822,7 +1822,7 @@ void smc_llc_link_active(struct smc_link *link)
                            link->smcibdev->ibdev->name, link->ibport);
        link->state = SMC_LNK_ACTIVE;
        if (link->lgr->llc_testlink_time) {
-               link->llc_testlink_time = link->lgr->llc_testlink_time * HZ;
+               link->llc_testlink_time = link->lgr->llc_testlink_time;
                schedule_delayed_work(&link->llc_testlink_wrk,
                                      link->llc_testlink_time);
        }
index c9391d3..dc60c32 100644 (file)
@@ -2285,43 +2285,53 @@ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr)
        u16 key_gen = msg_key_gen(hdr);
        u16 size = msg_data_sz(hdr);
        u8 *data = msg_data(hdr);
+       unsigned int keylen;
+
+       /* Verify whether the size can exist in the packet */
+       if (unlikely(size < sizeof(struct tipc_aead_key) + TIPC_AEAD_KEYLEN_MIN)) {
+               pr_debug("%s: message data size is too small\n", rx->name);
+               goto exit;
+       }
+
+       keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME)));
+
+       /* Verify the supplied size values */
+       if (unlikely(size != keylen + sizeof(struct tipc_aead_key) ||
+                    keylen > TIPC_AEAD_KEY_SIZE_MAX)) {
+               pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name);
+               goto exit;
+       }
 
        spin_lock(&rx->lock);
        if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) {
                pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name,
                       rx->skey, key_gen, rx->key_gen);
-               goto exit;
+               goto exit_unlock;
        }
 
        /* Allocate memory for the key */
        skey = kmalloc(size, GFP_ATOMIC);
        if (unlikely(!skey)) {
                pr_err("%s: unable to allocate memory for skey\n", rx->name);
-               goto exit;
+               goto exit_unlock;
        }
 
        /* Copy key from msg data */
-       skey->keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME)));
+       skey->keylen = keylen;
        memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME);
        memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32),
               skey->keylen);
 
-       /* Sanity check */
-       if (unlikely(size != tipc_aead_key_size(skey))) {
-               kfree(skey);
-               skey = NULL;
-               goto exit;
-       }
-
        rx->key_gen = key_gen;
        rx->skey_mode = msg_key_mode(hdr);
        rx->skey = skey;
        rx->nokey = 0;
        mb(); /* for nokey flag */
 
-exit:
+exit_unlock:
        spin_unlock(&rx->lock);
 
+exit:
        /* Schedule the key attaching on this crypto */
        if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0)))
                return true;
index fde56ff..9ab81db 100644 (file)
@@ -681,12 +681,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 
        prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
        prot[TLS_BASE][TLS_SW].recvmsg            = tls_sw_recvmsg;
-       prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
+       prot[TLS_BASE][TLS_SW].sock_is_readable   = tls_sw_sock_is_readable;
        prot[TLS_BASE][TLS_SW].close              = tls_sk_proto_close;
 
        prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
        prot[TLS_SW][TLS_SW].recvmsg            = tls_sw_recvmsg;
-       prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read;
+       prot[TLS_SW][TLS_SW].sock_is_readable   = tls_sw_sock_is_readable;
        prot[TLS_SW][TLS_SW].close              = tls_sk_proto_close;
 
 #ifdef CONFIG_TLS_DEVICE
index 4feb95e..1b08b87 100644 (file)
@@ -35,6 +35,7 @@
  * SOFTWARE.
  */
 
+#include <linux/bug.h>
 #include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <linux/splice.h>
 #include <net/strparser.h>
 #include <net/tls.h>
 
+noinline void tls_err_abort(struct sock *sk, int err)
+{
+       WARN_ON_ONCE(err >= 0);
+       /* sk->sk_err should contain a positive error code. */
+       sk->sk_err = -err;
+       sk_error_report(sk);
+}
+
 static int __skb_nsg(struct sk_buff *skb, int offset, int len,
                      unsigned int recursion_level)
 {
@@ -419,7 +428,7 @@ int tls_tx_records(struct sock *sk, int flags)
 
 tx_err:
        if (rc < 0 && rc != -EAGAIN)
-               tls_err_abort(sk, EBADMSG);
+               tls_err_abort(sk, -EBADMSG);
 
        return rc;
 }
@@ -450,7 +459,7 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
 
                /* If err is already set on socket, return the same code */
                if (sk->sk_err) {
-                       ctx->async_wait.err = sk->sk_err;
+                       ctx->async_wait.err = -sk->sk_err;
                } else {
                        ctx->async_wait.err = err;
                        tls_err_abort(sk, err);
@@ -763,7 +772,7 @@ static int tls_push_record(struct sock *sk, int flags,
                               msg_pl->sg.size + prot->tail_size, i);
        if (rc < 0) {
                if (rc != -EINPROGRESS) {
-                       tls_err_abort(sk, EBADMSG);
+                       tls_err_abort(sk, -EBADMSG);
                        if (split) {
                                tls_ctx->pending_open_record_frags = true;
                                tls_merge_open_record(sk, rec, tmp, orig_end);
@@ -1827,7 +1836,7 @@ int tls_sw_recvmsg(struct sock *sk,
                err = decrypt_skb_update(sk, skb, &msg->msg_iter,
                                         &chunk, &zc, async_capable);
                if (err < 0 && err != -EINPROGRESS) {
-                       tls_err_abort(sk, EBADMSG);
+                       tls_err_abort(sk, -EBADMSG);
                        goto recv_end;
                }
 
@@ -2007,7 +2016,7 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
                }
 
                if (err < 0) {
-                       tls_err_abort(sk, EBADMSG);
+                       tls_err_abort(sk, -EBADMSG);
                        goto splice_read_end;
                }
                ctx->decrypted = 1;
@@ -2026,7 +2035,7 @@ splice_read_end:
        return copied ? : err;
 }
 
-bool tls_sw_stream_read(const struct sock *sk)
+bool tls_sw_sock_is_readable(struct sock *sk)
 {
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
index 89f9e85..78e08e8 100644 (file)
@@ -3052,6 +3052,8 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
        /* readable? */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                mask |= EPOLLIN | EPOLLRDNORM;
+       if (sk_is_readable(sk))
+               mask |= EPOLLIN | EPOLLRDNORM;
 
        /* Connection-based need to check for termination and startup */
        if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
@@ -3091,6 +3093,8 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
        /* readable? */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                mask |= EPOLLIN | EPOLLRDNORM;
+       if (sk_is_readable(sk))
+               mask |= EPOLLIN | EPOLLRDNORM;
 
        /* Connection-based need to check for termination and startup */
        if (sk->sk_type == SOCK_SEQPACKET) {
index b927e2b..452376c 100644 (file)
@@ -102,6 +102,7 @@ static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto
        *prot        = *base;
        prot->close  = sock_map_close;
        prot->recvmsg = unix_bpf_recvmsg;
+       prot->sock_is_readable = sk_msg_is_readable;
 }
 
 static void unix_stream_bpf_rebuild_protos(struct proto *prot,
@@ -110,6 +111,7 @@ static void unix_stream_bpf_rebuild_protos(struct proto *prot,
        *prot        = *base;
        prot->close  = sock_map_close;
        prot->recvmsg = unix_bpf_recvmsg;
+       prot->sock_is_readable = sk_msg_is_readable;
        prot->unhash  = sock_map_unhash;
 }
 
index 0332312..aaba847 100644 (file)
@@ -524,6 +524,7 @@ use_default_name:
        INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk);
        INIT_WORK(&rdev->mgmt_registrations_update_wk,
                  cfg80211_mgmt_registrations_update_wk);
+       spin_lock_init(&rdev->mgmt_registrations_lock);
 
 #ifdef CONFIG_CFG80211_DEFAULT_PS
        rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
@@ -1279,7 +1280,6 @@ void cfg80211_init_wdev(struct wireless_dev *wdev)
        INIT_LIST_HEAD(&wdev->event_list);
        spin_lock_init(&wdev->event_lock);
        INIT_LIST_HEAD(&wdev->mgmt_registrations);
-       spin_lock_init(&wdev->mgmt_registrations_lock);
        INIT_LIST_HEAD(&wdev->pmsr_list);
        spin_lock_init(&wdev->pmsr_lock);
        INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk);
index b35d0db..1720abf 100644 (file)
@@ -100,6 +100,8 @@ struct cfg80211_registered_device {
        struct work_struct propagate_cac_done_wk;
 
        struct work_struct mgmt_registrations_update_wk;
+       /* lock for all wdev lists */
+       spinlock_t mgmt_registrations_lock;
 
        /* must be last because of the way we do wiphy_priv(),
         * and it should at least be aligned to NETDEV_ALIGN */
index 3aa69b3..783acd2 100644 (file)
@@ -452,9 +452,9 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev)
 
        lockdep_assert_held(&rdev->wiphy.mtx);
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
        if (!wdev->mgmt_registrations_need_update) {
-               spin_unlock_bh(&wdev->mgmt_registrations_lock);
+               spin_unlock_bh(&rdev->mgmt_registrations_lock);
                return;
        }
 
@@ -479,7 +479,7 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev)
        rcu_read_unlock();
 
        wdev->mgmt_registrations_need_update = 0;
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        rdev_update_mgmt_frame_registrations(rdev, wdev, &upd);
 }
@@ -503,6 +503,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
                                int match_len, bool multicast_rx,
                                struct netlink_ext_ack *extack)
 {
+       struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_mgmt_registration *reg, *nreg;
        int err = 0;
        u16 mgmt_type;
@@ -548,7 +549,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
        if (!nreg)
                return -ENOMEM;
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
 
        list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
                int mlen = min(match_len, reg->match_len);
@@ -583,7 +584,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
                list_add(&nreg->list, &wdev->mgmt_registrations);
        }
        wdev->mgmt_registrations_need_update = 1;
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        cfg80211_mgmt_registrations_update(wdev);
 
@@ -591,7 +592,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
 
  out:
        kfree(nreg);
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        return err;
 }
@@ -602,7 +603,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_mgmt_registration *reg, *tmp;
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
 
        list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
                if (reg->nlportid != nlportid)
@@ -615,7 +616,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
                schedule_work(&rdev->mgmt_registrations_update_wk);
        }
 
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        if (nlportid && rdev->crit_proto_nlportid == nlportid) {
                rdev->crit_proto_nlportid = 0;
@@ -628,15 +629,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
 
 void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
 {
+       struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_mgmt_registration *reg, *tmp;
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
        list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
                list_del(&reg->list);
                kfree(reg);
        }
        wdev->mgmt_registrations_need_update = 1;
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        cfg80211_mgmt_registrations_update(wdev);
 }
@@ -784,7 +786,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
        data = buf + ieee80211_hdrlen(mgmt->frame_control);
        data_len = len - ieee80211_hdrlen(mgmt->frame_control);
 
-       spin_lock_bh(&wdev->mgmt_registrations_lock);
+       spin_lock_bh(&rdev->mgmt_registrations_lock);
 
        list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
                if (reg->frame_type != ftype)
@@ -808,7 +810,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
                break;
        }
 
-       spin_unlock_bh(&wdev->mgmt_registrations_lock);
+       spin_unlock_bh(&rdev->mgmt_registrations_lock);
 
        trace_cfg80211_return_bool(result);
        return result;
index 11c68b1..adc0d14 100644 (file)
@@ -418,14 +418,17 @@ cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss,
        }
        ssid_len = ssid[1];
        ssid = ssid + 2;
-       rcu_read_unlock();
 
        /* check if nontrans_bss is in the list */
        list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) {
-               if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len))
+               if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) {
+                       rcu_read_unlock();
                        return 0;
+               }
        }
 
+       rcu_read_unlock();
+
        /* add to the list */
        list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list);
        return 0;
index 18dba3d..a1a99a5 100644 (file)
@@ -1028,14 +1028,14 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
            !(rdev->wiphy.interface_modes & (1 << ntype)))
                return -EOPNOTSUPP;
 
-       /* if it's part of a bridge, reject changing type to station/ibss */
-       if (netif_is_bridge_port(dev) &&
-           (ntype == NL80211_IFTYPE_ADHOC ||
-            ntype == NL80211_IFTYPE_STATION ||
-            ntype == NL80211_IFTYPE_P2P_CLIENT))
-               return -EBUSY;
-
        if (ntype != otype) {
+               /* if it's part of a bridge, reject changing type to station/ibss */
+               if (netif_is_bridge_port(dev) &&
+                   (ntype == NL80211_IFTYPE_ADHOC ||
+                    ntype == NL80211_IFTYPE_STATION ||
+                    ntype == NL80211_IFTYPE_P2P_CLIENT))
+                       return -EBUSY;
+
                dev->ieee80211_ptr->use_4addr = false;
                dev->ieee80211_ptr->mesh_id_up_len = 0;
                wdev_lock(dev->ieee80211_ptr);
index 5cd7020..b856afa 100644 (file)
@@ -787,6 +787,8 @@ $(OUTPUT)dlfilters/%.o: dlfilters/%.c include/perf/perf_dlfilter.h
        $(Q)$(MKDIR) -p $(OUTPUT)dlfilters
        $(QUIET_CC)$(CC) -c -Iinclude $(EXTRA_CFLAGS) -o $@ -fpic $<
 
+.SECONDARY: $(DLFILTERS:.so=.o)
+
 $(OUTPUT)dlfilters/%.so: $(OUTPUT)dlfilters/%.o
        $(QUIET_LINK)$(CC) $(EXTRA_CFLAGS) -shared -o $@ $<
 
index 3018a05..20cd624 100644 (file)
@@ -45,7 +45,7 @@ static const Dwfl_Callbacks offline_callbacks = {
  */
 static int check_return_reg(int ra_regno, Dwarf_Frame *frame)
 {
-       Dwarf_Op ops_mem[2];
+       Dwarf_Op ops_mem[3];
        Dwarf_Op dummy;
        Dwarf_Op *ops = &dummy;
        size_t nops;
index 6211d0b..c32c2eb 100644 (file)
@@ -459,7 +459,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session)
                return -EINVAL;
 
        if (PRINT_FIELD(WEIGHT) &&
-           evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT", PERF_OUTPUT_WEIGHT))
+           evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT_TYPE, "WEIGHT", PERF_OUTPUT_WEIGHT))
                return -EINVAL;
 
        if (PRINT_FIELD(SYM) &&
@@ -4039,11 +4039,15 @@ script_found:
                goto out_delete;
 
        uname(&uts);
-       if (data.is_pipe ||  /* assume pipe_mode indicates native_arch */
-           !strcmp(uts.machine, session->header.env.arch) ||
-           (!strcmp(uts.machine, "x86_64") &&
-            !strcmp(session->header.env.arch, "i386")))
+       if (data.is_pipe) { /* Assume pipe_mode indicates native_arch */
                native_arch = true;
+       } else if (session->header.env.arch) {
+               if (!strcmp(uts.machine, session->header.env.arch))
+                       native_arch = true;
+               else if (!strcmp(uts.machine, "x86_64") &&
+                        !strcmp(session->header.env.arch, "i386"))
+                       native_arch = true;
+       }
 
        script.session = session;
        script__setup_sample_type(&script);
index 5c59790..d88bb65 100644 (file)
@@ -949,7 +949,6 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
        int err, n;
        u32 key;
        char b;
-       int retries = 100;
 
        zero_verdict_count(verd_mapfd);
 
@@ -1002,17 +1001,11 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
                goto close_peer1;
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
-again:
-       n = read(c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close_peer1:
        xclose(p1);
@@ -1571,7 +1564,6 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd,
        const char *log_prefix = redir_mode_str(mode);
        int c0, c1, p0, p1;
        unsigned int pass;
-       int retries = 100;
        int err, n;
        int sfd[2];
        u32 key;
@@ -1606,17 +1598,11 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-       n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close:
        xclose(c1);
@@ -1748,7 +1734,6 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
        const char *log_prefix = redir_mode_str(mode);
        int c0, c1, p0, p1;
        unsigned int pass;
-       int retries = 100;
        int err, n;
        u32 key;
        char b;
@@ -1781,17 +1766,11 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-       n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close_cli1:
        xclose(c1);
@@ -1841,7 +1820,6 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd,
        const char *log_prefix = redir_mode_str(mode);
        int c0, c1, p0, p1;
        unsigned int pass;
-       int retries = 100;
        int err, n;
        int sfd[2];
        u32 key;
@@ -1876,17 +1854,11 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-       n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close_cli1:
        xclose(c1);
@@ -1932,7 +1904,6 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd,
        int sfd[2];
        u32 key;
        char b;
-       int retries = 100;
 
        zero_verdict_count(verd_mapfd);
 
@@ -1963,17 +1934,11 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
-again:
-       n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0) {
-               if (errno == EAGAIN && retries--) {
-                       usleep(1000);
-                       goto again;
-               }
-               FAIL_ERRNO("%s: read", log_prefix);
-       }
+       n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC);
+       if (n < 0)
+               FAIL_ERRNO("%s: recv_timeout", log_prefix);
        if (n == 0)
-               FAIL("%s: incomplete read", log_prefix);
+               FAIL("%s: incomplete recv", log_prefix);
 
 close:
        xclose(c1);
index 8e67a25..3313566 100755 (executable)
@@ -445,10 +445,13 @@ cleanup()
                ip -netns ${NSA} link set dev ${NSA_DEV} down
                ip -netns ${NSA} link del dev ${NSA_DEV}
 
+               ip netns pids ${NSA} | xargs kill 2>/dev/null
                ip netns del ${NSA}
        fi
 
+       ip netns pids ${NSB} | xargs kill 2>/dev/null
        ip netns del ${NSB}
+       ip netns pids ${NSC} | xargs kill 2>/dev/null
        ip netns del ${NSC} >/dev/null 2>&1
 }
 
index 1af16d2..52497b7 100644 (file)
@@ -341,7 +341,7 @@ void split_file_backed_thp(void)
        }
 
        /* write something to the file, so a file-backed THP can be allocated */
-       num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc));
+       num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1);
        close(fd);
 
        if (num_written < 1) {