Merge tag 'char-misc-5.12-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregk...

[platform/kernel/linux-starfive.git] / drivers / misc / habanalabs / common / habanalabs.h
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h

index 41af347..d933878 100644 (file)
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -28,17 +28,18 @@
  #define HL_NAME                                "habanalabs"
  
  /* Use upper bits of mmap offset to store habana driver specific information.
- * bits[63:62] - Encode mmap type
+ * bits[63:61] - Encode mmap type
   * bits[45:0]  - mmap offset value
   *
   * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
   *  defines are w.r.t to PAGE_SIZE
   */
-#define HL_MMAP_TYPE_SHIFT             (62 - PAGE_SHIFT)
-#define HL_MMAP_TYPE_MASK              (0x3ull << HL_MMAP_TYPE_SHIFT)
+#define HL_MMAP_TYPE_SHIFT             (61 - PAGE_SHIFT)
+#define HL_MMAP_TYPE_MASK              (0x7ull << HL_MMAP_TYPE_SHIFT)
+#define HL_MMAP_TYPE_BLOCK             (0x4ull << HL_MMAP_TYPE_SHIFT)
  #define HL_MMAP_TYPE_CB                        (0x2ull << HL_MMAP_TYPE_SHIFT)
  
-#define HL_MMAP_OFFSET_VALUE_MASK      (0x3FFFFFFFFFFFull >> PAGE_SHIFT)
+#define HL_MMAP_OFFSET_VALUE_MASK      (0x1FFFFFFFFFFFull >> PAGE_SHIFT)
  #define HL_MMAP_OFFSET_VALUE_GET(off)  (off & HL_MMAP_OFFSET_VALUE_MASK)
  
  #define HL_PENDING_RESET_PER_SEC       10
@@ -408,6 +409,9 @@ struct hl_mmu_properties {
   * @sync_stream_first_mon: first monitor available for sync stream use
   * @first_available_user_sob: first sob available for the user
   * @first_available_user_mon: first monitor available for the user
+ * @first_available_user_msix_interrupt: first available msix interrupt
+ *                                       reserved for the user
+ * @first_available_cq: first available CQ for the user.
   * @tpc_enabled_mask: which TPCs are enabled.
   * @completion_queues_count: number of completion queues.
   * @fw_security_disabled: true if security measures are disabled in firmware,
@@ -416,6 +420,7 @@ struct hl_mmu_properties {
   *                            from BOOT_DEV_STS0
   * @dram_supports_virtual_memory: is there an MMU towards the DRAM
   * @hard_reset_done_by_fw: true if firmware is handling hard reset flow
+ * @num_functional_hbms: number of functional HBMs in each DCORE.
   */
  struct asic_fixed_properties {
         struct hw_queue_properties      *hw_queues_props;
@@ -468,18 +473,22 @@ struct asic_fixed_properties {
         u16                             sync_stream_first_mon;
         u16                             first_available_user_sob[HL_MAX_DCORES];
         u16                             first_available_user_mon[HL_MAX_DCORES];
+       u16                             first_available_user_msix_interrupt;
+       u16                             first_available_cq[HL_MAX_DCORES];
         u8                              tpc_enabled_mask;
         u8                              completion_queues_count;
         u8                              fw_security_disabled;
         u8                              fw_security_status_valid;
         u8                              dram_supports_virtual_memory;
         u8                              hard_reset_done_by_fw;
+       u8                              num_functional_hbms;
  };
  
  /**
   * struct hl_fence - software synchronization primitive
   * @completion: fence is implemented using completion
   * @refcount: refcount for this fence
+ * @cs_sequence: sequence of the corresponding command submission
   * @error: mark this fence with error
   * @timestamp: timestamp upon completion
   *
@@ -487,6 +496,7 @@ struct asic_fixed_properties {
  struct hl_fence {
         struct completion       completion;
         struct kref             refcount;
+       u64                     cs_sequence;
         int                     error;
         ktime_t                 timestamp;
  };
@@ -846,6 +856,19 @@ enum div_select_defs {
   * @collective_wait_init_cs: Generate collective master/slave packets
   *                           and place them in the relevant cs jobs
   * @collective_wait_create_jobs: allocate collective wait cs jobs
+ * @scramble_addr: Routine to scramble the address prior to mapping it
+ *                 in the MMU.
+ * @descramble_addr: Routine to de-scramble the address prior to
+ *                   showing it to users.
+ * @ack_protection_bits_errors: ack and dump all security violations
+ * @get_hw_block_id: retrieve a HW block id to be used by the user to mmap it.
+ *                   also returns the size of the block if caller supplies
+ *                   a valid pointer for it
+ * @hw_block_mmap: mmap a HW block with a given id.
+ * @enable_events_from_fw: send interrupt to firmware to notify them the
+ *                         driver is ready to receive asynchronous events. This
+ *                         function should be called during the first init and
+ *                         after every hard-reset of the device
   */
  struct hl_asic_funcs {
         int (*early_init)(struct hl_device *hdev);
@@ -918,8 +941,8 @@ struct hl_asic_funcs {
         void (*set_clock_gating)(struct hl_device *hdev);
         void (*disable_clock_gating)(struct hl_device *hdev);
         int (*debug_coresight)(struct hl_device *hdev, void *data);
-       bool (*is_device_idle)(struct hl_device *hdev, u64 *mask,
-                               struct seq_file *s);
+       bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr,
+                                       u8 mask_len, struct seq_file *s);
         int (*soft_reset_late_init)(struct hl_device *hdev);
         void (*hw_queues_lock)(struct hl_device *hdev);
         void (*hw_queues_unlock)(struct hl_device *hdev);
@@ -955,6 +978,14 @@ struct hl_asic_funcs {
         int (*collective_wait_create_jobs)(struct hl_device *hdev,
                         struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
                         u32 collective_engine_id);
+       u64 (*scramble_addr)(struct hl_device *hdev, u64 addr);
+       u64 (*descramble_addr)(struct hl_device *hdev, u64 addr);
+       void (*ack_protection_bits_errors)(struct hl_device *hdev);
+       int (*get_hw_block_id)(struct hl_device *hdev, u64 block_addr,
+                               u32 *block_size, u32 *block_id);
+       int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
+                       u32 block_id, u32 block_size);
+       void (*enable_events_from_fw)(struct hl_device *hdev);
  };
  
  
@@ -1012,6 +1043,20 @@ struct hl_cs_counters_atomic {
  };
  
  /**
+ * struct hl_pending_cb - pending command buffer structure
+ * @cb_node: cb node in pending cb list
+ * @cb: command buffer to send in next submission
+ * @cb_size: command buffer size
+ * @hw_queue_id: destination queue id
+ */
+struct hl_pending_cb {
+       struct list_head        cb_node;
+       struct hl_cb            *cb;
+       u32                     cb_size;
+       u32                     hw_queue_id;
+};
+
+/**
   * struct hl_ctx - user/kernel context.
   * @mem_hash: holds mapping from virtual address to virtual memory area
   *             descriptor (hl_vm_phys_pg_list or hl_userptr).
@@ -1026,6 +1071,8 @@ struct hl_cs_counters_atomic {
   * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
   *            MMU hash or walking the PGT requires talking this lock.
   * @debugfs_list: node in debugfs list of contexts.
+ * @pending_cb_list: list of pending command buffers waiting to be sent upon
+ *                   next user command submission context.
   * @cs_counters: context command submission counters.
   * @cb_va_pool: device VA pool for command buffers which are mapped to the
   *              device's MMU.
@@ -1034,11 +1081,17 @@ struct hl_cs_counters_atomic {
   *                     index to cs_pending array.
   * @dram_default_hops: array that holds all hops addresses needed for default
   *                     DRAM mapping.
+ * @pending_cb_lock: spinlock to protect pending cb list
   * @cs_lock: spinlock to protect cs_sequence.
   * @dram_phys_mem: amount of used physical DRAM memory by this context.
   * @thread_ctx_switch_token: token to prevent multiple threads of the same
   *                             context from running the context switch phase.
   *                             Only a single thread should run it.
+ * @thread_pending_cb_token: token to prevent multiple threads from processing
+ *                             the pending CB list. Only a single thread should
+ *                             process the list since it is protected by a
+ *                             spinlock and we don't want to halt the entire
+ *                             command submission sequence.
   * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run
   *                             the context switch phase from moving to their
   *                             execution phase before the context switch phase
@@ -1057,13 +1110,16 @@ struct hl_ctx {
         struct mutex                    mem_hash_lock;
         struct mutex                    mmu_lock;
         struct list_head                debugfs_list;
+       struct list_head                pending_cb_list;
         struct hl_cs_counters_atomic    cs_counters;
         struct gen_pool                 *cb_va_pool;
         u64                             cs_sequence;
         u64                             *dram_default_hops;
+       spinlock_t                      pending_cb_lock;
         spinlock_t                      cs_lock;
         atomic64_t                      dram_phys_mem;
         atomic_t                        thread_ctx_switch_token;
+       atomic_t                        thread_pending_cb_token;
         u32                             thread_ctx_switch_wait_token;
         u32                             asid;
         u32                             handle;
@@ -1124,8 +1180,11 @@ struct hl_userptr {
   * @finish_work: workqueue object to run when CS is completed by H/W.
   * @work_tdr: delayed work node for TDR.
   * @mirror_node : node in device mirror list of command submissions.
+ * @staged_cs_node: node in the staged cs list.
   * @debugfs_list: node in debugfs list of command submissions.
   * @sequence: the sequence number of this CS.
+ * @staged_sequence: the sequence of the staged submission this CS is part of,
+ *                   relevant only if staged_cs is set.
   * @type: CS_TYPE_*.
   * @submitted: true if CS was submitted to H/W.
   * @completed: true if CS was completed by device.
@@ -1133,7 +1192,11 @@ struct hl_userptr {
   * @tdr_active: true if TDR was activated for this CS (to prevent
   *             double TDR activation).
   * @aborted: true if CS was aborted due to some device error.
- * @timestamp: true if a timestmap must be captured upon completion
+ * @timestamp: true if a timestamp must be captured upon completion.
+ * @staged_last: true if this is the last staged CS and needs completion.
+ * @staged_first: true if this is the first staged CS and we need to receive
+ *                timeout for this CS.
+ * @staged_cs: true if this CS is part of a staged submission.
   */
  struct hl_cs {
         u16                     *jobs_in_queue_cnt;
@@ -1146,8 +1209,10 @@ struct hl_cs {
         struct work_struct      finish_work;
         struct delayed_work     work_tdr;
         struct list_head        mirror_node;
+       struct list_head        staged_cs_node;
         struct list_head        debugfs_list;
         u64                     sequence;
+       u64                     staged_sequence;
         enum hl_cs_type         type;
         u8                      submitted;
         u8                      completed;
@@ -1155,6 +1220,9 @@ struct hl_cs {
         u8                      tdr_active;
         u8                      aborted;
         u8                      timestamp;
+       u8                      staged_last;
+       u8                      staged_first;
+       u8                      staged_cs;
  };
  
  /**
@@ -1225,6 +1293,7 @@ struct hl_cs_job {
   *                    MSG_PROT packets. Relevant only for GAUDI as GOYA doesn't
   *                    have streams so the engine can't be busy by another
   *                    stream.
+ * @completion: true if we need completion for this CS.
   */
  struct hl_cs_parser {
         struct hl_cb            *user_cb;
@@ -1239,6 +1308,7 @@ struct hl_cs_parser {
         u8                      job_id;
         u8                      is_kernel_allocated_cb;
         u8                      contains_dma_pkt;
+       u8                      completion;
  };
  
  /*
@@ -1688,12 +1758,20 @@ struct hl_mmu_per_hop_info {
   * struct hl_mmu_hop_info - A structure describing the TLB hops and their
   * hop-entries that were created in order to translate a virtual address to a
   * physical one.
+ * @scrambled_vaddr: The value of the virtual address after scrambling. This
+ *                   address replaces the original virtual-address when mapped
+ *                   in the MMU tables.
+ * @unscrambled_paddr: The un-scrambled physical address.
   * @hop_info: Array holding the per-hop information used for the translation.
   * @used_hops: The number of hops used for the translation.
+ * @range_type: virtual address range type.
   */
  struct hl_mmu_hop_info {
+       u64 scrambled_vaddr;
+       u64 unscrambled_paddr;
         struct hl_mmu_per_hop_info hop_info[MMU_ARCH_5_HOPS];
         u32 used_hops;
+       enum hl_va_range_type range_type;
  };
  
  /**
@@ -1766,7 +1844,6 @@ struct hl_mmu_funcs {
   * @asic_funcs: ASIC specific functions.
   * @asic_specific: ASIC specific information to use only from ASIC files.
   * @vm: virtual memory manager for MMU.
- * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context.
   * @hwmon_dev: H/W monitor device.
   * @pm_mng_profile: current power management profile.
   * @hl_chip_info: ASIC's sensors information.
@@ -1844,6 +1921,7 @@ struct hl_mmu_funcs {
   *                          user processes
   * @device_fini_pending: true if device_fini was called and might be
   *                       waiting for the reset thread to finish
+ * @supports_staged_submission: true if staged submissions are supported
   */
  struct hl_device {
         struct pci_dev                  *pdev;
@@ -1881,7 +1959,6 @@ struct hl_device {
         const struct hl_asic_funcs      *asic_funcs;
         void                            *asic_specific;
         struct hl_vm                    vm;
-       struct mutex                    mmu_cache_lock;
         struct device                   *hwmon_dev;
         enum hl_pm_mng_profile          pm_mng_profile;
         struct hwmon_chip_info          *hl_chip_info;
@@ -1950,6 +2027,7 @@ struct hl_device {
         u8                              needs_reset;
         u8                              process_kill_trial_cnt;
         u8                              device_fini_pending;
+       u8                              supports_staged_submission;
  
         /* Parameters for bring-up */
         u64                             nic_ports_mask;
@@ -2067,7 +2145,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
  int hl_hw_queue_schedule_cs(struct hl_cs *cs);
  u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
  void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
-void hl_int_hw_queue_update_ci(struct hl_cs *cs);
+void hl_hw_queue_update_ci(struct hl_cs *cs);
  void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset);
  
  #define hl_queue_inc_ptr(p)            hl_hw_queue_add_ptr(p, 1)
@@ -2123,6 +2201,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
                         bool map_cb, u64 *handle);
  int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
  int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
+int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
  struct hl_cb *hl_cb_get(struct hl_device *hdev,        struct hl_cb_mgr *mgr,
                         u32 handle);
  void hl_cb_put(struct hl_cb *cb);
@@ -2136,6 +2215,7 @@ int hl_cb_va_pool_init(struct hl_ctx *ctx);
  void hl_cb_va_pool_fini(struct hl_ctx *ctx);
  
  void hl_cs_rollback_all(struct hl_device *hdev);
+void hl_pending_cb_list_flush(struct hl_ctx *ctx);
  struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
                 enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
  void hl_sob_reset_error(struct kref *ref);
@@ -2143,6 +2223,10 @@ int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask);
  void hl_fence_put(struct hl_fence *fence);
  void hl_fence_get(struct hl_fence *fence);
  void cs_get(struct hl_cs *cs);
+bool cs_needs_completion(struct hl_cs *cs);
+bool cs_needs_timeout(struct hl_cs *cs);
+bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs);
+struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq);
  
  void goya_set_asic_funcs(struct hl_device *hdev);
  void gaudi_set_asic_funcs(struct hl_device *hdev);
@@ -2184,6 +2268,8 @@ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
  int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr);
  int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
                         struct hl_mmu_hop_info *hops);
+u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr);
+u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr);
  bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr);
  
  int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
@@ -2201,7 +2287,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
                                         void *vaddr);
  int hl_fw_send_heartbeat(struct hl_device *hdev);
  int hl_fw_cpucp_info_get(struct hl_device *hdev,
-                       u32 cpu_security_boot_status_reg);
+                       u32 cpu_security_boot_status_reg,
+                       u32 boot_err0_reg);
  int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
  int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
                 struct hl_info_pci_counters *counters);