{
union hl_cb_args *args = data;
struct hl_device *hdev = hpriv->hdev;
+ enum hl_device_status status;
u64 handle = 0;
int rc;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, &status)) {
dev_warn_ratelimited(hdev->dev,
"Device is %s. Can't execute CB IOCTL\n",
- atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+ hdev->status[status]);
return -EBUSY;
}
if (hdev->reset_on_lockup)
hl_device_reset(hdev, false, false);
+ else
+ hdev->needs_reset = true;
}
static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
struct hl_device *hdev = hpriv->hdev;
struct hl_ctx *ctx = hpriv->ctx;
u32 cs_type_flags, num_chunks;
+ enum hl_device_status status;
enum hl_cs_type cs_type;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, &status)) {
dev_warn_ratelimited(hdev->dev,
"Device is %s. Can't submit new CS\n",
- atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+ hdev->status[status]);
return -EBUSY;
}
struct cpucp_packet pkt;
int rc;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -EBUSY;
memset(&pkt, 0, sizeof(pkt));
struct cpucp_packet pkt;
int rc;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -EBUSY;
memset(&pkt, 0, sizeof(pkt));
struct cpucp_packet pkt;
int rc;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return;
memset(&pkt, 0, sizeof(pkt));
#define HL_PLDM_PENDING_RESET_PER_SEC (HL_PENDING_RESET_PER_SEC * 10)
-bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
-{
- if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
- return true;
- else
- return false;
-}
-
enum hl_device_status hl_device_status(struct hl_device *hdev)
{
enum hl_device_status status;
status = HL_DEVICE_STATUS_MALFUNCTION;
else if (atomic_read(&hdev->in_reset))
status = HL_DEVICE_STATUS_IN_RESET;
+ else if (hdev->needs_reset)
+ status = HL_DEVICE_STATUS_NEEDS_RESET;
else
status = HL_DEVICE_STATUS_OPERATIONAL;
return status;
}
+bool hl_device_operational(struct hl_device *hdev,
+ enum hl_device_status *status)
+{
+ enum hl_device_status current_status;
+
+ current_status = hl_device_status(hdev);
+ if (status)
+ *status = current_status;
+
+ switch (current_status) {
+ case HL_DEVICE_STATUS_IN_RESET:
+ case HL_DEVICE_STATUS_MALFUNCTION:
+ case HL_DEVICE_STATUS_NEEDS_RESET:
+ return false;
+ case HL_DEVICE_STATUS_OPERATIONAL:
+ default:
+ return true;
+ }
+}
+
static void hpriv_release(struct kref *ref)
{
struct hl_fpriv *hpriv;
struct hl_device *hdev = container_of(work, struct hl_device,
work_heartbeat.work);
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
goto reschedule;
if (!hdev->asic_funcs->send_heartbeat(hdev))
}
atomic_set(&hdev->in_reset, 0);
+ hdev->needs_reset = false;
if (hard_reset)
hdev->hard_reset_cnt++;
* DEVICES
*/
+#define HL_STR_MAX 32
+
+#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_NEEDS_RESET + 1)
+
/* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe
* x16 cards. In extreme cases, there are hosts that can accommodate 16 cards.
*/
* @hwmon_dev: H/W monitor device.
* @pm_mng_profile: current power management profile.
* @hl_chip_info: ASIC's sensors information.
+ * @device_status_description: device status description.
* @hl_debugfs: device's debugfs manager.
* @cb_pool: list of preallocated CBs.
* @cb_pool_lock: protects the CB pool.
* @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
* @supports_cb_mapping: is mapping a CB to the device's MMU supported.
+ * @needs_reset: true if reset_on_lockup is false and device should be reset
+ * due to lockup.
*/
struct hl_device {
struct pci_dev *pdev;
struct device *dev_ctrl;
struct delayed_work work_freq;
struct delayed_work work_heartbeat;
- char asic_name[32];
+ char asic_name[HL_STR_MAX];
+ char status[HL_DEV_STS_MAX][HL_STR_MAX];
enum hl_asic_type asic_type;
struct hl_cq *completion_queue;
struct workqueue_struct **cq_wq;
u8 supports_coresight;
u8 supports_soft_reset;
u8 supports_cb_mapping;
+ u8 needs_reset;
/* Parameters for bring-up */
u64 nic_ports_mask;
int hl_device_open(struct inode *inode, struct file *filp);
int hl_device_open_ctrl(struct inode *inode, struct file *filp);
-bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
+bool hl_device_operational(struct hl_device *hdev,
+ enum hl_device_status *status);
enum hl_device_status hl_device_status(struct hl_device *hdev);
int hl_device_set_debug_mode(struct hl_device *hdev, bool enable);
int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
*/
int hl_device_open(struct inode *inode, struct file *filp)
{
+ enum hl_device_status status;
struct hl_device *hdev;
struct hl_fpriv *hpriv;
int rc;
mutex_lock(&hdev->fpriv_list_lock);
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, &status)) {
dev_err_ratelimited(hdev->dev,
- "Can't open %s because it is disabled or in reset\n",
- dev_name(hdev->dev));
+ "Can't open %s because it is %s\n",
+ dev_name(hdev->dev), hdev->status[status]);
rc = -EPERM;
goto out_err;
}
mutex_lock(&hdev->fpriv_list_lock);
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, NULL)) {
dev_err_ratelimited(hdev->dev_ctrl,
"Can't open %s because it is disabled or in reset\n",
dev_name(hdev->dev_ctrl));
hdev->asic_type = asic_type;
}
+ /* Assign status description string */
+ strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
+ "disabled", HL_STR_MAX);
+ strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
+ "in reset", HL_STR_MAX);
+ strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
+ "needs reset", HL_STR_MAX);
+
hdev->major = hl_major;
hdev->reset_on_lockup = reset_on_lockup;
hdev->memory_scrub = memory_scrub;
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
+ enum hl_device_status status;
struct hl_info_args *args = data;
struct hl_device *hdev = hpriv->hdev;
+
int rc;
/*
break;
}
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, &status)) {
dev_warn_ratelimited(dev,
"Device is %s. Can't execute INFO IOCTL\n",
- atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+ hdev->status[status]);
return -EBUSY;
}
{
struct hl_debug_args *args = data;
struct hl_device *hdev = hpriv->hdev;
+ enum hl_device_status status;
+
int rc = 0;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, &status)) {
dev_warn_ratelimited(hdev->dev,
"Device is %s. Can't execute DEBUG IOCTL\n",
- atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+ hdev->status[status]);
return -EBUSY;
}
*/
int hl_hw_queue_schedule_cs(struct hl_cs *cs)
{
+ enum hl_device_status status;
struct hl_cs_counters_atomic *cntr;
struct hl_ctx *ctx = cs->ctx;
struct hl_device *hdev = ctx->hdev;
hdev->asic_funcs->hw_queues_lock(hdev);
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, &status)) {
atomic64_inc(&ctx->cs_counters.device_in_reset_drop_cnt);
- atomic64_inc(&cntr->device_in_reset_drop_cnt);
dev_err(hdev->dev,
- "device is disabled or in reset, CS rejected!\n");
+ "device is %s, CS rejected!\n", hdev->status[status]);
rc = -EPERM;
goto out;
}
struct hl_device *hdev = dev_get_drvdata(dev);
int rc;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
switch (type) {
{
struct hl_device *hdev = dev_get_drvdata(dev);
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
switch (type) {
int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
{
+ enum hl_device_status status;
union hl_mem_args *args = data;
struct hl_device *hdev = hpriv->hdev;
struct hl_ctx *ctx = hpriv->ctx;
u32 handle = 0;
int rc;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, &status)) {
dev_warn_ratelimited(hdev->dev,
"Device is %s. Can't execute MEMORY IOCTL\n",
- atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+ hdev->status[status]);
return -EBUSY;
}
str = "In reset";
else if (hdev->disabled)
str = "Malfunction";
+ else if (hdev->needs_reset)
+ str = "Needs Reset";
else
str = "Operational";
struct hl_device *hdev = dev_get_drvdata(dev);
long val;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
val = hl_get_max_power(hdev);
unsigned long value;
int rc;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, NULL)) {
count = -ENODEV;
goto out;
}
char *data;
int rc;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
if (!max_size)
{
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, MME_PLL, false);
struct gaudi_device *gaudi = hdev->asic_specific;
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, MME_PLL, false);
int rc;
u64 value;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, NULL)) {
count = -ENODEV;
goto fail;
}
struct hl_device *hdev = dev_get_drvdata(dev);
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, MME_PLL, true);
{
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, MME_PLL, false);
struct hl_device *hdev = dev_get_drvdata(dev);
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, MME_PLL, false);
int rc;
long value;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, NULL)) {
count = -ENODEV;
goto fail;
}
struct hl_device *hdev = dev_get_drvdata(dev);
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, TPC_PLL, false);
int rc;
long value;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, NULL)) {
count = -ENODEV;
goto fail;
}
struct hl_device *hdev = dev_get_drvdata(dev);
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, IC_PLL, false);
int rc;
long value;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, NULL)) {
count = -ENODEV;
goto fail;
}
struct hl_device *hdev = dev_get_drvdata(dev);
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, MME_PLL, true);
struct hl_device *hdev = dev_get_drvdata(dev);
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, TPC_PLL, true);
struct hl_device *hdev = dev_get_drvdata(dev);
long value;
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
value = hl_get_frequency(hdev, IC_PLL, true);
{
struct hl_device *hdev = dev_get_drvdata(dev);
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
return sprintf(buf, "%s\n",
{
struct hl_device *hdev = dev_get_drvdata(dev);
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, NULL)) {
count = -ENODEV;
goto out;
}
{
struct hl_device *hdev = dev_get_drvdata(dev);
- if (hl_device_disabled_or_in_reset(hdev))
+ if (!hl_device_operational(hdev, NULL))
return -ENODEV;
return sprintf(buf, "%u\n", hdev->high_pll);
long value;
int rc;
- if (hl_device_disabled_or_in_reset(hdev)) {
+ if (!hl_device_operational(hdev, NULL)) {
count = -ENODEV;
goto out;
}
enum hl_device_status {
HL_DEVICE_STATUS_OPERATIONAL,
HL_DEVICE_STATUS_IN_RESET,
- HL_DEVICE_STATUS_MALFUNCTION
+ HL_DEVICE_STATUS_MALFUNCTION,
+ HL_DEVICE_STATUS_NEEDS_RESET
};
/* Opcode for management ioctl