drivers/misc/habanalabs/common/memory.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 /*
   4  * Copyright 2016-2021 HabanaLabs, Ltd.
   5  * All Rights Reserved.
   6  */
   7
   8 #include <uapi/misc/habanalabs.h>
   9 #include "habanalabs.h"
  10 #include "../include/hw_ip/mmu/mmu_general.h"
  11
  12 #include <linux/uaccess.h>
  13 #include <linux/slab.h>
  14 #include <linux/vmalloc.h>
  15 #include <linux/pci-p2pdma.h>
  16
  17 MODULE_IMPORT_NS(DMA_BUF);
  18
  19 #define HL_MMU_DEBUG    0
  20
  21 /* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
  22 #define DRAM_POOL_PAGE_SIZE SZ_8M
  23
  24 static int allocate_timestamps_buffers(struct hl_fpriv *hpriv,
  25                         struct hl_mem_in *args, u64 *handle);
  26
  27 static int set_alloc_page_size(struct hl_device *hdev, struct hl_mem_in *args, u32 *page_size)
  28 {
  29         struct asic_fixed_properties *prop = &hdev->asic_prop;
  30         u32 psize;
  31
  32         /*
  33          * for ASIC that supports setting the allocation page size by user we will address
  34          * user's choice only if it is not 0 (as 0 means taking the default page size)
  35          */
  36         if (prop->supports_user_set_page_size && args->alloc.page_size) {
  37                 psize = args->alloc.page_size;
  38
  39                 if (!hdev->asic_funcs->is_valid_dram_page_size(psize)) {
  40                         dev_err(hdev->dev, "user page size (%#x) is not valid\n", psize);
  41                         return -EINVAL;
  42                 }
  43         } else {
  44                 psize = hdev->asic_prop.dram_page_size;
  45         }
  46
  47         *page_size = psize;
  48
  49         return 0;
  50 }
  51
  52 /*
  53  * The va ranges in context object contain a list with the available chunks of
  54  * device virtual memory.
  55  * There is one range for host allocations and one for DRAM allocations.
  56  *
  57  * On initialization each range contains one chunk of all of its available
  58  * virtual range which is a half of the total device virtual range.
  59  *
  60  * On each mapping of physical pages, a suitable virtual range chunk (with a
  61  * minimum size) is selected from the list. If the chunk size equals the
  62  * requested size, the chunk is returned. Otherwise, the chunk is split into
  63  * two chunks - one to return as result and a remainder to stay in the list.
  64  *
  65  * On each Unmapping of a virtual address, the relevant virtual chunk is
  66  * returned to the list. The chunk is added to the list and if its edges match
  67  * the edges of the adjacent chunks (means a contiguous chunk can be created),
  68  * the chunks are merged.
  69  *
  70  * On finish, the list is checked to have only one chunk of all the relevant
  71  * virtual range (which is a half of the device total virtual range).
  72  * If not (means not all mappings were unmapped), a warning is printed.
  73  */
  74
  75 /*
  76  * alloc_device_memory() - allocate device memory.
  77  * @ctx: pointer to the context structure.
  78  * @args: host parameters containing the requested size.
  79  * @ret_handle: result handle.
  80  *
  81  * This function does the following:
  82  * - Allocate the requested size rounded up to 'dram_page_size' pages.
  83  * - Return unique handle for later map/unmap/free.
  84  */
  85 static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
  86                                 u32 *ret_handle)
  87 {
  88         struct hl_device *hdev = ctx->hdev;
  89         struct hl_vm *vm = &hdev->vm;
  90         struct hl_vm_phys_pg_pack *phys_pg_pack;
  91         u64 paddr = 0, total_size, num_pgs, i;
  92         u32 num_curr_pgs, page_size;
  93         bool contiguous;
  94         int handle, rc;
  95
  96         num_curr_pgs = 0;
  97
  98         rc = set_alloc_page_size(hdev, args, &page_size);
  99         if (rc)
 100                 return rc;
 101
 102         num_pgs = DIV_ROUND_UP_ULL(args->alloc.mem_size, page_size);
 103         total_size = num_pgs * page_size;
 104
 105         if (!total_size) {
 106                 dev_err(hdev->dev, "Cannot allocate 0 bytes\n");
 107                 return -EINVAL;
 108         }
 109
 110         contiguous = args->flags & HL_MEM_CONTIGUOUS;
 111
 112         if (contiguous) {
 113                 if (is_power_of_2(page_size))
 114                         paddr = (uintptr_t) gen_pool_dma_alloc_align(vm->dram_pg_pool,
 115                                                                      total_size, NULL, page_size);
 116                 else
 117                         paddr = gen_pool_alloc(vm->dram_pg_pool, total_size);
 118                 if (!paddr) {
 119                         dev_err(hdev->dev,
 120                                 "failed to allocate %llu contiguous pages with total size of %llu\n",
 121                                 num_pgs, total_size);
 122                         return -ENOMEM;
 123                 }
 124         }
 125
 126         phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
 127         if (!phys_pg_pack) {
 128                 rc = -ENOMEM;
 129                 goto pages_pack_err;
 130         }
 131
 132         phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK;
 133         phys_pg_pack->asid = ctx->asid;
 134         phys_pg_pack->npages = num_pgs;
 135         phys_pg_pack->page_size = page_size;
 136         phys_pg_pack->total_size = total_size;
 137         phys_pg_pack->flags = args->flags;
 138         phys_pg_pack->contiguous = contiguous;
 139
 140         phys_pg_pack->pages = kvmalloc_array(num_pgs, sizeof(u64), GFP_KERNEL);
 141         if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
 142                 rc = -ENOMEM;
 143                 goto pages_arr_err;
 144         }
 145
 146         if (phys_pg_pack->contiguous) {
 147                 for (i = 0 ; i < num_pgs ; i++)
 148                         phys_pg_pack->pages[i] = paddr + i * page_size;
 149         } else {
 150                 for (i = 0 ; i < num_pgs ; i++) {
 151                         if (is_power_of_2(page_size))
 152                                 phys_pg_pack->pages[i] =
 153                                         (uintptr_t)gen_pool_dma_alloc_align(vm->dram_pg_pool,
 154                                                                             page_size, NULL,
 155                                                                             page_size);
 156                         else
 157                                 phys_pg_pack->pages[i] = gen_pool_alloc(vm->dram_pg_pool,
 158                                                                         page_size);
 159                         if (!phys_pg_pack->pages[i]) {
 160                                 dev_err(hdev->dev,
 161                                         "Failed to allocate device memory (out of memory)\n");
 162                                 rc = -ENOMEM;
 163                                 goto page_err;
 164                         }
 165
 166                         num_curr_pgs++;
 167                 }
 168         }
 169
 170         spin_lock(&vm->idr_lock);
 171         handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0,
 172                                 GFP_ATOMIC);
 173         spin_unlock(&vm->idr_lock);
 174
 175         if (handle < 0) {
 176                 dev_err(hdev->dev, "Failed to get handle for page\n");
 177                 rc = -EFAULT;
 178                 goto idr_err;
 179         }
 180
 181         for (i = 0 ; i < num_pgs ; i++)
 182                 kref_get(&vm->dram_pg_pool_refcount);
 183
 184         phys_pg_pack->handle = handle;
 185
 186         atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem);
 187         atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem);
 188
 189         *ret_handle = handle;
 190
 191         return 0;
 192
 193 idr_err:
 194 page_err:
 195         if (!phys_pg_pack->contiguous)
 196                 for (i = 0 ; i < num_curr_pgs ; i++)
 197                         gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i],
 198                                         page_size);
 199
 200         kvfree(phys_pg_pack->pages);
 201 pages_arr_err:
 202         kfree(phys_pg_pack);
 203 pages_pack_err:
 204         if (contiguous)
 205                 gen_pool_free(vm->dram_pg_pool, paddr, total_size);
 206
 207         return rc;
 208 }
 209
 210 /**
 211  * dma_map_host_va() - DMA mapping of the given host virtual address.
 212  * @hdev: habanalabs device structure.
 213  * @addr: the host virtual address of the memory area.
 214  * @size: the size of the memory area.
 215  * @p_userptr: pointer to result userptr structure.
 216  *
 217  * This function does the following:
 218  * - Allocate userptr structure.
 219  * - Pin the given host memory using the userptr structure.
 220  * - Perform DMA mapping to have the DMA addresses of the pages.
 221  */
 222 static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
 223                                 struct hl_userptr **p_userptr)
 224 {
 225         struct hl_userptr *userptr;
 226         int rc;
 227
 228         userptr = kzalloc(sizeof(*userptr), GFP_KERNEL);
 229         if (!userptr) {
 230                 rc = -ENOMEM;
 231                 goto userptr_err;
 232         }
 233
 234         rc = hl_pin_host_memory(hdev, addr, size, userptr);
 235         if (rc) {
 236                 dev_err(hdev->dev, "Failed to pin host memory\n");
 237                 goto pin_err;
 238         }
 239
 240         rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl,
 241                                         userptr->sgt->nents, DMA_BIDIRECTIONAL);
 242         if (rc) {
 243                 dev_err(hdev->dev, "failed to map sgt with DMA region\n");
 244                 goto dma_map_err;
 245         }
 246
 247         userptr->dma_mapped = true;
 248         userptr->dir = DMA_BIDIRECTIONAL;
 249         userptr->vm_type = VM_TYPE_USERPTR;
 250
 251         *p_userptr = userptr;
 252
 253         return 0;
 254
 255 dma_map_err:
 256         hl_unpin_host_memory(hdev, userptr);
 257 pin_err:
 258         kfree(userptr);
 259 userptr_err:
 260
 261         return rc;
 262 }
 263
 264 /**
 265  * dma_unmap_host_va() - DMA unmapping of the given host virtual address.
 266  * @hdev: habanalabs device structure.
 267  * @userptr: userptr to free.
 268  *
 269  * This function does the following:
 270  * - Unpins the physical pages.
 271  * - Frees the userptr structure.
 272  */
 273 static void dma_unmap_host_va(struct hl_device *hdev,
 274                                 struct hl_userptr *userptr)
 275 {
 276         hl_unpin_host_memory(hdev, userptr);
 277         kfree(userptr);
 278 }
 279
 280 /**
 281  * dram_pg_pool_do_release() - free DRAM pages pool
 282  * @ref: pointer to reference object.
 283  *
 284  * This function does the following:
 285  * - Frees the idr structure of physical pages handles.
 286  * - Frees the generic pool of DRAM physical pages.
 287  */
 288 static void dram_pg_pool_do_release(struct kref *ref)
 289 {
 290         struct hl_vm *vm = container_of(ref, struct hl_vm,
 291                         dram_pg_pool_refcount);
 292
 293         /*
 294          * free the idr here as only here we know for sure that there are no
 295          * allocated physical pages and hence there are no handles in use
 296          */
 297         idr_destroy(&vm->phys_pg_pack_handles);
 298         gen_pool_destroy(vm->dram_pg_pool);
 299 }
 300
 301 /**
 302  * free_phys_pg_pack() - free physical page pack.
 303  * @hdev: habanalabs device structure.
 304  * @phys_pg_pack: physical page pack to free.
 305  *
 306  * This function does the following:
 307  * - For DRAM memory only
 308  *   - iterate over the pack, scrub and free each physical block structure by
 309  *     returning it to the general pool.
 310  *     In case of error during scrubbing, initiate hard reset.
 311  *     Once hard reset is triggered, scrubbing is bypassed while freeing the
 312  *     memory continues.
 313  * - Free the hl_vm_phys_pg_pack structure.
 314  */
 315 static int free_phys_pg_pack(struct hl_device *hdev,
 316                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
 317 {
 318         struct hl_vm *vm = &hdev->vm;
 319         u64 i;
 320         int rc = 0;
 321
 322         if (phys_pg_pack->created_from_userptr)
 323                 goto end;
 324
 325         if (phys_pg_pack->contiguous) {
 326                 if (hdev->memory_scrub && !hdev->disabled) {
 327                         rc = hdev->asic_funcs->scrub_device_mem(hdev,
 328                                         phys_pg_pack->pages[0],
 329                                         phys_pg_pack->total_size);
 330                         if (rc)
 331                                 dev_err(hdev->dev,
 332                                         "Failed to scrub contiguous device memory\n");
 333                 }
 334
 335                 gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0],
 336                         phys_pg_pack->total_size);
 337
 338                 for (i = 0; i < phys_pg_pack->npages ; i++)
 339                         kref_put(&vm->dram_pg_pool_refcount,
 340                                 dram_pg_pool_do_release);
 341         } else {
 342                 for (i = 0 ; i < phys_pg_pack->npages ; i++) {
 343                         if (hdev->memory_scrub && !hdev->disabled && rc == 0) {
 344                                 rc = hdev->asic_funcs->scrub_device_mem(
 345                                                 hdev,
 346                                                 phys_pg_pack->pages[i],
 347                                                 phys_pg_pack->page_size);
 348                                 if (rc)
 349                                         dev_err(hdev->dev,
 350                                                 "Failed to scrub device memory\n");
 351                         }
 352                         gen_pool_free(vm->dram_pg_pool,
 353                                 phys_pg_pack->pages[i],
 354                                 phys_pg_pack->page_size);
 355                         kref_put(&vm->dram_pg_pool_refcount,
 356                                 dram_pg_pool_do_release);
 357                 }
 358         }
 359
 360         if (rc && !hdev->disabled)
 361                 hl_device_reset(hdev, HL_DRV_RESET_HARD);
 362
 363 end:
 364         kvfree(phys_pg_pack->pages);
 365         kfree(phys_pg_pack);
 366
 367         return rc;
 368 }
 369
 370 /**
 371  * free_device_memory() - free device memory.
 372  * @ctx: pointer to the context structure.
 373  * @args: host parameters containing the requested size.
 374  *
 375  * This function does the following:
 376  * - Free the device memory related to the given handle.
 377  */
 378 static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
 379 {
 380         struct hl_device *hdev = ctx->hdev;
 381         struct hl_vm *vm = &hdev->vm;
 382         struct hl_vm_phys_pg_pack *phys_pg_pack;
 383         u32 handle = args->free.handle;
 384
 385         spin_lock(&vm->idr_lock);
 386         phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
 387         if (phys_pg_pack) {
 388                 if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) {
 389                         dev_err(hdev->dev, "handle %u is mapped, cannot free\n",
 390                                 handle);
 391                         spin_unlock(&vm->idr_lock);
 392                         return -EINVAL;
 393                 }
 394
 395                 if (phys_pg_pack->exporting_cnt) {
 396                         dev_dbg(hdev->dev, "handle %u is exported, cannot free\n", handle);
 397                         spin_unlock(&vm->idr_lock);
 398                         return -EINVAL;
 399                 }
 400
 401                 /*
 402                  * must remove from idr before the freeing of the physical
 403                  * pages as the refcount of the pool is also the trigger of the
 404                  * idr destroy
 405                  */
 406                 idr_remove(&vm->phys_pg_pack_handles, handle);
 407                 spin_unlock(&vm->idr_lock);
 408
 409                 atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem);
 410                 atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem);
 411
 412                 return free_phys_pg_pack(hdev, phys_pg_pack);
 413         } else {
 414                 spin_unlock(&vm->idr_lock);
 415                 dev_err(hdev->dev,
 416                         "free device memory failed, no match for handle %u\n",
 417                         handle);
 418                 return -EINVAL;
 419         }
 420
 421         return 0;
 422 }
 423
 424 /**
 425  * clear_va_list_locked() - free virtual addresses list.
 426  * @hdev: habanalabs device structure.
 427  * @va_list: list of virtual addresses to free.
 428  *
 429  * This function does the following:
 430  * - Iterate over the list and free each virtual addresses block.
 431  *
 432  * This function should be called only when va_list lock is taken.
 433  */
 434 static void clear_va_list_locked(struct hl_device *hdev,
 435                 struct list_head *va_list)
 436 {
 437         struct hl_vm_va_block *va_block, *tmp;
 438
 439         list_for_each_entry_safe(va_block, tmp, va_list, node) {
 440                 list_del(&va_block->node);
 441                 kfree(va_block);
 442         }
 443 }
 444
 445 /**
 446  * print_va_list_locked() - print virtual addresses list.
 447  * @hdev: habanalabs device structure.
 448  * @va_list: list of virtual addresses to print.
 449  *
 450  * This function does the following:
 451  * - Iterate over the list and print each virtual addresses block.
 452  *
 453  * This function should be called only when va_list lock is taken.
 454  */
 455 static void print_va_list_locked(struct hl_device *hdev,
 456                 struct list_head *va_list)
 457 {
 458 #if HL_MMU_DEBUG
 459         struct hl_vm_va_block *va_block;
 460
 461         dev_dbg(hdev->dev, "print va list:\n");
 462
 463         list_for_each_entry(va_block, va_list, node)
 464                 dev_dbg(hdev->dev,
 465                         "va block, start: 0x%llx, end: 0x%llx, size: %llu\n",
 466                         va_block->start, va_block->end, va_block->size);
 467 #endif
 468 }
 469
 470 /**
 471  * merge_va_blocks_locked() - merge a virtual block if possible.
 472  * @hdev: pointer to the habanalabs device structure.
 473  * @va_list: pointer to the virtual addresses block list.
 474  * @va_block: virtual block to merge with adjacent blocks.
 475  *
 476  * This function does the following:
 477  * - Merge the given blocks with the adjacent blocks if their virtual ranges
 478  *   create a contiguous virtual range.
 479  *
 480  * This Function should be called only when va_list lock is taken.
 481  */
 482 static void merge_va_blocks_locked(struct hl_device *hdev,
 483                 struct list_head *va_list, struct hl_vm_va_block *va_block)
 484 {
 485         struct hl_vm_va_block *prev, *next;
 486
 487         prev = list_prev_entry(va_block, node);
 488         if (&prev->node != va_list && prev->end + 1 == va_block->start) {
 489                 prev->end = va_block->end;
 490                 prev->size = prev->end - prev->start;
 491                 list_del(&va_block->node);
 492                 kfree(va_block);
 493                 va_block = prev;
 494         }
 495
 496         next = list_next_entry(va_block, node);
 497         if (&next->node != va_list && va_block->end + 1 == next->start) {
 498                 next->start = va_block->start;
 499                 next->size = next->end - next->start;
 500                 list_del(&va_block->node);
 501                 kfree(va_block);
 502         }
 503 }
 504
 505 /**
 506  * add_va_block_locked() - add a virtual block to the virtual addresses list.
 507  * @hdev: pointer to the habanalabs device structure.
 508  * @va_list: pointer to the virtual addresses block list.
 509  * @start: start virtual address.
 510  * @end: end virtual address.
 511  *
 512  * This function does the following:
 513  * - Add the given block to the virtual blocks list and merge with other blocks
 514  *   if a contiguous virtual block can be created.
 515  *
 516  * This Function should be called only when va_list lock is taken.
 517  */
 518 static int add_va_block_locked(struct hl_device *hdev,
 519                 struct list_head *va_list, u64 start, u64 end)
 520 {
 521         struct hl_vm_va_block *va_block, *res = NULL;
 522         u64 size = end - start + 1;
 523
 524         print_va_list_locked(hdev, va_list);
 525
 526         list_for_each_entry(va_block, va_list, node) {
 527                 /* TODO: remove upon matureness */
 528                 if (hl_mem_area_crosses_range(start, size, va_block->start,
 529                                 va_block->end)) {
 530                         dev_err(hdev->dev,
 531                                 "block crossing ranges at start 0x%llx, end 0x%llx\n",
 532                                 va_block->start, va_block->end);
 533                         return -EINVAL;
 534                 }
 535
 536                 if (va_block->end < start)
 537                         res = va_block;
 538         }
 539
 540         va_block = kmalloc(sizeof(*va_block), GFP_KERNEL);
 541         if (!va_block)
 542                 return -ENOMEM;
 543
 544         va_block->start = start;
 545         va_block->end = end;
 546         va_block->size = size;
 547
 548         if (!res)
 549                 list_add(&va_block->node, va_list);
 550         else
 551                 list_add(&va_block->node, &res->node);
 552
 553         merge_va_blocks_locked(hdev, va_list, va_block);
 554
 555         print_va_list_locked(hdev, va_list);
 556
 557         return 0;
 558 }
 559
 560 /**
 561  * add_va_block() - wrapper for add_va_block_locked.
 562  * @hdev: pointer to the habanalabs device structure.
 563  * @va_range: pointer to the virtual addresses range object.
 564  * @start: start virtual address.
 565  * @end: end virtual address.
 566  *
 567  * This function does the following:
 568  * - Takes the list lock and calls add_va_block_locked.
 569  */
 570 static inline int add_va_block(struct hl_device *hdev,
 571                 struct hl_va_range *va_range, u64 start, u64 end)
 572 {
 573         int rc;
 574
 575         mutex_lock(&va_range->lock);
 576         rc = add_va_block_locked(hdev, &va_range->list, start, end);
 577         mutex_unlock(&va_range->lock);
 578
 579         return rc;
 580 }
 581
 582 /**
 583  * is_hint_crossing_range() - check if hint address crossing specified reserved.
 584  * @range_type: virtual space range type.
 585  * @start_addr: start virtual address.
 586  * @size: block size.
 587  * @prop: asic properties structure to retrieve reserved ranges from.
 588  */
 589 static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
 590                 u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
 591         bool range_cross;
 592
 593         if (range_type == HL_VA_RANGE_TYPE_DRAM)
 594                 range_cross =
 595                         hl_mem_area_crosses_range(start_addr, size,
 596                         prop->hints_dram_reserved_va_range.start_addr,
 597                         prop->hints_dram_reserved_va_range.end_addr);
 598         else if (range_type == HL_VA_RANGE_TYPE_HOST)
 599                 range_cross =
 600                         hl_mem_area_crosses_range(start_addr,   size,
 601                         prop->hints_host_reserved_va_range.start_addr,
 602                         prop->hints_host_reserved_va_range.end_addr);
 603         else
 604                 range_cross =
 605                         hl_mem_area_crosses_range(start_addr, size,
 606                         prop->hints_host_hpage_reserved_va_range.start_addr,
 607                         prop->hints_host_hpage_reserved_va_range.end_addr);
 608
 609         return range_cross;
 610 }
 611
 612 /**
 613  * get_va_block() - get a virtual block for the given size and alignment.
 614  *
 615  * @hdev: pointer to the habanalabs device structure.
 616  * @va_range: pointer to the virtual addresses range.
 617  * @size: requested block size.
 618  * @hint_addr: hint for requested address by the user.
 619  * @va_block_align: required alignment of the virtual block start address.
 620  * @range_type: va range type (host, dram)
 621  * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT
 622  *
 623  * This function does the following:
 624  * - Iterate on the virtual block list to find a suitable virtual block for the
 625  *   given size, hint address and alignment.
 626  * - Reserve the requested block and update the list.
 627  * - Return the start address of the virtual block.
 628  */
 629 static u64 get_va_block(struct hl_device *hdev,
 630                                 struct hl_va_range *va_range,
 631                                 u64 size, u64 hint_addr, u32 va_block_align,
 632                                 enum hl_va_range_type range_type,
 633                                 u32 flags)
 634 {
 635         struct hl_vm_va_block *va_block, *new_va_block = NULL;
 636         struct asic_fixed_properties *prop = &hdev->asic_prop;
 637         u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end,
 638                 align_mask, reserved_valid_start = 0, reserved_valid_size = 0,
 639                 dram_hint_mask = prop->dram_hints_align_mask;
 640         bool add_prev = false;
 641         bool is_align_pow_2  = is_power_of_2(va_range->page_size);
 642         bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
 643         bool force_hint = flags & HL_MEM_FORCE_HINT;
 644
 645         if (is_align_pow_2)
 646                 align_mask = ~((u64)va_block_align - 1);
 647         else
 648                 /*
 649                  * with non-power-of-2 range we work only with page granularity
 650                  * and the start address is page aligned,
 651                  * so no need for alignment checking.
 652                  */
 653                 size = DIV_ROUND_UP_ULL(size, va_range->page_size) *
 654                                                         va_range->page_size;
 655
 656         tmp_hint_addr = hint_addr & ~dram_hint_mask;
 657
 658         /* Check if we need to ignore hint address */
 659         if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) ||
 660                 (!is_align_pow_2 && is_hint_dram_addr &&
 661                         do_div(tmp_hint_addr, va_range->page_size))) {
 662
 663                 if (force_hint) {
 664                         /* Hint must be respected, so here we just fail */
 665                         dev_err(hdev->dev,
 666                                 "Hint address 0x%llx is not page aligned - cannot be respected\n",
 667                                 hint_addr);
 668                         return 0;
 669                 }
 670
 671                 dev_dbg(hdev->dev,
 672                         "Hint address 0x%llx will be ignored because it is not aligned\n",
 673                         hint_addr);
 674                 hint_addr = 0;
 675         }
 676
 677         mutex_lock(&va_range->lock);
 678
 679         print_va_list_locked(hdev, &va_range->list);
 680
 681         list_for_each_entry(va_block, &va_range->list, node) {
 682                 /* Calc the first possible aligned addr */
 683                 valid_start = va_block->start;
 684
 685                 if (is_align_pow_2 && (valid_start & (va_block_align - 1))) {
 686                         valid_start &= align_mask;
 687                         valid_start += va_block_align;
 688                         if (valid_start > va_block->end)
 689                                 continue;
 690                 }
 691
 692                 valid_size = va_block->end - valid_start + 1;
 693                 if (valid_size < size)
 694                         continue;
 695
 696                 /*
 697                  * In case hint address is 0, and hints_range_reservation
 698                  * property enabled, then avoid allocating va blocks from the
 699                  * range reserved for hint addresses
 700                  */
 701                 if (prop->hints_range_reservation && !hint_addr)
 702                         if (is_hint_crossing_range(range_type, valid_start,
 703                                         size, prop))
 704                                 continue;
 705
 706                 /* Pick the minimal length block which has the required size */
 707                 if (!new_va_block || (valid_size < reserved_valid_size)) {
 708                         new_va_block = va_block;
 709                         reserved_valid_start = valid_start;
 710                         reserved_valid_size = valid_size;
 711                 }
 712
 713                 if (hint_addr && hint_addr >= valid_start &&
 714                                         (hint_addr + size) <= va_block->end) {
 715                         new_va_block = va_block;
 716                         reserved_valid_start = hint_addr;
 717                         reserved_valid_size = valid_size;
 718                         break;
 719                 }
 720         }
 721
 722         if (!new_va_block) {
 723                 dev_err(hdev->dev, "no available va block for size %llu\n",
 724                                                                 size);
 725                 goto out;
 726         }
 727
 728         if (force_hint && reserved_valid_start != hint_addr) {
 729                 /* Hint address must be respected. If we are here - this means
 730                  * we could not respect it.
 731                  */
 732                 dev_err(hdev->dev,
 733                         "Hint address 0x%llx could not be respected\n",
 734                         hint_addr);
 735                 reserved_valid_start = 0;
 736                 goto out;
 737         }
 738
 739         /*
 740          * Check if there is some leftover range due to reserving the new
 741          * va block, then return it to the main virtual addresses list.
 742          */
 743         if (reserved_valid_start > new_va_block->start) {
 744                 prev_start = new_va_block->start;
 745                 prev_end = reserved_valid_start - 1;
 746
 747                 new_va_block->start = reserved_valid_start;
 748                 new_va_block->size = reserved_valid_size;
 749
 750                 add_prev = true;
 751         }
 752
 753         if (new_va_block->size > size) {
 754                 new_va_block->start += size;
 755                 new_va_block->size = new_va_block->end - new_va_block->start + 1;
 756         } else {
 757                 list_del(&new_va_block->node);
 758                 kfree(new_va_block);
 759         }
 760
 761         if (add_prev)
 762                 add_va_block_locked(hdev, &va_range->list, prev_start,
 763                                 prev_end);
 764
 765         print_va_list_locked(hdev, &va_range->list);
 766 out:
 767         mutex_unlock(&va_range->lock);
 768
 769         return reserved_valid_start;
 770 }
 771
 772 /*
 773  * hl_reserve_va_block() - reserve a virtual block of a given size.
 774  * @hdev: pointer to the habanalabs device structure.
 775  * @ctx: current context
 776  * @type: virtual addresses range type.
 777  * @size: requested block size.
 778  * @alignment: required alignment in bytes of the virtual block start address,
 779  *             0 means no alignment.
 780  *
 781  * This function does the following:
 782  * - Iterate on the virtual block list to find a suitable virtual block for the
 783  *   given size and alignment.
 784  * - Reserve the requested block and update the list.
 785  * - Return the start address of the virtual block.
 786  */
 787 u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
 788                 enum hl_va_range_type type, u32 size, u32 alignment)
 789 {
 790         return get_va_block(hdev, ctx->va_range[type], size, 0,
 791                         max(alignment, ctx->va_range[type]->page_size),
 792                         type, 0);
 793 }
 794
 795 /**
 796  * hl_get_va_range_type() - get va_range type for the given address and size.
 797  * @ctx: context to fetch va_range from.
 798  * @address: the start address of the area we want to validate.
 799  * @size: the size in bytes of the area we want to validate.
 800  * @type: returned va_range type.
 801  *
 802  * Return: true if the area is inside a valid range, false otherwise.
 803  */
 804 static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size,
 805                         enum hl_va_range_type *type)
 806 {
 807         int i;
 808
 809         for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX; i++) {
 810                 if (hl_mem_area_inside_range(address, size,
 811                                 ctx->va_range[i]->start_addr,
 812                                 ctx->va_range[i]->end_addr)) {
 813                         *type = i;
 814                         return 0;
 815                 }
 816         }
 817
 818         return -EINVAL;
 819 }
 820
 821 /**
 822  * hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block.
 823  * @hdev: pointer to the habanalabs device structure
 824  * @ctx: pointer to the context structure.
 825  * @start_addr: start virtual address.
 826  * @size: number of bytes to unreserve.
 827  *
 828  * This function does the following:
 829  * - Takes the list lock and calls add_va_block_locked.
 830  */
 831 int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
 832                 u64 start_addr, u64 size)
 833 {
 834         enum hl_va_range_type type;
 835         int rc;
 836
 837         rc = hl_get_va_range_type(ctx, start_addr, size, &type);
 838         if (rc) {
 839                 dev_err(hdev->dev,
 840                         "cannot find va_range for va %#llx size %llu",
 841                         start_addr, size);
 842                 return rc;
 843         }
 844
 845         rc = add_va_block(hdev, ctx->va_range[type], start_addr,
 846                                                 start_addr + size - 1);
 847         if (rc)
 848                 dev_warn(hdev->dev,
 849                         "add va block failed for vaddr: 0x%llx\n", start_addr);
 850
 851         return rc;
 852 }
 853
 854 /**
 855  * init_phys_pg_pack_from_userptr() - initialize physical page pack from host
 856  *                                    memory
 857  * @ctx: pointer to the context structure.
 858  * @userptr: userptr to initialize from.
 859  * @pphys_pg_pack: result pointer.
 860  * @force_regular_page: tell the function to ignore huge page optimization,
 861  *                      even if possible. Needed for cases where the device VA
 862  *                      is allocated before we know the composition of the
 863  *                      physical pages
 864  *
 865  * This function does the following:
 866  * - Pin the physical pages related to the given virtual block.
 867  * - Create a physical page pack from the physical pages related to the given
 868  *   virtual block.
 869  */
 870 static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 871                                 struct hl_userptr *userptr,
 872                                 struct hl_vm_phys_pg_pack **pphys_pg_pack,
 873                                 bool force_regular_page)
 874 {
 875         u32 npages, page_size = PAGE_SIZE,
 876                 huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
 877         u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
 878         struct hl_vm_phys_pg_pack *phys_pg_pack;
 879         bool first = true, is_huge_page_opt;
 880         u64 page_mask, total_npages;
 881         struct scatterlist *sg;
 882         dma_addr_t dma_addr;
 883         int rc, i, j;
 884
 885         phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
 886         if (!phys_pg_pack)
 887                 return -ENOMEM;
 888
 889         phys_pg_pack->vm_type = userptr->vm_type;
 890         phys_pg_pack->created_from_userptr = true;
 891         phys_pg_pack->asid = ctx->asid;
 892         atomic_set(&phys_pg_pack->mapping_cnt, 1);
 893
 894         is_huge_page_opt = (force_regular_page ? false : true);
 895
 896         /* Only if all dma_addrs are aligned to 2MB and their
 897          * sizes is at least 2MB, we can use huge page mapping.
 898          * We limit the 2MB optimization to this condition,
 899          * since later on we acquire the related VA range as one
 900          * consecutive block.
 901          */
 902         total_npages = 0;
 903         for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
 904                 npages = hl_get_sg_info(sg, &dma_addr);
 905
 906                 total_npages += npages;
 907
 908                 if ((npages % pgs_in_huge_page) ||
 909                                         (dma_addr & (huge_page_size - 1)))
 910                         is_huge_page_opt = false;
 911         }
 912
 913         if (is_huge_page_opt) {
 914                 page_size = huge_page_size;
 915                 do_div(total_npages, pgs_in_huge_page);
 916         }
 917
 918         page_mask = ~(((u64) page_size) - 1);
 919
 920         phys_pg_pack->pages = kvmalloc_array(total_npages, sizeof(u64),
 921                                                 GFP_KERNEL);
 922         if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
 923                 rc = -ENOMEM;
 924                 goto page_pack_arr_mem_err;
 925         }
 926
 927         phys_pg_pack->npages = total_npages;
 928         phys_pg_pack->page_size = page_size;
 929         phys_pg_pack->total_size = total_npages * page_size;
 930
 931         j = 0;
 932         for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
 933                 npages = hl_get_sg_info(sg, &dma_addr);
 934
 935                 /* align down to physical page size and save the offset */
 936                 if (first) {
 937                         first = false;
 938                         phys_pg_pack->offset = dma_addr & (page_size - 1);
 939                         dma_addr &= page_mask;
 940                 }
 941
 942                 while (npages) {
 943                         phys_pg_pack->pages[j++] = dma_addr;
 944                         dma_addr += page_size;
 945
 946                         if (is_huge_page_opt)
 947                                 npages -= pgs_in_huge_page;
 948                         else
 949                                 npages--;
 950                 }
 951         }
 952
 953         *pphys_pg_pack = phys_pg_pack;
 954
 955         return 0;
 956
 957 page_pack_arr_mem_err:
 958         kfree(phys_pg_pack);
 959
 960         return rc;
 961 }
 962
 963 /**
 964  * map_phys_pg_pack() - maps the physical page pack..
 965  * @ctx: pointer to the context structure.
 966  * @vaddr: start address of the virtual area to map from.
 967  * @phys_pg_pack: the pack of physical pages to map to.
 968  *
 969  * This function does the following:
 970  * - Maps each chunk of virtual memory to matching physical chunk.
 971  * - Stores number of successful mappings in the given argument.
 972  * - Returns 0 on success, error code otherwise.
 973  */
 974 static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
 975                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
 976 {
 977         struct hl_device *hdev = ctx->hdev;
 978         u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i;
 979         u32 page_size = phys_pg_pack->page_size;
 980         int rc = 0;
 981         bool is_host_addr;
 982
 983         for (i = 0 ; i < phys_pg_pack->npages ; i++) {
 984                 paddr = phys_pg_pack->pages[i];
 985
 986                 rc = hl_mmu_map_page(ctx, next_vaddr, paddr, page_size,
 987                                 (i + 1) == phys_pg_pack->npages);
 988                 if (rc) {
 989                         dev_err(hdev->dev,
 990                                 "map failed for handle %u, npages: %llu, mapped: %llu",
 991                                 phys_pg_pack->handle, phys_pg_pack->npages,
 992                                 mapped_pg_cnt);
 993                         goto err;
 994                 }
 995
 996                 mapped_pg_cnt++;
 997                 next_vaddr += page_size;
 998         }
 999
1000         return 0;
1001
1002 err:
1003         is_host_addr = !hl_is_dram_va(hdev, vaddr);
1004
1005         next_vaddr = vaddr;
1006         for (i = 0 ; i < mapped_pg_cnt ; i++) {
1007                 if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
1008                                         (i + 1) == mapped_pg_cnt))
1009                         dev_warn_ratelimited(hdev->dev,
1010                                 "failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",
1011                                         phys_pg_pack->handle, next_vaddr,
1012                                         phys_pg_pack->pages[i], page_size);
1013
1014                 next_vaddr += page_size;
1015
1016                 /*
1017                  * unmapping on Palladium can be really long, so avoid a CPU
1018                  * soft lockup bug by sleeping a little between unmapping pages
1019                  *
1020                  * In addition, on host num of pages could be huge,
1021                  * because page size could be 4KB, so when unmapping host
1022                  * pages sleep every 32K pages to avoid soft lockup
1023                  */
1024                 if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
1025                         usleep_range(50, 200);
1026         }
1027
1028         return rc;
1029 }
1030
1031 /**
1032  * unmap_phys_pg_pack() - unmaps the physical page pack.
1033  * @ctx: pointer to the context structure.
1034  * @vaddr: start address of the virtual area to unmap.
1035  * @phys_pg_pack: the pack of physical pages to unmap.
1036  */
1037 static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
1038                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
1039 {
1040         struct hl_device *hdev = ctx->hdev;
1041         u64 next_vaddr, i;
1042         bool is_host_addr;
1043         u32 page_size;
1044
1045         is_host_addr = !hl_is_dram_va(hdev, vaddr);
1046         page_size = phys_pg_pack->page_size;
1047         next_vaddr = vaddr;
1048
1049         for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
1050                 if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
1051                                        (i + 1) == phys_pg_pack->npages))
1052                         dev_warn_ratelimited(hdev->dev,
1053                         "unmap failed for vaddr: 0x%llx\n", next_vaddr);
1054
1055                 /*
1056                  * unmapping on Palladium can be really long, so avoid a CPU
1057                  * soft lockup bug by sleeping a little between unmapping pages
1058                  *
1059                  * In addition, on host num of pages could be huge,
1060                  * because page size could be 4KB, so when unmapping host
1061                  * pages sleep every 32K pages to avoid soft lockup
1062                  */
1063                 if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
1064                         usleep_range(50, 200);
1065         }
1066 }
1067
1068 static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
1069                                         u64 *paddr)
1070 {
1071         struct hl_device *hdev = ctx->hdev;
1072         struct hl_vm *vm = &hdev->vm;
1073         struct hl_vm_phys_pg_pack *phys_pg_pack;
1074         u32 handle;
1075
1076         handle = lower_32_bits(args->map_device.handle);
1077         spin_lock(&vm->idr_lock);
1078         phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
1079         if (!phys_pg_pack) {
1080                 spin_unlock(&vm->idr_lock);
1081                 dev_err(hdev->dev, "no match for handle %u\n", handle);
1082                 return -EINVAL;
1083         }
1084
1085         *paddr = phys_pg_pack->pages[0];
1086
1087         spin_unlock(&vm->idr_lock);
1088
1089         return 0;
1090 }
1091
1092 /**
1093  * map_device_va() - map the given memory.
1094  * @ctx: pointer to the context structure.
1095  * @args: host parameters with handle/host virtual address.
1096  * @device_addr: pointer to result device virtual address.
1097  *
1098  * This function does the following:
1099  * - If given a physical device memory handle, map to a device virtual block
1100  *   and return the start address of this block.
1101  * - If given a host virtual address and size, find the related physical pages,
1102  *   map a device virtual block to this pages and return the start address of
1103  *   this block.
1104  */
1105 static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
1106                 u64 *device_addr)
1107 {
1108         struct hl_device *hdev = ctx->hdev;
1109         struct hl_vm *vm = &hdev->vm;
1110         struct hl_vm_phys_pg_pack *phys_pg_pack;
1111         struct hl_userptr *userptr = NULL;
1112         struct hl_vm_hash_node *hnode;
1113         struct hl_va_range *va_range;
1114         enum vm_type *vm_type;
1115         u64 ret_vaddr, hint_addr;
1116         u32 handle = 0, va_block_align;
1117         int rc;
1118         bool is_userptr = args->flags & HL_MEM_USERPTR;
1119         enum hl_va_range_type va_range_type = 0;
1120
1121         /* Assume failure */
1122         *device_addr = 0;
1123
1124         if (is_userptr) {
1125                 u64 addr = args->map_host.host_virt_addr,
1126                         size = args->map_host.mem_size;
1127                 u32 page_size = hdev->asic_prop.pmmu.page_size,
1128                         huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
1129
1130                 rc = dma_map_host_va(hdev, addr, size, &userptr);
1131                 if (rc) {
1132                         dev_err(hdev->dev, "failed to get userptr from va\n");
1133                         return rc;
1134                 }
1135
1136                 rc = init_phys_pg_pack_from_userptr(ctx, userptr,
1137                                 &phys_pg_pack, false);
1138                 if (rc) {
1139                         dev_err(hdev->dev,
1140                                 "unable to init page pack for vaddr 0x%llx\n",
1141                                 addr);
1142                         goto init_page_pack_err;
1143                 }
1144
1145                 vm_type = (enum vm_type *) userptr;
1146                 hint_addr = args->map_host.hint_addr;
1147                 handle = phys_pg_pack->handle;
1148
1149                 /* get required alignment */
1150                 if (phys_pg_pack->page_size == page_size) {
1151                         va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
1152                         va_range_type = HL_VA_RANGE_TYPE_HOST;
1153                         /*
1154                          * huge page alignment may be needed in case of regular
1155                          * page mapping, depending on the host VA alignment
1156                          */
1157                         if (addr & (huge_page_size - 1))
1158                                 va_block_align = page_size;
1159                         else
1160                                 va_block_align = huge_page_size;
1161                 } else {
1162                         /*
1163                          * huge page alignment is needed in case of huge page
1164                          * mapping
1165                          */
1166                         va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
1167                         va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE;
1168                         va_block_align = huge_page_size;
1169                 }
1170         } else {
1171                 handle = lower_32_bits(args->map_device.handle);
1172
1173                 spin_lock(&vm->idr_lock);
1174                 phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
1175                 if (!phys_pg_pack) {
1176                         spin_unlock(&vm->idr_lock);
1177                         dev_err(hdev->dev,
1178                                 "no match for handle %u\n", handle);
1179                         return -EINVAL;
1180                 }
1181
1182                 /* increment now to avoid freeing device memory while mapping */
1183                 atomic_inc(&phys_pg_pack->mapping_cnt);
1184
1185                 spin_unlock(&vm->idr_lock);
1186
1187                 vm_type = (enum vm_type *) phys_pg_pack;
1188
1189                 hint_addr = args->map_device.hint_addr;
1190
1191                 /* DRAM VA alignment is the same as the MMU page size */
1192                 va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
1193                 va_range_type = HL_VA_RANGE_TYPE_DRAM;
1194                 va_block_align = hdev->asic_prop.dmmu.page_size;
1195         }
1196
1197         /*
1198          * relevant for mapping device physical memory only, as host memory is
1199          * implicitly shared
1200          */
1201         if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) &&
1202                         phys_pg_pack->asid != ctx->asid) {
1203                 dev_err(hdev->dev,
1204                         "Failed to map memory, handle %u is not shared\n",
1205                         handle);
1206                 rc = -EPERM;
1207                 goto shared_err;
1208         }
1209
1210         hnode = kzalloc(sizeof(*hnode), GFP_KERNEL);
1211         if (!hnode) {
1212                 rc = -ENOMEM;
1213                 goto hnode_err;
1214         }
1215
1216         if (hint_addr && phys_pg_pack->offset) {
1217                 if (args->flags & HL_MEM_FORCE_HINT) {
1218                         /* Fail if hint must be respected but it can't be */
1219                         dev_err(hdev->dev,
1220                                 "Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",
1221                                 hint_addr, phys_pg_pack->offset);
1222                         rc = -EINVAL;
1223                         goto va_block_err;
1224                 }
1225                 dev_dbg(hdev->dev,
1226                         "Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n",
1227                         hint_addr, phys_pg_pack->offset);
1228         }
1229
1230         ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
1231                                         hint_addr, va_block_align,
1232                                         va_range_type, args->flags);
1233         if (!ret_vaddr) {
1234                 dev_err(hdev->dev, "no available va block for handle %u\n",
1235                                 handle);
1236                 rc = -ENOMEM;
1237                 goto va_block_err;
1238         }
1239
1240         mutex_lock(&ctx->mmu_lock);
1241
1242         rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
1243         if (rc) {
1244                 mutex_unlock(&ctx->mmu_lock);
1245                 dev_err(hdev->dev, "mapping page pack failed for handle %u\n",
1246                                 handle);
1247                 goto map_err;
1248         }
1249
1250         rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV,
1251                                 ctx->asid, ret_vaddr, phys_pg_pack->total_size);
1252
1253         mutex_unlock(&ctx->mmu_lock);
1254
1255         if (rc)
1256                 goto map_err;
1257
1258         ret_vaddr += phys_pg_pack->offset;
1259
1260         hnode->ptr = vm_type;
1261         hnode->vaddr = ret_vaddr;
1262
1263         mutex_lock(&ctx->mem_hash_lock);
1264         hash_add(ctx->mem_hash, &hnode->node, ret_vaddr);
1265         mutex_unlock(&ctx->mem_hash_lock);
1266
1267         *device_addr = ret_vaddr;
1268
1269         if (is_userptr)
1270                 rc = free_phys_pg_pack(hdev, phys_pg_pack);
1271
1272         return rc;
1273
1274 map_err:
1275         if (add_va_block(hdev, va_range, ret_vaddr,
1276                                 ret_vaddr + phys_pg_pack->total_size - 1))
1277                 dev_warn(hdev->dev,
1278                         "release va block failed for handle 0x%x, vaddr: 0x%llx\n",
1279                                 handle, ret_vaddr);
1280
1281 va_block_err:
1282         kfree(hnode);
1283 hnode_err:
1284 shared_err:
1285         atomic_dec(&phys_pg_pack->mapping_cnt);
1286         if (is_userptr)
1287                 free_phys_pg_pack(hdev, phys_pg_pack);
1288 init_page_pack_err:
1289         if (is_userptr)
1290                 dma_unmap_host_va(hdev, userptr);
1291
1292         return rc;
1293 }
1294
1295 /**
1296  * unmap_device_va() - unmap the given device virtual address.
1297  * @ctx: pointer to the context structure.
1298  * @args: host parameters with device virtual address to unmap.
1299  * @ctx_free: true if in context free flow, false otherwise.
1300  *
1301  * This function does the following:
1302  * - unmap the physical pages related to the given virtual address.
1303  * - return the device virtual block to the virtual block list.
1304  */
1305 static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
1306                                 bool ctx_free)
1307 {
1308         struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
1309         u64 vaddr = args->unmap.device_virt_addr;
1310         struct hl_vm_hash_node *hnode = NULL;
1311         struct asic_fixed_properties *prop;
1312         struct hl_device *hdev = ctx->hdev;
1313         struct hl_userptr *userptr = NULL;
1314         struct hl_va_range *va_range;
1315         enum vm_type *vm_type;
1316         bool is_userptr;
1317         int rc = 0;
1318
1319         prop = &hdev->asic_prop;
1320
1321         /* protect from double entrance */
1322         mutex_lock(&ctx->mem_hash_lock);
1323         hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr)
1324                 if (vaddr == hnode->vaddr)
1325                         break;
1326
1327         if (!hnode) {
1328                 mutex_unlock(&ctx->mem_hash_lock);
1329                 dev_err(hdev->dev,
1330                         "unmap failed, no mem hnode for vaddr 0x%llx\n",
1331                         vaddr);
1332                 return -EINVAL;
1333         }
1334
1335         hash_del(&hnode->node);
1336         mutex_unlock(&ctx->mem_hash_lock);
1337
1338         vm_type = hnode->ptr;
1339
1340         if (*vm_type == VM_TYPE_USERPTR) {
1341                 is_userptr = true;
1342                 userptr = hnode->ptr;
1343
1344                 rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack,
1345                                                         false);
1346                 if (rc) {
1347                         dev_err(hdev->dev,
1348                                 "unable to init page pack for vaddr 0x%llx\n",
1349                                 vaddr);
1350                         goto vm_type_err;
1351                 }
1352
1353                 if (phys_pg_pack->page_size ==
1354                                         hdev->asic_prop.pmmu.page_size)
1355                         va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
1356                 else
1357                         va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
1358         } else if (*vm_type == VM_TYPE_PHYS_PACK) {
1359                 is_userptr = false;
1360                 va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
1361                 phys_pg_pack = hnode->ptr;
1362         } else {
1363                 dev_warn(hdev->dev,
1364                         "unmap failed, unknown vm desc for vaddr 0x%llx\n",
1365                                 vaddr);
1366                 rc = -EFAULT;
1367                 goto vm_type_err;
1368         }
1369
1370         if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) {
1371                 dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr);
1372                 rc = -EINVAL;
1373                 goto mapping_cnt_err;
1374         }
1375
1376         if (!is_userptr && !is_power_of_2(phys_pg_pack->page_size))
1377                 vaddr = prop->dram_base_address +
1378                         DIV_ROUND_DOWN_ULL(vaddr - prop->dram_base_address,
1379                                                 phys_pg_pack->page_size) *
1380                                                         phys_pg_pack->page_size;
1381         else
1382                 vaddr &= ~(((u64) phys_pg_pack->page_size) - 1);
1383
1384         mutex_lock(&ctx->mmu_lock);
1385
1386         unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
1387
1388         /*
1389          * During context free this function is called in a loop to clean all
1390          * the context mappings. Hence the cache invalidation can be called once
1391          * at the loop end rather than for each iteration
1392          */
1393         if (!ctx_free)
1394                 rc = hl_mmu_invalidate_cache_range(hdev, true, *vm_type, ctx->asid, vaddr,
1395                                                         phys_pg_pack->total_size);
1396
1397         mutex_unlock(&ctx->mmu_lock);
1398
1399         /*
1400          * If the context is closing we don't need to check for the MMU cache
1401          * invalidation return code and update the VA free list as in this flow
1402          * we invalidate the MMU cache outside of this unmap function and the VA
1403          * free list will be freed anyway.
1404          */
1405         if (!ctx_free) {
1406                 int tmp_rc;
1407
1408                 tmp_rc = add_va_block(hdev, va_range, vaddr,
1409                                         vaddr + phys_pg_pack->total_size - 1);
1410                 if (tmp_rc) {
1411                         dev_warn(hdev->dev,
1412                                         "add va block failed for vaddr: 0x%llx\n",
1413                                         vaddr);
1414                         if (!rc)
1415                                 rc = tmp_rc;
1416                 }
1417         }
1418
1419         atomic_dec(&phys_pg_pack->mapping_cnt);
1420         kfree(hnode);
1421
1422         if (is_userptr) {
1423                 free_phys_pg_pack(hdev, phys_pg_pack);
1424                 dma_unmap_host_va(hdev, userptr);
1425         }
1426
1427         return rc;
1428
1429 mapping_cnt_err:
1430         if (is_userptr)
1431                 free_phys_pg_pack(hdev, phys_pg_pack);
1432 vm_type_err:
1433         mutex_lock(&ctx->mem_hash_lock);
1434         hash_add(ctx->mem_hash, &hnode->node, vaddr);
1435         mutex_unlock(&ctx->mem_hash_lock);
1436
1437         return rc;
1438 }
1439
1440 static int map_block(struct hl_device *hdev, u64 address, u64 *handle,
1441                         u32 *size)
1442 {
1443         u32 block_id = 0;
1444         int rc;
1445
1446         rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id);
1447
1448         *handle = block_id | HL_MMAP_TYPE_BLOCK;
1449         *handle <<= PAGE_SHIFT;
1450
1451         return rc;
1452 }
1453
1454 static void hw_block_vm_close(struct vm_area_struct *vma)
1455 {
1456         struct hl_vm_hw_block_list_node *lnode =
1457                 (struct hl_vm_hw_block_list_node *) vma->vm_private_data;
1458         struct hl_ctx *ctx = lnode->ctx;
1459
1460         mutex_lock(&ctx->hw_block_list_lock);
1461         list_del(&lnode->node);
1462         mutex_unlock(&ctx->hw_block_list_lock);
1463         hl_ctx_put(ctx);
1464         kfree(lnode);
1465         vma->vm_private_data = NULL;
1466 }
1467
1468 static const struct vm_operations_struct hw_block_vm_ops = {
1469         .close = hw_block_vm_close
1470 };
1471
1472 /**
1473  * hl_hw_block_mmap() - mmap a hw block to user.
1474  * @hpriv: pointer to the private data of the fd
1475  * @vma: pointer to vm_area_struct of the process
1476  *
1477  * Driver increments context reference for every HW block mapped in order
1478  * to prevent user from closing FD without unmapping first
1479  */
1480 int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
1481 {
1482         struct hl_vm_hw_block_list_node *lnode;
1483         struct hl_device *hdev = hpriv->hdev;
1484         struct hl_ctx *ctx = hpriv->ctx;
1485         u32 block_id, block_size;
1486         int rc;
1487
1488         /* We use the page offset to hold the block id and thus we need to clear
1489          * it before doing the mmap itself
1490          */
1491         block_id = vma->vm_pgoff;
1492         vma->vm_pgoff = 0;
1493
1494         /* Driver only allows mapping of a complete HW block */
1495         block_size = vma->vm_end - vma->vm_start;
1496
1497         if (!access_ok((void __user *) (uintptr_t) vma->vm_start, block_size)) {
1498                 dev_err(hdev->dev,
1499                         "user pointer is invalid - 0x%lx\n",
1500                         vma->vm_start);
1501
1502                 return -EINVAL;
1503         }
1504
1505         lnode = kzalloc(sizeof(*lnode), GFP_KERNEL);
1506         if (!lnode)
1507                 return -ENOMEM;
1508
1509         vma->vm_ops = &hw_block_vm_ops;
1510         vma->vm_private_data = lnode;
1511
1512         hl_ctx_get(hdev, ctx);
1513
1514         rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size);
1515         if (rc) {
1516                 hl_ctx_put(ctx);
1517                 kfree(lnode);
1518                 return rc;
1519         }
1520
1521         lnode->ctx = ctx;
1522         lnode->vaddr = vma->vm_start;
1523         lnode->size = block_size;
1524         lnode->id = block_id;
1525
1526         mutex_lock(&ctx->hw_block_list_lock);
1527         list_add_tail(&lnode->node, &ctx->hw_block_mem_list);
1528         mutex_unlock(&ctx->hw_block_list_lock);
1529
1530         vma->vm_pgoff = block_id;
1531
1532         return 0;
1533 }
1534
1535 static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size,
1536                         struct device *dev, enum dma_data_direction dir)
1537 {
1538         dma_addr_t addr;
1539         int rc;
1540
1541         addr = dma_map_resource(dev, bar_address, chunk_size, dir,
1542                                 DMA_ATTR_SKIP_CPU_SYNC);
1543         rc = dma_mapping_error(dev, addr);
1544         if (rc)
1545                 return rc;
1546
1547         sg_set_page(sg, NULL, chunk_size, 0);
1548         sg_dma_address(sg) = addr;
1549         sg_dma_len(sg) = chunk_size;
1550
1551         return 0;
1552 }
1553
1554 static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev, u64 *pages, u64 npages,
1555                                                 u64 page_size, struct device *dev,
1556                                                 enum dma_data_direction dir)
1557 {
1558         u64 chunk_size, bar_address, dma_max_seg_size;
1559         struct asic_fixed_properties *prop;
1560         int rc, i, j, nents, cur_page;
1561         struct scatterlist *sg;
1562         struct sg_table *sgt;
1563
1564         prop = &hdev->asic_prop;
1565
1566         dma_max_seg_size = dma_get_max_seg_size(dev);
1567
1568         /* We would like to align the max segment size to PAGE_SIZE, so the
1569          * SGL will contain aligned addresses that can be easily mapped to
1570          * an MMU
1571          */
1572         dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE);
1573         if (dma_max_seg_size < PAGE_SIZE) {
1574                 dev_err_ratelimited(hdev->dev,
1575                                 "dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
1576                                 dma_max_seg_size);
1577                 return ERR_PTR(-EINVAL);
1578         }
1579
1580         sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
1581         if (!sgt)
1582                 return ERR_PTR(-ENOMEM);
1583
1584         /* If the size of each page is larger than the dma max segment size,
1585          * then we can't combine pages and the number of entries in the SGL
1586          * will just be the
1587          * <number of pages> * <chunks of max segment size in each page>
1588          */
1589         if (page_size > dma_max_seg_size)
1590                 nents = npages * DIV_ROUND_UP_ULL(page_size, dma_max_seg_size);
1591         else
1592                 /* Get number of non-contiguous chunks */
1593                 for (i = 1, nents = 1, chunk_size = page_size ; i < npages ; i++) {
1594                         if (pages[i - 1] + page_size != pages[i] ||
1595                                         chunk_size + page_size > dma_max_seg_size) {
1596                                 nents++;
1597                                 chunk_size = page_size;
1598                                 continue;
1599                         }
1600
1601                         chunk_size += page_size;
1602                 }
1603
1604         rc = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
1605         if (rc)
1606                 goto error_free;
1607
1608         cur_page = 0;
1609
1610         if (page_size > dma_max_seg_size) {
1611                 u64 size_left, cur_device_address = 0;
1612
1613                 size_left = page_size;
1614
1615                 /* Need to split each page into the number of chunks of
1616                  * dma_max_seg_size
1617                  */
1618                 for_each_sgtable_dma_sg(sgt, sg, i) {
1619                         if (size_left == page_size)
1620                                 cur_device_address =
1621                                         pages[cur_page] - prop->dram_base_address;
1622                         else
1623                                 cur_device_address += dma_max_seg_size;
1624
1625                         chunk_size = min(size_left, dma_max_seg_size);
1626
1627                         bar_address = hdev->dram_pci_bar_start + cur_device_address;
1628
1629                         rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
1630                         if (rc)
1631                                 goto error_unmap;
1632
1633                         if (size_left > dma_max_seg_size) {
1634                                 size_left -= dma_max_seg_size;
1635                         } else {
1636                                 cur_page++;
1637                                 size_left = page_size;
1638                         }
1639                 }
1640         } else {
1641                 /* Merge pages and put them into the scatterlist */
1642                 for_each_sgtable_dma_sg(sgt, sg, i) {
1643                         chunk_size = page_size;
1644                         for (j = cur_page + 1 ; j < npages ; j++) {
1645                                 if (pages[j - 1] + page_size != pages[j] ||
1646                                                 chunk_size + page_size > dma_max_seg_size)
1647                                         break;
1648
1649                                 chunk_size += page_size;
1650                         }
1651
1652                         bar_address = hdev->dram_pci_bar_start +
1653                                         (pages[cur_page] - prop->dram_base_address);
1654
1655                         rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
1656                         if (rc)
1657                                 goto error_unmap;
1658
1659                         cur_page = j;
1660                 }
1661         }
1662
1663         /* Because we are not going to include a CPU list we want to have some
1664          * chance that other users will detect this by setting the orig_nents
1665          * to 0 and using only nents (length of DMA list) when going over the
1666          * sgl
1667          */
1668         sgt->orig_nents = 0;
1669
1670         return sgt;
1671
1672 error_unmap:
1673         for_each_sgtable_dma_sg(sgt, sg, i) {
1674                 if (!sg_dma_len(sg))
1675                         continue;
1676
1677                 dma_unmap_resource(dev, sg_dma_address(sg),
1678                                         sg_dma_len(sg), dir,
1679                                         DMA_ATTR_SKIP_CPU_SYNC);
1680         }
1681
1682         sg_free_table(sgt);
1683
1684 error_free:
1685         kfree(sgt);
1686         return ERR_PTR(rc);
1687 }
1688
1689 static int hl_dmabuf_attach(struct dma_buf *dmabuf,
1690                                 struct dma_buf_attachment *attachment)
1691 {
1692         struct hl_dmabuf_priv *hl_dmabuf;
1693         struct hl_device *hdev;
1694         int rc;
1695
1696         hl_dmabuf = dmabuf->priv;
1697         hdev = hl_dmabuf->ctx->hdev;
1698
1699         rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true);
1700
1701         if (rc < 0)
1702                 attachment->peer2peer = false;
1703         return 0;
1704 }
1705
1706 static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,
1707                                         enum dma_data_direction dir)
1708 {
1709         struct dma_buf *dma_buf = attachment->dmabuf;
1710         struct hl_vm_phys_pg_pack *phys_pg_pack;
1711         struct hl_dmabuf_priv *hl_dmabuf;
1712         struct hl_device *hdev;
1713         struct sg_table *sgt;
1714
1715         hl_dmabuf = dma_buf->priv;
1716         hdev = hl_dmabuf->ctx->hdev;
1717         phys_pg_pack = hl_dmabuf->phys_pg_pack;
1718
1719         if (!attachment->peer2peer) {
1720                 dev_dbg(hdev->dev, "Failed to map dmabuf because p2p is disabled\n");
1721                 return ERR_PTR(-EPERM);
1722         }
1723
1724         if (phys_pg_pack)
1725                 sgt = alloc_sgt_from_device_pages(hdev,
1726                                                 phys_pg_pack->pages,
1727                                                 phys_pg_pack->npages,
1728                                                 phys_pg_pack->page_size,
1729                                                 attachment->dev,
1730                                                 dir);
1731         else
1732                 sgt = alloc_sgt_from_device_pages(hdev,
1733                                                 &hl_dmabuf->device_address,
1734                                                 1,
1735                                                 hl_dmabuf->dmabuf->size,
1736                                                 attachment->dev,
1737                                                 dir);
1738
1739         if (IS_ERR(sgt))
1740                 dev_err(hdev->dev, "failed (%ld) to initialize sgt for dmabuf\n", PTR_ERR(sgt));
1741
1742         return sgt;
1743 }
1744
1745 static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
1746                                   struct sg_table *sgt,
1747                                   enum dma_data_direction dir)
1748 {
1749         struct scatterlist *sg;
1750         int i;
1751
1752         /* The memory behind the dma-buf has *always* resided on the device itself, i.e. it lives
1753          * only in the 'device' domain (after all, it maps a PCI bar address which points to the
1754          * device memory).
1755          *
1756          * Therefore, it was never in the 'CPU' domain and hence, there is no need to perform
1757          * a sync of the memory to the CPU's cache, as it never resided inside that cache.
1758          */
1759         for_each_sgtable_dma_sg(sgt, sg, i)
1760                 dma_unmap_resource(attachment->dev, sg_dma_address(sg),
1761                                         sg_dma_len(sg), dir,
1762                                         DMA_ATTR_SKIP_CPU_SYNC);
1763
1764         /* Need to restore orig_nents because sg_free_table use that field */
1765         sgt->orig_nents = sgt->nents;
1766         sg_free_table(sgt);
1767         kfree(sgt);
1768 }
1769
1770 static void hl_release_dmabuf(struct dma_buf *dmabuf)
1771 {
1772         struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
1773         struct hl_ctx *ctx = hl_dmabuf->ctx;
1774         struct hl_device *hdev = ctx->hdev;
1775         struct hl_vm *vm = &hdev->vm;
1776
1777         if (hl_dmabuf->phys_pg_pack) {
1778                 spin_lock(&vm->idr_lock);
1779                 hl_dmabuf->phys_pg_pack->exporting_cnt--;
1780                 spin_unlock(&vm->idr_lock);
1781         }
1782
1783         hl_ctx_put(hl_dmabuf->ctx);
1784
1785         kfree(hl_dmabuf);
1786 }
1787
1788 static const struct dma_buf_ops habanalabs_dmabuf_ops = {
1789         .attach = hl_dmabuf_attach,
1790         .map_dma_buf = hl_map_dmabuf,
1791         .unmap_dma_buf = hl_unmap_dmabuf,
1792         .release = hl_release_dmabuf,
1793 };
1794
1795 static int export_dmabuf_common(struct hl_ctx *ctx,
1796                                 struct hl_dmabuf_priv *hl_dmabuf,
1797                                 u64 total_size, int flags, int *dmabuf_fd)
1798 {
1799         DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
1800         struct hl_device *hdev = ctx->hdev;
1801         int rc, fd;
1802
1803         exp_info.ops = &habanalabs_dmabuf_ops;
1804         exp_info.size = total_size;
1805         exp_info.flags = flags;
1806         exp_info.priv = hl_dmabuf;
1807
1808         hl_dmabuf->dmabuf = dma_buf_export(&exp_info);
1809         if (IS_ERR(hl_dmabuf->dmabuf)) {
1810                 dev_err(hdev->dev, "failed to export dma-buf\n");
1811                 return PTR_ERR(hl_dmabuf->dmabuf);
1812         }
1813
1814         fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
1815         if (fd < 0) {
1816                 dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf\n");
1817                 rc = fd;
1818                 goto err_dma_buf_put;
1819         }
1820
1821         hl_dmabuf->ctx = ctx;
1822         hl_ctx_get(hdev, hl_dmabuf->ctx);
1823
1824         *dmabuf_fd = fd;
1825
1826         return 0;
1827
1828 err_dma_buf_put:
1829         dma_buf_put(hl_dmabuf->dmabuf);
1830         return rc;
1831 }
1832
1833 /**
1834  * export_dmabuf_from_addr() - export a dma-buf object for the given memory
1835  *                             address and size.
1836  * @ctx: pointer to the context structure.
1837  * @device_addr:  device memory physical address.
1838  * @size: size of device memory.
1839  * @flags: DMA-BUF file/FD flags.
1840  * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
1841  *
1842  * Create and export a dma-buf object for an existing memory allocation inside
1843  * the device memory, and return a FD which is associated with the dma-buf
1844  * object.
1845  *
1846  * Return: 0 on success, non-zero for failure.
1847  */
1848 static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 device_addr,
1849                                         u64 size, int flags, int *dmabuf_fd)
1850 {
1851         struct hl_dmabuf_priv *hl_dmabuf;
1852         struct hl_device *hdev = ctx->hdev;
1853         struct asic_fixed_properties *prop;
1854         u64 bar_address;
1855         int rc;
1856
1857         prop = &hdev->asic_prop;
1858
1859         if (!IS_ALIGNED(device_addr, PAGE_SIZE)) {
1860                 dev_dbg(hdev->dev,
1861                         "exported device memory address 0x%llx should be aligned to 0x%lx\n",
1862                         device_addr, PAGE_SIZE);
1863                 return -EINVAL;
1864         }
1865
1866         if (size < PAGE_SIZE) {
1867                 dev_dbg(hdev->dev,
1868                         "exported device memory size %llu should be equal to or greater than %lu\n",
1869                         size, PAGE_SIZE);
1870                 return -EINVAL;
1871         }
1872
1873         if (device_addr < prop->dram_user_base_address ||
1874                                 device_addr + size > prop->dram_end_address ||
1875                                 device_addr + size < device_addr) {
1876                 dev_dbg(hdev->dev,
1877                         "DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n",
1878                         device_addr, size);
1879                 return -EINVAL;
1880         }
1881
1882         bar_address = hdev->dram_pci_bar_start +
1883                         (device_addr - prop->dram_base_address);
1884
1885         if (bar_address + size >
1886                         hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
1887                         bar_address + size < bar_address) {
1888                 dev_dbg(hdev->dev,
1889                         "DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n",
1890                         device_addr, size);
1891                 return -EINVAL;
1892         }
1893
1894         hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
1895         if (!hl_dmabuf)
1896                 return -ENOMEM;
1897
1898         hl_dmabuf->device_address = device_addr;
1899
1900         rc = export_dmabuf_common(ctx, hl_dmabuf, size, flags, dmabuf_fd);
1901         if (rc)
1902                 goto err_free_dmabuf_wrapper;
1903
1904         return 0;
1905
1906 err_free_dmabuf_wrapper:
1907         kfree(hl_dmabuf);
1908         return rc;
1909 }
1910
1911 /**
1912  * export_dmabuf_from_handle() - export a dma-buf object for the given memory
1913  *                               handle.
1914  * @ctx: pointer to the context structure.
1915  * @handle: device memory allocation handle.
1916  * @flags: DMA-BUF file/FD flags.
1917  * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
1918  *
1919  * Create and export a dma-buf object for an existing memory allocation inside
1920  * the device memory, and return a FD which is associated with the dma-buf
1921  * object.
1922  *
1923  * Return: 0 on success, non-zero for failure.
1924  */
1925 static int export_dmabuf_from_handle(struct hl_ctx *ctx, u64 handle, int flags,
1926                                         int *dmabuf_fd)
1927 {
1928         struct hl_vm_phys_pg_pack *phys_pg_pack;
1929         struct hl_dmabuf_priv *hl_dmabuf;
1930         struct hl_device *hdev = ctx->hdev;
1931         struct asic_fixed_properties *prop;
1932         struct hl_vm *vm = &hdev->vm;
1933         u64 bar_address;
1934         int rc, i;
1935
1936         prop = &hdev->asic_prop;
1937
1938         if (upper_32_bits(handle)) {
1939                 dev_dbg(hdev->dev, "no match for handle 0x%llx\n", handle);
1940                 return -EINVAL;
1941         }
1942
1943         spin_lock(&vm->idr_lock);
1944
1945         phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, (u32) handle);
1946         if (!phys_pg_pack) {
1947                 spin_unlock(&vm->idr_lock);
1948                 dev_dbg(hdev->dev, "no match for handle 0x%x\n", (u32) handle);
1949                 return -EINVAL;
1950         }
1951
1952         /* increment now to avoid freeing device memory while exporting */
1953         phys_pg_pack->exporting_cnt++;
1954
1955         spin_unlock(&vm->idr_lock);
1956
1957         if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) {
1958                 dev_dbg(hdev->dev, "handle 0x%llx does not represent DRAM memory\n", handle);
1959                 rc = -EINVAL;
1960                 goto err_dec_exporting_cnt;
1961         }
1962
1963         for (i = 0 ; i < phys_pg_pack->npages ; i++) {
1964
1965                 bar_address = hdev->dram_pci_bar_start +
1966                                                 (phys_pg_pack->pages[i] -
1967                                                 prop->dram_base_address);
1968
1969                 if (bar_address + phys_pg_pack->page_size >
1970                         hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
1971                         bar_address + phys_pg_pack->page_size < bar_address) {
1972
1973                         dev_dbg(hdev->dev,
1974                                 "DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n",
1975                                 phys_pg_pack->pages[i],
1976                                 phys_pg_pack->page_size);
1977
1978                         rc = -EINVAL;
1979                         goto err_dec_exporting_cnt;
1980                 }
1981         }
1982
1983         hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
1984         if (!hl_dmabuf) {
1985                 rc = -ENOMEM;
1986                 goto err_dec_exporting_cnt;
1987         }
1988
1989         hl_dmabuf->phys_pg_pack = phys_pg_pack;
1990
1991         rc = export_dmabuf_common(ctx, hl_dmabuf, phys_pg_pack->total_size,
1992                                 flags, dmabuf_fd);
1993         if (rc)
1994                 goto err_free_dmabuf_wrapper;
1995
1996         return 0;
1997
1998 err_free_dmabuf_wrapper:
1999         kfree(hl_dmabuf);
2000
2001 err_dec_exporting_cnt:
2002         spin_lock(&vm->idr_lock);
2003         phys_pg_pack->exporting_cnt--;
2004         spin_unlock(&vm->idr_lock);
2005
2006         return rc;
2007 }
2008
2009 static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
2010 {
2011         struct hl_device *hdev = hpriv->hdev;
2012         u64 block_handle, device_addr = 0;
2013         struct hl_ctx *ctx = hpriv->ctx;
2014         u32 handle = 0, block_size;
2015         int rc;
2016
2017         switch (args->in.op) {
2018         case HL_MEM_OP_ALLOC:
2019                 if (args->in.alloc.mem_size == 0) {
2020                         dev_err(hdev->dev, "alloc size must be larger than 0\n");
2021                         rc = -EINVAL;
2022                         goto out;
2023                 }
2024
2025                 /* Force contiguous as there are no real MMU
2026                  * translations to overcome physical memory gaps
2027                  */
2028                 args->in.flags |= HL_MEM_CONTIGUOUS;
2029                 rc = alloc_device_memory(ctx, &args->in, &handle);
2030
2031                 memset(args, 0, sizeof(*args));
2032                 args->out.handle = (__u64) handle;
2033                 break;
2034
2035         case HL_MEM_OP_FREE:
2036                 rc = free_device_memory(ctx, &args->in);
2037                 break;
2038
2039         case HL_MEM_OP_MAP:
2040                 if (args->in.flags & HL_MEM_USERPTR) {
2041                         dev_err(hdev->dev, "Failed to map host memory when MMU is disabled\n");
2042                         rc = -EPERM;
2043                 } else {
2044                         rc = get_paddr_from_handle(ctx, &args->in, &device_addr);
2045                         memset(args, 0, sizeof(*args));
2046                         args->out.device_virt_addr = device_addr;
2047                 }
2048
2049                 break;
2050
2051         case HL_MEM_OP_UNMAP:
2052                 rc = 0;
2053                 break;
2054
2055         case HL_MEM_OP_MAP_BLOCK:
2056                 rc = map_block(hdev, args->in.map_block.block_addr, &block_handle, &block_size);
2057                 args->out.block_handle = block_handle;
2058                 args->out.block_size = block_size;
2059                 break;
2060
2061         case HL_MEM_OP_EXPORT_DMABUF_FD:
2062                 dev_err(hdev->dev, "Failed to export dma-buf object when MMU is disabled\n");
2063                 rc = -EPERM;
2064                 break;
2065
2066         case HL_MEM_OP_TS_ALLOC:
2067                 rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle);
2068                 break;
2069         default:
2070                 dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
2071                 rc = -EINVAL;
2072                 break;
2073         }
2074
2075 out:
2076         return rc;
2077 }
2078
2079 static void ts_buff_release(struct kref *ref)
2080 {
2081         struct hl_ts_buff *buff;
2082
2083         buff = container_of(ref, struct hl_ts_buff, refcount);
2084
2085         vfree(buff->kernel_buff_address);
2086         vfree(buff->user_buff_address);
2087         kfree(buff);
2088 }
2089
2090 struct hl_ts_buff *hl_ts_get(struct hl_device *hdev, struct hl_ts_mgr *mgr,
2091                                         u32 handle)
2092 {
2093         struct hl_ts_buff *buff;
2094
2095         spin_lock(&mgr->ts_lock);
2096         buff = idr_find(&mgr->ts_handles, handle);
2097         if (!buff) {
2098                 spin_unlock(&mgr->ts_lock);
2099                 dev_warn(hdev->dev,
2100                         "TS buff get failed, no match to handle 0x%x\n", handle);
2101                 return NULL;
2102         }
2103         kref_get(&buff->refcount);
2104         spin_unlock(&mgr->ts_lock);
2105
2106         return buff;
2107 }
2108
2109 void hl_ts_put(struct hl_ts_buff *buff)
2110 {
2111         kref_put(&buff->refcount, ts_buff_release);
2112 }
2113
2114 static void buff_vm_close(struct vm_area_struct *vma)
2115 {
2116         struct hl_ts_buff *buff = (struct hl_ts_buff *) vma->vm_private_data;
2117         long new_mmap_size;
2118
2119         new_mmap_size = buff->mmap_size - (vma->vm_end - vma->vm_start);
2120
2121         if (new_mmap_size > 0) {
2122                 buff->mmap_size = new_mmap_size;
2123                 return;
2124         }
2125
2126         atomic_set(&buff->mmap, 0);
2127         hl_ts_put(buff);
2128         vma->vm_private_data = NULL;
2129 }
2130
2131 static const struct vm_operations_struct ts_buff_vm_ops = {
2132         .close = buff_vm_close
2133 };
2134
2135 int hl_ts_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
2136 {
2137         struct hl_device *hdev = hpriv->hdev;
2138         struct hl_ts_buff *buff;
2139         u32 handle, user_buff_size;
2140         int rc;
2141
2142         /* We use the page offset to hold the idr and thus we need to clear
2143          * it before doing the mmap itself
2144          */
2145         handle = vma->vm_pgoff;
2146         vma->vm_pgoff = 0;
2147
2148         buff = hl_ts_get(hdev, &hpriv->ts_mem_mgr, handle);
2149         if (!buff) {
2150                 dev_err(hdev->dev,
2151                         "TS buff mmap failed, no match to handle 0x%x\n", handle);
2152                 return -EINVAL;
2153         }
2154
2155         /* Validation check */
2156         user_buff_size = vma->vm_end - vma->vm_start;
2157         if (user_buff_size != ALIGN(buff->user_buff_size, PAGE_SIZE)) {
2158                 dev_err(hdev->dev,
2159                         "TS buff mmap failed, mmap size 0x%x != 0x%x buff size\n",
2160                         user_buff_size, ALIGN(buff->user_buff_size, PAGE_SIZE));
2161                 rc = -EINVAL;
2162                 goto put_buff;
2163         }
2164
2165 #ifdef _HAS_TYPE_ARG_IN_ACCESS_OK
2166         if (!access_ok(VERIFY_WRITE,
2167                 (void __user *) (uintptr_t) vma->vm_start, user_buff_size)) {
2168 #else
2169         if (!access_ok((void __user *) (uintptr_t) vma->vm_start,
2170                                                 user_buff_size)) {
2171 #endif
2172                 dev_err(hdev->dev,
2173                         "user pointer is invalid - 0x%lx\n",
2174                         vma->vm_start);
2175
2176                 rc = -EINVAL;
2177                 goto put_buff;
2178         }
2179
2180         if (atomic_cmpxchg(&buff->mmap, 0, 1)) {
2181                 dev_err(hdev->dev, "TS buff memory mmap failed, already mmaped to user\n");
2182                 rc = -EINVAL;
2183                 goto put_buff;
2184         }
2185
2186         vma->vm_ops = &ts_buff_vm_ops;
2187         vma->vm_private_data = buff;
2188         vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE;
2189         rc = remap_vmalloc_range(vma, buff->user_buff_address, 0);
2190         if (rc) {
2191                 atomic_set(&buff->mmap, 0);
2192                 goto put_buff;
2193         }
2194
2195         buff->mmap_size = buff->user_buff_size;
2196         vma->vm_pgoff = handle;
2197
2198         return 0;
2199
2200 put_buff:
2201         hl_ts_put(buff);
2202         return rc;
2203 }
2204
2205 void hl_ts_mgr_init(struct hl_ts_mgr *mgr)
2206 {
2207         spin_lock_init(&mgr->ts_lock);
2208         idr_init(&mgr->ts_handles);
2209 }
2210
2211 void hl_ts_mgr_fini(struct hl_device *hdev, struct hl_ts_mgr *mgr)
2212 {
2213         struct hl_ts_buff *buff;
2214         struct idr *idp;
2215         u32 id;
2216
2217         idp = &mgr->ts_handles;
2218
2219         idr_for_each_entry(idp, buff, id) {
2220                 if (kref_put(&buff->refcount, ts_buff_release) != 1)
2221                         dev_err(hdev->dev, "TS buff handle %d for CTX is still alive\n",
2222                                                         id);
2223         }
2224
2225         idr_destroy(&mgr->ts_handles);
2226 }
2227
2228 static struct hl_ts_buff *hl_ts_alloc_buff(struct hl_device *hdev, u32 num_elements)
2229 {
2230         struct hl_ts_buff *ts_buff = NULL;
2231         u32 size;
2232         void *p;
2233
2234         ts_buff = kzalloc(sizeof(*ts_buff), GFP_KERNEL);
2235         if (!ts_buff)
2236                 return NULL;
2237
2238         /* Allocate the user buffer */
2239         size = num_elements * sizeof(u64);
2240         p = vmalloc_user(size);
2241         if (!p)
2242                 goto free_mem;
2243
2244         ts_buff->user_buff_address = p;
2245         ts_buff->user_buff_size = size;
2246
2247         /* Allocate the internal kernel buffer */
2248         size = num_elements * sizeof(struct hl_user_pending_interrupt);
2249         p = vmalloc(size);
2250         if (!p)
2251                 goto free_user_buff;
2252
2253         ts_buff->kernel_buff_address = p;
2254         ts_buff->kernel_buff_size = size;
2255
2256         return ts_buff;
2257
2258 free_user_buff:
2259         vfree(ts_buff->user_buff_address);
2260 free_mem:
2261         kfree(ts_buff);
2262         return NULL;
2263 }
2264
2265 /**
2266  * allocate_timestamps_buffers() - allocate timestamps buffers
2267  * This function will allocate ts buffer that will later on be mapped to the user
2268  * in order to be able to read the timestamp.
2269  * in additon it'll allocate an extra buffer for registration management.
2270  * since we cannot fail during registration for out-of-memory situation, so
2271  * we'll prepare a pool which will be used as user interrupt nodes and instead
2272  * of dynamically allocating nodes while registration we'll pick the node from
2273  * this pool. in addtion it'll add node to the mapping hash which will be used
2274  * to map user ts buffer to the internal kernel ts buffer.
2275  * @hpriv: pointer to the private data of the fd
2276  * @args: ioctl input
2277  * @handle: user timestamp buffer handle as an output
2278  */
2279 static int allocate_timestamps_buffers(struct hl_fpriv *hpriv, struct hl_mem_in *args, u64 *handle)
2280 {
2281         struct hl_ts_mgr *ts_mgr = &hpriv->ts_mem_mgr;
2282         struct hl_device *hdev = hpriv->hdev;
2283         struct hl_ts_buff *ts_buff;
2284         int rc = 0;
2285
2286         if (args->num_of_elements > TS_MAX_ELEMENTS_NUM) {
2287                 dev_err(hdev->dev, "Num of elements exceeds Max allowed number (0x%x > 0x%x)\n",
2288                                 args->num_of_elements, TS_MAX_ELEMENTS_NUM);
2289                 return -EINVAL;
2290         }
2291
2292         /* Allocate ts buffer object
2293          * This object will contain two buffers one that will be mapped to the user
2294          * and another internal buffer for the driver use only, which won't be mapped
2295          * to the user.
2296          */
2297         ts_buff = hl_ts_alloc_buff(hdev, args->num_of_elements);
2298         if (!ts_buff) {
2299                 rc = -ENOMEM;
2300                 goto out_err;
2301         }
2302
2303         spin_lock(&ts_mgr->ts_lock);
2304         rc = idr_alloc(&ts_mgr->ts_handles, ts_buff, 1, 0, GFP_ATOMIC);
2305         spin_unlock(&ts_mgr->ts_lock);
2306         if (rc < 0) {
2307                 dev_err(hdev->dev, "Failed to allocate IDR for a new ts buffer\n");
2308                 goto release_ts_buff;
2309         }
2310
2311         ts_buff->id = rc;
2312         ts_buff->hdev = hdev;
2313
2314         kref_init(&ts_buff->refcount);
2315
2316         /* idr is 32-bit so we can safely OR it with a mask that is above 32 bit */
2317         *handle = (u64) ts_buff->id | HL_MMAP_TYPE_TS_BUFF;
2318         *handle <<= PAGE_SHIFT;
2319
2320         dev_dbg(hdev->dev, "Created ts buff object handle(%u)\n", ts_buff->id);
2321
2322         return 0;
2323
2324 release_ts_buff:
2325         kref_put(&ts_buff->refcount, ts_buff_release);
2326 out_err:
2327         *handle = 0;
2328         return rc;
2329 }
2330
2331 int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
2332 {
2333         enum hl_device_status status;
2334         union hl_mem_args *args = data;
2335         struct hl_device *hdev = hpriv->hdev;
2336         struct hl_ctx *ctx = hpriv->ctx;
2337         u64 block_handle, device_addr = 0;
2338         u32 handle = 0, block_size;
2339         int rc, dmabuf_fd = -EBADF;
2340
2341         if (!hl_device_operational(hdev, &status)) {
2342                 dev_warn_ratelimited(hdev->dev,
2343                         "Device is %s. Can't execute MEMORY IOCTL\n",
2344                         hdev->status[status]);
2345                 return -EBUSY;
2346         }
2347
2348         if (!hdev->mmu_enable)
2349                 return mem_ioctl_no_mmu(hpriv, args);
2350
2351         switch (args->in.op) {
2352         case HL_MEM_OP_ALLOC:
2353                 if (args->in.alloc.mem_size == 0) {
2354                         dev_err(hdev->dev,
2355                                 "alloc size must be larger than 0\n");
2356                         rc = -EINVAL;
2357                         goto out;
2358                 }
2359
2360                 /* If DRAM does not support virtual memory the driver won't
2361                  * handle the allocation/freeing of that memory. However, for
2362                  * system administration/monitoring purposes, the driver will
2363                  * keep track of the amount of DRAM memory that is allocated
2364                  * and freed by the user. Because this code totally relies on
2365                  * the user's input, the driver can't ensure the validity
2366                  * of this accounting.
2367                  */
2368                 if (!hdev->asic_prop.dram_supports_virtual_memory) {
2369                         atomic64_add(args->in.alloc.mem_size,
2370                                         &ctx->dram_phys_mem);
2371                         atomic64_add(args->in.alloc.mem_size,
2372                                         &hdev->dram_used_mem);
2373
2374                         dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
2375                         rc = 0;
2376
2377                         memset(args, 0, sizeof(*args));
2378                         args->out.handle = 0;
2379                         goto out;
2380                 }
2381
2382                 rc = alloc_device_memory(ctx, &args->in, &handle);
2383
2384                 memset(args, 0, sizeof(*args));
2385                 args->out.handle = (__u64) handle;
2386                 break;
2387
2388         case HL_MEM_OP_FREE:
2389                 /* If DRAM does not support virtual memory the driver won't
2390                  * handle the allocation/freeing of that memory. However, for
2391                  * system administration/monitoring purposes, the driver will
2392                  * keep track of the amount of DRAM memory that is allocated
2393                  * and freed by the user. Because this code totally relies on
2394                  * the user's input, the driver can't ensure the validity
2395                  * of this accounting.
2396                  */
2397                 if (!hdev->asic_prop.dram_supports_virtual_memory) {
2398                         atomic64_sub(args->in.alloc.mem_size,
2399                                         &ctx->dram_phys_mem);
2400                         atomic64_sub(args->in.alloc.mem_size,
2401                                         &hdev->dram_used_mem);
2402
2403                         dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
2404                         rc = 0;
2405
2406                         goto out;
2407                 }
2408
2409                 rc = free_device_memory(ctx, &args->in);
2410                 break;
2411
2412         case HL_MEM_OP_MAP:
2413                 rc = map_device_va(ctx, &args->in, &device_addr);
2414
2415                 memset(args, 0, sizeof(*args));
2416                 args->out.device_virt_addr = device_addr;
2417                 break;
2418
2419         case HL_MEM_OP_UNMAP:
2420                 rc = unmap_device_va(ctx, &args->in, false);
2421                 break;
2422
2423         case HL_MEM_OP_MAP_BLOCK:
2424                 rc = map_block(hdev, args->in.map_block.block_addr,
2425                                 &block_handle, &block_size);
2426                 args->out.block_handle = block_handle;
2427                 args->out.block_size = block_size;
2428                 break;
2429
2430         case HL_MEM_OP_EXPORT_DMABUF_FD:
2431                 if (hdev->asic_prop.dram_supports_virtual_memory)
2432                         rc = export_dmabuf_from_handle(ctx,
2433                                         args->in.export_dmabuf_fd.handle,
2434                                         args->in.flags,
2435                                         &dmabuf_fd);
2436                 else
2437                         rc = export_dmabuf_from_addr(ctx,
2438                                         args->in.export_dmabuf_fd.handle,
2439                                         args->in.export_dmabuf_fd.mem_size,
2440                                         args->in.flags,
2441                                         &dmabuf_fd);
2442                 memset(args, 0, sizeof(*args));
2443                 args->out.fd = dmabuf_fd;
2444                 break;
2445
2446         case HL_MEM_OP_TS_ALLOC:
2447                 rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle);
2448                 break;
2449         default:
2450                 dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
2451                 rc = -EINVAL;
2452                 break;
2453         }
2454
2455 out:
2456         return rc;
2457 }
2458
2459 static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
2460                                 u32 npages, u64 start, u32 offset,
2461                                 struct hl_userptr *userptr)
2462 {
2463         int rc;
2464
2465         if (!access_ok((void __user *) (uintptr_t) addr, size)) {
2466                 dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
2467                 return -EFAULT;
2468         }
2469
2470         userptr->pages = kvmalloc_array(npages, sizeof(*userptr->pages),
2471                                         GFP_KERNEL);
2472         if (!userptr->pages)
2473                 return -ENOMEM;
2474
2475         rc = pin_user_pages_fast(start, npages,
2476                                  FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM,
2477                                  userptr->pages);
2478
2479         if (rc != npages) {
2480                 dev_err(hdev->dev,
2481                         "Failed (%d) to pin host memory with user ptr 0x%llx, size 0x%llx, npages %d\n",
2482                         rc, addr, size, npages);
2483                 if (rc < 0)
2484                         goto destroy_pages;
2485                 npages = rc;
2486                 rc = -EFAULT;
2487                 goto put_pages;
2488         }
2489         userptr->npages = npages;
2490
2491         rc = sg_alloc_table_from_pages(userptr->sgt,
2492                                        userptr->pages,
2493                                        npages, offset, size, GFP_KERNEL);
2494         if (rc < 0) {
2495                 dev_err(hdev->dev, "failed to create SG table from pages\n");
2496                 goto put_pages;
2497         }
2498
2499         return 0;
2500
2501 put_pages:
2502         unpin_user_pages(userptr->pages, npages);
2503 destroy_pages:
2504         kvfree(userptr->pages);
2505         return rc;
2506 }
2507
2508 /**
2509  * hl_pin_host_memory() - pins a chunk of host memory.
2510  * @hdev: pointer to the habanalabs device structure.
2511  * @addr: the host virtual address of the memory area.
2512  * @size: the size of the memory area.
2513  * @userptr: pointer to hl_userptr structure.
2514  *
2515  * This function does the following:
2516  * - Pins the physical pages.
2517  * - Create an SG list from those pages.
2518  */
2519 int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
2520                                         struct hl_userptr *userptr)
2521 {
2522         u64 start, end;
2523         u32 npages, offset;
2524         int rc;
2525
2526         if (!size) {
2527                 dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
2528                 return -EINVAL;
2529         }
2530
2531         /*
2532          * If the combination of the address and size requested for this memory
2533          * region causes an integer overflow, return error.
2534          */
2535         if (((addr + size) < addr) ||
2536                         PAGE_ALIGN(addr + size) < (addr + size)) {
2537                 dev_err(hdev->dev,
2538                         "user pointer 0x%llx + %llu causes integer overflow\n",
2539                         addr, size);
2540                 return -EINVAL;
2541         }
2542
2543         userptr->pid = current->pid;
2544         userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL);
2545         if (!userptr->sgt)
2546                 return -ENOMEM;
2547
2548         start = addr & PAGE_MASK;
2549         offset = addr & ~PAGE_MASK;
2550         end = PAGE_ALIGN(addr + size);
2551         npages = (end - start) >> PAGE_SHIFT;
2552
2553         userptr->size = size;
2554         userptr->addr = addr;
2555         userptr->dma_mapped = false;
2556         INIT_LIST_HEAD(&userptr->job_node);
2557
2558         rc = get_user_memory(hdev, addr, size, npages, start, offset,
2559                                 userptr);
2560         if (rc) {
2561                 dev_err(hdev->dev,
2562                         "failed to get user memory for address 0x%llx\n",
2563                         addr);
2564                 goto free_sgt;
2565         }
2566
2567         hl_debugfs_add_userptr(hdev, userptr);
2568
2569         return 0;
2570
2571 free_sgt:
2572         kfree(userptr->sgt);
2573         return rc;
2574 }
2575
2576 /*
2577  * hl_unpin_host_memory - unpins a chunk of host memory.
2578  * @hdev: pointer to the habanalabs device structure
2579  * @userptr: pointer to hl_userptr structure
2580  *
2581  * This function does the following:
2582  * - Unpins the physical pages related to the host memory
2583  * - Free the SG list
2584  */
2585 void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
2586 {
2587         hl_debugfs_remove_userptr(hdev, userptr);
2588
2589         if (userptr->dma_mapped)
2590                 hdev->asic_funcs->hl_dma_unmap_sg(hdev, userptr->sgt->sgl,
2591                                                         userptr->sgt->nents,
2592                                                         userptr->dir);
2593
2594         unpin_user_pages_dirty_lock(userptr->pages, userptr->npages, true);
2595         kvfree(userptr->pages);
2596
2597         list_del(&userptr->job_node);
2598
2599         sg_free_table(userptr->sgt);
2600         kfree(userptr->sgt);
2601 }
2602
2603 /**
2604  * hl_userptr_delete_list() - clear userptr list.
2605  * @hdev: pointer to the habanalabs device structure.
2606  * @userptr_list: pointer to the list to clear.
2607  *
2608  * This function does the following:
2609  * - Iterates over the list and unpins the host memory and frees the userptr
2610  *   structure.
2611  */
2612 void hl_userptr_delete_list(struct hl_device *hdev,
2613                                 struct list_head *userptr_list)
2614 {
2615         struct hl_userptr *userptr, *tmp;
2616
2617         list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
2618                 hl_unpin_host_memory(hdev, userptr);
2619                 kfree(userptr);
2620         }
2621
2622         INIT_LIST_HEAD(userptr_list);
2623 }
2624
2625 /**
2626  * hl_userptr_is_pinned() - returns whether the given userptr is pinned.
2627  * @hdev: pointer to the habanalabs device structure.
2628  * @addr: user address to check.
2629  * @size: user block size to check.
2630  * @userptr_list: pointer to the list to clear.
2631  * @userptr: pointer to userptr to check.
2632  *
2633  * This function does the following:
2634  * - Iterates over the list and checks if the given userptr is in it, means is
2635  *   pinned. If so, returns true, otherwise returns false.
2636  */
2637 bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
2638                                 u32 size, struct list_head *userptr_list,
2639                                 struct hl_userptr **userptr)
2640 {
2641         list_for_each_entry((*userptr), userptr_list, job_node) {
2642                 if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
2643                         return true;
2644         }
2645
2646         return false;
2647 }
2648
2649 /**
2650  * va_range_init() - initialize virtual addresses range.
2651  * @hdev: pointer to the habanalabs device structure.
2652  * @va_ranges: pointer to va_ranges array.
2653  * @start: range start address.
2654  * @end: range end address.
2655  * @page_size: page size for this va_range.
2656  *
2657  * This function does the following:
2658  * - Initializes the virtual addresses list of the given range with the given
2659  *   addresses.
2660  */
2661 static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range,
2662                                 u64 start, u64 end, u32 page_size)
2663 {
2664         int rc;
2665
2666         INIT_LIST_HEAD(&va_range->list);
2667
2668         /*
2669          * PAGE_SIZE alignment
2670          * it is the callers responsibility to align the addresses if the
2671          * page size is not a power of 2
2672          */
2673
2674         if (is_power_of_2(page_size)) {
2675                 if (start & (PAGE_SIZE - 1)) {
2676                         start &= PAGE_MASK;
2677                         start += PAGE_SIZE;
2678                 }
2679
2680                 /*
2681                  * The end of the range is inclusive, hence we need to align it
2682                  * to the end of the last full page in the range. For example if
2683                  * end = 0x3ff5 with page size 0x1000, we need to align it to
2684                  * 0x2fff. The remainig 0xff5 bytes do not form a full page.
2685                  */
2686                 if ((end + 1) & (PAGE_SIZE - 1))
2687                         end = ((end + 1) & PAGE_MASK) - 1;
2688         }
2689
2690         if (start >= end) {
2691                 dev_err(hdev->dev, "too small vm range for va list\n");
2692                 return -EFAULT;
2693         }
2694
2695         rc = add_va_block(hdev, va_range, start, end);
2696
2697         if (rc) {
2698                 dev_err(hdev->dev, "Failed to init host va list\n");
2699                 return rc;
2700         }
2701
2702         va_range->start_addr = start;
2703         va_range->end_addr = end;
2704         va_range->page_size = page_size;
2705
2706         return 0;
2707 }
2708
2709 /**
2710  * va_range_fini() - clear a virtual addresses range.
2711  * @hdev: pointer to the habanalabs structure.
2712  * @va_range: pointer to virtual addresses range.
2713  *
2714  * This function does the following:
2715  * - Frees the virtual addresses block list and its lock.
2716  */
2717 static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range)
2718 {
2719         mutex_lock(&va_range->lock);
2720         clear_va_list_locked(hdev, &va_range->list);
2721         mutex_unlock(&va_range->lock);
2722
2723         mutex_destroy(&va_range->lock);
2724         kfree(va_range);
2725 }
2726
2727 /**
2728  * vm_ctx_init_with_ranges() - initialize virtual memory for context.
2729  * @ctx: pointer to the habanalabs context structure.
2730  * @host_range_start: host virtual addresses range start.
2731  * @host_range_end: host virtual addresses range end.
2732  * @host_page_size: host page size.
2733  * @host_huge_range_start: host virtual addresses range start for memory
2734  *                         allocated with huge pages.
2735  * @host_huge_range_end: host virtual addresses range end for memory allocated
2736  *                        with huge pages.
2737  * @host_huge_page_size: host huge page size.
2738  * @dram_range_start: dram virtual addresses range start.
2739  * @dram_range_end: dram virtual addresses range end.
2740  * @dram_page_size: dram page size.
2741  *
2742  * This function initializes the following:
2743  * - MMU for context.
2744  * - Virtual address to area descriptor hashtable.
2745  * - Virtual block list of available virtual memory.
2746  */
2747 static int vm_ctx_init_with_ranges(struct hl_ctx *ctx,
2748                                         u64 host_range_start,
2749                                         u64 host_range_end,
2750                                         u32 host_page_size,
2751                                         u64 host_huge_range_start,
2752                                         u64 host_huge_range_end,
2753                                         u32 host_huge_page_size,
2754                                         u64 dram_range_start,
2755                                         u64 dram_range_end,
2756                                         u32 dram_page_size)
2757 {
2758         struct hl_device *hdev = ctx->hdev;
2759         int i, rc;
2760
2761         for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++) {
2762                 ctx->va_range[i] =
2763                         kzalloc(sizeof(struct hl_va_range), GFP_KERNEL);
2764                 if (!ctx->va_range[i]) {
2765                         rc = -ENOMEM;
2766                         goto free_va_range;
2767                 }
2768         }
2769
2770         rc = hl_mmu_ctx_init(ctx);
2771         if (rc) {
2772                 dev_err(hdev->dev, "failed to init context %d\n", ctx->asid);
2773                 goto free_va_range;
2774         }
2775
2776         mutex_init(&ctx->mem_hash_lock);
2777         hash_init(ctx->mem_hash);
2778
2779         mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2780
2781         rc = va_range_init(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST],
2782                         host_range_start, host_range_end, host_page_size);
2783         if (rc) {
2784                 dev_err(hdev->dev, "failed to init host vm range\n");
2785                 goto mmu_ctx_fini;
2786         }
2787
2788         if (hdev->pmmu_huge_range) {
2789                 mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2790
2791                 rc = va_range_init(hdev,
2792                         ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE],
2793                         host_huge_range_start, host_huge_range_end,
2794                         host_huge_page_size);
2795                 if (rc) {
2796                         dev_err(hdev->dev,
2797                                 "failed to init host huge vm range\n");
2798                         goto clear_host_va_range;
2799                 }
2800         } else {
2801                 kfree(ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
2802                 ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE] =
2803                                 ctx->va_range[HL_VA_RANGE_TYPE_HOST];
2804         }
2805
2806         mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
2807
2808         rc = va_range_init(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM],
2809                         dram_range_start, dram_range_end, dram_page_size);
2810         if (rc) {
2811                 dev_err(hdev->dev, "failed to init dram vm range\n");
2812                 goto clear_host_huge_va_range;
2813         }
2814
2815         hl_debugfs_add_ctx_mem_hash(hdev, ctx);
2816
2817         return 0;
2818
2819 clear_host_huge_va_range:
2820         mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
2821
2822         if (hdev->pmmu_huge_range) {
2823                 mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2824                 clear_va_list_locked(hdev,
2825                         &ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->list);
2826                 mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2827         }
2828 clear_host_va_range:
2829         if (hdev->pmmu_huge_range)
2830                 mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2831         mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2832         clear_va_list_locked(hdev, &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->list);
2833         mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2834 mmu_ctx_fini:
2835         mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2836         mutex_destroy(&ctx->mem_hash_lock);
2837         hl_mmu_ctx_fini(ctx);
2838 free_va_range:
2839         for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++)
2840                 kfree(ctx->va_range[i]);
2841
2842         return rc;
2843 }
2844
2845 int hl_vm_ctx_init(struct hl_ctx *ctx)
2846 {
2847         struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
2848         u64 host_range_start, host_range_end, host_huge_range_start,
2849                 host_huge_range_end, dram_range_start, dram_range_end;
2850         u32 host_page_size, host_huge_page_size, dram_page_size;
2851
2852         atomic64_set(&ctx->dram_phys_mem, 0);
2853
2854         /*
2855          * - If MMU is enabled, init the ranges as usual.
2856          * - If MMU is disabled, in case of host mapping, the returned address
2857          *   is the given one.
2858          *   In case of DRAM mapping, the returned address is the physical
2859          *   address of the memory related to the given handle.
2860          */
2861         if (!ctx->hdev->mmu_enable)
2862                 return 0;
2863
2864         dram_range_start = prop->dmmu.start_addr;
2865         dram_range_end = prop->dmmu.end_addr - 1;
2866         dram_page_size = prop->dram_page_size ?
2867                                 prop->dram_page_size : prop->dmmu.page_size;
2868         host_range_start = prop->pmmu.start_addr;
2869         host_range_end = prop->pmmu.end_addr - 1;
2870         host_page_size = prop->pmmu.page_size;
2871         host_huge_range_start = prop->pmmu_huge.start_addr;
2872         host_huge_range_end = prop->pmmu_huge.end_addr - 1;
2873         host_huge_page_size = prop->pmmu_huge.page_size;
2874
2875         return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
2876                         host_page_size, host_huge_range_start,
2877                         host_huge_range_end, host_huge_page_size,
2878                         dram_range_start, dram_range_end, dram_page_size);
2879 }
2880
2881 /**
2882  * hl_vm_ctx_fini() - virtual memory teardown of context.
2883  * @ctx: pointer to the habanalabs context structure.
2884  *
2885  * This function perform teardown the following:
2886  * - Virtual block list of available virtual memory.
2887  * - Virtual address to area descriptor hashtable.
2888  * - MMU for context.
2889  *
2890  * In addition this function does the following:
2891  * - Unmaps the existing hashtable nodes if the hashtable is not empty. The
2892  *   hashtable should be empty as no valid mappings should exist at this
2893  *   point.
2894  * - Frees any existing physical page list from the idr which relates to the
2895  *   current context asid.
2896  * - This function checks the virtual block list for correctness. At this point
2897  *   the list should contain one element which describes the whole virtual
2898  *   memory range of the context. Otherwise, a warning is printed.
2899  */
2900 void hl_vm_ctx_fini(struct hl_ctx *ctx)
2901 {
2902         struct hl_vm_phys_pg_pack *phys_pg_list, *tmp_phys_node;
2903         struct hl_device *hdev = ctx->hdev;
2904         struct hl_vm_hash_node *hnode;
2905         struct hl_vm *vm = &hdev->vm;
2906         struct hlist_node *tmp_node;
2907         struct list_head free_list;
2908         struct hl_mem_in args;
2909         int i;
2910
2911         if (!hdev->mmu_enable)
2912                 return;
2913
2914         hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
2915
2916         /*
2917          * Clearly something went wrong on hard reset so no point in printing
2918          * another side effect error
2919          */
2920         if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash))
2921                 dev_dbg(hdev->dev,
2922                         "user released device without removing its memory mappings\n");
2923
2924         hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
2925                 dev_dbg(hdev->dev,
2926                         "hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
2927                         hnode->vaddr, ctx->asid);
2928                 args.unmap.device_virt_addr = hnode->vaddr;
2929                 unmap_device_va(ctx, &args, true);
2930         }
2931
2932         mutex_lock(&ctx->mmu_lock);
2933
2934         /* invalidate the cache once after the unmapping loop */
2935         hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
2936         hl_mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK);
2937
2938         mutex_unlock(&ctx->mmu_lock);
2939
2940         INIT_LIST_HEAD(&free_list);
2941
2942         spin_lock(&vm->idr_lock);
2943         idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
2944                 if (phys_pg_list->asid == ctx->asid) {
2945                         dev_dbg(hdev->dev,
2946                                 "page list 0x%px of asid %d is still alive\n",
2947                                 phys_pg_list, ctx->asid);
2948
2949                         atomic64_sub(phys_pg_list->total_size, &hdev->dram_used_mem);
2950                         idr_remove(&vm->phys_pg_pack_handles, i);
2951                         list_add(&phys_pg_list->node, &free_list);
2952                 }
2953         spin_unlock(&vm->idr_lock);
2954
2955         list_for_each_entry_safe(phys_pg_list, tmp_phys_node, &free_list, node)
2956                 free_phys_pg_pack(hdev, phys_pg_list);
2957
2958         va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM]);
2959         va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST]);
2960
2961         if (hdev->pmmu_huge_range)
2962                 va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
2963
2964         mutex_destroy(&ctx->mem_hash_lock);
2965         hl_mmu_ctx_fini(ctx);
2966
2967         /* In this case we need to clear the global accounting of DRAM usage
2968          * because the user notifies us on allocations. If the user is no more,
2969          * all DRAM is available
2970          */
2971         if (ctx->asid != HL_KERNEL_ASID_ID &&
2972                         !hdev->asic_prop.dram_supports_virtual_memory)
2973                 atomic64_set(&hdev->dram_used_mem, 0);
2974 }
2975
2976 /**
2977  * hl_vm_init() - initialize virtual memory module.
2978  * @hdev: pointer to the habanalabs device structure.
2979  *
2980  * This function initializes the following:
2981  * - MMU module.
2982  * - DRAM physical pages pool of 2MB.
2983  * - Idr for device memory allocation handles.
2984  */
2985 int hl_vm_init(struct hl_device *hdev)
2986 {
2987         struct asic_fixed_properties *prop = &hdev->asic_prop;
2988         struct hl_vm *vm = &hdev->vm;
2989         int rc;
2990
2991         if (is_power_of_2(prop->dram_page_size))
2992                 vm->dram_pg_pool =
2993                         gen_pool_create(__ffs(prop->dram_page_size), -1);
2994         else
2995                 vm->dram_pg_pool =
2996                         gen_pool_create(__ffs(DRAM_POOL_PAGE_SIZE), -1);
2997
2998         if (!vm->dram_pg_pool) {
2999                 dev_err(hdev->dev, "Failed to create dram page pool\n");
3000                 return -ENOMEM;
3001         }
3002
3003         kref_init(&vm->dram_pg_pool_refcount);
3004
3005         rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address,
3006                         prop->dram_end_address - prop->dram_user_base_address,
3007                         -1);
3008
3009         if (rc) {
3010                 dev_err(hdev->dev,
3011                         "Failed to add memory to dram page pool %d\n", rc);
3012                 goto pool_add_err;
3013         }
3014
3015         spin_lock_init(&vm->idr_lock);
3016         idr_init(&vm->phys_pg_pack_handles);
3017
3018         atomic64_set(&hdev->dram_used_mem, 0);
3019
3020         vm->init_done = true;
3021
3022         return 0;
3023
3024 pool_add_err:
3025         gen_pool_destroy(vm->dram_pg_pool);
3026
3027         return rc;
3028 }
3029
3030 /**
3031  * hl_vm_fini() - virtual memory module teardown.
3032  * @hdev: pointer to the habanalabs device structure.
3033  *
3034  * This function perform teardown to the following:
3035  * - Idr for device memory allocation handles.
3036  * - DRAM physical pages pool of 2MB.
3037  * - MMU module.
3038  */
3039 void hl_vm_fini(struct hl_device *hdev)
3040 {
3041         struct hl_vm *vm = &hdev->vm;
3042
3043         if (!vm->init_done)
3044                 return;
3045
3046         /*
3047          * At this point all the contexts should be freed and hence no DRAM
3048          * memory should be in use. Hence the DRAM pool should be freed here.
3049          */
3050         if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1)
3051                 dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
3052                                 __func__);
3053
3054         vm->init_done = false;
3055 }
3056
3057 /**
3058  * hl_hw_block_mem_init() - HW block memory initialization.
3059  * @ctx: pointer to the habanalabs context structure.
3060  *
3061  * This function initializes the HW block virtual mapped addresses list and
3062  * it's lock.
3063  */
3064 void hl_hw_block_mem_init(struct hl_ctx *ctx)
3065 {
3066         mutex_init(&ctx->hw_block_list_lock);
3067         INIT_LIST_HEAD(&ctx->hw_block_mem_list);
3068 }
3069
3070 /**
3071  * hl_hw_block_mem_fini() - HW block memory teardown.
3072  * @ctx: pointer to the habanalabs context structure.
3073  *
3074  * This function clears the HW block virtual mapped addresses list and destroys
3075  * it's lock.
3076  */
3077 void hl_hw_block_mem_fini(struct hl_ctx *ctx)
3078 {
3079         struct hl_vm_hw_block_list_node *lnode, *tmp;
3080
3081         if (!list_empty(&ctx->hw_block_mem_list))
3082                 dev_crit(ctx->hdev->dev, "HW block mem list isn't empty\n");
3083
3084         list_for_each_entry_safe(lnode, tmp, &ctx->hw_block_mem_list, node) {
3085                 list_del(&lnode->node);
3086                 kfree(lnode);
3087         }
3088
3089         mutex_destroy(&ctx->hw_block_list_lock);
3090 }