diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 30ddaaae67e5..5dc6c77b4721 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -12,6 +12,7 @@ #include #include +#include #include @@ -2199,6 +2200,8 @@ void hl_device_fini(struct hl_device *hdev) hl_mmu_fini(hdev); + vfree(hdev->captured_err_info.pgf_info.user_mappings); + hl_eq_fini(hdev, &hdev->event_queue); kfree(hdev->shadow_cs_queue); @@ -2275,3 +2278,58 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_ num_of_engines * sizeof(u16)); hdev->captured_err_info.razwi.flags = flags; } +static void hl_capture_user_mappings(struct hl_device *hdev) +{ + struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info; + struct hl_vm_hash_node *hnode; + struct hl_userptr *userptr; + struct hl_ctx *ctx; + u32 map_idx = 0; + int i; + + ctx = hl_get_compute_ctx(hdev); + if (!ctx) { + dev_err(hdev->dev, "Can't get user context for user mappings\n"); + return; + } + + mutex_lock(&ctx->mem_hash_lock); + hash_for_each(ctx->mem_hash, i, hnode, node) + pgf_info->num_of_user_mappings++; + + if (!pgf_info->num_of_user_mappings) + goto finish; + + /* In case we already allocated in previous session, need to release it before + * allocating new buffer. + */ + vfree(pgf_info->user_mappings); + pgf_info->user_mappings = + vmalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping)); + if (!pgf_info->user_mappings) { + pgf_info->num_of_user_mappings = 0; + goto finish; + } + + hash_for_each(ctx->mem_hash, i, hnode, node) { + userptr = hnode->ptr; + pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; + pgf_info->user_mappings[map_idx].size = userptr->size; + map_idx++; + } +finish: + mutex_unlock(&ctx->mem_hash_lock); + hl_ctx_put(ctx); +} + +void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu) +{ + /* Capture only the first page fault */ + if (atomic_cmpxchg(&hdev->captured_err_info.pgf_info_recorded, 0, 1)) + return; + + hdev->captured_err_info.pgf_info.pgf.timestamp = ktime_to_ns(ktime_get()); + hdev->captured_err_info.pgf_info.pgf.addr = addr; + hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id; + hl_capture_user_mappings(hdev); +} diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index f4b3fa4b0976..1489240d5a3a 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2957,19 +2957,38 @@ struct undefined_opcode_info { bool write_enable; }; +/** + * struct page_fault_info - info about page fault + * @pgf_info: page fault information. + * @user_mappings: buffer containing user mappings. + * @num_of_user_mappings: number of user mappings. + */ +struct page_fault_info { + struct hl_page_fault_info pgf; + struct hl_user_mapping *user_mappings; + u64 num_of_user_mappings; +}; + /** * struct hl_error_info - holds information collected during an error. * @cs_timeout: CS timeout error information. * @razwi: razwi information. * @razwi_info_recorded: if set writing to razwi information is enabled. - * otherwise - disabled, so the first (root cause) razwi will not be overwritten. + * otherwise - disabled, so the first (root cause) razwi will not be + * overwritten. * @undef_opcode: undefined opcode information + * @pgf_info: page fault information. + * @pgf_info_recorded: if set writing to page fault information is enabled. + * otherwise - disabled, so the first (root cause) page fault will not be + * overwritten. */ struct hl_error_info { struct cs_timeout_info cs_timeout; struct hl_info_razwi_event razwi; atomic_t razwi_info_recorded; struct undefined_opcode_info undef_opcode; + struct page_fault_info pgf_info; + atomic_t pgf_info_recorded; }; /** @@ -3781,6 +3800,7 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...); void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, u8 flags); +void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index d87434b9bc16..714994725224 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -213,6 +213,7 @@ int hl_device_open(struct inode *inode, struct file *filp) atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0); + atomic_set(&hdev->captured_err_info.pgf_info_recorded, 0); hdev->captured_err_info.undef_opcode.write_enable = true; hdev->open_counter++; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 6aef4e24d122..cac2c7fb14f1 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -778,6 +778,42 @@ static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args) return rc; } +static int page_fault_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct hl_page_fault_info *info = &hdev->captured_err_info.pgf_info.pgf; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_page_fault_info))) + ? -EFAULT : 0; +} + +static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + u32 user_buf_size = args->return_size; + struct hl_device *hdev = hpriv->hdev; + struct page_fault_info *pgf_info; + u64 actual_size; + + pgf_info = &hdev->captured_err_info.pgf_info; + args->array_size = pgf_info->num_of_user_mappings; + + if (!out) + return -EINVAL; + + actual_size = pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping); + if (user_buf_size < actual_size) + return -ENOMEM; + + return copy_to_user(out, pgf_info->user_mappings, min_t(size_t, user_buf_size, actual_size)) + ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -837,6 +873,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_GET_EVENTS: return events_info(hpriv, args); + case HL_INFO_PAGE_FAULT_EVENT: + return page_fault_info(hpriv, args); + + case HL_INFO_USER_MAPPINGS: + return user_mappings_info(hpriv, args); + default: break; } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index f856ac51fde1..1a99f7be8b60 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6755,6 +6755,8 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); + hl_capture_page_fault(hdev, *addr, 0, true); + WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index d6f84cb35e3d..2b794f54e2ed 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -778,6 +778,9 @@ enum hl_server_type { * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd * HL_INFO_GET_EVENTS - Retrieve the last occurred events * HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information. + * HL_INFO_ENGINE_STATUS - Retrieve the status of all the h/w engines in the asic. + * HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault. + * HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event. */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -809,6 +812,8 @@ enum hl_server_type { #define HL_INFO_GET_EVENTS 30 #define HL_INFO_UNDEFINED_OPCODE_EVENT 31 #define HL_INFO_ENGINE_STATUS 32 +#define HL_INFO_PAGE_FAULT_EVENT 33 +#define HL_INFO_USER_MAPPINGS 34 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -1187,6 +1192,29 @@ struct hl_info_sec_attest { __u8 pad0[2]; }; +/** + * struct hl_page_fault_info - page fault information. + * @timestamp: timestamp of page fault. + * @addr: address which accessing it caused page fault. + * @engine_id: engine id which caused the page fault, supported only in gaudi3. + */ +struct hl_page_fault_info { + __s64 timestamp; + __u64 addr; + __u16 engine_id; + __u8 pad[6]; +}; + +/** + * struct hl_user_mapping - user mapping information. + * @dev_va: device virtual address. + * @size: virtual address mapping size. + */ +struct hl_user_mapping { + __u64 dev_va; + __u64 size; +}; + enum gaudi_dcores { HL_GAUDI_WS_DCORE, HL_GAUDI_WN_DCORE, @@ -1213,6 +1241,8 @@ enum gaudi_dcores { * needed, hence updating this variable so user will know the exact amount * of bytes copied by the kernel to the buffer. * @sec_attest_nonce: Nonce number used for attestation report. + * @array_size: Number of array members copied to user buffer. + * Relevant for HL_INFO_USER_MAPPINGS info ioctl. * @pad: Padding to 64 bit. */ struct hl_info_args { @@ -1228,6 +1258,7 @@ struct hl_info_args { __u32 eventfd; __u32 user_buffer_actual_size; __u32 sec_attest_nonce; + __u32 array_size; }; __u32 pad;