/*
 * Copyright (c) 2006-2023, RT-Thread Development Team
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Change Logs:
 * Date           Author       Notes
 * 2023-02-25     GuEe-GUI     the first version
 */

#include <rthw.h>
#include <rtthread.h>
#include <rtdevice.h>

#define DBG_TAG "rtdm.nvme"
#define DBG_LVL DBG_INFO
#include <rtdbg.h>

static struct rt_dm_ida nvme_controller_ida = RT_DM_IDA_INIT(CUSTOM);
static struct rt_dm_ida nvme_ida = RT_DM_IDA_INIT(NVME);

static struct rt_spinlock nvme_lock = {};
static rt_list_t nvme_nodes = RT_LIST_OBJECT_INIT(nvme_nodes);

rt_inline rt_uint32_t nvme_readl(struct rt_nvme_controller *nvme, int offset)
{
    return HWREG32(nvme->regs + offset);
}

rt_inline void nvme_writel(struct rt_nvme_controller *nvme, int offset, rt_uint32_t value)
{
    HWREG32(nvme->regs + offset) = value;
}

rt_inline rt_uint64_t nvme_readq(struct rt_nvme_controller *nvme, int offset)
{
    rt_uint32_t lo32, hi32;

    lo32 = HWREG32(nvme->regs + offset);
    hi32 = HWREG32(nvme->regs + offset + 4);

    return ((rt_uint64_t)hi32 << 32) + lo32;
}

rt_inline void nvme_writeq(struct rt_nvme_controller *nvme, int offset, rt_uint64_t value)
{
    nvme_writel(nvme, offset, (rt_uint32_t)(value & 0xffffffff));
    nvme_writel(nvme, offset + 4, (rt_uint32_t)(value >> 32));
}

static rt_err_t nvme_poll_csts(struct rt_nvme_controller *nvme,
        rt_uint32_t mask, rt_uint32_t value)
{
    rt_tick_t timeout;

    /* CAP.TO is in units of 500 milliseconds. */
    timeout = rt_tick_from_millisecond(RT_NVME_CAP_TIMEOUT(nvme->cap) * 500);
    timeout += rt_tick_get();

    do {
        if ((nvme_readl(nvme, RT_NVME_REG_CSTS) & mask) == value)
        {
            return RT_EOK;
        }

        rt_hw_cpu_relax();
    } while (rt_tick_get() < timeout);

    return -RT_ETIMEOUT;
}

static rt_err_t nvme_enable_ctrl(struct rt_nvme_controller *nvme)
{
    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
    nvme->ctrl_config |= RT_NVME_CC_ENABLE;
    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);

    return nvme_poll_csts(nvme, RT_NVME_CSTS_RDY, RT_NVME_CSTS_RDY);
}

static rt_err_t nvme_disable_ctrl(struct rt_nvme_controller *nvme)
{
    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
    nvme->ctrl_config &= ~RT_NVME_CC_ENABLE;
    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);

    return nvme_poll_csts(nvme, RT_NVME_CSTS_RDY, 0);
}

static rt_err_t nvme_shutdown_ctrl(struct rt_nvme_controller *nvme)
{
    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
    nvme->ctrl_config |= RT_NVME_CC_SHN_NORMAL;
    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);

    return nvme_poll_csts(nvme, RT_NVME_CSTS_SHST_MASK, RT_NVME_CSTS_SHST_CMPLT);
}

rt_inline rt_le16_t nvme_next_cmdid(struct rt_nvme_controller *nvme)
{
    return rt_cpu_to_le16((rt_uint16_t)rt_atomic_add(&nvme->cmdid, 1));
}

static rt_err_t nvme_submit_cmd(struct rt_nvme_queue *queue,
        struct rt_nvme_command *cmd)
{
    rt_ubase_t level;
    rt_err_t err = RT_EOK;
    rt_uint16_t tail, head;
    struct rt_nvme_controller *nvme = queue->nvme;

_retry:
    level = rt_spin_lock_irqsave(&queue->lock);

    tail = queue->sq_tail;
    head = queue->cq_head;

    if (tail + 1 == head)
    {
        /* IO queue is full, waiting for the last IO command to complete. */
        rt_spin_unlock_irqrestore(&queue->lock, level);

        rt_thread_yield();

        goto _retry;
    }

    cmd->common.cmdid = nvme_next_cmdid(nvme);
    rt_memcpy(&queue->sq_cmds[tail], cmd, sizeof(*cmd));

    if (nvme->ops->submit_cmd)
    {
        if ((err = nvme->ops->submit_cmd(queue, cmd)))
        {
            rt_spin_unlock_irqrestore(&queue->lock, level);

            return err;
        }
    }

    if (++tail == queue->depth)
    {
        tail = 0;
    }
    HWREG32(queue->doorbell) = tail;
    queue->sq_tail = tail;

    queue->cmd = cmd;
    queue->err = RT_EOK;

    rt_spin_unlock_irqrestore(&queue->lock, level);

    err = rt_completion_wait(&queue->done,
            rt_tick_from_millisecond(queue->qid != 0 ? RT_WAITING_FOREVER : 60));

    return err ? err : queue->err;
}
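/*
 * Command life cycle, tying nvme_submit_cmd() above to nvme_queue_isr() below:
 *
 *  1. nvme_submit_cmd() copies the command into the submission queue slot at
 *     sq_tail, rings the SQ tail doorbell and blocks on the queue's
 *     completion object.
 *  2. The controller executes the command and posts an entry to the
 *     completion queue, raising the queue interrupt.
 *  3. nvme_queue_isr() accepts the entry at cq_head only when its phase bit
 *     matches cq_phase, records -RT_EIO in queue->err if the status field is
 *     non-zero, advances cq_head (toggling the phase on wrap-around), rings
 *     the CQ head doorbell and wakes the submitter.
 */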
static rt_err_t nvme_set_features_simple(struct rt_nvme_controller *nvme,
        rt_uint32_t fid, rt_uint32_t dword11)
{
    struct rt_nvme_command cmd;

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.features.opcode = RT_NVME_ADMIN_OPCODE_SET_FEATURES;
    cmd.features.fid = rt_cpu_to_le32(fid);
    cmd.features.dword11 = rt_cpu_to_le32(dword11);

    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
}

static rt_err_t nvme_submit_io_cmd(struct rt_nvme_controller *nvme,
        struct rt_nvme_command *cmd)
{
    rt_uint16_t qid;

    /* Pick an I/O queue from a per-CPU counter so submissions spread across the queues. */
    qid = rt_atomic_add(&nvme->ioqid[rt_hw_cpu_id()], RT_CPUS_NR);
    qid %= nvme->io_queue_max;

    return nvme_submit_cmd(&nvme->io_queues[qid], cmd);
}

/*
 * PRP Mode:
 *
 *  |63                                      n+1|n        0|
 *  +-------------------------------------------+----------+---+---+
 *  |             Page Base Address             |  Offset  | 0 | 0 |
 *  +-------------------------------------------+----------+---+---+
 *                        |
 *                        v
 *                Host Physical Pages
 *                                         +----------------------------+
 *  +--------------+----------+            |           Page k           |
 *  |  PRP Entry1  |  Offset  +----------->+----------------------------+
 *  +--------------+----------+            |         Page k + 1         |
 *                                         +----------------------------+
 *                                                      ...
 *                                         +----------------------------+
 *  +--------------+----------+            |         Page k + m         |
 *  |  PRP Entry2  |    0     +----------->+----------------------------+
 *  +--------------+----------+            |       Page k + m + 1       |
 *                                         +----------------------------+
 *
 * PRP List (in PRP Entry2):
 *
 *  |63                                      n+1|n               0|
 *  +-------------------------------------------+-----------------+
 *  |            Page Base Address k            |        0h       |
 *  +-------------------------------------------+-----------------+
 *  |          Page Base Address k + 1          |        0h       |
 *  +-------------------------------------------+-----------------+
 *  |                    ...                                      |
 *  +-------------------------------------------+-----------------+
 *  |          Page Base Address k + m          |        0h       |
 *  +-------------------------------------------+-----------------+
 *  |        Page Base Address k + m + 1        |        0h       |
 *  +-------------------------------------------+-----------------+
 *
 * SGL Mode:
 *
 * An LBA range of 3KB + 4KB + 2KB + 4KB, where the 2KB chunk is not
 * transported (bit bucket), lands in memory blocks A, B and C and is
 * described by three chained segments:
 *
 *  Segment(0)
 *  +----------+----------+
 *  | Address: A          |   -> data block A (3KB)
 *  +----------+----------+
 *  | Type: 0h | Len: 3KB |
 *  +----------+----------+
 *  | Address: Segment(1) |   -> next segment (48 bytes of descriptors)
 *  +----------+----------+
 *  | Type: 2h | Len: 48  |
 *  +----------+----------+
 *
 *  Segment(1)
 *  +----------+----------+
 *  | Address: B          |   -> data block B (4KB)
 *  +----------+----------+
 *  | Type: 0h | Len: 4KB |
 *  +----------+----------+
 *  | Address:            |   -> bit bucket (2KB skipped)
 *  +----------+----------+
 *  | Type: 1h | Len: 2KB |
 *  +----------+----------+
 *  | Address: Segment(2) |   -> last segment (16 bytes of descriptors)
 *  +----------+----------+
 *  | Type: 0h | Len: 16  |
 *  +----------+----------+
 *
 *  Segment(2)
 *  +----------+----------+
 *  | Address: C          |   -> data block C (4KB)
 *  +----------+----------+
 *  | Type: 0h | Len: 4KB |
 *  +----------+----------+
 */
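/*
 * Worked example for the PRP path in nvme_blk_rw() below (illustrative
 * numbers, 4KB controller pages): a page-aligned 24KB transfer uses PRP1 for
 * the first page and a PRP list of RT_DIV_ROUND_UP(20KB, 4KB) = 5 entries
 * for the rest; at 4096 / 8 = 512 entries per list page a single list page
 * is enough, so PRP2 holds that page's physical address. A transfer that
 * fits in one page needs PRP1 only, and a two-page transfer puts the second
 * page straight into PRP2. The helper below is an illustrative sketch of
 * that accounting only; the driver itself does not call it.
 */
rt_inline rt_size_t nvme_prp_list_entries(rt_uint64_t dma_addr,
        rt_ssize_t data_length, rt_size_t page_size)
{
    rt_size_t page_offset = dma_addr & (page_size - 1);
    rt_ssize_t remain_length = data_length - (rt_ssize_t)(page_size - page_offset);

    if (remain_length <= (rt_ssize_t)page_size)
    {
        /* Covered by PRP1 alone, or by PRP1 plus a direct PRP2: no list. */
        return 0;
    }

    /* One list entry per remaining controller page. */
    return RT_DIV_ROUND_UP(remain_length, page_size);
}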
static rt_ssize_t nvme_blk_rw(struct rt_nvme_device *ndev, rt_off_t slba,
        rt_ubase_t buffer_dma, rt_size_t lbas, rt_uint8_t opcode)
{
    rt_err_t err;
    rt_uint16_t max_lbas;
    rt_uint32_t lba_shift;
    rt_size_t tlbas;
    rt_ssize_t data_length;
    struct rt_nvme_command cmd;
    struct rt_nvme_controller *nvme = ndev->ctrl;

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.rw.opcode = opcode;
    cmd.rw.flags = nvme->sgl_mode << RT_NVME_CMD_FLAGS_PSDT_SHIFT;
    cmd.rw.nsid = rt_cpu_to_le32(ndev->nsid);

    tlbas = lbas;
    lba_shift = ndev->lba_shift;
    /* Largest LBA count per command permitted by the transfer size limit. */
    max_lbas = 1 << (nvme->max_transfer_shift - lba_shift);

    if (nvme->sgl_mode)
    {
        while ((rt_ssize_t)lbas > 0)
        {
            if (lbas < max_lbas)
            {
                max_lbas = (rt_uint16_t)lbas;
            }

            data_length = max_lbas << lba_shift;

            cmd.rw.sgl.adddress = rt_cpu_to_le64(buffer_dma);
            cmd.rw.sgl.length = rt_cpu_to_le32(data_length);
            cmd.rw.sgl.sgl_identify = SGL_DESC_TYPE_DATA_BLOCK;
            cmd.rw.slba = rt_cpu_to_le64(slba);
            cmd.rw.length = rt_cpu_to_le16(max_lbas - 1);

            if ((err = nvme_submit_io_cmd(nvme, &cmd)))
            {
                tlbas -= lbas;
                break;
            }

            lbas -= max_lbas;
            slba += max_lbas;
            buffer_dma += data_length;
        }
    }
    else
    {
        void *prp_list = RT_NULL;
        rt_size_t prp_list_size = 0, page_size;

        page_size = nvme->page_size;

        while ((rt_ssize_t)lbas > 0)
        {
            rt_uint64_t prp2_addr, dma_addr;
            rt_ssize_t remain_length, page_offset;

            if (lbas < max_lbas)
            {
                max_lbas = (rt_uint16_t)lbas;
            }

            /*
             * PRP transfer:
             *  1. data_length <= 4KB:
             *      prp1 = buffer_dma
             *      prp2 = 0
             *
             *  2. 4KB < data_length <= 8KB:
             *      prp1 = buffer_dma
             *      prp2 = buffer_dma
             *
             *  3. 8KB < data_length:
             *      prp1 = buffer_dma(0, 4k)
             *      prp2 = buffer_dma(4k, ~)
             */
            dma_addr = buffer_dma;
            page_offset = buffer_dma & (page_size - 1);
            data_length = max_lbas << lba_shift;
            remain_length = data_length - (page_size - page_offset);

            do {
                rt_size_t prps_per_page, prps, pages;
                rt_uint64_t *prp_list_ptr, prp_list_dma;

                if (remain_length <= 0)
                {
                    prp2_addr = 0;
                    break;
                }

                if (remain_length)
                {
                    dma_addr += (page_size - page_offset);
                }

                if (remain_length <= page_size)
                {
                    prp2_addr = dma_addr;
                    break;
                }

                prps_per_page = page_size / sizeof(rt_uint64_t);
                prps = RT_DIV_ROUND_UP(remain_length, page_size);
                pages = RT_DIV_ROUND_UP(prps - 1, prps_per_page - 1);

                if (prps > prp_list_size)
                {
                    if (prp_list)
                    {
                        rt_free_align(prp_list);
                    }

                    prp_list = rt_malloc_align(pages * page_size, page_size);

                    if (!prp_list)
                    {
                        LOG_D("No memory to create a PRP List");

                        /* Ask user to try again */
                        return tlbas - lbas;
                    }
                    prp_list_size = pages * (prps_per_page - 1) + 1;
                }

                prp_list_ptr = prp_list;
                prp_list_dma = (rt_uint64_t)rt_kmem_v2p(prp_list_ptr);

                prp2_addr = prp_list_dma;

                for (int i = 0; prps; --prps, ++i)
                {
                    /* End of the entry, fill the next entry addr if remain */
                    if ((i == (prps_per_page - 1)) && prps > 1)
                    {
                        prp_list_dma += page_size;
                        *prp_list_ptr++ = rt_cpu_to_le64(prp_list_dma);

                        /* Start to fill the next PRP */
                        i = 0;
                    }

                    *prp_list_ptr++ = rt_cpu_to_le64(dma_addr);
                    dma_addr += page_size;
                }

                rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, prp_list, pages * page_size);
            } while (0);

            cmd.rw.prp1 = rt_cpu_to_le64(buffer_dma);
            cmd.rw.prp2 = rt_cpu_to_le64(prp2_addr);
            cmd.rw.slba = rt_cpu_to_le64(slba);
            cmd.rw.length = rt_cpu_to_le16(max_lbas - 1);

            if ((err = nvme_submit_io_cmd(nvme, &cmd)))
            {
                tlbas -= lbas;
                break;
            }

            lbas -= max_lbas;
            slba += max_lbas;
            buffer_dma += data_length;
        }

        if (prp_list)
        {
            rt_free_align(prp_list);
        }
    }

    return tlbas;
}
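/*
 * nvme_blk_read()/nvme_blk_write() below check the DMA address alignment:
 * SGL transfers need at least 4-byte alignment, PRP transfers need the
 * buffer to start on a controller page. Unaligned requests fall back to a
 * page-aligned bounce buffer: reads DMA into it, then invalidate the cache
 * and copy out; writes copy in, flush the cache and DMA from it.
 */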
static rt_ssize_t nvme_blk_read(struct rt_blk_disk *disk, rt_off_t sector,
        void *buffer, rt_size_t sector_count)
{
    rt_ssize_t res;
    rt_uint32_t page_bits;
    rt_size_t buffer_size;
    rt_ubase_t buffer_dma;
    void *temp_buffer = RT_NULL;
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
    struct rt_nvme_controller *nvme = ndev->ctrl;

    buffer_size = (1 << ndev->lba_shift) * sector_count;
    buffer_dma = (rt_ubase_t)rt_kmem_v2p(buffer);

    if ((nvme->sgl_mode && (buffer_dma & RT_GENMASK(1, 0))) ||
        (!nvme->sgl_mode && (buffer_dma & ARCH_PAGE_MASK)))
    {
        LOG_D("DMA %s buffer must be 4-byte (SGL) or page (PRP) aligned", "read");

        page_bits = rt_page_bits(buffer_size);
        temp_buffer = rt_pages_alloc(page_bits);

        if (!temp_buffer)
        {
            return -RT_ENOMEM;
        }

        buffer_dma = (rt_ubase_t)rt_kmem_v2p(temp_buffer);
    }

    res = nvme_blk_rw(ndev, sector, buffer_dma, sector_count, RT_NVME_CMD_READ);

    if (res > 0)
    {
        if (res != sector_count)
        {
            /*
             * Only recompute the byte count for a short transfer; the
             * equality check is cheaper than the multiplication.
             */
            buffer_size = res * (1 << ndev->lba_shift);
        }

        if (temp_buffer)
        {
            rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, temp_buffer, buffer_size);
            rt_memcpy(buffer, temp_buffer, buffer_size);
        }
        else
        {
            rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, buffer, buffer_size);
        }
    }

    if (temp_buffer)
    {
        rt_pages_free(temp_buffer, page_bits);
    }

    return res;
}

static rt_ssize_t nvme_blk_write(struct rt_blk_disk *disk, rt_off_t sector,
        const void *buffer, rt_size_t sector_count)
{
    rt_ssize_t res;
    rt_uint32_t page_bits;
    rt_size_t buffer_size;
    rt_ubase_t buffer_dma;
    void *temp_buffer = RT_NULL;
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
    struct rt_nvme_controller *nvme = ndev->ctrl;

    buffer_size = (1 << ndev->lba_shift) * sector_count;
    buffer_dma = (rt_ubase_t)rt_kmem_v2p((void *)buffer);

    if ((nvme->sgl_mode && (buffer_dma & RT_GENMASK(1, 0))) ||
        (!nvme->sgl_mode && (buffer_dma & ARCH_PAGE_MASK)))
    {
        LOG_D("DMA %s buffer must be 4-byte (SGL) or page (PRP) aligned", "write");

        page_bits = rt_page_bits(buffer_size);
        temp_buffer = rt_pages_alloc(page_bits);

        if (!temp_buffer)
        {
            return -RT_ENOMEM;
        }

        buffer_dma = (rt_ubase_t)rt_kmem_v2p(temp_buffer);

        rt_memcpy(temp_buffer, buffer, buffer_size);
        buffer = temp_buffer;
    }

    rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, (void *)buffer, buffer_size);

    res = nvme_blk_rw(ndev, sector, buffer_dma, sector_count, RT_NVME_CMD_WRITE);

    if (temp_buffer)
    {
        rt_pages_free(temp_buffer, page_bits);
    }

    return res;
}

static rt_err_t nvme_blk_getgeome(struct rt_blk_disk *disk,
        struct rt_device_blk_geometry *geometry)
{
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);

    geometry->bytes_per_sector = 1 << ndev->lba_shift;
    geometry->block_size = 1 << ndev->lba_shift;
    geometry->sector_count = rt_le64_to_cpu(ndev->id.nsze);

    return RT_EOK;
}

static rt_err_t nvme_blk_sync(struct rt_blk_disk *disk)
{
    struct rt_nvme_command cmd;
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.common.opcode = RT_NVME_CMD_FLUSH;
    cmd.common.nsid = rt_cpu_to_le32(ndev->nsid);

    return nvme_submit_io_cmd(ndev->ctrl, &cmd);
}

static rt_err_t nvme_blk_erase(struct rt_blk_disk *disk)
{
    rt_err_t err = RT_EOK;
    rt_ssize_t slba, lbas, max_lbas;
    struct rt_nvme_command cmd;
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
    struct rt_nvme_controller *nvme = ndev->ctrl;

    if (!nvme->write_zeroes)
    {
        return -RT_ENOSYS;
    }

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.write_zeroes.opcode = RT_NVME_CMD_WRITE_ZEROES;
    cmd.write_zeroes.nsid = rt_cpu_to_le32(ndev->nsid);

    slba = 0;
    lbas = rt_le64_to_cpu(ndev->id.nsze);
    max_lbas = 1 << (nvme->max_transfer_shift - ndev->lba_shift);

    while ((rt_ssize_t)lbas > 0)
    {
        if (lbas < max_lbas)
        {
            max_lbas = (rt_uint16_t)lbas;
        }

        cmd.write_zeroes.slba = rt_cpu_to_le64(slba);
        cmd.write_zeroes.length = rt_cpu_to_le16(max_lbas - 1);

        if ((err = nvme_submit_io_cmd(nvme, &cmd)))
        {
            break;
        }

        lbas -= max_lbas;
        slba += max_lbas;
    }

    return err;
}
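/*
 * The block layer's "autorefresh" switch maps to the NVMe volatile write
 * cache feature: when the controller reports a volatile write cache,
 * nvme_blk_autorefresh() below enables or disables that cache through
 * Set Features; without one there is nothing to disable.
 */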
static rt_err_t nvme_blk_autorefresh(struct rt_blk_disk *disk, rt_bool_t is_auto)
{
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
    struct rt_nvme_controller *nvme = ndev->ctrl;

    if (nvme->volatile_write_cache & RT_NVME_CTRL_VWC_PRESENT)
    {
        return nvme_set_features_simple(nvme, RT_NVME_FEAT_VOLATILE_WC, !!is_auto);
    }
    else if (!is_auto)
    {
        /* No volatile write cache to turn off. */
        return RT_EOK;
    }

    return -RT_ENOSYS;
}

static const struct rt_blk_disk_ops nvme_blk_ops =
{
    .read = nvme_blk_read,
    .write = nvme_blk_write,
    .getgeome = nvme_blk_getgeome,
    .sync = nvme_blk_sync,
    .erase = nvme_blk_erase,
    .autorefresh = nvme_blk_autorefresh,
};

static void nvme_queue_isr(int irqno, void *param)
{
    rt_ubase_t level;
    rt_uint16_t head, phase, status;
    struct rt_nvme_queue *queue = param;
    struct rt_nvme_controller *nvme = queue->nvme;

    level = rt_spin_lock_irqsave(&queue->lock);

    head = queue->cq_head;
    phase = queue->cq_phase;
    status = HWREG16(&queue->cq_entry[head].status);
    status = rt_le16_to_cpu(status);

    /* Only consume the entry if its phase bit matches the expected phase. */
    if ((status & 0x01) == phase)
    {
        if ((status >> 1))
        {
            queue->err = -RT_EIO;
            goto _end_cmd;
        }

        if (nvme->ops->complete_cmd)
        {
            nvme->ops->complete_cmd(queue, queue->cmd);
        }

    _end_cmd:
        if (++head == queue->depth)
        {
            head = 0;
            phase = !phase;
        }

        HWREG32(queue->doorbell + nvme->doorbell_stride) = head;
        queue->cq_head = head;
        queue->cq_phase = phase;

        rt_completion_done(&queue->done);
    }

    rt_spin_unlock_irqrestore(&queue->lock, level);
}

static rt_err_t nvme_identify(struct rt_nvme_controller *nvme,
        rt_uint32_t nsid, rt_uint32_t cns, void *data)
{
    rt_err_t err;
    rt_uint32_t page_size = nvme->page_size;
    rt_ubase_t data_phy = (rt_ubase_t)rt_kmem_v2p(data);
    int offset = data_phy & (page_size - 1);
    struct rt_nvme_command cmd;

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.identify.opcode = RT_NVME_ADMIN_OPCODE_IDENTIFY;
    cmd.identify.nsid = rt_cpu_to_le32(nsid);
    cmd.identify.prp1 = rt_cpu_to_le64(data_phy);

    if (sizeof(struct rt_nvme_id_ctrl) <= page_size - offset)
    {
        cmd.identify.prp2 = 0;
    }
    else
    {
        data_phy += (page_size - offset);
        cmd.identify.prp2 = rt_cpu_to_le64(data_phy);
    }

    cmd.identify.cns = rt_cpu_to_le32(cns);

    rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, data, sizeof(struct rt_nvme_id_ctrl));

    if (!(err = nvme_submit_cmd(&nvme->admin_queue, &cmd)))
    {
        rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, data, sizeof(struct rt_nvme_id_ctrl));
    }

    return err;
}
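/*
 * The identify data handled above is a single page-sized (4KB) structure:
 * if the buffer starts on a controller page boundary, PRP1 alone covers it
 * and PRP2 stays 0; if it starts mid-page, the tail spills into the next
 * page and PRP2 must point at that page.
 */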
static rt_err_t nvme_attach_queue(struct rt_nvme_queue *queue, rt_uint8_t opcode)
{
    struct rt_nvme_command cmd;
    struct rt_nvme_controller *nvme = queue->nvme;
    rt_uint16_t flags = RT_NVME_QUEUE_PHYS_CONTIG;

    rt_memset(&cmd, 0, sizeof(cmd));

    if (opcode == RT_NVME_ADMIN_OPCODE_CREATE_CQ)
    {
        cmd.create_cq.opcode = opcode;
        cmd.create_cq.prp1 = rt_cpu_to_le64(queue->cq_entry_phy);
        cmd.create_cq.cqid = rt_cpu_to_le16(queue->qid);
        cmd.create_cq.qsize = rt_cpu_to_le16(queue->depth - 1);
        cmd.create_cq.cq_flags = rt_cpu_to_le16(flags | RT_NVME_CQ_IRQ_ENABLED);
        cmd.create_cq.irq_vector = rt_cpu_to_le16(nvme->irqs_nr > 1 ? queue->qid : 0);
    }
    else if (opcode == RT_NVME_ADMIN_OPCODE_CREATE_SQ)
    {
        cmd.create_sq.opcode = opcode;
        cmd.create_sq.prp1 = rt_cpu_to_le64(queue->sq_cmds_phy);
        cmd.create_sq.sqid = rt_cpu_to_le16(queue->qid);
        cmd.create_sq.qsize = rt_cpu_to_le16(queue->depth - 1);
        cmd.create_sq.sq_flags = rt_cpu_to_le16(flags | RT_NVME_SQ_PRIO_MEDIUM);
        cmd.create_sq.cqid = rt_cpu_to_le16(queue->qid);
    }
    else
    {
        LOG_E("Unsupported queue create opcode = %x", opcode);
        RT_ASSERT(0);
    }

    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
}

rt_inline rt_err_t nvme_attach_queue_sq(struct rt_nvme_queue *queue)
{
    return nvme_attach_queue(queue, RT_NVME_ADMIN_OPCODE_CREATE_SQ);
}

rt_inline rt_err_t nvme_attach_queue_cq(struct rt_nvme_queue *queue)
{
    return nvme_attach_queue(queue, RT_NVME_ADMIN_OPCODE_CREATE_CQ);
}

static rt_err_t nvme_detach_queue(struct rt_nvme_queue *queue, rt_uint8_t opcode)
{
    struct rt_nvme_command cmd;
    struct rt_nvme_controller *nvme = queue->nvme;

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.delete_queue.opcode = opcode;
    cmd.delete_queue.qid = rt_cpu_to_le16(queue->qid);

    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
}

rt_inline rt_ubase_t nvme_queue_dma_flags(void)
{
    return RT_DMA_F_NOCACHE | RT_DMA_F_LINEAR;
}

static void nvme_free_queue(struct rt_nvme_queue *queue)
{
    rt_ubase_t dma_flags;
    struct rt_nvme_controller *nvme = queue->nvme;

    if (nvme->ops->cleanup_queue)
    {
        rt_err_t err;

        if ((err = nvme->ops->cleanup_queue(queue)))
        {
            LOG_W("Cleanup[%s] queue error = %s", nvme->ops->name, rt_strerror(err));
        }
    }

    dma_flags = nvme_queue_dma_flags();

    if (queue->sq_cmds)
    {
        rt_dma_free(nvme->dev, sizeof(*queue->sq_cmds) * queue->depth,
                queue->sq_cmds, queue->sq_cmds_phy, dma_flags);
    }

    if (queue->cq_entry)
    {
        rt_dma_free(nvme->dev, sizeof(*queue->cq_entry) * queue->depth,
                queue->cq_entry, queue->cq_entry_phy, dma_flags);
    }
}

static struct rt_nvme_queue *nvme_alloc_queue(struct rt_nvme_controller *nvme,
        int qid, int depth)
{
    rt_err_t err;
    rt_ubase_t dma_flags;
    struct rt_nvme_queue *queue = &nvme->queue[qid];

    rt_memset(queue, 0, sizeof(*queue));

    queue->nvme = nvme;
    queue->doorbell = &nvme->doorbell_tbl[qid * 2 * nvme->doorbell_stride];
    queue->qid = qid;
    queue->depth = depth;
    queue->cq_head = 0;
    queue->cq_phase = 1;
    rt_completion_init(&queue->done);
    rt_spin_lock_init(&queue->lock);

    dma_flags = nvme_queue_dma_flags();

    /* struct rt_nvme_command */
    queue->sq_cmds = rt_dma_alloc(nvme->dev, sizeof(*queue->sq_cmds) * depth,
            &queue->sq_cmds_phy, dma_flags);

    if (!queue->sq_cmds)
    {
        err = -RT_ENOMEM;
        goto _fail;
    }

    /* struct rt_nvme_completion */
    queue->cq_entry = rt_dma_alloc(nvme->dev, sizeof(*queue->cq_entry) * depth,
            &queue->cq_entry_phy, dma_flags);

    if (!queue->cq_entry)
    {
        err = -RT_ENOMEM;
        goto _fail;
    }

    rt_memset(queue->sq_cmds, 0, sizeof(struct rt_nvme_command) * depth);
    rt_memset(queue->cq_entry, 0, sizeof(struct rt_nvme_completion) * depth);

    if (nvme->ops->setup_queue)
    {
        if ((err = nvme->ops->setup_queue(queue)))
        {
            LOG_E("Setup[%s] queue error = %s", nvme->ops->name, rt_strerror(err));
            goto _fail;
        }
    }

    return queue;

_fail:
    nvme_free_queue(queue);

    return rt_err_ptr(err);
}
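/*
 * Doorbell layout (illustrative, assuming CAP.DSTRD = 0 so the stride is
 * 1 << 0 = 1 register): each queue pair qid owns two consecutive 32-bit
 * doorbells starting at doorbell_tbl[qid * 2], the SQ tail doorbell first
 * and the CQ head doorbell one stride later, which is why nvme_queue_isr()
 * writes to queue->doorbell + doorbell_stride.
 */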
static rt_err_t nvme_configure_admin_queue(struct rt_nvme_controller *nvme)
{
    rt_err_t err;
    int irq;
    char name[RT_NAME_MAX];
    rt_uint32_t aqa;
    rt_uint32_t page_shift = ARCH_PAGE_SHIFT;
    rt_uint32_t page_min = RT_NVME_CAP_MPSMIN(nvme->cap) + 12;
    rt_uint32_t page_max = RT_NVME_CAP_MPSMAX(nvme->cap) + 12;
    struct rt_nvme_queue *admin_queue;

    if (page_shift < page_min)
    {
        LOG_E("Device %s page size (%u) %s than host (%u)",
                "minimum", 1 << page_min, "larger", 1 << page_shift);

        return -RT_EINVAL;
    }

    if (page_shift > page_max)
    {
        LOG_W("Device %s page size (%u) %s than host (%u)",
                "maximum", 1 << page_max, "smaller", 1 << page_shift);

        page_shift = page_max;
    }

    if ((err = nvme_disable_ctrl(nvme)))
    {
        return err;
    }

    admin_queue = nvme_alloc_queue(nvme, 0, RT_NVME_AQ_DEPTH);

    if (rt_is_err(admin_queue))
    {
        return rt_ptr_err(admin_queue);
    }

    aqa = admin_queue->depth - 1;
    aqa |= aqa << 16;

    nvme->page_shift = page_shift;
    nvme->page_size = 1U << page_shift;

    nvme->ctrl_config = RT_NVME_CC_CSS_NVM;
    nvme->ctrl_config |= (page_shift - 12) << RT_NVME_CC_MPS_SHIFT;
    nvme->ctrl_config |= RT_NVME_CC_ARB_RR | RT_NVME_CC_SHN_NONE;
    nvme->ctrl_config |= RT_NVME_CC_IOSQES | RT_NVME_CC_IOCQES;

    nvme_writel(nvme, RT_NVME_REG_AQA, aqa);
    nvme_writeq(nvme, RT_NVME_REG_ASQ, admin_queue->sq_cmds_phy);
    nvme_writeq(nvme, RT_NVME_REG_ACQ, admin_queue->cq_entry_phy);

    if ((err = nvme_enable_ctrl(nvme)))
    {
        nvme_free_queue(admin_queue);

        return err;
    }

    irq = nvme->irqs[0];
    rt_snprintf(name, RT_NAME_MAX, "%s-admin-queue", nvme->name);

    rt_hw_interrupt_install(irq, nvme_queue_isr, &nvme->admin_queue, name);
    rt_hw_interrupt_umask(irq);

    return RT_EOK;
}

static rt_err_t nvme_setup_io_queues(struct rt_nvme_controller *nvme)
{
    rt_err_t err;
    rt_uint32_t value;
    int irq, cpuid = 0;
    char name[RT_NAME_MAX];
    rt_bool_t affinity_fixup = RT_FALSE;
    RT_DECLARE_IRQ_AFFINITY(affinity) = { 0 };
    struct rt_nvme_queue *queue;

    nvme->io_queue_max = nvme->irqs_nr > 1 ? nvme->irqs_nr - 1 : 1;

    /* Request the number of I/O SQs and CQs (zero-based) via Set Features. */
    value = (nvme->io_queue_max - 1) | ((nvme->io_queue_max - 1) << 16);

    if ((err = nvme_set_features_simple(nvme, RT_NVME_FEAT_NUM_QUEUES, value)))
    {
        return err;
    }

    for (int i = 0, q_idx = 1; i < nvme->io_queue_max; ++i, ++q_idx)
    {
        queue = nvme_alloc_queue(nvme, q_idx, nvme->queue_depth);

        if (rt_is_err(queue))
        {
            return rt_ptr_err(queue);
        }

        if ((err = nvme_attach_queue_cq(queue)) ||
            (err = nvme_attach_queue_sq(queue)))
        {
            return err;
        }
    }

    for (int i = 0, irq_idx = 1; i < nvme->io_queue_max; ++i, ++irq_idx)
    {
        irq = nvme->irqs[irq_idx % nvme->irqs_nr];

        rt_snprintf(name, RT_NAME_MAX, "%s-io-queue%d", nvme->name, i);

        if (!affinity_fixup)
        {
            RT_IRQ_AFFINITY_SET(affinity, cpuid % RT_CPUS_NR);

            if (rt_pic_irq_set_affinity(irq, affinity))
            {
                /* Fixup in secondary CPU startup */
                affinity_fixup = RT_TRUE;
            }

            RT_IRQ_AFFINITY_CLEAR(affinity, cpuid++ % RT_CPUS_NR);
        }

        rt_hw_interrupt_install(irq, nvme_queue_isr, &nvme->io_queues[i], name);
        rt_hw_interrupt_umask(irq);
    }

    return RT_EOK;
}

static void nvme_remove_io_queues(struct rt_nvme_controller *nvme)
{
    int irq;
    struct rt_nvme_queue *queue;

    for (int i = 0, irq_idx = 1; i < nvme->io_queue_max; ++i, ++irq_idx)
    {
        queue = &nvme->io_queues[i];

        nvme_detach_queue(queue, RT_NVME_ADMIN_OPCODE_DELETE_SQ);
        nvme_detach_queue(queue, RT_NVME_ADMIN_OPCODE_DELETE_CQ);
        nvme_free_queue(queue);

        irq = nvme->irqs[irq_idx % nvme->irqs_nr];

        rt_hw_interrupt_mask(irq);
        rt_pic_detach_irq(irq, queue);
    }
}

static void nvme_remove_admin_queues(struct rt_nvme_controller *nvme)
{
    int irq = nvme->irqs[0];

    rt_hw_interrupt_mask(irq);
    rt_pic_detach_irq(irq, &nvme->admin_queue);

    nvme_free_queue(&nvme->admin_queue);
}

static void nvme_remove_devices(struct rt_nvme_controller *nvme)
{
    struct rt_nvme_device *ndev, *next_ndev;

    rt_list_for_each_entry_safe(ndev, next_ndev, &nvme->ns_nodes, list)
    {
        rt_list_remove(&ndev->list);

        rt_hw_blk_disk_unregister(&ndev->parent);

        rt_free(ndev);
    }
}
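/*
 * Namespace scan note: FLBAS selects the active entry of the namespace's
 * LBA format table, and lbaf[n].ds is the LBA data size as a power of two
 * (ds = 9 means 512-byte sectors, ds = 12 means 4KB sectors). The scan below
 * stores it as lba_shift and skips namespaces whose size (nsze) is zero.
 */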
static rt_err_t nvme_scan_device(struct rt_nvme_controller *nvme,
        rt_size_t number_of_ns)
{
    rt_err_t err = RT_EOK;
    rt_uint32_t lbaf;
    struct rt_nvme_id_ns *id = RT_NULL;

    if (!(id = rt_malloc_align(sizeof(*id), nvme->page_size)))
    {
        return -RT_ENOMEM;
    }

    /* NVMe namespace IDs start at 1 */
    for (rt_uint32_t nsid = 1; nsid <= number_of_ns; ++nsid)
    {
        struct rt_nvme_device *ndev = rt_calloc(1, sizeof(*ndev));

        if (!ndev)
        {
            err = -RT_ENOMEM;
            goto _free_res;
        }

        rt_memset(id, 0, sizeof(*id));

        if ((err = nvme_identify(nvme, nsid, 0, id)))
        {
            goto _free_res;
        }

        if (!id->nsze)
        {
            rt_free(ndev);
            continue;
        }

        ndev->ctrl = nvme;
        rt_memcpy(&ndev->id, id, sizeof(ndev->id));

        lbaf = id->flbas & RT_NVME_NS_FLBAS_LBA_MASK;
        lbaf |= ((id->flbas & RT_NVME_NS_FLBAS_LBA_UMASK) >> RT_NVME_NS_FLBAS_LBA_SHIFT);

        ndev->nsid = nsid;
        ndev->lba_shift = id->lbaf[lbaf].ds;

        ndev->parent.ida = &nvme_ida;
        ndev->parent.parallel_io = RT_TRUE;
        ndev->parent.ops = &nvme_blk_ops;
        ndev->parent.max_partitions = RT_BLK_PARTITION_MAX;
        rt_dm_dev_set_name(&ndev->parent.parent, "%sn%u", nvme->name, nsid);

        if ((err = rt_hw_blk_disk_register(&ndev->parent)))
        {
            goto _free_res;
        }

        rt_list_init(&ndev->list);
        rt_list_insert_before(&nvme->ns_nodes, &ndev->list);
    }

_free_res:
    rt_free_align(id);

    return err;
}

rt_inline rt_size_t strip_len(const char *str, rt_size_t max_len)
{
    rt_size_t size = 0;

    /* Length of the string with trailing spaces stripped. */
    for (int i = 0; *str && i < max_len; ++i, ++str)
    {
        if (*str != ' ')
        {
            size = i + 1;
        }
    }

    return size;
}
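/*
 * Transfer size note (illustrative numbers): MDTS in the identify data is a
 * power-of-two exponent applied to the minimum page size, so MDTS = 5 with
 * CAP.MPSMIN = 0 (4KB pages) gives max_transfer_shift = 5 + 12 and a 128KB
 * limit per command; with 512-byte LBAs nvme_blk_rw() then issues at most
 * 128KB / 512 = 256 LBAs per read or write command.
 */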
rt_err_t rt_nvme_controller_register(struct rt_nvme_controller *nvme)
{
    rt_err_t err;
    struct rt_nvme_id_ctrl *ctrl = RT_NULL;

    if (!nvme || !nvme->ops)
    {
        return -RT_EINVAL;
    }

    if (nvme_readl(nvme, RT_NVME_REG_CSTS) == (rt_uint32_t)-1)
    {
        LOG_E("Controller registers are not accessible");

        return -RT_EINVAL;
    }

    if ((nvme->nvme_id = rt_dm_ida_alloc(&nvme_controller_ida)) < 0)
    {
        return -RT_EFULL;
    }

    rt_snprintf(nvme->name, RT_NAME_MAX, "nvme%u", nvme->nvme_id);

    nvme->cap = nvme_readq(nvme, RT_NVME_REG_CAP);
    nvme->queue_depth = RT_NVME_CAP_MQES(nvme->cap) + 1;
    nvme->doorbell_stride = 1 << RT_NVME_CAP_STRIDE(nvme->cap);
    nvme->doorbell_tbl = nvme->regs + RT_NVME_REG_DBS;

    if ((err = nvme_configure_admin_queue(nvme)))
    {
        LOG_E("Configure admin queue error = %s", rt_strerror(err));
        goto _free_admin_queue;
    }

    if ((err = nvme_setup_io_queues(nvme)))
    {
        LOG_E("Unable to setup I/O queues error = %s", rt_strerror(err));
        goto _free_admin_queue;
    }

    if (!(ctrl = rt_malloc_align(sizeof(*ctrl), nvme->page_size)))
    {
        err = -RT_ENOMEM;
        goto _fail;
    }

    if ((err = nvme_identify(nvme, 0, 1, ctrl)))
    {
        goto _fail;
    }

    if (ctrl->mdts)
    {
        nvme->max_transfer_shift = ctrl->mdts + (RT_NVME_CAP_MPSMIN(nvme->cap) + 12);
    }
    else
    {
        /* 1MB is recommended. */
        nvme->max_transfer_shift = 20;
    }

    nvme->volatile_write_cache = ctrl->vwc;
    nvme->write_zeroes = !!(rt_le64_to_cpu(ctrl->oncs) & RT_NVME_CTRL_ONCS_WRITE_ZEROES);

    if ((rt_le32_to_cpu(ctrl->sgls) & RT_NVME_ID_SGL_SUPPORT_MASK))
    {
        nvme->sgl_mode = RT_NVME_PSDT_SGL_MPTR_SGL;
    }

    LOG_I("NVM Express v%d.%d (%s, %.*s, %.*s)",
            nvme_readl(nvme, RT_NVME_REG_VS) >> 16,
            (nvme_readl(nvme, RT_NVME_REG_VS) >> 8) & 0xff,
            nvme->ops->name,
            strip_len(ctrl->mn, sizeof(ctrl->mn)), ctrl->mn,
            strip_len(ctrl->fr, sizeof(ctrl->fr)), ctrl->fr);

    rt_list_init(&nvme->ns_nodes);

    if ((err = nvme_scan_device(nvme, rt_le32_to_cpu(ctrl->nn))))
    {
        goto _fail;
    }

    rt_free_align(ctrl);

    rt_spin_lock(&nvme_lock);
    rt_list_insert_after(&nvme_nodes, &nvme->list);
    rt_spin_unlock(&nvme_lock);

    return RT_EOK;

_fail:
    if (ctrl)
    {
        rt_free_align(ctrl);
    }

    nvme_remove_devices(nvme);
    nvme_remove_io_queues(nvme);

_free_admin_queue:
    nvme_remove_admin_queues(nvme);

    rt_dm_ida_free(&nvme_controller_ida, nvme->nvme_id);

    return err;
}

rt_err_t rt_nvme_controller_unregister(struct rt_nvme_controller *nvme)
{
    rt_err_t err;

    if (!nvme)
    {
        return -RT_EINVAL;
    }

    rt_spin_lock(&nvme_lock);
    rt_list_remove(&nvme->list);
    rt_spin_unlock(&nvme_lock);

    nvme_remove_devices(nvme);
    nvme_remove_io_queues(nvme);
    nvme_remove_admin_queues(nvme);

    rt_dm_ida_free(&nvme_controller_ida, nvme->nvme_id);

    if (!(err = nvme_shutdown_ctrl(nvme)))
    {
        err = nvme_disable_ctrl(nvme);
    }
    else
    {
        LOG_E("%s: shutdown error = %s", nvme->name, rt_strerror(err));
    }

    return err;
}

/*
 * NVMe I/O queues are meant to be per-CPU. Fix up the IRQ affinity after the
 * secondary CPUs have started; doing it at this stage gives the affinity
 * setting the best chance of succeeding.
 */
static int nvme_queue_affinify_fixup(void)
{
    int cpuid = rt_hw_cpu_id();
    struct rt_nvme_controller *nvme;
    RT_DECLARE_IRQ_AFFINITY(affinity) = { 0 };
    RT_DECLARE_IRQ_AFFINITY(current_affinity) = { 0 };

    RT_IRQ_AFFINITY_SET(affinity, cpuid);

    rt_hw_spin_lock(&nvme_lock.lock);

    rt_list_for_each_entry(nvme, &nvme_nodes, list)
    {
        for (int i = cpuid % RT_CPUS_NR; i < nvme->io_queue_max; i += RT_CPUS_NR)
        {
            int irq = nvme->irqs[i];

            if (!rt_pic_irq_get_affinity(irq, current_affinity) &&
                !rt_bitmap_test_bit(current_affinity, cpuid))
            {
                rt_ubase_t level = rt_hw_interrupt_disable();

                rt_pic_irq_set_affinity(irq, affinity);

                rt_hw_interrupt_enable(level);
            }
        }
    }

    rt_hw_spin_unlock(&nvme_lock.lock);

    return 0;
}
INIT_SECONDARY_CPU_EXPORT(nvme_queue_affinify_fixup);
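/*
 * Registration sketch (illustrative, not part of this driver; the names, the
 * probe signature and the allocation pattern are hypothetical): a transport
 * layer such as a PCIe glue driver is expected to map the controller
 * registers, collect its interrupt vectors, fill in at least the fields read
 * by this file and then hand the controller over:
 *
 *     static const struct rt_nvme_ops pcie_nvme_ops =
 *     {
 *         .name = "pcie",
 *         // optional hooks: submit_cmd, complete_cmd, setup_queue, cleanup_queue
 *     };
 *
 *     static rt_err_t pcie_nvme_probe(struct rt_device *dev, void *regs,
 *             int *irqs, rt_size_t irqs_nr)
 *     {
 *         struct rt_nvme_controller *nvme = rt_calloc(1, sizeof(*nvme));
 *
 *         if (!nvme)
 *             return -RT_ENOMEM;
 *
 *         nvme->dev = dev;
 *         nvme->regs = regs;
 *         nvme->irqs = irqs;
 *         nvme->irqs_nr = irqs_nr;
 *         nvme->ops = &pcie_nvme_ops;
 *
 *         return rt_nvme_controller_register(nvme);
 *     }
 *
 * rt_nvme_controller_unregister() undoes the registration and shuts the
 * controller down.
 */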