From f257cf1e1f05f0e5b1af6905eab4f57244ea1ebb Mon Sep 17 00:00:00 2001
From: Ivan Roth
Date: Mon, 12 Aug 2013 10:52:30 -0500
Subject: [PATCH] Use scatterlist table API; don't count ISRs with no outstanding cmds

Switch from a contiguous raw scatterlist to the scatterlist table
(sg_table) API. This fixes a memory allocation failure that occurs when
host memory becomes fragmented enough that a contiguous scatterlist can
no longer be allocated.

Also, don't increment the ISR count if there are no outstanding
commands. This prevents a test failure caused by a race between reaping
a completion, unmasking interrupts, and the device canceling its
interrupt: a spurious interrupt can arrive the moment interrupts are
unmasked.

(An illustrative sketch of the sg_table allocation pattern follows the
patch.)
---
 dnvme_cmds.c   | 79 ++++++++++++++++++++++++++------------------------
 dnvme_ds.c     |  6 ++--
 dnvme_ds.h     |  5 ++--
 dnvme_ioctls.c | 14 +++++++--
 dnvme_irq.c    | 46 ++++++++++++++++++++++++++---
 dnvme_irq.h    |  9 ++++++
 dnvme_queue.c  |  9 +++++-
 7 files changed, 118 insertions(+), 50 deletions(-)

diff --git a/dnvme_cmds.c b/dnvme_cmds.c
index 2b775c0..756ee93 100644
--- a/dnvme_cmds.c
+++ b/dnvme_cmds.c
@@ -36,11 +36,11 @@ static int data_buf_to_prp(struct nvme_device *nvme_dev,
     enum data_buf_type data_buf_type, u16 cmd_id);
 static int map_user_pg_to_dma(struct nvme_device *nvme_dev,
     enum dma_data_direction kernel_dir, unsigned long buf_addr,
-    unsigned total_buf_len, struct scatterlist **sg_list,
-    struct nvme_prps *prps, enum data_buf_type data_buf_type);
+    unsigned total_buf_len, struct nvme_prps *prps,
+    enum data_buf_type data_buf_type);
 static int pages_to_sg(struct page **pages, int num_pages, int buf_offset,
-    unsigned len, struct scatterlist **sg_list);
-static int setup_prps(struct nvme_device *nvme_dev, struct scatterlist *sg,
+    unsigned len, struct sg_table *sg_tbl);
+static int setup_prps(struct nvme_device *nvme_dev,
     s32 buf_len, struct nvme_prps *prps, u8 cr_io_q,
     enum send_64b_bitmask prp_mask);
 static void unmap_user_pg_to_dma(struct nvme_device *nvme_dev,
@@ -182,7 +182,7 @@ static int data_buf_to_prp(struct nvme_device *nvme_dev,
 {
     int err;
     unsigned long addr;
-    struct scatterlist *sg_list = NULL;
+
     enum dma_data_direction kernel_dir;
 #ifdef TEST_PRP_DEBUG
     int last_prp, i, j;
@@ -207,12 +207,12 @@ static int data_buf_to_prp(struct nvme_device *nvme_dev,
 
     /* Mapping user pages to dma memory */
     err = map_user_pg_to_dma(nvme_dev, kernel_dir, addr,
-        nvme_64b_send->data_buf_size, &sg_list, prps, data_buf_type);
+        nvme_64b_send->data_buf_size, prps, data_buf_type);
     if (err < 0) {
         return err;
     }
 
-    err = setup_prps(nvme_dev, sg_list, nvme_64b_send->data_buf_size, prps,
+    err = setup_prps(nvme_dev, nvme_64b_send->data_buf_size, prps,
         data_buf_type, nvme_64b_send->bit_mask);
     if (err < 0) {
         unmap_user_pg_to_dma(nvme_dev, prps);
@@ -278,8 +278,8 @@ static int data_buf_to_prp(struct nvme_device *nvme_dev,
 
 static int map_user_pg_to_dma(struct nvme_device *nvme_dev,
     enum dma_data_direction kernel_dir, unsigned long buf_addr,
-    unsigned total_buf_len, struct scatterlist **sg_list,
-    struct nvme_prps *prps, enum data_buf_type data_buf_type)
+    unsigned total_buf_len, struct nvme_prps *prps,
+    enum data_buf_type data_buf_type)
 {
     int i, err, buf_pg_offset, buf_pg_count, num_sg_entries;
     struct page **pages;
@@ -323,9 +323,9 @@ static int map_user_pg_to_dma(struct nvme_device *nvme_dev,
         }
     }
 
-    /* Generate SG List from pinned down pages */
+    /* Generate SG Table from pinned down pages */
     err = pages_to_sg(pages, buf_pg_count, buf_pg_offset,
-        total_buf_len, sg_list);
+        total_buf_len, &prps->st);
     if (err < 0) {
         LOG_ERR("Generation of sg lists failed");
         goto error_unmap;
@@ -334,7 +334,7 @@ static int map_user_pg_to_dma(struct nvme_device *nvme_dev,
     /* Mapping SG List to DMA; NOTE: The sg list could be coalesced by either
      * an IOMMU or the kernel, so checking whether or not the number of
      * mapped entries equate to the number given to the func is not warranted */
-    num_sg_entries = dma_map_sg(&nvme_dev->private_dev.pdev->dev, *sg_list,
+    num_sg_entries = dma_map_sg(&nvme_dev->private_dev.pdev->dev, prps->st.sgl,
         buf_pg_count, kernel_dir);
     LOG_DBG("%d elements mapped out of %d sglist elements",
         num_sg_entries, num_sg_entries);
@@ -349,7 +349,7 @@ static int map_user_pg_to_dma(struct nvme_device *nvme_dev,
     {
         struct scatterlist *work;
         LOG_DBG("Dump sg list after DMA mapping");
-        for (i = 0, work = *sg_list; i < num_sg_entries; i++,
+        for (i = 0, work = prps->st.sgl; i < num_sg_entries; i++,
             work = sg_next(work)) {
             LOG_DBG(" sg list: page_link=0x%016lx",
                 work->page_link);
@@ -362,7 +362,6 @@ static int map_user_pg_to_dma(struct nvme_device *nvme_dev,
 #endif
 
     /* Fill in nvme_prps */
-    prps->sg = *sg_list;
     prps->num_map_pgs = num_sg_entries;
     prps->vir_kern_addr = vir_kern_addr;
     prps->data_dir = kernel_dir;
@@ -381,32 +380,32 @@ static int map_user_pg_to_dma(struct nvme_device *nvme_dev,
 }
 
 static int pages_to_sg(struct page **pages, int num_pages, int buf_offset,
-    unsigned len, struct scatterlist **sg_list)
-{
+    unsigned len, struct sg_table *sg_tbl)
+{
     int i;
-    struct scatterlist *sg;
-
-    *sg_list = NULL;
-    sg = kmalloc((num_pages * sizeof(struct scatterlist)), GFP_KERNEL);
-    if (sg == NULL) {
-        LOG_ERR("Memory alloc for sg list failed");
-        return -ENOMEM;
-    }
-
+    int rc;
+    struct scatterlist *sg_cur = NULL, *sg_prv = NULL;
+
+    rc = sg_alloc_table(sg_tbl, num_pages, GFP_KERNEL);
+    if (rc) {
+        LOG_ERR("Memory alloc for sg table failed rc=0x%X", rc);
+        return rc;
+    }
+
     /* Building the SG List */
-    sg_init_table(sg, num_pages);
-    for (i = 0; i < num_pages; i++) {
+    for (i = 0, sg_cur=sg_tbl->sgl; i < num_pages; i++, sg_cur=sg_next(sg_cur)) {
         if (pages[i] == NULL) {
-            kfree(sg);
+            sg_free_table(sg_tbl);
             return -EFAULT;
         }
-        sg_set_page(&sg[i], pages[i],
+        sg_set_page(sg_cur, pages[i],
             min_t(int, len, (PAGE_SIZE - buf_offset)), buf_offset);
         len -= (PAGE_SIZE - buf_offset);
         buf_offset = 0;
+        sg_prv = sg_cur;
     }
-    sg_mark_end(&sg[i - 1]);
-    *sg_list = sg;
+    sg_mark_end(sg_prv);
+
     return 0;
 }
 
@@ -415,7 +414,7 @@ static int pages_to_sg(struct page **pages, int num_pages, int buf_offset,
  * Sets up PRP'sfrom DMA'ed memory
  * Returns Error codes
  */
-static int setup_prps(struct nvme_device *nvme_dev, struct scatterlist *sg,
+static int setup_prps(struct nvme_device *nvme_dev,
     s32 buf_len, struct nvme_prps *prps, u8 cr_io_q,
     enum send_64b_bitmask prp_mask)
 {
@@ -426,7 +425,10 @@ static int setup_prps(struct nvme_device *nvme_dev, struct scatterlist *sg,
     u32 num_prps, num_pg, prp_page = 0;
     int index, err;
     struct dma_pool *prp_page_pool;
-
+    struct scatterlist *sg;
+
+    sg = prps->st.sgl;
+
     dma_addr = sg_dma_address(sg);
     dma_len = sg_dma_len(sg);
     offset = offset_in_page(dma_addr);
@@ -593,7 +595,8 @@ static void unmap_user_pg_to_dma(struct nvme_device *nvme_dev,
 {
     int i;
     struct page *pg;
-
+    struct scatterlist *sg_cur;
+
     if (!prps) {
         return;
     }
@@ -604,11 +607,11 @@ static void unmap_user_pg_to_dma(struct nvme_device *nvme_dev,
     }
 
     if (prps->type != NO_PRP) {
-        dma_unmap_sg(&nvme_dev->private_dev.pdev->dev, prps->sg,
+        dma_unmap_sg(&nvme_dev->private_dev.pdev->dev, prps->st.sgl,
             prps->num_map_pgs, prps->data_dir);
 
-        for (i = 0; i < prps->num_map_pgs; i++) {
-            pg = sg_page(&prps->sg[i]);
+        for (i = 0, sg_cur = prps->st.sgl; i < prps->num_map_pgs; i++, sg_cur = sg_next(sg_cur)) {
+            pg = sg_page(sg_cur);
 
             if ((prps->data_dir == DMA_FROM_DEVICE) ||
                 (prps->data_dir == DMA_BIDIRECTIONAL)) {
@@ -616,7 +619,7 @@ static void unmap_user_pg_to_dma(struct nvme_device *nvme_dev,
             }
             put_page(pg);
         }
-        kfree(prps->sg);
+        sg_free_table(&prps->st);
     }
 }
 
diff --git a/dnvme_ds.c b/dnvme_ds.c
index ff24fc4..10018a1 100755
--- a/dnvme_ds.c
+++ b/dnvme_ds.c
@@ -285,7 +285,7 @@ int driver_log(struct nvme_file *n_file)
         pmetrics_cq_list->private_cq.prp_persist.data_buf_size);
     vfs_write(file, work, strlen(work), &pos);
     snprintf(work, SIZE_OF_WORK, IDNT_L4"sq = 0X%llX",
-        (u64)pmetrics_cq_list->private_cq.prp_persist.sg);
+        (u64)pmetrics_cq_list->private_cq.prp_persist.st.sgl);
     vfs_write(file, work, strlen(work), &pos);
     snprintf(work, SIZE_OF_WORK, IDNT_L4"num_map_pgs = 0X%llX",
         (u64)pmetrics_cq_list->private_cq.prp_persist.num_map_pgs);
@@ -375,7 +375,7 @@ int driver_log(struct nvme_file *n_file)
         pmetrics_sq_list->private_sq.prp_persist.data_buf_size);
     vfs_write(file, work, strlen(work), &pos);
     snprintf(work, SIZE_OF_WORK, IDNT_L4"sg = 0X%llX",
-        (u64)pmetrics_sq_list->private_sq.prp_persist.sg);
+        (u64)pmetrics_sq_list->private_sq.prp_persist.st.sgl);
     vfs_write(file, work, strlen(work), &pos);
     snprintf(work, SIZE_OF_WORK, IDNT_L4"num_map_pgs = 0X%llX",
         (u64)pmetrics_sq_list->private_sq.prp_persist.num_map_pgs);
@@ -436,7 +436,7 @@ int driver_log(struct nvme_file *n_file)
         pcmd_track_list->prp_nonpersist.data_buf_size);
     vfs_write(file, work, strlen(work), &pos);
     snprintf(work, SIZE_OF_WORK, IDNT_L6"sg = 0X%llX",
-        (u64)pcmd_track_list->prp_nonpersist.sg);
+        (u64)pcmd_track_list->prp_nonpersist.st.sgl);
     vfs_write(file, work, strlen(work), &pos);
     snprintf(work, SIZE_OF_WORK, IDNT_L6"num_map_pgs = 0X%llX",
         (u64)pcmd_track_list->prp_nonpersist.num_map_pgs);
diff --git a/dnvme_ds.h b/dnvme_ds.h
index 149b114..aeaa0e8 100755
--- a/dnvme_ds.h
+++ b/dnvme_ds.h
@@ -48,8 +48,8 @@ struct nvme_prps {
     dma_addr_t first_dma;       /* First entry in PRP List */
     /* Size of data buffer for the specific command */
     u32 data_buf_size;
-    /* Pointer to SG list generated */
-    struct scatterlist *sg;
+    /* SG table; walk entries with sg_next(nvme_prps.st.sgl) */
+    struct sg_table st;
     /* Number of pages mapped to DMA area */
     u32 num_map_pgs;
     /* Address of data buffer for the specific command */
@@ -135,6 +135,7 @@ struct irq_track {
     u32 int_vec;        /* vec number; assigned by OS */
     u8 isr_fired;       /* flag to indicate if irq has fired */
     u32 isr_count;      /* total no. of times irq fired */
+    u32 outstanding_cmd_count;  /* count of outstanding cmds */
 };
 
 /*
diff --git a/dnvme_ioctls.c b/dnvme_ioctls.c
index 4fd45ad..6069e92 100755
--- a/dnvme_ioctls.c
+++ b/dnvme_ioctls.c
@@ -848,7 +848,7 @@ int driver_toxic_dword(struct metrics_device_list *pmetrics_device,
     LOG_DBG("After tgt_DW%d = 0x%08X", user_data->dword, *tgt_dword);
 
     dma_sync_sg_for_device(pmetrics_device->metrics_device->
-        private_dev.dmadev, pmetrics_sq->private_sq.prp_persist.sg,
+        private_dev.dmadev, pmetrics_sq->private_sq.prp_persist.st.sgl,
         pmetrics_sq->private_sq.prp_persist.num_map_pgs,
         pmetrics_sq->private_sq.prp_persist.data_dir);
 
@@ -883,6 +883,8 @@ int driver_send_64b(struct metrics_device_list *pmetrics_device,
     struct metrics_sq *p_cmd_sq;
     /* Particular CQ (within CMD) from linked list of Q's for device */
     struct metrics_cq *p_cmd_cq;
+    /* CQ associated with this SQ, from linked list of CQ's for device */
+    struct metrics_cq *pmetrics_cq;
     /* struct describing the meta buf */
     struct metrics_meta *meta_buf;
     /* Kernel space memory for passed in command */
@@ -1214,11 +1216,19 @@ int driver_send_64b(struct metrics_device_list *pmetrics_device,
             nvme_cmd_ker, cmd_buf_size);
 
         dma_sync_sg_for_device(pmetrics_device->metrics_device->
-            private_dev.dmadev, pmetrics_sq->private_sq.prp_persist.sg,
+            private_dev.dmadev, pmetrics_sq->private_sq.prp_persist.st.sgl,
             pmetrics_sq->private_sq.prp_persist.num_map_pgs,
             pmetrics_sq->private_sq.prp_persist.data_dir);
     }
+
+    pmetrics_cq = find_cq(pmetrics_device, pmetrics_sq->public_sq.cq_id);
+    if ((pmetrics_cq->public_cq.irq_enabled) && (pmetrics_device->
+        metrics_device->public_dev.irq_active.irq_type != INT_NONE)) {
+        incr_outstanding_cmd_count(pmetrics_device,
+            pmetrics_cq->public_cq.irq_no);
+    }
+
+
 
     /* Increment the Tail pointer and handle roll over conditions */
     pmetrics_sq->public_sq.tail_ptr_virt =
         (u16)(((u32)pmetrics_sq->public_sq.tail_ptr_virt + 1UL) %
diff --git a/dnvme_irq.c b/dnvme_irq.c
index 02cd640..9ce45f0 100755
--- a/dnvme_irq.c
+++ b/dnvme_irq.c
@@ -913,10 +913,15 @@ static void inc_isr_count(struct irq_processing *pirq_process,
     list_for_each_entry(pirq_node, &pirq_process->irq_track_list,
         irq_list_hd) {
         if (irq_no == pirq_node->irq_no) {
-            pirq_node->isr_fired = 1;
-            pirq_node->isr_count++;
-            LOG_DBG("BH:isr count = %d for irq no = %d",
-                pirq_node->isr_count, pirq_node->irq_no);
+            if (pirq_node->outstanding_cmd_count > 0) {
+                pirq_node->isr_fired = 1;
+                pirq_node->isr_count++;
+                LOG_DBG("BH:isr count = %d for irq no = %d",
+                    pirq_node->isr_count, pirq_node->irq_no);
+            } else {
+                /* unmask ints */
+                unmask_interrupts(irq_no, pirq_process);
+            }
         } /* end of if ivec */
     } /* end of list for irq */
 }
@@ -996,6 +1001,7 @@ static int add_irq_node(struct metrics_device_list *pmetrics_device_elem,
     irq_trk_node->irq_no = irq_no;  /* irq number assigned */
     irq_trk_node->isr_fired = 0;
     irq_trk_node->isr_count = 0;
+    irq_trk_node->outstanding_cmd_count = 0;
 
     /* Init irq cq linked list */
     INIT_LIST_HEAD(&irq_trk_node->irq_cq_track);
@@ -1374,3 +1380,35 @@ int reset_isr_flag(struct metrics_device_list *pmetrics_device,
     return 0;
 }
 
+int incr_outstanding_cmd_count(struct metrics_device_list *pmetrics_device,
+    u16 irq_no) {
+    struct irq_track *pirq_node;
+    pirq_node = find_irq_node(pmetrics_device, irq_no);
+    if (pirq_node == NULL) {
+        LOG_ERR("Irq node not found for irq_no: %d", irq_no);
+        return -EINVAL;
+    }
+    LOG_DBG("incr outstanding, before: %d irq_no=%d", pirq_node->outstanding_cmd_count, irq_no);
+    pirq_node->outstanding_cmd_count++;
+    return 0;
+}
+
+int sub_outstanding_cmd_count(struct metrics_device_list *pmetrics_device,
+    u16 irq_no, u32 reaped_count) {
+    struct irq_track *pirq_node;
+    pirq_node = find_irq_node(pmetrics_device, irq_no);
+    if (pirq_node == NULL) {
+        LOG_ERR("Irq node not found for irq_no: %d", irq_no);
+        return -EINVAL;
+    }
+    if (pirq_node->outstanding_cmd_count < reaped_count) {
+        LOG_ERR("outstanding_cmd_count: %d < reaped_count: %d", pirq_node->outstanding_cmd_count, reaped_count);
+        pirq_node->outstanding_cmd_count = 0;
+    } else {
+        pirq_node->outstanding_cmd_count -= reaped_count;
+    }
+
+    LOG_DBG("sub outstanding, after: %d irq_no=%d reaped= %d", pirq_node->outstanding_cmd_count, irq_no, reaped_count);
+    return 0;
+}
+
diff --git a/dnvme_irq.h b/dnvme_irq.h
index 6eab817..be91da1 100755
--- a/dnvme_irq.h
+++ b/dnvme_irq.h
@@ -170,4 +170,13 @@ int reap_inquiry_isr(struct metrics_cq *pmetrics_cq_node,
 int reset_isr_flag(struct metrics_device_list *pmetrics_device,
     u16 irq_no);
 
+/* Increment the outstanding cmd count for the given irq_no
+ */
+int incr_outstanding_cmd_count(struct metrics_device_list *pmetrics_device,
+    u16 irq_no);
+
+int sub_outstanding_cmd_count(struct metrics_device_list *pmetrics_device,
+    u16 irq_no, u32 reaped_count);
+
+
 #endif
diff --git a/dnvme_queue.c b/dnvme_queue.c
index 11dd2bb..8e8bff2 100755
--- a/dnvme_queue.c
+++ b/dnvme_queue.c
@@ -741,7 +741,7 @@ u32 reap_inquiry(struct metrics_cq *pmetrics_cq_node, struct device *dev)
             (comp_entry_size * (u32)pmetrics_cq_node->public_cq.head_ptr);
     } else {
         /* do sync and update when pointer to discontig Q is reaped inq */
-        dma_sync_sg_for_cpu(dev, pmetrics_cq_node->private_cq.prp_persist.sg,
+        dma_sync_sg_for_cpu(dev, pmetrics_cq_node->private_cq.prp_persist.st.sgl,
            pmetrics_cq_node->private_cq.prp_persist.num_map_pgs,
            pmetrics_cq_node->private_cq.prp_persist.data_dir);
         queue_base_addr =
@@ -1431,6 +1431,13 @@ int driver_reap_cq(struct metrics_device_list *pmetrics_device,
     writel(pmetrics_cq_node->public_cq.head_ptr, pmetrics_cq_node->
         private_cq.dbs);
 
+    if ((pmetrics_cq_node->public_cq.irq_enabled == 1) &&
+        (pmetrics_device->metrics_device->public_dev.irq_active.irq_type != INT_NONE)) {
+        sub_outstanding_cmd_count(pmetrics_device,
+            pmetrics_cq_node->public_cq.irq_no, user_data->num_reaped);
+    }
+
+
     /* if 0 CE in a given cq, then reset the isr flag. */
     if ((pmetrics_cq_node->public_cq.irq_enabled == 1) &&
         (user_data->num_remaining == 0) &&
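
---

For reference only (not part of the patch above): a minimal, standalone sketch of the
sg_table pattern that pages_to_sg() switches to. The helper name
example_pages_to_sg_table() and its exact parameters are hypothetical; it assumes an
array of already-pinned pages, num_pages > 0, and GFP_KERNEL context. The point of the
technique is that sg_alloc_table() builds the table out of page-sized chunks chained
together, so no single large contiguous allocation is needed even when memory is
fragmented.

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>

/* Hypothetical helper: build an sg_table over num_pages pinned pages,
 * starting at buf_offset into the first page and covering len bytes. */
static int example_pages_to_sg_table(struct sg_table *sg_tbl,
    struct page **pages, int num_pages, unsigned int buf_offset,
    unsigned int len)
{
    struct scatterlist *sg_cur;
    struct scatterlist *sg_prv = NULL;
    int i;
    int rc;

    /* Allocates the table in chained, page-sized chunks; avoids one big
     * contiguous kmalloc of num_pages * sizeof(struct scatterlist). */
    rc = sg_alloc_table(sg_tbl, num_pages, GFP_KERNEL);
    if (rc)
        return rc;

    for (i = 0, sg_cur = sg_tbl->sgl; i < num_pages;
        i++, sg_cur = sg_next(sg_cur)) {
        if (pages[i] == NULL) {
            sg_free_table(sg_tbl);
            return -EFAULT;
        }
        sg_set_page(sg_cur, pages[i],
            min_t(unsigned int, len, PAGE_SIZE - buf_offset), buf_offset);
        len -= (PAGE_SIZE - buf_offset);
        buf_offset = 0;   /* only the first page may start mid-page */
        sg_prv = sg_cur;
    }
    sg_mark_end(sg_prv);  /* terminate the list at the last entry used */

    return 0;
}

DMA mapping and teardown then operate on the same table: dma_map_sg(dev, sg_tbl->sgl,
nents, dir) before use, dma_unmap_sg() afterwards, and sg_free_table() to release the
chained entries, which mirrors what map_user_pg_to_dma() and unmap_user_pg_to_dma() do
in the patch.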