[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH V3 13/13] HV/Storvsc: Add Isolation VM support for storvsc driver

To: Michael Kelley <mikelley@xxxxxxxxxxxxx>, KY Srinivasan <kys@xxxxxxxxxxxxx>, Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>, Stephen Hemminger <sthemmin@xxxxxxxxxxxxx>, "wei.liu@xxxxxxxxxx" <wei.liu@xxxxxxxxxx>, Dexuan Cui <decui@xxxxxxxxxxxxx>, "tglx@xxxxxxxxxxxxx" <tglx@xxxxxxxxxxxxx>, "mingo@xxxxxxxxxx" <mingo@xxxxxxxxxx>, "bp@xxxxxxxxx" <bp@xxxxxxxxx>, "x86@xxxxxxxxxx" <x86@xxxxxxxxxx>, "hpa@xxxxxxxxx" <hpa@xxxxxxxxx>, "dave.hansen@xxxxxxxxxxxxxxx" <dave.hansen@xxxxxxxxxxxxxxx>, "luto@xxxxxxxxxx" <luto@xxxxxxxxxx>, "peterz@xxxxxxxxxxxxx" <peterz@xxxxxxxxxxxxx>, "konrad.wilk@xxxxxxxxxx" <konrad.wilk@xxxxxxxxxx>, "boris.ostrovsky@xxxxxxxxxx" <boris.ostrovsky@xxxxxxxxxx>, "jgross@xxxxxxxx" <jgross@xxxxxxxx>, "sstabellini@xxxxxxxxxx" <sstabellini@xxxxxxxxxx>, "joro@xxxxxxxxxx" <joro@xxxxxxxxxx>, "will@xxxxxxxxxx" <will@xxxxxxxxxx>, "davem@xxxxxxxxxxxxx" <davem@xxxxxxxxxxxxx>, "kuba@xxxxxxxxxx" <kuba@xxxxxxxxxx>, "jejb@xxxxxxxxxxxxx" <jejb@xxxxxxxxxxxxx>, "martin.petersen@xxxxxxxxxx" <martin.petersen@xxxxxxxxxx>, "arnd@xxxxxxxx" <arnd@xxxxxxxx>, "hch@xxxxxx" <hch@xxxxxx>, "m.szyprowski@xxxxxxxxxxx" <m.szyprowski@xxxxxxxxxxx>, "robin.murphy@xxxxxxx" <robin.murphy@xxxxxxx>, "thomas.lendacky@xxxxxxx" <thomas.lendacky@xxxxxxx>, "brijesh.singh@xxxxxxx" <brijesh.singh@xxxxxxx>, "ardb@xxxxxxxxxx" <ardb@xxxxxxxxxx>, Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx>, "pgonda@xxxxxxxxxx" <pgonda@xxxxxxxxxx>, "martin.b.radev@xxxxxxxxx" <martin.b.radev@xxxxxxxxx>, "akpm@xxxxxxxxxxxxxxxxxxxx" <akpm@xxxxxxxxxxxxxxxxxxxx>, "kirill.shutemov@xxxxxxxxxxxxxxx" <kirill.shutemov@xxxxxxxxxxxxxxx>, "rppt@xxxxxxxxxx" <rppt@xxxxxxxxxx>, "sfr@xxxxxxxxxxxxxxxx" <sfr@xxxxxxxxxxxxxxxx>, "saravanand@xxxxxx" <saravanand@xxxxxx>, "krish.sadhukhan@xxxxxxxxxx" <krish.sadhukhan@xxxxxxxxxx>, "aneesh.kumar@xxxxxxxxxxxxx" <aneesh.kumar@xxxxxxxxxxxxx>, "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>, "rientjes@xxxxxxxxxx" <rientjes@xxxxxxxxxx>, "hannes@xxxxxxxxxxx" <hannes@xxxxxxxxxxx>, "tj@xxxxxxxxxx" 
<tj@xxxxxxxxxx>
From: Tianyu Lan <ltykernel@xxxxxxxxx>
Date: Fri, 20 Aug 2021 23:20:21 +0800
Cc: "iommu@xxxxxxxxxxxxxxxxxxxxxxxxxx" <iommu@xxxxxxxxxxxxxxxxxxxxxxxxxx>, "linux-arch@xxxxxxxxxxxxxxx" <linux-arch@xxxxxxxxxxxxxxx>, "linux-hyperv@xxxxxxxxxxxxxxx" <linux-hyperv@xxxxxxxxxxxxxxx>, "linux-kernel@xxxxxxxxxxxxxxx" <linux-kernel@xxxxxxxxxxxxxxx>, "linux-scsi@xxxxxxxxxxxxxxx" <linux-scsi@xxxxxxxxxxxxxxx>, "netdev@xxxxxxxxxxxxxxx" <netdev@xxxxxxxxxxxxxxx>, vkuznets <vkuznets@xxxxxxxxxx>, "parri.andrea@xxxxxxxxx" <parri.andrea@xxxxxxxxx>, "dave.hansen@xxxxxxxxx" <dave.hansen@xxxxxxxxx>
Delivery-date: Fri, 20 Aug 2021 15:20:49 +0000
List-id: Xen developer discussion <xen-devel.lists.xenproject.org>



On 8/20/2021 2:17 AM, Michael Kelley wrote:

From: Tianyu Lan <ltykernel@xxxxxxxxx> Sent: Monday, August 9, 2021 10:56 AM


Subject line tag should be "scsi: storvsc:"

In Isolation VM, all shared memory with host needs to mark visible
to host via hvcall. vmbus_establish_gpadl() has already done it for
storvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
mpb_desc() still need to handle. Use DMA API to map/umap these


s/need to handle/needs to be handled/

memory during sending/receiving packet and Hyper-V DMA ops callback
will use swiotlb function to allocate bounce buffer and copy data
from/to bounce buffer.

Signed-off-by: Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx>
---
  drivers/scsi/storvsc_drv.c | 68 +++++++++++++++++++++++++++++++++++---
  1 file changed, 63 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 328bb961c281..78320719bdd8 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -21,6 +21,8 @@
  #include <linux/device.h>
  #include <linux/hyperv.h>
  #include <linux/blkdev.h>
+#include <linux/io.h>
+#include <linux/dma-mapping.h>
  #include <scsi/scsi.h>
  #include <scsi/scsi_cmnd.h>
  #include <scsi/scsi_host.h>
@@ -427,6 +429,8 @@ struct storvsc_cmd_request {
        u32 payload_sz;

        struct vstor_packet vstor_packet;
+       u32 hvpg_count;


This count is really the number of entries in the dma_range
array, right?  If so, perhaps "dma_range_count" would be
a better name so that it is more tightly associated.


Yes, will update.

+       struct hv_dma_range *dma_range;
  };


@@ -509,6 +513,14 @@ struct storvsc_scan_work {
        u8 tgt_id;
  };

+#define storvsc_dma_map(dev, page, offset, size, dir) \
+       dma_map_page(dev, page, offset, size, dir)
+
+#define storvsc_dma_unmap(dev, dma_range, dir)         \
+               dma_unmap_page(dev, dma_range.dma,      \
+                              dma_range.mapping_size,  \
+                              dir ? DMA_FROM_DEVICE : DMA_TO_DEVICE)
+


Each of these macros is used only once.  IMHO, they don't
add a lot of value.  Just coding dma_map/unmap_page()
inline would be fine and eliminate these lines of code.


OK. Will update.

  static void storvsc_device_scan(struct work_struct *work)
  {
        struct storvsc_scan_work *wrk;
@@ -1260,6 +1272,7 @@ static void storvsc_on_channel_callback(void *context)
        struct hv_device *device;
        struct storvsc_device *stor_device;
        struct Scsi_Host *shost;
+       int i;

        if (channel->primary_channel != NULL)
                device = channel->primary_channel->device_obj;
@@ -1314,6 +1327,15 @@ static void storvsc_on_channel_callback(void *context)
                                request = (struct storvsc_cmd_request 
*)scsi_cmd_priv(scmnd);
                        }

+                       if (request->dma_range) {
+                               for (i = 0; i < request->hvpg_count; i++)
+                                       storvsc_dma_unmap(&device->device,
+                                               request->dma_range[i],
+                                               
request->vstor_packet.vm_srb.data_in == READ_TYPE);


I think you can directly get the DMA direction as 
request->cmd->sc_data_direction.

+
+                               kfree(request->dma_range);
+                       }
+
                        storvsc_on_receive(stor_device, packet, request);
                        continue;
                }
@@ -1810,7 +1832,9 @@ static int storvsc_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scmnd)
                unsigned int hvpgoff, hvpfns_to_add;
                unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset);
                unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length);
+               dma_addr_t dma;
                u64 hvpfn;
+               u32 size;

                if (hvpg_count > MAX_PAGE_BUFFER_COUNT) {

@@ -1824,6 +1848,13 @@ static int storvsc_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scmnd)
                payload->range.len = length;
                payload->range.offset = offset_in_hvpg;

+               cmd_request->dma_range = kcalloc(hvpg_count,
+                                sizeof(*cmd_request->dma_range),
+                                GFP_ATOMIC);


With this patch, it appears that storvsc_queuecommand() is always
doing bounce buffering, even when running in a non-isolated VM.


In a non-isolated VM, SWIOTLB_FORCE mode isn't enabled, so the
swiotlb bounce buffer will not be used.

The dma_range is always allocated, and the inner loop below does
the dma mapping for every I/O page.  The corresponding code in
storvsc_on_channel_callback() that does the dma unmap allows for
the dma_range to be NULL, but that never happens.


Yes, the dma mapping function will return the PA directly in a non-isolated VM.

+               if (!cmd_request->dma_range) {
+                       ret = -ENOMEM;


The other memory allocation failure in this function returns
SCSI_MLQUEUE_DEVICE_BUSY.   It may be debatable as to whether
that's the best approach, but that's a topic for a different patch.  I
would suggest being consistent and using the same return code
here.


OK. I will return SCSI_MLQUEUE_DEVICE_BUSY here as well.

+                       goto free_payload;
+               }

                for (i = 0; sgl != NULL; sgl = sg_next(sgl)) {
                        /*
@@ -1847,9 +1878,29 @@ static int storvsc_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scmnd)
                         * last sgl should be reached at the same time that
                         * the PFN array is filled.
                         */
-                       while (hvpfns_to_add--)
-                               payload->range.pfn_array[i++] =      hvpfn++;
+                       while (hvpfns_to_add--) {
+                               size = min(HV_HYP_PAGE_SIZE - offset_in_hvpg,
+                                          (unsigned long)length);
+                               dma = storvsc_dma_map(&dev->device, 
pfn_to_page(hvpfn++),
+                                                     offset_in_hvpg, size,
+                                                     scmnd->sc_data_direction);
+                               if (dma_mapping_error(&dev->device, dma)) {
+                                       ret = -ENOMEM;


The typical error from dma_map_page() will be running out of
bounce buffer memory.   This is a transient condition that should be
retried at the higher levels.  So make sure to return an error code
that indicates the I/O should be resubmitted.


OK. It looks like the error code should be SCSI_MLQUEUE_DEVICE_BUSY here.

+                                       goto free_dma_range;
+                               }
+
+                               if (offset_in_hvpg) {
+                                       payload->range.offset = dma & 
~HV_HYP_PAGE_MASK;
+                                       offset_in_hvpg = 0;
+                               }


I'm not clear on why payload->range.offset needs to be set again.
Even after the dma mapping is done, doesn't the offset in the first
page have to be the same?  If it wasn't the same, Hyper-V wouldn't
be able to process the PFN list correctly.  In fact, couldn't the above
code just always set offset_in_hvpg = 0?

The offset will be changed. The swiotlb bounce buffer is allocated with IO_TLB_SIZE (2K) as the unit, so the offset here may be changed.

+
+                               cmd_request->dma_range[i].dma = dma;
+                               cmd_request->dma_range[i].mapping_size = size;
+                               payload->range.pfn_array[i++] = dma >> 
HV_HYP_PAGE_SHIFT;
+                               length -= size;
+                       }
                }
+               cmd_request->hvpg_count = hvpg_count;


This line just saves the size of the dma_range array.  Could
it be moved up with the code that allocates the dma_range
array?  To me, it would make more sense to have all that
code together in one place.


Sure. Will update.


The whole approach here is to do dma remapping on each individual page
of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
each scatterlist entry as a unit?  Each scatterlist entry describes a range of
physically contiguous memory.  After dma_map_sg(), the resulting dma
address must also refer to a physically contiguous range in the swiotlb
bounce buffer memory.   So at the top of the "for" loop over the scatterlist
entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
hvpfn value based on the dma address instead of sg_page().  But everything
else is the same, and the inner loop for populating the pfn_array is unmodified.
Furthermore, the dma_range array that you've added is not needed, since
scatterlist entries already have a dma_address field for saving the mapped
address, and dma_unmap_sg() uses that field.

I don't use dma_map_sg() here in order to avoid introducing one more loop (e.g. dma_map_sg()). We already have a loop to populate cmd_request->dma_range[] and so do the dma map in the same loop.


One thing:  There's a maximum swiotlb mapping size, which I think works
out to be 256 Kbytes.  See swiotlb_max_mapping_size().  We need to make
sure that we don't get a scatterlist entry bigger than this size.  But I think
this already happens because you set the device->dma_mask field in
Patch 11 of this series.  __scsi_init_queue checks for this setting and
sets max_sectors to limit transfers to the max mapping size.


I will double check.


        cmd_request->payload = payload;
@@ -1860,13 +1911,20 @@ static int storvsc_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scmnd)
        put_cpu();

        if (ret == -EAGAIN) {
-               if (payload_sz > sizeof(cmd_request->mpb))
-                       kfree(payload);
                /* no more space */
-               return SCSI_MLQUEUE_DEVICE_BUSY;
+               ret = SCSI_MLQUEUE_DEVICE_BUSY;
+               goto free_dma_range;
        }

        return 0;
+
+free_dma_range:
+       kfree(cmd_request->dma_range);
+
+free_payload:
+       if (payload_sz > sizeof(cmd_request->mpb))
+               kfree(payload);
+       return ret;
  }

  static struct scsi_host_template scsi_driver = {
--
2.25.1

Follow-Ups:
- RE: [PATCH V3 13/13] HV/Storvsc: Add Isolation VM support for storvsc driver
  - From: Michael Kelley
- Re: [PATCH V3 13/13] HV/Storvsc: Add Isolation VM support for storvsc driver
  - From: Tianyu Lan

References:
- [PATCH V3 00/13] x86/Hyper-V: Add Hyper-V Isolation VM support
  - From: Tianyu Lan
- [PATCH V3 13/13] HV/Storvsc: Add Isolation VM support for storvsc driver
  - From: Tianyu Lan
- RE: [PATCH V3 13/13] HV/Storvsc: Add Isolation VM support for storvsc driver
  - From: Michael Kelley

Prev by Date: Re: [XEN RFC PATCH 07/40] xen/arm: use !CONFIG_NUMA to keep fake NUMA API
Next by Date: [xen-unstable-smoke test] 164271: tolerable all pass - PUSHED
Previous by thread: Re: [PATCH V3 13/13] HV/Storvsc: Add Isolation VM support for storvsc driver
Next by thread: Re: [PATCH V3 13/13] HV/Storvsc: Add Isolation VM support for storvsc driver
Index(es):
- Date
- Thread

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.