|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
On 4/1/26 11:21, Jan Beulich wrote:
> On 01.04.2026 09:59, Mykyta Poturai wrote:
>> On 3/31/26 17:55, Jan Beulich wrote:
>>> On 09.03.2026 12:08, Mykyta Poturai wrote:
>>>> --- a/xen/drivers/vpci/header.c
>>>> +++ b/xen/drivers/vpci/header.c
>>>> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev
>>>> *pdev, uint16_t cmd,
>>>>
>>>> bool vpci_process_pending(struct vcpu *v)
>>>> {
>>>> - const struct pci_dev *pdev = v->vpci.pdev;
>>>> - struct vpci_header *header = NULL;
>>>> - unsigned int i;
>>>> -
>>>> - if ( !pdev )
>>>> - return false;
>>>> -
>>>> - read_lock(&v->domain->pci_lock);
>>>> -
>>>> - if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>> + switch ( v->vpci.task )
>>>> {
>>>> - v->vpci.pdev = NULL;
>>>> - read_unlock(&v->domain->pci_lock);
>>>> - return false;
>>>> - }
>>>> -
>>>> - header = &pdev->vpci->header;
>>>> - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>> + case MODIFY_MEMORY:
>>>> {
>>>> - struct vpci_bar *bar = &header->bars[i];
>>>> - struct rangeset *mem = v->vpci.bar_mem[i];
>>>> - struct map_data data = {
>>>> - .d = v->domain,
>>>> - .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>>> - .bar = bar,
>>>> - };
>>>> - int rc;
>>>> + const struct pci_dev *pdev = v->vpci.memory.pdev;
>>>> + struct vpci_header *header = NULL;
>>>> + unsigned int i;
>>>>
>>>> - if ( rangeset_is_empty(mem) )
>>>> - continue;
>>>> + if ( !pdev )
>>>> + break;
>>>>
>>>> - rc = rangeset_consume_ranges(mem, map_range, &data);
>>>> + read_lock(&v->domain->pci_lock);
>>>>
>>>> - if ( rc == -ERESTART )
>>>> + if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>> {
>>>> + v->vpci.memory.pdev = NULL;
>>>> read_unlock(&v->domain->pci_lock);
>>>> - return true;
>>>> + break;
>>>> }
>>>>
>>>> - if ( rc )
>>>> + header = &pdev->vpci->header;
>>>> + for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>> {
>>>> - spin_lock(&pdev->vpci->lock);
>>>> - /* Disable memory decoding unconditionally on failure. */
>>>> - modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
>>>> - false);
>>>> - spin_unlock(&pdev->vpci->lock);
>>>> + struct vpci_bar *bar = &header->bars[i];
>>>> + struct rangeset *mem = v->vpci.bar_mem[i];
>>>> + struct map_data data = {
>>>> + .d = v->domain,
>>>> + .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
>>>> + .bar = bar,
>>>> + };
>>>> + int rc;
>>>> +
>>>> + if ( rangeset_is_empty(mem) )
>>>> + continue;
>>>>
>>>> - /* Clean all the rangesets */
>>>> - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>> - if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>> - rangeset_purge(v->vpci.bar_mem[i]);
>>>> + rc = rangeset_consume_ranges(mem, map_range, &data);
>>>>
>>>> - v->vpci.pdev = NULL;
>>>> + if ( rc == -ERESTART )
>>>> + {
>>>> + read_unlock(&v->domain->pci_lock);
>>>> + return true;
>>>> + }
>>>>
>>>> - read_unlock(&v->domain->pci_lock);
>>>> + if ( rc )
>>>> + {
>>>> + spin_lock(&pdev->vpci->lock);
>>>> + /* Disable memory decoding unconditionally on failure. */
>>>> + modify_decoding(pdev, v->vpci.memory.cmd &
>>>> ~PCI_COMMAND_MEMORY,
>>>> + false);
>>>> + spin_unlock(&pdev->vpci->lock);
>>>> +
>>>> + /* Clean all the rangesets */
>>>> + for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>> + if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>> + rangeset_purge(v->vpci.bar_mem[i]);
>>>> +
>>>> + v->vpci.memory.pdev = NULL;
>>>> +
>>>> + read_unlock(&v->domain->pci_lock);
>>>>
>>>> - if ( !is_hardware_domain(v->domain) )
>>>> - domain_crash(v->domain);
>>>> + if ( !is_hardware_domain(v->domain) )
>>>> + domain_crash(v->domain);
>>>>
>>>> - return false;
>>>> + break;
>>>> + }
>>>> }
>>>> - }
>>>> - v->vpci.pdev = NULL;
>>>> + v->vpci.memory.pdev = NULL;
>>>>
>>>> - spin_lock(&pdev->vpci->lock);
>>>> - modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
>>>> - spin_unlock(&pdev->vpci->lock);
>>>> + spin_lock(&pdev->vpci->lock);
>>>> + modify_decoding(pdev, v->vpci.memory.cmd,
>>>> v->vpci.memory.rom_only);
>>>> + spin_unlock(&pdev->vpci->lock);
>>>>
>>>> - read_unlock(&v->domain->pci_lock);
>>>> + read_unlock(&v->domain->pci_lock);
>>>> +
>>>> + break;
>>>> + }
>>>> + case WAIT:
>>>> + if ( NOW() < v->vpci.wait.end )
>>>> + return true;
>>>> + v->vpci.wait.callback(v->vpci.wait.data);
>>>> + break;
>>>
>>> As just indicated in reply to patch 6, busy waiting isn't really acceptable.
>>> This is even more so when the waiting exceeds the typical length of a
>>> scheduling timeslice.
>>>
>>> In that other reply I said to put the vCPU to sleep, but you need to be
>>> careful
>>> there too: The domain may not expect its vCPU to not make any progress for
>>> such
>>> an extended period of time. This may need doing entirely differently: Once
>>> the
>>> command register was written, you may want to record the time after which
>>> accesses to the VF registers are permitted. Earlier accesses would simply be
>>> terminated. You may still additionally need a timer, in order to kick off
>>> BAR
>>> mapping after that time. (Yet better would be if the BAR mapping could be
>>> done during those 100ms. After all that may be a reason why this long a
>>> delay
>>> is specified: Firmware on the device may also require some time to set up
>>> the
>>> BARs accordingly.)
>>
>> I am not sure it would work that way. If we look at how Linux
>> initializes SR-IOV, it writes the VFE and MSE bits, waits 100ms and then
>> expects VFs to be operational. If they are not operational at that
>> moment, then it considers the operation failed and removes all VFs. If
>> we also wait 100ms before enabling access, the probability of a guest
>> trying to access something before we allow it would be very high.
>
> Well, not really. Our counting of the 100ms necessarily starts before Dom0's.
> Furthermore it may be acceptable (or even appropriate) to stall premature
> accesses (because they shouldn't occur in the first place), by blocking the
> vCPU at that point. A middle route may be possible: Terminate accesses in,
> say, the first 90ms, and stall the vCPU for any access past that, but before
> the 100ms expired.
>
Is there any real benefit to doing all this work instead of just waiting
for Dom0 to register the VFs? Implementing it the way you described
would require a relatively complex state machine and two timers per
SR-IOV-capable device. It would also probably require some hacks to
handle partially initialized VFs in Xen. This adds a lot of work and
many possible bugs for not a lot of benefit in my opinion.
--
Mykyta
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |