[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [XENVIF PATCH RESEND] Implement faster checksum for x64


  • To: Tu Dinh <ngoc-tu.dinh@xxxxxxxxxx>, "win-pv-devel@xxxxxxxxxxxxxxxxxxxx" <win-pv-devel@xxxxxxxxxxxxxxxxxxxx>
  • From: Owen Smith <owen.smith@xxxxxxxxxx>
  • Date: Tue, 14 Oct 2025 11:03:05 +0000
  • Accept-language: en-GB, en-US
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=citrix.com; dmarc=pass action=none header.from=citrix.com; dkim=pass header.d=citrix.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector10001; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=7AnvW6qL7K8xjzx7AJY9yKWG4CW62jgomQCSP6TPOPA=; b=PJ5LluZav4QsC7El46IfK4mcYMLTo8gPrqzgCVqxHpcN5WZ1wmP7QlpC2Vx4as5Q8U8Rp/xwYUwkvDiVClVffIJnxF2Yct8qNJvuJli5SMAAC/hpkToDgi87qFWolh26KYkAkW1p1E0rBxT1tl1vjCjaS2W2Uc6deKaut4x/xe0SlOPDXWSiJ/4tfpH9m4Bj5YJA4qyCalQyxjpH8fsbK6VltQ2qepcNEGtTWUbdnP4kg8HhIPu7pmHmN859sfatb1BMCcKgbZ8qVx6tUhF0J1AmrCjVW+W/luoAgjJm0Ip4JGMKlymjq5Gt66Vt7sno7DSVOYUnYfCiqQFb0UIRnA==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector10001; d=microsoft.com; cv=none; b=sWftIYcH9mAiAjFXIod3qSw+bS5Nld1zEhVYectr5d/RpqQ7TVP3QeqrDAkqxVXZAi1R2PnddJi2ALyAOYLks8X9vp4b4zWvc+vSQ2XbLZLpVE1DbinBJmNg5PyTEwPeOgCzkqQy/57ckKzM2ANuKOBFatIINJyhRMT25hmvyfbpKd6G1ETsAgqXctZV5V+KQTmFVRFHRqFbEZMFmkTuXdoeP9SLlz04b4MxqSfF3So4XTGGkTAbByVdv1UEWwxuOswMVFgpUw0jF2j6qSDv1GsvlL0dM8TJprnnnC2jEKVAsZ6mWgYjZpaV6kkn7HjXOUTMrybwQrArXGrcgVdsCA==
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=citrix.com;
  • Cc: Owen Smith <owen.smith@xxxxxxxxx>
  • Delivery-date: Tue, 14 Oct 2025 11:03:21 +0000
  • List-id: Developer list for the Windows PV Drivers subproject <win-pv-devel.lists.xenproject.org>
  • Msip_labels:
  • Thread-index: AQHcOcWgrxgEvuusrkOggt0FireYCbTBf94A
  • Thread-topic: [XENVIF PATCH RESEND] Implement faster checksum for x64

Reviewed-by: Owen Smith <owen.smith@xxxxxxxxx>

________________________________________
From: Tu Dinh <ngoc-tu.dinh@xxxxxxxxxx>
Sent: 10 October 2025 10:09 AM
To: win-pv-devel@xxxxxxxxxxxxxxxxxxxx
Cc: Tu Dinh; Owen Smith
Subject: [XENVIF PATCH RESEND] Implement faster checksum for x64

Implement the Internet checksum using parallel summation based on
add-with-carry per RFC 1071, following the algorithm described here:

https://blogs.igalia.com/dpino/2018/06/14/fast-checksum-computation/

This implementation is roughly 3.4x to 8.8x faster than the previous C
implementation, depending on buffer size (time per call, by byte count):

    bytes       C               assembly
    40          9.53867 ns      2.77659 ns
    128         31.0441 ns      4.43259 ns
    576         125.706 ns      14.332 ns
    1500        320.611 ns      38.434 ns
    2048        436.591 ns      53.45 ns
    4096        869.137 ns      114.892 ns
    8192        1.73352 us      239.596 ns
    16384       3.46273 us      482.943 ns
    32768       6.92263 us      974.495 ns
    65535       13.8462 us      1.9464 us

At the same time, move the entire checksum code into a single function
instead of having a static inline implementation and a wrapper.

Signed-off-by: Tu Dinh Ngoc <dinhngoc.tu@xxxxxxx>
Signed-off-by: Tu Dinh <ngoc-tu.dinh@xxxxxxxxxx>
---
Resending a 4th time (!) due to previous email being lost.
---
 src/xenvif/amd64/checksum_amd64.asm | 120 ++++++++++++++++++++++++++++
 src/xenvif/checksum.c               |  37 ++++-----
 vs2019/xenvif/xenvif.vcxproj        |   5 ++
 vs2022/xenvif/xenvif.vcxproj        |   5 ++
 4 files changed, 145 insertions(+), 22 deletions(-)
 create mode 100644 src/xenvif/amd64/checksum_amd64.asm

diff --git a/src/xenvif/amd64/checksum_amd64.asm 
b/src/xenvif/amd64/checksum_amd64.asm
new file mode 100644
index 0000000..8fbc241
--- /dev/null
+++ b/src/xenvif/amd64/checksum_amd64.asm
@@ -0,0 +1,120 @@
+; SPDX-License-Identifier: MIT
+
+.code
+
+;   VOID
+;   AccumulateChecksum(
+;       IN OUT  PULONG  Accumulator,    ; in: running sum; out: sum folded to 16 bits
+;       IN      PUCHAR  BaseVa,         ; buffer to sum, read as native 16-bit words
+;       IN      ULONG   ByteCount       ; buffer length in bytes; may be 0 or odd
+;       )
+public AccumulateChecksum
+AccumulateChecksum  proc
+    ; rcx:  Accumulator
+    ; rdx:  BaseVa (advanced past each chunk as it is summed)
+    ; r8d:  ByteCount remaining; a ULONG, so bits 63:32 of r8 must not be trusted
+    ; rax:  64-bit temporary accumulator (end-around carry after every chunk)
+    ; r9:   scratch
+    ; Leaf routine: no stack use; modifies only rax, rdx, r8, r9 and flags.
+    mov eax, [rcx]                  ; rax = *Accumulator, zero-extended
+
+l64:
+    cmp r8d, 64                     ; eight qwords
+    jb l32
+
+    add rax, [rdx]
+    adc rax, [rdx + 8]
+    adc rax, [rdx + 16]
+    adc rax, [rdx + 24]
+    adc rax, [rdx + 32]
+    adc rax, [rdx + 40]
+    adc rax, [rdx + 48]
+    adc rax, [rdx + 56]
+    adc rax, 0                      ; end-around carry
+
+    sub r8d, 64
+    add rdx, 64
+    jmp l64                         ; only this step loops; the rest run at most once
+
+l32:
+    cmp r8d, 32                     ; four qwords
+    jb l16
+
+    add rax, [rdx]
+    adc rax, [rdx + 8]
+    adc rax, [rdx + 16]
+    adc rax, [rdx + 24]
+    adc rax, 0
+
+    sub r8d, 32
+    add rdx, 32
+
+l16:
+    cmp r8d, 16                     ; two qwords
+    jb l8
+
+    add rax, [rdx]
+    adc rax, [rdx + 8]
+    adc rax, 0
+
+    sub r8d, 16
+    add rdx, 16
+
+l8:
+    cmp r8d, 8                      ; one qword
+    jb l4
+
+    add rax, [rdx]
+    adc rax, 0
+
+    sub r8d, 8
+    add rdx, 8
+
+l4:
+    cmp r8d, 4                      ; one dword
+    jb l2
+
+    mov r9d, dword ptr [rdx]        ; zero-extended so the 64-bit add is exact
+    add rax, r9
+    adc rax, 0
+
+    sub r8d, 4
+    add rdx, 4
+
+l2:
+    cmp r8d, 2                      ; one word
+    jb l1
+
+    movzx r9d, word ptr [rdx]
+    add rax, r9
+    adc rax, 0
+
+    sub r8d, 2
+    add rdx, 2
+
+l1:
+    cmp r8d, 1                      ; last byte
+    jb l0
+
+    movzx r9d, byte ptr [rdx]       ; odd trailing byte is added as a low byte
+    add rax, r9
+    adc rax, 0
+
+l0:
+    mov r9, rax                     ; fold 64 -> 32 bits with end-around carry
+    shr r9, 32
+    add eax, r9d
+    adc eax, 0
+
+    mov r9d, eax                    ; fold 32 -> 16 bits with end-around carry
+    shr r9d, 16
+    add ax, r9w
+    adc ax, 0
+
+    movzx eax, ax                   ; drop bits 31:16 left over from the 32-bit sum
+
+    mov [rcx], eax                  ; *Accumulator = folded 16-bit sum
+    ret
+AccumulateChecksum  endp
+
+end
diff --git a/src/xenvif/checksum.c b/src/xenvif/checksum.c
index 4e8f2fc..a7181c6 100644
--- a/src/xenvif/checksum.c
+++ b/src/xenvif/checksum.c
@@ -35,6 +35,7 @@
 #include <stdlib.h>
 #include <ethernet.h>
 #include <tcpip.h>
+#include <intrin.h>

 #include <vif_interface.h>

@@ -43,8 +44,9 @@
 #include "assert.h"
 #include "util.h"

-static FORCEINLINE VOID
-__AccumulateChecksum(
+#if !defined(_M_X64)
+VOID
+AccumulateChecksum(
     IN OUT  PULONG  Accumulator,
     IN      PUCHAR  BaseVa,
     IN      ULONG   ByteCount
@@ -70,16 +72,7 @@ __AccumulateChecksum(

     *Accumulator = Current;
 }
-
-VOID
-AccumulateChecksum(
-    IN OUT  PULONG  Accumulator,
-    IN      PVOID   BaseVa,
-    IN      ULONG   ByteCount
-    )
-{
-    __AccumulateChecksum(Accumulator, BaseVa, ByteCount);
-}
+#endif

 BOOLEAN
 ChecksumVerify(
@@ -96,7 +89,7 @@ ChecksumVerify(
     Accumulator &= 0xFFFF;

     // See RFC 1624, section 5
-    __AccumulateChecksum(&Accumulator, (PUCHAR)&Embedded, sizeof (USHORT));
+    AccumulateChecksum(&Accumulator, (PUCHAR)&Embedded, sizeof (USHORT));

     return (Accumulator == 0xFFFF) ? TRUE : FALSE;
 }
@@ -120,7 +113,7 @@ __ChecksumIpVersion4PseudoHeader(
     Header.Protocol = Protocol;

     Accumulator = 0;
-    __AccumulateChecksum(&Accumulator, (PUCHAR)&Header, sizeof 
(IPV4_PSEUDO_HEADER));
+    AccumulateChecksum(&Accumulator, (PUCHAR)&Header, sizeof 
(IPV4_PSEUDO_HEADER));

     // As-per RFC1624, Accumulator should never be 0.
     ASSERT(Accumulator != 0);
@@ -161,7 +154,7 @@ __ChecksumIpVersion6PseudoHeader(
     Header.NextHeader = Protocol;

     Accumulator = 0;
-    __AccumulateChecksum(&Accumulator, (PUCHAR)&Header, sizeof 
(IPV6_PSEUDO_HEADER));
+    AccumulateChecksum(&Accumulator, (PUCHAR)&Header, sizeof 
(IPV6_PSEUDO_HEADER));

     // As-per RFC1624, Accumulator should never be 0.
     ASSERT(Accumulator != 0);
@@ -250,14 +243,14 @@ ChecksumIpVersion4Header(
     Header->Checksum = 0;

     Accumulator = 0;
-    __AccumulateChecksum(&Accumulator,
+    AccumulateChecksum(&Accumulator,
                          StartVa + Info->IpHeader.Offset,
                          Info->IpHeader.Length);

     Header->Checksum = Saved;

     if (Info->IpOptions.Length != 0)
-        __AccumulateChecksum(&Accumulator,
+        AccumulateChecksum(&Accumulator,
                              StartVa + Info->IpOptions.Offset,
                              Info->IpOptions.Length);

@@ -293,14 +286,14 @@ ChecksumTcpPacket(
     TcpHeader->Checksum = 0;

     Accumulator = PseudoHeaderChecksum;
-    __AccumulateChecksum(&Accumulator,
+    AccumulateChecksum(&Accumulator,
                          StartVa + Info->TcpHeader.Offset,
                          Info->TcpHeader.Length);

     TcpHeader->Checksum = Saved;

     if (Info->TcpOptions.Length != 0)
-        __AccumulateChecksum(&Accumulator,
+        AccumulateChecksum(&Accumulator,
                              StartVa + Info->TcpOptions.Offset,
                              Info->TcpOptions.Length);

@@ -338,7 +331,7 @@ ChecksumTcpPacket(
         ByteCount -= Offset;
         ByteCount = __min(ByteCount, Length);

-        __AccumulateChecksum(&Accumulator, BaseVa, ByteCount);
+        AccumulateChecksum(&Accumulator, BaseVa, ByteCount);

         Length -= ByteCount;

@@ -378,7 +371,7 @@ ChecksumUdpPacket(
     UdpHeader->Checksum = 0;

     Accumulator = PseudoHeaderChecksum;
-    __AccumulateChecksum(&Accumulator,
+    AccumulateChecksum(&Accumulator,
                          StartVa + Info->UdpHeader.Offset,
                          Info->UdpHeader.Length);

@@ -417,7 +410,7 @@ ChecksumUdpPacket(
         ByteCount -= Offset;
         ByteCount = __min(ByteCount, Length);

-        __AccumulateChecksum(&Accumulator, BaseVa, ByteCount);
+        AccumulateChecksum(&Accumulator, BaseVa, ByteCount);

         Length -= ByteCount;

diff --git a/vs2019/xenvif/xenvif.vcxproj b/vs2019/xenvif/xenvif.vcxproj
index 0412426..80f37b6 100644
--- a/vs2019/xenvif/xenvif.vcxproj
+++ b/vs2019/xenvif/xenvif.vcxproj
@@ -88,6 +88,11 @@
     <ClCompile Include="../../src/xenvif/controller.c" />
     <ClCompile Include="../../src/xenvif/vif.c" />
   </ItemGroup>
+  <ItemGroup>
+    <MASM Include="../../src/xenvif/amd64/checksum_amd64.asm">
+      <ExcludedFromBuild 
Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
+    </MASM>
+  </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\..\src\xenvif\xenvif.rc" />
   </ItemGroup>
diff --git a/vs2022/xenvif/xenvif.vcxproj b/vs2022/xenvif/xenvif.vcxproj
index be84232..8b7d910 100644
--- a/vs2022/xenvif/xenvif.vcxproj
+++ b/vs2022/xenvif/xenvif.vcxproj
@@ -80,6 +80,11 @@
     <ClCompile Include="../../src/xenvif/controller.c" />
     <ClCompile Include="../../src/xenvif/vif.c" />
   </ItemGroup>
+  <ItemGroup>
+    <MASM Include="../../src/xenvif/amd64/checksum_amd64.asm">
+      <ExcludedFromBuild 
Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
+    </MASM>
+  </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\..\src\xenvif\xenvif.rc" />
   </ItemGroup>
--
2.51.0.windows.1



--
Ngoc Tu Dinh | Vates XCP-ng Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.