Skip to content

Commit aafb23e

Browse files
committed
Add vLLM KV cache layout for XQA MLA
Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>
1 parent 50d4e5b commit aafb23e

File tree

4 files changed

+80
-5
lines changed

4 files changed

+80
-5
lines changed

cpp/kernels/xqa/mha.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,14 @@ void launchMLA(cudaDeviceProp const& prop,
169169
uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed
170170
float qScale, OutputHead* output, InputHead const* q,
171171
#if USE_PAGED_KV_CACHE
172+
#if PAGED_KV_CACHE_LAYOUT == 1
173+
GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
174+
#else
172175
GMemCacheHead* pool, // global pool of pages
176+
#endif
173177
KVCachePageIndex const*
174-
kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
178+
kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] (Layout 0) or
179+
// [batchSize][maxNbPagesPerSeq] (Layout 1)
175180
#else
176181
GMemKVCacheHead* kvCacheData,
177182
#endif

cpp/kernels/xqa/mla_sm120.cu

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,11 @@ __device__ inline KVTilePartLoader::KVTilePartLoader(
112112
, tensorMap{tensorMap}
113113
#if USE_PAGED_KV_CACHE
114114
, nbPages{nbPages}
115+
#if PAGED_KV_CACHE_LAYOUT == 1
116+
, baseOffset{idxReq * cacheList.maxNbPagesPerSeq}
117+
#else
115118
, baseOffset{((idxReq * beamWidth) * 2) * cacheList.maxNbPagesPerSeq}
119+
#endif
116120
#else
117121
, baseOffset{(idxReq * beamWidth) * 2}
118122
#endif
@@ -139,7 +143,11 @@ __device__ inline void KVTilePartLoader::loadData(Array2D<LdGrain, nbTokens, gra
139143
uint32_t const offset = nbTokens * (idxTile % exactDiv(tokensPerPage, nbTokens));
140144
if (warpElectSync())
141145
{
146+
#if PAGED_KV_CACHE_LAYOUT == 1
147+
tma::loadAsync(&dst, tensorMap, DimsLE<4>{idxElemBeg, idxHeadGrp, offset, (uint32_t) pages[0]}, bar);
148+
#else
142149
tma::loadAsync(&dst, tensorMap, DimsLE<4>{idxElemBeg, offset, idxHeadGrp, (uint32_t) pages[0]}, bar);
150+
#endif
143151
}
144152
}
145153
else
@@ -149,8 +157,13 @@ __device__ inline void KVTilePartLoader::loadData(Array2D<LdGrain, nbTokens, gra
149157
{
150158
if (warpElectSync())
151159
{
160+
#if PAGED_KV_CACHE_LAYOUT == 1
161+
tma::loadAsync(&dst(tokensPerPage * i, 0), tensorMap,
162+
DimsLE<4>{idxElemBeg, idxHeadGrp, 0, (uint32_t) pages[i]}, bar);
163+
#else
152164
tma::loadAsync(&dst(tokensPerPage * i, 0), tensorMap,
153165
DimsLE<4>{idxElemBeg, 0, idxHeadGrp, (uint32_t) pages[i]}, bar);
166+
#endif
154167
}
155168
}
156169
}
@@ -1859,13 +1872,18 @@ CUtensorMap makeTensorMapForQ(
18591872
#endif // IS_MLA
18601873

18611874
void launchMLA(cudaDeviceProp const& prop,
1862-
uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed
1875+
uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed
18631876
float qScale, OutputHead* output, InputHead const* q,
1864-
float* attentionSinks, // [headGrpSize], not supported.
18651877
#if USE_PAGED_KV_CACHE
1866-
GMemCacheHead* pool, // global pool of pages
1878+
#if PAGED_KV_CACHE_LAYOUT == 1
1879+
GMemCacheHead* kCacheVLLM, // K cache pool for VLLM layout
1880+
GMemCacheHead* vCacheVLLM, // V cache pool for VLLM layout
1881+
#else
1882+
GMemCacheHead* pool, // global pool of pages
1883+
#endif
18671884
KVCachePageIndex const*
1868-
kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
1885+
kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] (Layout 0) or
1886+
// [batchSize][maxNbPagesPerSeq] (Layout 1)
18691887
#else
18701888
GMemKVCacheHead* kvCacheData,
18711889
#endif
@@ -1916,7 +1934,11 @@ void launchMLA(cudaDeviceProp const& prop,
19161934
auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
19171935
#if USE_PAGED_KV_CACHE
19181936
uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
1937+
#if PAGED_KV_CACHE_LAYOUT == 1
1938+
KVCacheList<true> const cacheList{kCacheVLLM, vCacheVLLM, kvCachePageList, seqLen, maxNbPagesPerSeq};
1939+
#else
19191940
KVCacheList<true> const cacheList{pool, kvCachePageList, seqLen, maxNbPagesPerSeq};
1941+
#endif
19201942
auto const dtype = []
19211943
{
19221944
if (std::is_same_v<CacheElem, half>)
@@ -1936,10 +1958,17 @@ void launchMLA(cudaDeviceProp const& prop,
19361958

19371959
auto const tensorMapQ
19381960
= makeTensorMapForQ(q, dtype, validElemsPerHead, headGrpSize * inputSeqLen * batchSize, partElemsK);
1961+
#if PAGED_KV_CACHE_LAYOUT == 1
1962+
auto const tensorMapK = makeTensorMapForPagedKVCache(
1963+
kCacheVLLM, dtype, validElemsPerHead, nbKHeads, tokensPerPage, partElemsK, tokensPerTile);
1964+
auto const tensorMapV = makeTensorMapForPagedKVCache(
1965+
vCacheVLLM, dtype, validElemsPerHead, nbKHeads, tokensPerPage, partElemsV, tokensPerTile);
1966+
#else
19391967
auto const tensorMapK = makeTensorMapForPagedKVCache(
19401968
pool, dtype, validElemsPerHead, nbKHeads, tokensPerPage, partElemsK, tokensPerTile);
19411969
auto const tensorMapV = makeTensorMapForPagedKVCache(
19421970
pool, dtype, validElemsPerHead, nbKHeads, tokensPerPage, partElemsV, tokensPerTile);
1971+
#endif
19431972

19441973
uint32_t const nbCgas = exactDiv(dimGrid.x, 4) * dimGrid.y * dimGrid.z;
19451974
auto const cgaXBuf = static_cast<Vec<CgaXBuffer, nbProducerCtasPerCga>*>(scratch);

cpp/kernels/xqa/test/test.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,22 @@ class ManagedMemBuf
7979
{
8080
if (!isTracing)
8181
{
82+
#if CUDA_VERSION >= 13000
83+
cudaMemLocation location;
84+
if (dstDevice == cudaCpuDeviceId)
85+
{
86+
location.type = cudaMemLocationTypeHost;
87+
location.id = 0;
88+
}
89+
else
90+
{
91+
location.type = cudaMemLocationTypeDevice;
92+
location.id = dstDevice;
93+
}
94+
checkCuda(cudaMemPrefetchAsync(get(), sizeof(T) * size(), location, 0, stream));
95+
#else
8296
checkCuda(cudaMemPrefetchAsync(get(), sizeof(T) * size(), dstDevice, stream));
97+
#endif
8398
}
8499
}
85100

@@ -507,6 +522,9 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
507522
#endif
508523
#if IS_MLA
509524
#if USE_PAGED_KV_CACHE
525+
#if PAGED_KV_CACHE_LAYOUT == 1
526+
// VLLM format: K and V share the same pageList, no copy needed
527+
#else
510528
for (uint32_t idxReq = 0; idxReq < batchSize; idxReq++)
511529
{
512530
for (uint32_t idxBeam = 0; idxBeam < beamWidth; idxBeam++)
@@ -517,6 +535,7 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
517535
}
518536
}
519537
}
538+
#endif
520539
#else
521540
static_assert(false, "not implemented");
522541
#endif
@@ -691,7 +710,11 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
691710
#else
692711
&output[0][0][0], &qHeads[0][0][0],
693712
#endif
713+
#if PAGED_KV_CACHE_LAYOUT == 1 && USE_PAGED_KV_CACHE
714+
cacheKHeads.get(), cacheVHeads.get(),
715+
#else
694716
cacheHeads.get(),
717+
#endif
695718
#if USE_PAGED_KV_CACHE
696719
pageListArg,
697720
#endif
@@ -790,7 +813,13 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
790813
float ms;
791814
checkCuda(cudaEventElapsedTime(&ms, tic, toc));
792815
ms /= nbIters;
816+
#if CUDA_VERSION >= 13000
817+
int memoryClockRateKHz;
818+
checkCuda(cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, device));
819+
float const bandwidth = 2.f * prop.memoryBusWidth * memoryClockRateKHz * 1000 / 8;
820+
#else
793821
float const bandwidth = 2.f * prop.memoryBusWidth * prop.memoryClockRate * 1000 / 8;
822+
#endif
794823
#if BEAM_WIDTH == 1
795824
size_t nbLoadedCacheTokens = seqLen * beamWidth * batchSize;
796825
#else
@@ -819,7 +848,11 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
819848
{
820849
printf("done\n");
821850
printf("time: %f ms\n", ms);
851+
#if CUDA_VERSION >= 13000
852+
printf("mem bus width = %d\nmem clock rate = %d\n", prop.memoryBusWidth, memoryClockRateKHz);
853+
#else
822854
printf("mem bus width = %d\nmem clock rate = %d\n", prop.memoryBusWidth, prop.memoryClockRate);
855+
#endif
823856
printf("bandwidth = %e\n", (float) bandwidth);
824857
printf("traffic=%e\n", (float) totalTraffic);
825858
}

cpp/kernels/xqa/test/warmup.cu

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,15 @@ __global__ void kernel_warmup(uint64_t cycles)
1212

1313
void warmup(cudaDeviceProp const& prop, float ms, cudaStream_t stream = nullptr)
1414
{
15+
#if CUDA_VERSION >= 13000
16+
int device;
17+
checkCuda(cudaGetDevice(&device));
18+
int clockRateKHz;
19+
checkCuda(cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, device));
20+
uint64_t const nbCycles = std::round(clockRateKHz * ms); // clockRate is in kHz
21+
#else
1522
uint64_t const nbCycles = std::round(prop.clockRate * ms); // clockRate is in kHz
23+
#endif
1624
kernel_warmup<<<16, 128, 0, stream>>>(nbCycles);
1725
checkCuda(cudaGetLastError());
1826
}

0 commit comments

Comments (0)