# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Compute attention with xFormers' memory-efficient attention operator.

    The three tensors are forwarded to xFormers unchanged. xFormers expects
    them laid out as [batch, seq_len, heads, head_dim]. Only q, k and v are
    passed, so every other operator argument keeps its library default.
    """
    attention_out = xops.memory_efficient_attention(q, k, v)
    return attention_out
# Run the attention benchmark with xformers_attention as the callable under
# test. impl_name is the label this implementation is reported under;
# impl_tags presumably serve as descriptive metadata for the harness
# (confirm against kernels_benchmark_tools).
run_benchmark(
    impl_func=xformers_attention,
    impl_name="xformers_meff",
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
    kernel_type=KernelTypeEnum.ATTENTION,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 9.78% 468.612us 53.77% 2.576ms 2.576ms 0.000us 0.00% 3.664ms 3.664ms 1
xformers_flash3::flash_fwd 4.05% 193.923us 43.19% 2.069ms 689.708us 0.000us 0.00% 3.664ms 1.221ms 3
flash_attn_3::fwd 1.52% 72.582us 39.15% 1.875ms 625.067us 2.752ms 100.00% 3.664ms 1.221ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.05% 2.754ms 2.754ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.752ms 100.00% 2.752ms 917.464us 3
Activity Buffer Request 35.57% 1.704ms 35.57% 1.704ms 1.704ms 911.394us 33.11% 911.394us 911.394us 1
aten::empty 0.91% 43.821us 0.91% 43.821us 7.304us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.25% 12.121us 0.25% 12.121us 4.040us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.89% 42.701us 0.89% 42.701us 14.234us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.31% 15.029us 0.79% 38.050us 6.342us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.48% 23.021us 0.48% 23.021us 3.837us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 46.23% 2.215ms 46.23% 2.215ms 2.215ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.790ms
Self CUDA time total: 2.752ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.55% 315.485us 49.52% 2.386ms 2.386ms 0.000us 0.00% 3.791ms 3.791ms 1
xformers_flash3::flash_fwd 2.94% 141.873us 42.50% 2.048ms 682.535us 0.000us 0.00% 3.791ms 1.264ms 3
flash_attn_3::fwd 1.10% 52.803us 39.56% 1.906ms 635.244us 2.857ms 100.00% 3.791ms 1.264ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.05% 2.858ms 2.858ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.857ms 100.00% 2.857ms 952.327us 3
Activity Buffer Request 37.05% 1.785ms 37.05% 1.785ms 1.785ms 933.660us 32.68% 933.660us 933.660us 1
aten::empty 0.60% 29.019us 0.60% 29.019us 4.837us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.710us 0.12% 5.710us 1.903us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.69% 33.350us 0.69% 33.350us 11.117us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 8.801us 0.47% 22.752us 3.792us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.29% 13.951us 0.29% 13.951us 2.325us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 50.48% 2.432ms 50.48% 2.432ms 2.432ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.818ms
Self CUDA time total: 2.857ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.44% 303.576us 47.74% 2.252ms 2.252ms 0.000us 0.00% 3.845ms 3.845ms 1
xformers_flash3::flash_fwd 3.02% 142.344us 40.83% 1.926ms 641.984us 0.000us 0.00% 3.845ms 1.282ms 3
flash_attn_3::fwd 1.11% 52.511us 37.81% 1.784ms 594.536us 2.878ms 100.00% 3.845ms 1.282ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.880ms 100.05% 2.880ms 2.880ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.878ms 100.00% 2.878ms 959.487us 3
Activity Buffer Request 35.25% 1.663ms 35.25% 1.663ms 1.663ms 967.007us 33.59% 967.007us 967.007us 1
aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.320us 0.11% 5.320us 1.773us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.72% 33.781us 0.72% 33.781us 11.260us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 8.350us 0.47% 21.990us 3.665us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.29% 13.640us 0.29% 13.640us 2.273us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 52.26% 2.465ms 52.26% 2.465ms 2.465ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.717ms
Self CUDA time total: 2.878ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.01% 303.306us 50.06% 2.525ms 2.525ms 0.000us 0.00% 3.923ms 3.923ms 1
xformers_flash3::flash_fwd 2.90% 146.364us 43.59% 2.199ms 733.113us 0.000us 0.00% 3.923ms 1.308ms 3
flash_attn_3::fwd 1.02% 51.431us 40.69% 2.053ms 684.325us 2.938ms 100.00% 3.923ms 1.308ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 100.05% 2.939ms 2.939ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.938ms 100.00% 2.938ms 979.195us 3
Activity Buffer Request 34.86% 1.758ms 34.86% 1.758ms 1.758ms 985.691us 33.55% 985.691us 985.691us 1
aten::empty 0.57% 28.860us 0.57% 28.860us 4.810us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.561us 0.11% 5.561us 1.854us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.14% 208.674us 4.14% 208.674us 69.558us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 9.230us 0.45% 22.800us 3.800us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.27% 13.570us 0.27% 13.570us 2.262us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 49.94% 2.520ms 49.94% 2.520ms 2.520ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.045ms
Self CUDA time total: 2.938ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.53% 307.446us 44.37% 2.468ms 2.468ms 0.000us 0.00% 4.694ms 4.694ms 1
xformers_flash3::flash_fwd 2.65% 147.575us 38.45% 2.139ms 712.966us 0.000us 0.00% 4.694ms 1.565ms 3
flash_attn_3::fwd 0.89% 49.519us 35.79% 1.991ms 663.774us 3.515ms 100.00% 4.694ms 1.565ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.517ms 100.05% 3.517ms 3.517ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.515ms 100.00% 3.515ms 1.172ms 3
Activity Buffer Request 30.66% 1.706ms 30.66% 1.706ms 1.706ms 1.179ms 33.55% 1.179ms 1.179ms 1
aten::empty 0.52% 28.861us 0.52% 28.861us 4.810us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 6.000us 0.11% 6.000us 2.000us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.61% 201.015us 3.61% 201.015us 67.005us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.15% 8.290us 0.39% 21.930us 3.655us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.25% 13.640us 0.25% 13.640us 2.273us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 55.63% 3.095ms 55.63% 3.095ms 3.095ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.563ms
Self CUDA time total: 3.515ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.46% 305.147us 45.13% 2.521ms 2.521ms 0.000us 0.00% 4.658ms 4.658ms 1
xformers_flash3::flash_fwd 2.65% 147.824us 39.28% 2.194ms 731.306us 0.000us 0.00% 4.658ms 1.553ms 3
flash_attn_3::fwd 0.94% 52.350us 36.63% 2.046ms 682.031us 3.488ms 100.00% 4.658ms 1.553ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.489ms 100.05% 3.489ms 3.489ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.488ms 100.00% 3.488ms 1.163ms 3
Activity Buffer Request 31.45% 1.757ms 31.45% 1.757ms 1.757ms 1.171ms 33.57% 1.171ms 1.171ms 1
aten::empty 0.54% 29.960us 0.54% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.370us 0.10% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.61% 201.885us 3.61% 201.885us 67.295us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.15% 8.170us 0.39% 21.900us 3.650us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.25% 13.730us 0.25% 13.730us 2.288us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 54.87% 3.065ms 54.87% 3.065ms 3.065ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.586ms
Self CUDA time total: 3.488ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 0.99 True
xformers_meff cuda_attn_L256_bfloat16 1.04 True
xformers_meff cuda_attn_L320_bfloat16 1.07 True
xformers_meff cuda_attn_L384_bfloat16 1.08 True
xformers_meff cuda_attn_L448_bfloat16 1.26 True
xformers_meff cuda_attn_L512_bfloat16 1.25 True
▶ UV Install Logs