Mojo module
matmul_sm90
Aliases
NumWarpPerWarpGroup
alias NumWarpPerWarpGroup = _resolve_warpgroup_size() // _resolve_warp_size()
(Floor division of the warpgroup size by the warp size; evaluates to 0 if the warp size is 0.)
Functions
- `cluster_size`
- `consumer_main_loop`
- `cpasync_wgmma_kernel`
- `find_K_alignment_upto_16B`: Find alignment among 1B, 2B, 4B, 16B based on the row's bytes.
- `hopper_matmul_tma_wgmma`
- `hopper_matmul_tma_wgmma_kernel`
- `promote_to_cuda_cores`
- `tma_wgmma_warp_specialized_gemm_kernel`
- `tma_wgmma_warp_specialized_gemm_kernel_persistent`
- `warp_specialize_gemm_with_multicasting`
- `warp_specialized_gemm_output`
Was this page helpful?
Thank you! We'll create more content like this.
Thank you for helping us improve!