Mojo module
matmul_gpu
Aliases
- tile_shapes_128X128X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_128X128X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_128X256X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_128X256X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_128X64X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_128X64X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_256X128X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_256X128X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_256X256X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_256X256X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_256X64X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_256X64X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_64X128X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_64X128X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_64X256X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_64X256X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_64X64X32 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
- tile_shapes_64X64X64 = _get_block_warp_tile_shape[::Int,::Int,::Int]()
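These aliases are undocumented on this page. Judging by the helper's name, _get_block_warp_tile_shape, each alias presumably bundles a thread-block tile shape with a matching warp tile shape, with the MXNXK suffix giving the block tile's M x N x K dimensions (so tile_shapes_128X256X64 would describe a 128 x 256 x 64 block tile). This reading is an inference from the names, not something the page states.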
Functions
- __nvvm_ldg_f4 (see the hedged sketch after this list)
- matmul_kernel: Matrix multiplication using shared memory. This version loads blocks of size tile_size x tile_size from A and B and updates a tile_size x tile_size tile in C. The thread block should have shape (tile_size, tile_size, 1), with each thread mapped to one element of C. The grid should have shape (N/tile_size, M/tile_size, 1); N is the first grid dimension for coalesced access. (A CUDA sketch of this scheme follows the list.)
- matmul_kernel_naive (see the sketch after this list)
- multistage_gemm (see the note after this list)
- split_k_reduce (see the sketch after this list)
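__nvvm_ldg_f4 is undocumented here. The name suggests a thin wrapper over NVVM's ldg intrinsic for a 4-wide f32 vector, i.e., a load routed through the GPU's read-only data cache; that is an assumption from the name alone. The CUDA analogue is __ldg on a const float4 pointer:

```cuda
#include <cuda_runtime.h>

// Hypothetical illustration, not the module's code: copy float4s
// using read-only-cache loads, the CUDA analogue of ldg on f32x4.
__global__ void copy_via_ldg(const float4* __restrict__ in,
                             float4* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = __ldg(&in[i]);  // load through the read-only cache
    }
}
```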
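The matmul_kernel description above maps directly onto the classic shared-memory tiling pattern. Here is a minimal CUDA sketch of that scheme, assuming row-major A (M x K), B (K x N), and C (M x N) with all dimensions divisible by the tile size; the names TILE and matmul_tiled are illustrative, not the module's API:

```cuda
#define TILE 16

// blockDim = (TILE, TILE, 1); gridDim = (N / TILE, M / TILE, 1),
// matching the doc: N is the first grid dimension so that adjacent
// threads write adjacent columns of C (coalesced access).
__global__ void matmul_tiled(const float* A, const float* B, float* C,
                             int M, int N, int K) {
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;  // M index
    int col = blockIdx.x * TILE + threadIdx.x;  // N index
    float acc = 0.0f;

    // Walk K in TILE-sized steps, staging one tile of A and one of B
    // in shared memory before each partial inner product.
    for (int k0 = 0; k0 < K; k0 += TILE) {
        As[threadIdx.y][threadIdx.x] = A[row * K + k0 + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(k0 + threadIdx.y) * N + col];
        __syncthreads();
        for (int k = 0; k < TILE; ++k)
            acc += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();
    }
    C[row * N + col] = acc;  // one thread per element of C
}
```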
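matmul_kernel_naive carries no description; the name suggests the textbook one-thread-per-output kernel with no shared-memory reuse, sketched here under the same layout assumptions as above:

```cuda
// Hedged sketch: each thread re-reads its row of A and column of B
// straight from global memory, with no data reuse across threads.
__global__ void matmul_naive(const float* A, const float* B, float* C,
                             int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N) {
        float acc = 0.0f;
        for (int k = 0; k < K; ++k)
            acc += A[row * K + k] * B[k * N + col];
        C[row * N + col] = acc;
    }
}
```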
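multistage_gemm is likewise undocumented. Read together with the block/warp tile-shape aliases above, the name suggests a software-pipelined GEMM that keeps several K-stages of A and B tiles in flight in shared memory, so that the global-memory loads for one stage overlap the math of the previous stage. That is an inference from the name and the aliases, not something this page states.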
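split_k_reduce presumably implements the reduction pass of a split-K GEMM, where the K dimension is partitioned across thread blocks that each write a partial C, and the partials are then summed. A hedged sketch, assuming the partials are stored contiguously as [num_splits][M * N] (the layout is an assumption):

```cuda
// Hedged sketch: sum num_splits partial GEMM outputs into C.
__global__ void split_k_reduce_sketch(const float* partials, float* C,
                                      int num_splits, int mn) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < mn) {
        float acc = 0.0f;
        for (int s = 0; s < num_splits; ++s)
            acc += partials[s * mn + i];  // stride mn between splits
        C[i] = acc;
    }
}
```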