Mojo struct
SM100TensorAccumulatorTS
@register_passable(trivial)
struct SM100TensorAccumulatorTS[operand_type: DType, accum_type: DType, MMA_M: Int, MMA_N: Int, BM: Int, BN: Int, BK: Int, num_consumer_threads: Int, swizzle_b: TensorMapSwizzle = TensorMapSwizzle(__init__[__mlir_type.!pop.int_literal](3)), transpose_b: Bool = True, cta_group: Int = 1]
Fields
- mbar (
UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3), alignment=8]
): - phase (
SIMD[uint32, 1]
):
Implemented traits
AnyType
,
AsyncTensorAccumulatorTS
,
Copyable
,
Movable
,
UnknownDestructibility
Aliases
a_frag_size
alias a_frag_size = 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, 16), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, 16), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((MMA_M * 16) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, 16), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)
a_t
alias a_t = TMemOperand[operand_type, 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0 if (MMA_N == 0) else (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">) + -1) if (((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">) == 0) ^ True) & ((BN < 0) ^ (MMA_N < 0))) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">), 0 if (0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) else (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) + -1) if (((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) == 0) ^ True) & ((0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) < 0) ^ (BM < 0))) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">), BK, 16, num_consumer_threads]
ab_t
alias ab_t = UMMADescriptorTS[operand_type, 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0 if (MMA_N == 0) else (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">) + -1) if (((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">) == 0) ^ True) & ((BN < 0) ^ (MMA_N < 0))) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">), MMA_M=0 if (0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) else (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) + -1) if (((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) == 0) ^ True) & ((0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) < 0) ^ (BM < 0))) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">), MMA_N=BK, MMA_K=16, consumer_group_size=num_consumer_threads]
accum_t
alias accum_t = accum_type
b_offset
alias b_offset = MMAOperandOffsetFn()
b_t
alias b_t = MMASmemDescriptor
c_frag_size
alias c_frag_size = 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((MMA_M * MMA_N) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)
c_t
alias c_t = TMemAccumulator[accum_type, 0 if (0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) else (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) + -1) if (((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) == 0) ^ True) & ((0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) < 0) ^ (BM < 0))) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), 0), lt(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">), MMA_N, 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0 if (MMA_N == 0) else (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">) + -1) if (((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">) == 0) ^ True) & ((BN < 0) ^ (MMA_N < 0))) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">), num_consumer_threads]
idesc
alias idesc = create[::DType,::DType,::DType,::IndexList[::Int()
MMA_K
alias MMA_K = 16
num_k_mmas
alias num_k_mmas = (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BK, "value">, 16) + -1) if ((BK < 0) & ((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BK, "value">, 16) == 0) ^ True)) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BK, "value">, 16)
num_m_blocks_per_warp
alias num_m_blocks_per_warp = 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, 2), #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)
num_m_mmas
alias num_m_mmas = 0 if (MMA_M == 0) else (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, 0), {1}, MMA_M), "value">) + -1) if (((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, 0), {1}, MMA_M), "value">) == 0) ^ True) & ((BM < 0) ^ (MMA_M < 0))) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BM, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_M, "value">, 0), {1}, MMA_M), "value">)
num_n_mmas
alias num_n_mmas = 0 if (MMA_N == 0) else (div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">) + -1) if (((rem_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">) == 0) ^ True) & ((BN < 0) ^ (MMA_N < 0))) else div_s(#lit.struct.extract<:@stdlib::@builtin::@int::@Int BN, "value">, #lit.struct.extract<:@stdlib::@builtin::@int::@Int cond(eq(#lit.struct.extract<:@stdlib::@builtin::@int::@Int MMA_N, "value">, 0), {1}, MMA_N), "value">)
operand_t
alias operand_t = operand_type
smem_ptr_t
alias smem_ptr_t = UnsafePointer[SIMD[operand_type, 1], address_space=AddressSpace(3)]
Methods
__init__
__init__(smem: UnsafePointer[SharedMemBarrier, address_space=AddressSpace(3), alignment=8]) -> Self
check_constraints
static check_constraints()
init
init(self)
a_mma_descriptor
static a_mma_descriptor(a_tmem: SIMD[uint32, 1]) -> TMemOperand[operand_type, 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0 if (MMA_N == 0) else (div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">) + -1) if (((rem_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">) == 0) ^ True) & ((BN < 0) ^ (MMA_N < 0))) else div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">), 0 if (0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) else (div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) + -1) if (((rem_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) == 0) ^ True) & ((0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) < 0) ^ (BM < 0))) else div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">), BK, 16, num_consumer_threads]
b_mma_descriptor
static b_mma_descriptor[dtype_b: DType](p_b: UnsafePointer[SIMD[dtype_b, 1], address_space=AddressSpace(3)]) -> MMASmemDescriptor
mma
mma(self, a: TMemOperand[operand_type, 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0 if (MMA_N == 0) else (div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">) + -1) if (((rem_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">) == 0) ^ True) & ((BN < 0) ^ (MMA_N < 0))) else div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">), 0 if (0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) else (div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) + -1) if (((rem_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) == 0) ^ True) & ((0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) < 0) ^ (BM < 0))) else div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">), BK, 16, num_consumer_threads], b: MMASmemDescriptor, c: TMemAccumulator[accum_type, 0 if (0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) else (div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) + -1) if (((rem_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">) == 0) ^ True) & ((0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) < 0) ^ (BM < 0))) else div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)})), "value">, 0), {1}, cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {0}, cond(and(ne(rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0), xor(lt(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), 0), lt(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0))), {value = add(div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), -1)}, {value = div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">)}))), "value">), MMA_N, 0 if (num_consumer_threads == 0) else (div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) + -1) if (((rem_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">) == 0) ^ True) & (((BM * 2) < 0) ^ (num_consumer_threads < 0))) else div_s(mul(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BM, "value">, 2), #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int num_consumer_threads, "value">, 0), {1}, num_consumer_threads), "value">), 0 if (MMA_N == 0) else (div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">) + -1) if (((rem_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">) == 0) ^ True) & ((BN < 0) ^ (MMA_N < 0))) else div_s(#lit.struct.extract<:_stdlib::_builtin::_int::_Int BN, "value">, #lit.struct.extract<:_stdlib::_builtin::_int::_Int cond(eq(#lit.struct.extract<:_stdlib::_builtin::_int::_Int MMA_N, "value">, 0), {1}, MMA_N), "value">), num_consumer_threads], c_scale: SIMD[uint32, 1], wg_idx: SIMD[uint32, 1] = __init__[__mlir_type.!pop.int_literal](0))
wait_group
wait_group[wgmma_left_in_flight: Int = 0](mut self)
Was this page helpful?
Thank you! We'll create more content like this.
Thank you for helping us improve!