Mojo function

rms_norm_fused_residual_add_gpu_block

rms_norm_fused_residual_add_gpu_block[
    mut1: Bool,
    origin1: Origin[mut1],
    layout1: Layout,
    mut2: Bool,
    origin2: Origin[mut2],
    layout2: Layout,
    dtype: DType,
    //,
    simd_width: Int,
    max_warps_per_block: Int,
    input_fn: fn[width: Int](row: Int, col: Int) capturing -> SIMD[dtype, width],
    residual_input_fn: fn[width: Int](row: Int, col: Int) capturing -> SIMD[dtype, width],
    output_fn: fn[width: Int, alignment: Int](row: Int, col: Int, val: SIMD[dtype, width]) capturing -> None,
    output_residual_fn: fn[width: Int, alignment: Int](row: Int, col: Int, val: SIMD[dtype, width]) capturing -> None,
    multiply_before_cast: Bool
](
    gamma1: LayoutTensor[dtype, layout1, origin1],
    epsilon1: Scalar[dtype],
    weight_offset1: Scalar[dtype],
    gamma2: LayoutTensor[dtype, layout2, origin2],
    epsilon2: Scalar[dtype],
    weight_offset2: Scalar[dtype],
    num_cols: Int
)