xetla/group_2gemm_2compute__policy_8hpp_source.html

/*******************************************************************************

* Copyright (c) 2022-2023 Intel Corporation

*

* Licensed under the Apache License, Version 2.0 (the "License");

* you may not use this file except in compliance with the License.

* You may obtain a copy of the License at

*

*     http://www.apache.org/licenses/LICENSE-2.0

*

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*******************************************************************************/


#pragma once


#include "group/gemm/common.hpp"


namespace gpu::xetla::group {


/// @brief Compute policy for xmx engine.
/// The primary template declares no members; the partial specialization for
/// gpu_arch::Xe supplies the actual policy definitions.
/// @tparam compute_attr_ Compute attribute type, re-exported by the
/// specialization as `compute_attr`.
/// @tparam perf_tuning_knob_ Performance knob type, re-exported by the
/// specialization as `perf_tuning_knob`.
/// @tparam arch_tag_ Architecture tag used to select a specialization.
template <class compute_attr_, class perf_tuning_knob_, gpu_arch arch_tag_>
struct compute_policy_default_xmx {};


/// @brief Compute policy for xmx engine, specialized for gpu_arch::Xe.
/// @tparam compute_attr_ Must provide the member types `dtype_a`, `dtype_b`
/// and `dtype_acc`.
/// @tparam perf_tuning_knob_ Must provide the constants `k_stride`, `stages`
/// and `sync_freq`.
template <class compute_attr_, class perf_tuning_knob_>
struct compute_policy_default_xmx<compute_attr_, perf_tuning_knob_,
        gpu_arch::Xe> {
    /// Architecture this specialization is written for.
    static constexpr gpu_arch arch_tag = gpu_arch::Xe;

    // Template arguments re-exported under stable member names.
    using perf_tuning_knob = perf_tuning_knob_;
    using compute_attr = compute_attr_;

    // Element types taken from the compute attributes.
    using dtype_mma_a = typename compute_attr_::dtype_a;
    using dtype_mma_b = typename compute_attr_::dtype_b;
    using dtype_mma_acc = typename compute_attr_::dtype_acc;

    // Tuning constants forwarded unchanged from the knob type.
    static constexpr int sync_freq = perf_tuning_knob_::sync_freq;
    static constexpr int stages = perf_tuning_knob_::stages;
    static constexpr int k_stride = perf_tuning_knob_::k_stride;

    // Matrix A block: x extent is 32 bytes expressed as a count of
    // dtype_mma_a elements; y extent is fixed at 16.
    static constexpr uint32_t block_size_y_a = 16;
    static constexpr uint32_t block_bytes_x_a = 32;
    static constexpr uint32_t block_size_x_a
            = block_bytes_x_a / sizeof(dtype_mma_a);

    // Matrix B block: y extent is 32 bytes expressed as a count of
    // dtype_mma_b elements; x extent is fixed at 16.
    static constexpr uint32_t block_bytes_y_b = 32;
    static constexpr uint32_t block_size_y_b
            = block_bytes_y_b / sizeof(dtype_mma_b);
    static constexpr uint32_t block_size_x_b = 16;

    // Both byte budgets are 32, so this holds exactly when dtype_mma_a and
    // dtype_mma_b yield the same element count per 32 bytes.
    static_assert(block_size_y_b == block_size_x_a,
            "mat_a x need to match with mat_b y");
};


/// @brief Compute policy for unaligned shape and xmx engine.
/// The primary template declares no members; the partial specialization for
/// gpu_arch::Xe supplies the actual policy definitions.
/// @tparam compute_attr_ Compute attribute type, re-exported by the
/// specialization as `compute_attr`.
/// @tparam perf_tuning_knob_ Performance knob type, re-exported by the
/// specialization as `perf_tuning_knob`.
/// @tparam arch_tag_ Architecture tag used to select a specialization;
/// defaults to gpu_arch::Xe when omitted.
template <class compute_attr_, class perf_tuning_knob_,
        gpu_arch arch_tag_ = gpu_arch::Xe>
struct compute_policy_unaligned_xmx {};


/// @brief Compute policy for unaligned shape and xmx engine, specialized for
/// gpu_arch::Xe.
/// @tparam compute_attr_ Must provide the member types `dtype_a`, `dtype_b`
/// and `dtype_acc`.
/// @tparam perf_tuning_knob_ Must provide the constants `k_stride`, `stages`
/// and `sync_freq`.
template <class compute_attr_, class perf_tuning_knob_>
struct compute_policy_unaligned_xmx<compute_attr_, perf_tuning_knob_,
        gpu_arch::Xe> {
    /// Architecture this specialization is written for.
    static constexpr gpu_arch arch_tag = gpu_arch::Xe;

    // Template arguments re-exported under stable member names.
    using perf_tuning_knob = perf_tuning_knob_;
    using compute_attr = compute_attr_;

    // Element types taken from the compute attributes.
    using dtype_mma_a = typename compute_attr_::dtype_a;
    using dtype_mma_b = typename compute_attr_::dtype_b;
    using dtype_mma_acc = typename compute_attr_::dtype_acc;

    // Tuning constants forwarded unchanged from the knob type.
    static constexpr int sync_freq = perf_tuning_knob_::sync_freq;
    static constexpr int stages = perf_tuning_knob_::stages;
    static constexpr int k_stride = perf_tuning_knob_::k_stride;

    // Matrix A block: x extent is 32 bytes expressed as a count of
    // dtype_mma_a elements; y extent is fixed at 16.
    static constexpr uint32_t block_size_y_a = 16;
    static constexpr uint32_t block_bytes_x_a = 32;
    static constexpr uint32_t block_size_x_a
            = block_bytes_x_a / sizeof(dtype_mma_a);

    // Matrix B block: y extent is 32 bytes expressed as a count of
    // dtype_mma_b elements; x extent is fixed at 16.
    static constexpr uint32_t block_bytes_y_b = 32;
    static constexpr uint32_t block_size_y_b
            = block_bytes_y_b / sizeof(dtype_mma_b);
    static constexpr uint32_t block_size_x_b = 16;

    // Both byte budgets are 32, so this holds exactly when dtype_mma_a and
    // dtype_mma_b yield the same element count per 32 bytes.
    static_assert(block_size_y_b == block_size_x_a,
            "mat_a x need to match with mat_b y");
};


/// @brief Compute policy for fpu engine.
/// The primary template declares no members; the partial specialization for
/// gpu_arch::Xe supplies the actual policy definitions.
/// @tparam compute_attr_ Compute attribute type, re-exported by the
/// specialization as `compute_attr`.
/// @tparam perf_tuning_knob_ Performance knob type, re-exported by the
/// specialization as `perf_tuning_knob`.
/// @tparam arch_tag_ Architecture tag used to select a specialization.
template <class compute_attr_, class perf_tuning_knob_, gpu_arch arch_tag_>
struct compute_policy_default_fpu {};


/// @brief Compute policy for fpu engine, specialized for gpu_arch::Xe.
/// @tparam compute_attr_ Must provide the member types `dtype_a`, `dtype_b`
/// and `dtype_acc`.
/// @tparam perf_tuning_knob_ Must provide the constants `k_stride`, `stages`
/// and `sync_freq`.
template <class compute_attr_, class perf_tuning_knob_>
struct compute_policy_default_fpu<compute_attr_, perf_tuning_knob_,
        gpu_arch::Xe> {
    /// Architecture this specialization is written for.
    static constexpr gpu_arch arch_tag = gpu_arch::Xe;

    // Template arguments re-exported under stable member names.
    using perf_tuning_knob = perf_tuning_knob_;
    using compute_attr = compute_attr_;

    // Element types taken from the compute attributes.
    using dtype_mma_a = typename compute_attr_::dtype_a;
    using dtype_mma_b = typename compute_attr_::dtype_b;
    using dtype_mma_acc = typename compute_attr_::dtype_acc;

    // Tuning constants forwarded unchanged from the knob type.
    static constexpr int sync_freq = perf_tuning_knob_::sync_freq;
    static constexpr int stages = perf_tuning_knob_::stages;
    static constexpr int k_stride = perf_tuning_knob_::k_stride;

    // Matrix A block: x extent is 32 bytes expressed as a count of
    // dtype_mma_a elements; y extent is fixed at 16.
    static constexpr uint32_t block_size_y_a = 16;
    static constexpr uint32_t block_bytes_x_a = 32;
    static constexpr uint32_t block_size_x_a
            = block_bytes_x_a / sizeof(dtype_mma_a);

    // Matrix B block: x extent is 64 bytes expressed as a count of
    // dtype_mma_b elements; y extent is defined to equal A's x extent.
    static constexpr uint32_t block_bytes_x_b = 64;
    static constexpr uint32_t block_size_x_b
            = block_bytes_x_b / sizeof(dtype_mma_b);
    static constexpr uint32_t block_size_y_b = block_size_x_a;
};


} // namespace gpu::xetla::group

gpu::xetla::group
Definition limitation.hpp:607

gpu::xetla::gpu_arch
gpu_arch
Definition common.hpp:73

gpu::xetla::gpu_arch::Xe
@ Xe

gpu::xetla::group::compute_policy_default_fpu< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_acc
typename compute_attr::dtype_acc dtype_mma_acc
Definition compute_policy.hpp:117

gpu::xetla::group::compute_policy_default_fpu< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::perf_tuning_knob
perf_tuning_knob_ perf_tuning_knob
Definition compute_policy.hpp:112

gpu::xetla::group::compute_policy_default_fpu< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_b
typename compute_attr::dtype_b dtype_mma_b
Definition compute_policy.hpp:119

gpu::xetla::group::compute_policy_default_fpu< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::compute_attr
compute_attr_ compute_attr
Definition compute_policy.hpp:111

gpu::xetla::group::compute_policy_default_fpu< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_a
typename compute_attr::dtype_a dtype_mma_a
Definition compute_policy.hpp:118

gpu::xetla::group::compute_policy_default_fpu
Compute policy for fpu engine.
Definition compute_policy.hpp:105

gpu::xetla::group::compute_policy_default_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::compute_attr
compute_attr_ compute_attr
Definition compute_policy.hpp:41

gpu::xetla::group::compute_policy_default_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_b
typename compute_attr::dtype_b dtype_mma_b
Definition compute_policy.hpp:49

gpu::xetla::group::compute_policy_default_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::perf_tuning_knob
perf_tuning_knob_ perf_tuning_knob
Definition compute_policy.hpp:42

gpu::xetla::group::compute_policy_default_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_acc
typename compute_attr::dtype_acc dtype_mma_acc
Definition compute_policy.hpp:47

gpu::xetla::group::compute_policy_default_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_a
typename compute_attr::dtype_a dtype_mma_a
Definition compute_policy.hpp:48

gpu::xetla::group::compute_policy_default_xmx
Compute policy for xmx engine.
Definition compute_policy.hpp:35

gpu::xetla::group::compute_policy_unaligned_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_a
typename compute_attr::dtype_a dtype_mma_a
Definition compute_policy.hpp:83

gpu::xetla::group::compute_policy_unaligned_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_acc
typename compute_attr::dtype_acc dtype_mma_acc
Definition compute_policy.hpp:82

gpu::xetla::group::compute_policy_unaligned_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::dtype_mma_b
typename compute_attr::dtype_b dtype_mma_b
Definition compute_policy.hpp:84

gpu::xetla::group::compute_policy_unaligned_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::compute_attr
compute_attr_ compute_attr
Definition compute_policy.hpp:76

gpu::xetla::group::compute_policy_unaligned_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe >::perf_tuning_knob
perf_tuning_knob_ perf_tuning_knob
Definition compute_policy.hpp:77

gpu::xetla::group::compute_policy_unaligned_xmx
Compute policy for unaligned shape and xmx engine.
Definition compute_policy.hpp:70