mirror of
https://gitee.com/openharmony/third_party_mesa3d
synced 2025-02-17 08:27:39 +00:00
tu,ir3: Implement VK_KHR_shader_integer_dot_product
- gen4 - has dp4acc and dp2acc, dp4acc is used to implement 4x8 dot product. - gen3 - has dp2acc, in OpenCL blob uses dp2acc for dot product on both get3 and gen4. - gen2 - unknown, lower everything. - gen1 - no dp2acc, lower everything. OpenCL blob doesn't advertise cl_qcom_dot_product8 but still generates code for it. The assembly is more verbose and uses yet to be documented mad32.u16 instruction. Passes: dEQP-VK.spirv_assembly.instruction.compute.opsdotkhr.* dEQP-VK.spirv_assembly.instruction.compute.opudotkhr.* dEQP-VK.spirv_assembly.instruction.compute.opsudotkhr.* dEQP-VK.spirv_assembly.instruction.compute.opsdotaccsatkhr.* dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.* dEQP-VK.spirv_assembly.instruction.compute.opsudotaccsatkhr.* Only packed 4x8 unsigned and mixed versions are accelerated. However in theory we should be able to do better for signed version than current NIR lowering. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13986>
This commit is contained in:
parent
e1f89a1da2
commit
d77bfc117c
@ -493,7 +493,7 @@ Khronos extensions that are not part of any Vulkan version:
|
||||
VK_KHR_pipeline_executable_properties DONE (anv, radv, tu)
|
||||
VK_KHR_push_descriptor DONE (anv, lvp, radv, tu)
|
||||
VK_KHR_shader_clock DONE (anv, radv)
|
||||
VK_KHR_shader_integer_dot_product DONE (radv)
|
||||
VK_KHR_shader_integer_dot_product DONE (anv, radv, tu)
|
||||
VK_KHR_shader_non_semantic_info DONE (anv, radv)
|
||||
VK_KHR_shader_subgroup_uniform_control_flow DONE (anv, radv)
|
||||
VK_KHR_shader_terminate_invocation DONE (anv, radv, tu)
|
||||
|
@ -130,6 +130,9 @@ struct fd_dev_info {
|
||||
|
||||
bool has_getfiberid;
|
||||
|
||||
bool has_dp2acc;
|
||||
bool has_dp4acc;
|
||||
|
||||
struct {
|
||||
uint32_t RB_UNKNOWN_8E04_blit;
|
||||
uint32_t PC_POWER_CNTL;
|
||||
|
@ -224,6 +224,7 @@ a6xx_gen2 = dict(
|
||||
has_z24uint_s8uint = True,
|
||||
indirect_draw_wfm_quirk = True,
|
||||
depth_bounds_require_depth_test_quirk = True, # TODO: check if true
|
||||
has_dp2acc = False, # TODO: check if true
|
||||
magic = dict(
|
||||
TPL1_DBG_ECO_CNTL = 0,
|
||||
),
|
||||
@ -243,6 +244,7 @@ a6xx_gen3 = dict(
|
||||
has_sample_locations = True,
|
||||
has_ccu_flush_bug = True,
|
||||
has_8bpp_ubwc = False,
|
||||
has_dp2acc = True,
|
||||
magic = dict(
|
||||
# this seems to be a chicken bit that fixes cubic filtering:
|
||||
TPL1_DBG_ECO_CNTL = 0x1000000,
|
||||
@ -266,6 +268,8 @@ a6xx_gen4 = dict(
|
||||
has_lpac = True,
|
||||
has_shading_rate = True,
|
||||
has_getfiberid = True,
|
||||
has_dp2acc = True,
|
||||
has_dp4acc = True,
|
||||
magic = dict(
|
||||
TPL1_DBG_ECO_CNTL = 0x5008000,
|
||||
),
|
||||
|
@ -182,6 +182,8 @@ static const nir_shader_compiler_options options_a6xx = {
|
||||
.lower_uniforms_to_ubo = true,
|
||||
.lower_device_index_to_zero = true,
|
||||
.use_scoped_barrier = true,
|
||||
.has_udot_4x8 = true,
|
||||
.has_sudot_4x8 = true,
|
||||
};
|
||||
|
||||
struct ir3_compiler *
|
||||
@ -212,6 +214,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
||||
|
||||
compiler->max_variable_workgroup_size = 1024;
|
||||
|
||||
const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id);
|
||||
|
||||
if (compiler->gen >= 6) {
|
||||
compiler->samgq_workaround = true;
|
||||
/* a6xx split the pipeline state into geometry and fragment state, in
|
||||
@ -241,14 +245,14 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
||||
/* TODO: implement private memory on earlier gen's */
|
||||
compiler->has_pvtmem = true;
|
||||
|
||||
compiler->tess_use_shared =
|
||||
fd_dev_info(compiler->dev_id)->a6xx.tess_use_shared;
|
||||
compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;
|
||||
|
||||
compiler->storage_16bit =
|
||||
fd_dev_info(compiler->dev_id)->a6xx.storage_16bit;
|
||||
compiler->storage_16bit = dev_info->a6xx.storage_16bit;
|
||||
|
||||
compiler->has_getfiberid =
|
||||
fd_dev_info(compiler->dev_id)->a6xx.has_getfiberid;
|
||||
compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;
|
||||
|
||||
compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
|
||||
compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
|
||||
} else {
|
||||
compiler->max_const_pipeline = 512;
|
||||
compiler->max_const_geom = 512;
|
||||
@ -262,8 +266,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
||||
}
|
||||
|
||||
if (compiler->gen >= 6) {
|
||||
compiler->reg_size_vec4 =
|
||||
fd_dev_info(compiler->dev_id)->a6xx.reg_size_vec4;
|
||||
compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
|
||||
} else if (compiler->gen >= 4) {
|
||||
/* On a4xx-a5xx, using r24.x and above requires using the smallest
|
||||
* threadsize.
|
||||
@ -309,6 +312,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
||||
|
||||
if (compiler->gen >= 6) {
|
||||
compiler->nir_options = options_a6xx;
|
||||
compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;
|
||||
compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc;
|
||||
} else {
|
||||
compiler->nir_options = options;
|
||||
}
|
||||
|
@ -172,6 +172,9 @@ struct ir3_compiler {
|
||||
/* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
|
||||
uint32_t max_variable_workgroup_size;
|
||||
|
||||
bool has_dp2acc;
|
||||
bool has_dp4acc;
|
||||
|
||||
/* Type to use for 1b nir bools: */
|
||||
type_t bool_type;
|
||||
};
|
||||
|
@ -289,6 +289,76 @@ resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
|
||||
return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
|
||||
}
|
||||
|
||||
static void
|
||||
emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu,
|
||||
struct ir3_instruction **dst,
|
||||
struct ir3_instruction **src)
|
||||
{
|
||||
struct ir3_instruction *accumulator = NULL;
|
||||
if (alu->op == nir_op_udot_4x8_uadd_sat) {
|
||||
accumulator = create_immed(ctx->block, 0);
|
||||
} else {
|
||||
accumulator = src[2];
|
||||
}
|
||||
|
||||
dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
|
||||
|
||||
if (alu->op == nir_op_udot_4x8_uadd ||
|
||||
alu->op == nir_op_udot_4x8_uadd_sat) {
|
||||
dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
|
||||
} else {
|
||||
dst[0]->cat3.signedness = IR3_SRC_MIXED;
|
||||
}
|
||||
|
||||
/* For some reason (sat) doesn't work in unsigned case so
|
||||
* we have to emulate it.
|
||||
*/
|
||||
if (alu->op == nir_op_udot_4x8_uadd_sat) {
|
||||
dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
|
||||
dst[0]->flags |= IR3_INSTR_SAT;
|
||||
} else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
|
||||
dst[0]->flags |= IR3_INSTR_SAT;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
|
||||
struct ir3_instruction **dst,
|
||||
struct ir3_instruction **src)
|
||||
{
|
||||
int signedness;
|
||||
if (alu->op == nir_op_udot_4x8_uadd ||
|
||||
alu->op == nir_op_udot_4x8_uadd_sat) {
|
||||
signedness = IR3_SRC_UNSIGNED;
|
||||
} else {
|
||||
signedness = IR3_SRC_MIXED;
|
||||
}
|
||||
|
||||
struct ir3_instruction *accumulator = NULL;
|
||||
if (alu->op == nir_op_udot_4x8_uadd_sat ||
|
||||
alu->op == nir_op_sudot_4x8_iadd_sat) {
|
||||
accumulator = create_immed(ctx->block, 0);
|
||||
} else {
|
||||
accumulator = src[2];
|
||||
}
|
||||
|
||||
dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
|
||||
dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
|
||||
dst[0]->cat3.signedness = signedness;
|
||||
|
||||
dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, dst[0], 0);
|
||||
dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
|
||||
dst[0]->cat3.signedness = signedness;
|
||||
|
||||
if (alu->op == nir_op_udot_4x8_uadd_sat) {
|
||||
dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
|
||||
dst[0]->flags |= IR3_INSTR_SAT;
|
||||
} else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
|
||||
dst[0] = ir3_ADD_S(ctx->block, dst[0], 0, src[2], 0);
|
||||
dst[0]->flags |= IR3_INSTR_SAT;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||
{
|
||||
@ -744,6 +814,31 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||
dst[0] = ir3_BFREV_B(b, src[0], 0);
|
||||
break;
|
||||
|
||||
case nir_op_uadd_sat:
|
||||
dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
|
||||
dst[0]->flags |= IR3_INSTR_SAT;
|
||||
break;
|
||||
case nir_op_iadd_sat:
|
||||
dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
|
||||
dst[0]->flags |= IR3_INSTR_SAT;
|
||||
break;
|
||||
|
||||
case nir_op_udot_4x8_uadd:
|
||||
case nir_op_udot_4x8_uadd_sat:
|
||||
case nir_op_sudot_4x8_iadd:
|
||||
case nir_op_sudot_4x8_iadd_sat: {
|
||||
if (ctx->compiler->has_dp4acc) {
|
||||
emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src);
|
||||
} else if (ctx->compiler->has_dp2acc) {
|
||||
emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src);
|
||||
} else {
|
||||
ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
|
||||
nir_op_infos[alu->op].name);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
ir3_context_error(ctx, "Unhandled ALU op: %s\n",
|
||||
nir_op_infos[alu->op].name);
|
||||
|
@ -157,6 +157,7 @@ get_device_extensions(const struct tu_physical_device *device,
|
||||
.KHR_driver_properties = true,
|
||||
.KHR_separate_depth_stencil_layouts = true,
|
||||
.KHR_buffer_device_address = true,
|
||||
.KHR_shader_integer_dot_product = true,
|
||||
#ifndef TU_USE_KGSL
|
||||
.KHR_timeline_semaphore = true,
|
||||
#endif
|
||||
@ -790,6 +791,12 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
|
||||
features->computeFullSubgroups = true;
|
||||
break;
|
||||
}
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR: {
|
||||
VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *features =
|
||||
(VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *)ext;
|
||||
features->shaderIntegerDotProduct = true;
|
||||
break;
|
||||
};
|
||||
|
||||
default:
|
||||
break;
|
||||
@ -1159,6 +1166,48 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
|
||||
props->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
|
||||
break;
|
||||
}
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR: {
|
||||
VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *props =
|
||||
(VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *)ext;
|
||||
|
||||
props->integerDotProduct8BitUnsignedAccelerated = false;
|
||||
props->integerDotProduct8BitSignedAccelerated = false;
|
||||
props->integerDotProduct8BitMixedSignednessAccelerated = false;
|
||||
props->integerDotProduct4x8BitPackedUnsignedAccelerated =
|
||||
pdevice->info->a6xx.has_dp2acc;
|
||||
/* TODO: we should be able to emulate 4x8BitPackedSigned fast enough */
|
||||
props->integerDotProduct4x8BitPackedSignedAccelerated = false;
|
||||
props->integerDotProduct4x8BitPackedMixedSignednessAccelerated =
|
||||
pdevice->info->a6xx.has_dp2acc;
|
||||
props->integerDotProduct16BitUnsignedAccelerated = false;
|
||||
props->integerDotProduct16BitSignedAccelerated = false;
|
||||
props->integerDotProduct16BitMixedSignednessAccelerated = false;
|
||||
props->integerDotProduct32BitUnsignedAccelerated = false;
|
||||
props->integerDotProduct32BitSignedAccelerated = false;
|
||||
props->integerDotProduct32BitMixedSignednessAccelerated = false;
|
||||
props->integerDotProduct64BitUnsignedAccelerated = false;
|
||||
props->integerDotProduct64BitSignedAccelerated = false;
|
||||
props->integerDotProduct64BitMixedSignednessAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated =
|
||||
pdevice->info->a6xx.has_dp2acc;
|
||||
/* TODO: we should be able to emulate Saturating4x8BitPackedSigned fast enough */
|
||||
props->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated =
|
||||
pdevice->info->a6xx.has_dp2acc;
|
||||
props->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false;
|
||||
props->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
|
Loading…
x
Reference in New Issue
Block a user