LLVM
8.0.1
|
#include "llvm/MC/SubtargetFeature.h"
#include <cstddef>
#include <cstdint>
#include "llvm/Support/Debug.h"
Go to the source code of this file.
Classes | |
struct | hsa_dim3_s |
struct | hsa_ext_control_directives_s |
The hsa_ext_control_directives_t specifies the values for the HSAIL control directives. More... | |
struct | amd_kernel_code_s |
AMD Kernel Code Object (amd_kernel_code_t). More... | |
Macros | |
#define | AMD_HSA_BITS_SET(dst, mask, val) |
#define | AMD_HSA_BITS_GET(src, mask) ((src & mask) >> mask ## _SHIFT) \ |
Typedefs | |
typedef uint8_t | hsa_powertwo8_t |
typedef uint32_t | hsa_ext_code_kind_t |
typedef uint8_t | hsa_ext_brig_profile8_t |
typedef uint8_t | hsa_ext_brig_machine_model8_t |
typedef uint64_t | hsa_ext_control_directive_present64_t |
typedef uint16_t | hsa_ext_exception_kind16_t |
typedef uint32_t | hsa_ext_code_kind32_t |
typedef struct hsa_dim3_s | hsa_dim3_t |
typedef uint32_t | amd_code_version32_t |
The version of the amd_*_code_t struct. More... | |
typedef uint64_t | amd_compute_pgm_resource_register64_t |
Shader program settings for CS. More... | |
typedef uint32_t | amd_code_property32_t |
Every amd_*_code_t has the following properties, which are composed of a number of bit fields. More... | |
typedef struct hsa_ext_control_directives_s | hsa_ext_control_directives_t |
The hsa_ext_control_directives_t specifies the values for the HSAIL control directives. More... | |
typedef struct amd_kernel_code_s | amd_kernel_code_t |
AMD Kernel Code Object (amd_kernel_code_t). More... | |
#define AMD_HSA_BITS_GET | ( | src, | |
mask | |||
) | ((src & mask) >> mask ## _SHIFT) \ |
Definition at line 53 of file AMDKernelCodeT.h.
#define AMD_HSA_BITS_SET | ( | dst, | |
mask, | |||
val | |||
) |
Definition at line 48 of file AMDKernelCodeT.h.
Referenced by getElementByteSizeValue().
typedef uint32_t amd_code_property32_t |
Every amd_*_code_t has the following properties, which are composed of a number of bit fields.
Every bit field has a mask (AMD_CODE_PROPERTY_*), bit width (AMD_CODE_PROPERTY_*_WIDTH), and bit shift amount (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
(Note that C bit fields cannot be used, as their layout is implementation-defined in the C standard and so cannot be used to specify an ABI.)
Definition at line 77 of file AMDKernelCodeT.h.
typedef uint32_t amd_code_version32_t |
The version of the amd_*_code_t struct.
Minor versions must be backward compatible.
Definition at line 41 of file AMDKernelCodeT.h.
typedef uint64_t amd_compute_pgm_resource_register64_t |
Shader program settings for CS.
Contains COMPUTE_PGM_RSRC1 and COMPUTE_PGM_RSRC2 registers.
Definition at line 67 of file AMDKernelCodeT.h.
typedef struct amd_kernel_code_s amd_kernel_code_t |
AMD Kernel Code Object (amd_kernel_code_t).
GPU CP uses the AMD Kernel Code Object to set up the hardware to execute the kernel dispatch.
Initial Kernel Register State.
Initial kernel register state will be set up by CP/SPI prior to the start of execution of every wavefront. This is limited by the constraints of the current hardware.
The order of the SGPR registers is defined, but the Finalizer can specify which ones are actually set up in the amd_kernel_code_t object using the enable_sgpr_* bit fields. The register numbers used for enabled registers are dense starting at SGPR0: the first enabled register is SGPR0, the next enabled register is SGPR1 etc.; disabled registers do not have an SGPR number.
The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and apply to all waves of the grid. It is possible to specify more than 16 User SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 are actually initialized. These are then immediately followed by the System SGPRs that are set up by ADC/SPI and can have different values for each wave of the grid dispatch.
SGPR register initial state is defined as follows:
Private Segment Buffer (enable_sgpr_private_segment_buffer): Number of User SGPR registers: 4. V# that can be used, together with Scratch Wave Offset as an offset, to access the Private/Spill/Arg segments using a segment address. It must be set as follows:
Dispatch Ptr (enable_sgpr_dispatch_ptr): Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet for kernel actually executing.
Queue Ptr (enable_sgpr_queue_ptr): Number of User SGPR registers: 2. 64 bit address of AmdQueue object for AQL queue on which the dispatch packet was queued.
Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This is directly copied from the kernargPtr in the dispatch packet. Having CP load it once avoids loading it at the beginning of every wavefront.
Dispatch Id (enable_sgpr_dispatch_id): Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch packet being executed.
Flat Scratch Init (enable_sgpr_flat_scratch_init): Number of User SGPR registers: 2. This is 2 SGPRs.
For CI/VI: The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE to base of memory for scratch for this dispatch. This is the same offset used in computing the Scratch Segment Buffer base address. The value of Scratch Wave Offset must be added by the kernel code and moved to SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
The second SGPR is 32 bit byte size of a single work-item's scratch memory usage. This is directly loaded from the dispatch packet Private Segment Byte Size and rounded up to a multiple of DWORD.
The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in flat memory instructions. Having CP load it once avoids loading it at the beginning of every wavefront.
For PI: This is the 64 bit base address of the scratch backing memory allocated by CP for this dispatch.
Private Segment Size (enable_sgpr_private_segment_size): Number of User SGPR registers: 1. The 32 bit byte size of a single work-item's scratch memory allocation. This is the value from the dispatch packet Private Segment Byte Size, rounded up by CP to a multiple of DWORD.
Having CP load it once avoids loading it at the beginning of every wavefront.
Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): Number of User SGPR registers: 1. 32 bit count of the number of work-groups in the X dimension for the grid being executed. Computed from the fields in the HsaDispatchPacket as ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): Number of User SGPR registers: 1. 32 bit count of the number of work-groups in the Y dimension for the grid being executed. Computed from the fields in the HsaDispatchPacket as ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
Only initialized if <16 previous SGPRs initialized.
Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): Number of User SGPR registers: 1. 32 bit count of the number of work-groups in the Z dimension for the grid being executed. Computed from the fields in the HsaDispatchPacket as ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
Only initialized if <16 previous SGPRs initialized.
Work-Group Id X (enable_sgpr_workgroup_id_x): Number of System SGPR registers: 1. 32 bit work group id in X dimension of grid for wavefront. Always present.
Work-Group Id Y (enable_sgpr_workgroup_id_y): Number of System SGPR registers: 1. 32 bit work group id in Y dimension of grid for wavefront.
Work-Group Id Z (enable_sgpr_workgroup_id_z): Number of System SGPR registers: 1. 32 bit work group id in Z dimension of grid for wavefront. If present then Work-group Id Y will also be present.
Work-Group Info (enable_sgpr_workgroup_info): Number of System SGPR registers: 1. {first_wave, 14'b0000, ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
Private Segment Wave Byte Offset (enable_sgpr_private_segment_wave_byte_offset): Number of System SGPR registers: 1. 32 bit byte offset from base of dispatch scratch base. Must be used as an offset with Private/Spill/Arg segment address when using Scratch Segment Buffer. It must be added to Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
The order of the VGPR registers is defined, but the Finalizer can specify which ones are actually set up in the amd_kernel_code_t object using the enable_vgpr_* bit fields. The register numbers used for enabled registers are dense starting at VGPR0: the first enabled register is VGPR0, the next enabled register is VGPR1 etc.; disabled registers do not have a VGPR number.
VGPR register initial state is defined as follows:
Work-Item Id X (always initialized): Number of registers: 1. 32 bit work item id in X dimension of work-group for wavefront lane.
Work-Item Id Y (enable_vgpr_workitem_id > 0): Number of registers: 1. 32 bit work item id in Y dimension of work-group for wavefront lane.
Work-Item Id Z (enable_vgpr_workitem_id > 1): Number of registers: 1. 32 bit work item id in Z dimension of work-group for wavefront lane.
The setting of registers is being done by existing GPU hardware as follows: 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data registers. 2) Work-group Id registers X, Y, Z are set by SPI which supports any combination including none. 3) Scratch Wave Offset is also set by SPI which is why its value cannot be added into the value Flat Scratch Offset which would avoid the Finalizer generated prolog having to do the add. 4) The VGPRs are set by SPI which only supports specifying either (X), (X, Y) or (X, Y, Z).
Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so they can be moved as a 64 bit value to the hardware required SGPRn-3 and SGPRn-4 respectively using the Finalizer "FLAT_SCRATCH" Register.
The global segment can be accessed either using flat operations or buffer operations. If buffer operations are used then the Global Buffer used to access HSAIL Global/Readonly/Kernarg (which are combined) segments using a segment address is not passed into the kernel code by CP since its base address is always 0. Instead the Finalizer generates prolog code to initialize 4 SGPRs with a V# that has the following properties, and then uses that in the buffer instructions:
When the Global Buffer is used to access the Kernarg segment, the dispatch packet kernArgPtr must be added to a kernarg segment address before using this V#. Alternatively scalar loads can be used if the kernarg offset is uniform, as the kernarg segment is constant for the duration of the kernel execution.
typedef struct hsa_dim3_s hsa_dim3_t |
typedef uint8_t hsa_ext_brig_machine_model8_t |
Definition at line 28 of file AMDKernelCodeT.h.
typedef uint8_t hsa_ext_brig_profile8_t |
Definition at line 27 of file AMDKernelCodeT.h.
typedef uint32_t hsa_ext_code_kind32_t |
Definition at line 31 of file AMDKernelCodeT.h.
typedef uint32_t hsa_ext_code_kind_t |
Definition at line 26 of file AMDKernelCodeT.h.
typedef uint64_t hsa_ext_control_directive_present64_t |
Definition at line 29 of file AMDKernelCodeT.h.
typedef struct hsa_ext_control_directives_s hsa_ext_control_directives_t |
The hsa_ext_control_directives_t specifies the values for the HSAIL control directives.
These control how the finalizer generates code. This struct is used both as an argument to hsaFinalizeKernel to specify values for the control directives, and is used in HsaKernelCode to record the values of the control directives that the finalizer used when generating the code which either came from the finalizer argument or explicit HSAIL control directives. See the definition of the control directives in HSA Programmer's Reference Manual which also defines how the values specified as finalizer arguments have to agree with the control directives in the HSAIL code.
typedef uint16_t hsa_ext_exception_kind16_t |
Definition at line 30 of file AMDKernelCodeT.h.
typedef uint8_t hsa_powertwo8_t |
Definition at line 25 of file AMDKernelCodeT.h.
Definition at line 78 of file AMDKernelCodeT.h.
enum amd_code_version_t |
Enumerator | |
---|---|
AMD_CODE_VERSION_MAJOR | |
AMD_CODE_VERSION_MINOR |
Definition at line 42 of file AMDKernelCodeT.h.
The values used to define the number of bytes to use for the swizzle element size.
Enumerator | |
---|---|
AMD_ELEMENT_2_BYTES | |
AMD_ELEMENT_4_BYTES | |
AMD_ELEMENT_8_BYTES | |
AMD_ELEMENT_16_BYTES |
Definition at line 58 of file AMDKernelCodeT.h.