llmnpc - llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h

Path: llmnpc / llama.cpp / ggml / src / ggml-hexagon / htp / htp-msg.h (raw)
  1#ifndef HTP_MSG_H
  2#define HTP_MSG_H
  3
  4#include <assert.h>
  5
// ggml-common.h must be included prior to this header
  7
  8// Mask to enable various stages of the Ops.
  9// Used for debugging and profiling.
 10enum {
 11    HTP_OPMASK_QUEUE    = (1 << 0),  // Enable Queueing (ie calls into the DSP)
 12    HTP_OPMASK_QUANTIZE = (1 << 1),  // Enable Quantize
 13    HTP_OPMASK_COMPUTE  = (1 << 2),  // Enable Compute
 14};
 15
 16// Op flags
 17enum {
 18    HTP_OPFLAGS_SKIP_QUANTIZE = (1 << 0),  // Skip dynamic quantization (reuse quantized tensors)
 19    HTP_OPFLAGS_SKIP_COMPUTE  = (1 << 1),  // Skip actual computation (used for profiling)
 20    HTP_OPFLAGS_EARLY_WAKEUP  = (1 << 2)   // Send early wakeup notification
 21};
 22
 23enum htp_status {
 24    HTP_STATUS_OK             = 1,
 25    HTP_STATUS_INTERNAL_ERR   = 2,
 26    HTP_STATUS_NO_SUPPORT     = 3,
 27    HTP_STATUS_INVAL_PARAMS   = 4,
 28    HTP_STATUS_VTCM_TOO_SMALL = 5,
 29};
 30
 31// The values must match the ggml_type.
 32// Duplicated here because we can't include full ggml.h in the htp build.
 33// We have some static_asserts in the cpp code to ensure things are in sync.
 34enum htp_data_type {
 35    HTP_TYPE_F32   = 0,
 36    HTP_TYPE_F16   = 1,
 37    HTP_TYPE_Q4_0  = 2,
 38    HTP_TYPE_Q8_0  = 8,
 39    HTP_TYPE_I32   = 26,
 40    HTP_TYPE_I64   = 27,
 41    HTP_TYPE_MXFP4 = 39,
 42    HTP_TYPE_COUNT
 43};
 44
 45// Do not reorder first 4 (used as an index)
 46enum htp_op {
 47    HTP_OP_MUL = 0,
 48    HTP_OP_ADD = 1,
 49    HTP_OP_SUB = 2,
 50    HTP_OP_DIV = 3,
 51    HTP_OP_MUL_MAT,
 52    HTP_OP_MUL_MAT_ID,
 53    HTP_OP_RMS_NORM,
 54    HTP_OP_UNARY_SILU,
 55    HTP_OP_UNARY_GELU,
 56    HTP_OP_GLU_SWIGLU,
 57    HTP_OP_GLU_SWIGLU_OAI,
 58    HTP_OP_GLU_GEGLU,
 59    HTP_OP_SOFTMAX,
 60    HTP_OP_ADD_ID,
 61    HTP_OP_ROPE,
 62    HTP_OP_FLASH_ATTN_EXT,
 63    HTP_OP_SET_ROWS,
 64    HTP_OP_GET_ROWS,
 65    HTP_OP_SCALE,
 66    HTP_OP_CPY,
 67    HTP_OP_ARGSORT,
 68    HTP_OP_SQR,
 69    HTP_OP_SQRT,
 70    HTP_OP_SUM_ROWS,
 71    INVALID
 72};
 73
 74static inline size_t htp_t_block_size(uint32_t t) {
 75    switch (t) {
 76        case HTP_TYPE_F32:
 77            return 1;
 78        case HTP_TYPE_F16:
 79            return 1;
 80        case HTP_TYPE_Q4_0:
 81            return QK4_0;
 82        case HTP_TYPE_Q8_0:
 83            return QK8_0;
 84        case HTP_TYPE_MXFP4:
 85            return QK_MXFP4;
 86        default:
 87            assert(0 && "unsupported HTP data type");
 88    }
 89    return 0;
 90}
 91
 92static inline size_t htp_type_nbytes(uint32_t t) {
 93    switch (t) {
 94        case HTP_TYPE_F32:
 95            return 4;
 96        case HTP_TYPE_F16:
 97            return 2;
 98        case HTP_TYPE_Q4_0:
 99            return sizeof(block_q4_0);
100        case HTP_TYPE_Q8_0:
101            return sizeof(block_q8_0);
102        case HTP_TYPE_MXFP4:
103            return sizeof(block_mxfp4);
104        default:
105            assert(0 && "unsupported HTP data type");
106    }
107    return 0;
108}
109
// Internal types

// 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q4_0x4x2 256

// 4x Q8_0 blocks concat with next 4x Q8_0 blocks
#define QK_Q8_0x4x2 256

// 4x MXFP4 blocks concat with next 4x MXFP4 blocks
#define QK_MXFP4x4x2 256
114
// Number of dimensions described by an htp_tensor
#define HTP_MAX_DIMS 4

// Tensor descriptor embedded in request messages.
// Ten 32-bit words in total.
struct htp_tensor {
    // Buffer offset in the messages, and data pointer on the NSP
    uint32_t data;

    // Data type
    uint32_t type;

    // Number of elements
    uint32_t ne[HTP_MAX_DIMS];

    // Stride in bytes (see ggml.h ggml_tensor)
    uint32_t nb[HTP_MAX_DIMS];
};
123
124#define HTP_MAX_OP_PARAMS 64
125
126struct htp_general_req {
127    uint32_t op;  // GGML/HTP Op
128    int32_t  op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
129    // Params for the op, e.g. epsilon of RMS norm
130    uint32_t flags;          // Request flags
131
132    struct htp_tensor src0;  // Input0 tensor
133    struct htp_tensor src1;  // Input1 tensor
134    struct htp_tensor src2;  // Input2 tensor
135    struct htp_tensor src3;  // Input3 tensor
136    struct htp_tensor src4;  // Input4 tensor
137    struct htp_tensor dst;   // Output tensor
138
139    // should be multiple of 64 bytes (cacheline)
140};
141
// General response message: five 32-bit words followed by padding, 64 bytes in total.
struct htp_general_rsp {
    // GGML/HTP Op
    uint32_t op;

    // HTP_STATUS_...
    uint32_t status;

    // Profiling counters, all per request
    uint32_t prof_usecs;   // Number of usec
    uint32_t prof_cycles;  // Number of cycles
    uint32_t prof_pkts;    // Number of instruction packets

    // Pad to 64 bytes (64 minus the five words above = 44)
    uint8_t unused[64 - 5 * sizeof(uint32_t)];
};
150
// Maximum message size: the size of the general request struct
#define HTP_MAX_MESSAGE_SIZE (sizeof(struct htp_general_req))

// Upper bound on packet buffers
#define HTP_MAX_PACKET_BUFFERS 8
153
154#endif /* HTP_MSG_H */