llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

#ifndef HTP_MSG_H
#define HTP_MSG_H

#include <assert.h>

// ggml-common.h must be included prio to this header

// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum {
    HTP_OPMASK_QUEUE    = (1 << 0),  // Enable Queueing (ie calls into the DSP)
    HTP_OPMASK_QUANTIZE = (1 << 1),  // Enable Quantize
    HTP_OPMASK_COMPUTE  = (1 << 2),  // Enable Compute
};

// Op flags
enum {
    HTP_OPFLAGS_SKIP_QUANTIZE = (1 << 0),  // Skip dynamic quantization (reuse quantized tensors)
    HTP_OPFLAGS_SKIP_COMPUTE  = (1 << 1),  // Skip actual computation (used for profiling)
    HTP_OPFLAGS_EARLY_WAKEUP  = (1 << 2)   // Send early wakeup notification
};

enum htp_status {
    HTP_STATUS_OK             = 1,
    HTP_STATUS_INTERNAL_ERR   = 2,
    HTP_STATUS_NO_SUPPORT     = 3,
    HTP_STATUS_INVAL_PARAMS   = 4,
    HTP_STATUS_VTCM_TOO_SMALL = 5,
};

// The values must match the ggml_type.
// Duplicated here because we can't include full ggml.h in the htp build.
// We have some static_asserts in the cpp code to ensure things are in sync.
enum htp_data_type {
    HTP_TYPE_F32   = 0,
    HTP_TYPE_F16   = 1,
    HTP_TYPE_Q4_0  = 2,
    HTP_TYPE_Q8_0  = 8,
    HTP_TYPE_I32   = 26,
    HTP_TYPE_I64   = 27,
    HTP_TYPE_MXFP4 = 39,
    HTP_TYPE_COUNT
};

// Do not reorder first 4 (used as an index)
enum htp_op {
    HTP_OP_MUL = 0,
    HTP_OP_ADD = 1,
    HTP_OP_SUB = 2,
    HTP_OP_DIV = 3,
    HTP_OP_MUL_MAT,
    HTP_OP_MUL_MAT_ID,
    HTP_OP_RMS_NORM,
    HTP_OP_UNARY_SILU,
    HTP_OP_UNARY_GELU,
    HTP_OP_GLU_SWIGLU,
    HTP_OP_GLU_SWIGLU_OAI,
    HTP_OP_GLU_GEGLU,
    HTP_OP_SOFTMAX,
    HTP_OP_ADD_ID,
    HTP_OP_ROPE,
    HTP_OP_FLASH_ATTN_EXT,
    HTP_OP_SET_ROWS,
    HTP_OP_GET_ROWS,
    HTP_OP_SCALE,
    HTP_OP_CPY,
    HTP_OP_ARGSORT,
    HTP_OP_SQR,
    HTP_OP_SQRT,
    HTP_OP_SUM_ROWS,
    INVALID
};

static inline size_t htp_t_block_size(uint32_t t) {
    switch (t) {
        case HTP_TYPE_F32:
            return 1;
        case HTP_TYPE_F16:
            return 1;
        case HTP_TYPE_Q4_0:
            return QK4_0;
        case HTP_TYPE_Q8_0:
            return QK8_0;
        case HTP_TYPE_MXFP4:
            return QK_MXFP4;
        default:
            assert(0 && "unsupported HTP data type");
    }
    return 0;
}

static inline size_t htp_type_nbytes(uint32_t t) {
    switch (t) {
        case HTP_TYPE_F32:
            return 4;
        case HTP_TYPE_F16:
            return 2;
        case HTP_TYPE_Q4_0:
            return sizeof(block_q4_0);
        case HTP_TYPE_Q8_0:
            return sizeof(block_q8_0);
        case HTP_TYPE_MXFP4:
            return sizeof(block_mxfp4);
        default:
            assert(0 && "unsupported HTP data type");
    }
    return 0;
}

// Internal types
#define QK_Q4_0x4x2  256  // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q8_0x4x2  256  // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
#define QK_MXFP4x4x2 256  // 4x MXFP4 blocks concat with next 4x MXFP4 blocks

#define HTP_MAX_DIMS 4

struct htp_tensor {
    uint32_t data;                // Buffer offset in the messages, and data pointer on the NSP
    uint32_t type;                // Data type
    uint32_t ne[HTP_MAX_DIMS];    // Number of elements
    uint32_t nb[HTP_MAX_DIMS];    // Stride in bytes (see ggml.h ggml_tensor)
};

#define HTP_MAX_OP_PARAMS 64

struct htp_general_req {
    uint32_t op;  // GGML/HTP Op
    int32_t  op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
    // Params for the op, e.g. epsilon of RMS norm
    uint32_t flags;          // Request flags

    struct htp_tensor src0;  // Input0 tensor
    struct htp_tensor src1;  // Input1 tensor
    struct htp_tensor src2;  // Input2 tensor
    struct htp_tensor src3;  // Input3 tensor
    struct htp_tensor src4;  // Input4 tensor
    struct htp_tensor dst;   // Output tensor

    // should be multiple of 64 bytes (cacheline)
};

struct htp_general_rsp {
    uint32_t op;           // GGML/HTP Op
    uint32_t status;       // HTP_STATUS_...
    uint32_t prof_usecs;   // Number of usec per request
    uint32_t prof_cycles;  // Number of cycles per request
    uint32_t prof_pkts;    // Number of instruction packets per request
    uint8_t  unused[44];   // Pad to 64 bytes
};

#define HTP_MAX_MESSAGE_SIZE   sizeof(struct htp_general_req)
#define HTP_MAX_PACKET_BUFFERS 8

#endif /* HTP_MSG_H */