summaryrefslogtreecommitdiff
path: root/llama.cpp/ggml/src/ggml-hexagon/htp/hex-dma.h
blob: d1ddb0ecbf04ce5d05f27e98d79286ce61381d5e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#ifndef HTP_DMA_H
#define HTP_DMA_H

#include <HAP_farf.h>
#include <hexagon_types.h>
#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Destination/source address pair describing one DMA transfer.
// dma_queue_push() copies both addresses into the descriptor and remembers the
// pair in the queue slot; dma_queue_pop() hands the same pair back.
typedef struct {
    void *dst;        // address the transfer writes to
    const void *src;  // address the transfer reads from
} dma_ptr;

// Ring of DMA descriptors plus the bookkeeping used to push and pop them in order.
// The queue is empty when push_idx == pop_idx, and a push is rejected when advancing
// push_idx would make it equal to pop_idx, so one slot always stays unused.
typedef struct {
    hexagon_udma_descriptor_type1_t * desc;  // descriptor array, indexed by push_idx / pop_idx
    hexagon_udma_descriptor_type1_t * tail;  // last descriptor linked so far; the next push is chained after it
    dma_ptr                         * dptr;  // dst/src pointers, one entry per descriptor slot
    uint32_t                          push_idx;  // slot the next push fills
    uint32_t                          pop_idx;   // slot the next pop waits on
    uint32_t                          capacity;  // NOTE(review): presumably the number of slots; not used in this header - confirm in dma_queue_create
    uint32_t                          idx_mask;  // ANDed with an incremented index to wrap it (presumably capacity - 1 - confirm)
} dma_queue;

// Queue lifecycle helpers. Their definitions are not part of this header.
// NOTE(review): presumably create allocates a queue sized for `capacity` entries,
// delete releases it, and flush drains whatever is still pending - confirm against
// the implementation file before relying on any of this.
dma_queue * dma_queue_create(size_t capacity);
void        dma_queue_delete(dma_queue * q);
void        dma_queue_flush(dma_queue * q);

// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead
// but those do not seem to always compile properly.
// Issue release(next):at followed by dmstart(next) for the descriptor at `next`.
// NOTE(review): presumably this starts the DMA engine on the chain beginning at
// `next` - confirm against the Hexagon programmer's reference.
//
// Both statements carry a "memory" clobber: `volatile` alone only orders asm
// statements relative to each other, it does not stop the compiler from moving
// ordinary descriptor stores across them.
static inline void dmstart(void * next) {
    asm volatile(" release(%0):at" : : "r"(next) : "memory");
    asm volatile(" dmstart(%0)" : : "r"(next) : "memory");
}

// Issue release(next):at followed by dmlink(cur, next).
// dma_queue_push() calls this with the queue's current tail as `cur` and the
// freshly filled descriptor as `next`.
// NOTE(review): presumably dmlink appends `next` after `cur` in the engine's
// descriptor chain - confirm against the Hexagon programmer's reference.
//
// The "memory" clobbers keep the compiler from sinking the plain stores that
// fill in `next` below these statements; `volatile` alone does not guarantee that.
static inline void dmlink(void * cur, void * next) {
    asm volatile(" release(%0):at" : : "r"(next) : "memory");
    asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next) : "memory");
}

// Execute the `dmpoll` instruction and return the value it produces.
// The "memory" clobber makes the compiler reload anything it reads from memory
// afterwards, which is what lets dma_queue_pop() re-check descriptor state in a loop.
// NOTE(review): the meaning of the returned word is not visible here - see the
// Hexagon programmer's reference.
static inline unsigned int dmpoll(void) {
    unsigned int status = 0;

    asm volatile(" %0 = dmpoll" : "=r"(status) : : "memory");

    return status;
}

// Execute the `dmwait` instruction and return the value it produces.
// The "memory" clobber prevents the compiler from caching memory contents
// across the instruction.
// NOTE(review): presumably this blocks until the DMA engine is idle - confirm
// against the Hexagon programmer's reference.
static inline unsigned int dmwait(void) {
    unsigned int status = 0;

    asm volatile(" %0 = dmwait" : "=r"(status) : : "memory");

    return status;
}

static inline dma_ptr dma_make_ptr(void *dst, const void *src)
{
    dma_ptr p = { dst, src };
    return p;
}

// Fill the next free descriptor with a 2D transfer and link it after the queue tail.
//
//   q            - queue to push onto
//   dptr         - destination / source addresses of the transfer
//   dst_row_size - stored as the descriptor's dststride
//   src_row_size - stored as the descriptor's srcstride
//   width        - width in bytes. number of bytes to transfer per row (roiwidth)
//   nrows        - number of rows (roiheight)
//
// Returns false, after logging an error, when the queue is full; returns true once
// the descriptor has been linked and push_idx advanced.
//
// NOTE(review): the descriptor fields may be narrower than size_t, so callers
// presumably keep sizes within the descriptor's limits - confirm against the
// hexagon_udma_descriptor_type1_t definition.
static inline bool dma_queue_push(dma_queue * q,
                                  dma_ptr     dptr,
                                  size_t      dst_row_size,
                                  size_t      src_row_size,
                                  size_t      width, // width in bytes. number of bytes to transfer per row
                                  size_t      nrows) {
    const uint32_t slot      = q->push_idx;
    const uint32_t next_slot = (slot + 1) & q->idx_mask;

    // One slot is always left unused so that push_idx == pop_idx means "empty".
    if (next_slot == q->pop_idx) {
        FARF(ERROR, "dma-push: queue full\n");
        return false;
    }

    hexagon_udma_descriptor_type1_t * desc = &q->desc[slot];

    desc->next           = NULL;
    desc->length         = 0;
    desc->desctype       = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
    // The bypass flags depend on the target architecture: both are set from v73 on,
    // only the source flag before that.
#if __HVX_ARCH__ >= 73
    desc->dstbypass      = 1;
    desc->srcbypass      = 1;
#else
    desc->dstbypass      = 0;
    desc->srcbypass      = 1;
#endif
    desc->order          = 0;
    desc->dstate         = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
    desc->src            = (void *) dptr.src;
    desc->dst            = (void *) dptr.dst;
    desc->allocation     = 0;
    desc->padding        = 0;
    desc->roiwidth       = width;
    desc->roiheight      = nrows;
    desc->srcstride      = src_row_size;
    desc->dststride      = dst_row_size;
    desc->srcwidthoffset = 0;
    desc->dstwidthoffset = 0;

    // Remember the caller's pointers so dma_queue_pop() can return them.
    q->dptr[slot] = dptr;

    // Chain the new descriptor after the current tail; it becomes the new tail.
    dmlink(q->tail, desc);
    q->tail = desc;

    // FARF(ERROR, "dma-push: i %u width %u nrows %u dst %p src %p\n", slot, (unsigned) width, (unsigned) nrows, dptr.dst, dptr.src);
    q->push_idx = next_slot;
    return true;
}

// Convenience wrapper around dma_queue_push() that uses the source row size as
// the number of bytes transferred per row.
// Returns whatever dma_queue_push() returns.
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
                                              dma_ptr     dptr,
                                              size_t      dst_row_size,
                                              size_t      src_row_size,
                                              size_t      nrows) {
    const size_t bytes_per_row = src_row_size;

    return dma_queue_push(q, dptr, dst_row_size, src_row_size, bytes_per_row, nrows);
}


// Convenience wrapper around dma_queue_push() that uses the destination row size
// as the number of bytes transferred per row.
// Returns whatever dma_queue_push() returns.
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
                                              dma_ptr     dptr,
                                              size_t      dst_row_size,
                                              size_t      src_row_size,
                                              size_t      nrows) {
    const size_t bytes_per_row = dst_row_size;

    return dma_queue_push(q, dptr, dst_row_size, src_row_size, bytes_per_row, nrows);
}

// Wait for the oldest queued transfer to finish and return its dst/src pair.
//
// Returns a dma_ptr whose members are both NULL when the queue is empty.
// Otherwise busy-polls (dmpoll, then a check of the descriptor's dstate) until
// the descriptor at pop_idx reports COMPLETE, advances pop_idx and returns the
// pair that dma_queue_push() stored for that slot.
static inline dma_ptr dma_queue_pop(dma_queue * q) {
    const uint32_t slot = q->pop_idx;

    if (slot == q->push_idx) {
        // Nothing queued.
        dma_ptr none = { NULL, NULL };
        return none;
    }

    hexagon_udma_descriptor_type1_t * pending = &q->desc[slot];

    // dmpoll() carries a "memory" clobber, so dstate is re-read on every iteration.
    do {
        dmpoll();
        // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
    } while (pending->dstate != HEXAGON_UDMA_DESC_DSTATE_COMPLETE);

    dma_ptr done = q->dptr[slot];

    // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
    q->pop_idx = (slot + 1) & q->idx_mask;
    return done;
}

#ifdef __cplusplus
}  // extern "C"
#endif

#endif /* HTP_DMA_H */