llama.cpp
.devops
nix
apps.nix devshells.nix docker.nix jetson-support.nix nixpkgs-instances.nix package-gguf-py.nix package.nix python-scripts.nix scope.nix sif.nix.github
ISSUE_TEMPLATE
010-bug-compilation.yml 011-bug-results.yml 019-bug-misc.yml 020-enhancement.yml 030-research.yml 040-refactor.yml config.ymlworkflows
bench.yml.disabled build-cache.yml build-cmake-pkg.yml build-linux-cross.yml build.yml check-vendor.yml close-issue.yml copilot-setup-steps.yml docker.yml editorconfig.yml gguf-publish.yml labeler.yml pre-tokenizer-hashes.yml python-check-requirements.yml python-lint.yml python-type-check.yml release.yml server-metal.yml server-webui.yml server.yml update-ops-docs.yml winget.ymlbenches
cmake
arm64-apple-clang.cmake arm64-windows-llvm.cmake build-info.cmake common.cmake download-models.cmake git-vars.cmake license.cmake llama-config.cmake.in llama.pc.in riscv64-spacemit-linux-gnu-gcc.cmake x64-windows-llvm.cmakecommon
jinja
README.md caps.cpp caps.h lexer.cpp lexer.h parser.cpp parser.h runtime.cpp runtime.h string.cpp string.h utils.h value.cpp value.hdocs
multimodal
MobileVLM.md gemma3.md glmedge.md granitevision.md llava.md minicpmo2.6.md minicpmo4.0.md minicpmv2.5.md minicpmv2.6.md minicpmv4.0.md minicpmv4.5.mdops
BLAS.csv CANN.csv CPU.csv CUDA.csv Metal.csv OpenCL.csv SYCL.csv Vulkan.csv WebGPU.csv ZenDNN.csv zDNN.csvexamples
llama.android
app
src
lib
.gitignore build.gradle.kts consumer-rules.pro proguard-rules.promodel-conversion
scripts
causal
compare-embeddings-logits.sh compare-logits.py convert-model.sh modelcard.template run-casual-gen-embeddings-org.py run-converted-model-embeddings-logits.sh run-converted-model.sh run-org-model.pyembedding
compare-embeddings-logits.sh convert-model.sh modelcard.template run-converted-model.sh run-original-model.pyutils
__init__.py check-nmse.py common.py compare_tokens.py create-collection-add-model.sh curl-embedding-server.sh hf-add-model-to-collection.py hf-create-collection.py hf-create-model.py hf-upload-gguf-model.py inspect-converted-model.sh inspect-org-model.py perplexity-gen.sh perplexity-run-simple.sh perplexity-run.sh quantize.sh run-embedding-server.sh semantic_check.py tensor-info.pysycl
CMakeLists.txt README.md build.sh ls-sycl-device.cpp run-llama2.sh test.sh win-build-sycl.bat win-run-llama2.bat win-test.batggml
include
ggml-alloc.h ggml-backend.h ggml-blas.h ggml-cann.h ggml-cpp.h ggml-cpu.h ggml-cuda.h ggml-hexagon.h ggml-metal.h ggml-opencl.h ggml-opt.h ggml-rpc.h ggml-sycl.h ggml-virtgpu.h ggml-vulkan.h ggml-webgpu.h ggml-zdnn.h ggml-zendnn.h ggml.h gguf.hsrc
ggml-cann
CMakeLists.txt acl_tensor.cpp acl_tensor.h aclnn_ops.cpp aclnn_ops.h common.h ggml-cann.cppggml-cpu
CMakeLists.txt arch-fallback.h binary-ops.cpp binary-ops.h common.h ggml-cpu-impl.h ggml-cpu.c ggml-cpu.cpp hbm.cpp hbm.h ops.cpp ops.h quants.c quants.h repack.cpp repack.h simd-mappings.h traits.cpp traits.h unary-ops.cpp unary-ops.h vec.cpp vec.hggml-cuda
template-instances
fattn-mma-f16-instance-ncols1_1-ncols2_16.cu fattn-mma-f16-instance-ncols1_1-ncols2_32.cu fattn-mma-f16-instance-ncols1_1-ncols2_8.cu fattn-mma-f16-instance-ncols1_16-ncols2_1.cu fattn-mma-f16-instance-ncols1_16-ncols2_2.cu fattn-mma-f16-instance-ncols1_16-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_16.cu fattn-mma-f16-instance-ncols1_2-ncols2_32.cu fattn-mma-f16-instance-ncols1_2-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_8.cu fattn-mma-f16-instance-ncols1_32-ncols2_1.cu fattn-mma-f16-instance-ncols1_32-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_16.cu fattn-mma-f16-instance-ncols1_4-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_4.cu fattn-mma-f16-instance-ncols1_4-ncols2_8.cu fattn-mma-f16-instance-ncols1_64-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_2.cu fattn-mma-f16-instance-ncols1_8-ncols2_4.cu fattn-mma-f16-instance-ncols1_8-ncols2_8.cu fattn-tile-instance-dkq112-dv112.cu fattn-tile-instance-dkq128-dv128.cu fattn-tile-instance-dkq256-dv256.cu fattn-tile-instance-dkq40-dv40.cu fattn-tile-instance-dkq576-dv512.cu fattn-tile-instance-dkq64-dv64.cu fattn-tile-instance-dkq72-dv72.cu fattn-tile-instance-dkq80-dv80.cu fattn-tile-instance-dkq96-dv96.cu fattn-vec-instance-f16-f16.cu fattn-vec-instance-f16-q4_0.cu fattn-vec-instance-f16-q4_1.cu fattn-vec-instance-f16-q5_0.cu fattn-vec-instance-f16-q5_1.cu fattn-vec-instance-f16-q8_0.cu fattn-vec-instance-q4_0-f16.cu fattn-vec-instance-q4_0-q4_0.cu fattn-vec-instance-q4_0-q4_1.cu fattn-vec-instance-q4_0-q5_0.cu fattn-vec-instance-q4_0-q5_1.cu fattn-vec-instance-q4_0-q8_0.cu fattn-vec-instance-q4_1-f16.cu fattn-vec-instance-q4_1-q4_0.cu fattn-vec-instance-q4_1-q4_1.cu fattn-vec-instance-q4_1-q5_0.cu fattn-vec-instance-q4_1-q5_1.cu fattn-vec-instance-q4_1-q8_0.cu fattn-vec-instance-q5_0-f16.cu fattn-vec-instance-q5_0-q4_0.cu fattn-vec-instance-q5_0-q4_1.cu fattn-vec-instance-q5_0-q5_0.cu fattn-vec-instance-q5_0-q5_1.cu 
fattn-vec-instance-q5_0-q8_0.cu fattn-vec-instance-q5_1-f16.cu fattn-vec-instance-q5_1-q4_0.cu fattn-vec-instance-q5_1-q4_1.cu fattn-vec-instance-q5_1-q5_0.cu fattn-vec-instance-q5_1-q5_1.cu fattn-vec-instance-q5_1-q8_0.cu fattn-vec-instance-q8_0-f16.cu fattn-vec-instance-q8_0-q4_0.cu fattn-vec-instance-q8_0-q4_1.cu fattn-vec-instance-q8_0-q5_0.cu fattn-vec-instance-q8_0-q5_1.cu fattn-vec-instance-q8_0-q8_0.cu generate_cu_files.py mmf-instance-ncols_1.cu mmf-instance-ncols_10.cu mmf-instance-ncols_11.cu mmf-instance-ncols_12.cu mmf-instance-ncols_13.cu mmf-instance-ncols_14.cu mmf-instance-ncols_15.cu mmf-instance-ncols_16.cu mmf-instance-ncols_2.cu mmf-instance-ncols_3.cu mmf-instance-ncols_4.cu mmf-instance-ncols_5.cu mmf-instance-ncols_6.cu mmf-instance-ncols_7.cu mmf-instance-ncols_8.cu mmf-instance-ncols_9.cu mmq-instance-iq1_s.cu mmq-instance-iq2_s.cu mmq-instance-iq2_xs.cu mmq-instance-iq2_xxs.cu mmq-instance-iq3_s.cu mmq-instance-iq3_xxs.cu mmq-instance-iq4_nl.cu mmq-instance-iq4_xs.cu mmq-instance-mxfp4.cu mmq-instance-q2_k.cu mmq-instance-q3_k.cu mmq-instance-q4_0.cu mmq-instance-q4_1.cu mmq-instance-q4_k.cu mmq-instance-q5_0.cu mmq-instance-q5_1.cu mmq-instance-q5_k.cu mmq-instance-q6_k.cu mmq-instance-q8_0.cuggml-hexagon
htp
CMakeLists.txt act-ops.c argsort-ops.c binary-ops.c cmake-toolchain.cmake cpy-ops.c flash-attn-ops.c get-rows-ops.c hex-dma.c hex-dma.h hex-dump.h hex-fastdiv.h hex-utils.h htp-ctx.h htp-msg.h htp-ops.h htp_iface.idl hvx-arith.h hvx-base.h hvx-copy.h hvx-div.h hvx-dump.h hvx-exp.h hvx-floor.h hvx-inverse.h hvx-reduce.h hvx-scale.h hvx-sigmoid.h hvx-sqrt.h hvx-types.h hvx-utils.h main.c matmul-ops.c rope-ops.c set-rows-ops.c softmax-ops.c sum-rows-ops.c unary-ops.c worker-pool.c worker-pool.hggml-metal
CMakeLists.txt ggml-metal-common.cpp ggml-metal-common.h ggml-metal-context.h ggml-metal-context.m ggml-metal-device.cpp ggml-metal-device.h ggml-metal-device.m ggml-metal-impl.h ggml-metal-ops.cpp ggml-metal-ops.h ggml-metal.cpp ggml-metal.metalggml-opencl
kernels
add.cl add_id.cl argsort.cl clamp.cl concat.cl conv2d.cl conv2d_f16_f32.cl cpy.cl cvt.cl diag_mask_inf.cl div.cl embed_kernel.py expm1.cl fill.cl flash_attn_f16.cl flash_attn_f32.cl flash_attn_f32_f16.cl gelu.cl gemm_moe_mxfp4_f32.cl gemv_moe_mxfp4_f32.cl gemv_noshuffle.cl gemv_noshuffle_general.cl gemv_noshuffle_general_q8_0_f32.cl get_rows.cl glu.cl group_norm.cl im2col_f16.cl im2col_f32.cl mean.cl mul.cl mul_mat_Ab_Bi_8x4.cl mul_mat_f16_f32.cl mul_mm_f16_f32_kq_kqv.cl mul_mm_f16_f32_l4_lm.cl mul_mm_f32_f32_l4_lm.cl mul_mm_q6_k_f32_l4_lm.cl mul_mm_q8_0_f32_8x4.cl mul_mm_q8_0_f32_l4_lm.cl mul_mv_f16_f16.cl mul_mv_f16_f32.cl mul_mv_f16_f32_1row.cl mul_mv_f16_f32_l4.cl mul_mv_f32_f32.cl mul_mv_id_mxfp4_f32.cl mul_mv_id_mxfp4_f32_flat.cl mul_mv_id_q4_0_f32_8x_flat.cl mul_mv_id_q8_0_f32.cl mul_mv_id_q8_0_f32_flat.cl mul_mv_mxfp4_f32.cl mul_mv_mxfp4_f32_flat.cl mul_mv_q4_0_f32.cl mul_mv_q4_0_f32_1d_16x_flat.cl mul_mv_q4_0_f32_1d_8x_flat.cl mul_mv_q4_0_f32_8x_flat.cl mul_mv_q4_0_f32_v.cl mul_mv_q4_k_f32.cl mul_mv_q6_k_f32.cl mul_mv_q6_k_f32_flat.cl mul_mv_q8_0_f32.cl mul_mv_q8_0_f32_flat.cl norm.cl pad.cl relu.cl repeat.cl rms_norm.cl rope.cl scale.cl set_rows.cl sigmoid.cl silu.cl softmax_4_f16.cl softmax_4_f32.cl softmax_f16.cl softmax_f32.cl softplus.cl solve_tri.cl sqr.cl sqrt.cl ssm_conv.cl sub.cl sum_rows.cl tanh.cl transpose.cl tri.cl tsembd.cl upscale.clggml-sycl
CMakeLists.txt add-id.cpp add-id.hpp backend.hpp binbcast.cpp binbcast.hpp common.cpp common.hpp concat.cpp concat.hpp conv.cpp conv.hpp convert.cpp convert.hpp count-equal.cpp count-equal.hpp cpy.cpp cpy.hpp dequantize.hpp dmmv.cpp dmmv.hpp element_wise.cpp element_wise.hpp gemm.hpp getrows.cpp getrows.hpp ggml-sycl.cpp gla.cpp gla.hpp im2col.cpp im2col.hpp mmq.cpp mmq.hpp mmvq.cpp mmvq.hpp norm.cpp norm.hpp outprod.cpp outprod.hpp pad.cpp pad.hpp pad_reflect_1d.cpp pad_reflect_1d.hpp presets.hpp quantize.hpp quants.hpp repeat_back.cpp repeat_back.hpp roll.cpp roll.hpp rope.cpp rope.hpp set.cpp set.hpp set_rows.cpp set_rows.hpp softmax.cpp softmax.hpp ssm_conv.cpp ssm_conv.hpp sycl_hw.cpp sycl_hw.hpp tsembd.cpp tsembd.hpp vecdotq.hpp wkv.cpp wkv.hppggml-virtgpu
backend
CMakeLists.txt apir_cs_ggml-rpc-back.cpp backend-convert.h backend-dispatched-backend.cpp backend-dispatched-buffer-type.cpp backend-dispatched-buffer.cpp backend-dispatched-device.cpp backend-dispatched.cpp backend-dispatched.gen.h backend-dispatched.h backend-virgl-apir.h backend.cppggml-vulkan
vulkan-shaders
CMakeLists.txt abs.comp acc.comp add.comp add1.comp add_id.comp arange.comp argmax.comp argsort.comp argsort_large.comp ceil.comp clamp.comp concat.comp contig_copy.comp conv2d_dw.comp conv2d_mm.comp conv_transpose_1d.comp copy.comp copy_from_quant.comp copy_to_quant.comp copy_transpose.comp cos.comp count_equal.comp count_experts.comp cumsum.comp cumsum_multipass1.comp cumsum_multipass2.comp dequant_f32.comp dequant_funcs.glsl dequant_funcs_cm2.glsl dequant_head.glsl dequant_iq1_m.comp dequant_iq1_s.comp dequant_iq2_s.comp dequant_iq2_xs.comp dequant_iq2_xxs.comp dequant_iq3_s.comp dequant_iq3_xxs.comp dequant_iq4_nl.comp dequant_iq4_xs.comp dequant_mxfp4.comp dequant_q2_k.comp dequant_q3_k.comp dequant_q4_0.comp dequant_q4_1.comp dequant_q4_k.comp dequant_q5_0.comp dequant_q5_1.comp dequant_q5_k.comp dequant_q6_k.comp dequant_q8_0.comp diag.comp diag_mask_inf.comp div.comp exp.comp fill.comp flash_attn.comp flash_attn_base.glsl flash_attn_cm1.comp flash_attn_cm2.comp flash_attn_mask_opt.comp flash_attn_split_k_reduce.comp floor.comp geglu.comp geglu_erf.comp geglu_quick.comp gelu.comp gelu_erf.comp gelu_quick.comp generic_binary_head.glsl generic_head.glsl generic_unary_head.glsl get_rows.comp get_rows_quant.comp glu_head.glsl glu_main.glsl group_norm.comp hardsigmoid.comp hardswish.comp im2col.comp im2col_3d.comp l2_norm.comp leaky_relu.comp log.comp mul.comp mul_mat_split_k_reduce.comp mul_mat_vec.comp mul_mat_vec_base.glsl mul_mat_vec_iface.glsl mul_mat_vec_iq1_m.comp mul_mat_vec_iq1_s.comp mul_mat_vec_iq2_s.comp mul_mat_vec_iq2_xs.comp mul_mat_vec_iq2_xxs.comp mul_mat_vec_iq3_s.comp mul_mat_vec_iq3_xxs.comp mul_mat_vec_nc.comp mul_mat_vec_p021.comp mul_mat_vec_q2_k.comp mul_mat_vec_q3_k.comp mul_mat_vec_q4_k.comp mul_mat_vec_q5_k.comp mul_mat_vec_q6_k.comp mul_mat_vecq.comp mul_mat_vecq_funcs.glsl mul_mm.comp mul_mm_cm2.comp mul_mm_funcs.glsl mul_mm_id_funcs.glsl mul_mmq.comp mul_mmq_funcs.glsl mul_mmq_shmem_types.glsl multi_add.comp neg.comp norm.comp 
opt_step_adamw.comp opt_step_sgd.comp pad.comp pool2d.comp quantize_q8_1.comp reglu.comp relu.comp repeat.comp repeat_back.comp rms_norm.comp rms_norm_back.comp rms_norm_partials.comp roll.comp rope_funcs.glsl rope_head.glsl rope_multi.comp rope_neox.comp rope_norm.comp rope_params.glsl rope_vision.comp round.comp rte.glsl scale.comp sigmoid.comp silu.comp silu_back.comp sin.comp soft_max.comp soft_max_back.comp soft_max_large1.comp soft_max_large2.comp soft_max_large3.comp soft_max_large_common.glsl softplus.comp solve_tri.comp sqrt.comp square.comp ssm_conv.comp ssm_scan.comp step.comp sub.comp sum_rows.comp sum_rows.glsl swiglu.comp swiglu_oai.comp tanh.comp timestep_embedding.comp topk_argsort.comp topk_moe.comp topk_nary_search.comp tri.comp trunc.comp types.glsl upscale.comp utils.glsl vulkan-shaders-gen.cpp wkv6.comp wkv7.comp xielu.compggml-webgpu
wgsl-shaders
argmax.wgsl argsort.wgsl argsort_merge.wgsl binary.wgsl common_decls.tmpl cpy.tmpl.wgsl cumsum.wgsl embed_wgsl.py flash_attn.wgsl get_rows.tmpl.wgsl glu.tmpl.wgsl memset.wgsl mul_mat.tmpl.wgsl mul_mat_decls.tmpl mul_mat_reg_tile.tmpl.wgsl mul_mat_subgroup_matrix.tmpl.wgsl mul_mat_vec.tmpl.wgsl pad.wgsl rms_norm.wgsl rope.tmpl.wgsl scale.tmpl.wgsl set_rows.wgsl soft_max.tmpl.wgsl sum_rows.wgsl unary.wgslgguf-py
gguf
scripts
gguf_convert_endian.py gguf_dump.py gguf_editor_gui.py gguf_hash.py gguf_new_metadata.py gguf_set_metadata.pygrammars
README.md arithmetic.gbnf c.gbnf chess.gbnf english.gbnf japanese.gbnf json.gbnf json_arr.gbnf list.gbnfmedia
llama0-banner.png llama0-logo.png llama1-banner.png llama1-icon-transparent.png llama1-icon-transparent.svg llama1-icon.png llama1-icon.svg llama1-logo.png llama1-logo.svg matmul.png matmul.svgmodels
templates
Apertus-8B-Instruct.jinja ByteDance-Seed-OSS.jinja CohereForAI-c4ai-command-r-plus-tool_use.jinja CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja GLM-4.6.jinja Kimi-K2-Instruct.jinja Kimi-K2-Thinking.jinja MiMo-VL.jinja MiniMax-M2.jinja Mistral-Small-3.2-24B-Instruct-2506.jinja NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja NVIDIA-Nemotron-Nano-v2.jinja NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja Qwen-QwQ-32B.jinja Qwen-Qwen2.5-7B-Instruct.jinja Qwen-Qwen3-0.6B.jinja Qwen3-Coder.jinja README.md deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja deepseek-ai-DeepSeek-V3.1.jinja fireworks-ai-llama-3-firefunction-v2.jinja google-gemma-2-2b-it.jinja ibm-granite-granite-3.3-2B-Instruct.jinja llama-cpp-deepseek-r1.jinja llama-cpp-lfm2.jinja llama-cpp-rwkv-world.jinja meetkai-functionary-medium-v3.1.jinja meetkai-functionary-medium-v3.2.jinja meta-llama-Llama-3.1-8B-Instruct.jinja meta-llama-Llama-3.2-3B-Instruct.jinja meta-llama-Llama-3.3-70B-Instruct.jinja microsoft-Phi-3.5-mini-instruct.jinja mistralai-Ministral-3-14B-Reasoning-2512.jinja mistralai-Mistral-Nemo-Instruct-2407.jinja moonshotai-Kimi-K2.jinja openai-gpt-oss-120b.jinja unsloth-Apriel-1.5.jinja unsloth-mistral-Devstral-Small-2507.jinja upstage-Solar-Open-100B.jinjarequirements
requirements-all.txt requirements-compare-llama-bench.txt requirements-convert_hf_to_gguf.txt requirements-convert_hf_to_gguf_update.txt requirements-convert_legacy_llama.txt requirements-convert_llama_ggml_to_gguf.txt requirements-convert_lora_to_gguf.txt requirements-gguf_editor_gui.txt requirements-pydantic.txt requirements-server-bench.txt requirements-test-tokenizer-random.txt requirements-tool_bench.txtscripts
bench-models.sh build-info.sh check-requirements.sh compare-commits.sh compare-llama-bench.py compare-logprobs.py create_ops_docs.py debug-test.sh fetch_server_test_models.py gen-authors.sh gen-unicode-data.py get-flags.mk get-hellaswag.sh get-pg.sh get-wikitext-103.sh get-wikitext-2.sh get-winogrande.sh get_chat_template.py hf.sh install-oneapi.bat pr2wt.sh serve-static.js server-bench.py sync-ggml-am.sh sync-ggml.last sync-ggml.sh sync_vendor.py tool_bench.py tool_bench.sh verify-checksum-models.py xxd.cmakesrc
models
afmoe.cpp apertus.cpp arcee.cpp arctic.cpp arwkv7.cpp baichuan.cpp bailingmoe.cpp bailingmoe2.cpp bert.cpp bitnet.cpp bloom.cpp chameleon.cpp chatglm.cpp codeshell.cpp cogvlm.cpp cohere2-iswa.cpp command-r.cpp dbrx.cpp deci.cpp deepseek.cpp deepseek2.cpp dots1.cpp dream.cpp ernie4-5-moe.cpp ernie4-5.cpp exaone-moe.cpp exaone.cpp exaone4.cpp falcon-h1.cpp falcon.cpp gemma-embedding.cpp gemma.cpp gemma2-iswa.cpp gemma3.cpp gemma3n-iswa.cpp glm4-moe.cpp glm4.cpp gpt2.cpp gptneox.cpp granite-hybrid.cpp granite.cpp graph-context-mamba.cpp grok.cpp grovemoe.cpp hunyuan-dense.cpp hunyuan-moe.cpp internlm2.cpp jais.cpp jamba.cpp kimi-linear.cpp lfm2.cpp llada-moe.cpp llada.cpp llama-iswa.cpp llama.cpp maincoder.cpp mamba.cpp mimo2-iswa.cpp minicpm3.cpp minimax-m2.cpp mistral3.cpp models.h modern-bert.cpp mpt.cpp nemotron-h.cpp nemotron.cpp neo-bert.cpp olmo.cpp olmo2.cpp olmoe.cpp openai-moe-iswa.cpp openelm.cpp orion.cpp pangu-embedded.cpp phi2.cpp phi3.cpp plamo.cpp plamo2.cpp plamo3.cpp plm.cpp qwen.cpp qwen2.cpp qwen2moe.cpp qwen2vl.cpp qwen3.cpp qwen35.cpp qwen35moe.cpp qwen3moe.cpp qwen3next.cpp qwen3vl-moe.cpp qwen3vl.cpp refact.cpp rnd1.cpp rwkv6-base.cpp rwkv6.cpp rwkv6qwen2.cpp rwkv7-base.cpp rwkv7.cpp seed-oss.cpp smallthinker.cpp smollm3.cpp stablelm.cpp starcoder.cpp starcoder2.cpp step35-iswa.cpp t5-dec.cpp t5-enc.cpp wavtokenizer-dec.cpp xverse.cpptests
peg-parser
simple-tokenize.cpp simple-tokenize.h test-basic.cpp test-gbnf-generation.cpp test-json-parser.cpp test-json-serialization.cpp test-unicode.cpp tests.htools
cvector-generator
CMakeLists.txt README.md completions.txt cvector-generator.cpp mean.hpp negative.txt pca.hpp positive.txtmtmd
legacy-models
convert_image_encoder_to_gguf.py glmedge-convert-image-encoder-to-gguf.py glmedge-surgery.py llava_surgery.py llava_surgery_v2.py minicpmv-convert-image-encoder-to-gguf.py minicpmv-surgery.pymodels
cogvlm.cpp conformer.cpp glm4v.cpp internvl.cpp kimik25.cpp kimivl.cpp llama4.cpp llava.cpp minicpmv.cpp mobilenetv5.cpp models.h pixtral.cpp qwen2vl.cpp qwen3vl.cpp siglip.cpp whisper-enc.cpp youtuvl.cppserver
public_legacy
colorthemes.css completion.js favicon.ico index-new.html index.html index.js json-schema-to-grammar.mjs loading.html prompt-formats.js style.css system-prompts.js theme-beeninorder.css theme-ketivah.css theme-mangotango.css theme-playground.css theme-polarnight.css theme-snowstorm.csspublic_simplechat
datautils.mjs index.html readme.md simplechat.css simplechat.js simplechat_screens.webp ui.mjstests
unit
test_basic.py test_chat_completion.py test_compat_anthropic.py test_compat_oai_responses.py test_completion.py test_ctx_shift.py test_embedding.py test_infill.py test_lora.py test_rerank.py test_router.py test_security.py test_sleep.py test_slot_save.py test_speculative.py test_template.py test_tokenize.py test_tool_call.py test_vision_api.pywebui
.storybook
ModeWatcherDecorator.svelte TooltipProviderDecorator.svelte main.ts preview.ts vitest.setup.tssrc
lib
components
app
chat
ChatAttachments
ChatAttachmentPreview.svelte ChatAttachmentThumbnailFile.svelte ChatAttachmentThumbnailImage.svelte ChatAttachmentsList.svelte ChatAttachmentsViewAll.svelteChatForm
ChatFormActions
ChatFormActionFileAttachments.svelte ChatFormActionRecord.svelte ChatFormActionSubmit.svelte ChatFormActions.svelteChatMessages
ChatMessage.svelte ChatMessageActions.svelte ChatMessageAssistant.svelte ChatMessageBranchingControls.svelte ChatMessageEditForm.svelte ChatMessageStatistics.svelte ChatMessageSystem.svelte ChatMessageThinkingBlock.svelte ChatMessageUser.svelte ChatMessages.svelteChatScreen
ChatScreen.svelte ChatScreenDragOverlay.svelte ChatScreenHeader.svelte ChatScreenProcessingInfo.sveltedialogs
DialogChatAttachmentPreview.svelte DialogChatAttachmentsViewAll.svelte DialogChatError.svelte DialogChatSettings.svelte DialogConfirmation.svelte DialogConversationSelection.svelte DialogConversationTitleUpdate.svelte DialogEmptyFileAlert.svelte DialogModelInformation.svelte DialogModelNotAvailable.sveltemisc
ActionButton.svelte ActionDropdown.svelte BadgeChatStatistic.svelte BadgeInfo.svelte BadgeModality.svelte CodePreviewDialog.svelte ConversationSelection.svelte CopyToClipboardIcon.svelte KeyboardShortcutInfo.svelte MarkdownContent.svelte RemoveButton.svelte SearchInput.svelte SyntaxHighlightedCode.svelteui
alert-dialog
alert-dialog-action.svelte alert-dialog-cancel.svelte alert-dialog-content.svelte alert-dialog-description.svelte alert-dialog-footer.svelte alert-dialog-header.svelte alert-dialog-overlay.svelte alert-dialog-title.svelte alert-dialog-trigger.svelte index.tscard
card-action.svelte card-content.svelte card-description.svelte card-footer.svelte card-header.svelte card-title.svelte card.svelte index.tsdialog
dialog-close.svelte dialog-content.svelte dialog-description.svelte dialog-footer.svelte dialog-header.svelte dialog-overlay.svelte dialog-title.svelte dialog-trigger.svelte index.tsdropdown-menu
dropdown-menu-checkbox-item.svelte dropdown-menu-content.svelte dropdown-menu-group-heading.svelte dropdown-menu-group.svelte dropdown-menu-item.svelte dropdown-menu-label.svelte dropdown-menu-radio-group.svelte dropdown-menu-radio-item.svelte dropdown-menu-separator.svelte dropdown-menu-shortcut.svelte dropdown-menu-sub-content.svelte dropdown-menu-sub-trigger.svelte dropdown-menu-trigger.svelte index.tspopover
index.ts popover-close.svelte popover-content.svelte popover-portal.svelte popover-trigger.svelte popover.svelteselect
index.ts select-content.svelte select-group-heading.svelte select-group.svelte select-item.svelte select-label.svelte select-scroll-down-button.svelte select-scroll-up-button.svelte select-separator.svelte select-trigger.sveltesheet
index.ts sheet-close.svelte sheet-content.svelte sheet-description.svelte sheet-footer.svelte sheet-header.svelte sheet-overlay.svelte sheet-title.svelte sheet-trigger.sveltesidebar
constants.ts context.svelte.ts index.ts sidebar-content.svelte sidebar-footer.svelte sidebar-group-action.svelte sidebar-group-content.svelte sidebar-group-label.svelte sidebar-group.svelte sidebar-header.svelte sidebar-input.svelte sidebar-inset.svelte sidebar-menu-action.svelte sidebar-menu-badge.svelte sidebar-menu-button.svelte sidebar-menu-item.svelte sidebar-menu-skeleton.svelte sidebar-menu-sub-button.svelte sidebar-menu-sub-item.svelte sidebar-menu-sub.svelte sidebar-menu.svelte sidebar-provider.svelte sidebar-rail.svelte sidebar-separator.svelte sidebar-trigger.svelte sidebar.sveltetable
index.ts table-body.svelte table-caption.svelte table-cell.svelte table-footer.svelte table-head.svelte table-header.svelte table-row.svelte table.svelteconstants
auto-scroll.ts binary-detection.ts default-context.ts floating-ui-constraints.ts icons.ts input-classes.ts latex-protection.ts literal-html.ts localstorage-keys.ts max-bundle-size.ts precision.ts processing-info.ts settings-config.ts supported-file-types.ts table-html-restorer.ts tooltip-config.ts viewport.tsstores
chat.svelte.ts conversations.svelte.ts models.svelte.ts persisted.svelte.ts server.svelte.ts settings.svelte.tsutils
api-headers.ts api-key-validation.ts attachment-display.ts attachment-type.ts audio-recording.ts autoresize-textarea.ts branching.ts browser-only.ts clipboard.ts config-helpers.ts conversation-utils.ts convert-files-to-extra.ts file-preview.ts file-type.ts formatters.ts index.ts is-ime-composing.ts latex-protection.ts modality-file-validation.ts model-names.ts pdf-processing.ts portal-to-body.ts precision.ts process-uploaded-files.ts svg-to-png.ts syntax-highlight-language.ts text-files.ts text.ts webp-to-png.tstests
llama.cpp/ggml/src/ggml-hexagon/htp/main.c
raw
1#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
2#pragma clang diagnostic ignored "-Wunused-function"
3
4#include <HAP_farf.h>
5#include <HAP_perf.h>
6#include <AEEStdErr.h>
7#include <dspqueue.h>
8#include <HAP_compute_res.h>
9#include <HAP_etm_config.h>
10#include <HAP_mem.h>
11#include <HAP_power.h>
12#include <HAP_ps.h>
13#include <qurt.h>
14#include <qurt_thread.h>
15#include <remote.h>
16#include <string.h>
17
18#include "hex-dma.h"
19#include "hex-utils.h"
20
21#define GGML_COMMON_DECL_C
22#include "ggml-common.h"
23#include "htp-ctx.h"
24#include "htp-msg.h"
25#include "htp-ops.h"
26#include "worker-pool.h"
27
28AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
29 struct htp_context * ctx;
30 int err = 0;
31
32 ctx = calloc(1, sizeof(*ctx));
33 if (ctx == NULL) {
34 return AEE_ENOMEMORY;
35 }
36
37 // Use the context structure as a handle
38 *handle = (remote_handle64) ctx;
39
40 // Enable FARF logs
41 HAP_setFARFRuntimeLoggingParams(0xffff, NULL, 0);
42
43 // Set client class
44 {
45 HAP_power_request_t request;
46 memset(&request, 0, sizeof(HAP_power_request_t));
47 request.type = HAP_power_set_apptype;
48 request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
49
50 if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
51 return err;
52 }
53 }
54
55 {
56 HAP_power_request_t request;
57 memset(&request, 0, sizeof(request));
58
59 request.type = HAP_power_set_DCVS_v3;
60 request.dcvs_v3.set_dcvs_enable = TRUE;
61 request.dcvs_v3.dcvs_enable = TRUE;
62 request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
63 request.dcvs_v3.set_bus_params = TRUE;
64 request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
65 request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
66 request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
67 request.dcvs_v3.set_core_params = TRUE;
68 request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX;
69 request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX;
70 request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
71 request.dcvs_v3.set_sleep_disable = TRUE;
72 request.dcvs_v3.sleep_disable = TRUE;
73 if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
74 return err;
75 }
76
77 memset(&request, 0, sizeof(request));
78 request.type = HAP_power_set_HVX;
79 request.hvx.power_up = TRUE;
80 if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
81 return err;
82 }
83 }
84
85 {
86 // Power on HMX
87 HAP_power_request_t request;
88 memset(&request, 0, sizeof(HAP_power_request_t));
89 request.type = HAP_power_set_HMX;
90 request.hmx.power_up = TRUE;
91 FARF(ALWAYS, "Powering HMX on\n");
92 err = HAP_power_set((void *) &ctx, &request);
93 if (err != AEE_SUCCESS) {
94 FARF(ERROR, "Error powering on HMX.");
95 return err;
96 }
97 }
98
99 return AEE_SUCCESS;
100}
101
102AEEResult htp_iface_close(remote_handle64 handle) {
103 struct htp_context * ctx = (struct htp_context *) handle;
104
105 if (!ctx) {
106 return AEE_EBADPARM;
107 }
108
109 if (ctx->queue) {
110 FARF(ERROR, "Closing handle with queue still open");
111 return AEE_EITEMBUSY;
112 }
113
114 free(ctx);
115 return AEE_SUCCESS;
116}
117
118AEEResult htp_iface_enable_etm(remote_handle64 handle) {
119 int err = HAP_user_etm_enable();
120 if (err) {
121 if (err == AEE_EVERSIONNOTSUPPORT) {
122 FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
123 } else {
124 FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
125 }
126 }
127 return err;
128}
129
130AEEResult htp_iface_disable_etm(remote_handle64 handle) {
131 int err = HAP_user_etm_disable();
132 if (err) {
133 if (err == AEE_EVERSIONNOTSUPPORT) {
134 FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
135 } else {
136 FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
137 }
138 }
139 return err;
140}
141
// (Re)acquire the cached VTCM resource before processing an Op and mark it
// in use. The acquire/release/reacquire dance below is order-critical; do
// not reorder these calls. Aborts the process if VTCM cannot be obtained
// within the 1s timeout. Always returns 0.
static int vtcm_acquire(struct htp_context * ctx) {
    int err;
    if (!ctx->vtcm_valid) {
        // Temporarily bump thread priority to make sure it's higher than other sessions.
        // This way the resource manager will notify the other thread to release VTCM.
        // Note that we need to reacquire VTCM at normal priority for this to work next time.
        qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
        err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
        if (err != 0) {
            FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
            abort();
        }
        // Drop the elevated-priority acquisition and redo it at normal
        // priority so the next contended acquire can preempt us in turn.
        HAP_compute_res_release_cached(ctx->vtcm_rctx);
        qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);

        err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
        if (err != 0) {
            FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
            abort();
        }
        ctx->vtcm_valid = true;
    }

    // Mark VTCM busy so the release callback defers instead of yanking it
    // out from under the running Op.
    ctx->vtcm_inuse = true;
    return 0;
}
168
169static int vtcm_release(struct htp_context * ctx) {
170 ctx->vtcm_inuse = false;
171
172 if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
173 ctx->vtcm_valid = false;
174 ctx->vtcm_needs_release = false;
175 HAP_compute_res_release_cached(ctx->vtcm_rctx);
176 }
177
178 return 0;
179}
180
181static int vtcm_release_callback(unsigned int rctx, void * state) {
182 struct htp_context * ctx = (struct htp_context *) state;
183
184 if (!ctx || ctx->vtcm_rctx != rctx) {
185 return AEE_EBADPARM;
186 }
187
188 // If VTCM is not inuse (not processing Ops) release it right here
189 // otherwise we'll release it once we're done with the current Op.
190
191 if (ctx->vtcm_inuse) {
192 ctx->vtcm_needs_release = false;
193 return 0;
194 }
195
196 ctx->vtcm_valid = false;
197 HAP_compute_res_release_cached(ctx->vtcm_rctx);
198
199 return 0;
200}
201
202static int vtcm_alloc(struct htp_context * ctx) {
203 unsigned int vtcm_size = 8 * 1024 * 1024; // 8MB default
204 HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL);
205
206 compute_res_attr_t attr;
207 HAP_compute_res_attr_init(&attr);
208 HAP_compute_res_attr_set_serialize(&attr, 0);
209 HAP_compute_res_attr_set_cache_mode(&attr, 1);
210 HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
211 HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
212 HAP_compute_res_attr_set_hmx_param(&attr, 1);
213
214 // Allocate VTCM for scratch pads
215 uint32_t rctx = HAP_compute_res_acquire(&attr, 1000000 /* timeout */);
216 if (!rctx) {
217 FARF(ERROR, "failed to allocate %zu bytes VTCM\n", ctx->vtcm_size);
218 return AEE_ENOMEMORY;
219 }
220
221 void * vtcm_ptr;
222 if (HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &vtcm_ptr, &vtcm_size) != 0) {
223 HAP_compute_res_release(rctx);
224 FARF(ERROR, "failed to allocate %zu bytes VTCM (new)\n", ctx->vtcm_size);
225 return AEE_ENOMEMORY;
226 }
227
228 ctx->vtcm_base = (uint8_t *) vtcm_ptr;
229 ctx->vtcm_size = vtcm_size;
230 ctx->vtcm_rctx = rctx;
231 ctx->vtcm_valid = false;
232 ctx->vtcm_inuse = false;
233 ctx->vtcm_needs_release = false;
234
235 return 0;
236}
237
238static void vtcm_free(struct htp_context * ctx) {
239 if (ctx->vtcm_rctx) {
240 HAP_compute_res_release(ctx->vtcm_rctx);
241 ctx->vtcm_base = 0;
242 ctx->vtcm_rctx = 0;
243 }
244}
245
246static void htp_packet_callback(dspqueue_t queue, int error, void * context);
247static void htp_error_callback(dspqueue_t queue, int error, void * context);
248
249AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
250 struct htp_context * ctx = (struct htp_context *) handle;
251
252 if (!ctx) {
253 return AEE_EBADPARM;
254 }
255
256 if (ctx->queue) {
257 FARF(ERROR, "Queue already open");
258 return AEE_EITEMBUSY;
259 }
260
261 // Import queue created on the CPU
262 int err = dspqueue_import(dsp_queue_id, // Queue ID from dspqueue_export
263 htp_packet_callback, // Packet callback
264 htp_error_callback, // Error callback; no errors expected on the DSP
265 (void *) ctx, // Callback context
266 &ctx->queue);
267
268 if (err) {
269 FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
270 return err;
271 }
272
273 ctx->thread_id = qurt_thread_get_id();
274 ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);
275
276 // allocate VTCM
277 err = vtcm_alloc(ctx);
278 if (err != AEE_SUCCESS) {
279 FARF(ERROR, "Unable to allocate VTCM");
280 return AEE_ENOMEMORY;
281 }
282
283 qurt_sysenv_max_hthreads_t hw_threads;
284 qurt_sysenv_get_max_hw_threads(&hw_threads);
285 uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
286
287 if (n_hvx == 0) {
288 n_hvx = hw_nhvx;
289 }
290 if (n_hvx > hw_threads.max_hthreads) {
291 n_hvx = hw_threads.max_hthreads;
292 }
293 if (n_hvx > HTP_MAX_NTHREADS) {
294 n_hvx = HTP_MAX_NTHREADS;
295 }
296
297 ctx->n_threads = n_hvx;
298 for (int i = 0; i < ctx->n_threads; i++) {
299 // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
300 ctx->dma[i] = dma_queue_create(64);
301 }
302
303 // init worker pool
304 err = worker_pool_init(&ctx->worker_pool, n_hvx);
305 if (err != AEE_SUCCESS) {
306 FARF(ERROR, "Unable to create worker pool");
307 return err;
308 }
309
310 FARF(HIGH, "session %u started: n-hvx %u vtcm-size %zu vtcm-rctx %u n-threads %u thread-id %d thread-prio %d \n",
311 sess_id, hw_nhvx, ctx->vtcm_size, ctx->vtcm_rctx, ctx->n_threads, ctx->thread_id, ctx->thread_prio);
312
313 return AEE_SUCCESS;
314}
315
316AEEResult htp_iface_stop(remote_handle64 handle) {
317 struct htp_context * ctx = (struct htp_context *) handle;
318 if (!ctx) {
319 return AEE_EBADPARM;
320 }
321
322 if (!ctx->queue) {
323 FARF(ERROR, "Queue not open");
324 return AEE_EBADSTATE;
325 }
326
327 // Close queue. dspqueue_close() will also wait for callbacks to finish.
328 int err = dspqueue_close(ctx->queue);
329 ctx->queue = NULL;
330 if (err != 0) {
331 FARF(ERROR, "Queue close failed with 0x%08x", (unsigned) err);
332 return err;
333 }
334
335 if (ctx->worker_pool) {
336 // Release worker pool
337 worker_pool_release(&ctx->worker_pool);
338 }
339
340 for (int i = 0; i < ctx->n_threads; i++) {
341 dma_queue_delete(ctx->dma[i]);
342 }
343
344 vtcm_free(ctx);
345
346 return AEE_SUCCESS;
347}
348
349static void htp_error_callback(dspqueue_t queue, int error, void * context) {
350 // No errors expected on the DSP.
351 FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
352}
353
// Per-request profiling counters. profile_start() stores raw counter
// snapshots; profile_stop() converts each field in place into a delta
// (with usecs additionally converted from qtimer ticks to microseconds).
struct profile_data {
    uint64_t usecs;  // qtimer snapshot at start; elapsed usecs after stop
    uint64_t cycles; // core cycle counter snapshot; cycle delta after stop
    uint64_t pkts;   // packet counter snapshot; packet delta after stop
};
359
// Snapshot the qtimer / cycle / packet counters into *d.
// Note: d->usecs holds raw qtimer ticks until profile_stop() converts it.
static inline void profile_start(struct profile_data * d) {
    d->usecs = HAP_perf_get_qtimer_count();
    d->cycles = hex_get_cycles();
    d->pkts = hex_get_pktcnt();
}
365
// Convert the snapshot taken by profile_start() into elapsed values:
// microseconds, cycles and packets executed since the start call.
static inline void profile_stop(struct profile_data * d) {
    d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
    d->cycles = hex_get_cycles() - d->cycles;
    d->pkts = hex_get_pktcnt() - d->pkts;
}
371
372static int send_htp_rsp(struct htp_context * c,
373 uint32_t op,
374 uint32_t status,
375 struct dspqueue_buffer * bufs,
376 size_t n_bufs,
377 struct profile_data * prof) {
378 // Prep response struct
379 struct htp_general_rsp rsp;
380 rsp.op = op;
381 rsp.status = status;
382 rsp.prof_usecs = prof->usecs;
383 rsp.prof_cycles = prof->cycles;
384 rsp.prof_pkts = prof->pkts;
385
386 int err = dspqueue_write(c->queue,
387 0, // Flags
388 n_bufs,
389 bufs, // Buffer references
390 sizeof(rsp),
391 (const uint8_t *) &rsp, // Message
392 DSPQUEUE_TIMEOUT_NONE);
393
394 if (err != 0) {
395 FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
396 }
397
398 return err;
399}
400
401static void proc_matmul_req(struct htp_context * ctx,
402 struct htp_general_req * req,
403 struct dspqueue_buffer * bufs,
404 size_t n_bufs) {
405 struct dspqueue_buffer rsp_bufs[1];
406
407 // We had written to the output buffer, we'd also need to flush it
408 rsp_bufs[0].fd = bufs[2].fd;
409 rsp_bufs[0].ptr = bufs[2].ptr;
410 rsp_bufs[0].size = bufs[2].size;
411 rsp_bufs[0].offset = bufs[2].offset;
412 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
413 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
414
415 // Setup Op context
416 struct htp_ops_context octx = { 0 };
417 octx.ctx = ctx;
418 octx.src0 = req->src0;
419 octx.src1 = req->src1;
420 octx.dst = req->dst;
421 octx.flags = req->flags;
422 octx.op = req->op;
423
424 // Update data pointers
425 octx.src0.data = (uint32_t) bufs[0].ptr;
426 octx.src1.data = (uint32_t) bufs[1].ptr;
427 octx.dst.data = (uint32_t) bufs[2].ptr;
428 octx.n_threads = ctx->n_threads;
429
430 struct profile_data prof;
431 profile_start(&prof);
432
433 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
434 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
435 rsp_status = op_matmul(&octx);
436 vtcm_release(ctx);
437 }
438
439 profile_stop(&prof);
440 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
441}
442
443static void proc_argsort_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
444 struct dspqueue_buffer rsp_bufs[1];
445
446 // We had written to the output buffer, we'd also need to flush it
447 rsp_bufs[0].fd = bufs[1].fd;
448 rsp_bufs[0].ptr = bufs[1].ptr;
449 rsp_bufs[0].offset = bufs[1].offset;
450 rsp_bufs[0].size = bufs[1].size;
451 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
452 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
453
454 // Setup Op context
455 struct htp_ops_context octx = { 0 };
456 octx.ctx = ctx;
457 octx.src0 = req->src0;
458 octx.dst = req->dst;
459 octx.flags = req->flags;
460 octx.op = req->op;
461
462 memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
463
464 // Update data pointers
465 octx.src0.data = (uint32_t) bufs[0].ptr;
466 octx.dst.data = (uint32_t) bufs[1].ptr;
467 octx.n_threads = ctx->n_threads;
468
469 struct profile_data prof;
470 profile_start(&prof);
471
472 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
473 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
474 rsp_status = op_argsort(&octx);
475 vtcm_release(ctx);
476 }
477
478 profile_stop(&prof);
479 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
480}
481
482static void proc_cpy_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
483 struct dspqueue_buffer rsp_bufs[1];
484
485 // We had written to the output buffer, we'd also need to flush it
486 rsp_bufs[0].fd = bufs[1].fd;
487 rsp_bufs[0].ptr = bufs[1].ptr;
488 rsp_bufs[0].offset = bufs[1].offset;
489 rsp_bufs[0].size = bufs[1].size;
490 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
491 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
492
493 // Setup Op context
494 struct htp_ops_context octx = { 0 };
495 octx.ctx = ctx;
496 octx.src0 = req->src0;
497 octx.dst = req->dst;
498 octx.flags = req->flags;
499 octx.op = req->op;
500
501 // Update data pointers
502 octx.src0.data = (uint32_t) bufs[0].ptr;
503 octx.dst.data = (uint32_t) bufs[1].ptr;
504 octx.n_threads = ctx->n_threads;
505
506 struct profile_data prof;
507 profile_start(&prof);
508
509 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
510 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
511 rsp_status = op_cpy(&octx);
512 vtcm_release(ctx);
513 }
514
515 profile_stop(&prof);
516 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
517}
518
519static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
520 struct dspqueue_buffer rsp_bufs[1];
521
522 // We had written to the output buffer, we'd also need to flush it
523 rsp_bufs[0].fd = bufs[2].fd;
524 rsp_bufs[0].ptr = bufs[2].ptr;
525 rsp_bufs[0].offset = bufs[2].offset;
526 rsp_bufs[0].size = bufs[2].size;
527 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
528 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
529
530 // Setup Op context
531 struct htp_ops_context octx = { 0 };
532 octx.ctx = ctx;
533 octx.src0 = req->src0;
534 octx.src1 = req->src1;
535 octx.dst = req->dst;
536 octx.flags = req->flags;
537 octx.op = req->op;
538
539 // Update data pointers
540 octx.src0.data = (uint32_t) bufs[0].ptr;
541 octx.src1.data = (uint32_t) bufs[1].ptr;
542 octx.dst.data = (uint32_t) bufs[2].ptr;
543 octx.n_threads = ctx->n_threads;
544
545 struct profile_data prof;
546 profile_start(&prof);
547
548 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
549 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
550 rsp_status = op_get_rows(&octx);
551 vtcm_release(ctx);
552 }
553
554 profile_stop(&prof);
555 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
556}
557
558static void proc_matmul_id_req(struct htp_context * ctx,
559 struct htp_general_req * req,
560 struct dspqueue_buffer * bufs,
561 size_t n_bufs) {
562 struct dspqueue_buffer rsp_bufs[1];
563
564 // We had written to the output buffer, we'd also need to flush it
565 rsp_bufs[0].fd = bufs[3].fd;
566 rsp_bufs[0].ptr = bufs[3].ptr;
567 rsp_bufs[0].size = bufs[3].size;
568 rsp_bufs[0].offset = bufs[3].offset;
569 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
570 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
571
572 // Setup Op context
573 struct htp_ops_context octx = { 0 };
574 octx.ctx = ctx;
575 octx.src0 = req->src0;
576 octx.src1 = req->src1;
577 octx.src2 = req->src2;
578 octx.dst = req->dst;
579 octx.flags = req->flags;
580 octx.op = req->op;
581
582 // Update data pointers
583 octx.src0.data = (uint32_t) bufs[0].ptr;
584 octx.src1.data = (uint32_t) bufs[1].ptr;
585 octx.src2.data = (uint32_t) bufs[2].ptr;
586 octx.dst.data = (uint32_t) bufs[3].ptr;
587 octx.n_threads = ctx->n_threads;
588
589 struct profile_data prof;
590 profile_start(&prof);
591
592 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
593 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
594 rsp_status = op_matmul_id(&octx);
595 vtcm_release(ctx);
596 }
597
598 profile_stop(&prof);
599 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
600}
601
602static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
603 struct dspqueue_buffer rsp_bufs[1];
604
605 // We had written to the output buffer, we'd also need to flush it
606 rsp_bufs[0].fd = bufs[2].fd;
607 rsp_bufs[0].ptr = bufs[2].ptr;
608 rsp_bufs[0].offset = bufs[2].offset;
609 rsp_bufs[0].size = bufs[2].size;
610 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
611 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
612
613 // Setup Op context
614 struct htp_ops_context octx = { 0 };
615 octx.ctx = ctx;
616 octx.src0 = req->src0;
617 octx.src1 = req->src1;
618 octx.dst = req->dst;
619 octx.flags = req->flags;
620 octx.op = req->op;
621
622 // Update data pointers
623 octx.src0.data = (uint32_t) bufs[0].ptr;
624 octx.src1.data = (uint32_t) bufs[1].ptr;
625 octx.dst.data = (uint32_t) bufs[2].ptr;
626 octx.n_threads = ctx->n_threads;
627
628 struct profile_data prof;
629 profile_start(&prof);
630
631 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
632 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
633 rsp_status = op_binary(&octx);
634 vtcm_release(ctx);
635 }
636
637 profile_stop(&prof);
638 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
639}
640
641static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
642 struct dspqueue_buffer rsp_bufs[1];
643
644 // We had written to the output buffer, we'd also need to flush it
645 rsp_bufs[0].fd = bufs[3].fd;
646 rsp_bufs[0].ptr = bufs[3].ptr;
647 rsp_bufs[0].offset = bufs[3].offset;
648 rsp_bufs[0].size = bufs[3].size;
649 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
650 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
651
652 // Setup Op context
653 struct htp_ops_context octx = { 0 };
654 octx.ctx = ctx;
655 octx.src0 = req->src0;
656 octx.src1 = req->src1;
657 octx.src2 = req->src2;
658 octx.dst = req->dst;
659 octx.flags = req->flags;
660 octx.op = req->op;
661
662 // Update data pointers
663 octx.src0.data = (uint32_t) bufs[0].ptr;
664 octx.src1.data = (uint32_t) bufs[1].ptr;
665 octx.src2.data = (uint32_t) bufs[2].ptr;
666 octx.dst.data = (uint32_t) bufs[3].ptr;
667 octx.n_threads = ctx->n_threads;
668
669 struct profile_data prof;
670 profile_start(&prof);
671
672 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
673 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
674 rsp_status = op_binary(&octx);
675 vtcm_release(ctx);
676 }
677
678 profile_stop(&prof);
679 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
680}
681
682static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
683 struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
684
685 // We had written to the output buffer, we'd also need to flush it
686 rsp_bufs[0].fd = bufs[1].fd;
687 rsp_bufs[0].ptr = bufs[1].ptr;
688 rsp_bufs[0].offset = bufs[1].offset;
689 rsp_bufs[0].size = bufs[1].size;
690 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
691 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
692
693 // Setup Op context
694 struct htp_ops_context octx = { 0 };
695 octx.ctx = ctx;
696 octx.src0 = req->src0;
697 octx.dst = req->dst;
698 octx.flags = req->flags;
699 octx.op = req->op;
700
701 memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
702
703 // Update data pointers
704 octx.src0.data = (uint32_t) bufs[0].ptr;
705 octx.dst.data = (uint32_t) bufs[1].ptr;
706 octx.n_threads = ctx->n_threads;
707
708 struct profile_data prof;
709 profile_start(&prof);
710
711 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
712 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
713 rsp_status = op_unary(&octx);
714 vtcm_release(ctx);
715 }
716
717 profile_stop(&prof);
718 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
719}
720
721static void proc_sum_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
722 struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
723
724 // We had written to the output buffer, we'd also need to flush it
725 rsp_bufs[0].fd = bufs[1].fd;
726 rsp_bufs[0].ptr = bufs[1].ptr;
727 rsp_bufs[0].offset = bufs[1].offset;
728 rsp_bufs[0].size = bufs[1].size;
729 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
730 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
731
732 // Setup Op context
733 struct htp_ops_context octx = { 0 };
734 octx.ctx = ctx;
735 octx.src0 = req->src0;
736 octx.dst = req->dst;
737 octx.flags = req->flags;
738 octx.op = req->op;
739
740 memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
741
742 // Update data pointers
743 octx.src0.data = (uint32_t) bufs[0].ptr;
744 octx.dst.data = (uint32_t) bufs[1].ptr;
745 octx.n_threads = ctx->n_threads;
746
747 struct profile_data prof;
748 profile_start(&prof);
749
750 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
751 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
752 rsp_status = op_sum_rows(&octx);
753 vtcm_release(ctx);
754 }
755
756 profile_stop(&prof);
757 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
758}
759
760static void proc_activations_req(struct htp_context * ctx,
761 struct htp_general_req * req,
762 struct dspqueue_buffer * bufs,
763 uint32_t n_bufs) {
764 struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
765
766 int write_idx = (n_bufs == 3) ? 2 : 1;
767
768 // We had written to the output buffer, we'd also need to flush it
769 rsp_bufs[0].fd = bufs[write_idx].fd;
770 rsp_bufs[0].ptr = bufs[write_idx].ptr;
771 rsp_bufs[0].offset = bufs[write_idx].offset;
772 rsp_bufs[0].size = bufs[write_idx].size;
773 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
774 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
775
776 // Setup Op context
777 struct htp_ops_context octx = { 0 };
778 octx.ctx = ctx;
779 octx.src0 = req->src0;
780 if (3 == n_bufs) {
781 octx.src1 = req->src1;
782 }
783 octx.dst = req->dst;
784 octx.flags = req->flags;
785 octx.op = req->op;
786
787 memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
788
789 // Update data pointers
790 octx.src0.data = (uint32_t) bufs[0].ptr;
791 if (3 == n_bufs) {
792 octx.src1.data = (uint32_t) bufs[1].ptr;
793 octx.dst.data = (uint32_t) bufs[2].ptr;
794 } else {
795 octx.dst.data = (uint32_t) bufs[1].ptr;
796 }
797 octx.n_threads = ctx->n_threads;
798
799 struct profile_data prof;
800 profile_start(&prof);
801
802 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
803 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
804 if (octx.op == HTP_OP_SOFTMAX) {
805 rsp_status = op_softmax(&octx);
806 } else {
807 rsp_status = op_activations(&octx);
808 }
809 vtcm_release(ctx);
810 }
811
812 profile_stop(&prof);
813 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
814}
815
816static void proc_rope_req(struct htp_context * ctx,
817 struct htp_general_req * req,
818 struct dspqueue_buffer * bufs,
819 uint32_t n_bufs) {
820 struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
821
822 int write_idx = n_bufs - 1;
823
824 // We had written to the output buffer, we'd also need to flush it
825 rsp_bufs[0].fd = bufs[write_idx].fd;
826 rsp_bufs[0].ptr = bufs[write_idx].ptr;
827 rsp_bufs[0].offset = bufs[write_idx].offset;
828 rsp_bufs[0].size = bufs[write_idx].size;
829 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
830 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
831
832 // Setup Op context
833 struct htp_ops_context octx = { 0 };
834 octx.ctx = ctx;
835 octx.src0 = req->src0;
836 octx.src1 = req->src1;
837 if (4 == n_bufs) {
838 octx.src2 = req->src2;
839 }
840 octx.dst = req->dst;
841 octx.flags = req->flags;
842 octx.op = req->op;
843
844 memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
845
846 // Update data pointers
847 octx.src0.data = (uint32_t) bufs[0].ptr;
848 octx.src1.data = (uint32_t) bufs[1].ptr;
849 if (4 == n_bufs) {
850 octx.src2.data = (uint32_t) bufs[2].ptr;
851 octx.dst.data = (uint32_t) bufs[3].ptr;
852 } else {
853 octx.dst.data = (uint32_t) bufs[2].ptr;
854 }
855 octx.n_threads = ctx->n_threads;
856
857 struct profile_data prof;
858 profile_start(&prof);
859
860 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
861 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
862 rsp_status = op_rope(&octx);
863 vtcm_release(ctx);
864 }
865
866 profile_stop(&prof);
867 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
868}
869
870static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
871 struct dspqueue_buffer rsp_bufs[1];
872
873 // We had written to the output buffer, we'd also need to flush it
874 rsp_bufs[0].fd = bufs[2].fd;
875 rsp_bufs[0].ptr = bufs[2].ptr;
876 rsp_bufs[0].offset = bufs[2].offset;
877 rsp_bufs[0].size = bufs[2].size;
878 rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
879 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
880
881 // Setup Op context
882 struct htp_ops_context octx = { 0 };
883 octx.ctx = ctx;
884 octx.src0 = req->src0;
885 octx.src1 = req->src1;
886 octx.dst = req->dst;
887 octx.flags = req->flags;
888 octx.op = req->op;
889
890 // Update data pointers
891 octx.src0.data = (uint32_t) bufs[0].ptr;
892 octx.src1.data = (uint32_t) bufs[1].ptr;
893 octx.dst.data = (uint32_t) bufs[2].ptr;
894 octx.n_threads = ctx->n_threads;
895
896 struct profile_data prof;
897 profile_start(&prof);
898
899 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
900 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
901 rsp_status = op_set_rows(&octx);
902 vtcm_release(ctx);
903 }
904
905 profile_stop(&prof);
906 send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
907}
908
909static void proc_flash_attn_ext_req(struct htp_context * ctx,
910 struct htp_general_req * req,
911 struct dspqueue_buffer * bufs,
912 uint32_t n_bufs) {
913 // Setup Op context
914 struct htp_ops_context octx;
915 memset(&octx, 0, sizeof(octx));
916
917 octx.ctx = ctx;
918 octx.n_threads = ctx->n_threads;
919
920 octx.src0 = req->src0;
921 octx.src1 = req->src1;
922 octx.src2 = req->src2;
923 octx.src3 = req->src3;
924 octx.src4 = req->src4;
925 octx.dst = req->dst;
926 octx.flags = req->flags;
927 octx.op = req->op;
928
929 memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
930
931 // Update data pointers
932 octx.src0.data = (uint32_t) bufs[0].ptr;
933 octx.src1.data = (uint32_t) bufs[1].ptr;
934 octx.src2.data = (uint32_t) bufs[2].ptr;
935
936 int last_buf = 3;
937
938 if (octx.src3.ne[0]) {
939 octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
940 }
941
942 if (octx.src4.ne[0]) {
943 octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid
944 }
945
946 octx.dst.data = (uint32_t) bufs[last_buf].ptr;
947
948 struct profile_data prof;
949 profile_start(&prof);
950
951 uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
952 if (vtcm_acquire(ctx) == AEE_SUCCESS) {
953 rsp_status = op_flash_attn_ext(&octx);
954 vtcm_release(ctx);
955 }
956
957 profile_stop(&prof);
958
959 struct dspqueue_buffer rsp_buf = bufs[last_buf];
960 rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
961 DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
962
963 send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof);
964}
965
// Packet callback for the imported DSP queue: drains every pending request
// packet and dispatches it to the matching proc_*_req handler.
// The 'error' argument is unused here; queue errors arrive via
// htp_error_callback instead.
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
    struct htp_context * ctx = (struct htp_context *) context;

    // Repeatedly read packets from the queue until it's empty. We don't
    // necessarily get a separate callback for each packet, and new packets
    // may arrive while we're processing the previous one. This ensures we
    // keep the DSP busy as much as possible and avoid waiting for the CPU.

    while (1) {
        struct htp_general_req req;
        uint32_t req_size;

        struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
        uint32_t n_bufs;
        uint32_t flags;

        // Read packet from queue
        int err = dspqueue_read_noblock(queue, &flags,
                                        HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
                                        &n_bufs, // Number of buffer references
                                        bufs, // Buffer references
                                        sizeof(req), // Max message length
                                        &req_size, // Message length
                                        (uint8_t *) &req); // Message

        if (err == AEE_EWOULDBLOCK) {
            // Consumed all packets available for now
            return;
        }

        if (err != 0) {
            // Unexpected read failure: stop draining; remaining packets (if
            // any) will be retried on the next callback.
            FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
            return;
        }

        if (req_size != sizeof(req)) {
            // Malformed request: drop this packet but keep draining.
            // NOTE(review): no response is sent for dropped packets —
            // presumably the host handles this via timeouts; confirm.
            FARF(ERROR, "Invalid request size");
            continue;
        }

        if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) {
            // Host wants early notification
            dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
        }

        // Process packet based on its message type.
        // Each case first validates the op-specific buffer count; on a
        // mismatch the packet is dropped (no response) and draining continues.
        switch (req.op) {
            case HTP_OP_MUL_MAT:
                if (n_bufs != 3) {
                    FARF(ERROR, "Bad matmul-req buffer list");
                    continue;
                }
                proc_matmul_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_MUL_MAT_ID:
                if (n_bufs != 4) {
                    FARF(ERROR, "Bad matmul-id-req buffer list");
                    continue;
                }
                proc_matmul_id_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_MUL:
            case HTP_OP_ADD:
            case HTP_OP_SUB:
            case HTP_OP_DIV:
                // Element-wise binary ops: two inputs plus one output buffer.
                if (n_bufs != 3) {
                    FARF(ERROR, "Bad binary-req buffer list");
                    continue;
                }
                proc_binary_req(ctx, &req, bufs);
                break;

            case HTP_OP_RMS_NORM:
            case HTP_OP_SCALE:
                if (n_bufs != 2) {
                    FARF(ERROR, "Bad unary-req buffer list");
                    continue;
                }

                proc_unary_req(ctx, &req, bufs);
                break;

            case HTP_OP_SQR:
            case HTP_OP_SQRT:
                if (n_bufs != 2) {
                    FARF(ERROR, "Bad unary-req buffer list");
                    continue;
                }

                proc_unary_req(ctx, &req, bufs);
                break;

            case HTP_OP_SUM_ROWS:
                if (n_bufs != 2) {
                    FARF(ERROR, "Bad unary-req buffer list");
                    continue;
                }

                proc_sum_rows_req(ctx, &req, bufs);
                break;

            case HTP_OP_UNARY_SILU:
            case HTP_OP_UNARY_GELU:
                if (n_bufs != 2) {
                    FARF(ERROR, "Bad act-req buffer list");
                    continue;
                }
                proc_activations_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_GLU_SWIGLU:
            case HTP_OP_GLU_SWIGLU_OAI:
            case HTP_OP_SOFTMAX:
            case HTP_OP_GLU_GEGLU:
                // These ops take either one or two inputs, so 2 or 3 buffers.
                if ((n_bufs != 2) && (n_bufs != 3)) {
                    FARF(ERROR, "Bad act-req buffer list");
                    continue;
                }
                proc_activations_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_ADD_ID:
                if (n_bufs != 4) {
                    FARF(ERROR, "Bad add-id-req buffer list");
                    continue;
                }
                proc_add_id_req(ctx, &req, bufs);
                break;

            case HTP_OP_ROPE:
                // Optional third input makes this 3 or 4 buffers.
                if ((n_bufs != 3) && (n_bufs != 4)) {
                    FARF(ERROR, "Bad rope-req buffer list");
                    continue;
                }
                proc_rope_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_FLASH_ATTN_EXT:
                // q/k/v/dst plus optional mask and sinks buffers: 4..6 total.
                if (!(n_bufs >= 4 && n_bufs <= 6)) {
                    FARF(ERROR, "Bad flash-attn-ext-req buffer list");
                    continue;
                }
                proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_SET_ROWS:
                if (n_bufs != 3) {
                    FARF(ERROR, "Bad set-rows-req buffer list");
                    continue;
                }
                proc_set_rows_req(ctx, &req, bufs);
                break;

            case HTP_OP_GET_ROWS:
                if (n_bufs != 3) {
                    FARF(ERROR, "Bad get-rows-req buffer list");
                    continue;
                }
                proc_get_rows_req(ctx, &req, bufs);
                break;

            case HTP_OP_CPY:
                if (n_bufs != 2) {
                    FARF(ERROR, "Bad cpy-req buffer list");
                    continue;
                }
                proc_cpy_req(ctx, &req, bufs);
                break;

            case HTP_OP_ARGSORT:
                if (n_bufs != 2) {
                    FARF(ERROR, "Bad argsort-req buffer list");
                    continue;
                }
                proc_argsort_req(ctx, &req, bufs);
                break;

            default:
                // Unknown op: log and keep draining the queue.
                FARF(ERROR, "Unknown Op %u", req.op);
                break;
        }
    }
}