llmnpc - llama.cpp/tools/mtmd/mtmd-audio.h

Path: llmnpc / llama.cpp / tools / mtmd / mtmd-audio.h (raw)
  1#pragma once
  2
  3#include "ggml.h"
  4#include "clip-model.h"
  5
  6#include <cstdint>
  7#include <vector>
  8#include <string>
  9
 10#define MTMD_INTERNAL_HEADER
 11
 12struct mtmd_audio_mel {
 13    int n_len;
 14    int n_len_org;
 15    int n_mel;
 16
 17    std::vector<float> data;
 18};
 19
 20struct mtmd_audio_mel_filters {
 21    int32_t n_mel;
 22    int32_t n_fft;
 23
 24    std::vector<float> data;
 25};
 26
 27// cache for audio processing, each processor instance owns its own cache
 28struct mtmd_audio_cache {
 29    std::vector<float> sin_vals;
 30    std::vector<float> cos_vals;
 31
 32    std::vector<float> hann_window;
 33
 34    mtmd_audio_mel_filters filters;
 35
 36    void fill_sin_cos_table(int n);
 37
 38    void fill_hann_window(int length, bool periodic);
 39
 40    // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
 41    // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
 42    void fill_mel_filterbank_matrix(int   n_mel,
 43                                    int   n_fft,
 44                                    int   sample_rate,               // e.g. 16000
 45                                    float fmin             = 0.0f,   // e.g. 0.0
 46                                    float fmax             = -1.0f,  // e.g. sr/2; pass -1 for auto
 47                                    bool  slaney_area_norm = true,
 48                                    float scale = 1.0f  // optional extra scaling
 49    );
 50};
 51
 52struct mtmd_audio_preprocessor {
 53    const clip_hparams & hparams;
 54
 55    mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
 56
 57    virtual ~mtmd_audio_preprocessor() = default;
 58    virtual void initialize() = 0; // NOT thread-safe
 59    virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
 60};
 61
 62struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
 63    mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
 64    void initialize() override;
 65    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
 66
 67  private:
 68    mtmd_audio_cache cache;
 69};
 70
 71struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
 72    mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
 73    void initialize() override;
 74    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
 75
 76  private:
 77    mtmd_audio_cache cache;
 78};
 79
 80//
 81// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
 82//
 83struct mtmd_audio_streaming_istft {
 84    mtmd_audio_streaming_istft(int n_fft, int hop_length);
 85
 86    // reset streaming state
 87    void reset();
 88
 89    // process a single STFT frame (streaming)
 90    // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
 91    // returns: up to hop_length samples
 92    std::vector<float> process_frame(const float * frame_spectrum);
 93
 94    // flush remaining samples at end of stream
 95    std::vector<float> flush();
 96
 97  private:
 98    int n_fft;
 99    int hop_length;
100    int n_fft_bins;
101
102    // Own cache for output processing
103    mtmd_audio_cache cache;
104
105    // Streaming state
106    std::vector<float> overlap_buffer;
107    std::vector<float> window_sum_buffer;
108    int                padding_to_remove;
109
110    // Working buffers for IFFT
111    std::vector<float> ifft_in;
112    std::vector<float> ifft_out;
113};