summaryrefslogtreecommitdiff
path: root/llama.cpp/tools/mtmd/mtmd-audio.h
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
committerMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
commitb333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/tools/mtmd/mtmd-audio.h
downloadllmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/tools/mtmd/mtmd-audio.h')
-rw-r--r--llama.cpp/tools/mtmd/mtmd-audio.h113
1 files changed, 113 insertions, 0 deletions
diff --git a/llama.cpp/tools/mtmd/mtmd-audio.h b/llama.cpp/tools/mtmd/mtmd-audio.h
new file mode 100644
index 0000000..016c739
--- /dev/null
+++ b/llama.cpp/tools/mtmd/mtmd-audio.h
@@ -0,0 +1,113 @@
+#pragma once
+
+#include "ggml.h"
+#include "clip-model.h"
+
+#include <cstdint>
+#include <vector>
+#include <string>
+
+#define MTMD_INTERNAL_HEADER
+
+struct mtmd_audio_mel { // element type of the `output` vector passed to mtmd_audio_preprocessor::preprocess()
+ int n_len; // NOTE(review): presumably the number of frames held in `data` — confirm in the implementation
+ int n_len_org; // NOTE(review): presumably the frame count before any padding — confirm
+ int n_mel; // NOTE(review): presumably the number of mel bins per frame — confirm
+
+ std::vector<float> data; // float storage; the layout is not specified in this header
+};
+
+struct mtmd_audio_mel_filters { // filterbank storage; mtmd_audio_cache holds one instance as `filters`
+ int32_t n_mel; // NOTE(review): presumably the number of mel bands — confirm
+ int32_t n_fft; // NOTE(review): may be the FFT size or the bin count (N_fft / 2 + 1) — confirm against fill_mel_filterbank_matrix()
+
+ std::vector<float> data; // NOTE(review): presumably n_mel * n_fft filter weights — confirm size and layout
+};
+
+// cache for audio processing, each processor instance owns its own cache
+struct mtmd_audio_cache {
+ std::vector<float> sin_vals; // NOTE(review): presumably populated by fill_sin_cos_table() — confirm
+ std::vector<float> cos_vals; // NOTE(review): presumably populated by fill_sin_cos_table() — confirm
+
+ std::vector<float> hann_window; // NOTE(review): presumably populated by fill_hann_window() — confirm
+
+ mtmd_audio_mel_filters filters; // NOTE(review): presumably populated by fill_mel_filterbank_matrix() — confirm
+
+ void fill_sin_cos_table(int n); // declared only; the meaning of n is not shown in this header
+
+ void fill_hann_window(int length, bool periodic); // declared only; defined outside this header
+
+ // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
+ // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
+ void fill_mel_filterbank_matrix(int n_mel,
+ int n_fft, // NOTE(review): the comment above speaks of n_fft_bins; confirm whether this is N_fft or N_fft / 2 + 1
+ int sample_rate, // e.g. 16000
+ float fmin = 0.0f, // e.g. 0.0
+ float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
+ bool slaney_area_norm = true,
+ float scale = 1.0f // optional extra scaling
+ );
+};
+
+struct mtmd_audio_preprocessor { // abstract base: initialize() and preprocess() are pure virtual
+ const clip_hparams & hparams; // reference member: the referenced clip_hparams must outlive this object
+
+ mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} // dereferences the result of clip_get_hparams(ctx) without a null check
+
+ virtual ~mtmd_audio_preprocessor() = default; // virtual so derived objects can be destroyed through a base pointer
+ virtual void initialize() = 0; // NOT thread-safe
+ virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0; // NOTE(review): presumably returns false on failure — confirm
+};
+
+struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor { // overrides both pure virtuals of the base
+ mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} // forwards ctx to the base constructor
+ void initialize() override; // declared only; defined outside this header
+ bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override; // declared only; defined outside this header
+
+ private:
+ mtmd_audio_cache cache; // per-instance cache member
+};
+
+struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor { // overrides both pure virtuals of the base
+ mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} // forwards ctx to the base constructor
+ void initialize() override; // declared only; defined outside this header
+ bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override; // declared only; defined outside this header
+
+ private:
+ mtmd_audio_cache cache; // per-instance cache member
+};
+
+//
+// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
+//
+struct mtmd_audio_streaming_istft {
+ mtmd_audio_streaming_istft(int n_fft, int hop_length); // declared only; defined outside this header
+
+ // reset streaming state
+ void reset();
+
+ // process a single STFT frame (streaming)
+ // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
+ // returns: up to hop_length samples
+ std::vector<float> process_frame(const float * frame_spectrum); // no length argument: the caller must supply a full frame
+
+ // flush remaining samples at end of stream
+ std::vector<float> flush();
+
+ private:
+ int n_fft; // NOTE(review): presumably the constructor's n_fft argument — confirm
+ int hop_length; // NOTE(review): presumably the constructor's hop_length argument — confirm
+ int n_fft_bins; // NOTE(review): presumably n_fft / 2 + 1 — confirm in the constructor
+
+ // Own cache for output processing
+ mtmd_audio_cache cache;
+
+ // Streaming state
+ std::vector<float> overlap_buffer; // NOTE(review): presumably overlap-add accumulator carried between frames — confirm
+ std::vector<float> window_sum_buffer; // NOTE(review): presumably window-sum accumulator used for normalization — confirm
+ int padding_to_remove; // NOTE(review): presumably count of leading samples still to be dropped — confirm
+
+ // Working buffers for IFFT
+ std::vector<float> ifft_in; // scratch input; size not shown in this header
+ std::vector<float> ifft_out; // scratch output; size not shown in this header
+};