1#pragma once
2
3#include "ggml.h"
4#include "clip-model.h"
5
6#include <cstdint>
7#include <vector>
8#include <string>
9
10#define MTMD_INTERNAL_HEADER
11
// Mel spectrogram buffer; the audio preprocessors append instances of this
// struct to the output vector of preprocess().
// The counters are zero-initialized so that a default-constructed object is a
// well-defined "empty" spectrogram instead of carrying indeterminate values.
struct mtmd_audio_mel {
    int n_len     = 0; // presumably the number of frames stored in `data` - TODO confirm
    int n_len_org = 0; // presumably the frame count before any padding - TODO confirm
    int n_mel     = 0; // presumably the number of mel bins per frame - TODO confirm

    std::vector<float> data; // spectrogram values; the layout is defined by the producer
};
19
// Mel filterbank coefficients (held by mtmd_audio_cache as its `filters` member).
// The dimensions are zero-initialized so that a default-constructed object
// describes an empty filterbank instead of carrying indeterminate values.
struct mtmd_audio_mel_filters {
    int32_t n_mel = 0; // presumably the number of mel filters (rows) - TODO confirm
    int32_t n_fft = 0; // presumably the number of FFT bins per filter (columns) - TODO confirm

    std::vector<float> data; // filter weights; the layout is defined by the code that fills it
};
26
27// cache for audio processing, each processor instance owns its own cache
28struct mtmd_audio_cache {
29 std::vector<float> sin_vals;
30 std::vector<float> cos_vals;
31
32 std::vector<float> hann_window;
33
34 mtmd_audio_mel_filters filters;
35
36 void fill_sin_cos_table(int n);
37
38 void fill_hann_window(int length, bool periodic);
39
40 // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
41 // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
42 void fill_mel_filterbank_matrix(int n_mel,
43 int n_fft,
44 int sample_rate, // e.g. 16000
45 float fmin = 0.0f, // e.g. 0.0
46 float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
47 bool slaney_area_norm = true,
48 float scale = 1.0f // optional extra scaling
49 );
50};
51
52struct mtmd_audio_preprocessor {
53 const clip_hparams & hparams;
54
55 mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
56
57 virtual ~mtmd_audio_preprocessor() = default;
58 virtual void initialize() = 0; // NOT thread-safe
59 virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
60};
61
62struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
63 mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
64 void initialize() override;
65 bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
66
67 private:
68 mtmd_audio_cache cache;
69};
70
71struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
72 mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
73 void initialize() override;
74 bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
75
76 private:
77 mtmd_audio_cache cache;
78};
79
80//
81// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
82//
83struct mtmd_audio_streaming_istft {
84 mtmd_audio_streaming_istft(int n_fft, int hop_length);
85
86 // reset streaming state
87 void reset();
88
89 // process a single STFT frame (streaming)
90 // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
91 // returns: up to hop_length samples
92 std::vector<float> process_frame(const float * frame_spectrum);
93
94 // flush remaining samples at end of stream
95 std::vector<float> flush();
96
97 private:
98 int n_fft;
99 int hop_length;
100 int n_fft_bins;
101
102 // Own cache for output processing
103 mtmd_audio_cache cache;
104
105 // Streaming state
106 std::vector<float> overlap_buffer;
107 std::vector<float> window_sum_buffer;
108 int padding_to_remove;
109
110 // Working buffers for IFFT
111 std::vector<float> ifft_in;
112 std::vector<float> ifft_out;
113};