diff options
Diffstat (limited to 'llama.cpp/tools/mtmd/mtmd-audio.h')
| -rw-r--r-- | llama.cpp/tools/mtmd/mtmd-audio.h | 113 |
1 files changed, 113 insertions, 0 deletions
diff --git a/llama.cpp/tools/mtmd/mtmd-audio.h b/llama.cpp/tools/mtmd/mtmd-audio.h new file mode 100644 index 0000000..016c739 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-audio.h @@ -0,0 +1,113 @@ +#pragma once + +#include "ggml.h" +#include "clip-model.h" + +#include <cstdint> +#include <vector> +#include <string> + +#define MTMD_INTERNAL_HEADER + +struct mtmd_audio_mel { + int n_len; + int n_len_org; + int n_mel; + + std::vector<float> data; +}; + +struct mtmd_audio_mel_filters { + int32_t n_mel; + int32_t n_fft; + + std::vector<float> data; +}; + +// cache for audio processing, each processor instance owns its own cache +struct mtmd_audio_cache { + std::vector<float> sin_vals; + std::vector<float> cos_vals; + + std::vector<float> hann_window; + + mtmd_audio_mel_filters filters; + + void fill_sin_cos_table(int n); + + void fill_hann_window(int length, bool periodic); + + // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. + // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. + void fill_mel_filterbank_matrix(int n_mel, + int n_fft, + int sample_rate, // e.g. 16000 + float fmin = 0.0f, // e.g. 0.0 + float fmax = -1.0f, // e.g. sr/2; pass -1 for auto + bool slaney_area_norm = true, + float scale = 1.0f // optional extra scaling + ); +}; + +struct mtmd_audio_preprocessor { + const clip_hparams & hparams; + + mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} + + virtual ~mtmd_audio_preprocessor() = default; + virtual void initialize() = 0; // NOT thread-safe + virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0; +}; + +struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override; + + private: + mtmd_audio_cache cache; +}; + +struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override; + + private: + mtmd_audio_cache cache; +}; + +// +// streaming ISTFT - converts spectrogram frames back to audio one frame at a time +// +struct mtmd_audio_streaming_istft { + mtmd_audio_streaming_istft(int n_fft, int hop_length); + + // reset streaming state + void reset(); + + // process a single STFT frame (streaming) + // frame_spectrum: [n_fft_bins x 2] interleaved real/imag + // returns: up to hop_length samples + std::vector<float> process_frame(const float * frame_spectrum); + + // flush remaining samples at end of stream + std::vector<float> flush(); + + private: + int n_fft; + int hop_length; + int n_fft_bins; + + // Own cache for output processing + mtmd_audio_cache cache; + + // Streaming state + std::vector<float> overlap_buffer; + std::vector<float> window_sum_buffer; + int padding_to_remove; + + // Working buffers for IFFT + std::vector<float> ifft_in; + std::vector<float> ifft_out; +}; |
