llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178

#version 450

// Push constants for the upscale shader; read through the instance name `p`.
layout (push_constant) uniform parameter
{
    // ne:       number of destination elements; invocations with idx >= ne exit early.
    // a_offset: added to every index into data_a.
    // d_offset: added to every index into data_d.
    uint ne; uint a_offset; uint d_offset;
    // Source extents along dims 0 and 1; used to clamp sampling coordinates.
    uint ne00; uint ne01;
    // Source strides for dims 0..3. They are multiplied straight into data_a
    // indices, so they count array elements rather than bytes.
    uint nb00; uint nb01; uint nb02; uint nb03;
    // Destination extents for dims 0..3; used to split the flat index into coordinates.
    uint ne10; uint ne11; uint ne12; uint ne13;
    // Per-dimension scale factors: destination coordinate / sfN gives the source coordinate.
    float sf0; float sf1; float sf2; float sf3;
    // Coordinate offset applied by the interpolating modes (not read by NEAREST).
    float pixel_offset;
} p;

// A_TYPE and D_TYPE are not defined in this file — presumably they come from
// this include or from build-time defines; confirm against the shader build.
#include "types.glsl"

// 512 invocations per workgroup along x. main() multiplies
// gl_GlobalInvocationID.y by 512 when flattening the invocation ID.
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

// Source tensor (read-only) and destination tensor (write-only).
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag
#define NEAREST  0
#define BILINEAR 1
#define BICUBIC  2
// NOTE(review): 513 == 1 | 512, presumably BILINEAR combined with an
// antialias flag bit — confirm against ggml.h.
#define BILINEAR_ANTIALIAS 513

// Interpolation mode, supplied as specialization constant 0; defaults to 0 (NEAREST).
layout (constant_id = 0) const uint scale_mode = 0;

// Nearest-neighbour lookup.
// Divides each destination coordinate (i10, i11, i12, i13) by its scale
// factor, truncates the quotient to an unsigned source coordinate and returns
// the source element stored there.
float fetch_nearest(uint i10, uint i11, uint i12, uint i13) {
    const vec4 scale = vec4(p.sf0, p.sf1, p.sf2, p.sf3);

    // uvec4(...) truncates every component, exactly like a scalar uint() cast.
    const uvec4 src = uvec4(vec4(i10, i11, i12, i13) / scale);

    // Unsigned sums wrap modulo 2^32, so the order of the terms is irrelevant.
    const uint rel = src.x * p.nb00 + src.y * p.nb01 + src.z * p.nb02 + src.w * p.nb03;

    return data_a[p.a_offset + rel];
}

// Reads the four source elements at the corners (c0.x|c1.x, c0.y|c1.y) of the
// plane selected by (i12, i13) and blends them bilinearly.
//   c0, c1   : integer source coordinates of the low / high corner
//   d        : blend factors; d.x weights the c1.x column, d.y the c1.y row
//   i12, i13 : destination coordinates in dims 2 and 3 (scaled to source here)
float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) {
    // Start of the source plane this destination element maps to.
    const uint src2 = uint(i12 / p.sf2);
    const uint src3 = uint(i13 / p.sf3);
    const uint plane = p.a_offset + src3 * p.nb03 + src2 * p.nb02;

    // Row starts and column offsets shared between the four taps.
    const uint row_lo = plane + c0.y * p.nb01;
    const uint row_hi = plane + c1.y * p.nb01;
    const uint col_lo = c0.x * p.nb00;
    const uint col_hi = c1.x * p.nb00;

    const float v00 = data_a[row_lo + col_lo];
    const float v01 = data_a[row_lo + col_hi];
    const float v10 = data_a[row_hi + col_lo];
    const float v11 = data_a[row_hi + col_hi];

    // Complementary weights for the low column / low row.
    const float wx_lo = 1.0 - d.x;
    const float wy_lo = 1.0 - d.y;

    return
        v00 * wx_lo * wy_lo +
        v01 * d.x   * wy_lo +
        v10 * wx_lo * d.y +
        v11 * d.x   * d.y;
}

// Bilinear sample for destination element (i10, i11, i12, i13).
// Maps (i10, i11) into source space, picks the two neighbouring taps per axis
// (clamped to the source extents) and lets fetch_bilinear() blend them.
float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
    const vec2 scale = vec2(p.sf0, p.sf1);

    // Continuous source coordinate of this destination element.
    const vec2 src = (vec2(i10, i11) + p.pixel_offset) / scale - p.pixel_offset;
    const vec2 src_floor = floor(src);

    // Low tap is clamped at 0, high tap at the last valid source index.
    const ivec2 tap_hi = min(ivec2(src_floor + 1), ivec2(p.ne00, p.ne01) - 1);
    const ivec2 tap_lo = max(ivec2(src_floor), 0);

    // Fractional position between the two taps.
    const vec2 blend = src - src_floor;

    return fetch_bilinear(tap_lo, tap_hi, blend, i12, i13);
}

// Triangle (tent) kernel: 1 at x == 0, falling linearly to 0 at |x| == 1 and
// staying 0 beyond that.
float triangle_filter(float x) {
    const float w = 1.0f - abs(x);
    return (w < 0.0f) ? 0.0f : w;
}

// Antialiased bilinear sample for destination element (i10, i11, i12, i13).
// Accumulates every source element inside a triangle-filter footprint around
// the mapped position and returns the weight-normalised sum.
float interpolate_bilinear_antialias(uint i10, uint i11, uint i12, uint i13) {
    const vec2 scale = vec2(p.sf0, p.sf1);

    // Footprint half-width per axis: 1 when sf >= 1, 1/sf when sf < 1.
    const vec2 support     = max(vec2(1.0f), 1.0f / scale);
    const vec2 inv_support = 1.0f / support;

    // Position of this destination element in source space.
    const vec2 center = (vec2(i10, i11) + p.pixel_offset) / scale;

    // Half-open range [lo, hi) of source elements that can contribute,
    // clamped to the source extents.
    const ivec2 lo = max(ivec2(center - support + p.pixel_offset), 0);
    const ivec2 hi = min(ivec2(center + support + p.pixel_offset), ivec2(p.ne00, p.ne01));

    // Start of the source plane selected by dims 2 and 3.
    const uint src2 = uint(i12 / p.sf2);
    const uint src3 = uint(i13 / p.sf3);
    const uint plane = p.a_offset + src3 * p.nb03 + src2 * p.nb02;

    float acc  = 0.0f;
    float wsum = 0.0f;

    for (int sy = lo.y; sy < hi.y; sy++) {
        const float wy = triangle_filter((sy - center.y + p.pixel_offset) * inv_support.y);
        const uint row = plane + sy * p.nb01;

        for (int sx = lo.x; sx < hi.x; sx++) {
            const float wx = triangle_filter((sx - center.x + p.pixel_offset) * inv_support.x);
            const float w = wx * wy;

            // Elements outside the footprint carry no weight.
            if (w <= 0.0f) {
                continue;
            }

            const float texel = data_a[row + sx * p.nb00];
            acc  += texel * w;
            wsum += w;
        }
    }

    // Normalise only when something was accumulated.
    return (wsum > 0.0f) ? (acc / wsum) : acc;
}

// Cubic convolution kernel with alpha = -0.75, stored as polynomial
// coefficients for (x^3, x^2, x, 1).
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
// bcoeffs1 is evaluated at distances x and 1 - x (the two inner taps),
// bcoeffs2 at distances x + 1 and 2 - x (the two outer taps).
const vec4 bcoeffs1 = vec4( 1.25, -2.25,  0.0, 1.0);
const vec4 bcoeffs2 = vec4(-0.75,  3.75, -6.0, 3.0);

// Returns (x^3, x^2, x, 1), ready to be dotted with a coefficient vector.
vec4 powers(float x) {
    const float sq = x * x;
    return vec4(sq * x, sq, x, 1.0);
}

// One-dimensional cubic interpolation through four consecutive samples
// p0..p3, with x the fractional position between p1 and p2.
float bicubic(float p0, float p1, float p2, float p3, float x) {
    const float w0 = dot(bcoeffs2, powers(x + 1));
    const float w1 = dot(bcoeffs1, powers(x    ));
    const float w2 = dot(bcoeffs1, powers(1 - x));
    const float w3 = dot(bcoeffs2, powers(2 - x));

    return p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
}

// Reads the source element at offset (a, b) from `cell`, with both coordinates
// clamped to [0, last]. Relies on the locals `plane`, `cell` and `last` of
// interpolate_bicubic().
#define FETCH(a,b) data_a[plane + clamp(cell.x+(a), 0, last.x) * p.nb00 + clamp(cell.y+(b), 0, last.y) * p.nb01]

// Bicubic sample for destination element (i10, i11, i12, i13).
// Interpolates four source rows of four taps each along x, then interpolates
// the four row results along y.
float interpolate_bicubic(uint i10, uint i11, uint i12, uint i13) {
    // Last valid source index per axis.
    const ivec2 last = ivec2(p.ne00 - 1, p.ne01 - 1);

    // Continuous source coordinate, split into integer cell and fraction.
    const vec2 src = (vec2(i10, i11) + p.pixel_offset) / vec2(p.sf0, p.sf1) - p.pixel_offset;
    const ivec2 cell = ivec2(floor(src));
    const vec2 t = fract(src);

    // Start of the source plane selected by dims 2 and 3.
    const uint src2 = uint(i12 / p.sf2);
    const uint src3 = uint(i13 / p.sf3);
    const uint plane = p.a_offset + src3 * p.nb03 + src2 * p.nb02;

    // Rows at vertical offsets -1, 0, 1, 2, each interpolated horizontally.
    float rows[4];
    for (int dy = -1; dy <= 2; dy++) {
        rows[dy + 1] = bicubic(FETCH(-1, dy), FETCH(0, dy), FETCH(1, dy), FETCH(2, dy), t.x);
    }

    return bicubic(rows[0], rows[1], rows[2], rows[3], t.y);
}

// Entry point: each invocation produces one destination element.
// The flat element index is split into destination coordinates
// (i10, i11, i12, i13), sampled with the mode chosen by `scale_mode`, and the
// result is stored at data_d[p.d_offset + idx].
void main() {
    // Flatten the 3-D invocation ID: y advances by 512 and z by 262144 (512 * 512).
    // NOTE(review): this presumes the host dispatches at most 512 invocations
    // along x and 512 along y — confirm against the dispatch code.
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    // Surplus invocations past the last destination element do nothing.
    if (idx >= p.ne) {
        return;
    }

    // Split the flat index into destination coordinates, dim 0 varying fastest.
    const uint i10 = idx % p.ne10;
    const uint i11 = (idx / p.ne10) % p.ne11;
    const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
    const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;

    // Initialised so that a scale_mode outside the handled set stores a defined
    // value (0) instead of an undefined one.
    float result = 0.0f;
    switch (scale_mode) {
        case NEAREST:
            result = fetch_nearest(i10, i11, i12, i13);
            break;
        case BILINEAR:
            result = interpolate_bilinear(i10, i11, i12, i13);
            break;
        case BICUBIC:
            result = interpolate_bicubic(i10, i11, i12, i13);
            break;
        case BILINEAR_ANTIALIAS:
            result = interpolate_bilinear_antialias(i10, i11, i12, i13);
            break;
        default:
            // Unsupported mode: keep the zero initialiser.
            break;
    }

    data_d[p.d_offset + idx] = D_TYPE(result);
}