1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
kernel void kernel_ssm_conv_f32_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
ulong nb00,
ulong nb01,
ulong nb02,
int ne10,
ulong nb11,
ulong nb0,
ulong nb1,
ulong nb2
){
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int ir = get_global_id(0);
int i2 = get_global_id(1);
int i3 = get_global_id(2);
int nc = ne10;
global float * s = (global float *) (src0 + ir*nb01 + i2*nb00 + i3*nb02);
global float * c = (global float *) (src1 + ir*nb11);
global float * d = (global float *) (dst + ir*nb0 + i2*nb1 + i3*nb2);
float sumf = 0.0f;
for (int i0 = 0; i0 < nc; ++i0) {
sumf += s[i0] * c[i0];
}
d[0] = sumf;
}
kernel void kernel_ssm_conv_f32_f32_4(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
ulong nb00,
ulong nb01,
ulong nb02,
int ne10,
ulong nb11,
ulong nb0,
ulong nb1,
ulong nb2
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int ir = get_global_id(0);
int i2 = get_global_id(1);
int i3 = get_global_id(2);
int nc = ne10;
global float4 * s = (global float4 *) (src0 + ir*nb01 + i2*nb00 + i3*nb02);
global float4 * c = (global float4 *) (src1 + ir*nb11);
global float * d = (global float *) (dst + ir*nb0 + i2*nb1 + i3*nb2);
float sumf = 0.0f;
for (int i0 = 0; i0 < nc/4; ++i0) {
sumf += dot(s[i0], c[i0]);
}
d[0] = sumf;
}
|