GEMMStore Playground
Device: Detecting...
WebGPU: Checking...
Load Spec File
NaiveGEMMF16
OptimizedGEMMF16
SubgroupsGEMMF16
NaiveGEMMF32
OptimizedGEMMF32
NaiveGEMVF16
OptimizedGEMVF16
NaiveGEMVF32
OptimizedGEMVF32
mul_mat_mat_F16xF16xF32
mul_mat_mat_F16xF32xF32
mul_mat_mat_F32xF32xF32
mul_mat_mat_reg_tile_scalar_F16xF16xF32
mul_mat_mat_reg_tile_scalar_F16xF32xF32
mul_mat_mat_reg_tile_scalar_F32xF32xF32
mul_mat_mat_reg_tile_scalar_Q4_0xF32xF32
mul_mat_mat_reg_tile_vectorized_F16xF16xF32
mul_mat_mat_reg_tile_vectorized_F16xF32xF32
mul_mat_mat_reg_tile_vectorized_F32xF32xF32
mul_mat_mat_reg_tile_vectorized_Q4_0xF32xF32
mul_mat_mat_subgroup_scalar_F32xF32xF32
mul_mat_mat_subgroup_scalar_F16xF32xF32
mul_mat_mat_subgroup_scalar_F16xF16xF32
mul_mat_mat_subgroup_scalar_Q4_0xF32xF32
mul_mat_mat_subgroup_vectorized_F32xF32xF32
mul_mat_mat_subgroup_vectorized_F16xF32xF32
mul_mat_mat_subgroup_vectorized_F16xF16xF32
mul_mat_mat_subgroup_vectorized_Q4_0xF32xF32
mul_mat_vec_F16xF16xF32
mul_mat_vec_F16xF32xF32
mul_mat_vec_F32xF32xF32
mul_mat_vec_scalar_F16xF16xF32
mul_mat_vec_scalar_F16xF32xF32
mul_mat_vec_scalar_F32xF32xF32
mul_mat_vec_scalar_Q4_0xF32xF32
mul_mat_vec_vectorized_F16xF16xF32
mul_mat_vec_vectorized_F16xF32xF32
mul_mat_vec_vectorized_F32xF32xF32
OptimizedLlamaGEMVF16xF32
TensorCoresF16xF16
flash_attn
Upload .js
Download
Run Benchmark
Kernel Specification
JavaScript
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// GEMMStore Playground - Edit this spec and click "Run Benchmark"
//
// A complete kernel specification consumed by the playground runner:
//  - inputArgs / tuningArgs describe the buffers and template parameters,
//  - wgsl_shader is a {{placeholder}}-templated WGSL compute shader,
//  - js_reference is a standalone WebGPU implementation used to validate results,
//  - valid_Args / num_Workgroups gate and size each benchmark dispatch.
window.CustomKernel = {
  name: "CustomKernel",
  description: "A simple matrix multiplication kernel - edit me!",

  // Bindings for the templated shader. `order` fixes the binding index;
  // `matrixSize` is a uniform scalar (n for an n*n matrix).
  inputArgs: [
    { name: "matrixA", storage_type: "storage", type: "float[]", fill: "randf32", sizes: [1024 * 1024], io_type: "input", order: 0 },
    { name: "matrixB", storage_type: "storage", type: "float[]", fill: "randf32", sizes: [1024 * 1024], io_type: "input", order: 1 },
    { name: "matrixC", storage_type: "storage", type: "float[]", fill: "don't fill", sizes: [1024 * 1024], io_type: "output", order: 2 },
    { name: "matrixSize", storage_type: "uniform", type: "int", values: [1024] },
  ],

  // Template parameters substituted into the @workgroup_size attribute below.
  tuningArgs: [
    { name: "workgroupSizeX", type: "fixed", values: ["16"], description: "Workgroup size in X dimension" },
    { name: "workgroupSizeY", type: "fixed", values: ["16"], description: "Workgroup size in Y dimension" },
  ],

  // Naive row-by-column matmul: one invocation computes one output element.
  wgsl_shader: `
@compute @workgroup_size({{workgroupSizeX}}, {{workgroupSizeY}})
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
  let row = global_id.y;
  let col = global_id.x;
  if (row >= matrixSize || col >= matrixSize) {
    return;
  }
  var sum = 0.0;
  for (var k = 0u; k < matrixSize; k++) {
    let a_val = {{matrixA}}[row * matrixSize + k];
    let b_val = {{matrixB}}[k * matrixSize + col];
    sum += a_val * b_val;
  }
  {{matrixC}}[row * matrixSize + col] = sum;
}`,

  requiredFeatures: [],
  measureArgs: ["ms", "GFLOPs"],

  /**
   * Reference implementation: computes C = A * B on the GPU with a plain,
   * untemplated shader so benchmark results can be checked against it.
   * @param {Float32Array} A - row-major n*n matrix
   * @param {Float32Array} B - row-major n*n matrix
   * @param {number} n - matrix dimension
   * @returns {Promise<Float32Array>} row-major n*n result matrix
   * @throws {Error} if WebGPU is unavailable (no adapter)
   */
  js_reference: async function matrixMultiplyGPU(A, B, n) {
    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) {
      throw new Error('WebGPU adapter unavailable');
    }
    const device = await adapter.requestDevice();

    const bufferA = device.createBuffer({
      size: A.byteLength,
      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
    });
    const bufferB = device.createBuffer({
      size: B.byteLength,
      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
    });
    const bufferC = device.createBuffer({
      size: n * n * 4,
      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
    });
    const uniformBuffer = device.createBuffer({
      size: 4,
      usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
    });
    // Readback target: STORAGE buffers cannot be mapped directly.
    const stagingBuffer = device.createBuffer({
      size: n * n * 4,
      usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
    });

    device.queue.writeBuffer(bufferA, 0, A);
    device.queue.writeBuffer(bufferB, 0, B);
    device.queue.writeBuffer(uniformBuffer, 0, new Uint32Array([n]));

    const computeShader = `
@group(0) @binding(0) var<storage, read> A: array<f32>;
@group(0) @binding(1) var<storage, read> B: array<f32>;
@group(0) @binding(2) var<storage, read_write> C: array<f32>;
@group(0) @binding(3) var<uniform> n: u32;
@compute @workgroup_size(16, 16)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
  let row = global_id.y;
  let col = global_id.x;
  if (row >= n || col >= n) {
    return;
  }
  var sum = 0.0;
  for (var k = 0u; k < n; k++) {
    sum += A[row * n + k] * B[k * n + col];
  }
  C[row * n + col] = sum;
}
`;

    const pipeline = device.createComputePipeline({
      layout: 'auto',
      compute: {
        module: device.createShaderModule({ code: computeShader }),
        entryPoint: 'main',
      },
    });
    const bindGroup = device.createBindGroup({
      layout: pipeline.getBindGroupLayout(0),
      entries: [
        { binding: 0, resource: { buffer: bufferA } },
        { binding: 1, resource: { buffer: bufferB } },
        { binding: 2, resource: { buffer: bufferC } },
        { binding: 3, resource: { buffer: uniformBuffer } },
      ],
    });

    const encoder = device.createCommandEncoder();
    const pass = encoder.beginComputePass();
    pass.setPipeline(pipeline);
    pass.setBindGroup(0, bindGroup);
    // 16 must match the shader's @workgroup_size above.
    pass.dispatchWorkgroups(Math.ceil(n / 16), Math.ceil(n / 16));
    pass.end();
    encoder.copyBufferToBuffer(bufferC, 0, stagingBuffer, 0, n * n * 4);
    device.queue.submit([encoder.finish()]);

    await stagingBuffer.mapAsync(GPUMapMode.READ);
    const result = new Float32Array(stagingBuffer.getMappedRange());
    // Copy out before unmap: the mapped range is detached afterwards.
    const copy = result.slice();
    stagingBuffer.unmap();

    // Release GPU resources so repeated benchmark runs don't leak.
    bufferA.destroy();
    bufferB.destroy();
    bufferC.destroy();
    uniformBuffer.destroy();
    stagingBuffer.destroy();
    device.destroy();

    return copy;
  },

  /**
   * Validates one benchmark configuration before dispatch.
   * @param {Float32Array} A @param {Float32Array} B @param {Float32Array} C
   * @param {number} n - matrix dimension
   * @param {string} workgroupSizeX @param {string} workgroupSizeY - tuning values (strings)
   * @returns {boolean} true when all buffers are n*n and the workgroup fits
   */
  valid_Args: function (A, B, C, n, workgroupSizeX, workgroupSizeY) {
    const sizesMatch = A.length === n * n && B.length === n * n && C.length === n * n;
    // WebGPU baseline limit: at most 256 invocations per workgroup.
    const fitsLimit = Number.parseInt(workgroupSizeX, 10) * Number.parseInt(workgroupSizeY, 10) <= 256;
    return sizesMatch && fitsLimit;
  },

  /**
   * Computes the dispatch grid so the whole n*n output is covered.
   * @returns {number[]} [workgroupsX, workgroupsY]
   */
  num_Workgroups: function (A, B, C, n, workgroupSizeX, workgroupSizeY) {
    const numWorkgroupsX = Math.ceil(n / Number.parseInt(workgroupSizeX, 10));
    const numWorkgroupsY = Math.ceil(n / Number.parseInt(workgroupSizeY, 10));
    return [numWorkgroupsX, numWorkgroupsY];
  },
};
Benchmark Results
No results yet. Edit the spec and click "Run Benchmark".
Debug Log
Logs will appear here when you run a benchmark.