cpp-cheat/opengl/glfw_compute_shader_ssbo_inc.c

265 lines
8.8 KiB
C

/*
Compute shader that increments an SSBO, and copies it to a texture that is then displayed.
Usage:
./prog [width [work_group_width [cpu]]]
- width: window width
- work_group_width: side length of the (square) compute work group. Must divide width.
- cpu: if '1', use CPU, else GPU. If set, work_group_width is ignored.
This illustrates that even without shared memory, there is a minimal
work group size that gives better performance. My FPS went up until a size of 16 was reached.
TODO why? Couldn't the NVIDIA compiler just auto join them in this case?
Work groups of size 1 are very slow (60FPS). 16 by 16 was 1400FPS!
*/
#include "common.h"

#include <limits.h>
/* Default side length, in pixels, of the square window when no width argument is given. */
static const GLuint WIDTH = 512u;
/* Default side length of the square compute work group when none is given. */
static const GLuint WORK_GROUP_WIDTH = 16u;
/*
Corners of a quad covering the whole viewport.
Each row is one vertex laid out as: x, y (position), u, v (texture coordinate).
*/
static const GLfloat vertices_xy_uv[] = {
    -1.0f,  1.0f,   0.0f, 1.0f,
     1.0f,  1.0f,   0.0f, 0.0f,
     1.0f, -1.0f,   1.0f, 0.0f,
    -1.0f, -1.0f,   1.0f, 1.0f,
};
/* Element indices into vertices_xy_uv: two triangles that together form the quad. */
static const GLuint indices[] = {
    0u, 1u, 2u,
    0u, 2u, 3u,
};
/*
Vertex shader of the display pass.
Inputs: coord2d (2D position) and vertexUv (texture coordinate).
The position is emitted with z = 0 and w = 1, and the texture coordinate
is forwarded unchanged to the fragment shader as fragmentUv.
*/
static const GLchar *vertex_shader_source =
"#version 330 core\n"
"in vec2 coord2d;\n"
"in vec2 vertexUv;\n"
"out vec2 fragmentUv;\n"
"void main() {\n"
" gl_Position = vec4(coord2d, 0, 1);\n"
" fragmentUv = vertexUv;\n"
"}\n";
/*
Fragment shader of the display pass.
Samples the red channel of textureSampler and writes it to all three color
channels, producing a grayscale image.
Note that the lookup uses fragmentUv.yx, i.e. the two texture coordinate
components are swapped before sampling.
*/
static const GLchar *fragment_shader_source =
"#version 330 core\n"
"in vec2 fragmentUv;\n"
"out vec3 color;\n"
"uniform sampler2D textureSampler;\n"
"void main() {\n"
" float r = texture(textureSampler, fragmentUv.yx).r;\n"
" color = vec3(r, r, r);\n"
"}\n";
/*
printf-style template of the compute shader.
The two %d conversions are the work group sizes in x and y; main() fills both
with the same work group width before compiling the result.

Per invocation the shader:
- computes the linear index i = gid.y * width + gid.x into temperature[],
- stores temperature[i] into the red channel of img_output at gid,
- then advances temperature[i] by 0.01, wrapping around at 1.0.

Host side pairing:
- img_output uses image binding 0 (see glBindImageTexture in main()),
- the temperatures buffer uses storage binding 0 (see glBindBufferBase in main()),
- the width uniform is set with glUniform1ui in main().
*/
static const char compute_shader_source_template[] =
"#version 430\n"
"layout (local_size_x = %d, local_size_y = %d) in;\n"
"layout (r32f, binding = 0) uniform image2D img_output;\n"
"layout (std430, binding=0) buffer temperatures {\n"
" float temperature[];\n"
"};\n"
"uniform uint width;\n"
"void main() {\n"
" ivec2 gid = ivec2(gl_GlobalInvocationID.xy);\n"
/* NOTE: dims is computed here but never read afterwards in this shader. */
" ivec2 dims = imageSize(img_output);\n"
/* TODO: there must be a better way to do this with some GLSL magic? */
" uint i = gid.y * width + gid.x;\n"
" float t = temperature[i];\n"
" vec4 pixel = vec4(t, 0.0, 0.0, 1.0);\n"
" imageStore(img_output, gid, pixel);\n"
/* TODO: does the above pass just a single float, or can we reduce memory bandwidth with something like: */
/*" imageStore(img_output, pixel_coords, t);\n"*/
" temperature[i] = mod(t + 0.01, 1.0);\n"
"}\n";
/*
GLFW error callback.
Writes the human readable description followed by a newline to stdout.
The numeric error code is intentionally not used.
*/
void error_callback(int error, const char* description) {
    (void)error;
    fputs(description, stdout);
    fputc('\n', stdout);
}
int main(int argc, char **argv) {
GLFWwindow *window;
GLfloat *temperatures;
GLint
coord2d_location,
textureSampler_location,
vertexUv_location,
width_location
;
GLuint
compute_program,
ebo,
height,
program,
ssbo,
texture,
width,
work_group_width,
vao,
vbo
;
int cpu;
size_t n_temperatures;
char *compute_shader_source, *work_group_width_str;
/* CLI arguments. */
if (argc > 1) {
width = strtol(argv[1], NULL, 10);
} else {
width = WIDTH;
}
height = width;
if (argc > 2) {
work_group_width = strtol(argv[2], NULL, 10);
} else {
work_group_width = WORK_GROUP_WIDTH;
}
if (argc > 3) {
cpu = (argv[3][0] == '1');
} else {
cpu = 0;
}
/* Window. */
glfwInit();
glfwSetErrorCallback(error_callback);
glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);
window = glfwCreateWindow(width, height, __FILE__, NULL, NULL);
glfwMakeContextCurrent(window);
glfwSwapInterval(1);
glewInit();
/* Shader. */
program = common_get_shader_program(vertex_shader_source, fragment_shader_source);
coord2d_location = glGetAttribLocation(program, "coord2d");
vertexUv_location = glGetAttribLocation(program, "vertexUv");
textureSampler_location = glGetUniformLocation(program, "textureSampler");
if (!cpu) {
/* Compute shader. */
int work_group_width_len = snprintf(NULL, 0, "%d", work_group_width);
size_t compute_shader_source_len = sizeof(compute_shader_source_template) + 2 * work_group_width_len;
compute_shader_source = malloc(compute_shader_source_len);
snprintf(
compute_shader_source,
compute_shader_source_len,
compute_shader_source_template,
work_group_width,
work_group_width
);
compute_program = common_get_compute_program(compute_shader_source);
free(compute_shader_source);
width_location = glGetUniformLocation(compute_program, "width");
}
/* vbo */
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices_xy_uv), vertices_xy_uv, GL_STATIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
/* ebo */
glGenBuffers(1, &ebo);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
/* vao */
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glVertexAttribPointer(coord2d_location, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(vertices_xy_uv[0]), (GLvoid*)0);
glEnableVertexAttribArray(coord2d_location);
glVertexAttribPointer(vertexUv_location, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(vertices_xy_uv[0]), (GLvoid*)(2 * sizeof(vertices_xy_uv[0])));
glEnableVertexAttribArray(vertexUv_location);
glBindVertexArray(0);
/* ssbo */
srand(time(NULL));
n_temperatures = width * height;
temperatures = malloc(n_temperatures * sizeof(temperatures[0]));
for (size_t i = 0; i < n_temperatures; ++i) {
/* Doable with just a simple fragment shader + time uniform, but easier to see what is going on. */
temperatures[i] = 0.0;
/* Not doable with simple fragment shader, so closer to real applications. */
/*temperatures[i] = rand() / (float)RAND_MAX;*/
}
if (!cpu) {
glGenBuffers(1, &ssbo);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
/* GL_DYNAMIC_COPY because we are copying to GPU memory several times. */
glBufferData(GL_SHADER_STORAGE_BUFFER, n_temperatures * sizeof(temperatures[0]), temperatures, GL_DYNAMIC_COPY);
/* 0 corresponds to the 0 of the shader at: "layout (std430, binding=0)". */
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
free(temperatures);
}
/* Texture. */
glGenTextures(1, &texture);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texture);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
if (!cpu) {
glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, width, height, 0, GL_RED, GL_FLOAT, NULL);
/* Bind to image unit so can write to specific pixels from the compute shader. */
glBindImageTexture(0, texture, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32F);
}
/* Constant state. */
glViewport(0, 0, width, height);
glClearColor(1.0f, 1.0f, 1.0f, 1.0f);
/* Main loop. */
common_fps_init();
while (!glfwWindowShouldClose(window)) {
if (cpu) {
for (unsigned int i = 0; i < height; ++i) {
for (unsigned int j = 0; j < width; ++j) {
GLfloat *t = &temperatures[i * width + j];
*t = fmod(*t + 0.01, 1.0);
}
}
/* GL_RED because there is no better way to do grayscale.
* http://stackoverflow.com/questions/680125/can-i-use-a-grayscale-image-with-the-opengl-glteximage2d-function */
glTexImage2D(
GL_TEXTURE_2D, 0, GL_RED, width, height,
0, GL_RED, GL_FLOAT, temperatures
);
} else {
/* Compute. */
glUseProgram(compute_program);
glUniform1ui(width_location, width);
glDispatchCompute((GLuint)width / work_group_width, (GLuint)height / work_group_width, 1);
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
}
/* Draw. */
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(program);
glUniform1i(textureSampler_location, 0);
glBindVertexArray(vao);
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
glBindVertexArray(0);
glfwSwapBuffers(window);
glfwPollEvents();
common_fps_print();
}
/* Cleanup. */
glDeleteBuffers(1, &ebo);
if (cpu) {
free(temperatures);
} else {
glDeleteBuffers(1, &ssbo);
}
glDeleteBuffers(1, &vbo);
glDeleteVertexArrays(1, &vao);
glDeleteTextures(1, &texture);
glDeleteProgram(program);
glDeleteProgram(compute_program);
glfwTerminate();
return EXIT_SUCCESS;
}