/*
Compute shader that increments an SSBO, and copies it to a texture that is then displayed.

Usage:

    ./prog [width [work_group_width [cpu]]]]

- width: window width
- work_group_width. Must divide width.
- cpu: if '1', use CPU, else GPU. If set, work_group_width is ignored.

This illustrates that even without shared memory, there is a minimal
work group size that gives better performance. My FPS went up until a size of 16 was reached.

TODO why? Couldn't the NVIDIA compiler just auto join them in this case?

Work groups of size 1 are very slow (60FPS). 16 by 16 was 1400FPS!
*/

#include "common.h"

static const GLuint WIDTH = 512;
static const GLuint WORK_GROUP_WIDTH = 16;
static const GLfloat vertices_xy_uv[] = {
    -1.0,  1.0, 0.0, 1.0,
     1.0,  1.0, 0.0, 0.0,
     1.0, -1.0, 1.0, 0.0,
    -1.0, -1.0, 1.0, 1.0,
};
static const GLuint indices[] = {
    0, 1, 2,
    0, 2, 3,
};

static const GLchar *vertex_shader_source =
    "#version 330 core\n"
    "in vec2 coord2d;\n"
    "in vec2 vertexUv;\n"
    "out vec2 fragmentUv;\n"
    "void main() {\n"
    "    gl_Position = vec4(coord2d, 0, 1);\n"
    "    fragmentUv = vertexUv;\n"
    "}\n";
static const GLchar *fragment_shader_source =
    "#version 330 core\n"
    "in vec2 fragmentUv;\n"
    "out vec3 color;\n"
    "uniform sampler2D textureSampler;\n"
    "void main() {\n"
    "    float r = texture(textureSampler, fragmentUv.yx).r;\n"
    "    color = vec3(r, r, r);\n"
    "}\n";
static const char compute_shader_source_template[] =
    "#version 430\n"
    "layout (local_size_x = %d, local_size_y = %d) in;\n"
    "layout (r32f, binding = 0) uniform image2D img_output;\n"
    "layout (std430, binding=0) buffer temperatures {\n"
    "    float temperature[];\n"
    "};\n"
    "uniform uint width;\n"
    "void main() {\n"
    "    ivec2 gid = ivec2(gl_GlobalInvocationID.xy);\n"
    "    ivec2 dims = imageSize(img_output);\n"
    /* TODO: there must be a better way to do this with some GLSL magic? */
    "    uint i = gid.y * width + gid.x;\n"
    "    float t = temperature[i];\n"
    "    vec4 pixel = vec4(t, 0.0, 0.0, 1.0);\n"
    "    imageStore(img_output, gid, pixel);\n"
    /* TODO: does the above pass just a single float, or can we reduce memory bandwidth with something like: */
    /*"    imageStore(img_output, pixel_coords, t);\n"*/
    "    temperature[i] = mod(t + 0.01, 1.0);\n"
    "}\n";

void error_callback(int error, const char* description) {
    puts(description);
}

int main(int argc, char **argv) {
    GLFWwindow *window;
    GLfloat *temperatures;
    GLint
        coord2d_location,
        textureSampler_location,
        vertexUv_location,
        width_location
    ;
    GLuint
        compute_program,
        ebo,
        height,
        program,
        ssbo,
        texture,
        width,
        work_group_width,
        vao,
        vbo
    ;
    int cpu;
    size_t n_temperatures;
    char *compute_shader_source, *work_group_width_str;

    /* CLI arguments. */
    if (argc > 1) {
        width = strtol(argv[1], NULL, 10);
    } else {
        width = WIDTH;
    }
    height = width;
    if (argc > 2) {
        work_group_width = strtol(argv[2], NULL, 10);
    } else {
        work_group_width = WORK_GROUP_WIDTH;
    }
    if (argc > 3) {
        cpu = (argv[3][0] == '1');
    } else {
        cpu = 0;
    }

    /* Window. */
    glfwInit();
    glfwSetErrorCallback(error_callback);
    glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);
    window = glfwCreateWindow(width, height, __FILE__, NULL, NULL);
    glfwMakeContextCurrent(window);
    glfwSwapInterval(1);
    glewInit();

    /* Shader. */
    program = common_get_shader_program(vertex_shader_source, fragment_shader_source);
    coord2d_location = glGetAttribLocation(program, "coord2d");
    vertexUv_location = glGetAttribLocation(program, "vertexUv");
	textureSampler_location = glGetUniformLocation(program, "textureSampler");

    if (!cpu) {
        /* Compute shader. */
        int work_group_width_len = snprintf(NULL, 0, "%d", work_group_width);
        size_t compute_shader_source_len = sizeof(compute_shader_source_template) + 2 * work_group_width_len;
        compute_shader_source = malloc(compute_shader_source_len);
        snprintf(
            compute_shader_source,
            compute_shader_source_len,
            compute_shader_source_template,
            work_group_width,
            work_group_width
        );
        compute_program = common_get_compute_program(compute_shader_source);
        free(compute_shader_source);
        width_location = glGetUniformLocation(compute_program, "width");
    }

    /* vbo */
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices_xy_uv), vertices_xy_uv, GL_STATIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    /* ebo */
    glGenBuffers(1, &ebo);
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);

    /* vao */
    glGenVertexArrays(1, &vao);
    glBindVertexArray(vao);
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glVertexAttribPointer(coord2d_location, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(vertices_xy_uv[0]), (GLvoid*)0);
    glEnableVertexAttribArray(coord2d_location);
    glVertexAttribPointer(vertexUv_location, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(vertices_xy_uv[0]), (GLvoid*)(2 * sizeof(vertices_xy_uv[0])));
    glEnableVertexAttribArray(vertexUv_location);
    glBindVertexArray(0);

    /* ssbo */
    srand(time(NULL));
    n_temperatures = width * height;
    temperatures = malloc(n_temperatures * sizeof(temperatures[0]));
    for (size_t i = 0; i < n_temperatures; ++i) {
        /* Doable with just a simple fragment shader + time uniform, but easier to see what is going on. */
        temperatures[i] = 0.0;
        /* Not doable with simple fragment shader, so closer to real applications. */
        /*temperatures[i] = rand() / (float)RAND_MAX;*/
    }
    if (!cpu) {
        glGenBuffers(1, &ssbo);
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
        /* GL_DYNAMIC_COPY because we are copying to GPU memory several times. */
        glBufferData(GL_SHADER_STORAGE_BUFFER, n_temperatures * sizeof(temperatures[0]), temperatures, GL_DYNAMIC_COPY);
        /* 0 corresponds to the 0 of the shader at: "layout (std430, binding=0)". */
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo);
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
        free(temperatures);
    }

    /* Texture. */
    glGenTextures(1, &texture);
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, texture);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    if (!cpu) {
        glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, width, height, 0, GL_RED, GL_FLOAT, NULL);
        /* Bind to image unit so can write to specific pixels from the compute shader. */
        glBindImageTexture(0, texture, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32F);
    }

    /* Constant state. */
    glViewport(0, 0, width, height);
    glClearColor(1.0f, 1.0f, 1.0f, 1.0f);

    /* Main loop. */
    common_fps_init();
    while (!glfwWindowShouldClose(window)) {
        if (cpu) {
            for (unsigned int i = 0; i < height; ++i) {
                for (unsigned int j = 0; j < width; ++j) {
                    GLfloat *t = &temperatures[i * width + j];
                    *t = fmod(*t + 0.01, 1.0);
                }
            }
            /* GL_RED because there is no better way to do grayscale.
             * http://stackoverflow.com/questions/680125/can-i-use-a-grayscale-image-with-the-opengl-glteximage2d-function */
            glTexImage2D(
                GL_TEXTURE_2D, 0, GL_RED, width, height,
                0, GL_RED, GL_FLOAT, temperatures
            );
        } else {
            /* Compute. */
            glUseProgram(compute_program);
            glUniform1ui(width_location, width);
            glDispatchCompute((GLuint)width / work_group_width, (GLuint)height / work_group_width, 1);
            glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
        }

        /* Draw. */
        glClear(GL_COLOR_BUFFER_BIT);
        glUseProgram(program);
        glUniform1i(textureSampler_location, 0);
        glBindVertexArray(vao);
        glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
        glBindVertexArray(0);
        glfwSwapBuffers(window);

        glfwPollEvents();
        common_fps_print();
    }

    /* Cleanup. */
    glDeleteBuffers(1, &ebo);
    if (cpu) {
        free(temperatures);
    } else {
        glDeleteBuffers(1, &ssbo);
    }
    glDeleteBuffers(1, &vbo);
    glDeleteVertexArrays(1, &vao);
	glDeleteTextures(1, &texture);
    glDeleteProgram(program);
    glDeleteProgram(compute_program);
    glfwTerminate();
    return EXIT_SUCCESS;
}