/*
Process an arbitrary input vector with a given shader and print output to stdout.

Sample usage:

    echo '1 2 3' | tr ' ' '\n' >vec_io.vec
    ./prog vec_io.cl vec_io.vec

Output:

    2.000000e+00
    3.000000e+00
    4.000000e+00

Or you can use the default kernel and stdin input:

    echo '1 2 3' | tr ' ' '\n' | ./prog

Multi vector operations can be done by reusing parts of the vector as in `+=`, e.g.:

	echo '1 2 3 4' | tr ' ' '\n' | ./vec_io.out vec_io_sum.cl

gives:

	4.000000e+00
	6.000000e+00
	3.000000e+00
	4.000000e+00

which represents:

	(1, 2) += (3, 4) == (4, 6)

Set global work size and work group size different than defaults (n and 1):

    ./prog -g 10 -l 5 vec_io.cl vec_io.vec

Generate a binary shader , and then use it (clCreateProgramWithBinary) instead of the CL C:

    ./prog vec_io.cl vec_io.vec
    ./prog -b vec_io.c.bin.tmp vec_io.vec

This allows you to modify the binary shader while reverse engineer it.

Generic boilerplate that:

- takes a vector as input either from stdin or from a file, one per line
- processes it with a kernel read from a file, one vector item per work item (configurable with options)
- produces as output a vector of the same size to stdout

This allows you to quickly play with different kernels without recompiling the C code.

But is unsuitable for real applications, which require  querying the CL implementation
for limits, specially work group and memory maximum sizes. Although you could use a script
to parse clinfo and get those values out... hmmm...
*/

#include "common.h"

int main(int argc, char **argv) {
    char *source_path;
    cl_float *io;
    cl_mem buffer;
    Common common;
    FILE *input_vector_file;
    float f;
    int a, global_work_size_given, use_cache;
    size_t i, global_work_size, local_work_size, n, nmax, io_sizeof;

    /* Treat CLI arguments. */
    global_work_size_given = 0;
    local_work_size = 1;
    use_cache = 0;
    for (a = 1; a < argc; ++a) {
        if (argv[a][0] == '-') {
            switch(argv[a][1]) {
                case 'b':
                    use_cache = 1;
                break;
                case 'g':
                    a++;
                    global_work_size = strtoul(argv[a], NULL, 10);
                    global_work_size_given = 1;
                break;
                case 'l':
                    a++;
                    local_work_size = strtoul(argv[a], NULL, 10);
                break;
            }
        } else {
            break;
        }
    }
    if (argc > a) {
        source_path = argv[a];
    } else {
        source_path = (char *)"vec_io.cl";
    }
    a++;
    if (argc > a) {
        input_vector_file = fopen(argv[a], "r");
    } else {
        input_vector_file = stdin;
    }

    /* Initialize data. */
    n = 0;
    nmax = n + 1;
    io = (cl_float *)malloc(nmax * sizeof(*io));
    while(fscanf(input_vector_file, "%f", &f) != EOF) {
        io[n] = f;
        n++;
        if (n == nmax) {
            nmax *= 2;
            io = (cl_float *)realloc(io, nmax * sizeof(*io));
        }
    }
    io_sizeof = n * sizeof(*io);
    if (!global_work_size_given) {
        global_work_size = n;
    }

    /* Create kernel. */
    common_create_kernel_or_use_cache(&common, use_cache, source_path, (char *)(__FILE__ ".bin.tmp"));
    buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, io_sizeof, io, NULL);
    clSetKernelArg(common.kernel, 0, sizeof(buffer), &buffer);
    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    clFlush(common.command_queue);
    clFinish(common.command_queue);
    clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, io_sizeof, io, 0, NULL, NULL);

    /* Print result. */
    for (i = 0; i < n; ++i) {
        printf("%.6e\n", io[i]);
    }

    /* Cleanup. */
    clReleaseMemObject(buffer);
    common_deinit(&common);
    free(io);
    fclose(input_vector_file);
    return EXIT_SUCCESS;
}