mirror of
https://github.com/libretro/cpp-cheat.git
synced 2025-04-03 03:12:24 +00:00
Start opencl for real
This commit is contained in:
parent
5475036c59
commit
e52e4475f9
17
bullet.md
Normal file
17
bullet.md
Normal file
@ -0,0 +1,17 @@
|
||||
# Bullet
|
||||
|
||||
## Build
|
||||
|
||||
Tested on Ubuntu 2.83.5 in Ubuntu 15.10.
|
||||
|
||||
git clone https://github.com/bulletphysics/bullet3
|
||||
cd build3
|
||||
./premake4_linux gmake
|
||||
cd gmake
|
||||
make
|
||||
|
||||
Outputs are `.a` and executables under `bin/`. The most interesting is:
|
||||
|
||||
./bin/App_ExampleBrowser_gmake_x64_release
|
||||
|
||||
which allows you to view with OpenGL and interact with mouse dragging with the examples under `examples/`.
|
@ -64,6 +64,7 @@
|
||||
1. [Parameter without name](parameter_without_name.c)
|
||||
1. [Static array argument](static_array_argument.c)
|
||||
1. [_Noreturn](noreturn.c)
|
||||
1. [Identifier list](identifier_list.c)
|
||||
1. [Operator](operator.c)
|
||||
1. [sizeof()](sizeof.c)
|
||||
1. [Sequence point](sequence_point.c)
|
||||
|
11
c/function.c
11
c/function.c
@ -1,5 +1,4 @@
|
||||
/*
|
||||
# function
|
||||
/*# function
|
||||
|
||||
A function is basically a branch, but in which you have to:
|
||||
|
||||
@ -462,13 +461,13 @@ int main() {
|
||||
|
||||
http://stackoverflow.com/questions/5481579/whats-the-difference-between-function-prototype-and-declaration
|
||||
|
||||
- Prototype is a declaration that specifies the arguments.
|
||||
- Prototype is a declaration that specifies the arguments.
|
||||
Only a single prototype can exist.
|
||||
|
||||
- a declaration can not be a prototype if it does not have any arguments.
|
||||
- a declaration can not be a prototype if it does not have any arguments.
|
||||
The arguments are left unspecified.
|
||||
|
||||
- to specify a prototype that takes no arguments, use `f(void)`
|
||||
- to specify a prototype that takes no arguments, use `f(void)`
|
||||
|
||||
In C++ the insanity is reduced, and every declaration is a prototype,
|
||||
so `f()` is the same as `f(void)`.
|
||||
@ -543,7 +542,7 @@ int main() {
|
||||
/*
|
||||
# K&R function declaration
|
||||
|
||||
This form of funciton declaration, while standard,
|
||||
This form of function declaration, while standard,
|
||||
is almost completely obsolete and forgotten today.
|
||||
|
||||
It is however still ANSI C.
|
||||
|
26
c/identifier_list.c
Normal file
26
c/identifier_list.c
Normal file
@ -0,0 +1,26 @@
|
||||
/*
|
||||
# Identifier list function declarator
|
||||
|
||||
Old style thing that should never be done today.
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* TODO without definition. Should never be done. Conforming or not? */
|
||||
/*int f(x, y);*/
|
||||
|
||||
int f(x, y)
|
||||
int x;
|
||||
int y;
|
||||
{ return x + y; }
|
||||
|
||||
/* Also identifier list: it is the only optional one. */
|
||||
void g() {}
|
||||
|
||||
/* Identifier type list. This one is not optional. */
|
||||
void h(void) {}
|
||||
|
||||
int main() {
|
||||
assert(f(1, 2) == 3);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
@ -10,3 +10,4 @@ Programs in this directory should be run manually one by one because they do thi
|
||||
1. [Command line arguments](command_line_arguments.c)
|
||||
1. [abort](abort.c)
|
||||
1. [clock](clock.c)
|
||||
1. [Ugly grammar](ugly_grammar.c)
|
||||
|
36
c/interactive/ugly_grammar.c
Normal file
36
c/interactive/ugly_grammar.c
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
# Ugly grammar
|
||||
|
||||
C allows for grammar obscenities to be compatible with a distant past.
|
||||
|
||||
Here are some perfectly legal jewels. Don't compile with `-pedantic-errors`, only `-std=c99`.
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* After you've removed -pedantic-errors. */
|
||||
#define ON 1
|
||||
|
||||
#if ON
|
||||
/* Empty declaration. */
|
||||
int;
|
||||
|
||||
/* Declaration without type. */
|
||||
a;
|
||||
f(void);
|
||||
int dec_arg_no_type(x, y)
|
||||
{ return 1; }
|
||||
|
||||
/* Declaration-list function arguments in declaration. TODO should be illegal? */
|
||||
int g(x, y);
|
||||
|
||||
/* Declaration-list function arguments in definition. */
|
||||
int g(x, y)
|
||||
int x;
|
||||
int y;
|
||||
{ return 1; }
|
||||
#endif
|
||||
|
||||
int main(void) {
|
||||
return EXIT_SUCCESS;
|
||||
}
|
@ -1,9 +0,0 @@
|
||||
.POSIX:
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
all:
|
||||
gcc -std=c89 -o main.out main.c -lOpenCL
|
||||
|
||||
clean:
|
||||
rm hello
|
@ -2,12 +2,20 @@
|
||||
|
||||
1. [Getting started](getting-started.md)
|
||||
1. Examples
|
||||
1. [min](min.c)
|
||||
1. [hello_world](hello_world.c)
|
||||
1. [increment](inc.c)
|
||||
1. [Build error](build_error.c)
|
||||
1. [Pass by value](pass_by_value.c)
|
||||
1. [Work item built-ins](work_item_builtin.c)
|
||||
1. [Increment vector](inc_vector.c)
|
||||
1. [Vector built-in](vector_builtin.c)
|
||||
1. Tools
|
||||
1. [clinfo](clinfo.md)
|
||||
1. Theory
|
||||
1. [Introduction](introduction.md)
|
||||
1. [Concepts](concepts.md)
|
||||
1. [Implementations](implementations.md)
|
||||
1. [Alternatives](alternatives.md)
|
||||
1. [Architecture](architecture.md)
|
||||
1. [C](c.md)
|
||||
1. [Host API](host-api.md)
|
||||
1. [Bibliography](bibliography.md)
|
||||
1. [TODO](TODO.md)
|
||||
|
@ -1,3 +1,8 @@
|
||||
# TODO
|
||||
|
||||
1. Compare speeds of `CL_DEVICE_TYPE_GPU` and `CL_DEVICE_TYPE_CPU`.
|
||||
1. synchronization, work_group_barrier https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/work_group_barrier.html || http://stackoverflow.com/questions/7673990/in-opencl-what-does-mem-fence-do-as-opposed-to-barrier https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/mem_fence.html `mem_fence` (TODO not in OpenCL 2?)
|
||||
1. images
|
||||
1. local and private memory to optimize things. Done in HandsOnOpenCL exercise 8 chapter 21 of 2011 OpenCL programming guide.
|
||||
1. create a bunch of educational and actually useful examples where GPU owns CPU and time them
|
||||
1. understand why kernels / work items / groups are SIMD, even if they seem completely independent. How does it work? They can only be parallel is the same instruction is to be used on all kernels at once? What breaks it's efficiency? Branching clearly does: we could do an `switch (get_global_id())` and have completely different code running on each kernel. Looks like that is correct: https://news.ycombinator.com/item?id=1969631 | http://stackoverflow.com/questions/5897454/conditionals-in-gpu-programming
|
||||
1. how much parallelism do GPUs actually have? http://stackoverflow.com/questions/6490572/cuda-how-many-concurrent-threads-in-total | http://gamedev.stackexchange.com/questions/17243/how-many-parallel-units-does-a-gpu-have Depends on what that means, data parallelism? Don't forge that CPU's have 4 wide SIMD nowadays.
|
||||
|
@ -6,24 +6,18 @@ NVIDIA's. More closed since controlled by NVIDIA. Also more popular for the same
|
||||
|
||||
<https://www.reddit.com/r/programming/comments/49uw97/cuda_reverse_engineered_to_run_on_nonnvidia/>
|
||||
|
||||
<https://en.wikipedia.org/wiki/CUDA> NVIDIA's, only runs in NVIDIA hardware. TODO could AMD implement it legally without paying royalties to NVIDIA?
|
||||
|
||||
## OpenMP
|
||||
|
||||
<http://stackoverflow.com/questions/7263193/opencl-performance-vs-openmp>
|
||||
|
||||
## RenderScript
|
||||
|
||||
Google's choice for Android: <http://stackoverflow.com/questions/14385843/why-did-google-choose-renderscript-instead-of-opencl>
|
||||
|
||||
Google somewhat opposes OpenCL, maybe because it was created by Apple?
|
||||
|
||||
## Vulkan
|
||||
|
||||
<https://en.wikipedia.org/wiki/Vulkan_%28API%29>
|
||||
|
||||
Also by Khronos.
|
||||
|
||||
TODO why another?
|
||||
|
||||
- <http://gamedev.stackexchange.com/questions/96014/what-is-vulkan-and-how-does-it-differ-from-opengl>
|
||||
|
||||
Derived from <https://en.wikipedia.org/wiki/Mantle_%28API%29> by AMD, now abandoned in favor of Vulkan, and will somewhat be the new OpenGL.
|
||||
|
||||
## Metal
|
||||
|
||||
<https://en.wikipedia.org/wiki/Metal_%28API%29>
|
||||
@ -33,3 +27,20 @@ Apple's response to Google's RenderScript.
|
||||
## DirectX
|
||||
|
||||
Microsoft, Windows, Xbox.
|
||||
|
||||
## Cilk
|
||||
|
||||
<https://en.wikipedia.org/wiki/Cilk>
|
||||
|
||||
Intel's
|
||||
|
||||
## DirectCompute
|
||||
|
||||
<https://en.wikipedia.org/wiki/DirectCompute>
|
||||
|
||||
Microsoft's
|
||||
|
||||
## Unified parallel C
|
||||
|
||||
- <https://en.wikipedia.org/wiki/Unified_Parallel_C>
|
||||
- OpenGL compute shaders <http://stackoverflow.com/questions/15868498/what-is-the-difference-between-opencl-and-opengls-compute-shader>
|
||||
|
@ -1,8 +1,30 @@
|
||||
# Applications
|
||||
|
||||
For an application to experience speedup compared to the CPU, it must:
|
||||
|
||||
- be highly parallelizable
|
||||
- do a lot of work per input byte, because IO is very expensive
|
||||
|
||||
## Actual applications
|
||||
|
||||
- Monte Carlo
|
||||
|
||||
- PDEs
|
||||
|
||||
- <https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model>
|
||||
- Reverse Time Migration: RTM <http://www.slb.com/services/seismic/geophysical_processing_characterization/dp/technologies/depth/prestackdepth/rtm.aspx>
|
||||
|
||||
Matrix multiplication:
|
||||
|
||||
- <http://hpclab.blogspot.fr/2011/09/is-gpu-good-for-large-vector-addition.html>
|
||||
- <https://developer.nvidia.com/cublas>
|
||||
|
||||
Not surprising, since rendering is just a bunch of matrix multiplications, with fixed matrices and varying vectors.
|
||||
|
||||
Sparse: <http://stackoverflow.com/questions/3438826/sparse-matrix-multiplication-on-gpu-or-cpu>
|
||||
|
||||
Bolt: C++ STL GPU powered implementation by AMD: <http://developer.amd.com/tools-and-sdks/opencl-zone/bolt-c-template-library/>
|
||||
|
||||
## Non-applications
|
||||
|
||||
Vector addition. Too little work per input byte (1 CPU cycle). <https://forums.khronos.org/showthread.php/7741-CPU-faster-in-vector-addition-than-GPU>, <http://stackoverflow.com/questions/15194798/vector-step-addition-slower-on-cuda> <http://hpclab.blogspot.fr/2011/09/is-gpu-good-for-large-vector-addition.html>
|
||||
|
94
opencl/architecture.md
Normal file
94
opencl/architecture.md
Normal file
@ -0,0 +1,94 @@
|
||||
# Concepts
|
||||
|
||||
Hierarchy from top to bottom:
|
||||
|
||||
- Host: the entire system
|
||||
- Device group: multiple devices, e.g. one GPU and one CPU.
|
||||
- Compute device
|
||||
- Compute unit
|
||||
- Processing element
|
||||
- Work group
|
||||
- Work item
|
||||
|
||||
## TODO
|
||||
|
||||
GPU vs CPU hardware level.
|
||||
|
||||
<https://youtu.be/e-2bTxKuS2U?list=PLTfYiv7-a3l7mYEdjk35wfY-KQj5yVXO2&t=319> mentions GPU has no cache.
|
||||
|
||||
## Platform
|
||||
|
||||
TODO what is a platform?
|
||||
|
||||
<http://stackoverflow.com/questions/3444664/does-any-opencl-host-have-more-than-one-platform>
|
||||
|
||||
## Compute device
|
||||
|
||||
One CPU, one GPU, etc.
|
||||
|
||||
## Compute unit
|
||||
|
||||
TODO vs core?
|
||||
|
||||
Can be obtained with: `clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS)`
|
||||
|
||||
## Processing element
|
||||
|
||||
TODO
|
||||
|
||||
## Work group
|
||||
|
||||
Contains many work items.
|
||||
|
||||
Work items inside the same work group can share local memory, and can synchronize.
|
||||
|
||||
Work groups have a maximum size (otherwise the concept wouldn't even exist).
|
||||
|
||||
Ideally we would like to have a single work group for all items, as that would allow us to worry less about the location of memory on the Global / Constant / Local / Private hierarchy.
|
||||
|
||||
But memory localization on GPUs is important enough that OpenCL exposes this extra level.
|
||||
|
||||
Synchronization only works inside a single work groups: http://stackoverflow.com/questions/5895001/opencl-synchronization-between-work-groups
|
||||
|
||||
### Local size
|
||||
|
||||
Size of the work group.
|
||||
|
||||
On CPU: always 1. TODO why?
|
||||
|
||||
On GPU; must divide Global size.
|
||||
|
||||
### Uniform work group
|
||||
|
||||
### Non-uniform work group
|
||||
|
||||
Work groups with different sizes.
|
||||
|
||||
Application: take care of edge cases of the data, e.g. image edges: <https://software.intel.com/en-us/articles/opencl-20-non-uniform-work-groups>
|
||||
|
||||
## Work item
|
||||
|
||||
Each work item runs your kernel code in parallel to the other ones.
|
||||
|
||||
An work item can be seen as a thread.
|
||||
|
||||
Contains private memory, which no other work item can see.
|
||||
|
||||
## Local and Private memory
|
||||
|
||||
TODO: why use those at all instead of global memory?
|
||||
|
||||
- <http://stackoverflow.com/questions/21872810/whats-the-advantage-of-the-local-memory-in-opencl>
|
||||
- <http://stackoverflow.com/questions/9885880/effect-of-private-memory-in-opencl>
|
||||
|
||||
Might be faster, and global memory is limited.
|
||||
|
||||
HandsOnOpencl Example 8 shows how matrix multiplication becomes 10x faster with some local memory usage. Looks like memory access was the bottleneck.
|
||||
|
||||
It also shows how we must make an explicit copy to use private memory.
|
||||
|
||||
### Local memory
|
||||
|
||||
- <http://stackoverflow.com/questions/8888718/how-to-declare-local-memory-in-opencl>
|
||||
- <http://stackoverflow.com/questions/2541929/how-do-i-use-local-memory-in-opencl>
|
||||
- <http://stackoverflow.com/questions/17574570/create-local-array-dynamic-inside-opencl-kernel>
|
@ -2,16 +2,44 @@
|
||||
|
||||
Khronos standards:
|
||||
|
||||
- <https://www.khronos.org/registry/cl/>
|
||||
- <https://www.khronos.org/registry/cl/>
|
||||
|
||||
Qualifiers: <https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/qualifiers.html>
|
||||
|
||||
Tutorials with sample code:
|
||||
|
||||
- <https://github.com/sschaetz/nvidia-opencl-examples> hosts the samples from <https://developer.nvidia.com/opencl>. But a header is missing: <https://github.com/sschaetz/nvidia-opencl-examples/issues/1>
|
||||
- <http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-2-0-samples/> AMD samples. TODO could not find them yet...
|
||||
- <https://github.com/enjalot/adventures_in_opencl> Worked after `sudo apt-get install libxmu-dev libxi-dev`.
|
||||
- <https://github.com/bgaster/opencl-book-samples> The book is commercial.
|
||||
- <https://github.com/vpeurala/openclhelloworld> Simple hello world.
|
||||
- <https://github.com/HandsOnOpenCL/Exercises-Solutions> Vector addition, matrix multiplication, Conway's game of life. On Ubuntu 15.10, I had to comment out some constants on `err.h`, but it worked fine.
|
||||
- <https://github.com/HandsOnOpenCL/Exercises-Solutions>
|
||||
|
||||
- vector addition
|
||||
- great matrix multiplication on Example 8 with multiple methods compared for speed
|
||||
- [rectangle method](https://en.wikipedia.org/wiki/Rectangle_method) integration
|
||||
- Conway's game of life.
|
||||
|
||||
On Ubuntu 15.10 NVIDIA, I had to comment out some constants on `err.h`, but C worked fine.
|
||||
|
||||
C++ does not compile. First it includes `CL/OpenCL.h` instead of `CL/cl.h`, and after that missing symbols.
|
||||
|
||||
- <https://github.com/vpeurala/openclhelloworld> Simple hello world.
|
||||
|
||||
- <https://github.com/bgaster/opencl-book-samples>, for the book OpenCL programming guide.
|
||||
|
||||
The book is commercial <http://www.heterogeneouscompute.org/?page_id=5>. It is a good read. Part II has many application case studies.
|
||||
|
||||
Cannot compile most examples, several OpenCL are missing in Chapter 7 on , e.g. `clCreateSubBuffer`.
|
||||
|
||||
- <https://github.com/enjalot/adventures_in_opencl> TODO get working. The following might help: `sudo apt-get install libxmu-dev libxi-dev && sudo pip install pyopencl`.
|
||||
|
||||
- <https://bitbucket.org/erwincoumans/opencl_course> A few examples: image rotation. Build failed with: `Error: solution '0MySolution' needs at least one project`, but if I cd into directories and do `g++ main.cpp -lOpenCL` it works mostly.
|
||||
|
||||
Does platform selection based on vendor string.
|
||||
|
||||
Tutorials:
|
||||
|
||||
- <http://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook> Most of it deals with setup. Does have a few concepts. Little code, and not version tracked.
|
||||
|
||||
Video tutorials:
|
||||
|
||||
- David Gohara <https://www.youtube.com/watch?v=QA483lIvL-4&list=PLTfYiv7-a3l7mYEdjk35wfY-KQj5yVXO2>
|
||||
|
||||
Demos:
|
||||
|
||||
@ -20,7 +48,6 @@ Demos:
|
||||
Resources:
|
||||
|
||||
- <http://developer.amd.com/tools-and-sdks/opencl-zone/opencl-resources/>
|
||||
- <http://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook>
|
||||
- The specifications of your hardware, e.g. <http://www.nvidia.com/object/nvs_techspecs.html>
|
||||
|
||||
Big programs:
|
||||
@ -40,3 +67,38 @@ Big programs:
|
||||
- Interactive fluid simulations
|
||||
|
||||
- <https://www.youtube.com/watch?v=LGTxZRRwvsI>
|
||||
|
||||
## NVIDIA samples
|
||||
|
||||
Used to be part of the GPU computing SDK, then renamed CUDA SDK.
|
||||
|
||||
<https://github.com/sschaetz/nvidia-opencl-examples> hosts the samples from <https://developer.nvidia.com/opencl> SDK 4.2.9, which have to be downloaded one by one!
|
||||
|
||||
<https://github.com/marwan-abdellah/GPU-Computing-SDK-4.2.9/> hosts a superset, but that again fails with <https://github.com/marwan-abdellah/GPU-Computing-SDK-4.2.9/issues/1>
|
||||
|
||||
But a header is missing and it does not compile: <https://github.com/sschaetz/nvidia-opencl-examples/issues/1>
|
||||
|
||||
I'm not the only one who noticed: <https://streamcomputing.eu/blog/2012-09-10/nvidias-industry-leading-support-for-opencl/>
|
||||
|
||||
## AMD samples
|
||||
|
||||
<http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk>
|
||||
|
||||
Come with the SDK.
|
||||
|
||||
Tested version 3.0. Most examples work, except a few that depend on extensions which NVIDIA didn't have.
|
||||
|
||||
To compile the examples:
|
||||
|
||||
ln -fs /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 lib/x86_64/libOpenCL.so`
|
||||
cd samples/opencl/cl/1.x
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build .
|
||||
|
||||
Binaries fall under the `bin/` directory of each examples.
|
||||
|
||||
The SDK also comes with pre-built binaries under `samples/opencl/bin`. Just make sure you only run the ones whose source is under `1.x` if that's all that your implementation supports. They work fine.
|
||||
|
||||
License: looks like a custom MIT, you can redistribute, modify and reuse samples.
|
||||
|
42
opencl/build_error.c
Normal file
42
opencl/build_error.c
Normal file
@ -0,0 +1,42 @@
|
||||
/*
|
||||
http://stackoverflow.com/questions/9464190/error-code-11-what-are-all-possible-reasons-of-getting-error-cl-build-prog
|
||||
|
||||
Real programs should always check for build failures and print the error message,
|
||||
which can contain great debug messages (piped from some forked compiler :-))
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#define COUNT (7)
|
||||
|
||||
int main(void) {
|
||||
const char *source =
|
||||
"__kernel void main(__global size_t *out) {\n"
|
||||
" asdfqwer;"
|
||||
"}\n";
|
||||
cl_context context;
|
||||
cl_device_id device;
|
||||
cl_platform_id platform;
|
||||
cl_program program;
|
||||
|
||||
clGetPlatformIDs(1, &platform, NULL);
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
{
|
||||
size_t len;
|
||||
char error_message[2048];
|
||||
cl_int err = clBuildProgram(program, 1, &device, "", NULL, NULL);
|
||||
if (err != CL_SUCCESS) {
|
||||
/* Would be wise to have a switch that transforms all possible codes into strings here. */
|
||||
printf("clBuildProgram failed. Error code: %d\n", err);
|
||||
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(error_message), error_message, &len);
|
||||
printf("error message = %s\n", error_message);
|
||||
}
|
||||
}
|
||||
return EXIT_SUCCESS;
|
||||
}
|
49
opencl/c.md
Normal file
49
opencl/c.md
Normal file
@ -0,0 +1,49 @@
|
||||
# C
|
||||
|
||||
OpenCL C is a C99 variant in which kernels must be written.
|
||||
|
||||
<https://en.wikipedia.org/wiki/OpenCL#OpenCL_C_language>
|
||||
|
||||
C99 extensions:
|
||||
|
||||
- new qualifiers
|
||||
- new types, in particular short vector types and image types: <https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/dataTypes.html>
|
||||
- new built-in functions
|
||||
|
||||
C99 restrictions: <https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/restrictions.html>
|
||||
|
||||
- pointers to functions are not allowed
|
||||
|
||||
## Qualifiers
|
||||
|
||||
<https://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/qualifiers.html>
|
||||
|
||||
### __global
|
||||
|
||||
### __local
|
||||
|
||||
Shared across the work group.
|
||||
|
||||
### __constant
|
||||
|
||||
### __private
|
||||
|
||||
Default qualifier for arguments and local function variables: <https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/global.html>
|
||||
|
||||
### __kernel
|
||||
|
||||
Marks a function that can be called from the host.
|
||||
|
||||
Must return `void`, <https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/restrictions.html>
|
||||
|
||||
<https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/restrictions.html>:
|
||||
|
||||
> Arguments to kernel functions declared in a program that are pointers must be declared with the __global, __constant, or __local qualifier.
|
||||
|
||||
## Built-in functions
|
||||
|
||||
- <https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/integerFunctions.html>
|
||||
- <https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/workItemFunctions.html>
|
||||
- synchronization, e.g. `work_group_barrier` (ex `barrier`)
|
||||
|
||||
Those may in particular take vector type arguments.
|
@ -1,15 +0,0 @@
|
||||
# Concepts
|
||||
|
||||
## Platform
|
||||
|
||||
TODO what is a platform?
|
||||
|
||||
<http://stackoverflow.com/questions/3444664/does-any-opencl-host-have-more-than-one-platform>
|
||||
|
||||
## Compute unit
|
||||
|
||||
TODO vs core? Each compute unit has a number of cores it seems:
|
||||
|
||||
## Work group
|
||||
|
||||
TODO
|
@ -1,5 +1,7 @@
|
||||
# Getting started
|
||||
|
||||
Tested in Ubuntu 15.10 NVIDIA 352.
|
||||
|
||||
## NVIDIA
|
||||
|
||||
On Ubuntu 15.10 with an NVIDIA NVS 5400M, Lenovo T430: <http://askubuntu.com/questions/541114/how-to-make-opencl-work-on-14-10-nvidia-331-89-drivers/693043#693043>
|
||||
|
@ -1,67 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#define MEM_SIZE (128)
|
||||
|
||||
static const char *source_str =
|
||||
"__kernel void hello(__global char *string) {\n"
|
||||
" string[0] = 'h';\n"
|
||||
" string[1] = 'e';\n"
|
||||
" string[2] = 'l';\n"
|
||||
" string[3] = 'l';\n"
|
||||
" string[4] = 'o';\n"
|
||||
" string[5] = ' ';\n"
|
||||
" string[6] = 'w';\n"
|
||||
" string[7] = 'o';\n"
|
||||
" string[8] = 'r';\n"
|
||||
" string[9] = 'l';\n"
|
||||
" string[10] = 'd';\n"
|
||||
" string[11] = '\\0';\n"
|
||||
"}\n";
|
||||
|
||||
int main(void) {
|
||||
cl_device_id device_id = NULL;
|
||||
cl_context context = NULL;
|
||||
cl_command_queue command_queue = NULL;
|
||||
cl_mem memobj = NULL;
|
||||
cl_program program = NULL;
|
||||
cl_kernel kernel = NULL;
|
||||
cl_platform_id platform_id = NULL;
|
||||
cl_uint ret_num_devices;
|
||||
cl_uint ret_num_platforms;
|
||||
cl_int ret;
|
||||
char string[MEM_SIZE];
|
||||
size_t source_size;
|
||||
|
||||
source_size = strlen(source_str);
|
||||
|
||||
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
|
||||
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
|
||||
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
|
||||
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
|
||||
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(char), NULL, &ret);
|
||||
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
|
||||
(const size_t *)&source_size, &ret);
|
||||
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
|
||||
kernel = clCreateKernel(program, "hello", &ret);
|
||||
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
|
||||
|
||||
/* Execute OpenCL Kernel */
|
||||
ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
|
||||
/* Copy results from the memory buffer */
|
||||
ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0,
|
||||
MEM_SIZE * sizeof(char),string, 0, NULL, NULL);
|
||||
puts(string);
|
||||
|
||||
ret = clFlush(command_queue);
|
||||
ret = clFinish(command_queue);
|
||||
ret = clReleaseKernel(kernel);
|
||||
ret = clReleaseProgram(program);
|
||||
ret = clReleaseMemObject(memobj);
|
||||
ret = clReleaseCommandQueue(command_queue);
|
||||
ret = clReleaseContext(context);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
61
opencl/host-api.md
Normal file
61
opencl/host-api.md
Normal file
@ -0,0 +1,61 @@
|
||||
# Host API
|
||||
|
||||
## clEnqueueNDRangeKernel
|
||||
|
||||
<https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/clEnqueueNDRangeKernel.html>
|
||||
|
||||
The single most important function of the API: dispatches kernels into work items and groups.
|
||||
|
||||
`NDRange` stands for N-Dimensional Range.
|
||||
|
||||
Most important arguments:
|
||||
|
||||
- `work_dim`: how many dimensions will be used. Usually maximum 3 are allowed.
|
||||
|
||||
E.g., if `work_dim` is 2, then each work item is identified by 2 numbers.
|
||||
|
||||
TODO: why are work groups and work items treated as multi-dimensional? <http://stackoverflow.com/questions/34294133/why-are-opencl-work-groups-3-dimensional> Looks like it does not reflect in any way GPU architecture: adjacent groups don't communicate any faster, it's just to make code more readable.
|
||||
|
||||
Main use cases:
|
||||
|
||||
- 2D: 2D images
|
||||
- 3D: 2D images with multiple layers of information. The third dimension for example is smaller on some GPUs, as there are usually less layers than pixels.
|
||||
|
||||
- `global_work_offset`: that is the first global work item ID. Default if `NULL`: `0, 0, 0, ...`.
|
||||
|
||||
Each dimension of the global id can be retrieved in kernels with `get_global_id(dim)`.
|
||||
|
||||
- `global_work_size`: total number of work items we want
|
||||
|
||||
- `local_work_size`: how many work items will go into each work group.
|
||||
|
||||
Should normally divide `global_work_size`.
|
||||
|
||||
If `NULL`, OpenCL does smart things.
|
||||
|
||||
This determines the value of `get_local_id` in kernels. E.g., if we have:
|
||||
|
||||
- `work_dim == 1`
|
||||
- `global_work_offset == 0`
|
||||
- `global_work_size == 6`
|
||||
- `local_work_size == 3`
|
||||
|
||||
Then the work item with global ID `4` has local ID 1:
|
||||
|
||||
Global ID 0 1 2 3 4 5
|
||||
|---| |---|
|
||||
Local ID 0 1 2 0 1 2
|
||||
|
||||
In the past, all groups had the same number of work items, but OpenCL 2 non-uniform groups made things more complex and convenient.
|
||||
|
||||
## clEnqueueTask
|
||||
|
||||
Convenient subset of <https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueTask.html>
|
||||
|
||||
Maximum one task at a time, not made for parallelism, so not very useful in practice.
|
||||
|
||||
## How to initialize a buffer
|
||||
|
||||
`clCreateBuffer` + `CL_MEM_COPY_HOST_PTR` vs. `clCreateBuffer` + `clEnqueueWriteBuffer`?
|
||||
|
||||
<http://stackoverflow.com/questions/3832963/two-ways-to-create-a-buffer-object-in-opencl-clcreatebuffer-vs-clcreatebuffer>
|
58
opencl/implementations.md
Normal file
58
opencl/implementations.md
Normal file
@ -0,0 +1,58 @@
|
||||
# Implementations
|
||||
|
||||
<https://en.wikipedia.org/wiki/OpenCL#Implementations>
|
||||
|
||||
There is a certain "installable client driver loader (ICD loader)" which forwards calls to the proprietary implementation.
|
||||
|
||||
## Gallium Compute
|
||||
|
||||
<http://www.x.org/wiki/XorgEVoC/GalliumCompute/>
|
||||
|
||||
Looks like the major open source implementation.
|
||||
|
||||
## pocl
|
||||
|
||||
Portable OpenCL implementation.
|
||||
|
||||
<https://github.com/pocl/pocl>
|
||||
|
||||
<http://portablecl.org/>
|
||||
|
||||
MIT and LLVM based.
|
||||
|
||||
## NVIDIA
|
||||
|
||||
Implements it, but way after CUDA features.
|
||||
|
||||
- <http://stackoverflow.com/questions/20086153/is-there-a-way-to-upgrade-to-opencl-2-0>
|
||||
- <https://streamcomputing.eu/blog/2012-09-10/nvidias-industry-leading-support-for-opencl/>
|
||||
|
||||
Likely their monopoly + anti antitrust tactic.
|
||||
|
||||
Offers official Ubuntu packages as of 15.10.
|
||||
|
||||
TODO why are many symbols missing even though NVIDIA claims to support a given OpenCL version?
|
||||
|
||||
- `clCreateSubBuffer`
|
||||
- `clEnqueueReadBufferRect`
|
||||
|
||||
Threads:
|
||||
|
||||
- <https://devtalk.nvidia.com/default/topic/572548/support-for-opencl-1-2-gtx-690-ubuntu>
|
||||
- <http://stackoverflow.com/questions/3271243/clcreatesubbuffer-not-found-oo>
|
||||
- <https://devtalk.nvidia.com/default/topic/486564/nvidia-39-s-opencl-1-1-and-clcreatesubbuffer/>
|
||||
|
||||
## Intel
|
||||
|
||||
No official Linux packages.
|
||||
|
||||
- <http://askubuntu.com/questions/545763/installation-of-intel-opencl>
|
||||
- <http://askubuntu.com/questions/629456/intel-opencl-on-14-04>
|
||||
|
||||
## Altera FPGA
|
||||
|
||||
<https://www.altera.com/products/design-software/embedded-software-developers/opencl/overview.html>
|
||||
|
||||
## Transpiler
|
||||
|
||||
- <http://repo.or.cz/w/ppcg.git> C99 to OpenCL.
|
44
opencl/inc.c
Normal file
44
opencl/inc.c
Normal file
@ -0,0 +1,44 @@
|
||||
/*
|
||||
Pass an int by reference and increment it.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
int main(void) {
|
||||
const char *source =
|
||||
/* kernel pointer arguments must be __global, __constant, or __local. */
|
||||
/* https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/restrictions.html */
|
||||
"__kernel void increment(__global int *out) {\n"
|
||||
" out[0]++;\n"
|
||||
"}\n";
|
||||
cl_command_queue command_queue;
|
||||
cl_context context;
|
||||
cl_device_id device;
|
||||
cl_int input = 1;
|
||||
cl_kernel kernel;
|
||||
cl_mem buffer;
|
||||
cl_platform_id platform;
|
||||
cl_program program;
|
||||
|
||||
clGetPlatformIDs(1, &platform, NULL);
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||
command_queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||
buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), &input, NULL);
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
clBuildProgram(program, 1, &device, "", NULL, NULL);
|
||||
/* The name of the kernel function we want to call. */
|
||||
kernel = clCreateKernel(program, "increment", NULL);
|
||||
clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
|
||||
clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
|
||||
clFlush(command_queue);
|
||||
clFinish(command_queue);
|
||||
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(cl_int), &input, 0, NULL, NULL);
|
||||
|
||||
assert(input == 2);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
45
opencl/inc_vector.c
Normal file
45
opencl/inc_vector.c
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
Increment a vector, one value per work item.
|
||||
|
||||
It is useless to do this on a GPU, not enough work / IO,
|
||||
it's just a clEnqueueNDRangeKernel + get_global_id hello world.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
int main(void) {
|
||||
const char *source =
|
||||
"__kernel void main(__global int *out) {\n"
|
||||
" out[get_global_id(0)]++;\n"
|
||||
"}\n";
|
||||
cl_command_queue command_queue;
|
||||
cl_context context;
|
||||
cl_device_id device;
|
||||
cl_int input[] = {1, 2};
|
||||
const size_t global_work_size = sizeof(input) / sizeof(cl_int);
|
||||
cl_kernel kernel;
|
||||
cl_mem buffer;
|
||||
cl_platform_id platform;
|
||||
cl_program program;
|
||||
|
||||
clGetPlatformIDs(1, &platform, NULL);
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||
command_queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||
buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), &input, NULL);
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
clBuildProgram(program, 1, &device, "", NULL, NULL);
|
||||
kernel = clCreateKernel(program, "main", NULL);
|
||||
clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
|
||||
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
|
||||
clFlush(command_queue);
|
||||
clFinish(command_queue);
|
||||
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL);
|
||||
|
||||
assert(input[0] == 2);
|
||||
assert(input[1] == 3);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
@ -14,41 +14,16 @@ The OpenCL standard in maintained by the Khronos Group, the same guys who mainta
|
||||
|
||||
OpenCL, like any other language has versions. As of 2013 the latest version is OpenCL 2.0 released in preview (unstable) as of Jul 2013.
|
||||
|
||||
## Implementations
|
||||
## Versions
|
||||
|
||||
<https://en.wikipedia.org/wiki/OpenCL#Implementations>
|
||||
1.0: 2009, Apple.
|
||||
1.1: 2010.
|
||||
1.2: 2011.
|
||||
2.0: 2013. AMD support: 2015. <http://developer.amd.com/community/blog/2015/08/26/introducing-app-sdk-30-opencl-2/>
|
||||
|
||||
There is a certain "installable client driver loader (ICD loader)" which forwards calls to the proprietary implementation.
|
||||
|
||||
### Gallium Compute
|
||||
|
||||
<http://www.x.org/wiki/XorgEVoC/GalliumCompute/>
|
||||
|
||||
Looks like the major open source implementation.
|
||||
|
||||
### pocl
|
||||
|
||||
Portable OpenCL implementation.
|
||||
|
||||
<https://github.com/pocl/pocl>
|
||||
|
||||
<http://portablecl.org/>
|
||||
|
||||
MIT and LLVM based.
|
||||
|
||||
## Tools
|
||||
|
||||
- <http://repo.or.cz/w/ppcg.git> C99 to OpenCL.
|
||||
|
||||
## Alternatives
|
||||
|
||||
- <https://en.wikipedia.org/wiki/CUDA> NVIDIA's
|
||||
- OpenMP <http://stackoverflow.com/questions/7263193/opencl-performance-vs-openmp>
|
||||
- <https://en.wikipedia.org/wiki/Cilk> Intel's
|
||||
- <https://en.wikipedia.org/wiki/DirectCompute> Microsoft's
|
||||
- <https://en.wikipedia.org/wiki/Unified_Parallel_C>
|
||||
|
||||
## OpenCL + OpenGL for real time rendering of complex calculations
|
||||
## OpenCL and OpenGL integration
|
||||
|
||||
- <http://stackoverflow.com/questions/4005935/mix-opencl-with-opengl>
|
||||
- <http://enja.org/2010/08/27/adventures-in-opencl-part-2-particles-with-opengl/>
|
||||
|
||||
Also see compute shaders for OpenGL, they seem to integrate better.
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
Minimal OpenCL program that does something observable: increment a number.
|
||||
Increment a vector. It is useless to do this on a GPU, not enough work / IO.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
@ -8,33 +8,35 @@ Minimal OpenCL program that does something observable: increment a number.
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
int main() {
|
||||
int main(void) {
|
||||
const char *source =
|
||||
"__kernel void main(int in, __global int *out) {\n"
|
||||
" out[0] = in + 1;\n"
|
||||
"}\n";
|
||||
cl_command_queue command_queue;
|
||||
cl_context context;
|
||||
cl_device_id device;
|
||||
cl_int input = 1;
|
||||
cl_int kernel_result = 0;
|
||||
cl_kernel kernel;
|
||||
cl_mem buffer;
|
||||
cl_platform_id platform;
|
||||
cl_program program;
|
||||
const char *source = "__kernel void increment(int in, __global int* out) { out[0] = in + 1; }";
|
||||
|
||||
clGetPlatformIDs(1, &platform, NULL);
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||
command_queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||
buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_int), NULL, NULL);
|
||||
buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, NULL);
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
clBuildProgram(program, 1, &device, "", NULL, NULL);
|
||||
kernel = clCreateKernel(program, "increment", NULL);
|
||||
kernel = clCreateKernel(program, "main", NULL);
|
||||
clSetKernelArg(kernel, 0, sizeof(cl_int), &input);
|
||||
clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer);
|
||||
clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
|
||||
clFlush(command_queue);
|
||||
clFinish(command_queue);
|
||||
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof (cl_int), &kernel_result, 0, NULL, NULL);
|
||||
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(cl_int), &input, 0, NULL, NULL);
|
||||
|
||||
assert(kernel_result == 2);
|
||||
assert(input == 2);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
48
opencl/vector_builtin.c
Normal file
48
opencl/vector_builtin.c
Normal file
@ -0,0 +1,48 @@
|
||||
/*
|
||||
Increment a vector using a vector built-in type.
|
||||
|
||||
TODO why does it not work?
|
||||
|
||||
http://stackoverflow.com/questions/13118228/how-to-pass-vector-paramater-to-opencl-kernel-in-c
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
int main(void) {
|
||||
const char *source =
|
||||
"__kernel void main(__global int2 *out) {\n"
|
||||
" out[get_global_id(0)]++;\n"
|
||||
"}\n";
|
||||
cl_command_queue command_queue;
|
||||
cl_context context;
|
||||
cl_device_id device;
|
||||
cl_int input[] = {0, 1, 2, 3};
|
||||
const size_t global_work_size = sizeof(input) / sizeof(cl_int2);
|
||||
cl_kernel kernel;
|
||||
cl_mem buffer;
|
||||
cl_platform_id platform;
|
||||
cl_program program;
|
||||
|
||||
clGetPlatformIDs(1, &platform, NULL);
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||
command_queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||
buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), &input, NULL);
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
clBuildProgram(program, 1, &device, "", NULL, NULL);
|
||||
kernel = clCreateKernel(program, "main", NULL);
|
||||
clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
|
||||
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
|
||||
clFlush(command_queue);
|
||||
clFinish(command_queue);
|
||||
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL);
|
||||
|
||||
assert(input[0] == 1);
|
||||
assert(input[1] == 2);
|
||||
assert(input[2] == 3);
|
||||
assert(input[3] == 4);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
68
opencl/work_item_builtin.c
Normal file
68
opencl/work_item_builtin.c
Normal file
@ -0,0 +1,68 @@
|
||||
/*
|
||||
https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/workItemFunctions.html
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#define NUM_FUNCTIONS (8)
|
||||
/* Play around with those parameters. */
|
||||
static size_t offset = 4;
|
||||
static size_t global = 1;
|
||||
static size_t local = 1;
|
||||
|
||||
int main(void) {
|
||||
const char *source =
|
||||
"__kernel void main(__global uint *out) {\n"
|
||||
" out[0] = get_work_dim();\n"
|
||||
/* Total number of work items that will run this kernel possibly in parallel. */
|
||||
" out[1] = get_global_size(0);\n"
|
||||
/* Unique global ID of the current work item. */
|
||||
" out[2] = get_global_id(0);\n"
|
||||
/* Number of work items in this group. */
|
||||
" out[3] = get_local_size(0);\n"
|
||||
/* Local ID of this work item inside this work group. */
|
||||
" out[4] = get_local_id(0);\n"
|
||||
/* How many work groups will execute this kernel, == global_size / local_size */
|
||||
" out[5] = get_num_groups(0);\n"
|
||||
/* Current group. */
|
||||
" out[6] = get_group_id(0);\n"
|
||||
/* Global offset passed on to NDrange. */
|
||||
" out[7] = get_global_offset(0);\n"
|
||||
"}\n";
|
||||
cl_command_queue command_queue;
|
||||
cl_context context;
|
||||
cl_device_id device;
|
||||
cl_uint output[NUM_FUNCTIONS];
|
||||
cl_kernel kernel;
|
||||
cl_mem buffer;
|
||||
cl_platform_id platform;
|
||||
cl_program program;
|
||||
|
||||
clGetPlatformIDs(1, &platform, NULL);
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||
command_queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||
buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, NUM_FUNCTIONS * sizeof(cl_uint), NULL, NULL);
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
clBuildProgram(program, 1, &device, "", NULL, NULL);
|
||||
kernel = clCreateKernel(program, "main", NULL);
|
||||
clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
|
||||
clEnqueueNDRangeKernel(command_queue, kernel, 1, &offset, &global, &local, 0, NULL, NULL);
|
||||
clFlush(command_queue);
|
||||
clFinish(command_queue);
|
||||
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, NUM_FUNCTIONS * sizeof(cl_uint), &output, 0, NULL, NULL);
|
||||
|
||||
printf("work_dim = %d\n", output[0]);
|
||||
printf("global_size = %d\n", output[1]);
|
||||
printf("global_id = %d\n", output[2]);
|
||||
printf("local_size = %d\n", output[3]);
|
||||
printf("local_id = %d\n", output[4]);
|
||||
printf("num_groups = %d\n", output[5]);
|
||||
printf("group_id = %d\n", output[6]);
|
||||
printf("global_offset = %d\n", output[7]);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
@ -1,5 +1,11 @@
|
||||
# Alternatives
|
||||
|
||||
- <https://en.wikipedia.org/wiki/Vulkan_%28API%29>
|
||||
## Vulkan
|
||||
|
||||
<http://gamedev.stackexchange.com/questions/96014/what-is-vulkan-and-how-does-it-differ-from-opengl>
|
||||
<https://en.wikipedia.org/wiki/Vulkan_%28API%29>
|
||||
|
||||
Also by Khronos.
|
||||
|
||||
<http://gamedev.stackexchange.com/questions/96014/what-is-vulkan-and-how-does-it-differ-from-opengl>
|
||||
|
||||
Derived from <https://en.wikipedia.org/wiki/Mantle_%28API%29> by AMD, now abandoned in favor of Vulkan, and will somewhat be the new OpenGL.
|
||||
|
@ -7,3 +7,4 @@
|
||||
1. [Error handling](error_handling.c)
|
||||
1. [Animation](animation.c)
|
||||
1. [Animation random walk](animation_random_walk.c)
|
||||
1. [Sound frequency](sound_frequency.c)
|
||||
|
3
sdl/TODO.md
Normal file
3
sdl/TODO.md
Normal file
@ -0,0 +1,3 @@
|
||||
# TODO
|
||||
|
||||
- Synthesise audio: <http://stackoverflow.com/questions/10110905/simple-wave-generator-with-sdl-in-c> not working
|
252
sdl/sound_frequency.c
Normal file
252
sdl/sound_frequency.c
Normal file
@ -0,0 +1,252 @@
|
||||
/*
|
||||
http://codereview.stackexchange.com/questions/41086/play-some-sine-waves-with-sdl2
|
||||
http://stackoverflow.com/questions/10110905/simple-wave-generator-with-sdl-in-c
|
||||
http://stackoverflow.com/questions/9137297/generating-sounds-without-a-library
|
||||
http://stackoverflow.com/questions/7678470/generating-sound-of-a-particular-frequency-using-gcc-in-ubuntu
|
||||
http://gamedev.stackexchange.com/questions/47009/is-there-a-sound-library-for-c-to-generate-sound-samples-from-code
|
||||
|
||||
TODO minimize, clean up.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <SDL2/SDL.h>
|
||||
|
||||
const double ChromaticRatio = 1.059463094359295264562;
|
||||
const double Tao = 6.283185307179586476925;
|
||||
|
||||
Uint32 sampleRate = 48000;
|
||||
Uint32 frameRate = 60;
|
||||
Uint32 floatStreamLength = 1024;
|
||||
Uint32 samplesPerFrame;
|
||||
Uint32 msPerFrame;
|
||||
double practicallySilent = 0.001;
|
||||
|
||||
Uint32 audioBufferLength = 48000;
|
||||
float *audioBuffer;
|
||||
|
||||
SDL_atomic_t audioCallbackLeftOff;
|
||||
Sint32 audioMainLeftOff;
|
||||
Uint8 audioMainAccumulator;
|
||||
|
||||
SDL_AudioDeviceID AudioDevice;
|
||||
SDL_AudioSpec audioSpec;
|
||||
|
||||
SDL_Event event;
|
||||
SDL_bool running = SDL_TRUE;
|
||||
|
||||
typedef struct {
|
||||
float *waveform;
|
||||
Uint32 waveformLength;
|
||||
double volume;
|
||||
double pan;
|
||||
double frequency;
|
||||
double phase;
|
||||
} voice;
|
||||
|
||||
void speak(voice *v) {
|
||||
float sample;
|
||||
Uint32 sourceIndex;
|
||||
double phaseIncrement = v->frequency/sampleRate;
|
||||
Uint32 i;
|
||||
if (v->volume > practicallySilent) {
|
||||
for (i = 0; (i + 1) < samplesPerFrame; i += 2) {
|
||||
v->phase += phaseIncrement;
|
||||
if (v->phase > 1)
|
||||
v->phase -= 1;
|
||||
|
||||
sourceIndex = v->phase*v->waveformLength;
|
||||
sample = v->waveform[sourceIndex]*v->volume;
|
||||
|
||||
audioBuffer[audioMainLeftOff+i] += sample*(1-v->pan);
|
||||
audioBuffer[audioMainLeftOff+i+1] += sample*v->pan;
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (i=0; i<samplesPerFrame; i+=1)
|
||||
audioBuffer[audioMainLeftOff+i] = 0;
|
||||
}
|
||||
audioMainAccumulator++;
|
||||
}
|
||||
|
||||
double getFrequency(double pitch) {
|
||||
return pow(ChromaticRatio, pitch-57)*440;
|
||||
}
|
||||
|
||||
int getWaveformLength(double pitch) {
|
||||
return sampleRate / getFrequency(pitch)+0.5f;
|
||||
}
|
||||
|
||||
void buildSineWave(float *data, Uint32 length) {
|
||||
Uint32 i;
|
||||
for (i=0; i < length; i++)
|
||||
data[i] = sin(i*(Tao/length));
|
||||
}
|
||||
|
||||
void logSpec(SDL_AudioSpec *as) {
|
||||
printf(
|
||||
" freq______%5d\n"
|
||||
" format____%5d\n"
|
||||
" channels__%5d\n"
|
||||
" silence___%5d\n"
|
||||
" samples___%5d\n"
|
||||
" size______%5d\n\n",
|
||||
(int) as->freq,
|
||||
(int) as->format,
|
||||
(int) as->channels,
|
||||
(int) as->silence,
|
||||
(int) as->samples,
|
||||
(int) as->size
|
||||
);
|
||||
}
|
||||
|
||||
void logVoice(voice *v) {
|
||||
printf(
|
||||
" waveformLength__%d\n"
|
||||
" volume__________%f\n"
|
||||
" pan_____________%f\n"
|
||||
" frequency_______%f\n"
|
||||
" phase___________%f\n",
|
||||
v->waveformLength,
|
||||
v->volume,
|
||||
v->pan,
|
||||
v->frequency,
|
||||
v->phase
|
||||
);
|
||||
}
|
||||
|
||||
void logWavedata(float *floatStream, Uint32 floatStreamLength, Uint32 increment) {
|
||||
printf("\n\nwaveform data:\n\n");
|
||||
Uint32 i=0;
|
||||
for (i = 0; i < floatStreamLength; i += increment)
|
||||
printf("%4d:%2.16f\n", i, floatStream[i]);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
void audioCallback(void *unused, Uint8 *byteStream, int byteStreamLength) {
|
||||
float* floatStream = (float*) byteStream;
|
||||
Sint32 localAudioCallbackLeftOff = SDL_AtomicGet(&audioCallbackLeftOff);
|
||||
Uint32 i;
|
||||
for (i = 0; i < floatStreamLength; i++) {
|
||||
floatStream[i] = audioBuffer[localAudioCallbackLeftOff];
|
||||
localAudioCallbackLeftOff++;
|
||||
if (localAudioCallbackLeftOff == audioBufferLength)
|
||||
localAudioCallbackLeftOff = 0;
|
||||
}
|
||||
SDL_AtomicSet(&audioCallbackLeftOff, localAudioCallbackLeftOff);
|
||||
}
|
||||
|
||||
int init(void) {
|
||||
SDL_Init(SDL_INIT_AUDIO | SDL_INIT_TIMER);
|
||||
SDL_AudioSpec want;
|
||||
SDL_zero(want);
|
||||
|
||||
want.freq = sampleRate;
|
||||
want.format = AUDIO_F32;
|
||||
want.channels = 2;
|
||||
want.samples = floatStreamLength;
|
||||
want.callback = audioCallback;
|
||||
|
||||
AudioDevice = SDL_OpenAudioDevice(NULL, 0, &want, &audioSpec, SDL_AUDIO_ALLOW_FORMAT_CHANGE);
|
||||
if (AudioDevice == 0) {
|
||||
printf("\nFailed to open audio: %s\n", SDL_GetError());
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("want:\n");
|
||||
logSpec(&want);
|
||||
printf("audioSpec:\n");
|
||||
logSpec(&audioSpec);
|
||||
|
||||
if (audioSpec.format != want.format) {
|
||||
printf("\nCouldn't get Float32 audio format.\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
sampleRate = audioSpec.freq;
|
||||
floatStreamLength = audioSpec.size / 4;
|
||||
samplesPerFrame = sampleRate / frameRate;
|
||||
msPerFrame = 1000 / frameRate;
|
||||
audioMainLeftOff = samplesPerFrame * 8;
|
||||
SDL_AtomicSet(&audioCallbackLeftOff, 0);
|
||||
|
||||
if (audioBufferLength % samplesPerFrame)
|
||||
audioBufferLength += samplesPerFrame - (audioBufferLength % samplesPerFrame);
|
||||
audioBuffer = malloc(sizeof(float) * audioBufferLength);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int onExit(void) {
|
||||
SDL_CloseAudioDevice(AudioDevice);
|
||||
SDL_Quit();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
float syncCompensationFactor = 0.0016;
|
||||
Sint32 mainAudioLead;
|
||||
Uint32 i;
|
||||
|
||||
voice testVoiceA;
|
||||
voice testVoiceB;
|
||||
voice testVoiceC;
|
||||
testVoiceA.volume = 1;
|
||||
testVoiceB.volume = 1;
|
||||
testVoiceC.volume = 1;
|
||||
testVoiceA.pan = 0.5;
|
||||
testVoiceB.pan = 0;
|
||||
testVoiceC.pan = 1;
|
||||
testVoiceA.phase = 0;
|
||||
testVoiceB.phase = 0;
|
||||
testVoiceC.phase = 0;
|
||||
testVoiceA.frequency = getFrequency(45);
|
||||
testVoiceB.frequency = getFrequency(49);
|
||||
testVoiceC.frequency = getFrequency(52);
|
||||
Uint16 C0waveformLength = getWaveformLength(0);
|
||||
testVoiceA.waveformLength = C0waveformLength;
|
||||
testVoiceB.waveformLength = C0waveformLength;
|
||||
testVoiceC.waveformLength = C0waveformLength;
|
||||
float sineWave[C0waveformLength];
|
||||
buildSineWave(sineWave, C0waveformLength);
|
||||
testVoiceA.waveform = sineWave;
|
||||
testVoiceB.waveform = sineWave;
|
||||
testVoiceC.waveform = sineWave;
|
||||
|
||||
if (init())
|
||||
return 1;
|
||||
|
||||
SDL_Delay(42);
|
||||
SDL_PauseAudioDevice(AudioDevice, 0);
|
||||
while (running) {
|
||||
while (SDL_PollEvent(&event) != 0) {
|
||||
if (event.type == SDL_QUIT) {
|
||||
running = SDL_FALSE;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < samplesPerFrame; i++)
|
||||
audioBuffer[audioMainLeftOff+i] = 0;
|
||||
speak(&testVoiceA);
|
||||
speak(&testVoiceB);
|
||||
speak(&testVoiceC);
|
||||
if (audioMainAccumulator > 1) {
|
||||
for (i=0; i<samplesPerFrame; i++) {
|
||||
audioBuffer[audioMainLeftOff+i] /= audioMainAccumulator;
|
||||
}
|
||||
}
|
||||
audioMainAccumulator = 0;
|
||||
audioMainLeftOff += samplesPerFrame;
|
||||
if (audioMainLeftOff == audioBufferLength)
|
||||
audioMainLeftOff = 0;
|
||||
mainAudioLead = audioMainLeftOff - SDL_AtomicGet(&audioCallbackLeftOff);
|
||||
if (mainAudioLead < 0)
|
||||
mainAudioLead += audioBufferLength;
|
||||
if (mainAudioLead < floatStreamLength)
|
||||
printf("An audio collision may have occured!\n");
|
||||
SDL_Delay(mainAudioLead * syncCompensationFactor);
|
||||
}
|
||||
onExit();
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user