Dolphin now uses the first OpenCL device found on the system.

Added cast in TextureDecoder.cl to satisfy the OpenCL compiler (thanks, Orphis).

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4718 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
skidau 2009-12-23 01:31:42 +00:00
parent 24d8839793
commit 043681a37f
2 changed files with 163 additions and 153 deletions

View File

@ -24,9 +24,9 @@ kernel void DecodeI4(global uchar *dst,
{
uchar4 val = vload4(srcOffset, src);
uchar8 res;
res.even = (val >> 4) & 0x0F;
res.odd = val & 0x0F;
res |= res << 4;
res.even = (val >> (uchar4)4) & (uchar4)0x0F;
res.odd = val & (uchar4)0x0F;
res |= res << (uchar8)4;
vstore8(res, 0, dst + ((y + iy)*width + x));
srcOffset++;
}
@ -70,9 +70,9 @@ kernel void DecodeIA4(global uchar *dst,
{
uchar8 val = vload8(srcOffset, src);
uchar16 res;
res.odd = (val >> 4) & 0x0F;
res.even = val & 0x0F;
res |= res << 4;
res.odd = (val >> (uchar8)4) & (uchar8)0x0F;
res.even = val & (uchar8)0x0F;
res |= res << (uchar16)4;
vstore16(res, 0, dst + ((y + iy)*width + x) * 2);
srcOffset++;
}
@ -105,7 +105,7 @@ kernel void DecodeRGB565(global ushort *dst,
for (int iy = 0; iy < 4; iy++)
{
ushort4 val = vload4(srcOffset, src);
val = (val >> 8) | (val << 8);
val = (val >> (ushort4)8) | (val << (ushort4)8);
vstore4(val, 0, dst + ((y + iy)*width + x));
srcOffset++;
}
@ -119,32 +119,32 @@ kernel void DecodeRGB5A3(global uchar *dst,
for (int iy = 0; iy < 4; iy++)
{
ushort8 val = convert_ushort8(vload8(srcOffset, src));
ushort4 vs = val.odd | (val.even << 8);
ushort4 vs = val.odd | (ushort4)(val.even << (ushort4)8);
uchar16 resNoAlpha;
resNoAlpha.s26AE = convert_uchar4(vs >> 7); // R
resNoAlpha.s159D = convert_uchar4(vs >> 2); // G
resNoAlpha.s048C = convert_uchar4(vs << 3); // B
resNoAlpha &= 0xF8;
resNoAlpha |= (resNoAlpha >> 5) & 3; // 5 -> 8
resNoAlpha.s26AE = convert_uchar4(vs >> (ushort4)7); // R
resNoAlpha.s159D = convert_uchar4(vs >> (ushort4)2); // G
resNoAlpha.s048C = convert_uchar4(vs << (ushort4)3); // B
resNoAlpha &= (uchar16)0xF8;
resNoAlpha |= (uchar16)(resNoAlpha >> (uchar16)5) & (uchar16)3; // 5 -> 8
resNoAlpha.s37BF = (uchar4)(0xFF);
uchar16 resAlpha;
resAlpha.s26AE = convert_uchar4(vs >> 8); // R
resAlpha.s159D = convert_uchar4(vs >> 4); // G
resAlpha.s26AE = convert_uchar4(vs >> (ushort4)8); // R
resAlpha.s159D = convert_uchar4(vs >> (ushort4)4); // G
resAlpha.s048C = convert_uchar4(vs); // B
resAlpha &= 0x0F;
resAlpha |= (resAlpha << 4);
resAlpha.s37BF = convert_uchar4(vs >> 7) & 0xE0;
resAlpha.s37BF |= ((resAlpha.s37BF >> 3) & 0x1C)
| ((resAlpha.s37BF >> 6) & 0x3);
resAlpha &= (uchar16)0x0F;
resAlpha |= (resAlpha << (uchar16)4);
resAlpha.s37BF = convert_uchar4(vs >> (ushort4)7) & (uchar4)0xE0;
resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C)
| ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3);
uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8),
(uchar4)(vs.s1 >> 8),
(uchar4)(vs.s2 >> 8),
(uchar4)(vs.s3 >> 8));
uchar16 res;
res = select(resAlpha, resNoAlpha, choice);
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
vstore16(res, 0, dst + ((y + iy) * width + x) * 4);
srcOffset++;
}
}
@ -157,47 +157,33 @@ uint4 unpack2bits(uchar b)
b & 3);
}
/*
Lots of debug code there that I'm using to find the problems with CMPR decoding
I think blocks having no alpha are properly decoded, only the blocks with alpha
are problematic. This is WIP !
*/
kernel void decodeCMPRBlock(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = (x * 2 + y * width / 2) / 8; //x / 4 + y * width / 16; //(x * 4) + (y * width) / 16;
uchar8 val = vload8(0, src);
ushort2 color565 = (ushort2)((val.s1 & 0xFF) | (val.s0 << 8), (val.s3 & 0xFF) | (val.s2 << 8));
uchar8 color32 = convert_uchar8((ushort8)
(((color565 << 3) & 0xF8) | ((color565 >> 2) & 0x7),
((color565 >> 3) & 0xFC) | ((color565 >> 9) & 0x3),
((color565 >> 8) & 0xF8) | ((color565 >> 13) & 0x7),
(((color565 << (ushort2)3) & (ushort2)0xF8) | ((color565 >> (ushort2)2) & (ushort2)0x7),
((color565 >> (ushort2)3) & (ushort2)0xFC) | ((color565 >> (ushort2)9) & (ushort2)0x3),
((color565 >> (ushort2)8) & (ushort2)0xF8) | ((color565 >> (ushort2)13) & (ushort2)0x7),
0xFF, 0xFF));
uint4 colors;
//uint4 choice = (uint4)((color565.s0 - color565.s1) << 16);
uint4 colorNoAlpha;
//uchar4 frac = (color32.odd - color32.even) / 2;
//frac = frac - (frac / 4);
uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & 0xFF) - (convert_ushort4(color32.odd) & 0xFF)) * 3) / 8);
uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & (ushort4)0xFF) - (convert_ushort4(color32.odd) & (ushort4)0xFF)) * (ushort4)3) / (ushort4)8);
colorNoAlpha = convert_uint4(color32.odd + frac);
colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.even - frac);
colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.odd);
colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.even);
colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even - frac);
colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.odd);
colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even);
uint4 colorAlpha;
//uchar4 midpoint = rhadd(color32.odd, color32.even);
uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + 1) / 2);
uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + (ushort4)1) / (ushort4)2);
midpoint.s3 = 0xFF;
colorAlpha = convert_uint4((uchar4)(0, 0, 0, 0));
colorAlpha = (colorAlpha << 8) | convert_uint4(midpoint);
colorAlpha = (colorAlpha << 8) | convert_uint4(color32.odd);
colorAlpha = (colorAlpha << 8) | convert_uint4(color32.even);
colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(midpoint);
colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd);
colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even);
//colorNoAlpha = (uint4)(0xFFFFFFFF);
//colorAlpha = (uint4)(0, 0, 0, 0xFFFFFFFF);
//colors = select(colorAlpha, colorNoAlpha, choice);
colors = color565.s0 > color565.s1 ? colorNoAlpha : colorAlpha;
uint16 colorsFull = (uint16)(colors, colors, colors, colors);
@ -206,28 +192,15 @@ kernel void decodeCMPRBlock(global uchar *dst,
uint4 shift1 = unpack2bits(val.s5);
uint4 shift2 = unpack2bits(val.s6);
uint4 shift3 = unpack2bits(val.s7);
uint16 shifts = (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3));
shifts = (shifts << 8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3));
shifts = (shifts << 8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3));
shifts = (shifts << 8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3));
shifts <<= 3;
uint16 shifts = (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3));
shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3));
shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3));
shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3)) << (uint16)3;
for (int iy = 0; iy < 4; iy++)
{
uchar16 res;
res = convert_uchar16(colorsFull >> (shifts & 0xFF));
shifts >>= 8;
//uchar4 t = convert_uchar4((ushort4)(color565.s0 >> 8, color565.s0 & 0xFF, color565.s1 >> 8, color565.s1 & 0xFF));
//res = (uchar16)(t, t, t, t);
//res = (uchar16)(frac, color32.even - color32.odd, (color32.even - color32.odd) / 2, (color32.even - color32.odd) / 2 - ((color32.even - color32.odd) / 8));
//res = (uchar16)(color32.even, color32.odd, frac, convert_uchar4(choice));
//res = convert_uchar16((uint16)(colorNoAlpha >> 24, colorNoAlpha >> 16, colorNoAlpha >> 8, colorNoAlpha));
//res = convert_uchar16((uint16)(colorAlpha >> 24, colorAlpha >> 16, colorAlpha >> 8, colorAlpha));
//res = convert_uchar16((uint16)(colors >> 24, colors >> 16, colors >> 8, colors));
//res = convert_uchar16(shifts & 0xFF);
//res = convert_uchar16((uint16)(shift0, shift1, shift2, shift3));
//res = (uchar16)(((x))) + (uchar16)(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);
//res.lo = val; res.s8 = x >> 8; res.s9 = x; res.sA = (iy + y) >> 8; res.sB = y + iy; res.sC = width >> 8; res.sD = width; res.sE = srcOffset >> 8; res.sF = srcOffset;
res = convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)) >> (uchar16)8;
vstore16(res, 0, dst);
dst += width * 4;
}

View File

@ -42,34 +42,74 @@ bool Initialize()
return false;
int err; // error code returned from api calls
// Connect to a compute device
cl_uint numPlatforms;
cl_platform_id platform = NULL;
err = clGetPlatformIDs(0, NULL, &numPlatforms);
// Connect to a compute device
//
int gpu = 1; // I think we should use CL_DEVICE_TYPE_ALL
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
HandleCLError(err, "Failed to create a device group!");
return false;
}
if (err != CL_SUCCESS)
{
HandleCLError(err, "clGetPlatformIDs failed.");
return false;
}
// Create a compute context
//
g_context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!g_context)
{
HandleCLError(err, "Failed to create a compute context!");
return false;
}
if (0 < numPlatforms)
{
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
err = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Create a command commands
//
g_cmdq = clCreateCommandQueue(g_context, device_id, 0, &err);
if (!g_cmdq)
{
HandleCLError(err, "Failed to create a command commands!");
return false;
}
if (err != CL_SUCCESS)
{
HandleCLError(err, "clGetPlatformIDs failed.");
return false;
}
char pbuf[100];
err = clGetPlatformInfo(platforms[0], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
if (err != CL_SUCCESS)
{
HandleCLError(err, "clGetPlatformInfo failed.");
return false;
}
platform = platforms[0];
delete[] platforms;
}
else
{
PanicAlert("No OpenCL platform found.");
return false;
}
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
cl_context_properties* cprops = (NULL == platform) ? NULL : cps;
int gpu = 1; // I think we should use CL_DEVICE_TYPE_ALL
err = clGetDeviceIDs(platform, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
HandleCLError(err, "Failed to create a device group!");
return false;
}
// Create a compute context
g_context = clCreateContext(cprops, 1, &device_id, NULL, NULL, &err);
if (!g_context)
{
HandleCLError(err, "Failed to create a compute context!");
return false;
}
// Create a command commands
g_cmdq = clCreateCommandQueue(g_context, device_id, 0, &err);
if (!g_cmdq)
{
HandleCLError(err, "Failed to create a command commands!");
return false;
}
NOTICE_LOG(COMMON, "Initialized OpenCL!");
g_bInitialized = true;
@ -103,13 +143,12 @@ cl_program CompileProgram(const char *Kernel)
}
// Build the program executable
//
err = clBuildProgram(program , 0, NULL, NULL, NULL, NULL);
if(err != CL_SUCCESS) {
char *errors[16384] = {0};
err = clGetProgramBuildInfo(program, OpenCL::device_id, CL_PROGRAM_BUILD_LOG, sizeof(errors),
errors, NULL);
PanicAlert("Error log:\n%s\n", errors);
ERROR_LOG(COMMON, "Error log:\n%s\n", errors);
return NULL;
}
@ -121,8 +160,8 @@ cl_kernel CompileKernel(cl_program program, const char *Function)
{
u32 compileStart = timeGetTime();
int err;
// Create the compute kernel in the program we wish to run
//
cl_kernel kernel = clCreateKernel(program, Function, &err);
if (!kernel || err != CL_SUCCESS)
{
@ -148,65 +187,63 @@ void HandleCLError(cl_int error, char* str)
{
#if defined(HAVE_OPENCL) && HAVE_OPENCL
char* name;
switch(error)
{
char* name;
switch(error)
{
#define CL_ERROR(x) case (x): name = #x; break
CL_ERROR(CL_SUCCESS);
CL_ERROR(CL_DEVICE_NOT_FOUND);
CL_ERROR(CL_DEVICE_NOT_AVAILABLE);
CL_ERROR(CL_COMPILER_NOT_AVAILABLE);
CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
CL_ERROR(CL_OUT_OF_RESOURCES);
CL_ERROR(CL_OUT_OF_HOST_MEMORY);
CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
CL_ERROR(CL_MEM_COPY_OVERLAP);
CL_ERROR(CL_IMAGE_FORMAT_MISMATCH);
CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
CL_ERROR(CL_BUILD_PROGRAM_FAILURE);
CL_ERROR(CL_MAP_FAILURE);
CL_ERROR(CL_INVALID_VALUE);
CL_ERROR(CL_INVALID_DEVICE_TYPE);
CL_ERROR(CL_INVALID_PLATFORM);
CL_ERROR(CL_INVALID_DEVICE);
CL_ERROR(CL_INVALID_CONTEXT);
CL_ERROR(CL_INVALID_QUEUE_PROPERTIES);
CL_ERROR(CL_INVALID_COMMAND_QUEUE);
CL_ERROR(CL_INVALID_HOST_PTR);
CL_ERROR(CL_INVALID_MEM_OBJECT);
CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
CL_ERROR(CL_INVALID_IMAGE_SIZE);
CL_ERROR(CL_INVALID_SAMPLER);
CL_ERROR(CL_INVALID_BINARY);
CL_ERROR(CL_INVALID_BUILD_OPTIONS);
CL_ERROR(CL_INVALID_PROGRAM);
CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
CL_ERROR(CL_INVALID_KERNEL_NAME);
CL_ERROR(CL_INVALID_KERNEL_DEFINITION);
CL_ERROR(CL_INVALID_KERNEL);
CL_ERROR(CL_INVALID_ARG_INDEX);
CL_ERROR(CL_INVALID_ARG_VALUE);
CL_ERROR(CL_INVALID_ARG_SIZE);
CL_ERROR(CL_INVALID_KERNEL_ARGS);
CL_ERROR(CL_INVALID_WORK_DIMENSION);
CL_ERROR(CL_INVALID_WORK_GROUP_SIZE);
CL_ERROR(CL_INVALID_WORK_ITEM_SIZE);
CL_ERROR(CL_INVALID_GLOBAL_OFFSET);
CL_ERROR(CL_INVALID_EVENT_WAIT_LIST);
CL_ERROR(CL_INVALID_EVENT);
CL_ERROR(CL_INVALID_OPERATION);
CL_ERROR(CL_INVALID_GL_OBJECT);
CL_ERROR(CL_INVALID_BUFFER_SIZE);
CL_ERROR(CL_INVALID_MIP_LEVEL);
CL_ERROR(CL_SUCCESS);
CL_ERROR(CL_DEVICE_NOT_FOUND);
CL_ERROR(CL_DEVICE_NOT_AVAILABLE);
CL_ERROR(CL_COMPILER_NOT_AVAILABLE);
CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
CL_ERROR(CL_OUT_OF_RESOURCES);
CL_ERROR(CL_OUT_OF_HOST_MEMORY);
CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
CL_ERROR(CL_MEM_COPY_OVERLAP);
CL_ERROR(CL_IMAGE_FORMAT_MISMATCH);
CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
CL_ERROR(CL_BUILD_PROGRAM_FAILURE);
CL_ERROR(CL_MAP_FAILURE);
CL_ERROR(CL_INVALID_VALUE);
CL_ERROR(CL_INVALID_DEVICE_TYPE);
CL_ERROR(CL_INVALID_PLATFORM);
CL_ERROR(CL_INVALID_DEVICE);
CL_ERROR(CL_INVALID_CONTEXT);
CL_ERROR(CL_INVALID_QUEUE_PROPERTIES);
CL_ERROR(CL_INVALID_COMMAND_QUEUE);
CL_ERROR(CL_INVALID_HOST_PTR);
CL_ERROR(CL_INVALID_MEM_OBJECT);
CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
CL_ERROR(CL_INVALID_IMAGE_SIZE);
CL_ERROR(CL_INVALID_SAMPLER);
CL_ERROR(CL_INVALID_BINARY);
CL_ERROR(CL_INVALID_BUILD_OPTIONS);
CL_ERROR(CL_INVALID_PROGRAM);
CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
CL_ERROR(CL_INVALID_KERNEL_NAME);
CL_ERROR(CL_INVALID_KERNEL_DEFINITION);
CL_ERROR(CL_INVALID_KERNEL);
CL_ERROR(CL_INVALID_ARG_INDEX);
CL_ERROR(CL_INVALID_ARG_VALUE);
CL_ERROR(CL_INVALID_ARG_SIZE);
CL_ERROR(CL_INVALID_KERNEL_ARGS);
CL_ERROR(CL_INVALID_WORK_DIMENSION);
CL_ERROR(CL_INVALID_WORK_GROUP_SIZE);
CL_ERROR(CL_INVALID_WORK_ITEM_SIZE);
CL_ERROR(CL_INVALID_GLOBAL_OFFSET);
CL_ERROR(CL_INVALID_EVENT_WAIT_LIST);
CL_ERROR(CL_INVALID_EVENT);
CL_ERROR(CL_INVALID_OPERATION);
CL_ERROR(CL_INVALID_GL_OBJECT);
CL_ERROR(CL_INVALID_BUFFER_SIZE);
CL_ERROR(CL_INVALID_MIP_LEVEL);
#undef CL_ERROR
default:
name = "Unknown error code";
}
if(!str)
str = "";
PanicAlert("OpenCL error: %s %s (%d)", str, name, error);
default:
name = "Unknown error code";
}
if(!str)
str = "";
ERROR_LOG(COMMON, "OpenCL error: %s %s (%d)", str, name, error);
#endif
}
}
}