diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl index d3691b81cc..89595eb496 100644 --- a/Data/User/OpenCL/TextureDecoder.cl +++ b/Data/User/OpenCL/TextureDecoder.cl @@ -24,9 +24,9 @@ kernel void DecodeI4(global uchar *dst, { uchar4 val = vload4(srcOffset, src); uchar8 res; - res.even = (val >> 4) & 0x0F; - res.odd = val & 0x0F; - res |= res << 4; + res.even = (val >> (uchar4)4) & (uchar4)0x0F; + res.odd = val & (uchar4)0x0F; + res |= res << (uchar8)4; vstore8(res, 0, dst + ((y + iy)*width + x)); srcOffset++; } @@ -70,9 +70,9 @@ kernel void DecodeIA4(global uchar *dst, { uchar8 val = vload8(srcOffset, src); uchar16 res; - res.odd = (val >> 4) & 0x0F; - res.even = val & 0x0F; - res |= res << 4; + res.odd = (val >> (uchar8)4) & (uchar8)0x0F; + res.even = val & (uchar8)0x0F; + res |= res << (uchar16)4; vstore16(res, 0, dst + ((y + iy)*width + x) * 2); srcOffset++; } @@ -105,7 +105,7 @@ kernel void DecodeRGB565(global ushort *dst, for (int iy = 0; iy < 4; iy++) { ushort4 val = vload4(srcOffset, src); - val = (val >> 8) | (val << 8); + val = (val >> (ushort4)8) | (val << (ushort4)8); vstore4(val, 0, dst + ((y + iy)*width + x)); srcOffset++; } @@ -119,32 +119,32 @@ kernel void DecodeRGB5A3(global uchar *dst, for (int iy = 0; iy < 4; iy++) { ushort8 val = convert_ushort8(vload8(srcOffset, src)); - ushort4 vs = val.odd | (val.even << 8); + ushort4 vs = val.odd | (ushort4)(val.even << (ushort4)8); uchar16 resNoAlpha; - resNoAlpha.s26AE = convert_uchar4(vs >> 7); // R - resNoAlpha.s159D = convert_uchar4(vs >> 2); // G - resNoAlpha.s048C = convert_uchar4(vs << 3); // B - resNoAlpha &= 0xF8; - resNoAlpha |= (resNoAlpha >> 5) & 3; // 5 -> 8 + resNoAlpha.s26AE = convert_uchar4(vs >> (ushort4)7); // R + resNoAlpha.s159D = convert_uchar4(vs >> (ushort4)2); // G + resNoAlpha.s048C = convert_uchar4(vs << (ushort4)3); // B + resNoAlpha &= (uchar16)0xF8; + resNoAlpha |= (uchar16)(resNoAlpha >> (uchar16)5) & (uchar16)3; // 5 -> 8 resNoAlpha.s37BF = (uchar4)(0xFF); uchar16 resAlpha; - resAlpha.s26AE = convert_uchar4(vs >> 8); // R - resAlpha.s159D = convert_uchar4(vs >> 4); // G + resAlpha.s26AE = convert_uchar4(vs >> (ushort4)8); // R + resAlpha.s159D = convert_uchar4(vs >> (ushort4)4); // G resAlpha.s048C = convert_uchar4(vs); // B - resAlpha &= 0x0F; - resAlpha |= (resAlpha << 4); - resAlpha.s37BF = convert_uchar4(vs >> 7) & 0xE0; - resAlpha.s37BF |= ((resAlpha.s37BF >> 3) & 0x1C) - | ((resAlpha.s37BF >> 6) & 0x3); + resAlpha &= (uchar16)0x0F; + resAlpha |= (resAlpha << (uchar16)4); + resAlpha.s37BF = convert_uchar4(vs >> (ushort4)7) & (uchar4)0xE0; + resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C) + | ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3); uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8), (uchar4)(vs.s1 >> 8), (uchar4)(vs.s2 >> 8), (uchar4)(vs.s3 >> 8)); uchar16 res; res = select(resAlpha, resNoAlpha, choice); - vstore16(res, 0, dst + ((y + iy)*width + x) * 4); + vstore16(res, 0, dst + ((y + iy) * width + x) * 4); srcOffset++; } } @@ -157,47 +157,33 @@ uint4 unpack2bits(uchar b) b & 3); } -/* -Lots of debug code there that I'm using to find the problems with CMPR decoding -I think blocks having no alpha are properly decoded, only the blocks with alpha -are problematic. This is WIP ! -*/ kernel void decodeCMPRBlock(global uchar *dst, const global uchar *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; - int srcOffset = (x * 2 + y * width / 2) / 8; //x / 4 + y * width / 16; //(x * 4) + (y * width) / 16; uchar8 val = vload8(0, src); ushort2 color565 = (ushort2)((val.s1 & 0xFF) | (val.s0 << 8), (val.s3 & 0xFF) | (val.s2 << 8)); uchar8 color32 = convert_uchar8((ushort8) - (((color565 << 3) & 0xF8) | ((color565 >> 2) & 0x7), - ((color565 >> 3) & 0xFC) | ((color565 >> 9) & 0x3), - ((color565 >> 8) & 0xF8) | ((color565 >> 13) & 0x7), + (((color565 << (ushort2)3) & (ushort2)0xF8) | ((color565 >> (ushort2)2) & (ushort2)0x7), + ((color565 >> (ushort2)3) & (ushort2)0xFC) | ((color565 >> (ushort2)9) & (ushort2)0x3), + ((color565 >> (ushort2)8) & (ushort2)0xF8) | ((color565 >> (ushort2)13) & (ushort2)0x7), 0xFF, 0xFF)); uint4 colors; - //uint4 choice = (uint4)((color565.s0 - color565.s1) << 16); uint4 colorNoAlpha; - //uchar4 frac = (color32.odd - color32.even) / 2; - //frac = frac - (frac / 4); - uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & 0xFF) - (convert_ushort4(color32.odd) & 0xFF)) * 3) / 8); + uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & (ushort4)0xFF) - (convert_ushort4(color32.odd) & (ushort4)0xFF)) * (ushort4)3) / (ushort4)8); colorNoAlpha = convert_uint4(color32.odd + frac); - colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.even - frac); - colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.odd); - colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.even); + colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even - frac); + colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.odd); + colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even); uint4 colorAlpha; - //uchar4 midpoint = rhadd(color32.odd, color32.even); - uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + 1) / 2); + uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + (ushort4)1) / (ushort4)2); midpoint.s3 = 0xFF; colorAlpha = convert_uint4((uchar4)(0, 0, 0, 0)); - colorAlpha = (colorAlpha << 8) | convert_uint4(midpoint); - colorAlpha = (colorAlpha << 8) | convert_uint4(color32.odd); - colorAlpha = (colorAlpha << 8) | convert_uint4(color32.even); + colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(midpoint); + colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd); + colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even); - //colorNoAlpha = (uint4)(0xFFFFFFFF); - //colorAlpha = (uint4)(0, 0, 0, 0xFFFFFFFF); - - //colors = select(colorAlpha, colorNoAlpha, choice); colors = color565.s0 > color565.s1 ? colorNoAlpha : colorAlpha; uint16 colorsFull = (uint16)(colors, colors, colors, colors); @@ -206,31 +192,18 @@ kernel void decodeCMPRBlock(global uchar *dst, uint4 shift1 = unpack2bits(val.s5); uint4 shift2 = unpack2bits(val.s6); uint4 shift3 = unpack2bits(val.s7); - uint16 shifts = (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3)); - shifts = (shifts << 8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3)); - shifts = (shifts << 8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3)); - shifts = (shifts << 8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3)); - shifts <<= 3; + uint16 shifts = (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3)); + shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3)); + shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3)); + shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3)) << (uint16)3; for (int iy = 0; iy < 4; iy++) { uchar16 res; - res = convert_uchar16(colorsFull >> (shifts & 0xFF)); - shifts >>= 8; - //uchar4 t = convert_uchar4((ushort4)(color565.s0 >> 8, color565.s0 & 0xFF, color565.s1 >> 8, color565.s1 & 0xFF)); - //res = (uchar16)(t, t, t, t); - //res = (uchar16)(frac, color32.even - color32.odd, (color32.even - color32.odd) / 2, (color32.even - color32.odd) / 2 - ((color32.even - color32.odd) / 8)); - //res = (uchar16)(color32.even, color32.odd, frac, convert_uchar4(choice)); - //res = convert_uchar16((uint16)(colorNoAlpha >> 24, colorNoAlpha >> 16, colorNoAlpha >> 8, colorNoAlpha)); - //res = convert_uchar16((uint16)(colorAlpha >> 24, colorAlpha >> 16, colorAlpha >> 8, colorAlpha)); - //res = convert_uchar16((uint16)(colors >> 24, colors >> 16, colors >> 8, colors)); - //res = convert_uchar16(shifts & 0xFF); - //res = convert_uchar16((uint16)(shift0, shift1, shift2, shift3)); - //res = (uchar16)(((x))) + (uchar16)(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3); - //res.lo = val; res.s8 = x >> 8; res.s9 = x; res.sA = (iy + y) >> 8; res.sB = y + iy; res.sC = width >> 8; res.sD = width; res.sE = srcOffset >> 8; res.sF = srcOffset; + res = convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)) >> (uchar16)8; vstore16(res, 0, dst); dst += width * 4; - } + } } kernel void DecodeCMPR(global uchar *dst, diff --git a/Source/Core/Common/Src/OpenCL.cpp b/Source/Core/Common/Src/OpenCL.cpp index 37ffb9d028..7eaa44355f 100644 --- a/Source/Core/Common/Src/OpenCL.cpp +++ b/Source/Core/Common/Src/OpenCL.cpp @@ -42,34 +42,74 @@ bool Initialize() return false; int err; // error code returned from api calls - - // Connect to a compute device - // - int gpu = 1; // I think we should use CL_DEVICE_TYPE_ALL - err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); - if (err != CL_SUCCESS) - { - HandleCLError(err, "Failed to create a device group!"); - return false; - } - - // Create a compute context - // - g_context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); - if (!g_context) - { - HandleCLError(err, "Failed to create a compute context!"); - return false; - } + // Connect to a compute device + cl_uint numPlatforms; + cl_platform_id platform = NULL; + err = clGetPlatformIDs(0, NULL, &numPlatforms); - // Create a command commands - // - g_cmdq = clCreateCommandQueue(g_context, device_id, 0, &err); - if (!g_cmdq) - { - HandleCLError(err, "Failed to create a command commands!"); - return false; - } + if (err != CL_SUCCESS) + { + HandleCLError(err, "clGetPlatformIDs failed."); + return false; + } + + if (0 < numPlatforms) + { + cl_platform_id* platforms = new cl_platform_id[numPlatforms]; + err = clGetPlatformIDs(numPlatforms, platforms, NULL); + + if (err != CL_SUCCESS) + { + HandleCLError(err, "clGetPlatformIDs failed."); + return false; + } + + char pbuf[100]; + err = clGetPlatformInfo(platforms[0], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); + + if (err != CL_SUCCESS) + { + HandleCLError(err, "clGetPlatformInfo failed."); + return false; + } + + platform = platforms[0]; + delete[] platforms; + } + else + { + PanicAlert("No OpenCL platform found."); + return false; + } + + cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; + + cl_context_properties* cprops = (NULL == platform) ? NULL : cps; + + int gpu = 1; // I think we should use CL_DEVICE_TYPE_ALL + + err = clGetDeviceIDs(platform, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); + if (err != CL_SUCCESS) + { + HandleCLError(err, "Failed to create a device group!"); + return false; + } + + // Create a compute context + g_context = clCreateContext(cprops, 1, &device_id, NULL, NULL, &err); + if (!g_context) + { + HandleCLError(err, "Failed to create a compute context!"); + return false; + } + + // Create a command commands + g_cmdq = clCreateCommandQueue(g_context, device_id, 0, &err); + if (!g_cmdq) + { + HandleCLError(err, "Failed to create a command commands!"); + return false; + } NOTICE_LOG(COMMON, "Initialized OpenCL!"); g_bInitialized = true; @@ -103,13 +143,12 @@ cl_program CompileProgram(const char *Kernel) } // Build the program executable - // err = clBuildProgram(program , 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { char *errors[16384] = {0}; err = clGetProgramBuildInfo(program, OpenCL::device_id, CL_PROGRAM_BUILD_LOG, sizeof(errors), errors, NULL); - PanicAlert("Error log:\n%s\n", errors); + ERROR_LOG(COMMON, "Error log:\n%s\n", errors); return NULL; } @@ -121,8 +160,8 @@ cl_kernel CompileKernel(cl_program program, const char *Function) { u32 compileStart = timeGetTime(); int err; + // Create the compute kernel in the program we wish to run - // cl_kernel kernel = clCreateKernel(program, Function, &err); if (!kernel || err != CL_SUCCESS) { @@ -148,65 +187,63 @@ void HandleCLError(cl_int error, char* str) { #if defined(HAVE_OPENCL) && HAVE_OPENCL - char* name; - switch(error) - { + char* name; + switch(error) + { #define CL_ERROR(x) case (x): name = #x; break - CL_ERROR(CL_SUCCESS); - CL_ERROR(CL_DEVICE_NOT_FOUND); - CL_ERROR(CL_DEVICE_NOT_AVAILABLE); - CL_ERROR(CL_COMPILER_NOT_AVAILABLE); - CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE); - CL_ERROR(CL_OUT_OF_RESOURCES); - CL_ERROR(CL_OUT_OF_HOST_MEMORY); - CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE); - CL_ERROR(CL_MEM_COPY_OVERLAP); - CL_ERROR(CL_IMAGE_FORMAT_MISMATCH); - CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED); - CL_ERROR(CL_BUILD_PROGRAM_FAILURE); - CL_ERROR(CL_MAP_FAILURE); - CL_ERROR(CL_INVALID_VALUE); - CL_ERROR(CL_INVALID_DEVICE_TYPE); - CL_ERROR(CL_INVALID_PLATFORM); - CL_ERROR(CL_INVALID_DEVICE); - CL_ERROR(CL_INVALID_CONTEXT); - CL_ERROR(CL_INVALID_QUEUE_PROPERTIES); - CL_ERROR(CL_INVALID_COMMAND_QUEUE); - CL_ERROR(CL_INVALID_HOST_PTR); - CL_ERROR(CL_INVALID_MEM_OBJECT); - CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); - CL_ERROR(CL_INVALID_IMAGE_SIZE); - CL_ERROR(CL_INVALID_SAMPLER); - CL_ERROR(CL_INVALID_BINARY); - CL_ERROR(CL_INVALID_BUILD_OPTIONS); - CL_ERROR(CL_INVALID_PROGRAM); - CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE); - CL_ERROR(CL_INVALID_KERNEL_NAME); - CL_ERROR(CL_INVALID_KERNEL_DEFINITION); - CL_ERROR(CL_INVALID_KERNEL); - CL_ERROR(CL_INVALID_ARG_INDEX); - CL_ERROR(CL_INVALID_ARG_VALUE); - CL_ERROR(CL_INVALID_ARG_SIZE); - CL_ERROR(CL_INVALID_KERNEL_ARGS); - CL_ERROR(CL_INVALID_WORK_DIMENSION); - CL_ERROR(CL_INVALID_WORK_GROUP_SIZE); - CL_ERROR(CL_INVALID_WORK_ITEM_SIZE); - CL_ERROR(CL_INVALID_GLOBAL_OFFSET); - CL_ERROR(CL_INVALID_EVENT_WAIT_LIST); - CL_ERROR(CL_INVALID_EVENT); - CL_ERROR(CL_INVALID_OPERATION); - CL_ERROR(CL_INVALID_GL_OBJECT); - CL_ERROR(CL_INVALID_BUFFER_SIZE); - CL_ERROR(CL_INVALID_MIP_LEVEL); + CL_ERROR(CL_SUCCESS); + CL_ERROR(CL_DEVICE_NOT_FOUND); + CL_ERROR(CL_DEVICE_NOT_AVAILABLE); + CL_ERROR(CL_COMPILER_NOT_AVAILABLE); + CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE); + CL_ERROR(CL_OUT_OF_RESOURCES); + CL_ERROR(CL_OUT_OF_HOST_MEMORY); + CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE); + CL_ERROR(CL_MEM_COPY_OVERLAP); + CL_ERROR(CL_IMAGE_FORMAT_MISMATCH); + CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED); + CL_ERROR(CL_BUILD_PROGRAM_FAILURE); + CL_ERROR(CL_MAP_FAILURE); + CL_ERROR(CL_INVALID_VALUE); + CL_ERROR(CL_INVALID_DEVICE_TYPE); + CL_ERROR(CL_INVALID_PLATFORM); + CL_ERROR(CL_INVALID_DEVICE); + CL_ERROR(CL_INVALID_CONTEXT); + CL_ERROR(CL_INVALID_QUEUE_PROPERTIES); + CL_ERROR(CL_INVALID_COMMAND_QUEUE); + CL_ERROR(CL_INVALID_HOST_PTR); + CL_ERROR(CL_INVALID_MEM_OBJECT); + CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); + CL_ERROR(CL_INVALID_IMAGE_SIZE); + CL_ERROR(CL_INVALID_SAMPLER); + CL_ERROR(CL_INVALID_BINARY); + CL_ERROR(CL_INVALID_BUILD_OPTIONS); + CL_ERROR(CL_INVALID_PROGRAM); + CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE); + CL_ERROR(CL_INVALID_KERNEL_NAME); + CL_ERROR(CL_INVALID_KERNEL_DEFINITION); + CL_ERROR(CL_INVALID_KERNEL); + CL_ERROR(CL_INVALID_ARG_INDEX); + CL_ERROR(CL_INVALID_ARG_VALUE); + CL_ERROR(CL_INVALID_ARG_SIZE); + CL_ERROR(CL_INVALID_KERNEL_ARGS); + CL_ERROR(CL_INVALID_WORK_DIMENSION); + CL_ERROR(CL_INVALID_WORK_GROUP_SIZE); + CL_ERROR(CL_INVALID_WORK_ITEM_SIZE); + CL_ERROR(CL_INVALID_GLOBAL_OFFSET); + CL_ERROR(CL_INVALID_EVENT_WAIT_LIST); + CL_ERROR(CL_INVALID_EVENT); + CL_ERROR(CL_INVALID_OPERATION); + CL_ERROR(CL_INVALID_GL_OBJECT); + CL_ERROR(CL_INVALID_BUFFER_SIZE); + CL_ERROR(CL_INVALID_MIP_LEVEL); #undef CL_ERROR - default: - name = "Unknown error code"; - } - if(!str) - str = ""; - PanicAlert("OpenCL error: %s %s (%d)", str, name, error); - + default: + name = "Unknown error code"; + } + if(!str) + str = ""; + ERROR_LOG(COMMON, "OpenCL error: %s %s (%d)", str, name, error); #endif -} - + } }