From 062d9a65a799448c66f594c266ae801adeb3a750 Mon Sep 17 00:00:00 2001 From: rogerman Date: Mon, 24 Dec 2018 21:35:17 -0800 Subject: [PATCH 1/7] Cocoa Port: Do a minor optimization for Metal display views running on macOS v10.13 High Sierra and later. --- .../project.pbxproj | 14 ++ .../cocoa/MetalRendererCommonShaders.h | 26 ++++ .../cocoa/MetalRendererCommonShaders.metal | 46 +++++++ .../userinterface/MacMetalDisplayView.mm | 128 ++++++++++++++---- .../MacMetalDisplayViewShaders.metal | 12 +- 5 files changed, 190 insertions(+), 36 deletions(-) create mode 100644 desmume/src/frontend/cocoa/MetalRendererCommonShaders.h create mode 100644 desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal diff --git a/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj b/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj index 42df488a..b40938d8 100644 --- a/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj +++ b/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj @@ -296,6 +296,11 @@ AB564915186E6F67002740F4 /* Image_Piano.png in Resources */ = {isa = PBXBuildFile; fileRef = AB56490B186E6F67002740F4 /* Image_Piano.png */; }; AB5785FD17176AFC002C5FC7 /* OpenEmuBase.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = AB5785FC17176AFC002C5FC7 /* OpenEmuBase.framework */; }; AB58F32D1364F44B0074C376 /* cocoa_file.mm in Sources */ = {isa = PBXBuildFile; fileRef = AB58F32C1364F44B0074C376 /* cocoa_file.mm */; }; + AB5B1D4A21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = AB5B1D4921D1F31E00BF0E0F /* MetalRendererCommonShaders.metal */; }; + AB5B1D4B21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = AB5B1D4921D1F31E00BF0E0F /* MetalRendererCommonShaders.metal */; }; + AB5B1D4C21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = AB5B1D4921D1F31E00BF0E0F /* 
MetalRendererCommonShaders.metal */; }; + AB5B1D4D21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = AB5B1D4921D1F31E00BF0E0F /* MetalRendererCommonShaders.metal */; }; + AB5B1D4E21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = AB5B1D4921D1F31E00BF0E0F /* MetalRendererCommonShaders.metal */; }; AB5FDDAC1D62C89E0094617C /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; AB64987C13ECC73800EE7DD2 /* FileTypeInfo.plist in Resources */ = {isa = PBXBuildFile; fileRef = AB64987B13ECC73800EE7DD2 /* FileTypeInfo.plist */; }; AB68101B187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png in Resources */ = {isa = PBXBuildFile; fileRef = AB681013187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png */; }; @@ -2486,6 +2491,8 @@ AB5785FC17176AFC002C5FC7 /* OpenEmuBase.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = OpenEmuBase.framework; path = openemu/OpenEmuBase.framework; sourceTree = ""; }; AB58F32B1364F44B0074C376 /* cocoa_file.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cocoa_file.h; sourceTree = ""; }; AB58F32C1364F44B0074C376 /* cocoa_file.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = cocoa_file.mm; sourceTree = ""; }; + AB5B1D4821D1F31D00BF0E0F /* MetalRendererCommonShaders.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MetalRendererCommonShaders.h; sourceTree = ""; }; + AB5B1D4921D1F31E00BF0E0F /* MetalRendererCommonShaders.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalRendererCommonShaders.metal; sourceTree = ""; }; AB64987B13ECC73800EE7DD2 /* FileTypeInfo.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
text.plist.xml; path = FileTypeInfo.plist; sourceTree = ""; }; AB681013187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_GuitarGrip_Button_Blue_512x512.png; path = images/Icon_GuitarGrip_Button_Blue_512x512.png; sourceTree = ""; }; AB681014187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Green_512x512.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_GuitarGrip_Button_Green_512x512.png; path = images/Icon_GuitarGrip_Button_Green_512x512.png; sourceTree = ""; }; @@ -3183,12 +3190,14 @@ ABE5DFE3143FB1DA00835AD8 /* cocoa_videofilter.h */, AB1B9E611501A78000464647 /* coreaudiosound.h */, AB28625520AE3E9E00EAED43 /* macOS_driver.h */, + AB5B1D4821D1F31D00BF0E0F /* MetalRendererCommonShaders.h */, ABD10AE41715FCDD00B5729D /* mic_ext.h */, ABB24F6C1A81EE92006C1108 /* OGLDisplayOutput_3_2.h */, ABE6840E189E33D5007FD69C /* OGLDisplayOutput.h */, AB1B9E621501A78000464647 /* ringbuffer.h */, ABD104011346652500AF11D1 /* sndOSX.h */, AB82445E1704AEC400B8EE20 /* utilities.h */, + AB5B1D4921D1F31E00BF0E0F /* MetalRendererCommonShaders.metal */, ABA6574A14511EC90077E5E9 /* cocoa_cheat.mm */, ABD104121346652500AF11D1 /* cocoa_core.mm */, AB58F32C1364F44B0074C376 /* cocoa_file.mm */, @@ -5539,6 +5548,7 @@ ABD104281346653B00AF11D1 /* main.m in Sources */, AB2ABA411C9F9CFA00173B15 /* rsemaphore.c in Sources */, ABA6574B14511EC90077E5E9 /* cocoa_cheat.mm in Sources */, + AB5B1D4E21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */, ABD1041D1346652500AF11D1 /* cocoa_core.mm in Sources */, AB58F32D1364F44B0074C376 /* cocoa_file.mm in Sources */, AB3BF43E1E26289E003E2B24 /* MacMetalDisplayView.mm in Sources */, @@ -5766,6 +5776,7 @@ AB7900C8215B84E50082AE82 /* ftinit.c in Sources */, AB7900C9215B84E50082AE82 /* vfat.cpp in Sources */, AB7900CA215B84E50082AE82 /* colorspacehandler.cpp in Sources */, + AB5B1D4C21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources 
*/, AB7900CB215B84E50082AE82 /* videofilter.cpp in Sources */, AB7900CC215B84E50082AE82 /* WavFile.cpp in Sources */, AB7900CD215B84E50082AE82 /* wifi.cpp in Sources */, @@ -5878,6 +5889,7 @@ AB7901B3215B84F20082AE82 /* ClientExecutionControl.cpp in Sources */, AB7901B4215B84F20082AE82 /* deposterize.cpp in Sources */, AB7901B5215B84F20082AE82 /* ftgasp.c in Sources */, + AB5B1D4D21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */, AB7901B6215B84F20082AE82 /* ftotval.c in Sources */, AB7901B7215B84F20082AE82 /* ftdebug.c in Sources */, AB7901B8215B84F20082AE82 /* ftstroke.c in Sources */, @@ -6221,6 +6233,7 @@ ABFEA82B1BB4EC1100B08C25 /* ftinit.c in Sources */, AB796D4415CDCBA200C59155 /* vfat.cpp in Sources */, AB5FDDAC1D62C89E0094617C /* colorspacehandler.cpp in Sources */, + AB5B1D4A21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */, AB796D4515CDCBA200C59155 /* videofilter.cpp in Sources */, AB796D4615CDCBA200C59155 /* WavFile.cpp in Sources */, AB796D4715CDCBA200C59155 /* wifi.cpp in Sources */, @@ -6333,6 +6346,7 @@ ABB1C9491F5281AE0004844F /* ClientExecutionControl.cpp in Sources */, AB301BE01D9C8BCD00246A93 /* deposterize.cpp in Sources */, ABFEA8211BB4EC1000B08C25 /* ftgasp.c in Sources */, + AB5B1D4B21D1F31E00BF0E0F /* MetalRendererCommonShaders.metal in Sources */, ABFEA83C1BB4EC1100B08C25 /* ftotval.c in Sources */, ABFEA8181BB4EC1000B08C25 /* ftdebug.c in Sources */, ABFEA8541BB4EC1100B08C25 /* ftstroke.c in Sources */, diff --git a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h new file mode 100644 index 00000000..c1cd9ff5 --- /dev/null +++ b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h @@ -0,0 +1,26 @@ +/* + Copyright (C) 2018 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or 
+ (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this software. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _METAL_RENDERER_COMMON_H_ +#define _METAL_RENDERER_COMMON_H_ + +float4 unpack_unorm1555_to_unorm8888(const ushort color16); + +ushort pack_color_to_unorm5551(const float4 inColor); +float4 pack_color_to_unorm6665(const float4 inColor); + +#endif // _METAL_RENDERER_COMMON_H_ diff --git a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal new file mode 100644 index 00000000..62263930 --- /dev/null +++ b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal @@ -0,0 +1,46 @@ +/* + Copyright (C) 2018 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this software. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include +using namespace metal; + +#include "MetalRendererCommonShaders.h" + + +float4 unpack_unorm1555_to_unorm8888(const ushort color16) +{ + return float4((float)((color16 >> 0) & 0x1F) / 31.0f, + (float)((color16 >> 5) & 0x1F) / 31.0f, + (float)((color16 >> 10) & 0x1F) / 31.0f, + (float)(color16 >> 15)); +} + +ushort pack_color_to_unorm5551(const float4 inColor) +{ + ushort4 color16 = (ushort4)((inColor * 31.0f) + 0.35f); + + color16.g <<= 5; + color16.b <<= 10; + color16.a = (color16.a < 0.0001) ? 0 : 0x8000; + + return (color16.r | color16.g | color16.b | color16.a); +} + +float4 pack_color_to_unorm6665(const float4 inColor) +{ + return inColor * float4(63.0f/255.0f, 63.0f/255.0f, 63.0f/255.0f, 31.0f/255.0f); +} diff --git a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm index 6053993a..5ecdf777 100644 --- a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm +++ b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm @@ -67,12 +67,37 @@ commandQueue = [device newCommandQueue]; _fetchCommandQueue = [device newCommandQueue]; defaultLibrary = [device newDefaultLibrary]; - _fetch555Pipeline = [[device newComputePipelineStateWithFunction:[defaultLibrary newFunctionWithName:@"nds_fetch555"] error:nil] retain]; - _fetch666Pipeline = [[device newComputePipelineStateWithFunction:[defaultLibrary newFunctionWithName:@"nds_fetch666"] error:nil] retain]; - _fetch888Pipeline = [[device newComputePipelineStateWithFunction:[defaultLibrary newFunctionWithName:@"nds_fetch888"] error:nil] retain]; - _fetch555ConvertOnlyPipeline = [[device newComputePipelineStateWithFunction:[defaultLibrary newFunctionWithName:@"nds_fetch555ConvertOnly"] error:nil] retain]; - _fetch666ConvertOnlyPipeline = [[device newComputePipelineStateWithFunction:[defaultLibrary newFunctionWithName:@"nds_fetch666ConvertOnly"] error:nil] retain]; - deposterizePipeline = [[device 
newComputePipelineStateWithFunction:[defaultLibrary newFunctionWithName:@"src_filter_deposterize"] error:nil] retain]; + + MTLComputePipelineDescriptor *computePipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; + [computePipelineDesc setThreadGroupSizeIsMultipleOfThreadExecutionWidth:YES]; + + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch555ConvertOnly"]]; + _fetch555ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; + + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch666ConvertOnly"]]; + _fetch666ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; + + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"src_filter_deposterize"]]; + deposterizePipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; + +#if defined(MAC_OS_X_VERSION_10_13) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_13) + if (@available(macOS 10.13, *)) + { + [[[computePipelineDesc buffers] objectAtIndexedSubscript:0] setMutability:MTLMutabilityImmutable]; + [[[computePipelineDesc buffers] objectAtIndexedSubscript:1] setMutability:MTLMutabilityImmutable]; + } +#endif + + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch555"]]; + _fetch555Pipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; + + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch666"]]; + _fetch666Pipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; + + [computePipelineDesc 
setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch888"]]; + _fetch888Pipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; + + [computePipelineDesc release]; size_t tw = GetNearestPositivePOT((uint32_t)[_fetch555Pipeline threadExecutionWidth]); while ( (tw > [_fetch555Pipeline threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) @@ -105,6 +130,18 @@ [hudPipelineDesc setVertexFunction:[defaultLibrary newFunctionWithName:@"hud_vertex"]]; [hudPipelineDesc setFragmentFunction:hudFragmentFunction]; +#if defined(MAC_OS_X_VERSION_10_13) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_13) + if (@available(macOS 10.13, *)) + { + [[[hudPipelineDesc vertexBuffers] objectAtIndexedSubscript:0] setMutability:MTLMutabilityImmutable]; + [[[hudPipelineDesc vertexBuffers] objectAtIndexedSubscript:1] setMutability:MTLMutabilityImmutable]; + [[[hudPipelineDesc vertexBuffers] objectAtIndexedSubscript:2] setMutability:MTLMutabilityImmutable]; + [[[hudPipelineDesc vertexBuffers] objectAtIndexedSubscript:3] setMutability:MTLMutabilityImmutable]; + [[[hudPipelineDesc vertexBuffers] objectAtIndexedSubscript:4] setMutability:MTLMutabilityImmutable]; + [[[hudPipelineDesc vertexBuffers] objectAtIndexedSubscript:5] setMutability:MTLMutabilityImmutable]; + } +#endif + [[[hudPipelineDesc colorAttachments] objectAtIndexedSubscript:0] setPixelFormat:MTLPixelFormatBGRA8Unorm]; hudPipeline = [[device newRenderPipelineStateWithDescriptor:hudPipelineDesc error:nil] retain]; @@ -854,104 +891,119 @@ { id currentHQnxLUT = nil; + MTLComputePipelineDescriptor *computePipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; + [computePipelineDesc setThreadGroupSizeIsMultipleOfThreadExecutionWidth:YES]; + switch (filterID) { case VideoFilterTypeID_Nearest2X: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] 
newFunctionWithName:@"pixel_scaler_nearest2x"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_nearest2x"]]; break; case VideoFilterTypeID_Scanline: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_scanline"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_scanline"]]; break; case VideoFilterTypeID_EPX: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xEPX"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xEPX"]]; break; case VideoFilterTypeID_EPXPlus: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xEPXPlus"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xEPXPlus"]]; break; case VideoFilterTypeID_2xSaI: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xSaI"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xSaI"]]; break; case VideoFilterTypeID_Super2xSaI: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_Super2xSaI"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_Super2xSaI"]]; break; case VideoFilterTypeID_SuperEagle: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] 
newFunctionWithName:@"pixel_scaler_2xSuperEagle"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xSuperEagle"]]; break; case VideoFilterTypeID_LQ2X: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_LQ2x"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_LQ2x"]]; currentHQnxLUT = [sharedData texLQ2xLUT]; break; case VideoFilterTypeID_LQ2XS: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_LQ2xS"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_LQ2xS"]]; currentHQnxLUT = [sharedData texLQ2xLUT]; break; case VideoFilterTypeID_HQ2X: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ2x"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ2x"]]; currentHQnxLUT = [sharedData texHQ2xLUT]; break; case VideoFilterTypeID_HQ2XS: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ2xS"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ2xS"]]; currentHQnxLUT = [sharedData texHQ2xLUT]; break; case VideoFilterTypeID_HQ3X: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ3x"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ3x"]]; currentHQnxLUT = [sharedData 
texHQ3xLUT]; break; case VideoFilterTypeID_HQ3XS: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ3xS"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ3xS"]]; currentHQnxLUT = [sharedData texHQ3xLUT]; break; case VideoFilterTypeID_HQ4X: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ4x"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ4x"]]; currentHQnxLUT = [sharedData texHQ4xLUT]; break; case VideoFilterTypeID_HQ4XS: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ4xS"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_HQ4xS"]]; currentHQnxLUT = [sharedData texHQ4xLUT]; break; case VideoFilterTypeID_2xBRZ: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xBRZ"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_2xBRZ"]]; break; case VideoFilterTypeID_3xBRZ: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_3xBRZ"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_3xBRZ"]]; break; case VideoFilterTypeID_4xBRZ: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_4xBRZ"] error:nil]]; + [computePipelineDesc 
setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_4xBRZ"]]; break; case VideoFilterTypeID_5xBRZ: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_5xBRZ"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_5xBRZ"]]; break; case VideoFilterTypeID_6xBRZ: - [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_6xBRZ"] error:nil]]; + [computePipelineDesc setComputeFunction:[[sharedData defaultLibrary] newFunctionWithName:@"pixel_scaler_6xBRZ"]]; break; case VideoFilterTypeID_None: default: - [self setPixelScalePipeline:nil]; + [computePipelineDesc release]; + computePipelineDesc = nil; break; } [sharedData setTexCurrentHQnxLUT:currentHQnxLUT]; + if (computePipelineDesc != nil) + { + [self setPixelScalePipeline:[[sharedData device] newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil]]; + [computePipelineDesc release]; + computePipelineDesc = nil; + } + else + { + [self setPixelScalePipeline:nil]; + } + if ([self pixelScalePipeline] != nil) { const VideoFilterAttributes vfAttr = VideoFilter::GetAttributesByID(filterID); @@ -1049,6 +1101,17 @@ [self setOutputDrawablePipeline:[[sharedData device] newRenderPipelineStateWithDescriptor:outputPipelineDesc error:nil]]; } +#if defined(MAC_OS_X_VERSION_10_13) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_13) + if (@available(macOS 10.13, *)) + { + [[[outputPipelineDesc vertexBuffers] objectAtIndexedSubscript:0] setMutability:MTLMutabilityImmutable]; + [[[outputPipelineDesc vertexBuffers] objectAtIndexedSubscript:1] setMutability:MTLMutabilityImmutable]; + [[[outputPipelineDesc vertexBuffers] objectAtIndexedSubscript:2] setMutability:MTLMutabilityImmutable]; + 
[[[outputPipelineDesc vertexBuffers] objectAtIndexedSubscript:3] setMutability:MTLMutabilityImmutable]; + [[[outputPipelineDesc fragmentBuffers] objectAtIndexedSubscript:0] setMutability:MTLMutabilityImmutable]; + } +#endif + [outputPipelineDesc release]; } @@ -1064,6 +1127,17 @@ [outputPipelineDesc setVertexFunction:[[sharedData defaultLibrary] newFunctionWithName:@"display_output_vertex"]]; [outputPipelineDesc setFragmentFunction:[[sharedData defaultLibrary] newFunctionWithName:@"output_filter_bilinear"]]; +#if defined(MAC_OS_X_VERSION_10_13) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_13) + if (@available(macOS 10.13, *)) + { + [[[outputPipelineDesc vertexBuffers] objectAtIndexedSubscript:0] setMutability:MTLMutabilityImmutable]; + [[[outputPipelineDesc vertexBuffers] objectAtIndexedSubscript:1] setMutability:MTLMutabilityImmutable]; + [[[outputPipelineDesc vertexBuffers] objectAtIndexedSubscript:2] setMutability:MTLMutabilityImmutable]; + [[[outputPipelineDesc vertexBuffers] objectAtIndexedSubscript:3] setMutability:MTLMutabilityImmutable]; + [[[outputPipelineDesc fragmentBuffers] objectAtIndexedSubscript:0] setMutability:MTLMutabilityImmutable]; + } +#endif + [[[outputPipelineDesc colorAttachments] objectAtIndexedSubscript:0] setPixelFormat:MTLPixelFormatRGBA8Unorm]; outputRGBAPipeline = [[[sharedData device] newRenderPipelineStateWithDescriptor:outputPipelineDesc error:nil] retain]; diff --git a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal index 4c58129a..783554ca 100644 --- a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal +++ b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal @@ -18,8 +18,11 @@ #include using namespace metal; +#include "../MetalRendererCommonShaders.h" + #define LANCZOS_FIX(c) max(abs(c), 1e-5) + struct HUDVtx { float4 position [[position]]; @@ -44,7 +47,6 @@ struct 
DisplayViewShaderProperties }; float reduce(const float3 color); -float4 unpack_unorm1555_to_unorm8888(const ushort color16); float3 color_interpolate_LTE(const float3 pixA, const float3 pixB, const float3 threshold); float4 bicubic_weight_bspline(const float x); float4 bicubic_weight_mitchell_netravali(const float x); @@ -89,14 +91,6 @@ bool InterpDiff(const float3 p1, const float3 p2) return any( yuv > float3(192.0f/255.0f, 28.0f/255.0f, 48.0f/255.0f) ); } -float4 unpack_unorm1555_to_unorm8888(const ushort color16) -{ - return float4((float)((color16 >> 0) & 0x1F) / 31.0f, - (float)((color16 >> 5) & 0x1F) / 31.0f, - (float)((color16 >> 10) & 0x1F) / 31.0f, - (float)(color16 >> 16)); -} - float3 color_interpolate_LTE(const float3 pixA, const float3 pixB, const float3 threshold) { const float3 interpPix = mix(pixA, pixB, 0.5f); From c1357c1451f8ae947375b089d745409d82667c11 Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 26 Dec 2018 19:48:22 -0800 Subject: [PATCH 2/7] OpenGL Renderer: Do some minor performance improving tweaks. - Most notably, fix a performance regression where polygon drawing was no longer getting batched due to an incorrect polygon-facing test. (Regression from commit dab414c.) 
--- desmume/src/OGLRender.cpp | 44 +++++++++++++++++------------------ desmume/src/OGLRender_3_2.cpp | 26 ++++++++++----------- desmume/src/render3D.cpp | 5 ++++ 3 files changed, 38 insertions(+), 37 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 01eb4504..15ef7dbb 100755 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -54,7 +54,7 @@ typedef struct static OGLVersion _OGLDriverVersion = {0, 0, 0}; // Lookup Tables -static CACHE_ALIGN GLfloat material_8bit_to_float[256] = {0}; +static CACHE_ALIGN GLfloat material_6bit_to_float[64] = {0}; CACHE_ALIGN const GLfloat divide5bitBy31_LUT[32] = {0.0, 0.0322580645161, 0.0645161290323, 0.0967741935484, 0.1290322580645, 0.1612903225806, 0.1935483870968, 0.2258064516129, 0.2580645161290, 0.2903225806452, 0.3225806451613, 0.3548387096774, @@ -292,7 +292,7 @@ void main() \n\ \n\ vtxPosition = inPosition; \n\ vtxTexCoord = texScaleMtx * inTexCoord0; \n\ - vtxColor = vec4(inColor * 4.0, polyAlpha); \n\ + vtxColor = vec4(inColor / 63.0, polyAlpha); \n\ \n\ gl_Position = vtxPosition; \n\ } \n\ @@ -330,18 +330,6 @@ void main()\n\ #endif\n\ #if ENABLE_FOG\n\ vec4 newFogAttributes = vec4(0.0, 0.0, 0.0, 0.0);\n\ -#endif\n\ - \n\ -#if USE_NDS_DEPTH_CALCULATION || ENABLE_FOG\n\ - float depthOffset = (polyDepthOffsetMode == 0) ? 0.0 : ((polyDepthOffsetMode == 1) ? -DEPTH_EQUALS_TEST_TOLERANCE : DEPTH_EQUALS_TEST_TOLERANCE);\n\ - \n\ - #if ENABLE_W_DEPTH\n\ - float newFragDepthValue = clamp( ( (vtxPosition.w * 4096.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ - #else\n\ - float vertW = (vtxPosition.w == 0.0) ? 
0.00000001 : vtxPosition.w;\n\ - // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly\n\ - float newFragDepthValue = clamp( ( (floor(((vtxPosition.z/vertW) * 0.5 + 0.5) * 4194303.0) * 4.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ - #endif\n\ #endif\n\ \n\ if ((polyMode != 3) || polyDrawShadow)\n\ @@ -416,6 +404,16 @@ void main()\n\ gl_FragData[2] = newFogAttributes;\n\ #endif\n\ #if USE_NDS_DEPTH_CALCULATION || ENABLE_FOG\n\ + float depthOffset = (polyDepthOffsetMode == 0) ? 0.0 : ((polyDepthOffsetMode == 1) ? -DEPTH_EQUALS_TEST_TOLERANCE : DEPTH_EQUALS_TEST_TOLERANCE);\n\ + \n\ + #if ENABLE_W_DEPTH\n\ + float newFragDepthValue = clamp( ( (vtxPosition.w * 4096.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #else\n\ + float vertW = (vtxPosition.w == 0.0) ? 0.00000001 : vtxPosition.w;\n\ + // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly\n\ + float newFragDepthValue = clamp( ( (floor(((vtxPosition.z/vertW) * 0.5 + 0.5) * 4194303.0) * 4.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #endif\n\ + \n\ gl_FragDepth = newFragDepthValue;\n\ #endif\n\ }\n\ @@ -1908,7 +1906,7 @@ size_t OpenGLRenderer::DrawPolygonsForIndexRange(const POLYLIST *polyList, const polyPrimitive != GL_LINE_STRIP && oglPrimitiveType[nextPoly.vtxFormat] != GL_LINE_LOOP && oglPrimitiveType[nextPoly.vtxFormat] != GL_LINE_STRIP && - this->_isPolyFrontFacing[i] != this->_isPolyFrontFacing[i+1]) + this->_isPolyFrontFacing[i] == this->_isPolyFrontFacing[i+1]) { continue; } @@ -2765,7 +2763,7 @@ Render3DError OpenGLRenderer_1_2::CreateVAOs() glEnableVertexAttribArray(OGLVertexAttributeID_Color); glVertexAttribPointer(OGLVertexAttributeID_Position, 4, GL_FLOAT, GL_FALSE, sizeof(VERT), (const GLvoid *)offsetof(VERT, coord)); glVertexAttribPointer(OGLVertexAttributeID_TexCoord0, 2, GL_FLOAT, GL_FALSE, sizeof(VERT), (const GLvoid *)offsetof(VERT, texcoord)); - 
glVertexAttribPointer(OGLVertexAttributeID_Color, 3, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(VERT), (const GLvoid *)offsetof(VERT, color)); + glVertexAttribPointer(OGLVertexAttributeID_Color, 3, GL_UNSIGNED_BYTE, GL_FALSE, sizeof(VERT), (const GLvoid *)offsetof(VERT, color)); glBindVertexArray(0); @@ -3758,8 +3756,8 @@ Render3DError OpenGLRenderer_1_2::InitTables() if (needTableInit) { - for (size_t i = 0; i < 256; i++) - material_8bit_to_float[i] = (GLfloat)(i * 4) / 255.0f; + for (size_t i = 0; i < 64; i++) /* fill all 64 entries: 6-bit color components index 0..63, and entry 63 must map to 1.0 */ + material_6bit_to_float[i] = ((GLfloat)i * (255.0f/63.0f)) / 255.0f; needTableInit = false; } @@ -3931,7 +3929,7 @@ Render3DError OpenGLRenderer_1_2::EnableVertexAttributes() glEnableVertexAttribArray(OGLVertexAttributeID_Color); glVertexAttribPointer(OGLVertexAttributeID_Position, 4, GL_FLOAT, GL_FALSE, sizeof(VERT), OGLRef.vtxPtrPosition); glVertexAttribPointer(OGLVertexAttributeID_TexCoord0, 2, GL_FLOAT, GL_FALSE, sizeof(VERT), OGLRef.vtxPtrTexCoord); - glVertexAttribPointer(OGLVertexAttributeID_Color, 3, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(VERT), OGLRef.vtxPtrColor); + glVertexAttribPointer(OGLVertexAttributeID_Color, 3, GL_UNSIGNED_BYTE, GL_FALSE, sizeof(VERT), OGLRef.vtxPtrColor); } else { @@ -4356,9 +4354,9 @@ Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) // Consolidate the vertex color and the poly alpha to our internal color buffer // so that OpenGL can use it. 
const VERT *vertForAlpha = &engine.vertList[vertIndex]; - OGLRef.color4fBuffer[colorIndex+0] = material_8bit_to_float[vertForAlpha->color[0]]; - OGLRef.color4fBuffer[colorIndex+1] = material_8bit_to_float[vertForAlpha->color[1]]; - OGLRef.color4fBuffer[colorIndex+2] = material_8bit_to_float[vertForAlpha->color[2]]; + OGLRef.color4fBuffer[colorIndex+0] = material_6bit_to_float[vertForAlpha->color[0]]; + OGLRef.color4fBuffer[colorIndex+1] = material_6bit_to_float[vertForAlpha->color[1]]; + OGLRef.color4fBuffer[colorIndex+2] = material_6bit_to_float[vertForAlpha->color[2]]; OGLRef.color4fBuffer[colorIndex+3] = thePolyAlpha; // While we're looping through our vertices, add each vertex index to a @@ -5515,7 +5513,7 @@ Render3DError OpenGLRenderer_2_0::EnableVertexAttributes() glEnableVertexAttribArray(OGLVertexAttributeID_Color); glVertexAttribPointer(OGLVertexAttributeID_Position, 4, GL_FLOAT, GL_FALSE, sizeof(VERT), OGLRef.vtxPtrPosition); glVertexAttribPointer(OGLVertexAttributeID_TexCoord0, 2, GL_FLOAT, GL_FALSE, sizeof(VERT), OGLRef.vtxPtrTexCoord); - glVertexAttribPointer(OGLVertexAttributeID_Color, 3, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(VERT), OGLRef.vtxPtrColor); + glVertexAttribPointer(OGLVertexAttributeID_Color, 3, GL_UNSIGNED_BYTE, GL_FALSE, sizeof(VERT), OGLRef.vtxPtrColor); } return OGLERROR_NOERR; diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 0642d8e1..9ae2b2bd 100755 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -147,7 +147,7 @@ void main() \n\ \n\ vtxPosition = inPosition; \n\ vtxTexCoord = texScaleMtx * inTexCoord0; \n\ - vtxColor = vec4(inColor * 4.0, polyAlpha); \n\ + vtxColor = vec4(inColor / 63.0, polyAlpha); \n\ \n\ gl_Position = vtxPosition; \n\ } \n\ @@ -206,18 +206,6 @@ void main()\n\ #endif\n\ #if ENABLE_FOG\n\ vec4 newFogAttributes = vec4(0.0, 0.0, 0.0, 0.0);\n\ -#endif\n\ - \n\ -#if USE_NDS_DEPTH_CALCULATION || ENABLE_FOG\n\ - float depthOffset = (polyDepthOffsetMode == 0) ? 
0.0 : ((polyDepthOffsetMode == 1) ? -DEPTH_EQUALS_TEST_TOLERANCE : DEPTH_EQUALS_TEST_TOLERANCE);\n\ - \n\ - #if ENABLE_W_DEPTH\n\ - float newFragDepthValue = clamp( ( (vtxPosition.w * 4096.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ - #else\n\ - float vertW = (vtxPosition.w == 0.0) ? 0.00000001 : vtxPosition.w;\n\ - // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly\n\ - float newFragDepthValue = clamp( ( (floor(((vtxPosition.z/vertW) * 0.5 + 0.5) * 4194303.0) * 4.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ - #endif\n\ #endif\n\ \n\ if ((polyMode != 3u) || polyDrawShadow)\n\ @@ -299,6 +287,16 @@ void main()\n\ outFogAttributes = newFogAttributes;\n\ #endif\n\ #if USE_NDS_DEPTH_CALCULATION || ENABLE_FOG\n\ + float depthOffset = (polyDepthOffsetMode == 0) ? 0.0 : ((polyDepthOffsetMode == 1) ? -DEPTH_EQUALS_TEST_TOLERANCE : DEPTH_EQUALS_TEST_TOLERANCE);\n\ + \n\ + #if ENABLE_W_DEPTH\n\ + float newFragDepthValue = clamp( ( (vtxPosition.w * 4096.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #else\n\ + float vertW = (vtxPosition.w == 0.0) ? 
0.00000001 : vtxPosition.w;\n\ + // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly\n\ + float newFragDepthValue = clamp( ( (floor(((vtxPosition.z/vertW) * 0.5 + 0.5) * 4194303.0) * 4.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #endif\n\ + \n\ gl_FragDepth = newFragDepthValue;\n\ #endif\n\ }\n\ @@ -1259,7 +1257,7 @@ Render3DError OpenGLRenderer_3_2::CreateVAOs() glEnableVertexAttribArray(OGLVertexAttributeID_Color); glVertexAttribPointer(OGLVertexAttributeID_Position, 4, GL_FLOAT, GL_FALSE, sizeof(VERT), (const GLvoid *)offsetof(VERT, coord)); glVertexAttribPointer(OGLVertexAttributeID_TexCoord0, 2, GL_FLOAT, GL_FALSE, sizeof(VERT), (const GLvoid *)offsetof(VERT, texcoord)); - glVertexAttribPointer(OGLVertexAttributeID_Color, 3, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(VERT), (const GLvoid *)offsetof(VERT, color)); + glVertexAttribPointer(OGLVertexAttributeID_Color, 3, GL_UNSIGNED_BYTE, GL_FALSE, sizeof(VERT), (const GLvoid *)offsetof(VERT, color)); glBindVertexArray(0); diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index a2b324e3..d2fd1004 100755 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -244,6 +244,11 @@ Render3D::Render3D() _textureDeposterizeSrcSurface.Height = _textureDeposterizeDstSurface.Height = 1; _textureDeposterizeSrcSurface.Pitch = _textureDeposterizeDstSurface.Pitch = 1; + for (size_t i = 0; i < POLYLIST_SIZE; i++) + { + _textureList[i] = NULL; + } + Reset(); } From aeea0ea46a00c08f983397485a4289ba3e9a3e59 Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 26 Dec 2018 22:35:34 -0800 Subject: [PATCH 3/7] OpenGL Renderer: Remove the material_6bit_to_float LUT, since we already have an equivalent existing LUT -- divide6bitBy63_LUT. 
--- desmume/src/OGLRender.cpp | 25 +++---------------------- desmume/src/OGLRender.h | 2 -- desmume/src/OGLRender_3_2.cpp | 3 --- 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 15ef7dbb..1334acce 100755 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -54,7 +54,6 @@ typedef struct static OGLVersion _OGLDriverVersion = {0, 0, 0}; // Lookup Tables -static CACHE_ALIGN GLfloat material_6bit_to_float[64] = {0}; CACHE_ALIGN const GLfloat divide5bitBy31_LUT[32] = {0.0, 0.0322580645161, 0.0645161290323, 0.0967741935484, 0.1290322580645, 0.1612903225806, 0.1935483870968, 0.2258064516129, 0.2580645161290, 0.2903225806452, 0.3225806451613, 0.3548387096774, @@ -2440,9 +2439,6 @@ Render3DError OpenGLRenderer_1_2::InitExtensions() glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyOGL); this->_deviceInfo.maxAnisotropy = maxAnisotropyOGL; - // Initialize OpenGL - this->InitTables(); - this->isShaderSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_shader_objects") && this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_vertex_shader") && this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_fragment_shader") && @@ -3750,21 +3746,6 @@ Render3DError OpenGLRenderer_1_2::InitFinalRenderStates(const std::setcolor[0]]; - OGLRef.color4fBuffer[colorIndex+1] = material_6bit_to_float[vertForAlpha->color[1]]; - OGLRef.color4fBuffer[colorIndex+2] = material_6bit_to_float[vertForAlpha->color[2]]; + OGLRef.color4fBuffer[colorIndex+0] = divide6bitBy63_LUT[vertForAlpha->color[0]]; + OGLRef.color4fBuffer[colorIndex+1] = divide6bitBy63_LUT[vertForAlpha->color[1]]; + OGLRef.color4fBuffer[colorIndex+2] = divide6bitBy63_LUT[vertForAlpha->color[2]]; OGLRef.color4fBuffer[colorIndex+3] = thePolyAlpha; // While we're looping through our vertices, add each vertex index to a diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index def9567a..89029b6e 100755 --- 
a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -727,7 +727,6 @@ protected: virtual void DestroyFramebufferOutput8888Program() = 0; virtual Render3DError InitFinalRenderStates(const std::set *oglExtensionSet) = 0; - virtual Render3DError InitTables() = 0; virtual Render3DError InitPostprocessingPrograms(const char *edgeMarkVtxShader, const char *edgeMarkFragShader, const char *framebufferOutputVtxShader, @@ -803,7 +802,6 @@ protected: virtual void DestroyFramebufferOutput8888Program(); virtual Render3DError InitFinalRenderStates(const std::set *oglExtensionSet); - virtual Render3DError InitTables(); virtual Render3DError InitPostprocessingPrograms(const char *edgeMarkVtxShader, const char *edgeMarkFragShader, const char *framebufferOutputVtxShader, diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 9ae2b2bd..79387f98 100755 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -795,9 +795,6 @@ Render3DError OpenGLRenderer_3_2::InitExtensions() this->_deviceInfo.isEdgeMarkSupported = true; this->_deviceInfo.isFogSupported = true; - // Initialize OpenGL - this->InitTables(); - glGenTextures(1, &OGLRef.texFinalColorID); glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_FinalColor); glBindTexture(GL_TEXTURE_2D, OGLRef.texFinalColorID); From 0c0bd5144ec7811f22f006329fc2261fd54c4ea5 Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 28 Dec 2018 15:39:09 -0800 Subject: [PATCH 4/7] Cocoa Port: Do a small optimization when doing video output framebuffer fetches for Metal display views. 
--- .../cocoa/MetalRendererCommonShaders.h | 9 ++- .../cocoa/MetalRendererCommonShaders.metal | 20 ++++-- .../cocoa/userinterface/MacMetalDisplayView.h | 3 +- .../userinterface/MacMetalDisplayView.mm | 68 ++++++++++++------- .../MacMetalDisplayViewShaders.metal | 45 +++--------- 5 files changed, 78 insertions(+), 67 deletions(-) diff --git a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h index c1cd9ff5..bc50b81a 100644 --- a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h +++ b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h @@ -18,9 +18,12 @@ #ifndef _METAL_RENDERER_COMMON_H_ #define _METAL_RENDERER_COMMON_H_ -float4 unpack_unorm1555_to_unorm8888(const ushort color16); +float4 unpack_rgba5551_to_unorm8888(const ushort color16); -ushort pack_color_to_unorm5551(const float4 inColor); -float4 pack_color_to_unorm6665(const float4 inColor); +ushort pack_unorm8888_to_rgba5551(const float4 inColor); +uchar4 pack_unorm8888_to_rgba6665(const float4 inColor); +uchar4 pack_unorm8888_to_rgba8888(const float4 inColor); + +float4 convert_unorm666X_to_unorm8888(const float4 inColor); #endif // _METAL_RENDERER_COMMON_H_ diff --git a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal index 62263930..c6f8f83b 100644 --- a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal +++ b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal @@ -21,7 +21,7 @@ using namespace metal; #include "MetalRendererCommonShaders.h" -float4 unpack_unorm1555_to_unorm8888(const ushort color16) +float4 unpack_rgba5551_to_unorm8888(const ushort color16) { return float4((float)((color16 >> 0) & 0x1F) / 31.0f, (float)((color16 >> 5) & 0x1F) / 31.0f, @@ -29,9 +29,9 @@ float4 unpack_unorm1555_to_unorm8888(const ushort color16) (float)(color16 >> 15)); } -ushort pack_color_to_unorm5551(const float4 inColor) +ushort pack_unorm8888_to_rgba5551(const 
float4 inColor) { - ushort4 color16 = (ushort4)((inColor * 31.0f) + 0.35f); + ushort4 color16 = ushort4( (inColor * 31.0f) + 0.1f ); color16.g <<= 5; color16.b <<= 10; @@ -40,7 +40,17 @@ ushort pack_color_to_unorm5551(const float4 inColor) return (color16.r | color16.g | color16.b | color16.a); } -float4 pack_color_to_unorm6665(const float4 inColor) +uchar4 pack_unorm8888_to_rgba6665(const float4 inColor) { - return inColor * float4(63.0f/255.0f, 63.0f/255.0f, 63.0f/255.0f, 31.0f/255.0f); + return uchar4( (inColor * float4(63.0f, 63.0f, 63.0f, 31.0f)) + 0.1f ); +} + +uchar4 pack_unorm8888_to_rgba8888(const float4 inColor) +{ + return uchar4( (inColor * 255.0f) + 0.1f ); +} + +float4 convert_unorm666X_to_unorm8888(const float4 inColor) +{ + return float4( inColor.rgb * (255.0f/63.0f), 1.0f ); } diff --git a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.h b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.h index 5599145b..69de5ff8 100644 --- a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.h +++ b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.h @@ -129,8 +129,9 @@ typedef DisplayViewShaderProperties DisplayViewShaderProperties; id texHQ4xLUT; id texCurrentHQnxLUT; - MTLSize _fetchThreadsPerGroup; + MTLSize _fetchThreadsPerGroupNative; MTLSize _fetchThreadGroupsPerGridNative; + MTLSize _fetchThreadsPerGroupCustom; MTLSize _fetchThreadGroupsPerGridCustom; MTLSize deposterizeThreadsPerGroup; MTLSize deposterizeThreadGroupsPerGrid; diff --git a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm index 5ecdf777..8d173348 100644 --- a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm +++ b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm @@ -71,10 +71,10 @@ MTLComputePipelineDescriptor *computePipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; [computePipelineDesc 
setThreadGroupSizeIsMultipleOfThreadExecutionWidth:YES]; - [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch555ConvertOnly"]]; + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"convert_texture_rgb555_to_unorm8888"]]; _fetch555ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; - [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch666ConvertOnly"]]; + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"convert_texture_unorm666X_to_unorm8888"]]; _fetch666ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"src_filter_deposterize"]]; @@ -99,22 +99,27 @@ [computePipelineDesc release]; - size_t tw = GetNearestPositivePOT((uint32_t)[_fetch555Pipeline threadExecutionWidth]); - while ( (tw > [_fetch555Pipeline threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) + NSUInteger tw = [_fetch555Pipeline threadExecutionWidth]; + while ( ((GPU_FRAMEBUFFER_NATIVE_WIDTH % tw) != 0) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) { tw >>= 1; } - size_t th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw; + NSUInteger th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw; + while ( ((GPU_FRAMEBUFFER_NATIVE_HEIGHT % th) != 0) || (th > GPU_FRAMEBUFFER_NATIVE_HEIGHT) ) + { + th >>= 1; + } - _fetchThreadsPerGroup = MTLSizeMake(tw, th, 1); + _fetchThreadsPerGroupNative = MTLSizeMake(tw, th, 1); _fetchThreadGroupsPerGridNative = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw, GPU_FRAMEBUFFER_NATIVE_HEIGHT / th, 1); + _fetchThreadsPerGroupCustom = _fetchThreadsPerGroupNative; _fetchThreadGroupsPerGridCustom = _fetchThreadGroupsPerGridNative; - deposterizeThreadsPerGroup = 
_fetchThreadsPerGroup; + deposterizeThreadsPerGroup = _fetchThreadsPerGroupNative; deposterizeThreadGroupsPerGrid = _fetchThreadGroupsPerGridNative; MTLRenderPipelineDescriptor *hudPipelineDesc = [[MTLRenderPipelineDescriptor alloc] init]; @@ -413,9 +418,22 @@ _fetchPixelBytes = dispInfo.pixelBytes; - const size_t tw = _fetchThreadsPerGroup.width; - const size_t th = _fetchThreadsPerGroup.height; - _fetchThreadGroupsPerGridCustom = MTLSizeMake((w + tw - 1) / tw, (h + th - 1) / th, 1); + NSUInteger tw = [_fetch555Pipeline threadExecutionWidth]; + while ( ((w % tw) != 0) || (tw > w) ) + { + tw >>= 1; + } + + NSUInteger th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw; + while ( ((h % th) != 0) || (th > h) ) + { + th >>= 1; + } + + _fetchThreadsPerGroupCustom = MTLSizeMake(tw, th, 1); + _fetchThreadGroupsPerGridCustom = MTLSizeMake(w / tw, + h / th, + 1); id cb = [_fetchCommandQueue commandBufferWithUnretainedReferences]; MetalTexturePair newTexPair = [self setFetchTextureBindingsAtIndex:dispInfo.bufferIndex commandBuffer:cb]; @@ -498,7 +516,7 @@ [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupNative]; targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index]; } @@ -507,7 +525,7 @@ [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupCustom]; targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index]; } @@ -528,7 +546,7 @@ [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0]; [cce 
setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupNative]; targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index]; } @@ -537,7 +555,7 @@ [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupCustom]; targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index]; } @@ -572,7 +590,7 @@ [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupNative]; targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index]; } @@ -581,7 +599,7 @@ [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupCustom]; targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index]; } @@ -594,7 +612,7 @@ [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupNative]; targetTexPair.touch = 
_texDisplayPostprocessNative[NDSDisplayID_Touch][index]; } @@ -603,7 +621,7 @@ [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupCustom]; targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index]; } @@ -1023,17 +1041,21 @@ _texDisplayPixelScaler[NDSDisplayID_Main] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc]; _texDisplayPixelScaler[NDSDisplayID_Touch] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc]; - size_t tw = GetNearestPositivePOT((uint32_t)[[self pixelScalePipeline] threadExecutionWidth]); - while ( (tw > [[self pixelScalePipeline] threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) + NSUInteger tw = [[self pixelScalePipeline] threadExecutionWidth]; + while ( ((newScalerWidth % tw) != 0) || (tw > newScalerWidth) ) { tw >>= 1; } - const size_t th = [[self pixelScalePipeline] maxTotalThreadsPerThreadgroup] / tw; + NSUInteger th = [[self pixelScalePipeline] maxTotalThreadsPerThreadgroup] / tw; + while ( ((newScalerHeight % th) != 0) || (th > newScalerHeight) ) + { + th >>= 1; + } _pixelScalerThreadsPerGroup = MTLSizeMake(tw, th, 1); - _pixelScalerThreadGroupsPerGrid = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw, - GPU_FRAMEBUFFER_NATIVE_HEIGHT / th, + _pixelScalerThreadGroupsPerGrid = MTLSizeMake(newScalerWidth / tw, + newScalerHeight / th, 1); } else diff --git a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal index 783554ca..5cf9fda0 100644 --- a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal +++ b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal @@ -432,12 +432,7 
@@ kernel void nds_fetch555(const uint2 position [[thread_position_in_grid]], { const uint h = inTexture.get_height(); - if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) ) - { - return; - } - - const float4 inColor = unpack_unorm1555_to_unorm8888( (ushort)inTexture.read(position).r ); + const float4 inColor = unpack_rgba5551_to_unorm8888( (ushort)inTexture.read(position).r ); float3 outColor = inColor.rgb; const uint line = uint( (float)position.y / ((float)h / 192.0f) ); @@ -454,11 +449,6 @@ kernel void nds_fetch666(const uint2 position [[thread_position_in_grid]], { const uint h = inTexture.get_height(); - if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) ) - { - return; - } - const float4 inColor = inTexture.read(position); float3 outColor = inColor.rgb * float3(255.0f/63.0f); @@ -476,11 +466,6 @@ kernel void nds_fetch888(const uint2 position [[thread_position_in_grid]], { const uint h = inTexture.get_height(); - if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) ) - { - return; - } - const float4 inColor = inTexture.read(position); float3 outColor = inColor.rgb; @@ -490,30 +475,20 @@ kernel void nds_fetch888(const uint2 position [[thread_position_in_grid]], outTexture.write(float4(outColor, 1.0f), position); } -kernel void nds_fetch555ConvertOnly(const uint2 position [[thread_position_in_grid]], - const texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]]) +kernel void convert_texture_rgb555_to_unorm8888(const uint2 position [[thread_position_in_grid]], + const texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]]) { - if ( (position.x > inTexture.get_width() - 1) || (position.y > inTexture.get_height() - 1) ) - { - return; - } - - const float4 outColor = unpack_unorm1555_to_unorm8888( (ushort)inTexture.read(position).r ); + const float4 outColor = unpack_rgba5551_to_unorm8888( (ushort)inTexture.read(position).r ); outTexture.write(float4(outColor.rgb, 1.0f), 
position); } -kernel void nds_fetch666ConvertOnly(const uint2 position [[thread_position_in_grid]], - const texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]]) +kernel void convert_texture_unorm666X_to_unorm8888(const uint2 position [[thread_position_in_grid]], + const texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]]) { - if ( (position.x > inTexture.get_width() - 1) || (position.y > inTexture.get_height() - 1) ) - { - return; - } - - const float3 outColor = inTexture.read(position).rgb * float3(255.0f/63.0f); - outTexture.write(float4(outColor, 1.0f), position); + const float4 outColor = convert_unorm666X_to_unorm8888( inTexture.read(position) ); + outTexture.write(outColor, position); } float3 nds_apply_master_brightness(const float3 inColor, const uchar mode, const float intensity) From 4d6a132116285c6dd719943f1ba30791e264e915 Mon Sep 17 00:00:00 2001 From: rogerman Date: Sat, 29 Dec 2018 22:37:37 -0800 Subject: [PATCH 5/7] OpenGL Renderer: Mitigate some of the performance penalty of using the NDS Style Depth Calculation option. - GPUs that support the GL_ARB_conservative_depth extension will benefit more from this commit. (OpenGL 3.2 only.) - Also fix some miscellaneous bugs. --- desmume/src/OGLRender.cpp | 289 +++++++++++++++++++--------------- desmume/src/OGLRender.h | 5 +- desmume/src/OGLRender_3_2.cpp | 61 +++++-- 3 files changed, 213 insertions(+), 142 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 1334acce..6308e913 100755 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -403,14 +403,25 @@ void main()\n\ gl_FragData[2] = newFogAttributes;\n\ #endif\n\ #if USE_NDS_DEPTH_CALCULATION || ENABLE_FOG\n\ - float depthOffset = (polyDepthOffsetMode == 0) ? 0.0 : ((polyDepthOffsetMode == 1) ? 
-DEPTH_EQUALS_TEST_TOLERANCE : DEPTH_EQUALS_TEST_TOLERANCE);\n\ + // It is tempting to perform the NDS depth calculation in the vertex shader rather than in the fragment shader.\n\ + // Resist this temptation! It is much more reliable to do the depth calculation in the fragment shader due to\n\ + // subtle interpolation differences between various GPUs and/or drivers. If the depth calculation is not done\n\ + // here, then it is very possible for the user to experience Z-fighting in certain rendering situations.\n\ \n\ - #if ENABLE_W_DEPTH\n\ - float newFragDepthValue = clamp( ( (vtxPosition.w * 4096.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #if NEEDS_DEPTH_EQUALS_TEST\n\ + float depthOffset = (polyDepthOffsetMode == 0) ? 0.0 : ((polyDepthOffsetMode == 1) ? -DEPTH_EQUALS_TEST_TOLERANCE : DEPTH_EQUALS_TEST_TOLERANCE);\n\ + #if ENABLE_W_DEPTH\n\ + float newFragDepthValue = clamp( ( (vtxPosition.w * 4096.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #else\n\ + float newFragDepthValue = clamp( ( (floor(gl_FragCoord.z * 4194303.0) * 4.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #endif\n\ #else\n\ - float vertW = (vtxPosition.w == 0.0) ? 
0.00000001 : vtxPosition.w;\n\ - // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly\n\ - float newFragDepthValue = clamp( ( (floor(((vtxPosition.z/vertW) * 0.5 + 0.5) * 4194303.0) * 4.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #if ENABLE_W_DEPTH\n\ + float newFragDepthValue = clamp( (vtxPosition.w * 4096.0) / 16777215.0, 0.0, 1.0 );\n\ + #else\n\ + // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly\n\ + float newFragDepthValue = clamp( (floor(gl_FragCoord.z * 4194303.0) * 4.0) / 16777215.0, 0.0, 1.0 );\n\ + #endif\n\ #endif\n\ \n\ gl_FragDepth = newFragDepthValue;\n\ @@ -1223,6 +1234,7 @@ OpenGLRenderer::OpenGLRenderer() isMultisampledFBOSupported = false; isShaderSupported = false; isSampleShadingSupported = false; + isConservativeDepthSupported = false; isVAOSupported = false; willFlipOnlyFramebufferOnGPU = false; willFlipAndConvertFramebufferOnGPU = false; @@ -1241,6 +1253,7 @@ OpenGLRenderer::OpenGLRenderer() _workingTextureUnpackBuffer = (FragmentColor *)malloc_alignedCacheLine(1024 * 1024 * sizeof(FragmentColor)); _pixelReadNeedsFinish = false; _needsZeroDstAlphaPass = true; + _renderNeedsDepthEqualsTest = false; _currentPolyIndex = 0; _lastTextureDrawTarget = OGLTextureUnitID_GColor; _geometryProgramFlags.value = 0; @@ -2110,76 +2123,73 @@ Render3DError OpenGLRenderer::DrawAlphaTexturePolygon(const GLenum polyPrimitive glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); } } - else + else if (DRAWMODE != OGLPolyDrawMode_DrawOpaquePolys) { - if (DRAWMODE != OGLPolyDrawMode_DrawOpaquePolys) + // Draw the translucent fragments. + glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); + + // Draw the opaque fragments if they might exist. + if (canHaveOpaqueFragments) { - // Draw the translucent fragments. 
- glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); - - // Draw the opaque fragments if they might exist. - if (canHaveOpaqueFragments) + if (DRAWMODE != OGLPolyDrawMode_ZeroAlphaPass) { - if (DRAWMODE != OGLPolyDrawMode_ZeroAlphaPass) - { - glStencilFunc(GL_ALWAYS, opaquePolyID, 0x3F); - glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE); - glDepthMask(GL_TRUE); - } - - glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_TRUE); - glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); - glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); - - if (DRAWMODE != OGLPolyDrawMode_ZeroAlphaPass) - { - glStencilFunc(GL_NOTEQUAL, 0x40 | opaquePolyID, 0x7F); - glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE); - glDepthMask((enableAlphaDepthWrite) ? GL_TRUE : GL_FALSE); - } + glStencilFunc(GL_ALWAYS, opaquePolyID, 0x3F); + glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE); + glDepthMask(GL_TRUE); + } + + glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_TRUE); + glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); + glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); + + if (DRAWMODE != OGLPolyDrawMode_ZeroAlphaPass) + { + glStencilFunc(GL_NOTEQUAL, 0x40 | opaquePolyID, 0x7F); + glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE); + glDepthMask((enableAlphaDepthWrite) ? GL_TRUE : GL_FALSE); } } - else // Draw the polygon as completely opaque. + } + else // Draw the polygon as completely opaque. 
+ { + glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_TRUE); + + if (this->_emulateDepthLEqualPolygonFacing) { - glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_TRUE); - - if (this->_emulateDepthLEqualPolygonFacing) + if (isPolyFrontFacing) { - if (isPolyFrontFacing) - { - glDepthFunc(GL_EQUAL); - glStencilFunc(GL_EQUAL, 0x40 | opaquePolyID, 0x40); - glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); - - glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); - glDepthMask(GL_FALSE); - glStencilOp(GL_KEEP, GL_KEEP, GL_ZERO); - glStencilMask(0x40); - glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); - - glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - glDepthMask(GL_TRUE); - glDepthFunc(GL_LESS); - glStencilFunc(GL_ALWAYS, opaquePolyID, 0x3F); - glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE); - glStencilMask(0xFF); - glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); - } - else - { - glStencilFunc(GL_ALWAYS, 0x40 | opaquePolyID, 0x40); - glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); - - glStencilFunc(GL_ALWAYS, opaquePolyID, 0x3F); - } + glDepthFunc(GL_EQUAL); + glStencilFunc(GL_EQUAL, 0x40 | opaquePolyID, 0x40); + glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); + + glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); + glDepthMask(GL_FALSE); + glStencilOp(GL_KEEP, GL_KEEP, GL_ZERO); + glStencilMask(0x40); + glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); + + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + glDepthMask(GL_TRUE); + glDepthFunc(GL_LESS); + glStencilFunc(GL_ALWAYS, opaquePolyID, 0x3F); + glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE); + glStencilMask(0xFF); + glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); } else { + glStencilFunc(GL_ALWAYS, 0x40 | opaquePolyID, 0x40); 
glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); + + glStencilFunc(GL_ALWAYS, opaquePolyID, 0x3F); } - - glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); } + else + { + glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); + } + + glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); } } else @@ -3084,11 +3094,11 @@ Render3DError OpenGLRenderer_1_2::CreateGeometryPrograms() OGLGeometryFlags programFlags; programFlags.value = 0; - std::stringstream shaderHeader; - shaderHeader << "#define DEPTH_EQUALS_TEST_TOLERANCE " << DEPTH_EQUALS_TEST_TOLERANCE << ".0 \n"; - shaderHeader << "\n"; + std::stringstream fragShaderHeader; + fragShaderHeader << "#define DEPTH_EQUALS_TEST_TOLERANCE " << DEPTH_EQUALS_TEST_TOLERANCE << ".0 \n"; + fragShaderHeader << "\n"; - for (size_t flagsValue = 0; flagsValue < 64; flagsValue++, programFlags.value++) + for (size_t flagsValue = 0; flagsValue < 128; flagsValue++, programFlags.value++) { std::stringstream shaderFlags; shaderFlags << "#define USE_TEXTURE_SMOOTHING " << ((this->_enableTextureSmoothing) ? 1 : 0) << "\n"; @@ -3100,9 +3110,10 @@ Render3DError OpenGLRenderer_1_2::CreateGeometryPrograms() shaderFlags << "#define ENABLE_FOG " << ((programFlags.EnableFog) ? 1 : 0) << "\n"; shaderFlags << "#define ENABLE_EDGE_MARK " << ((programFlags.EnableEdgeMark) ? 1 : 0) << "\n"; shaderFlags << "#define TOON_SHADING_MODE " << ((programFlags.ToonShadingMode) ? 1 : 0) << "\n"; + shaderFlags << "#define NEEDS_DEPTH_EQUALS_TEST " << ((programFlags.NeedsDepthEqualsTest) ? 
1 : 0) << "\n"; shaderFlags << "\n"; - std::string fragShaderCode = shaderHeader.str() + shaderFlags.str() + std::string(GeometryFragShader_100); + std::string fragShaderCode = fragShaderHeader.str() + shaderFlags.str() + std::string(GeometryFragShader_100); error = this->ShaderProgramCreate(OGLRef.vertexGeometryShaderID, OGLRef.fragmentGeometryShaderID[flagsValue], @@ -3170,7 +3181,7 @@ void OpenGLRenderer_1_2::DestroyGeometryPrograms() OGLRenderRef &OGLRef = *this->ref; - for (size_t flagsValue = 0; flagsValue < 64; flagsValue++) + for (size_t flagsValue = 0; flagsValue < 128; flagsValue++) { if (OGLRef.programGeometryID[flagsValue] == 0) { @@ -4238,37 +4249,6 @@ Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) return OGLERROR_BEGINGL_FAILED; } - if (this->isShaderSupported) - { - this->_geometryProgramFlags.EnableWDepth = (engine.renderState.wbuffer) ? 1 : 0; - this->_geometryProgramFlags.EnableAlphaTest = (engine.renderState.enableAlphaTest) ? 1 : 0; - this->_geometryProgramFlags.EnableTextureSampling = (this->_enableTextureSampling) ? 1 : 0; - this->_geometryProgramFlags.EnableFog = (this->_enableFog) ? 1 : 0; - this->_geometryProgramFlags.EnableEdgeMark = (this->_enableEdgeMark) ? 1 : 0; - this->_geometryProgramFlags.ToonShadingMode = (engine.renderState.shading) ? 
1 : 0; - - glUseProgram(OGLRef.programGeometryID[this->_geometryProgramFlags.value]); - glUniform1i(OGLRef.uniformStateClearPolyID, this->_clearAttributes.opaquePolyID); - glUniform1f(OGLRef.uniformStateClearDepth, (GLfloat)this->_clearAttributes.depth / (GLfloat)0x00FFFFFF); - glUniform1f(OGLRef.uniformStateAlphaTestRef[this->_geometryProgramFlags.value], divide5bitBy31_LUT[engine.renderState.alphaTestRef]); - glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); - glUniform1i(OGLRef.uniformPolyDrawShadow[this->_geometryProgramFlags.value], GL_FALSE); - } - else - { - if(engine.renderState.enableAlphaTest && (engine.renderState.alphaTestRef > 0)) - { - glAlphaFunc(GL_GEQUAL, divide5bitBy31_LUT[engine.renderState.alphaTestRef]); - } - else - { - glAlphaFunc(GL_GREATER, 0); - } - - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - } - GLushort *indexPtr = NULL; if (this->isVBOSupported) @@ -4286,6 +4266,7 @@ Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) indexPtr = OGLRef.vertIndexBuffer; } + this->_renderNeedsDepthEqualsTest = false; size_t vertIndexCount = 0; for (size_t i = 0; i < engine.polylist->count; i++) @@ -4361,15 +4342,16 @@ Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) // Get this polygon's facing. 
const size_t n = polyType - 1; - float facing = (vert[0].y + vert[n].y) * (vert[0].x - vert[n].x) - + (vert[1].y + vert[0].y) * (vert[1].x - vert[0].x) - + (vert[2].y + vert[1].y) * (vert[2].x - vert[1].x); + float facing = (vert[0].y + vert[n].y) * (vert[0].x - vert[n].x) + + (vert[1].y + vert[0].y) * (vert[1].x - vert[0].x) + + (vert[2].y + vert[1].y) * (vert[2].x - vert[1].x); for (size_t j = 2; j < n; j++) { facing += (vert[j+1].y + vert[j].y) * (vert[j+1].x - vert[j].x); } + this->_renderNeedsDepthEqualsTest = this->_renderNeedsDepthEqualsTest || (thePoly.attribute.DepthEqualTest_Enable != 0); this->_isPolyFrontFacing[i] = (facing < 0); // Get the texture that is to be attached to this polygon. @@ -4382,6 +4364,38 @@ Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) glBufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(VERT) * engine.vertListCount, engine.vertList); } + if (this->isShaderSupported) + { + this->_geometryProgramFlags.EnableWDepth = (engine.renderState.wbuffer) ? 1 : 0; + this->_geometryProgramFlags.EnableAlphaTest = (engine.renderState.enableAlphaTest) ? 1 : 0; + this->_geometryProgramFlags.EnableTextureSampling = (this->_enableTextureSampling) ? 1 : 0; + this->_geometryProgramFlags.EnableFog = (this->_enableFog) ? 1 : 0; + this->_geometryProgramFlags.EnableEdgeMark = (this->_enableEdgeMark) ? 1 : 0; + this->_geometryProgramFlags.ToonShadingMode = (engine.renderState.shading) ? 1 : 0; + this->_geometryProgramFlags.NeedsDepthEqualsTest = (this->_renderNeedsDepthEqualsTest) ? 
1 : 0; + + glUseProgram(OGLRef.programGeometryID[this->_geometryProgramFlags.value]); + glUniform1i(OGLRef.uniformStateClearPolyID, this->_clearAttributes.opaquePolyID); + glUniform1f(OGLRef.uniformStateClearDepth, (GLfloat)this->_clearAttributes.depth / (GLfloat)0x00FFFFFF); + glUniform1f(OGLRef.uniformStateAlphaTestRef[this->_geometryProgramFlags.value], divide5bitBy31_LUT[engine.renderState.alphaTestRef]); + glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); + glUniform1i(OGLRef.uniformPolyDrawShadow[this->_geometryProgramFlags.value], GL_FALSE); + } + else + { + if(engine.renderState.enableAlphaTest && (engine.renderState.alphaTestRef > 0)) + { + glAlphaFunc(GL_GEQUAL, divide5bitBy31_LUT[engine.renderState.alphaTestRef]); + } + else + { + glAlphaFunc(GL_GREATER, 0); + } + + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + } + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); glDepthMask(GL_TRUE); @@ -5086,7 +5100,7 @@ Render3DError OpenGLRenderer_1_2::DrawShadowPolygon(const GLenum polyPrimitive, // 1st pass: Create the shadow volume. if (opaquePolyID == 0) { - if (performDepthEqualTest && this->isShaderSupported) + if (performDepthEqualTest && this->_emulateNDSDepthCalculation && this->isShaderSupported) { // Use the stencil buffer to determine which fragments fail the depth test using the lower-side tolerance. glUniform1i(OGLRef.uniformPolyDepthOffsetMode[this->_geometryProgramFlags.value], 1); @@ -5103,6 +5117,8 @@ Render3DError OpenGLRenderer_1_2::DrawShadowPolygon(const GLenum polyPrimitive, glStencilOp(GL_KEEP, GL_REPLACE, GL_KEEP); glStencilMask(0x80); glDrawElements(polyPrimitive, vertIndexCount, GL_UNSIGNED_SHORT, indexBufferPtr); + + glUniform1i(OGLRef.uniformPolyDepthOffsetMode[this->_geometryProgramFlags.value], 0); } else { @@ -5113,7 +5129,7 @@ Render3DError OpenGLRenderer_1_2::DrawShadowPolygon(const GLenum polyPrimitive, } // 2nd pass: Do the polygon ID check. 
- if (performDepthEqualTest && this->isShaderSupported) + if (performDepthEqualTest && this->_emulateNDSDepthCalculation && this->isShaderSupported) { // Use the stencil buffer to determine which fragments pass the lower-side tolerance. glUniform1i(OGLRef.uniformPolyDepthOffsetMode[this->_geometryProgramFlags.value], 1); @@ -5227,6 +5243,7 @@ Render3DError OpenGLRenderer_1_2::Reset() memset(OGLRef.vertIndexBuffer, 0, OGLRENDER_VERT_INDEX_BUFFER_COUNT * sizeof(GLushort)); } + this->_renderNeedsDepthEqualsTest = false; this->_currentPolyIndex = 0; OGLRef.vtxPtrPosition = (GLvoid *)offsetof(VERT, coord); @@ -5525,40 +5542,35 @@ Render3DError OpenGLRenderer_2_0::BeginRender(const GFX3D &engine) return OGLERROR_BEGINGL_FAILED; } - // Setup render states - this->_geometryProgramFlags.EnableWDepth = (engine.renderState.wbuffer) ? 1 : 0; - this->_geometryProgramFlags.EnableAlphaTest = (engine.renderState.enableAlphaTest) ? 1 : 0; - this->_geometryProgramFlags.EnableTextureSampling = (this->_enableTextureSampling) ? 1 : 0; - this->_geometryProgramFlags.EnableFog = (this->_enableFog) ? 1 : 0; - this->_geometryProgramFlags.EnableEdgeMark = (this->_enableEdgeMark) ? 1 : 0; - this->_geometryProgramFlags.ToonShadingMode = (engine.renderState.shading) ? 
1 : 0; - - glUseProgram(OGLRef.programGeometryID[this->_geometryProgramFlags.value]); - glUniform1f(OGLRef.uniformStateAlphaTestRef[this->_geometryProgramFlags.value], divide5bitBy31_LUT[engine.renderState.alphaTestRef]); - glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); - glUniform1i(OGLRef.uniformPolyDrawShadow[this->_geometryProgramFlags.value], GL_FALSE); - glBindBuffer(GL_ARRAY_BUFFER, OGLRef.vboGeometryVtxID); glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, OGLRef.iboGeometryIndexID); + this->_renderNeedsDepthEqualsTest = false; + size_t vertIndexCount = 0; GLushort *indexPtr = (GLushort *)glMapBuffer(GL_ELEMENT_ARRAY_BUFFER, GL_WRITE_ONLY); for (size_t i = 0; i < engine.polylist->count; i++) { - const POLY *thePoly = &engine.polylist->list[engine.indexlist.list[i]]; - const size_t polyType = thePoly->type; + const POLY &thePoly = engine.polylist->list[engine.indexlist.list[i]]; + const size_t polyType = thePoly.type; + const VERT vert[4] = { + engine.vertList[thePoly.vertIndexes[0]], + engine.vertList[thePoly.vertIndexes[1]], + engine.vertList[thePoly.vertIndexes[2]], + engine.vertList[thePoly.vertIndexes[3]] + }; for (size_t j = 0; j < polyType; j++) { - const GLushort vertIndex = thePoly->vertIndexes[j]; + const GLushort vertIndex = thePoly.vertIndexes[j]; // While we're looping through our vertices, add each vertex index to // a buffer. For GFX3D_QUADS and GFX3D_QUAD_STRIP, we also add additional // vertices here to convert them to GL_TRIANGLES, which are much easier // to work with and won't be deprecated in future OpenGL versions. 
indexPtr[vertIndexCount++] = vertIndex; - if (thePoly->vtxFormat == GFX3D_QUADS || thePoly->vtxFormat == GFX3D_QUAD_STRIP) + if (thePoly.vtxFormat == GFX3D_QUADS || thePoly.vtxFormat == GFX3D_QUAD_STRIP) { if (j == 2) { @@ -5566,17 +5578,46 @@ Render3DError OpenGLRenderer_2_0::BeginRender(const GFX3D &engine) } else if (j == 3) { - indexPtr[vertIndexCount++] = thePoly->vertIndexes[0]; + indexPtr[vertIndexCount++] = thePoly.vertIndexes[0]; } } } - this->_textureList[i] = this->GetLoadedTextureFromPolygon(*thePoly, this->_enableTextureSampling); + // Get this polygon's facing. + const size_t n = polyType - 1; + float facing = (vert[0].y + vert[n].y) * (vert[0].x - vert[n].x) + + (vert[1].y + vert[0].y) * (vert[1].x - vert[0].x) + + (vert[2].y + vert[1].y) * (vert[2].x - vert[1].x); + + for (size_t j = 2; j < n; j++) + { + facing += (vert[j+1].y + vert[j].y) * (vert[j+1].x - vert[j].x); + } + + this->_renderNeedsDepthEqualsTest = this->_renderNeedsDepthEqualsTest || (thePoly.attribute.DepthEqualTest_Enable != 0); + this->_isPolyFrontFacing[i] = (facing < 0); + + // Get the texture that is to be attached to this polygon. + this->_textureList[i] = this->GetLoadedTextureFromPolygon(thePoly, this->_enableTextureSampling); } glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER); glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(VERT) * engine.vertListCount, engine.vertList); + // Setup render states + this->_geometryProgramFlags.EnableWDepth = (engine.renderState.wbuffer) ? 1 : 0; + this->_geometryProgramFlags.EnableAlphaTest = (engine.renderState.enableAlphaTest) ? 1 : 0; + this->_geometryProgramFlags.EnableTextureSampling = (this->_enableTextureSampling) ? 1 : 0; + this->_geometryProgramFlags.EnableFog = (this->_enableFog) ? 1 : 0; + this->_geometryProgramFlags.EnableEdgeMark = (this->_enableEdgeMark) ? 1 : 0; + this->_geometryProgramFlags.ToonShadingMode = (engine.renderState.shading) ? 1 : 0; + this->_geometryProgramFlags.NeedsDepthEqualsTest = (this->_renderNeedsDepthEqualsTest) ? 
1 : 0; + + glUseProgram(OGLRef.programGeometryID[this->_geometryProgramFlags.value]); + glUniform1f(OGLRef.uniformStateAlphaTestRef[this->_geometryProgramFlags.value], divide5bitBy31_LUT[engine.renderState.alphaTestRef]); + glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); + glUniform1i(OGLRef.uniformPolyDrawShadow[this->_geometryProgramFlags.value], GL_FALSE); + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); glDepthMask(GL_TRUE); diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index 89029b6e..7049398f 100755 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -414,7 +414,8 @@ union OGLGeometryFlags u8 EnableFog:1; u8 EnableEdgeMark:1; u8 ToonShadingMode:1; - u8 :3; + u8 NeedsDepthEqualsTest:1; + u8 :1; }; }; typedef OGLGeometryFlags OGLGeometryFlags; @@ -657,6 +658,7 @@ protected: bool isShaderSupported; bool isVAOSupported; bool isSampleShadingSupported; + bool isConservativeDepthSupported; bool willFlipOnlyFramebufferOnGPU; bool willFlipAndConvertFramebufferOnGPU; bool willUsePerSampleZeroDstPass; @@ -670,6 +672,7 @@ protected: FragmentColor *_workingTextureUnpackBuffer; bool _pixelReadNeedsFinish; bool _needsZeroDstAlphaPass; + bool _renderNeedsDepthEqualsTest; size_t _currentPolyIndex; OGLTextureUnitID _lastTextureDrawTarget; OGLGeometryFlags _geometryProgramFlags; diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 79387f98..9aac389e 100755 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -197,6 +197,9 @@ out vec4 outPolyID;\n\ #if ENABLE_FOG\n\ out vec4 outFogAttributes;\n\ #endif\n\ +#if IS_CONSERVATIVE_DEPTH_SUPPORTED && (USE_NDS_DEPTH_CALCULATION || ENABLE_FOG) && !NEEDS_DEPTH_EQUALS_TEST && !ENABLE_W_DEPTH\n\ +layout (depth_less) out float gl_FragDepth;\n\ +#endif\n\ \n\ void main()\n\ {\n\ @@ -287,14 +290,25 @@ void main()\n\ outFogAttributes = newFogAttributes;\n\ #endif\n\ #if USE_NDS_DEPTH_CALCULATION || ENABLE_FOG\n\ - float 
depthOffset = (polyDepthOffsetMode == 0) ? 0.0 : ((polyDepthOffsetMode == 1) ? -DEPTH_EQUALS_TEST_TOLERANCE : DEPTH_EQUALS_TEST_TOLERANCE);\n\ + // It is tempting to perform the NDS depth calculation in the vertex shader rather than in the fragment shader.\n\ + // Resist this temptation! It is much more reliable to do the depth calculation in the fragment shader due to\n\ + // subtle interpolation differences between various GPUs and/or drivers. If the depth calculation is not done\n\ + // here, then it is very possible for the user to experience Z-fighting in certain rendering situations.\n\ \n\ - #if ENABLE_W_DEPTH\n\ - float newFragDepthValue = clamp( ( (vtxPosition.w * 4096.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #if NEEDS_DEPTH_EQUALS_TEST\n\ + float depthOffset = (polyDepthOffsetMode == 0) ? 0.0 : ((polyDepthOffsetMode == 1) ? -DEPTH_EQUALS_TEST_TOLERANCE : DEPTH_EQUALS_TEST_TOLERANCE);\n\ + #if ENABLE_W_DEPTH\n\ + float newFragDepthValue = clamp( ( (vtxPosition.w * 4096.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #else\n\ + float newFragDepthValue = clamp( ( (floor(gl_FragCoord.z * 4194303.0) * 4.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #endif\n\ #else\n\ - float vertW = (vtxPosition.w == 0.0) ? 
0.00000001 : vtxPosition.w;\n\ - // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly\n\ - float newFragDepthValue = clamp( ( (floor(((vtxPosition.z/vertW) * 0.5 + 0.5) * 4194303.0) * 4.0) + depthOffset ) / 16777215.0, 0.0, 1.0 );\n\ + #if ENABLE_W_DEPTH\n\ + float newFragDepthValue = clamp( (vtxPosition.w * 4096.0) / 16777215.0, 0.0, 1.0 );\n\ + #else\n\ + // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly\n\ + float newFragDepthValue = clamp( (floor(gl_FragCoord.z * 4194303.0) * 4.0) / 16777215.0, 0.0, 1.0 );\n\ + #endif\n\ #endif\n\ \n\ gl_FragDepth = newFragDepthValue;\n\ @@ -813,6 +827,7 @@ Render3DError OpenGLRenderer_3_2::InitExtensions() this->willFlipAndConvertFramebufferOnGPU = true; this->isSampleShadingSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_sample_shading"); + this->isConservativeDepthSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_conservative_depth"); this->_enableTextureSmoothing = CommonSettings.GFX3D_Renderer_TextureSmoothing; this->_emulateShadowPolygon = CommonSettings.OpenGL_Emulation_ShadowPolygon; @@ -1296,14 +1311,21 @@ Render3DError OpenGLRenderer_3_2::CreateGeometryPrograms() OGLGeometryFlags programFlags; programFlags.value = 0; - std::stringstream shaderHeader; - shaderHeader << "#version 150\n"; - shaderHeader << "#define DEPTH_EQUALS_TEST_TOLERANCE " << DEPTH_EQUALS_TEST_TOLERANCE << ".0\n"; - shaderHeader << "\n"; + std::stringstream vtxShaderHeader; + vtxShaderHeader << "#version 150\n"; + vtxShaderHeader << "\n"; - std::string vtxShaderCode = shaderHeader.str() + std::string(GeometryVtxShader_150); + std::string vtxShaderCode = vtxShaderHeader.str() + std::string(GeometryVtxShader_150); - for (size_t flagsValue = 0; flagsValue < 64; flagsValue++, programFlags.value++) + std::stringstream fragShaderHeader; + fragShaderHeader << "#version 150\n"; + if (this->isConservativeDepthSupported) 
fragShaderHeader << "#extension GL_ARB_conservative_depth : require\n"; + fragShaderHeader << "\n"; + fragShaderHeader << "#define IS_CONSERVATIVE_DEPTH_SUPPORTED " << ((this->isConservativeDepthSupported) ? 1 : 0) << "\n"; + fragShaderHeader << "#define DEPTH_EQUALS_TEST_TOLERANCE " << DEPTH_EQUALS_TEST_TOLERANCE << ".0\n"; + fragShaderHeader << "\n"; + + for (size_t flagsValue = 0; flagsValue < 128; flagsValue++, programFlags.value++) { std::stringstream shaderFlags; shaderFlags << "#define USE_TEXTURE_SMOOTHING " << ((this->_enableTextureSmoothing) ? 1 : 0) << "\n"; @@ -1315,9 +1337,10 @@ Render3DError OpenGLRenderer_3_2::CreateGeometryPrograms() shaderFlags << "#define ENABLE_FOG " << ((programFlags.EnableFog) ? 1 : 0) << "\n"; shaderFlags << "#define ENABLE_EDGE_MARK " << ((programFlags.EnableEdgeMark) ? 1 : 0) << "\n"; shaderFlags << "#define TOON_SHADING_MODE " << ((programFlags.ToonShadingMode) ? 1 : 0) << "\n"; + shaderFlags << "#define NEEDS_DEPTH_EQUALS_TEST " << ((programFlags.NeedsDepthEqualsTest) ? 
1 : 0) << "\n"; shaderFlags << "\n"; - std::string fragShaderCode = shaderHeader.str() + shaderFlags.str() + std::string(GeometryFragShader_150); + std::string fragShaderCode = fragShaderHeader.str() + shaderFlags.str() + std::string(GeometryFragShader_150); error = this->ShaderProgramCreate(OGLRef.vertexGeometryShaderID, OGLRef.fragmentGeometryShaderID[flagsValue], @@ -1421,7 +1444,7 @@ void OpenGLRenderer_3_2::DestroyGeometryPrograms() OGLRef.uboRenderStatesID = 0; OGLRef.tboPolyStatesID = 0; - for (size_t flagsValue = 0; flagsValue < 64; flagsValue++) + for (size_t flagsValue = 0; flagsValue < 128; flagsValue++) { if (OGLRef.programGeometryID[flagsValue] == 0) { @@ -2146,6 +2169,8 @@ Render3DError OpenGLRenderer_3_2::BeginRender(const GFX3D &engine) glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, OGLRef.iboGeometryIndexID); glBindBuffer(GL_TEXTURE_BUFFER, OGLRef.tboPolyStatesID); + this->_renderNeedsDepthEqualsTest = false; + size_t vertIndexCount = 0; GLushort *indexPtr = (GLushort *)glMapBufferRange(GL_ELEMENT_ARRAY_BUFFER, 0, engine.polylist->count * 6 * sizeof(GLushort), GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_UNSYNCHRONIZED_BIT); OGLPolyStates *polyStates = (OGLPolyStates *)glMapBufferRange(GL_TEXTURE_BUFFER, 0, engine.polylist->count * sizeof(OGLPolyStates), GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_UNSYNCHRONIZED_BIT); @@ -2185,15 +2210,16 @@ Render3DError OpenGLRenderer_3_2::BeginRender(const GFX3D &engine) // Get the polygon's facing. 
const size_t n = polyType - 1; - float facing = (vert[0].y + vert[n].y) * (vert[0].x - vert[n].x) - + (vert[1].y + vert[0].y) * (vert[1].x - vert[0].x) - + (vert[2].y + vert[1].y) * (vert[2].x - vert[1].x); + float facing = (vert[0].y + vert[n].y) * (vert[0].x - vert[n].x) + + (vert[1].y + vert[0].y) * (vert[1].x - vert[0].x) + + (vert[2].y + vert[1].y) * (vert[2].x - vert[1].x); for (size_t j = 2; j < n; j++) { facing += (vert[j+1].y + vert[j].y) * (vert[j+1].x - vert[j].x); } + this->_renderNeedsDepthEqualsTest = this->_renderNeedsDepthEqualsTest || (thePoly.attribute.DepthEqualTest_Enable != 0); this->_isPolyFrontFacing[i] = (facing < 0); // Get the texture that is to be attached to this polygon. @@ -2227,6 +2253,7 @@ Render3DError OpenGLRenderer_3_2::BeginRender(const GFX3D &engine) this->_geometryProgramFlags.EnableFog = (this->_enableFog) ? 1 : 0; this->_geometryProgramFlags.EnableEdgeMark = (this->_enableEdgeMark) ? 1 : 0; this->_geometryProgramFlags.ToonShadingMode = (engine.renderState.shading) ? 1 : 0; + this->_geometryProgramFlags.NeedsDepthEqualsTest = (this->_renderNeedsDepthEqualsTest) ? 1 : 0; glUseProgram(OGLRef.programGeometryID[this->_geometryProgramFlags.value]); glUniform1i(OGLRef.uniformTexDrawOpaque[this->_geometryProgramFlags.value], GL_FALSE); From 39f94830342007e7157fb8aa01353b0b72dfa890 Mon Sep 17 00:00:00 2001 From: rogerman Date: Sun, 30 Dec 2018 01:36:54 -0800 Subject: [PATCH 6/7] OpenGL Renderer: Also require GLSL 4.00 when using the GL_ARB_conservative_depth extension. (Related to commit 4d6a132.) 
--- desmume/src/OGLRender_3_2.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 9aac389e..a73fb7a8 100755 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -827,7 +827,7 @@ Render3DError OpenGLRenderer_3_2::InitExtensions() this->willFlipAndConvertFramebufferOnGPU = true; this->isSampleShadingSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_sample_shading"); - this->isConservativeDepthSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_conservative_depth"); + this->isConservativeDepthSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_conservative_depth") && IsOpenGLDriverVersionSupported(4, 0, 0); this->_enableTextureSmoothing = CommonSettings.GFX3D_Renderer_TextureSmoothing; this->_emulateShadowPolygon = CommonSettings.OpenGL_Emulation_ShadowPolygon; @@ -1312,14 +1312,28 @@ Render3DError OpenGLRenderer_3_2::CreateGeometryPrograms() programFlags.value = 0; std::stringstream vtxShaderHeader; - vtxShaderHeader << "#version 150\n"; + if (this->isConservativeDepthSupported) + { + vtxShaderHeader << "#version 400\n"; + } + else + { + vtxShaderHeader << "#version 150\n"; + } vtxShaderHeader << "\n"; std::string vtxShaderCode = vtxShaderHeader.str() + std::string(GeometryVtxShader_150); std::stringstream fragShaderHeader; - fragShaderHeader << "#version 150\n"; - if (this->isConservativeDepthSupported) fragShaderHeader << "#extension GL_ARB_conservative_depth : require\n"; + if (this->isConservativeDepthSupported) + { + fragShaderHeader << "#version 400\n"; + fragShaderHeader << "#extension GL_ARB_conservative_depth : require\n"; + } + else + { + fragShaderHeader << "#version 150\n"; + } fragShaderHeader << "\n"; fragShaderHeader << "#define IS_CONSERVATIVE_DEPTH_SUPPORTED " << ((this->isConservativeDepthSupported) ? 
1 : 0) << "\n"; fragShaderHeader << "#define DEPTH_EQUALS_TEST_TOLERANCE " << DEPTH_EQUALS_TEST_TOLERANCE << ".0\n"; From f97c6334412f26138c9b63d2e4ae349db48ad4d4 Mon Sep 17 00:00:00 2001 From: rogerman Date: Sun, 30 Dec 2018 02:12:54 -0800 Subject: [PATCH 7/7] OpenGL Renderer: Okay, let's try using GL_AMD_conservative_depth for those AMD drivers that outright lie about supporting GL_ARB_conservative_depth. (Related to commit 4d6a132 and commit 39f9483.) --- desmume/src/OGLRender.cpp | 1 + desmume/src/OGLRender.h | 1 + desmume/src/OGLRender_3_2.cpp | 12 ++++++++---- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 6308e913..f5767223 100755 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -1235,6 +1235,7 @@ OpenGLRenderer::OpenGLRenderer() isShaderSupported = false; isSampleShadingSupported = false; isConservativeDepthSupported = false; + isConservativeDepthAMDSupported = false; isVAOSupported = false; willFlipOnlyFramebufferOnGPU = false; willFlipAndConvertFramebufferOnGPU = false; diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index 7049398f..0aa7c9cf 100755 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -659,6 +659,7 @@ protected: bool isVAOSupported; bool isSampleShadingSupported; bool isConservativeDepthSupported; + bool isConservativeDepthAMDSupported; bool willFlipOnlyFramebufferOnGPU; bool willFlipAndConvertFramebufferOnGPU; bool willUsePerSampleZeroDstPass; diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index a73fb7a8..081cf61d 100755 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -828,6 +828,7 @@ Render3DError OpenGLRenderer_3_2::InitExtensions() this->isSampleShadingSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_sample_shading"); this->isConservativeDepthSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_ARB_conservative_depth") && 
IsOpenGLDriverVersionSupported(4, 0, 0); + this->isConservativeDepthAMDSupported = this->IsExtensionPresent(&oglExtensionSet, "GL_AMD_conservative_depth") && IsOpenGLDriverVersionSupported(4, 0, 0); this->_enableTextureSmoothing = CommonSettings.GFX3D_Renderer_TextureSmoothing; this->_emulateShadowPolygon = CommonSettings.OpenGL_Emulation_ShadowPolygon; @@ -1312,7 +1313,7 @@ Render3DError OpenGLRenderer_3_2::CreateGeometryPrograms() programFlags.value = 0; std::stringstream vtxShaderHeader; - if (this->isConservativeDepthSupported) + if (this->isConservativeDepthSupported || this->isConservativeDepthAMDSupported) { vtxShaderHeader << "#version 400\n"; } @@ -1325,17 +1326,20 @@ Render3DError OpenGLRenderer_3_2::CreateGeometryPrograms() std::string vtxShaderCode = vtxShaderHeader.str() + std::string(GeometryVtxShader_150); std::stringstream fragShaderHeader; - if (this->isConservativeDepthSupported) + if (this->isConservativeDepthSupported || this->isConservativeDepthAMDSupported) { fragShaderHeader << "#version 400\n"; - fragShaderHeader << "#extension GL_ARB_conservative_depth : require\n"; + + // Prioritize using GL_AMD_conservative_depth over GL_ARB_conservative_depth, since AMD drivers + // seem to have problems with GL_ARB_conservative_depth. + fragShaderHeader << ((this->isConservativeDepthAMDSupported) ? "#extension GL_AMD_conservative_depth : require\n" : "#extension GL_ARB_conservative_depth : require\n"); } else { fragShaderHeader << "#version 150\n"; } fragShaderHeader << "\n"; - fragShaderHeader << "#define IS_CONSERVATIVE_DEPTH_SUPPORTED " << ((this->isConservativeDepthSupported) ? 1 : 0) << "\n"; + fragShaderHeader << "#define IS_CONSERVATIVE_DEPTH_SUPPORTED " << ((this->isConservativeDepthSupported || this->isConservativeDepthAMDSupported) ? 1 : 0) << "\n"; fragShaderHeader << "#define DEPTH_EQUALS_TEST_TOLERANCE " << DEPTH_EQUALS_TEST_TOLERANCE << ".0\n"; fragShaderHeader << "\n";