Add crt-royale-fast shaders (#619)

This commit is contained in:
Hyllian 2024-08-09 09:57:48 -03:00 committed by GitHub
parent b327343b77
commit 111fcedc3b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 2423 additions and 0 deletions

View File

@ -0,0 +1,93 @@
# crt-royale-fast: a fast crt-royale adapted from original sources by Hyllian (2024).
shaders = "8"
textures = "mask_grille_texture_small;mask_slot_texture_small;mask_shadow_texture_small"
mask_grille_texture_small = "shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64BGR.png"
mask_slot_texture_small = "shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png"
mask_shadow_texture_small = "shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png"
mask_grille_texture_small_wrap_mode = "repeat"
mask_slot_texture_small_wrap_mode = "repeat"
mask_shadow_texture_small_wrap_mode = "repeat"
mask_grille_texture_small_linear = "true"
mask_slot_texture_small_linear = "true"
mask_shadow_texture_small_linear = "true"
mask_grille_texture_small_mipmap = "false" # Mipmapping causes artifacts with manually resized masks without tex2Dlod
mask_slot_texture_small_mipmap = "false" # Mipmapping causes artifacts with manually resized masks without tex2Dlod
mask_shadow_texture_small_mipmap = "false" # Mipmapping causes artifacts with manually resized masks without tex2Dlod
# Pass0: Linearize the input based on CRT gamma and bob interlaced fields.
# (Bobbing ensures we can immediately blur without getting artifacts.)
shader0 = "shaders/crt-royale/src-fast/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang"
alias0 = "ORIG_LINEARIZED"
filter_linear0 = "false"
scale_type0 = "source"
scale0 = "1.0"
srgb_framebuffer0 = "true"
# Pass1: Resample interlaced scanlines vertically.
# Separating vertical/horizontal scanline sampling is faster: It lets us
# consider more scanlines while calculating weights for fewer pixels, and
# it reduces our samples from vertical*horizontal to vertical+horizontal.
# This has to come right after ORIG_LINEARIZED, because there's no
# "original_source" scale_type we can use later.
shader1 = "shaders/crt-royale/src-fast/crt-royale-scanlines-vertical-interlacing.slang"
alias1 = "VERTICAL_SCANLINES"
filter_linear1 = "true"
scale_type_x1 = "source"
scale_x1 = "1.0"
scale_type_y1 = "viewport"
scale_y1 = "1.0"
srgb_framebuffer1 = "true"
# Pass2: Resize the phosphor mask vertically.
shader2 = "shaders/crt-royale/src-fast/crt-royale-mask-resize-vertical.slang"
filter_linear2 = "true"
scale_type_x2 = "absolute"
scale_x2 = "64"
scale_type_y2 = "viewport"
scale_y2 = "0.0625" # Safe for >= 144 horizontal triads at viewport size
#srgb_framebuffer2 = "false" # mask_texture is already assumed linear
# Pass3: Resize the phosphor mask horizontally. scale_x3 = scale_y2.
shader3 = "shaders/crt-royale/src-fast/crt-royale-mask-resize-horizontal.slang"
alias3 = "MASK_RESIZE"
filter_linear3 = "false"
scale_type_x3 = "viewport"
scale_x3 = "0.0625"
scale_type_y3 = "source"
scale_y3 = "1.0"
#srgb_framebuffer3 = "false" # mask_texture is already assumed linear
# Pass4: Resample scanlines horizontally, apply the phosphor mask.
shader4 = "shaders/crt-royale/src-fast/crt-royale-scanlines-horizontal-apply-mask.slang"
alias4 = "MASKED_SCANLINES"
filter_linear4 = "true" # This could just as easily be nearest neighbor.
scale_type4 = "viewport"
scale4 = "1.0"
srgb_framebuffer4 = "true"
# Pass5: Compute a brightpass. This will require reading the final mask.
shader5 = "shaders/crt-royale/src-fast/crt-royale-brightpass.slang"
alias5 = "BRIGHTPASS"
filter_linear5 = "true" # This could just as easily be nearest neighbor.
scale_type5 = "viewport"
scale5 = "1.0"
srgb_framebuffer5 = "true"
# Pass6: Blur the brightpass vertically
shader6 = "shaders/crt-royale/src-fast/crt-royale-bloom-vertical.slang"
filter_linear6 = "true" # This could just as easily be nearest neighbor.
scale_type6 = "source"
scale6 = "1.0"
srgb_framebuffer6 = "true"
# Pass7: Blur the brightpass horizontally and combine it with the dimpass:
shader7 = "shaders/crt-royale/src-fast/crt-royale-bloom-horizontal-reconstitute.slang"
filter_linear7 = "true"
scale_type7 = "source"
scale7 = "1.0"
srgb_framebuffer7 = "true"
wrap_mode7 = "clamp_to_edge"

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.1 KiB

View File

@ -0,0 +1,158 @@
#ifndef BIND_SHADER_PARAMS_H
#define BIND_SHADER_PARAMS_H
/*
crt-royale-fast: a fast crt-royale adapted from original sources by Hyllian (2024).
Aims to deliver a fast shader with crt-royale visual style by sacrificing some
of its complex features.
*/
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
// Uniform block (set 0, binding 0): the MVP matrix followed by one float per
// shader setting. Most members are exposed to the user through a matching
// "#pragma parameter" line further down in this header; geom_aspect_ratio_x
// and geom_aspect_ratio_y have no pragma in this header.
layout(std140, set = 0, binding = 0) uniform UBO
{
mat4 MVP;
float crt_gamma;
float lcd_gamma;
float levels_contrast;
float bloom_underestimate_levels;
float bloom_excess;
float beam_min_sigma;
float beam_max_sigma;
float beam_spot_power;
float beam_min_shape;
float beam_max_shape;
float beam_shape_power;
float beam_horiz_filter;
float beam_horiz_sigma;
float beam_horiz_linear_rgb_weight;
float mask_type;
float mask_triad_size_desired;
float geom_aspect_ratio_x;
float geom_aspect_ratio_y;
float interlace_bff;
float interlace_1080i;
float interlace_detect_toggle;
} global;
// Runtime parameters. Each "#pragma parameter" line gives: name, menu label,
// default, minimum, maximum, step. Where a "#define name global.name" follows,
// code may refer to the UBO member by its bare name; parameters without such a
// define (bloom_excess, beam_horiz_linear_rgb_weight, mask_triad_size_desired,
// interlace_bff, interlace_detect_toggle) must be read as global.<name>.
#pragma parameter crt_gamma "Simulated CRT Gamma" 2.5 1.0 5.0 0.025
#define crt_gamma global.crt_gamma
#pragma parameter lcd_gamma "Your Display Gamma" 2.2 1.0 5.0 0.025
#define lcd_gamma global.lcd_gamma
#pragma parameter levels_contrast "Contrast" 1.0 0.0 4.0 0.015625
#define levels_contrast global.levels_contrast
#pragma parameter bloom_underestimate_levels "Bloom - Underestimate Levels" 0.8 0.0 5.0 0.01
#define bloom_underestimate_levels global.bloom_underestimate_levels
// No bare-name alias is defined for bloom_excess.
#pragma parameter bloom_excess "Bloom - Excess" 0.0 0.0 1.0 0.005
#pragma parameter beam_min_sigma "Beam - Min Sigma" 0.02 0.005 1.0 0.005
#define beam_min_sigma global.beam_min_sigma
#pragma parameter beam_max_sigma "Beam - Max Sigma" 0.3 0.005 1.0 0.005
#define beam_max_sigma global.beam_max_sigma
#pragma parameter beam_spot_power "Beam - Spot Power" 0.33 0.01 16.0 0.01
#define beam_spot_power global.beam_spot_power
#pragma parameter beam_min_shape "Beam - Min Shape" 2.0 2.0 32.0 0.1
#define beam_min_shape global.beam_min_shape
#pragma parameter beam_max_shape "Beam - Max Shape" 4.0 2.0 32.0 0.1
#define beam_max_shape global.beam_max_shape
#pragma parameter beam_shape_power "Beam - Shape Power" 0.25 0.01 16.0 0.01
#define beam_shape_power global.beam_shape_power
#pragma parameter beam_horiz_filter "Beam - Horiz Filter" 0.0 0.0 2.0 1.0
#define beam_horiz_filter global.beam_horiz_filter
#pragma parameter beam_horiz_sigma "Beam - Horiz Sigma" 0.35 0.0 0.67 0.005
#define beam_horiz_sigma global.beam_horiz_sigma
// No bare-name alias is defined for beam_horiz_linear_rgb_weight.
#pragma parameter beam_horiz_linear_rgb_weight "Beam - Horiz Linear RGB Weight" 1.0 0.0 1.0 0.01
// mask_type selects the mask in get_mask_amplify(): < 0.5 grille, < 1.5 slot,
// otherwise shadow.
#pragma parameter mask_type "Mask - Type" 0.0 0.0 2.0 1.0
#define mask_type global.mask_type
// No bare-name alias is defined for mask_triad_size_desired.
#pragma parameter mask_triad_size_desired "Mask - Triad Size Desired" 3.0 1.0 18.0 0.125
#pragma parameter interlace_detect_toggle "Interlacing - Toggle" 1.0 0.0 1.0 1.0
// Boolean view of the toggle: any nonzero parameter value converts to true.
bool interlace_detect = bool(global.interlace_detect_toggle);
#pragma parameter interlace_bff "Interlacing - Bottom Field First" 0.0 0.0 1.0 1.0
// The bare-name alias for interlace_bff is deliberately left commented out:
//#define interlace_bff global.interlace_bff
#pragma parameter interlace_1080i "Interlace - Detect 1080i" 0.0 0.0 1.0 1.0
#define interlace_1080i global.interlace_1080i
// Static (non-parameter) settings and constants derived from them. The
// initializers below depend on one another, so declaration order matters.
// LEVELS MANAGEMENT:
float levels_autodim_temp = 0.5; // range (0, 1]
bool beam_generalized_gaussian = true;
float beam_antialias_level = 1.0; // range [0, 2]
float beam_spot_shape_function = 0.0;
float beam_spot_power_static = 1.0/3.0; // range (0, 16]
float beam_min_shape_static = 2.0; // range [2, 32]
float beam_max_shape_static = 4.0; // range [2, 32]
// PHOSPHOR MASK:
float mask_sinc_lobes = 3.0; // range [2, 4]
float mask_min_allowed_triad_size = 2.0;
// PASS SCALES AND RELATED CONSTANTS:
vec2 mask_resize_viewport_scale = vec2(0.0625, 0.0625);
// PHOSPHOR MASK TEXTURE CONSTANTS:
vec2 mask_texture_small_size = vec2(64.0, 64.0);
float mask_triads_per_tile = 8.0;
float mask_grille_avg_color = 53.0/255.0;
float mask_slot_avg_color = 46.0/255.0;
float mask_shadow_avg_color = 50.0/255.0;
// Magnitude of c, floored at 2^-16 so it can never be zero. Note that the
// result is always non-negative (the sign of c is discarded by abs()).
#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16
float bloom_approx_filter = 0.0;
vec2 mask_resize_src_lut_size = mask_texture_small_size;
// Border bookkeeping for the resized mask. With the values above this
// evaluates to: aniso border 0.5 px, texel border ceil(0.5) = 1.0,
// tile border 1.0/(2.0*8.0) = 0.0625, num tiles 1.125, num triads 9.0,
// and min_allowed_viewport_triads = 9.0/0.0625 = 144 in each dimension.
float max_aa_base_pixel_border = 0.0;
float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
float max_tiled_pixel_border = max_aniso_pixel_border;
float max_mask_texel_border = ceil(max_tiled_pixel_border);
float max_mask_tile_border = max_mask_texel_border/
(mask_min_allowed_triad_size * mask_triads_per_tile);
float mask_resize_num_tiles = 1.0 + 2.0 * max_mask_tile_border;
float mask_start_texels = max_mask_texel_border;
float mask_resize_num_triads = mask_resize_num_tiles * mask_triads_per_tile;
vec2 min_allowed_viewport_triads = vec2(mask_resize_num_triads) / mask_resize_viewport_scale;
// Calculate {sigma, shape}_range outside of scanline_contrib so it's only
// done once per pixel (not 6 times) with runtime params. Don't reuse the
// vertex shader calculations, so static versions can be constant-folded.
// The max() keeps each range >= 0 even if the user sets max below min.
float sigma_range = max(beam_max_sigma, beam_min_sigma) - beam_min_sigma;
float shape_range = max(beam_max_shape, beam_min_shape) - beam_min_shape;
//////////////////////// COMMON MATHEMATICAL CONSTANTS ///////////////////////
float pi = 3.141592653589;
float under_half = 0.4995;
// Provide accessors for settings which still need "cooking":
float get_mask_amplify()
{
    // Returns the brightness gain that compensates for the selected phosphor
    // mask: the reciprocal of that mask's average color. mask_type picks the
    // mask: < 0.5 grille, < 1.5 slot, anything else shadow.
    if (mask_type < 0.5)
    {
        return 1.0/mask_grille_avg_color;
    }
    if (mask_type < 1.5)
    {
        return 1.0/mask_slot_avg_color;
    }
    return 1.0/mask_shadow_avg_color;
}
#endif // BIND_SHADER_PARAMS_H

View File

@ -0,0 +1,133 @@
#ifndef BLOOM_FUNCTIONS_H
#define BLOOM_FUNCTIONS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////////// DESCRIPTION ////////////////////////////////
// These utility functions and constants help several passes determine the
// size and center texel weight of the phosphor bloom in a uniform manner.
////////////////////////////////// INCLUDES //////////////////////////////////
#include "blur-functions.h"
/////////////////////////////// BLOOM CONSTANTS //////////////////////////////
// Largest pixel difference tolerated in a blurred triad (1.0/256.0); this is
// the `thresh` argument handed to get_min_sigma_to_blur_triad() below.
float bloom_diff_thresh = 1.0/256.0;
/////////////////////////////////// HELPERS //////////////////////////////////
float get_min_sigma_to_blur_triad(float triad_size, float thresh)
{
    // Smallest Gaussian sigma that blurs one on-screen phosphor triad to an
    // even color, to within `thresh`.
    //   triad_size: final phosphor triad size in pixels.
    //   thresh:     max desired pixel difference in the blurred triad
    //               (e.g. 1.0/256.0).
    // Closed-form curve fit; max error = ~0.086036, mean sq. error =
    // ~0.0013387. A coarser alternative (max error = ~0.16486, mean sq.
    // error = ~0.0041041) is:
    //   0.5985*triad_size - triad_size*sqrt(thresh)
    float size_term = 0.6113 * triad_size;
    float thresh_term = 1.122 * triad_size * sqrt(0.000416 + thresh);
    return -0.05168 + size_term - thresh_term;
}
float get_absolute_scale_blur_sigma(float thresh)
{
// Requires: 1.) min_allowed_viewport_triads must be a global. The number
// of horizontal phosphor triads in the final image must be
// >= min_allowed_viewport_triads.x for realistic results.
// 2.) bloom_approx_scale_x must be a global float equal to the
// absolute horizontal scale of BLOOM_APPROX.
// 3.) bloom_approx_scale_x/min_allowed_viewport_triads.x
// should be <= 1.1658025090 to keep the final result <
// 0.62666015625 (the largest sigma ensuring the largest
// unused texel weight stays < 1.0/256.0 for a 3x3 blur).
// 4.) thresh is the max desired pixel difference in the
// blurred triad (e.g. 1.0/256.0).
// Returns: Return the minimum Gaussian sigma that will blur the pass
// output as much as it would have taken to blur away
// bloom_approx_scale_x horizontal phosphor triads.
// NOTE(review): neither max_viewport_size_x nor bloom_approx_scale_x is
// declared at file scope in this header (max_viewport_size_x only exists as
// a local inside get_bloom_approx_sigma() below). Confirm the including file
// provides both before using this function.
// Description:
// BLOOM_APPROX should look like a downscaled phosphor blur. Ideally, we'd
// use the same blur sigma as the actual phosphor bloom and scale it down
// to the current resolution with (bloom_approx_scale_x/viewport_size_x), but
// we don't know the viewport size in this pass. Instead, we'll blur as
// much as it would take to blur away min_allowed_viewport_triads.x. This
// will blur "more than necessary" if the user actually uses more triads,
// but that's not terrible either, because blurring a constant fraction of
// the viewport may better resemble a true optical bloom anyway (since the
// viewport will generally be about the same fraction of each player's
// field of view, regardless of screen size and resolution).
// Assume an extremely large viewport size for asymptotic results.
float min_sigma = get_min_sigma_to_blur_triad(max_viewport_size_x/min_allowed_viewport_triads.x, thresh);
// Rescale the sigma from the assumed huge viewport to the pass's own scale:
return bloom_approx_scale_x/max_viewport_size_x * min_sigma;
}
float get_center_weight(float sigma)
{
    // Blur weight of the center texel for a Gaussian blur with the given
    // sigma; this simply forwards to the fast weight-sum-inverse estimate.
    float center_weight = get_fast_gaussian_weight_sum_inv(sigma);
    return center_weight;
}
float get_bloom_approx_sigma(float output_size_x_runtime, float estimated_viewport_size_x)
{
// Purpose: Return a static small bloom sigma for the BLOOM_APPROX pass,
// combined with the static maximum beam sigma.
// Parameters: Neither output_size_x_runtime nor estimated_viewport_size_x
// is read by this body; only static values are used.
// NOTE(review): mask_num_triads_desired_static, bloom_approx_size_x and
// beam_max_sigma_static are not declared in this header. Confirm the
// including file provides them before using this function.
// Original notes on the parameters:
// 1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
// This is included for dynamic codepaths just in case the
// following two globals are incorrect:
// 2.) bloom_approx_size_x_for_skip should == the same
// if PHOSPHOR_BLOOM_FAKE is #defined
// 3.) bloom_approx_size_x should == the same otherwise
// Assume the default static value. This is a compromise that ensures
// typical triads are blurred, even if unusually large ones aren't.
float mask_num_triads_static = max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
// Assume an extremely large viewport size for asymptotic results:
float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
// We're either using blur3x3 or bilinear filtering. The biggest
// reason to choose blur3x3 is to avoid dynamic weights, so use a
// static calculation.
float output_size_x_static = bloom_approx_size_x;
float asymptotic_triad_size = max_viewport_size_x/mask_num_triads_static;
float asymptotic_sigma = get_min_sigma_to_blur_triad(asymptotic_triad_size, bloom_diff_thresh);
// Rescale the asymptotic sigma from the huge assumed viewport to this pass:
float bloom_approx_sigma = asymptotic_sigma * output_size_x_static/max_viewport_size_x;
// The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
// try accounting for the Gaussian scanline sigma from the last pass
// too; use the static default value. length() combines the two sigmas
// as sqrt(a*a + b*b):
return length(vec2(bloom_approx_sigma, beam_max_sigma_static));
}
#endif // BLOOM_FUNCTIONS_H

View File

@ -0,0 +1,148 @@
#ifndef BLUR_FUNCTIONS_H
#define BLUR_FUNCTIONS_H
///////////////////////////////// MIT LICENSE ////////////////////////////////
// Copyright (C) 2014 TroggleMonkey
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
// Set static standard deviations, but allow users to override them with their
// own constants (even non-static uniforms if they're okay with the speed hit):
// blurN_std_dev values are specified in terms of dxdy strides.
// The defaults are the largest values that keep the largest unused
// blur term on each side <= 1.0/256.0. (We could get away with more
// or be more conservative, but this compromise is pretty reasonable.)
// NOTE(review): these are plain global initializations with no #ifndef
// guard, so overriding a value means editing this file. None of the blur
// functions defined in this header reads these constants; callers pass
// sigma or precomputed weights explicitly.
float blur3_std_dev = 0.62666015625;
float blur4_std_dev = 0.66171875;
float blur5_std_dev = 0.9845703125;
float blur6_std_dev = 1.02626953125;
float blur7_std_dev = 1.36103515625;
float blur8_std_dev = 1.4080078125;
float blur9_std_dev = 1.7533203125;
float blur10_std_dev = 1.80478515625;
float blur11_std_dev = 2.15986328125;
float blur12_std_dev = 2.215234375;
float blur17_std_dev = 3.45535583496;
float blur25_std_dev = 5.3409576416;
float blur31_std_dev = 6.86488037109;
float blur43_std_dev = 10.1852050781;
// error_blurring should be in [0.0, 1.0]. Higher values reduce ringing
// in shared-sample blurs but increase blurring and feature shifting.
float error_blurring = 0.5;
/////////////////////////////////// HELPERS //////////////////////////////////
vec4 uv2_to_uv4(vec2 tex_uv)
{
    // Widen a vec2 uv offset to vec4 (z and w set to 0.0) so it can be added
    // directly to vec4 tex2Dlod-style coordinates.
    vec4 padded_uv = vec4(0.0);
    padded_uv.xy = tex_uv;
    return padded_uv;
}
// Squared length of a vector as a macro (for usage with static constants).
// The argument is expanded twice, so pass a simple expression.
#define LENGTH_SQ(vec) (dot(vec, vec))
float get_fast_gaussian_weight_sum_inv(float sigma)
{
    // Estimates 1/(sum of Gaussian blur weights) for the given sigma. Since
    // the unnormalized center weight is 1.0, this is also the normalized
    // weight of the center texel.
    //
    // The exact asymptotic form (close for blurs of 9+ taps) would be
    //   center_weight = 0.5 *
    //       (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
    // which, because erf(-x) == -erf(x), reduces to
    //   erf((0.5/sqrt(2.0))/sigma)
    // The curve fit used here is faster and also closer, because it was
    // built from 64 blur sizes in [3, 131) and 255 equally-spaced sigmas in
    // (0, blurN_std_dev), so results for smaller sigmas are biased toward
    // smaller blurs. The max error is 0.0031793913.
    // Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
    float small_sigma_fit = exp(exp(0.348348412457428/
        (sigma - 0.0860587260734721)));
    float large_sigma_fit = 0.399334576340352/sigma;
    return min(small_sigma_fit, large_sigma_fit);
}
vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv,
    vec2 dxdy, float sigma)
{
    // 1D 9-tap Gaussian blur of `tex` around tex_uv.
    //   dxdy:  uv stride between adjacent taps (also fixes the blur axis).
    //   sigma: Gaussian standard deviation, in units of dxdy strides.
    // Returns the weighted rgb average of the nine taps.
    // Weights are exp(-d*d / (2*sigma*sigma)) for tap distance d = 0..4:
    float inv_two_sigma_sq = 0.5/(sigma*sigma);
    float weight_center = 1.0;
    float weight_1 = exp(-1.0 * inv_two_sigma_sq);
    float weight_2 = exp(-4.0 * inv_two_sigma_sq);
    float weight_3 = exp(-9.0 * inv_two_sigma_sq);
    float weight_4 = exp(-16.0 * inv_two_sigma_sq);
    float normalizer = 1.0 / (weight_center + 2.0 * (weight_1 + weight_2 + weight_3 + weight_4));
    // Accumulate from the far negative tap to the far positive tap, then
    // normalize once at the end:
    vec3 accum = vec3(0.0,0.0,0.0);
    accum += weight_4 * texture(tex, tex_uv - 4.0 * dxdy).rgb;
    accum += weight_3 * texture(tex, tex_uv - 3.0 * dxdy).rgb;
    accum += weight_2 * texture(tex, tex_uv - 2.0 * dxdy).rgb;
    accum += weight_1 * texture(tex, tex_uv - 1.0 * dxdy).rgb;
    accum += weight_center * texture(tex, tex_uv).rgb;
    accum += weight_1 * texture(tex, tex_uv + 1.0 * dxdy).rgb;
    accum += weight_2 * texture(tex, tex_uv + 2.0 * dxdy).rgb;
    accum += weight_3 * texture(tex, tex_uv + 3.0 * dxdy).rgb;
    accum += weight_4 * texture(tex, tex_uv + 4.0 * dxdy).rgb;
    return accum * normalizer;
}
vec3 tex2Dblur17fastest(sampler2D tex, vec2 tex_uv,
    vec2 dxdy, float weight_sum_inv, vec4 w1_8, vec4 w1_8_ratio)
{
    // 1D 17-texel Gaussian blur using one center tap plus eight taps placed
    // between texel pairs (intended for linear filtering).
    //   dxdy:           uv stride between adjacent texels on the blur axis.
    //   weight_sum_inv: normalization factor applied to the final sum.
    //   w1_8:           combined weight of each texel pair
    //                   (x: 1+2, y: 3+4, z: 5+6, w: 7+8).
    //   w1_8_ratio:     fractional offset of each tap past the nearer texel
    //                   of its pair (added to 1, 3, 5, 7 below).
    // Returns the normalized weighted rgb sum of all nine taps.
    float center_weight = 1.0;
    vec4 tap_distance = vec4(1.0, 3.0, 5.0, 7.0) + w1_8_ratio;
    vec3 accum = vec3(0.0,0.0,0.0);
    accum += (w1_8.w * texture(tex, tex_uv - tap_distance.w * dxdy).rgb);
    accum += (w1_8.z * texture(tex, tex_uv - tap_distance.z * dxdy).rgb);
    accum += (w1_8.y * texture(tex, tex_uv - tap_distance.y * dxdy).rgb);
    accum += (w1_8.x * texture(tex, tex_uv - tap_distance.x * dxdy).rgb);
    accum += (center_weight * texture(tex, tex_uv).rgb);
    accum += (w1_8.x * texture(tex, tex_uv + tap_distance.x * dxdy).rgb);
    accum += (w1_8.y * texture(tex, tex_uv + tap_distance.y * dxdy).rgb);
    accum += (w1_8.z * texture(tex, tex_uv + tap_distance.z * dxdy).rgb);
    accum += (w1_8.w * texture(tex, tex_uv + tap_distance.w * dxdy).rgb);
    return accum * weight_sum_inv;
}
#endif // BLUR_FUNCTIONS_H

View File

@ -0,0 +1,206 @@
#version 450
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Push constants for this pass: the sizes of the pass input (Source), the
// original input, this pass's output, and the two aliased passes
// (MASKED_SCANLINES, BRIGHTPASS) that the fragment stage samples.
// NOTE(review): by libretro slang convention each *Size vec4 is
// (width, height, 1/width, 1/height) -- the vertex stage relies on
// SourceSize.z as the horizontal texel step; confirm against the slang spec.
layout(push_constant) uniform Push
{
vec4 SourceSize;
vec4 OriginalSize;
vec4 OutputSize;
vec4 MASKED_SCANLINESSize;
vec4 BRIGHTPASSSize;
} params;
// Aliases for the two earlier passes sampled by the fragment stage. Only the
// *texture aliases are referenced in the code visible in this file.
#define MASKED_SCANLINEStexture MASKED_SCANLINES
#define MASKED_SCANLINEStexture_size params.MASKED_SCANLINESSize.xy
#define MASKED_SCANLINESvideo_size params.MASKED_SCANLINESSize.xy
#define BRIGHTPASStexture BRIGHTPASS
#define BRIGHTPASStexture_size params.BRIGHTPASSSize.xy
#define BRIGHTPASSvideo_size params.BRIGHTPASSSize.xy
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "bind-shader-params.h"
/////////////////////////////// VERTEX INCLUDES //////////////////////////////
#include "scanline-functions.h"
// Encode a linear color for the display: raise it to 1/lcd_gamma.
#define GAMMA_OUT(color) pow(color, vec3(1.0 / lcd_gamma))
// Max pixel difference tolerated in a blurred triad; same value as
// bloom_diff_thresh in bloom-functions.h, which this file does not include.
float bloom_diff_thresh_ = 1.0/256.0;
// Smallest allowed mask tile width in pixels: ceil(min triad size * triads per tile).
float mask_min_allowed_tile_size = ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
// Blur weights produced by get_blur_fastest_w1_8():
//   w1_8:       summed weights of texel pairs (1+2, 3+4, 5+6, 7+8).
//   w1_8_ratio: tap distance from the center, in texels, for each pair.
struct st_gauss{
vec4 w1_8;
vec4 w1_8_ratio;
};
#pragma stage vertex
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
// Outputs: the uv coordinate plus blur constants computed once per vertex so
// the fragment stage does not have to recompute them per pixel.
layout(location = 0) out vec2 video_uv;
layout(location = 1) out float bloom_sigma_runtime;
layout(location = 2) out vec4 w1_8;
layout(location = 3) out vec4 w1_8_ratio;
layout(location = 4) out float weight_sum_inv;
layout(location = 5) out float undim_mask_contrast_factors;
// copied from bloom-functions.h
float get_min_sigma_to_blur_triad(float triad_size, float thresh)
{
    // Curve-fit estimate of the smallest Gaussian sigma that blurs a phosphor
    // triad of `triad_size` pixels to an even color, to within `thresh`.
    float size_term = 0.6113 * triad_size;
    float thresh_term = 1.122 * triad_size * sqrt(0.000416 + thresh);
    return -0.05168 + size_term - thresh_term;
}
float get_fast_gaussian_weight_sum_inv(float sigma)
{
    // Curve-fit estimate of 1/(sum of Gaussian blur weights) for `sigma`:
    // the smaller of a double-exponential fit and a 1/sigma fit.
    float small_sigma_fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
    float large_sigma_fit = 0.399334576340352/sigma;
    return min(small_sigma_fit, large_sigma_fit);
}
st_gauss get_blur_fastest_w1_8(float sigma)
{
    // Builds the eight-tap weights for a 17-texel Gaussian blur of the given
    // sigma. Texels 1..8 on each side are merged into four pairs:
    //   w1_8       = weight(odd texel) + weight(even texel) per pair
    //   w1_8_ratio = distance of the pair's tap from the center, in texels:
    //                (1, 3, 5, 7) + weight(even texel)/pair weight
    float inv_two_sigma_sq = 0.5/(sigma*sigma);
    // Weights of texels 1, 3, 5, 7 and 2, 4, 6, 8: exp(-d*d/(2*sigma*sigma)).
    vec4 near_weights = vec4(exp(-1.0 * inv_two_sigma_sq),
                             exp(-9.0 * inv_two_sigma_sq),
                             exp(-25.0 * inv_two_sigma_sq),
                             exp(-49.0 * inv_two_sigma_sq));
    vec4 far_weights = vec4(exp(-4.0 * inv_two_sigma_sq),
                            exp(-16.0 * inv_two_sigma_sq),
                            exp(-36.0 * inv_two_sigma_sq),
                            exp(-64.0 * inv_two_sigma_sq));
    st_gauss result;
    result.w1_8 = near_weights + far_weights;
    result.w1_8_ratio = far_weights/result.w1_8 + vec4(1.0, 3.0, 5.0, 7.0);
    return result;
}
// Returns the integer size (in pixels) of one phosphor mask tile after
// resizing, derived from the user's desired triad size and clamped to the
// allowed range.
//   estimated_viewport_size: not read by this body.
//   estimated_mask_resize_output_size: size of the mask-resize pass output;
//       bounds the tile size from above.
//   solemnly_swear_same_inputs_for_every_pass: when true, the y size is also
//       limited by the x size to keep the tile's aspect ratio; when false,
//       only the x size is limited by the y size.
vec2 get_resized_mask_tile_size(vec2 estimated_viewport_size, vec2 estimated_mask_resize_output_size, bool solemnly_swear_same_inputs_for_every_pass)
{
// Stated tile properties must be correct:
float tile_aspect_ratio_inv = mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
vec2 tile_aspect = vec2(1.0, tile_aspect_ratio_inv);
float desired_tile_size_x = mask_triads_per_tile * global.mask_triad_size_desired;
// Make sure we're not upsizing:
float temp_tile_size_x = min(desired_tile_size_x, mask_resize_src_lut_size.x);
// Enforce min_tile_size and max_tile_size in both dimensions:
vec2 temp_tile_size = temp_tile_size_x * tile_aspect;
vec2 min_tile_size = mask_min_allowed_tile_size * tile_aspect;
vec2 max_tile_size = estimated_mask_resize_output_size / mask_resize_num_tiles;
vec2 clamped_tile_size = clamp(temp_tile_size, min_tile_size, max_tile_size);
// Clamping each axis separately may have broken the aspect ratio, so shrink
// whichever axis is too large. The mix() factor is 0.0 or 1.0, so it acts
// as a select between the clamped y size and the y size implied by x:
float x_tile_size_from_y = clamped_tile_size.y * tile_aspect_ratio;
float y_tile_size_from_x = mix(clamped_tile_size.y, clamped_tile_size.x * tile_aspect_ratio_inv, float(solemnly_swear_same_inputs_for_every_pass));
vec2 reclamped_tile_size = vec2(min(clamped_tile_size.x, x_tile_size_from_y), min(clamped_tile_size.y, y_tile_size_from_x));
// We need integer tile sizes in both directions for tiled sampling to
// work correctly. Use floor (to make sure we don't round up), but be
// careful to avoid a rounding bug where floor decreases whole numbers.
// FIX_ZERO(0.0) evaluates to 2^-16, a tiny bias added before floor():
vec2 final_resized_tile_size = floor(reclamped_tile_size + vec2(FIX_ZERO(0.0)));
return final_resized_tile_size;
}
void main()
{
    // Vertex stage: pass through position and uv, and precompute every
    // blur constant the fragment stage needs.
    gl_Position = global.MVP * Position;
    video_uv = TexCoord;

    // Derive the runtime bloom sigma from the resized mask's triad size:
    vec2 viewport_size = params.OutputSize.xy;
    vec2 mask_resize_output_size = viewport_size * mask_resize_viewport_scale;
    vec2 mask_tile_size = get_resized_mask_tile_size(viewport_size, mask_resize_output_size, false);
    float triad_size_x = mask_tile_size.x / mask_triads_per_tile;
    bloom_sigma_runtime = get_min_sigma_to_blur_triad(triad_size_x, bloom_diff_thresh_);

    // Tap weights, tap offsets scaled by SourceSize.z so the fragment stage
    // can add them straight to uv.x, and the normalization factor:
    st_gauss taps = get_blur_fastest_w1_8(bloom_sigma_runtime);
    w1_8 = taps.w1_8;
    w1_8_ratio = taps.w1_8_ratio * params.SourceSize.z;
    weight_sum_inv = get_fast_gaussian_weight_sum_inv(bloom_sigma_runtime);

    // Single gain that undoes the auto-dim, compensates for the mask's
    // average darkness, and applies the user contrast:
    float undim_factor = 1.0/levels_autodim_temp;
    undim_mask_contrast_factors = undim_factor * get_mask_amplify() * levels_contrast;
}
#pragma stage fragment
// Inputs mirror the vertex stage outputs, location for location.
layout(location = 0) in vec2 video_uv;
layout(location = 1) in float bloom_sigma_runtime;
layout(location = 2) in vec4 w1_8;
layout(location = 3) in vec4 w1_8_ratio;
layout(location = 4) in float weight_sum_inv;
layout(location = 5) in float undim_mask_contrast_factors;
layout(location = 0) out vec4 FragColor;
// Source is this pass's input; BRIGHTPASS and MASKED_SCANLINES are the
// outputs of the passes given those aliases in the preset.
layout(set = 0, binding = 2) uniform sampler2D Source;
layout(set = 0, binding = 3) uniform sampler2D BRIGHTPASS;
layout(set = 0, binding = 4) uniform sampler2D MASKED_SCANLINES;
// The texture that gets blurred in main():
#define bloom_texture Source
////////////////////////////// FRAGMENT INCLUDES //////////////////////////////
vec3 tex2Dblur17fastest(sampler2D tex, vec2 tex_uv, float weight_sum_inv, vec4 w1_8, vec4 w1_8_ratio)
{
    // Horizontal 17-texel Gaussian blur using one center tap and eight
    // side taps.
    //   w1_8:           weight of each side tap pair (x nearest, w farthest).
    //   w1_8_ratio:     horizontal uv offset of each side tap from tex_uv
    //                   (already in uv units; applied to x only).
    //   weight_sum_inv: normalization factor applied to the final sum.
    float center_weight = 1.0;
    vec2 offset_x = vec2(w1_8_ratio.x, 0.0);
    vec2 offset_y = vec2(w1_8_ratio.y, 0.0);
    vec2 offset_z = vec2(w1_8_ratio.z, 0.0);
    vec2 offset_w = vec2(w1_8_ratio.w, 0.0);
    // Accumulate from the far left tap to the far right tap:
    vec3 accum = vec3(0.0,0.0,0.0);
    accum += (w1_8.w * texture(tex, tex_uv - offset_w).rgb);
    accum += (w1_8.z * texture(tex, tex_uv - offset_z).rgb);
    accum += (w1_8.y * texture(tex, tex_uv - offset_y).rgb);
    accum += (w1_8.x * texture(tex, tex_uv - offset_x).rgb);
    accum += (center_weight * texture(tex, tex_uv).rgb);
    accum += (w1_8.x * texture(tex, tex_uv + offset_x).rgb);
    accum += (w1_8.y * texture(tex, tex_uv + offset_y).rgb);
    accum += (w1_8.z * texture(tex, tex_uv + offset_z).rgb);
    accum += (w1_8.w * texture(tex, tex_uv + offset_w).rgb);
    return accum * weight_sum_inv;
}
void main()
{
    // Fragment stage: finish the bloom and recombine it with the dimpass.
    // Blur the brightpass horizontally (Source is this pass's input):
    vec3 blurred_brightpass = tex2Dblur17fastest(bloom_texture, video_uv, weight_sum_inv, w1_8, w1_8_ratio);
    // Sample the masked scanlines. Only rgb is read here; the auto-dim
    // factor comes from the static levels_autodim_temp via
    // undim_mask_contrast_factors, not from the alpha channel:
    vec3 intensity_dim = texture(MASKED_SCANLINEStexture, video_uv).rgb;
    // Calculate the mask dimpass (masked scanlines minus the brightpass),
    // add it to the blurred brightpass, and undim (from scanline auto-dim)
    // and amplify (from mask dim) the result:
    vec3 brightpass = texture(BRIGHTPASStexture, video_uv).rgb;
    vec3 phosphor_bloom = (intensity_dim - brightpass + blurred_brightpass) * undim_mask_contrast_factors;
    // intensity_dim and brightpass come from two separately stored render
    // targets, so their difference can fall slightly below zero. pow() is
    // undefined for a negative base in GLSL, so clamp before gamma-encoding:
    vec3 linear_color = max(phosphor_bloom, vec3(0.0));
    FragColor = vec4(GAMMA_OUT(linear_color), 1.0);
}

View File

@ -0,0 +1,204 @@
#version 450
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Push constants for this pass: the sizes of the pass input (Source), the
// original input and this pass's output, the frame counter, and the size of
// the aliased MASKED_SCANLINES pass.
layout(push_constant) uniform Push
{
vec4 SourceSize;
vec4 OriginalSize;
vec4 OutputSize;
uint FrameCount;
vec4 MASKED_SCANLINESSize;
} params;
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "bind-shader-params.h"
/////////////////////////////// VERTEX INCLUDES ///////////////////////////////
// Max pixel difference tolerated in a blurred triad; same value as
// bloom_diff_thresh in bloom-functions.h, which this file does not include.
float bloom_diff_thresh_ = 1.0/256.0;
// Smallest allowed mask tile width in pixels: ceil(min triad size * triads per tile).
float mask_min_allowed_tile_size = ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
// Pair of vec4s holding blur tap weights (w1_8) and the matching per-tap
// ratio/offset values (w1_8_ratio).
struct st_gauss{
vec4 w1_8;
vec4 w1_8_ratio;
};
#pragma stage vertex
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
// Outputs: the uv coordinate plus blur constants handed to the fragment stage.
layout(location = 0) out vec2 tex_uv;
layout(location = 1) out float bloom_sigma_runtime;
layout(location = 2) out vec4 w1_8;
layout(location = 3) out vec4 w1_8_ratio;
layout(location = 4) out float weight_sum_inv;
// copied from bloom-functions.h
float get_min_sigma_to_blur_triad(float triad_size,
float thresh)
{
    // Requires: 1.) triad_size is the final phosphor triad size in pixels.
    //           2.) thresh is the largest pixel difference tolerated in the
    //               blurred triad (e.g. 1.0/256.0).
    // Returns:  The smallest Gaussian sigma that blurs a phosphor triad to an
    //           even color, within thresh.
    // The closed form below came from curve-fitting measured data
    // (max error = ~0.086036, mean sq. error = ~0.0013387).  A coarser fit,
    // 0.5985*triad_size - triad_size*sqrt(thresh), has max error ~0.16486.
    float size_term = 0.6113*triad_size;
    float thresh_term = 1.122*triad_size*sqrt(0.000416 + thresh);
    return -0.05168 + size_term - thresh_term;
}
float get_fast_gaussian_weight_sum_inv(float sigma)
{
    // Returns the inverse of the Gaussian weight sum for a blur of the given
    // sigma.  Because the unnormalized center weight is 1.0, this is also the
    // normalized center weight.
    //
    // The exact asymptotic form comes from the Gaussian integral:
    //     center_weight = erf(0.5/(sigma*sqrt(2.0)))
    // Instead of erf, two curve-fitted estimates are evaluated and the smaller
    // one is returned.  The fit was built from 64 blur sizes in [3, 131) and
    // 255 equally-spaced sigmas per size, so it is biased toward smaller blurs
    // at small sigmas; its max error is 0.0031793913.  It measured marginally
    // faster than erf (relative FPS 135.8 vs 134.3).
    float curve_fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
    float asymptote = 0.399334576340352/sigma;
    return min(curve_fit, asymptote);
}
st_gauss get_blur_fastest_w1_8(float sigma)
{
    // Builds the packed taps for a 17-texel Gaussian blur taken with 9 fetches.
    // Requires: sigma is the Gaussian standard deviation in texels (nonzero).
    // Returns:  st_gauss where
    //   w1_8       = combined weight of each neighboring texel pair
    //                (1,2), (3,4), (5,6), (7,8);
    //   w1_8_ratio = texel offset at which one fetch between the two texels of
    //                a pair weights them in proportion to their Gaussian
    //                weights (this assumes the sampler filters linearly).
    // The center texel's unnormalized weight is 1.0 and is applied by the
    // caller, so it is not stored here.
    float denom_inv = 0.5/(sigma*sigma);
    // Unnormalized Gaussian weights exp(-d*d / (2*sigma*sigma)) for d = 1..8.
    float w1 = exp(-1.0 * denom_inv);
    float w2 = exp(-4.0 * denom_inv);
    float w3 = exp(-9.0 * denom_inv);
    float w4 = exp(-16.0 * denom_inv);
    float w5 = exp(-25.0 * denom_inv);
    float w6 = exp(-36.0 * denom_inv);
    float w7 = exp(-49.0 * denom_inv);
    float w8 = exp(-64.0 * denom_inv);
    st_gauss blur_weights;
    blur_weights.w1_8 = vec4(w1 + w2, w3 + w4, w5 + w6, w7 + w8);
    // Offset = nearer texel's distance + the farther texel's share of the pair.
    blur_weights.w1_8_ratio = vec4(w2, w4, w6, w8)/blur_weights.w1_8 + vec4(1.0, 3.0, 5.0, 7.0);
    return blur_weights;
}
vec2 get_resized_mask_tile_size(vec2 estimated_viewport_size, vec2 estimated_mask_resize_output_size, bool solemnly_swear_same_inputs_for_every_pass)
{
    // Returns the whole-pixel size of one resized phosphor mask tile.
    // estimated_viewport_size is not read by this copy of the function.
    // When solemnly_swear_same_inputs_for_every_pass is true, the y size is
    // additionally constrained by the x size to keep the tile aspect ratio.

    // The stated LUT dimensions define the tile aspect ratio (they must be
    // correct for the result to be meaningful):
    float aspect_y_over_x = mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
    float aspect_x_over_y = 1.0/aspect_y_over_x;
    vec2 aspect_scale = vec2(1.0, aspect_y_over_x);
    // Requested tile width, capped at the LUT width so we never upsize:
    float requested_width = mask_triads_per_tile * global.mask_triad_size_desired;
    float capped_width = min(requested_width, mask_resize_src_lut_size.x);
    // Keep the tile between the minimum allowed size and the size at which
    // mask_resize_num_tiles tiles still fit in the resize output:
    vec2 smallest_tile = mask_min_allowed_tile_size * aspect_scale;
    vec2 largest_tile = estimated_mask_resize_output_size / mask_resize_num_tiles;
    vec2 bounded_tile = clamp(capped_width * aspect_scale, smallest_tile, largest_tile);
    // Clamping may have broken the aspect ratio; shrink whichever dimension
    // is too large relative to the other:
    float width_from_height = bounded_tile.y * aspect_x_over_y;
    float height_from_width = mix(bounded_tile.y, bounded_tile.x * aspect_y_over_x, float(solemnly_swear_same_inputs_for_every_pass));
    vec2 fitted_tile = min(bounded_tile, vec2(width_from_height, height_from_width));
    // Tiled sampling needs integer tile sizes.  Floor so we never round up;
    // the small FIX_ZERO bias keeps floor from lowering exact whole numbers.
    return floor(fitted_tile + vec2(FIX_ZERO(0.0)));
}
void main()
{
    gl_Position = global.MVP * Position;
    tex_uv = TexCoord * 1.0001;
    // Derive the blur sigma at runtime from the on-screen phosphor triad size:
    vec2 viewport_px = params.OutputSize.xy;
    float tile_width_px = get_resized_mask_tile_size(viewport_px, viewport_px * mask_resize_viewport_scale, false).x;
    float triad_width_px = tile_width_px / mask_triads_per_tile;
    float sigma = get_min_sigma_to_blur_triad(triad_width_px, bloom_diff_thresh_);
    bloom_sigma_runtime = sigma;
    weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
    // Get the uv distance between output pixels, computed the same way as
    // blurs/shaders/vertex-shader-blur-fast-vertical.h.  Only the y step is
    // needed because this pass blurs vertically.
    vec2 src_over_out = params.SourceSize.xy/params.OutputSize.xy;
    float uv_step_y = (src_over_out/params.SourceSize.xy).y;
    // Precompute the pair weights and convert their offsets to uv units:
    st_gauss taps = get_blur_fastest_w1_8(sigma);
    w1_8 = taps.w1_8;
    w1_8_ratio = taps.w1_8_ratio * uv_step_y;
}
#pragma stage fragment
// Requests an sRGB 8-bit render target for this pass (slang `format` pragma).
#pragma format R8G8B8A8_SRGB
// Inputs mirror the vertex-stage outputs at the same locations.
// bloom_sigma_runtime is received but not read by the fragment code below.
layout(location = 0) in vec2 tex_uv;
layout(location = 1) in float bloom_sigma_runtime;
layout(location = 2) in vec4 w1_8;
layout(location = 3) in vec4 w1_8_ratio;
layout(location = 4) in float weight_sum_inv;
layout(location = 0) out vec4 FragColor;
// Texture being blurred by this pass.
layout(set = 0, binding = 2) uniform sampler2D Source;
#define input_texture Source
////////////////////////////// FRAGMENT INCLUDES //////////////////////////////
vec3 tex2Dblur17fastest(sampler2D tex, vec2 tex_uv, float weight_sum_inv, vec4 w1_8, vec4 w1_8_ratio)
{
    // Vertical 17-texel Gaussian blur taken with 9 fetches.
    // Requires: w1_8 holds the combined weight of each texel pair and
    //           w1_8_ratio the matching vertical uv offset of the fetch;
    //           weight_sum_inv normalizes the total.
    // Returns:  The normalized blurred rgb color at tex_uv.
    // Fetches are accumulated from the farthest tap above, through the
    // center, to the farthest tap below.
    vec2 offset_pair_1_2 = vec2(0.0, w1_8_ratio.x);
    vec2 offset_pair_3_4 = vec2(0.0, w1_8_ratio.y);
    vec2 offset_pair_5_6 = vec2(0.0, w1_8_ratio.z);
    vec2 offset_pair_7_8 = vec2(0.0, w1_8_ratio.w);
    vec3 acc = vec3(0.0);
    acc += w1_8.w * texture(tex, tex_uv - offset_pair_7_8).rgb;
    acc += w1_8.z * texture(tex, tex_uv - offset_pair_5_6).rgb;
    acc += w1_8.y * texture(tex, tex_uv - offset_pair_3_4).rgb;
    acc += w1_8.x * texture(tex, tex_uv - offset_pair_1_2).rgb;
    // The center texel has an unnormalized weight of exactly 1.0.
    acc += texture(tex, tex_uv).rgb;
    acc += w1_8.x * texture(tex, tex_uv + offset_pair_1_2).rgb;
    acc += w1_8.y * texture(tex, tex_uv + offset_pair_3_4).rgb;
    acc += w1_8.z * texture(tex, tex_uv + offset_pair_5_6).rgb;
    acc += w1_8.w * texture(tex, tex_uv + offset_pair_7_8).rgb;
    return acc * weight_sum_inv;
}
void main()
{
    // Vertically blur Source with the taps precomputed in the vertex stage
    // and write the result as an opaque pixel.
    vec3 blurred = tex2Dblur17fastest(Source, tex_uv, weight_sum_inv, w1_8, w1_8_ratio);
    FragColor = vec4(blurred, 1.0);
}

View File

@ -0,0 +1,150 @@
#version 450
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Per-pass values supplied through the push-constant block. The code visible
// in this file reads only OutputSize.xy directly; MASKED_SCANLINESSize is
// reachable through the macros below.
layout(push_constant) uniform Push
{
vec4 SourceSize;
vec4 OriginalSize;
vec4 OutputSize;
uint FrameCount;
vec4 MASKED_SCANLINESSize;
} params;
// Aliases for the MASKED_SCANLINES sampler and its size. Both the
// texture_size and video_size names resolve to the same .xy value.
#define MASKED_SCANLINEStexture MASKED_SCANLINES
#define MASKED_SCANLINEStexture_size params.MASKED_SCANLINESSize.xy
#define MASKED_SCANLINESvideo_size params.MASKED_SCANLINESSize.xy
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "bind-shader-params.h"
/////////////////////////////// VERTEX INCLUDES ///////////////////////////////
// Passed as `thresh` to get_min_sigma_to_blur_triad().
float bloom_diff_thresh_ = 1.0/256.0;
// Smallest tile width (in pixels) that get_resized_mask_tile_size() will allow.
float mask_min_allowed_tile_size = ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
#pragma stage vertex
// Vertex inputs.
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
// Per-vertex values handed to the fragment stage:
//   center_weight               - normalized center weight of the bloom blur
//   undim_mask_contrast_factors - undim * mask amplification * contrast
layout(location = 0) out vec2 tex_uv;
layout(location = 1) out float center_weight;
layout(location = 2) out float undim_mask_contrast_factors;
// copied from bloom-functions.h
float get_min_sigma_to_blur_triad(float triad_size, float thresh)
{
    // Curve-fitted estimate of the smallest Gaussian sigma that blurs a
    // phosphor triad of width triad_size to an even color, within thresh.
    float size_term = 0.6113*triad_size;
    float thresh_term = 1.122*triad_size*sqrt(0.000416 + thresh);
    return -0.05168 + size_term - thresh_term;
}
float get_fast_gaussian_weight_sum_inv(float sigma)
{
    // Estimates the inverse Gaussian weight sum for the given sigma as the
    // smaller of two fitted expressions.
    float curve_fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
    float asymptote = 0.399334576340352/sigma;
    return min(curve_fit, asymptote);
}
vec2 get_resized_mask_tile_size(vec2 estimated_viewport_size, vec2 estimated_mask_resize_output_size, bool solemnly_swear_same_inputs_for_every_pass)
{
    // Returns the whole-pixel size of one resized phosphor mask tile.
    // estimated_viewport_size is not read by this copy of the function.
    // When solemnly_swear_same_inputs_for_every_pass is true, the y size is
    // additionally constrained by the x size to keep the tile aspect ratio.

    // The stated LUT dimensions define the tile aspect ratio (they must be
    // correct for the result to be meaningful):
    float aspect_y_over_x = mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
    float aspect_x_over_y = 1.0/aspect_y_over_x;
    vec2 aspect_scale = vec2(1.0, aspect_y_over_x);
    // Requested tile width, capped at the LUT width so we never upsize:
    float requested_width = mask_triads_per_tile * global.mask_triad_size_desired;
    float capped_width = min(requested_width, mask_resize_src_lut_size.x);
    // Keep the tile between the minimum allowed size and the size at which
    // mask_resize_num_tiles tiles still fit in the resize output:
    vec2 smallest_tile = mask_min_allowed_tile_size * aspect_scale;
    vec2 largest_tile = estimated_mask_resize_output_size / mask_resize_num_tiles;
    vec2 bounded_tile = clamp(capped_width * aspect_scale, smallest_tile, largest_tile);
    // Clamping may have broken the aspect ratio; shrink whichever dimension
    // is too large relative to the other:
    float width_from_height = bounded_tile.y * aspect_x_over_y;
    float height_from_width = mix(bounded_tile.y, bounded_tile.x * aspect_y_over_x, float(solemnly_swear_same_inputs_for_every_pass));
    vec2 fitted_tile = min(bounded_tile, vec2(width_from_height, height_from_width));
    // Tiled sampling needs integer tile sizes.  Floor so we never round up;
    // the small FIX_ZERO bias keeps floor from lowering exact whole numbers.
    return floor(fitted_tile + vec2(FIX_ZERO(0.0)));
}
void main()
{
    gl_Position = global.MVP * Position;
    tex_uv = TexCoord;
    // Derive the bloom sigma at runtime from the on-screen phosphor triad
    // size, then take the blur's normalized center weight from it:
    vec2 viewport_px = params.OutputSize.xy;
    float tile_width_px = get_resized_mask_tile_size(viewport_px, viewport_px * mask_resize_viewport_scale, false).x;
    float sigma = get_min_sigma_to_blur_triad(tile_width_px / mask_triads_per_tile, bloom_diff_thresh_);
    center_weight = get_fast_gaussian_weight_sum_inv(sigma);
    // Fold the auto-dim reversal, mask amplification and contrast into a
    // single factor so the fragment stage needs one multiply:
    undim_mask_contrast_factors = (1.0/levels_autodim_temp) * get_mask_amplify() * levels_contrast;
}
#pragma stage fragment
// Inputs mirror the vertex-stage outputs at the same locations.
layout(location = 0) in vec2 tex_uv;
layout(location = 1) in float center_weight;
layout(location = 2) in float undim_mask_contrast_factors;
layout(location = 0) out vec4 FragColor;
// Dim masked scanlines (sampled through the MASKED_SCANLINEStexture alias).
layout(set = 0, binding = 2) uniform sampler2D MASKED_SCANLINES;
// Linearized input, used below as the estimate of a blurred image.
layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
////////////////////////////// FRAGMENT INCLUDES //////////////////////////////
void main()
{
    vec3 ones = vec3(1.0, 1.0, 1.0);
    // Masked scanlines as stored (auto-dimmed), and at full intensity after
    // undimming and mask compensation:
    vec3 masked_dim = texture(MASKED_SCANLINEStexture, tex_uv).rgb;
    vec3 masked_full = masked_dim * undim_mask_contrast_factors;
    // Use the contrast-scaled ORIG_LINEARIZED sample as the estimate of what a
    // straight blur of the masked scanlines would look like, so we can judge
    // how much energy blooming neighbors will deliver:
    vec3 blur_estimate = levels_contrast * texture(ORIG_LINEARIZED, tex_uv).rgb;
    // Whatever the estimate holds beyond this texel's own center-weighted
    // share is the most we expect to receive from neighbors:
    vec3 neighbor_energy = max(vec3(0.0, 0.0, 0.0), blur_estimate - center_weight * masked_full);
    // Assume neighbors blur 100% of their intensity (blur_ratio = 1.0): it is
    // simple and gives better results.  Scale both quantities by the user's
    // underestimate factor:
    vec3 neighbor_under = bloom_underestimate_levels * neighbor_energy;
    vec3 self_under = bloom_underestimate_levels * masked_full;
    // Solve for the fraction of this texel's intensity that should be
    // blurred, limited to [0, 1]:
    vec3 raw_ratio = ((ones - neighbor_under) / self_under - ones) / (center_weight - 1.0);
    vec3 blur_ratio = clamp(raw_ratio, 0.0, 1.0);
    // The brightpass is that fraction of the dim, unamplified masked
    // scanlines; bloom_excess pushes the fraction toward 1.0:
    vec3 brightpass = masked_dim * mix(blur_ratio, ones, global.bloom_excess);
    FragColor = vec4(brightpass, 1.0);
}

View File

@ -0,0 +1,97 @@
#version 450
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Per-pass values supplied through the push-constant block. This shader reads
// SourceSize.xy (texel step, interlacing detection) and FrameCount (field
// selection).
layout(push_constant) uniform Push
{
vec4 SourceSize;
vec4 OriginalSize;
vec4 OutputSize;
uint FrameCount;
} params;
////////////////////////////////// INCLUDES //////////////////////////////////
#include "bind-shader-params.h"
#include "scanline-functions.h"
// Decode an encoded color to linear light using the CRT gamma.
#define GAMMA_IN(color) pow(color, vec3(crt_gamma))
#pragma stage vertex
// Vertex inputs.
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
// Per-vertex values handed to the fragment stage:
//   uv_step    - uv size of one source texel
//   interlaced - 1.0 when is_interlaced() reports an interlaced source, else 0.0
layout(location = 0) out vec2 tex_uv;
layout(location = 1) out vec2 uv_step;
layout(location = 2) out float interlaced;
void main()
{
    gl_Position = global.MVP * Position;
    tex_uv = TexCoord * 1.00001;
    // uv size of a single source texel:
    uv_step = vec2(1.0)/params.SourceSize.xy;
    // Detect interlacing from the source height: 1.0 = true, 0.0 = false.
    interlaced = float(is_interlaced(params.SourceSize.y));
}
#pragma stage fragment
// Requests an sRGB 8-bit render target for this pass (slang `format` pragma).
#pragma format R8G8B8A8_SRGB
// Inputs mirror the vertex-stage outputs at the same locations.
layout(location = 0) in vec2 tex_uv;
layout(location = 1) in vec2 uv_step;
layout(location = 2) in float interlaced;
layout(location = 0) out vec4 FragColor;
// Texture to linearize (and bob, when interlacing detection is enabled).
layout(set = 0, binding = 2) uniform sampler2D Source;
#define input_texture Source
void main()
{
    // Linearize the input based on CRT gamma and bob interlaced fields.
    // Bobbing ensures we can immediately blur without getting artifacts.
    // Note: TFF/BFF won't matter for sources that double-weave or similar.
    if(bool(interlace_detect))
    {
        // Sample the current line and an average of the previous/next line;
        // GAMMA_IN decodes CRT gamma.  Progressive sources take this path too
        // (modulus becomes 1.0, so wrong_field is always 0.0):
        vec2 v_step = vec2(0.0, uv_step.y);
        vec3 curr_line = GAMMA_IN(texture(input_texture, tex_uv).rgb);
        vec3 last_line = GAMMA_IN(texture(input_texture, tex_uv - v_step).rgb);
        vec3 next_line = GAMMA_IN(texture(input_texture, tex_uv + v_step).rgb);
        vec3 interpolated_line = 0.5 * (last_line + next_line);
        // If we're interlacing, determine which field curr_line is in.
        // Reduce the frame counter to its parity in integer arithmetic before
        // converting to float: a float cannot represent every integer above
        // 2^24, so converting the raw counter would eventually stop the field
        // from alternating.  The result is otherwise unchanged because the
        // modulus is either 1.0 or 2.0.
        float modulus = interlaced + 1.0;
        float frame_parity = float(params.FrameCount % 2u);
        float field_offset = mod(frame_parity + global.interlace_bff, modulus);
        float curr_line_texel = tex_uv.y * params.SourceSize.y;
        // Use under_half to fix a rounding bug around exact texel locations.
        float line_num_last = floor(curr_line_texel - under_half);
        float wrong_field = mod(line_num_last + field_offset, modulus);
        // Keep the real line in the current field; use the interpolated line
        // for lines belonging to the other field:
        vec3 color = mix(curr_line, interpolated_line, wrong_field);
        FragColor = vec4(color, 1.0);
    }
    else
    {
        // Interlacing detection is off: just linearize the sample.
        FragColor = vec4(GAMMA_IN(texture(input_texture, tex_uv).rgb), 1.0);
    }
}

View File

@ -0,0 +1,114 @@
#version 450
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Per-pass values supplied through the push-constant block. This shader reads
// SourceSize.xy and OutputSize.xy.
layout(push_constant) uniform Push
{
vec4 SourceSize;
vec4 OriginalSize;
vec4 OutputSize;
uint FrameCount;
} params;
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "bind-shader-params.h"
////////////////////////////////// INCLUDES //////////////////////////////////
#include "phosphor-mask-resizing.h"
#pragma stage vertex
// Vertex inputs.
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
// Per-vertex values handed to the fragment stage:
//   src_tex_uv_wrap            - unwrapped source uv (fract() taken per fragment)
//   tile_uv_wrap               - output position measured in tiles
//   resize_magnification_scale - output tile size / input tile size
//   src_dxdy                   - uv step of one source texel in x
//   tile_size_uv               - input tile size in uv units
//   input_tiles_per_texture    - number of input tiles across the source
layout(location = 0) out vec2 src_tex_uv_wrap;
layout(location = 1) out vec2 tile_uv_wrap;
layout(location = 2) out vec2 resize_magnification_scale;
layout(location = 3) out vec2 src_dxdy;
layout(location = 4) out vec2 tile_size_uv;
layout(location = 5) out vec2 input_tiles_per_texture;
void main()
{
    gl_Position = global.MVP * Position;
    vec2 out_px = params.OutputSize.xy;
    vec2 src_px = params.SourceSize.xy;
    // First estimate the viewport size (the user will get the wrong number of
    // triads if it's wrong and mask_specify_num_triads is 1.0/true).
    vec2 viewport_estimate = out_px / mask_resize_viewport_scale;
    // Final size of a resized mask tile.  The previous pass probably estimated
    // the viewport and resize-output sizes differently, so do not swear the
    // inputs were the same.
    vec2 final_tile_px = get_resized_mask_tile_size(viewport_estimate, out_px, false);
    // Tiles are rendered until the output FBO is full or a limit is met, so
    // express the output position in [wrapped] tile coordinates:
    tile_uv_wrap = TexCoord * (out_px / final_tile_px);
    // An input tile is at most one LUT wide and already has its final height:
    vec2 in_tile_px = vec2(min(mask_resize_src_lut_size.x, src_px.x), final_tile_px.y);
    tile_size_uv = in_tile_px / src_px;
    input_tiles_per_texture = src_px / in_tile_px;
    // [Wrapped] source uv follows from the tile coordinate and the tile's uv
    // size; fract() is left to the fragment shader.
    src_tex_uv_wrap = tile_uv_wrap * tile_size_uv;
    // Magnification scale and the horizontal one-texel step:
    resize_magnification_scale = final_tile_px / in_tile_px;
    src_dxdy = vec2(1.0/src_px.x, 0.0);
}
#pragma stage fragment
// Inputs mirror the vertex-stage outputs at the same locations.
// input_tiles_per_texture is received but not read by the fragment code below.
layout(location = 0) in vec2 src_tex_uv_wrap;
layout(location = 1) in vec2 tile_uv_wrap;
layout(location = 2) in vec2 resize_magnification_scale;
layout(location = 3) in vec2 src_dxdy;
layout(location = 4) in vec2 tile_size_uv;
layout(location = 5) in vec2 input_tiles_per_texture;
layout(location = 0) out vec4 FragColor;
// Mask tiles to be resized horizontally by this pass.
layout(set = 0, binding = 2) uniform sampler2D Source;
#define input_texture Source
void main()
{
    // The input contains one mask tile horizontally and a number vertically.
    // Resize the tile horizontally to its final screen size and repeat it
    // until drawing at least mask_resize_num_tiles, leaving it unchanged
    // vertically.  Lanczos-resizing the phosphor mask achieves much sharper
    // results than mipmapping, and outputting >= mask_resize_num_tiles makes
    // for easier tiled sampling later.
    // Discard unneeded fragments in case our profile allows real branches.
    if(max(tile_uv_wrap.x, tile_uv_wrap.y) <= mask_resize_num_tiles)
    {
        // Wrap into the source texture and resample along x:
        vec2 src_tex_uv = fract(src_tex_uv_wrap);
        vec3 pixel_color = downsample_horizontal_sinc_tiled(input_texture, src_tex_uv, params.SourceSize.xy, src_dxdy.x, resize_magnification_scale.x, tile_size_uv.x);
        // The input LUT was linear RGB, and so is our output:
        FragColor = vec4(pixel_color, 1.0);
    }
    else
    {
        // Beyond the last tile we need to render.
        discard;
    }
}

View File

@ -0,0 +1,120 @@
#version 450
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Per-pass values supplied through the push-constant block. The code visible
// in this file reads only OutputSize.xy.
layout(push_constant) uniform Push
{
vec4 SourceSize;
vec4 OriginalSize;
vec4 OutputSize;
uint FrameCount;
} params;
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "bind-shader-params.h"
////////////////////////////////// INCLUDES //////////////////////////////////
#include "phosphor-mask-resizing.h"
#pragma stage vertex
// Vertex inputs.
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
// Per-vertex values handed to the fragment stage:
//   src_tex_uv_wrap            - output position in tiles (fract() taken per fragment)
//   resize_magnification_scale - this pass's output tile size / LUT size
layout(location = 0) out vec2 src_tex_uv_wrap;
layout(location = 1) out vec2 resize_magnification_scale;
void main()
{
    gl_Position = global.MVP * Position;
    vec2 out_px = params.OutputSize.xy;
    // First estimate the viewport size (the user will get the wrong number of
    // triads if it's wrong and mask_specify_num_triads is 1.0/true).
    vec2 viewport_estimate = out_px / mask_resize_viewport_scale.yy;
    // Estimate the output size of MASK_RESIZE (the next pass).  The x
    // component shouldn't matter: the x result is unused, and since we do not
    // swear the inputs match, x cannot influence y through the aspect ratio.
    vec2 next_pass_output_estimate = out_px;
    // Final intended [y] tile size, then the tile size for this pass, which
    // resizes y only and keeps x at no more than one LUT width:
    vec2 final_tile_px = get_resized_mask_tile_size(viewport_estimate, next_pass_output_estimate, false);
    vec2 pass_tile_px = vec2(min(mask_resize_src_lut_size.x, out_px.x), final_tile_px.y);
    // Tiles are rendered until the output FBO is full or a limit is met.  The
    // input LUT is a single mask tile, so texture uv equals tile uv (fract()
    // is left to the fragment shader):
    src_tex_uv_wrap = TexCoord * (out_px / pass_tile_px);
    // The magnification scale is the tile size relative to the LUT size:
    resize_magnification_scale = pass_tile_px / mask_resize_src_lut_size;
}
#pragma stage fragment
// Inputs mirror the vertex-stage outputs at the same locations.
layout(location = 0) in vec2 src_tex_uv_wrap;
layout(location = 1) in vec2 resize_magnification_scale;
layout(location = 0) out vec4 FragColor;
// Source is declared but not sampled by the fragment code below.
layout(set = 0, binding = 2) uniform sampler2D Source;
// Phosphor mask LUTs; the fragment code selects one of them by mask_type.
layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_small;
layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_small;
layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_small;
void main()
{
    // Resize the input phosphor mask tile to the final vertical size it will
    // appear at on screen.  Horizontally, keep 1x size when the output is at
    // least as wide as mask_resize_src_lut_size, and otherwise sample linearly
    // to fit exactly one tile.  Lanczos-resizing the mask is much sharper than
    // mipmapping, and resizing vertically first minimizes the total number of
    // taps.  We output >= mask_resize_num_tiles resized tiles for easier tiled
    // sampling later.
    // Discard unneeded fragments in case our profile allows real branches.
    if(src_tex_uv_wrap.y <= mask_resize_num_tiles)
    {
        // One LUT texel in uv units, and the wrapped sample position:
        float lut_texel_dy = 1.0/mask_resize_src_lut_size.y;
        vec2 wrapped_uv = fract(src_tex_uv_wrap);
        float scale_y = resize_magnification_scale.y;
        // Pick the LUT for the configured mask type: grille, slot or shadow.
        vec3 resized_rgb;
        if(mask_type < 0.5)
        {
            resized_rgb = downsample_vertical_sinc_tiled(mask_grille_texture_small, wrapped_uv, mask_resize_src_lut_size, lut_texel_dy, scale_y, 1.0);
        }
        else if(mask_type < 1.5)
        {
            resized_rgb = downsample_vertical_sinc_tiled(mask_slot_texture_small, wrapped_uv, mask_resize_src_lut_size, lut_texel_dy, scale_y, 1.0);
        }
        else
        {
            resized_rgb = downsample_vertical_sinc_tiled(mask_shadow_texture_small, wrapped_uv, mask_resize_src_lut_size, lut_texel_dy, scale_y, 1.0);
        }
        // The input LUT was linear RGB, and so is our output:
        FragColor = vec4(resized_rgb, 1.0);
    }
    else
    {
        // Beyond the last tile we need to render.
        discard;
    }
}

View File

@ -0,0 +1,112 @@
#version 450
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Per-pass values supplied through the push-constant block, including the
// sizes of the VERTICAL_SCANLINES and MASK_RESIZE textures.
layout(push_constant) uniform Push
{
vec4 SourceSize;
vec4 OriginalSize;
vec4 OutputSize;
vec4 VERTICAL_SCANLINESSize;
vec4 MASK_RESIZESize;
} params;
// Aliases for the two samplers and their sizes. For each texture, the
// texture_size and video_size names resolve to the same .xy value.
#define VERTICAL_SCANLINEStexture VERTICAL_SCANLINES
#define VERTICAL_SCANLINEStexture_size params.VERTICAL_SCANLINESSize.xy
#define VERTICAL_SCANLINESvideo_size params.VERTICAL_SCANLINESSize.xy
#define MASK_RESIZEtexture MASK_RESIZE
#define MASK_RESIZEtexture_size params.MASK_RESIZESize.xy
#define MASK_RESIZEvideo_size params.MASK_RESIZESize.xy
// Neither global below is referenced by the code visible in this file; the
// included headers may use them.
// NOTE(review): this divides OutputSize.x by SourceSize.y (an x by a y) —
// confirm that the mixed axes are intended.
float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y;
float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "bind-shader-params.h"
/////////////////////////////// VERTEX INCLUDES ///////////////////////////////
#include "scanline-functions.h"
#include "phosphor-mask-resizing.h"
/////////////////////////////////// HELPERS //////////////////////////////////
#pragma stage vertex
// Vertex inputs.
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
// Per-vertex values handed to the fragment stage:
//   scanline_texture_size_inv   - 1 / VERTICAL_SCANLINES size
//   mask_tile_start_uv_and_size - result of get_mask_sampling_parameters()
//   mask_tiles_per_screen       - filled in by get_mask_sampling_parameters()
layout(location = 0) out vec2 video_uv;
layout(location = 1) out vec2 scanline_texture_size_inv;
layout(location = 2) out vec4 mask_tile_start_uv_and_size;
layout(location = 3) out vec2 mask_tiles_per_screen;
void main()
{
    gl_Position = global.MVP * Position;
    video_uv = TexCoord;
    // Reciprocal of the scanline texture size, for horizontal sampling:
    scanline_texture_size_inv = vec2(1.0, 1.0)/VERTICAL_SCANLINEStexture_size;
    // Compute mask tile dimensions, starting points, etc. from the manually
    // resized mask.  Its texture size and video size are the same value here;
    // mask_tiles_per_screen is written by the call as well.
    mask_tile_start_uv_and_size = get_mask_sampling_parameters(MASK_RESIZEtexture_size, MASK_RESIZEvideo_size, params.OutputSize.xy, mask_tiles_per_screen);
}
#pragma stage fragment
// Inputs mirror the vertex-stage outputs at the same locations.
layout(location = 0) in vec2 video_uv;
layout(location = 1) in vec2 scanline_texture_size_inv;
layout(location = 2) in vec4 mask_tile_start_uv_and_size;
layout(location = 3) in vec2 mask_tiles_per_screen;
layout(location = 0) out vec4 FragColor;
// Source is declared but not sampled by the fragment code below.
layout(set = 0, binding = 2) uniform sampler2D Source;
// Vertically resampled scanlines, sampled horizontally by this pass.
layout(set = 0, binding = 3) uniform sampler2D VERTICAL_SCANLINES;
// Resized phosphor mask tiles.
layout(set = 0, binding = 4) uniform sampler2D MASK_RESIZE;
////////////////////////////// FRAGMENT INCLUDES //////////////////////////////
void main()
{
    // This pass samples the vertically interpolated scanlines at the final
    // horizontal resolution and multiplies them by the phosphor mask.  The
    // result is still dim (scanline auto-dim and mask dimming).

    // Locate this pixel inside the tiled, resized mask and fetch the mask:
    vec2 wrapped_tile_uv = video_uv * mask_tiles_per_screen;
    vec2 mask_uv = convert_phosphor_tile_uv_wrap_to_tex_uv(wrapped_tile_uv, mask_tile_start_uv_and_size);
    vec3 mask_rgb = texture(MASK_RESIZEtexture, mask_uv).rgb;
    // Horizontally sample the current row; the helper accounts for horizontal
    // convergence offsets, which are given in units of texels:
    vec3 scanline_rgb = sample_rgb_scanline_horizontal(VERTICAL_SCANLINEStexture, video_uv, VERTICAL_SCANLINEStexture_size, scanline_texture_size_inv);
    // Apply the phosphor mask:
    FragColor = vec4(scanline_rgb * mask_rgb, 1.0);
}

View File

@ -0,0 +1,126 @@
#version 450
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Per-pass values supplied through the push-constant block.
// ORIG_LINEARIZEDSize is used below as .xy = texture size and .zw = its
// reciprocal.
layout(push_constant) uniform Push
{
vec4 SourceSize;
vec4 OriginalSize;
vec4 OutputSize;
uint FrameCount;
vec4 ORIG_LINEARIZEDSize;
} params;
////////////////////////////////// INCLUDES //////////////////////////////////
#include "bind-shader-params.h"
#include "scanline-functions.h"
#pragma stage vertex
// Vertex inputs.
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
// Per-vertex values handed to the fragment stage.
layout(location = 0) out vec2 tex_uv;
layout(location = 1) out vec2 uv_step; // uv size of a texel (x) and scanline (y)
layout(location = 2) out vec2 il_step_multiple; // (1, 1) = progressive, (1, 2) = interlaced
layout(location = 3) out float pixel_height_in_scanlines; // Height of an output pixel in scanlines
void main()
{
gl_Position = global.MVP * Position;
tex_uv = TexCoord * 1.00001;
// Detect interlacing: il_step_multiple indicates the step multiple between
// lines: 1 is for progressive sources, and 2 is for interlaced sources.
vec2 video_size_ = params.ORIG_LINEARIZEDSize.xy;
float y_step = 1.0 + float(is_interlaced(video_size_.y));
il_step_multiple = vec2(1.0, y_step);
// Get the uv tex coords step between one texel (x) and scanline (y):
uv_step = il_step_multiple / params.ORIG_LINEARIZEDSize.xy;
// We need the pixel height in scanlines for antialiased/integral sampling:
float ph = (video_size_.y / params.OutputSize.y) / il_step_multiple.y;
pixel_height_in_scanlines = ph;
}
#pragma stage fragment
#pragma format R8G8B8A8_SRGB
layout(location = 0) in vec2 tex_uv;
layout(location = 1) in vec2 uv_step; // uv size of a texel (x) and scanline (y)
layout(location = 2) in vec2 il_step_multiple; // (1, 1) = progressive, (1, 2) = interlaced
layout(location = 3) in float pixel_height_in_scanlines; // Height of an output pixel in scanlines
layout(location = 0) out vec4 FragColor;
layout(set = 0, binding = 2) uniform sampler2D Source;
layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
#define input_texture ORIG_LINEARIZED
// Fragment stage: resample scanlines vertically.
// Three same-field scanlines are sampled (the one at/above the sample, the
// one below it, and whichever outer neighbour is closer), each is weighted
// by scanline_contrib(), and the sum is written out scaled by
// levels_autodim_temp so the additive result does not clip.
void main()
{
    // Read some attributes into local variables:
    vec2 texture_size_ = params.ORIG_LINEARIZEDSize.xy;
    vec2 texture_size_inv = params.ORIG_LINEARIZEDSize.zw;
    float frame_count = float(params.FrameCount);
    float ph = pixel_height_in_scanlines;

    // Get the uv coords of the previous scanline (in this field), and the
    // scanline's distance from this sample, in scanlines.
    float dist;
    vec2 scanline_uv = get_last_scanline_uv(tex_uv, texture_size_, texture_size_inv, il_step_multiple, frame_count, dist);

    // NOTE: Anisotropic filtering creates interlacing artifacts, which is why
    // ORIG_LINEARIZED bobbed any interlaced input before this pass.
    vec2 v_step = vec2(0.0, uv_step.y);
    vec3 scanline2_color = texture(input_texture, scanline_uv).rgb;
    vec3 scanline3_color = texture(input_texture, scanline_uv + v_step).rgb;

    // The original comment states dist is in [0, 1]. Rounding it selects the
    // outer neighbour: 0 -> the scanline one step above scanline 2,
    // 1 -> the scanline two steps below it (one below scanline 3).
    float dist_round = round(dist);
    vec2 sample_1or4_uv_off = mix(-v_step, 2.0 * v_step, dist_round);
    vec3 scanline_outside_color = texture(input_texture, scanline_uv + sample_1or4_uv_off).rgb;

    // dist2 means "positive sample distance from scanline 2, in scanlines";
    // the same distance is used for all three color channels here.
    vec3 dist2 = vec3(dist);

    // Calculate and sum final scanline contributions, starting with lines 2/3.
    // There is no normalization step, because we're not interpolating a
    // continuous signal. Instead, each scanline is an additive light source.
    vec3 scanline2_contrib = scanline_contrib(dist2, scanline2_color, ph, sigma_range, shape_range);
    vec3 scanline3_contrib = scanline_contrib(abs(vec3(1.0,1.0,1.0) - dist2), scanline3_color, ph, sigma_range, shape_range);
    vec3 scanline_intensity = scanline2_contrib + scanline3_contrib;

    // Distance to the outer neighbour: dist + 1 when it lies above
    // scanline 2, or 2 - dist when it lies below scanline 3.
    vec3 dist1or4 = mix(dist2 + vec3(1.0,1.0,1.0), vec3(2.0,2.0,2.0) - dist2, dist_round);
    vec3 scanline1or4_contrib = scanline_contrib(dist1or4, scanline_outside_color, ph, sigma_range, shape_range);
    scanline_intensity += scanline1or4_contrib;

    // Auto-dim the image to avoid clipping and output. The original author
    // notes that a per-pixel minimal auto-dim factor stored in alpha did not
    // work reliably, and that levels_autodim_temp = 0.5 causes no banding.
    FragColor = vec4(scanline_intensity * levels_autodim_temp, 1.0);
}

View File

@ -0,0 +1,271 @@
#ifndef PHOSPHOR_MASK_RESIZING_H
#define PHOSPHOR_MASK_RESIZING_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////// INCLUDES //////////////////////////////////
///////////////////////////// CODEPATH SELECTION /////////////////////////////
// NOTE(review): USE_SINGLE_STATIC_LOOP is not tested by any #if/#ifdef in
// the visible part of this header; the resampling functions below always use
// a single fixed-bound loop.
#define USE_SINGLE_STATIC_LOOP
////////////////////////////////// CONSTANTS /////////////////////////////////
// The larger the resized tile, the fewer samples we'll need for downsizing.
// See if we can get a static min tile size > mask_min_allowed_tile_size:
// (mask_min_allowed_triad_size, mask_triads_per_tile, mask_sinc_lobes,
// mask_resize_src_lut_size and pi are defined outside this header.)
float mask_min_allowed_tile_size = ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
// In this version the expected minimum is simply the allowed minimum.
float mask_min_expected_tile_size = mask_min_allowed_tile_size;
// Limit the number of sinc resize taps by the maximum minification factor:
// pi_over_lobes is only consumed by the Lanczos-windowed weight macro.
float pi_over_lobes = pi/mask_sinc_lobes;
float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * mask_resize_src_lut_size.x/mask_min_expected_tile_size;
// Vectorized loops sample in multiples of 4. Round up to be safe:
float max_sinc_resize_samples_m4 = ceil(max_sinc_resize_samples_float * 0.25) * 4.0;
///////////////////////// RESAMPLING FUNCTION HELPERS ////////////////////////
// Locate the first tap of a `samples`-wide resampling window around tex_uv.
// Returns vec2(tile-space coordinate of the first tap, distance in texels
// from the first tap to the output sample) for the chosen axis:
// the y components when `vertical` is true, the x components otherwise.
vec2 get_first_texel_tile_uv_and_dist(vec2 tex_uv, vec2 tex_size, float dr, float input_tiles_per_texture_r, float samples, bool vertical)
{
    // Output sample position in texel units.
    vec2 sample_pos = tex_uv * tex_size;
    // Center of the texel at/just before the sample; under_half (defined
    // elsewhere) is used instead of 0.5 before flooring.
    vec2 left_center = floor(sample_pos - vec2(under_half)) + vec2(0.5);
    // Step back to the first tap of the window.
    vec2 window_start = left_center - vec2(samples/2.0 - 1.0);
    vec2 start_to_sample = sample_pos - window_start;
    // Convert the first tap from texels to uv (via dr), then to tile units,
    // so later taps can wrap with fract().
    vec2 start_tile_uv_wrap = (window_start * dr) * input_tiles_per_texture_r;
    // Wrap the coordinate; 1.0 is added on any axis where the unwrapped
    // coordinate was negative, exactly as the original code does.
    vec2 below_zero = vec2(lessThan(start_tile_uv_wrap, vec2(0.0)));
    vec2 start_tile_uv = fract(start_tile_uv_wrap) + below_zero;
    // Pack the requested axis as (tile coordinate, texel distance).
    if(vertical)
    {
        return vec2(start_tile_uv.y, start_to_sample.y);
    }
    return vec2(start_tile_uv.x, start_to_sample.x);
}
// Texture fetch used by the sinc-resampling loop bodies.
// Mipmapping and anisotropic filtering get confused by sinc-resampling.
// This version does NOT force a mip level: it is a plain texture() lookup,
// so it relies on the sampled texture not being mipmapped.
vec4 tex2Dlod0try(sampler2D tex, vec2 tex_uv)
{
    vec4 texel = texture(tex, tex_uv);
    return texel;
}
////////////////////////////// LOOP BODY MACROS //////////////////////////////
// These macros expand inside the resampling loops below and capture the
// enclosing function's variables BY NAME: i_base, i, first_texel_tile_uv_rrrr,
// tile_dr, tile_size_uv_r, magnification_scale, first_dist_unscaled,
// pixel_color, weight_sum, tex and tex_uv (plus the globals pi and
// pi_over_lobes). Renaming any of those at the expansion site breaks them.
// Do not place // comments inside the continued lines: the backslash line
// splice would pull the following line into the comment.
//
// CALCULATE_R_COORD_FOR_4_SAMPLES: for taps i..i+3, wrap the tile-space
// coordinate along the resized axis with fract() and scale it to texture uv.
#define CALCULATE_R_COORD_FOR_4_SAMPLES \
vec4 true_i = vec4(i_base + i) + vec4(0.0, 1.0, 2.0, 3.0); \
vec4 tile_uv_r = fract( \
first_texel_tile_uv_rrrr + true_i * tile_dr); \
vec4 tex_uv_r = tile_uv_r * tile_size_uv_r;
// CALCULATE_SINC_RESAMPLE_WEIGHTS: sinc weights for four taps, optionally
// multiplied by a second sinc window when PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// is defined. Expects `dist` and `pi_dist` to be in scope.
// NOTE(review): when dist is exactly 0 the quotient is 0/0; the min(..., 1.0)
// presumably relies on min() returning the non-NaN operand, which GLSL leaves
// undefined — confirm on target drivers.
#ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
#define CALCULATE_SINC_RESAMPLE_WEIGHTS \
vec4 pi_dist_over_lobes = pi_over_lobes * dist; \
vec4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
(pi_dist*pi_dist_over_lobes), vec4(1.0));
#else
#define CALCULATE_SINC_RESAMPLE_WEIGHTS \
vec4 weights = min(sin(pi_dist)/pi_dist, vec4(1.0));
#endif
// UPDATE_COLOR_AND_WEIGHT_SUMS: compute the scaled tap distances, derive the
// weights, and accumulate the four weighted samples (new_sample0..3) into
// pixel_color and the four weights into weight_sum.
#define UPDATE_COLOR_AND_WEIGHT_SUMS \
vec4 dist = magnification_scale * \
abs(first_dist_unscaled - true_i); \
vec4 pi_dist = pi * dist; \
CALCULATE_SINC_RESAMPLE_WEIGHTS; \
pixel_color += new_sample0 * weights.xxx; \
pixel_color += new_sample1 * weights.yyy; \
pixel_color += new_sample2 * weights.zzz; \
pixel_color += new_sample3 * weights.www; \
weight_sum += weights;
// VERTICAL_SINC_RESAMPLE_LOOP_BODY: fetch four taps that vary along v
// (u stays at tex_uv.x) and accumulate them.
#define VERTICAL_SINC_RESAMPLE_LOOP_BODY \
CALCULATE_R_COORD_FOR_4_SAMPLES; \
vec3 new_sample0 = tex2Dlod0try(tex, \
vec2(tex_uv.x, tex_uv_r.x)).rgb; \
vec3 new_sample1 = tex2Dlod0try(tex, \
vec2(tex_uv.x, tex_uv_r.y)).rgb; \
vec3 new_sample2 = tex2Dlod0try(tex, \
vec2(tex_uv.x, tex_uv_r.z)).rgb; \
vec3 new_sample3 = tex2Dlod0try(tex, \
vec2(tex_uv.x, tex_uv_r.w)).rgb; \
UPDATE_COLOR_AND_WEIGHT_SUMS;
// HORIZONTAL_SINC_RESAMPLE_LOOP_BODY: fetch four taps that vary along u
// (v stays at tex_uv.y) and accumulate them.
#define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \
CALCULATE_R_COORD_FOR_4_SAMPLES; \
vec3 new_sample0 = tex2Dlod0try(tex, \
vec2(tex_uv_r.x, tex_uv.y)).rgb; \
vec3 new_sample1 = tex2Dlod0try(tex, \
vec2(tex_uv_r.y, tex_uv.y)).rgb; \
vec3 new_sample2 = tex2Dlod0try(tex, \
vec2(tex_uv_r.z, tex_uv.y)).rgb; \
vec3 new_sample3 = tex2Dlod0try(tex, \
vec2(tex_uv_r.w, tex_uv.y)).rgb; \
UPDATE_COLOR_AND_WEIGHT_SUMS;
//////////////////////////// RESAMPLING FUNCTIONS ////////////////////////////
// Downsample a tiled texture along the vertical axis with a (optionally
// windowed) sinc filter and return the normalized color.
//   tex, tex_uv          - texture to sample and the output location in uv
//   tex_size             - texture size in texels (multiplied with tex_uv)
//   dr                   - multiplies texel positions to get uv along the
//                          resized axis (presumably 1/texture height — confirm
//                          at the call site)
//   magnification_scale  - scales texel distances before the sinc is evaluated
//   tile_size_uv_r       - uv size of one tile along the resized axis
// The local names below are captured by the loop-body macros, so they must
// keep these exact spellings.
vec3 downsample_vertical_sinc_tiled(sampler2D tex, vec2 tex_uv, vec2 tex_size, float dr, float magnification_scale, float tile_size_uv_r)
{
// Fixed tap count (a multiple of 4) derived from the global constants.
int samples = int(max_sinc_resize_samples_m4);
// Get the first sample location (scalar tile uv coord along the resized
// dimension) and distance from the output location (in texels):
float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
// true = vertical resize:
vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
// Get the tile sample offset:
float tile_dr = dr * input_tiles_per_texture_r;
// Sum up each weight and weighted sample color, four taps per iteration.
// Only one looping strategy remains in this version. See the loop body
// macros above.
int i_base = 0;
int i_step = 4;
vec4 weight_sum = vec4(0.0);
vec3 pixel_color = vec3(0.0);
for(int i = 0; i < samples; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
// Normalize so the weight_sum == 1.0, and return:
// (the four per-lane weight sums are reduced pairwise to one scalar)
vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
vec3 scalar_weight_sum = vec3(weight_sum_reduce.x + weight_sum_reduce.y);
return (pixel_color/scalar_weight_sum);
}
// Downsample a tiled texture along the horizontal axis with a (optionally
// windowed) sinc filter and return the normalized color.
//   tex, tex_uv          - texture to sample and the output location in uv
//   tex_size             - texture size in texels (multiplied with tex_uv)
//   dr                   - multiplies texel positions to get uv along the
//                          resized axis (presumably 1/texture width — confirm
//                          at the call site)
//   magnification_scale  - scales texel distances before the sinc is evaluated
//   tile_size_uv_r       - uv size of one tile along the resized axis
// The local names below are captured by the loop-body macros, so they must
// keep these exact spellings.
vec3 downsample_horizontal_sinc_tiled(sampler2D tex, vec2 tex_uv, vec2 tex_size, float dr, float magnification_scale, float tile_size_uv_r)
{
// Fixed tap count (a multiple of 4) derived from the global constants.
int samples = int(max_sinc_resize_samples_m4);
// Get the first sample location (scalar tile uv coord along resized
// dimension) and distance from the output location (in texels):
float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
// false = horizontal resize:
vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
// Get the tile sample offset:
float tile_dr = dr * input_tiles_per_texture_r;
// Sum up each weight and weighted sample color, four taps per iteration.
// Only one looping strategy remains in this version. See the loop body
// macros above.
int i_base = 0;
int i_step = 4;
vec4 weight_sum = vec4(0.0);
vec3 pixel_color = vec3(0.0);
for(int i = 0; i < samples; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
// Normalize so the weight_sum == 1.0, and return:
// (the four per-lane weight sums are reduced pairwise to one scalar)
vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
vec3 scalar_weight_sum = vec3(weight_sum_reduce.x + weight_sum_reduce.y);
return (pixel_color/scalar_weight_sum);
}
//////////////////////////// TILE SIZE CALCULATION ///////////////////////////
// Compute the integer size (in texels) of one resized phosphor-mask tile.
// estimated_viewport_size is accepted for interface compatibility but is
// not read by this implementation.
// When solemnly_swear_same_inputs_for_every_pass is true, the tile height is
// additionally limited by the height implied by the clamped width.
vec2 get_resized_mask_tile_size(vec2 estimated_viewport_size, vec2 estimated_mask_resize_output_size, bool solemnly_swear_same_inputs_for_every_pass)
{
    // Aspect of the source mask tile (stated tile properties must be correct).
    float inv_aspect = mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
    float aspect = 1.0/inv_aspect;
    vec2 aspect_scale = vec2(1.0, inv_aspect);
    // Desired tile width from the user's triad size, never larger than the
    // source tile (no upsizing).
    float wanted_width = min(mask_triads_per_tile * global.mask_triad_size_desired, mask_resize_src_lut_size.x);
    // Enforce the minimum and maximum tile size in both dimensions.
    vec2 lower_bound = mask_min_allowed_tile_size * aspect_scale;
    vec2 upper_bound = estimated_mask_resize_output_size / mask_resize_num_tiles;
    vec2 bounded = clamp(wanted_width * aspect_scale, lower_bound, upper_bound);
    // Re-limit each dimension by the size implied by the other one.
    float width_from_height = bounded.y * aspect;
    float height_from_width = mix(bounded.y, bounded.x * inv_aspect, float(solemnly_swear_same_inputs_for_every_pass));
    vec2 consistent = min(bounded, vec2(width_from_height, height_from_width));
    // Tiled sampling needs integer tile sizes. Floor so we never round up;
    // the FIX_ZERO term guards against floor() lowering whole numbers.
    return floor(consistent + vec2(FIX_ZERO(0.0)));
}
///////////////////////// FINAL MASK SAMPLING HELPERS ////////////////////////
// Describe where one resized mask tile lives inside the MASK_RESIZE texture.
// Returns vec4(tile start uv, tile size in uv) and writes the number of
// tiles that span the true viewport to mask_tiles_per_screen.
vec4 get_mask_sampling_parameters(vec2 mask_resize_texture_size, vec2 mask_resize_video_size, vec2 true_viewport_size, out vec2 mask_tiles_per_screen)
{
    vec2 tile_texels = get_resized_mask_tile_size(true_viewport_size, mask_resize_video_size, false);
    // mask_tiles_per_screen must be based on the *true* viewport size:
    mask_tiles_per_screen = true_viewport_size / tile_texels;
    // The resized tile is a fraction of the texture size and starts at a
    // nonzero offset (mask_start_texels) to allow for border texels.
    vec2 tile_uv_extent = tile_texels / mask_resize_texture_size;
    vec2 tile_uv_origin = (mask_start_texels/tile_texels) * tile_uv_extent;
    return vec4(tile_uv_origin, tile_uv_extent);
}
// Map an unbounded tile coordinate to a uv inside the single stored tile:
// wrap with fract(), scale by the tile's uv size (.zw) and offset by the
// tile's start uv (.xy).
vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(vec2 tile_uv_wrap, vec4 mask_tile_start_uv_and_size)
{
    vec2 wrapped = fract(tile_uv_wrap);
    return mask_tile_start_uv_and_size.xy + wrapped * mask_tile_start_uv_and_size.zw;
}
#endif // PHOSPHOR_MASK_RESIZING_H

View File

@ -0,0 +1,309 @@
#ifndef SCANLINE_FUNCTIONS_H
#define SCANLINE_FUNCTIONS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////// INCLUDES //////////////////////////////////
#include "special-functions.h"
///////////////////////////// SCANLINE FUNCTIONS /////////////////////////////
// Per-channel beam sigma as a function of color:
// beam_min_sigma + sigma_range * f(color), where f is a power curve when
// beam_spot_shape_function < 0.5 and a spherical (quarter-circle) curve
// otherwise.
vec3 get_gaussian_sigma(vec3 color, float sigma_range)
{
    vec3 spread;
    if(beam_spot_shape_function < 0.5)
    {
        // Power function:
        spread = pow(color, vec3(beam_spot_power));
    }
    else
    {
        // Spherical function: sqrt(1 - (color - 1)^2)
        vec3 offset = color - vec3(1.0);
        spread = sqrt(vec3(1.0) - offset*offset);
    }
    return vec3(beam_min_sigma) + sigma_range * spread;
}
// Per-channel generalized-Gaussian shape parameter (beta):
// beam_min_shape + shape_range * color^beam_shape_power.
vec3 get_generalized_gaussian_beta(vec3 color, float shape_range)
{
    vec3 shape_curve = pow(color, vec3(beam_shape_power));
    return beam_min_shape + shape_range * shape_curve;
}
// Scanline contribution for a Gaussian beam, integrated over the pixel's
// height: the erf() difference between the pixel's two edges, averaged over
// pixel_height and scaled by color.
vec3 scanline_gaussian_integral_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range)
{
    vec3 beam_sigma = get_gaussian_sigma(color, sigma_range);
    // 1/(sigma*sqrt(2)) converts a distance into the erf() argument.
    vec3 to_erf_arg = 1.0/(beam_sigma*sqrt(2.0));
    vec3 half_pixel = vec3(pixel_height * 0.5);
    vec3 upper = erf((dist + half_pixel)*to_erf_arg);
    vec3 lower = erf((dist - half_pixel)*to_erf_arg);
    return color * 0.5*(upper - lower)/pixel_height;
}
// Scanline contribution for a generalized-Gaussian beam, integrated over the
// pixel's height via the normalized lower incomplete gamma function.
vec3 scanline_generalized_gaussian_integral_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range, float shape_range)
{
    // Scale (alpha) and shape (beta) of the generalized Gaussian.
    vec3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
    vec3 beta = get_generalized_gaussian_beta(color, shape_range);
    vec3 inv_alpha = vec3(1.0)/alpha;
    // s = 1/beta, so beta doubles as the precomputed 1/s for the helpers.
    vec3 s = vec3(1.0)/beta;
    vec3 inv_gamma_s = vec3(1.0)/gamma_impl(s, beta);
    // Signed distances to the pixel's two edges.
    vec3 half_pixel = vec3(pixel_height * 0.5);
    vec3 edge_hi = dist + half_pixel;
    vec3 edge_lo = dist - half_pixel;
    vec3 upper = sign(edge_hi) * normalized_ligamma_impl(s, pow(abs(edge_hi)*inv_alpha, beta), beta, inv_gamma_s);
    vec3 lower = sign(edge_lo) * normalized_ligamma_impl(s, pow(abs(edge_lo)*inv_alpha, beta), beta, inv_gamma_s);
    return color * 0.5*(upper - lower)/pixel_height;
}
// Scanline contribution for a Gaussian beam evaluated by point sampling.
// With beam_antialias_level > 0.5, three samples (center and +/- a third of
// a pixel) are averaged; otherwise only the center sample is used.
vec3 scanline_gaussian_sampled_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range)
{
    vec3 inv_sigma = vec3(1.0)/get_gaussian_sigma(color, sigma_range);
    // Exponent scale 1/(2*sigma^2) and peak height 1/(sigma*sqrt(2*pi)).
    vec3 exp_scale = 0.5 * inv_sigma * inv_sigma;
    vec3 peak = inv_sigma/sqrt(2.0*pi);
    vec3 center_weight = exp(-(dist*dist)*exp_scale);
    if(!(beam_antialias_level > 0.5))
    {
        return color*center_weight*peak;
    }
    // Sample 1/3 pixel away in each direction as well:
    vec3 third = vec3(pixel_height/3.0);
    vec3 dist_far = dist + third;
    vec3 dist_near = abs(dist - third);
    vec3 far_weight = exp(-(dist_far*dist_far)*exp_scale);
    vec3 near_weight = exp(-(dist_near*dist_near)*exp_scale);
    // Average three pure Gaussian samples:
    vec3 third_of_peak = color/3.0 * peak;
    return third_of_peak * (center_weight + far_weight + near_weight);
}
// Scanline contribution for a generalized-Gaussian beam evaluated by point
// sampling. With beam_antialias_level > 0.5, three samples (center and
// +/- a third of a pixel) are averaged; otherwise only the center is used.
vec3 scanline_generalized_gaussian_sampled_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range, float shape_range)
{
    vec3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
    vec3 beta = get_generalized_gaussian_beta(color, shape_range);
    // Reciprocals are computed once to avoid repeated divides.
    vec3 inv_alpha = vec3(1.0)/alpha;
    vec3 inv_beta = vec3(1.0)/beta;
    // Normalization: color * beta / (2 * alpha * Gamma(1/beta)).
    vec3 norm = color * beta * 0.5 * inv_alpha / gamma_impl(inv_beta, beta);
    vec3 center_weight = exp(-pow(abs(dist*inv_alpha), beta));
    if(!(beam_antialias_level > 0.5))
    {
        return norm * center_weight;
    }
    // Sample 1/3 pixel closer to and farther from the scanline too.
    vec3 third = vec3(pixel_height/3.0);
    vec3 dist_far = dist + third;
    vec3 dist_near = abs(dist - third);
    vec3 far_weight = exp(-pow(abs(dist_far*inv_alpha), beta));
    vec3 near_weight = exp(-pow(abs(dist_near*inv_alpha), beta));
    // Average three generalized Gaussian samples:
    return norm/3.0 * (center_weight + far_weight + near_weight);
}
// Dispatch to one of the four scanline contribution models:
// beam_generalized_gaussian picks the beam profile, and
// beam_antialias_level > 1.5 picks the integral form over the sampled form.
vec3 scanline_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range, float shape_range)
{
    if(beam_generalized_gaussian)
    {
        if(beam_antialias_level > 1.5)
        {
            return scanline_generalized_gaussian_integral_contrib(dist, color, pixel_height, sigma_range, shape_range);
        }
        return scanline_generalized_gaussian_sampled_contrib(dist, color, pixel_height, sigma_range, shape_range);
    }
    if(beam_antialias_level > 1.5)
    {
        return scanline_gaussian_integral_contrib(dist, color, pixel_height, sigma_range);
    }
    return scanline_gaussian_sampled_contrib(dist, color, pixel_height, sigma_range);
}
// 2 - Apply mask only.
// Weighted sum of four colors (one weight per color), clamped at zero.
vec3 get_raw_interpolated_color(vec3 color0, vec3 color1, vec3 color2, vec3 color3, vec4 weights)
{
    // The four colors form the columns of a 4-column matrix, so the
    // matrix-vector product is the weighted sum.
    vec3 blended = mat4x3(color0, color1, color2, color3) * weights;
    // Use max to avoid bizarre artifacts from negative colors:
    return max(blended, 0.0);
}
// Interpolate four linear-RGB colors both in linear space and in a
// gamma-encoded space (exponent 1/lcd_gamma), then blend the two results by
// global.beam_horiz_linear_rgb_weight (0 = gamma-space mix, 1 = linear mix).
// Note that the gamma-space result is not decoded back before blending.
vec3 get_interpolated_linear_color(vec3 color0, vec3 color1, vec3 color2, vec3 color3, vec4 weights)
{
    // Encoding exponent for the intermediate gamma space.
    float intermediate_gamma = lcd_gamma;
    vec3 encode_exp = vec3(1.0/intermediate_gamma);
    vec3 mixed_in_linear = get_raw_interpolated_color(color0, color1, color2, color3, weights);
    vec3 mixed_in_gamma = get_raw_interpolated_color(
        pow(color0, encode_exp),
        pow(color1, encode_exp),
        pow(color2, encode_exp),
        pow(color3, encode_exp),
        weights);
    // FIXME (carried over from the original source): the author flagged this
    // blend for review and left a hard-coded weight of 1.0 commented out.
    return mix(mixed_in_gamma, mixed_in_linear, global.beam_horiz_linear_rgb_weight);
}
// Fetch the 2 or 4 texels around scanline_uv along x and interpolate them
// with the given weights. The outer two texels are only fetched when
// beam_horiz_filter > 0.5; otherwise they are left black.
vec3 get_scanline_color(sampler2D tex, vec2 scanline_uv, vec2 uv_step_x, vec4 weights)
{
    vec3 outer_left = vec3(0.0);
    vec3 outer_right = vec3(0.0);
    vec3 inner_left = texture(tex, scanline_uv).rgb;
    vec3 inner_right = texture(tex, scanline_uv + uv_step_x).rgb;
    if(beam_horiz_filter > 0.5)
    {
        outer_left = texture(tex, scanline_uv - uv_step_x).rgb;
        outer_right = texture(tex, scanline_uv + 2.0 * uv_step_x).rgb;
    }
    return get_interpolated_linear_color(outer_left, inner_left, inner_right, outer_right, weights);
}
// Horizontally resample one scanline at tex_uv using the filter selected by
// beam_horiz_filter: < 0.5 Quilez (2 taps), < 1.5 Gaussian (4 taps),
// otherwise Lanczos2 (4 taps). Weights are normalized to sum to 1.
vec3 sample_single_scanline_horizontal(sampler2D tex, vec2 tex_uv, vec2 tex_size, vec2 texture_size_inv)
{
    vec2 sample_texel = tex_uv * tex_size;
    // Use under_half to fix a rounding bug right around exact texel locations.
    vec2 left_center = floor(sample_texel - vec2(under_half)) + vec2(0.5);
    // Snap only x to the previous texel center; y is kept as-is.
    vec2 left_tap_uv = vec2(left_center.x, sample_texel.y) * texture_size_inv;
    // Distances (in texels) from the sample to the four taps, left to right.
    float d = sample_texel.x - left_center.x;
    vec4 tap_dists = vec4(1.0 + d, d, 1.0 - d, 2.0 - d);
    vec4 raw_weights;
    if(beam_horiz_filter < 0.5)
    {
        // Quilez: quintic smoothstep between the two inner taps.
        float rise = d*d*d*(d*(d*6.0 - 15.0) + 10.0);
        raw_weights = vec4(0.0, 1.0 - rise, rise, 0.0);
    }
    else if(beam_horiz_filter < 1.5)
    {
        // Gaussian with standard deviation beam_horiz_sigma:
        float exp_scale = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
        raw_weights = exp(-(tap_dists*tap_dists)*exp_scale);
    }
    else
    {
        // Lanczos2; FIX_ZERO keeps the denominator away from zero.
        vec4 pi_dists = FIX_ZERO(tap_dists * pi);
        raw_weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5)/(pi_dists * pi_dists);
    }
    // Ensure the weight sum == 1.0:
    vec4 unit_weights = raw_weights/dot(raw_weights, vec4(1.0));
    // Get the interpolated horizontal scanline color:
    return get_scanline_color(tex, left_tap_uv, vec2(texture_size_inv.x, 0.0), unit_weights);
}
// RGB scanline sampler. In this version it simply forwards to
// sample_single_scanline_horizontal() with the same arguments.
// TODO: Add function requirements.
vec3 sample_rgb_scanline_horizontal(sampler2D tex, vec2 tex_uv, vec2 tex_size, vec2 texture_size_inv)
{
    vec3 scanline_rgb = sample_single_scanline_horizontal(tex, tex_uv, tex_size, texture_size_inv);
    return scanline_rgb;
}
// Monolithic
// Find the scanline of the current field at or just before tex_uv.
// Returns the uv of that scanline's texel center and writes to `dist` the
// sample's distance from it, in units of same-field scanlines.
vec2 get_last_scanline_uv( vec2 tex_uv,
    vec2 tex_size, vec2 texture_size_inv,
    vec2 il_step_multiple,
    float frame_count, out float dist)
{
    // Scanline stride: 1 for progressive, 2 for interlaced sources.
    float line_stride = il_step_multiple.y;
    // Which field this frame shows (0 or 1), flipped by interlace_bff.
    float field_parity = mod(frame_count + float(global.interlace_bff), 2.0);
    // floor(stride * 0.75) is 0 for stride 1 and 1 for stride 2, so the
    // field shift only applies to interlaced sources.
    float field_shift = floor(line_stride * 0.75) * field_parity;
    vec2 sample_texel = tex_uv * tex_size;
    // Use under_half to fix a rounding bug right around exact texel locations.
    vec2 texel_index = floor(sample_texel - vec2(under_half));
    // Step back to the nearest line that belongs to the current field.
    texel_index.y -= mod(texel_index.y + field_shift, line_stride);
    // Snap to the center of the previous scanline in the current field:
    vec2 line_center = texel_index + vec2(0.5);
    // Save the sample's distance from the scanline, in units of scanlines:
    dist = (sample_texel.y - line_center.y)/line_stride;
    return line_center * texture_size_inv;
}
// Guess whether the source is interlaced from its line count.
// Always false when interlace_detect is off.
bool is_interlaced(float num_lines)
{
    if(!interlace_detect)
    {
        return false;
    }
    // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
    // NTSC Emulators: Typically 224 or 240 lines
    // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
    // PAL Emulators: ?
    // ATSC: 720p, 1080i, 1080p
    // Cutoff assumptions:
    // 1.) Only active lines matter.
    // 2.) Anything > 288 and <= 576 lines is probably interlaced.
    // 3.) Anything > 576 lines is probably not interlaced...
    // 4.) ...except 1080 lines, which is left to the user (interlace_1080i).
    // 5.) Thresholds are nudged by half a line in case the host program
    //     passes calculated (non-integer) video sizes.
    bool sd_lines = (num_lines > 288.5) && (num_lines < 576.5);
    bool hd_lines = bool(interlace_1080i) && ((num_lines > 1079.5) && (num_lines < 1080.5));
    return (sd_lines || hd_lines);
}
#endif // SCANLINE_FUNCTIONS_H

View File

@ -0,0 +1,182 @@
#ifndef SPECIAL_FUNCTIONS_H
#define SPECIAL_FUNCTIONS_H
///////////////////////////////// MIT LICENSE ////////////////////////////////
// Copyright (C) 2014 TroggleMonkey
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
/////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////
// Requires: x is the standard parameter to erf().
// Returns:  An Abramowitz/Stegun approximation of erf(), where:
//               erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
//           The original notes state a max absolute error of 2.5*10**-5
//           with solid numerical robustness and efficiency. See:
//           https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
vec3 erf6(vec3 x)
{
    vec3 unit = vec3(1.0);
    // The approximation is evaluated on |x|; the sign is restored at the end.
    vec3 t = unit/(unit + 0.47047*abs(x));
    // Cubic in t without its leading factor of t (Horner form).
    vec3 poly = 0.3480242 + t*(-0.0958798 + t*0.7478556);
    vec3 magnitude = unit - t*poly*exp(-(x*x));
    return magnitude * sign(x);
}
// Requires: x is the standard parameter to erf().
// Returns:  erf() approximated with the hyperbolic tangent. Per the
//           original notes the error is visually noticeable, but it is very
//           fast and perceptually close, at least on ATI hardware. See:
//           http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
// Warning:  Only use this if the hardware drivers implement tanh()
//           correctly; the original author reports garbage output on an
//           nVidia 8800GTS.
vec3 erft(vec3 x)
{
    vec3 scaled = 1.202760580 * x;
    return tanh(scaled);
}
// Requires: x is the standard parameter to erf().
// Returns:  erf6(x) by default, or the faster erft(x) when
//           ERF_FAST_APPROXIMATION is defined.
vec3 erf(vec3 x)
{
#ifndef ERF_FAST_APPROXIMATION
    return erf6(x);
#else
    return erft(x);
#endif
}
/////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////
// Requires: 1.) s is the standard parameter to the gamma function, and
//               it should lie in the [0, 36] range.
//           2.) s_inv = 1.0/s, precomputed by the caller so it can be
//               reused elsewhere.
// Returns:  Approximate gamma function (real-numbered factorial) output
//           using the Lanczos approximation with two coefficients
//           calculated using Paul Godfrey's method here:
//           http://my.fit.edu/~gabdo/gamma.txt
//           Per the original notes, g ~= 1.12906830989 is optimal for s in
//           [0, 36], with a maximum relative error of 0.000463 over 2**16
//           evaluation points (the original reads "equally evals",
//           presumably equally spaced). Three coefficients would give
//           0.0000346 error without hurting latency, but two allow more
//           parallelism with outside instructions.
vec3 gamma_impl(vec3 s, vec3 s_inv)
{
    vec3 g = vec3(1.12906830989);
    vec3 coeff0 = vec3(0.8109119309638332633713423362694399653724431);
    vec3 coeff1 = vec3(0.4808354605142681877121661197951496120000040);
    vec3 euler = vec3(2.71828182845904523536028747135266249775724709);
    vec3 s_plus_half = s + vec3(0.5);
    vec3 series = coeff0 + coeff1/(s + vec3(1.0));
    vec3 power_base = (s_plus_half + g)/euler;
    // The final multiply by 1/s comes from the caller-supplied s_inv.
    return (pow(power_base, s_plus_half) * series) * s_inv;
}
//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////
// Lower incomplete gamma function for small s and z (implementation):
// Requires: 1.) s < ~0.5
//           2.) z <= ~0.775075
//           3.) s_inv = 1.0/s (precomputed for outside reuse)
// Returns:  A 4-term series representation of the lower incomplete gamma
//           function for small s and small z:
//               z**s * sum over k = 0..3 of (-1)**k * z**k / ((s + k) * k!)
vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv)
{
    vec3 z_squared = z*z;
    // Denominators (s + k) * k! for k = 1, 2, 3, expanded:
    vec3 term1_denom = s + vec3(1.0);
    vec3 term2_denom = 2.0*s + vec3(4.0);
    vec3 term3_denom = 6.0*s + vec3(18.0);
    // The k = 0 term is 1/s; the remaining terms alternate in sign.
    vec3 series = s_inv - z/term1_denom + z_squared/term2_denom - z * z_squared/term3_denom;
    return pow(z, s) * series;
}
// Upper incomplete gamma function for small s and large z (implementation):
// Requires: 1.) s < ~0.5
//           2.) z > ~0.775075
// Returns:  Gauss's continued fraction representation for the upper
//           incomplete gamma function (4 terms).
// The truncated denominator is built from the innermost level outward.
// For level i = 4..1:
//     denom_i = (2*i - 1) + z - s + (i * (s - i)) / denom_(i+1)
// with the innermost level (i = 4) dropping the fraction term.
vec3 uigamma_large_z_impl(vec3 s, vec3 z)
{
    vec3 level4 = vec3(7.0) + z - s;
    vec3 level3 = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/level4;
    vec3 level2 = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/level3;
    vec3 level1 = vec3(1.0) + z - s + (s - vec3(1.0))/level2;
    vec3 leading = pow(z, s) * exp(-z);
    return leading / level1;
}
// Normalized lower incomplete gamma function for small s (implementation):
// Requires: 1.) s < ~0.5
//           2.) s_inv = 1/s (precomputed for outside reuse)
//           3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
// Returns:  Approximate normalized lower incomplete gamma function for
//           s < 0.5. Only two branches (selected by z) are needed in that
//           range. Per the original notes each branch uses four terms with
//           a max relative error of ~0.00182, adapted for fewer terms from
//           Gil/Segura/Temme's paper:
//           http://oai.cwi.nl/oai/asset/20433/20433B.pdf
vec3 normalized_ligamma_impl(vec3 s, vec3 z,
    vec3 s_inv, vec3 gamma_s_inv)
{
    vec3 cutoff = vec3(0.775075);
    // Evaluate both branches for every channel: the original author found
    // real branches slower even when available.
    vec3 from_upper = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
    vec3 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
    // Per-channel selection, applied as a 0/1 arithmetic blend.
    bvec3 use_upper = greaterThan(z, cutoff);
    return from_upper * vec3(use_upper) + from_series * vec3(not(use_upper));
}
#endif // SPECIAL_FUNCTIONS_H