Add crt-royale-fast shaders (#619)

2025-02-19 17:40:36 +00:00 · 2024-08-09 09:57:48 -03:00 · 2024-08-09 09:57:48 -03:00 · 111fcedc3b
commit 111fcedc3b
parent b327343b77
16 changed files with 2423 additions and 0 deletions
--- a/crt/crt-royale-fast.slangp
+++ b/crt/crt-royale-fast.slangp
@ -0,0 +1,93 @@
+# crt-royale-fast: a fast crt-royale adapted from original sources by Hyllian (2024).
+
+shaders = "8"
+
+textures = "mask_grille_texture_small;mask_slot_texture_small;mask_shadow_texture_small"
+mask_grille_texture_small = "shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64BGR.png"
+mask_slot_texture_small = "shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png"
+mask_shadow_texture_small = "shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png"
+mask_grille_texture_small_wrap_mode = "repeat"
+mask_slot_texture_small_wrap_mode = "repeat"
+mask_shadow_texture_small_wrap_mode = "repeat"
+mask_grille_texture_small_linear = "true"
+mask_slot_texture_small_linear = "true"
+mask_shadow_texture_small_linear = "true"
+mask_grille_texture_small_mipmap = "false"  # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+mask_slot_texture_small_mipmap = "false"    # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+mask_shadow_texture_small_mipmap = "false"  # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+
+# Pass0: Linearize the input based on CRT gamma and bob interlaced fields.
+# (Bobbing ensures we can immediately blur without getting artifacts.)
+shader0 = "shaders/crt-royale/src-fast/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang"
+alias0 = "ORIG_LINEARIZED"
+filter_linear0 = "false"
+scale_type0 = "source"
+scale0 = "1.0"
+srgb_framebuffer0 = "true"
+
+
+# Pass1: Resample interlaced scanlines vertically.
+# Separating vertical/horizontal scanline sampling is faster: It lets us
+# consider more scanlines while calculating weights for fewer pixels, and
+# it reduces our samples from vertical*horizontal to vertical+horizontal.
+# This has to come right after ORIG_LINEARIZED, because there's no
+# "original_source" scale_type we can use later.
+shader1 = "shaders/crt-royale/src-fast/crt-royale-scanlines-vertical-interlacing.slang"
+alias1 = "VERTICAL_SCANLINES"
+filter_linear1 = "true"
+scale_type_x1 = "source"
+scale_x1 = "1.0"
+scale_type_y1 = "viewport"
+scale_y1 = "1.0"
+srgb_framebuffer1 = "true"
+
+# Pass2: Resize the phosphor mask vertically.
+shader2 = "shaders/crt-royale/src-fast/crt-royale-mask-resize-vertical.slang"
+filter_linear2 = "true"
+scale_type_x2 = "absolute"
+scale_x2 = "64"
+scale_type_y2 = "viewport"
+scale_y2 = "0.0625" # Safe for >= 341.333 horizontal triads at viewport size
+#srgb_framebuffer2 = "false" # mask_texture is already assumed linear
+
+# Pass3: Resize the phosphor mask horizontally.  scale_x3 = scale_y2.
+shader3 = "shaders/crt-royale/src-fast/crt-royale-mask-resize-horizontal.slang"
+alias3 = "MASK_RESIZE"
+filter_linear3 = "false"
+scale_type_x3 = "viewport"
+scale_x3 = "0.0625"
+scale_type_y3 = "source"
+scale_y3 = "1.0"
+#srgb_framebuffer3 = "false" # mask_texture is already assumed linear
+
+# Pass4: Resample scanlines horizontally, apply the phosphor mask.
+shader4 = "shaders/crt-royale/src-fast/crt-royale-scanlines-horizontal-apply-mask.slang"
+alias4 = "MASKED_SCANLINES"
+filter_linear4 = "true" # This could just as easily be nearest neighbor.
+scale_type4 = "viewport"
+scale4 = "1.0"
+srgb_framebuffer4 = "true"
+
+# Pass5: Compute a brightpass.  This will require reading the final mask.
+shader5 = "shaders/crt-royale/src-fast/crt-royale-brightpass.slang"
+alias5 = "BRIGHTPASS"
+filter_linear5 = "true" # This could just as easily be nearest neighbor.
+scale_type5 = "viewport"
+scale5 = "1.0"
+srgb_framebuffer5 = "true"
+
+# Pass6: Blur the brightpass vertically
+shader6 = "shaders/crt-royale/src-fast/crt-royale-bloom-vertical.slang"
+filter_linear6 = "true" # This could just as easily be nearest neighbor.
+scale_type6 = "source"
+scale6 = "1.0"
+srgb_framebuffer6 = "true"
+
+# Pass7: Blur the brightpass horizontally and combine it with the dimpass:
+shader7 = "shaders/crt-royale/src-fast/crt-royale-bloom-horizontal-reconstitute.slang"
+filter_linear7 = "true"
+scale_type7 = "source"
+scale7 = "1.0"
+srgb_framebuffer7 = "true"
+wrap_mode7 = "clamp_to_edge"
+
--- a/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64BGR.png
+++ b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64BGR.png
--- a/crt/shaders/crt-royale/src-fast/bind-shader-params.h
+++ b/crt/shaders/crt-royale/src-fast/bind-shader-params.h
@ -0,0 +1,158 @@
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/*
+    crt-royale-fast: a fast crt-royale adapted from original sources by Hyllian (2024).
+
+    Aims to deliver a fast shader with crt-royale visual style by sacrificing some
+    of its complex features.
+*/
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+layout(std140, set = 0, binding = 0) uniform UBO   //  MVP plus the values behind the "#pragma parameter" entries below.
+{
+	mat4 MVP;
+	float crt_gamma;
+	float lcd_gamma;
+	float levels_contrast;
+	float bloom_underestimate_levels;
+	float bloom_excess;                     //  No short-name #define below; read as global.bloom_excess.
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;     //  No short-name #define below.
+	float mask_type;
+	float mask_triad_size_desired;          //  No short-name #define below; read as global.mask_triad_size_desired.
+	float geom_aspect_ratio_x;              //  No "#pragma parameter" line exists for this member in this header.
+	float geom_aspect_ratio_y;              //  No "#pragma parameter" line exists for this member in this header.
+	float interlace_bff;
+	float interlace_1080i;
+	float interlace_detect_toggle;          //  Exposed as the bool interlace_detect below.
+} global;
+
+
+#pragma parameter crt_gamma "Simulated CRT Gamma" 2.5 1.0 5.0 0.025
+#define crt_gamma global.crt_gamma
+#pragma parameter lcd_gamma "Your Display Gamma" 2.2 1.0 5.0 0.025
+#define lcd_gamma global.lcd_gamma
+#pragma parameter levels_contrast "Contrast" 1.0 0.0 4.0 0.015625
+#define levels_contrast global.levels_contrast
+#pragma parameter bloom_underestimate_levels "Bloom - Underestimate Levels" 0.8 0.0 5.0 0.01
+#define bloom_underestimate_levels global.bloom_underestimate_levels
+#pragma parameter bloom_excess "Bloom - Excess" 0.0 0.0 1.0 0.005
+#pragma parameter beam_min_sigma "Beam - Min Sigma" 0.02 0.005 1.0 0.005
+#define beam_min_sigma global.beam_min_sigma
+#pragma parameter beam_max_sigma "Beam - Max Sigma" 0.3 0.005 1.0 0.005
+#define beam_max_sigma global.beam_max_sigma
+#pragma parameter beam_spot_power "Beam - Spot Power" 0.33 0.01 16.0 0.01
+#define beam_spot_power global.beam_spot_power
+#pragma parameter beam_min_shape "Beam - Min Shape" 2.0 2.0 32.0 0.1
+#define beam_min_shape global.beam_min_shape
+#pragma parameter beam_max_shape "Beam - Max Shape" 4.0 2.0 32.0 0.1
+#define beam_max_shape global.beam_max_shape
+#pragma parameter beam_shape_power "Beam - Shape Power" 0.25 0.01 16.0 0.01
+#define beam_shape_power global.beam_shape_power
+#pragma parameter beam_horiz_filter "Beam - Horiz Filter" 0.0 0.0 2.0 1.0
+#define beam_horiz_filter global.beam_horiz_filter
+#pragma parameter beam_horiz_sigma "Beam - Horiz Sigma" 0.35 0.0 0.67 0.005
+#define beam_horiz_sigma global.beam_horiz_sigma
+#pragma parameter beam_horiz_linear_rgb_weight "Beam - Horiz Linear RGB Weight" 1.0 0.0 1.0 0.01
+#pragma parameter mask_type "Mask - Type" 0.0 0.0 2.0 1.0
+#define mask_type global.mask_type
+#pragma parameter mask_triad_size_desired "Mask - Triad Size Desired" 3.0 1.0 18.0 0.125
+#pragma parameter interlace_detect_toggle "Interlacing - Toggle" 1.0 0.0 1.0 1.0
+bool interlace_detect = bool(global.interlace_detect_toggle);   //  true when the "Interlacing - Toggle" parameter is non-zero.
+#pragma parameter interlace_bff "Interlacing - Bottom Field First" 0.0 0.0 1.0 1.0
+//#define interlace_bff global.interlace_bff   (alias left disabled; NOTE(review): users presumably read global.interlace_bff directly -- confirm before re-enabling, since an alias would break "global.interlace_bff" call sites)
+#pragma parameter interlace_1080i "Interlace - Detect 1080i" 0.0 0.0 1.0 1.0
+#define interlace_1080i global.interlace_1080i
+
+//  LEVELS MANAGEMENT:
+float levels_autodim_temp = 0.5;              //  range (0, 1]
+
+bool  beam_generalized_gaussian = true;
+float beam_antialias_level      = 1.0;        //  range [0, 2]
+float beam_spot_shape_function  = 0.0;
+float beam_spot_power_static    = 1.0/3.0;    //  range (0, 16]
+float beam_min_shape_static     = 2.0;        //  range [2, 32]
+float beam_max_shape_static     = 4.0;        //  range [2, 32]
+
+//  PHOSPHOR MASK:
+float mask_sinc_lobes = 3.0;                  //  range [2, 4]
+float mask_min_allowed_triad_size = 2.0;      //  Combined with mask_triads_per_tile to derive the minimum tile size.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+vec2  mask_resize_viewport_scale = vec2(0.0625, 0.0625);   //  Same value as scale_y2 and scale_x3 in crt-royale-fast.slangp.
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+vec2  mask_texture_small_size = vec2(64.0, 64.0);   //  NOTE(review): presumably the pixel size of the "...ResizeTo64" mask textures -- confirm.
+float mask_triads_per_tile    = 8.0;
+float mask_grille_avg_color   = 53.0/255.0;
+float mask_slot_avg_color     = 46.0/255.0;
+float mask_shadow_avg_color   = 50.0/255.0;
+
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; yields |c| clamped to at least 2^-16 (the sign of c is discarded).
+
+float bloom_approx_filter      = 0.0;
+vec2  mask_resize_src_lut_size = mask_texture_small_size;
+float max_aa_base_pixel_border = 0.0;
+float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;   //  = 0.5
+float max_tiled_pixel_border = max_aniso_pixel_border;
+float max_mask_texel_border = ceil(max_tiled_pixel_border);      //  = 1.0
+float max_mask_tile_border = max_mask_texel_border/
+        (mask_min_allowed_triad_size * mask_triads_per_tile);    //  = 1.0/16.0
+float mask_resize_num_tiles = 1.0 + 2.0 * max_mask_tile_border;  //  = 1.125
+float mask_start_texels = max_mask_texel_border;
+float mask_resize_num_triads = mask_resize_num_tiles * mask_triads_per_tile;   //  = 9.0
+vec2  min_allowed_viewport_triads = vec2(mask_resize_num_triads) / mask_resize_viewport_scale;   //  = vec2(144.0)
+
+//  Calculate {sigma, shape}_range outside of scanline_contrib so it's only
+//  done once per pixel (not 6 times) with runtime params.  Don't reuse the
+//  vertex shader calculations, so static versions can be constant-folded.
+float sigma_range = max(beam_max_sigma, beam_min_sigma) - beam_min_sigma;
+float shape_range = max(beam_max_shape, beam_min_shape) - beam_min_shape;
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+float pi = 3.141592653589;
+float under_half = 0.4995;                    //  Slightly less than 0.5.
+
+//  Provide accessors settings which still need "cooking:"
+float get_mask_amplify()
+{
+    //  Returns the reciprocal of the average color of the mask selected by
+    //  mask_type: below 0.5 -> grille, below 1.5 -> slot, otherwise shadow.
+    if (mask_type < 0.5)
+    {
+        return 1.0/mask_grille_avg_color;
+    }
+    if (mask_type < 1.5)
+    {
+        return 1.0/mask_slot_avg_color;
+    }
+    return 1.0/mask_shadow_avg_color;
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
--- a/crt/shaders/crt-royale/src-fast/bloom-functions.h
+++ b/crt/shaders/crt-royale/src-fast/bloom-functions.h
@ -0,0 +1,133 @@
+#ifndef BLOOM_FUNCTIONS_H
+#define BLOOM_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These utility functions and constants help several passes determine the
+//  size and center texel weight of the phosphor bloom in a uniform manner.
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "blur-functions.h"
+
+///////////////////////////////  BLOOM CONSTANTS  //////////////////////////////
+
+//  Compute constants with manual inlines of the functions below:
+float bloom_diff_thresh = 1.0/256.0;   //  Passed as "thresh" to get_min_sigma_to_blur_triad() by get_bloom_approx_sigma().
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+float get_min_sigma_to_blur_triad(float triad_size, float thresh)
+{
+    //  Smallest Gaussian sigma that blurs one on-screen phosphor triad to an
+    //  even color.
+    //      triad_size: final phosphor triad size, in pixels
+    //      thresh:     largest pixel difference tolerated inside the blurred
+    //                  triad (e.g. 1.0/256.0)
+    //  The closed form is a curve fit (max error = ~0.086036, mean sq.
+    //  error = ~0.0013387).
+    float linear_term = 0.6113*triad_size;
+    float thresh_term = 1.122*triad_size*sqrt(0.000416 + thresh);
+
+    return -0.05168 + linear_term - thresh_term;
+}
+
+float get_absolute_scale_blur_sigma(float thresh)
+{
+    //  Requires:   1.) min_expected_triads must be a global float.  The number
+    //                  of horizontal phosphor triads in the final image must be
+    //                  >= min_allowed_viewport_triads.x for realistic results.
+    //              2.) bloom_approx_scale_x must be a global float equal to the
+    //                  absolute horizontal scale of BLOOM_APPROX.
+    //              3.) bloom_approx_scale_x/min_allowed_viewport_triads.x
+    //                  should be <= 1.1658025090 to keep the final result <
+    //                  0.62666015625 (the largest sigma ensuring the largest
+    //                  unused texel weight stays < 1.0/256.0 for a 3x3 blur).
+    //              4.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    Return the minimum Gaussian sigma that will blur the pass
+    //              output as much as it would have taken to blur away
+    //              bloom_approx_scale_x horizontal phosphor triads.
+    //  Description:
+    //  BLOOM_APPROX should look like a downscaled phosphor blur.  Ideally, we'd
+    //  use the same blur sigma as the actual phosphor bloom and scale it down
+    //  to the current resolution with (bloom_approx_scale_x/viewport_size_x), but
+    //  we don't know the viewport size in this pass.  Instead, we'll blur as
+    //  much as it would take to blur away min_allowed_viewport_triads.x.  This
+    //  will blur "more than necessary" if the user actually uses more triads,
+    //  but that's not terrible either, because blurring a constant fraction of
+    //  the viewport may better resemble a true optical bloom anyway (since the
+    //  viewport will generally be about the same fraction of each player's
+    //  field of view, regardless of screen size and resolution).
+    //  Assume an extremely large viewport size for asymptotic results.  NOTE(review): max_viewport_size_x and bloom_approx_scale_x are not declared at global scope in this header, blur-functions.h or bind-shader-params.h as shown -- confirm they exist wherever this header is included.
+
+    float min_sigma = get_min_sigma_to_blur_triad(max_viewport_size_x/min_allowed_viewport_triads.x, thresh);
+
+    return bloom_approx_scale_x/max_viewport_size_x * min_sigma;
+}
+
+float get_center_weight(float sigma)
+{
+    //  The unnormalized center tap of the blur is 1.0, so its normalized
+    //  weight equals the inverse of the Gaussian weight sum for this sigma.
+    float center_weight = get_fast_gaussian_weight_sum_inv(sigma);
+
+    return center_weight;
+}
+
+
+float get_bloom_approx_sigma(float output_size_x_runtime, float estimated_viewport_size_x)
+{
+    //  Requires:   1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
+    //                  This is included for dynamic codepaths just in case the
+    //                  following two globals are incorrect:
+    //              2.) bloom_approx_size_x_for_skip should == the same
+    //                  if PHOSPHOR_BLOOM_FAKE is #defined
+    //              3.) bloom_approx_size_x should == the same otherwise
+    //  Returns:    For gaussian4x4, return a dynamic small bloom sigma that's
+    //              as close to optimal as possible given available information.
+    //              For blur3x3, return the a static small bloom sigma that
+    //              works well for typical cases.  Otherwise, we're using simple
+    //              bilinear filtering, so use static calculations.
+    //  Assume the default static value.  This is a compromise that ensures
+    //  typical triads are blurred, even if unusually large ones aren't.  NOTE(review): neither parameter is read by the body below, and mask_num_triads_desired_static, bloom_approx_size_x and beam_max_sigma_static are not declared in the headers shown here -- confirm before including this file.
+    float mask_num_triads_static    = max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
+
+    //  Assume an extremely large viewport size for asymptotic results:
+    float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+
+    //  We're either using blur3x3 or bilinear filtering.  The biggest
+    //  reason to choose blur3x3 is to avoid dynamic weights, so use a
+    //  static calculation.
+    float output_size_x_static = bloom_approx_size_x;
+
+    float asymptotic_triad_size = max_viewport_size_x/mask_num_triads_static;
+    float asymptotic_sigma      = get_min_sigma_to_blur_triad(asymptotic_triad_size, bloom_diff_thresh);
+    float bloom_approx_sigma    = asymptotic_sigma * output_size_x_static/max_viewport_size_x;
+
+    //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+    //  try accounting for the Gaussian scanline sigma from the last pass
+    //  too; use the static default value:
+    return length(vec2(bloom_approx_sigma, beam_max_sigma_static));
+}
+
+#endif  //  BLOOM_FUNCTIONS_H
+
--- a/crt/shaders/crt-royale/src-fast/blur-functions.h
+++ b/crt/shaders/crt-royale/src-fast/blur-functions.h
@ -0,0 +1,148 @@
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+
+        //  blurN_std_dev values are specified in terms of dxdy strides.
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        float blur3_std_dev = 0.62666015625;   //  Same value quoted as the 3x3 limit in bloom-functions.h.
+        float blur4_std_dev = 0.66171875;
+        float blur5_std_dev = 0.9845703125;
+        float blur6_std_dev = 1.02626953125;
+        float blur7_std_dev = 1.36103515625;
+        float blur8_std_dev = 1.4080078125;
+        float blur9_std_dev = 1.7533203125;
+        float blur10_std_dev = 1.80478515625;
+        float blur11_std_dev = 2.15986328125;
+        float blur12_std_dev = 2.215234375;
+        float blur17_std_dev = 3.45535583496;
+        float blur25_std_dev = 5.3409576416;
+        float blur31_std_dev = 6.86488037109;
+        float blur43_std_dev = 10.1852050781;
+
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    float error_blurring = 0.5;   //  Not referenced by any function in this header.
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+
+vec4 uv2_to_uv4(vec2 tex_uv)
+{
+    //  Widen a 2D uv (or uv offset) to four components, zero-filling z and w,
+    //  so it can be added to vec4 tex2Dlod-style coordinates.
+    vec4 padded_uv = vec4(0.0);
+    padded_uv.xy = tex_uv;
+
+    return padded_uv;
+}
+
+//  Make a length squared helper macro (for usage with static constants); the argument is expanded twice, so pass a side-effect-free expression:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Approximates 1.0/(sum of Gaussian blur weights) for the given sigma.
+    //  Because the unnormalized center weight is 1.0, the same number is the
+    //  normalized center-texel weight.  The asymptotic closed form would be
+    //  erf(0.5/(sigma*sqrt(2.0))); the curve fit used here was built from 64
+    //  blur sizes in [3, 131) and 255 equally-spaced sigmas in
+    //  (0, blurN_std_dev), with a max error of 0.0031793913.
+    float double_exp_fit    = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    float inverse_sigma_fit = 0.399334576340352/sigma;
+
+    return min(double_exp_fit, inverse_sigma_fit);
+}
+
+
+
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  1D Gaussian blur built from 9 taps spaced dxdy apart and centered on
+    //  tex_uv.  The tap k strides from the center (k = 0..4) is weighted by
+    //  exp(-k*k/(2*sigma*sigma)); the sum is divided by the weight total.
+    float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    float weight_center = 1.0;
+    float weight_1 = exp(-1.0 * inv_two_sigma_sq);
+    float weight_2 = exp(-4.0 * inv_two_sigma_sq);
+    float weight_3 = exp(-9.0 * inv_two_sigma_sq);
+    float weight_4 = exp(-16.0 * inv_two_sigma_sq);
+    float normalizer = 1.0 / (weight_center + 2.0 * (weight_1 + weight_2 + weight_3 + weight_4));
+
+    //  Accumulate from the most negative offset to the most positive one:
+    vec3 accum = vec3(0.0);
+
+    accum += weight_4 * texture(tex, tex_uv - 4.0 * dxdy).rgb;
+    accum += weight_3 * texture(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += weight_2 * texture(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += weight_1 * texture(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += weight_center * texture(tex, tex_uv).rgb;
+    accum += weight_1 * texture(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += weight_2 * texture(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += weight_3 * texture(tex, tex_uv + 3.0 * dxdy).rgb;
+    accum += weight_4 * texture(tex, tex_uv + 4.0 * dxdy).rgb;
+
+    return accum * normalizer;
+}
+
+vec3 tex2Dblur17fastest(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float weight_sum_inv, vec4 w1_8, vec4 w1_8_ratio)
+{
+    //  1D blur from 9 lookups: the center texel plus four taps per side.
+    //  w1_8.{x,y,z,w} weight the taps placed (1, 3, 5, 7) + w1_8_ratio.{x,y,z,w}
+    //  strides of dxdy away from tex_uv; the center tap has weight 1.0 and
+    //  the total is scaled by the caller-supplied weight_sum_inv.
+    float center_weight = 1.0;
+    vec2 offset_x = (1.0 + w1_8_ratio.x) * dxdy;
+    vec2 offset_y = (3.0 + w1_8_ratio.y) * dxdy;
+    vec2 offset_z = (5.0 + w1_8_ratio.z) * dxdy;
+    vec2 offset_w = (7.0 + w1_8_ratio.w) * dxdy;
+
+    //  Accumulate from the most negative offset to the most positive one:
+    vec3 accum = vec3(0.0);
+
+    accum += w1_8.w * texture(tex, tex_uv - offset_w).rgb;
+    accum += w1_8.z * texture(tex, tex_uv - offset_z).rgb;
+    accum += w1_8.y * texture(tex, tex_uv - offset_y).rgb;
+    accum += w1_8.x * texture(tex, tex_uv - offset_x).rgb;
+    accum += center_weight * texture(tex, tex_uv).rgb;
+    accum += w1_8.x * texture(tex, tex_uv + offset_x).rgb;
+    accum += w1_8.y * texture(tex, tex_uv + offset_y).rgb;
+    accum += w1_8.z * texture(tex, tex_uv + offset_z).rgb;
+    accum += w1_8.w * texture(tex, tex_uv + offset_w).rgb;
+
+    return accum * weight_sum_inv;
+}
+
+
+#endif  //  BLUR_FUNCTIONS_H
+
--- a/crt/shaders/crt-royale/src-fast/crt-royale-bloom-horizontal-reconstitute.slang
+++ b/crt/shaders/crt-royale/src-fast/crt-royale-bloom-horizontal-reconstitute.slang
@ -0,0 +1,206 @@
+#version 450
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+layout(push_constant) uniform Push   //  Per-pass size vectors; this file reads SourceSize.z and OutputSize.xy.
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	vec4 MASKED_SCANLINESSize;   //  Named after alias4 ("MASKED_SCANLINES") in crt-royale-fast.slangp.
+	vec4 BRIGHTPASSSize;         //  Named after alias5 ("BRIGHTPASS") in crt-royale-fast.slangp.
+} params;
+
+#define MASKED_SCANLINEStexture MASKED_SCANLINES   //  Sampler alias used by the fragment main().
+#define MASKED_SCANLINEStexture_size params.MASKED_SCANLINESSize.xy
+#define MASKED_SCANLINESvideo_size params.MASKED_SCANLINESSize.xy   //  Same .xy as MASKED_SCANLINEStexture_size.
+#define BRIGHTPASStexture BRIGHTPASS   //  Sampler alias used by the fragment main().
+#define BRIGHTPASStexture_size params.BRIGHTPASSSize.xy
+#define BRIGHTPASSvideo_size params.BRIGHTPASSSize.xy   //  Same .xy as BRIGHTPASStexture_size.
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#include "bind-shader-params.h"
+
+///////////////////////////////  VERTEX INCLUDES  //////////////////////////////
+
+#include "scanline-functions.h"
+
+#define GAMMA_OUT(color)    pow(color, vec3(1.0 / lcd_gamma))   //  Raises each channel to 1/lcd_gamma ("Your Display Gamma").
+
+float bloom_diff_thresh_   = 1.0/256.0;   //  Same value as bloom_diff_thresh in bloom-functions.h.
+float mask_min_allowed_tile_size  = ceil(mask_min_allowed_triad_size * mask_triads_per_tile);   //  = ceil(2.0 * 8.0) with the bind-shader-params.h values.
+
+struct st_gauss{   //  Return type of get_blur_fastest_w1_8().
+    vec4 w1_8;         //  Sums of neighboring Gaussian weights: (w1+w2, w3+w4, w5+w6, w7+w8).
+    vec4 w1_8_ratio;   //  Per-pair sample offsets, in texels before main() scales them.
+};
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 video_uv;
+layout(location = 1) out float bloom_sigma_runtime;   //  Sigma from get_min_sigma_to_blur_triad().
+layout(location = 2) out vec4 w1_8;                   //  Paired tap weights from get_blur_fastest_w1_8().
+layout(location = 3) out vec4 w1_8_ratio;             //  Tap offsets, already multiplied by params.SourceSize.z in main().
+layout(location = 4) out float weight_sum_inv;        //  Normalization factor applied to the blurred sum.
+layout(location = 5) out float undim_mask_contrast_factors;   //  (1/levels_autodim_temp) * get_mask_amplify() * levels_contrast.
+
+// copied from bloom-functions.h (get_min_sigma_to_blur_triad) and blur-functions.h (get_fast_gaussian_weight_sum_inv)
+float get_min_sigma_to_blur_triad(float triad_size, float thresh)
+{
+    //  Curve-fit estimate of the smallest sigma that blurs a triad of
+    //  triad_size pixels to an even color, within thresh.
+    float linear_term = 0.6113*triad_size;
+    float thresh_term = 1.122*triad_size*sqrt(0.000416 + thresh);
+
+    return -0.05168 + linear_term - thresh_term;
+}
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Curve-fit approximation of 1.0/(sum of Gaussian blur weights); the
+    //  result is the smaller of the two fitted expressions.
+    float double_exp_fit    = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    float inverse_sigma_fit = 0.399334576340352/sigma;
+
+    return min(double_exp_fit, inverse_sigma_fit);
+}
+
+st_gauss get_blur_fastest_w1_8(float sigma)
+{
+    //  Unnormalized Gaussian weights exp(-k*k/(2*sigma*sigma)) for the texel
+    //  offsets k = 1..8 (the center weight is 1.0 and is not returned).
+    float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    float g1 = exp(-1.0 * inv_two_sigma_sq);
+    float g2 = exp(-4.0 * inv_two_sigma_sq);
+    float g3 = exp(-9.0 * inv_two_sigma_sq);
+    float g4 = exp(-16.0 * inv_two_sigma_sq);
+    float g5 = exp(-25.0 * inv_two_sigma_sq);
+    float g6 = exp(-36.0 * inv_two_sigma_sq);
+    float g7 = exp(-49.0 * inv_two_sigma_sq);
+    float g8 = exp(-64.0 * inv_two_sigma_sq);
+
+    st_gauss taps;
+
+    //  Each tap stands in for two neighboring texels and carries their
+    //  combined weight:
+    taps.w1_8 = vec4(g1 + g2, g3 + g4, g5 + g6, g7 + g8);
+
+    //  Its offset is the odd texel's offset plus the even texel's share of
+    //  the pair weight:
+    taps.w1_8_ratio = vec4(g2/(taps.w1_8.x) + 1.0,
+                           g4/(taps.w1_8.y) + 3.0,
+                           g6/(taps.w1_8.z) + 5.0,
+                           g8/(taps.w1_8.w) + 7.0);
+
+    return taps;
+}
+
+vec2 get_resized_mask_tile_size(vec2 estimated_viewport_size, vec2 estimated_mask_resize_output_size, bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Returns the integer size, in pixels, of one resized mask tile.
+    //  estimated_viewport_size is accepted for interface compatibility but is
+    //  not read here.
+
+    //  Aspect of the source mask tile (height/width and its reciprocal):
+    float aspect_inv = mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    float aspect     = 1.0/aspect_inv;
+    vec2  aspect_vec = vec2(1.0, aspect_inv);
+
+    //  Requested tile width, capped at the source width so we never upsize:
+    float requested_x = mask_triads_per_tile * global.mask_triad_size_desired;
+    float capped_x    = min(requested_x, mask_resize_src_lut_size.x);
+
+    //  Clamp both dimensions between the smallest allowed tile and the
+    //  largest tile that still fits the mask-resize pass output:
+    vec2 lower_bound = mask_min_allowed_tile_size * aspect_vec;
+    vec2 upper_bound = estimated_mask_resize_output_size / mask_resize_num_tiles;
+    vec2 clamped     = clamp(capped_x * aspect_vec, lower_bound, upper_bound);
+
+    //  Shrink whichever dimension the clamp left too large for the aspect.
+    //  The y limit only follows x when the caller vouches for identical
+    //  inputs in every pass:
+    float limit_x = clamped.y * aspect;
+    float limit_y = mix(clamped.y, clamped.x * aspect_inv, float(solemnly_swear_same_inputs_for_every_pass));
+    vec2 reclamped = min(clamped, vec2(limit_x, limit_y));
+
+    //  Tiled sampling needs whole-pixel tiles.  The tiny bias keeps floor()
+    //  from lowering values that are already whole numbers:
+    return floor(reclamped + vec2(FIX_ZERO(0.0)));
+}
+
+void main()
+{
+    //  Vertex stage: pass through position/uv and precompute everything the
+    //  fragment stage needs for the horizontal 17-texel bloom blur.
+    gl_Position = global.MVP * Position;
+    video_uv = TexCoord;
+
+    //  Derive the bloom sigma from the horizontal size of one resized mask
+    //  tile (and therefore of one phosphor triad) at the current output size:
+    float mask_tile_size_x = get_resized_mask_tile_size(params.OutputSize.xy, params.OutputSize.xy * mask_resize_viewport_scale, false).x;
+
+    bloom_sigma_runtime = get_min_sigma_to_blur_triad(mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_);
+
+    st_gauss blur_weights = get_blur_fastest_w1_8(bloom_sigma_runtime);
+
+    //  Tap offsets come back in texels; SourceSize.z converts them to uv:
+    w1_8       = blur_weights.w1_8;
+    w1_8_ratio = blur_weights.w1_8_ratio * params.SourceSize.z;
+
+    //  Normalize by the weights tex2Dblur17fastest() actually sums (center
+    //  weight 1.0 plus both sides of the four paired taps).  The asymptotic
+    //  estimate from get_fast_gaussian_weight_sum_inv() assumes an untruncated
+    //  Gaussian, so for large triads (sigma above blur17_std_dev) the 17-texel
+    //  kernel summed to less than 1.0 and dimmed the bloom.
+    weight_sum_inv = 1.0 / (1.0 + 2.0 * dot(blur_weights.w1_8, vec4(1.0)));
+
+    //  Undo the scanline auto-dim, compensate for the mask's average
+    //  darkness, and apply the user contrast in a single factor:
+    float undim_factor = 1.0/levels_autodim_temp;
+
+    undim_mask_contrast_factors = undim_factor * get_mask_amplify() * levels_contrast;
+}
+
+
+
+#pragma stage fragment
+layout(location = 0) in vec2 video_uv;
+layout(location = 1) in float bloom_sigma_runtime;   //  Not read by the fragment main() below.
+layout(location = 2) in vec4 w1_8;
+layout(location = 3) in vec4 w1_8_ratio;             //  Horizontal tap offsets; added to tex_uv.x as-is.
+layout(location = 4) in float weight_sum_inv;
+layout(location = 5) in float undim_mask_contrast_factors;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;             //  Output of the previous pass (bloom-vertical in the preset).
+layout(set = 0, binding = 3) uniform sampler2D BRIGHTPASS;         //  alias5 in crt-royale-fast.slangp.
+layout(set = 0, binding = 4) uniform sampler2D MASKED_SCANLINES;   //  alias4 in crt-royale-fast.slangp.
+#define bloom_texture Source
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+vec3 tex2Dblur17fastest(sampler2D tex, vec2 tex_uv, float weight_sum_inv, vec4 w1_8, vec4 w1_8_ratio)
+{
+    //  Horizontal blur from 9 lookups: the center texel (weight 1.0) plus
+    //  four taps per side.  w1_8 holds the tap weights and w1_8_ratio the
+    //  matching horizontal uv offsets; the total is scaled by weight_sum_inv.
+    float center_weight = 1.0;
+    vec2 offset_x = vec2(w1_8_ratio.x, 0.0);
+    vec2 offset_y = vec2(w1_8_ratio.y, 0.0);
+    vec2 offset_z = vec2(w1_8_ratio.z, 0.0);
+    vec2 offset_w = vec2(w1_8_ratio.w, 0.0);
+
+    //  Accumulate from the leftmost tap to the rightmost one:
+    vec3 accum = vec3(0.0);
+
+    accum += w1_8.w * texture(tex, tex_uv - offset_w).rgb;
+    accum += w1_8.z * texture(tex, tex_uv - offset_z).rgb;
+    accum += w1_8.y * texture(tex, tex_uv - offset_y).rgb;
+    accum += w1_8.x * texture(tex, tex_uv - offset_x).rgb;
+    accum += center_weight * texture(tex, tex_uv).rgb;
+    accum += w1_8.x * texture(tex, tex_uv + offset_x).rgb;
+    accum += w1_8.y * texture(tex, tex_uv + offset_y).rgb;
+    accum += w1_8.z * texture(tex, tex_uv + offset_z).rgb;
+    accum += w1_8.w * texture(tex, tex_uv + offset_w).rgb;
+
+    return accum * weight_sum_inv;
+}
+
+
+void main()
+{
+    //  Fragment stage: finish the bloom blur horizontally and recombine it
+    //  with the masked scanlines.
+    vec3 blurred_brightpass = tex2Dblur17fastest(bloom_texture, video_uv, weight_sum_inv, w1_8, w1_8_ratio);
+
+    //  Sample the masked scanlines and the unblurred brightpass (rgb only):
+    vec3 intensity_dim  = texture(MASKED_SCANLINEStexture, video_uv).rgb;
+    vec3 brightpass     = texture(BRIGHTPASStexture, video_uv).rgb;
+
+    //  The dimpass is what remains of the masked scanlines once the
+    //  brightpass is removed.  Add the blurred brightpass back, then undim
+    //  (from scanline auto-dim) and amplify (from mask dim) the result:
+    vec3 phosphor_bloom = (intensity_dim - brightpass + blurred_brightpass) * undim_mask_contrast_factors;
+
+    //  pow() is undefined for a negative base, and the subtraction above is
+    //  not guaranteed to stay non-negative after framebuffer quantization,
+    //  so clamp at zero before gamma-encoding:
+    FragColor = vec4(GAMMA_OUT(max(phosphor_bloom, vec3(0.0))), 1.0);
+}
--- a/crt/shaders/crt-royale/src-fast/crt-royale-bloom-vertical.slang
+++ b/crt/shaders/crt-royale/src-fast/crt-royale-bloom-vertical.slang
@ -0,0 +1,204 @@
+#version 450
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+	vec4 MASKED_SCANLINESSize;
+} params;
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#include "bind-shader-params.h"
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//  Threshold passed as the `thresh` argument of get_min_sigma_to_blur_triad():
+float bloom_diff_thresh_   = 1.0/256.0;
+//  Lower bound used when clamping the resized mask tile size: the minimum
+//  allowed triad size times the triads per tile, rounded up.
+float mask_min_allowed_tile_size  = ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Result bundle of get_blur_fastest_w1_8(): per-tap blur weights and the
+//  matching sample positions.
+struct st_gauss{
+    //  Summed Gaussian weights of texel pairs (1,2), (3,4), (5,6), (7,8):
+    vec4 w1_8;
+    //  Sample position of each pair, measured in texels from the center
+    //  (before the caller scales it to uv units):
+    vec4 w1_8_ratio;
+};
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 tex_uv;
+layout(location = 1) out float bloom_sigma_runtime;
+layout(location = 2) out vec4 w1_8;
+layout(location = 3) out vec4 w1_8_ratio;
+layout(location = 4) out float weight_sum_inv;
+
+// copied from bloom-functions.h
+float get_min_sigma_to_blur_triad(float triad_size, float thresh)
+{
+    //  Requires:   1.) triad_size is the final phosphor triad size in pixels.
+    //              2.) thresh is the largest pixel difference tolerated in
+    //                  the blurred triad (e.g. 1.0/256.0).
+    //  Returns:    The smallest sigma that blurs an on-screen phosphor triad
+    //              to an even color, within thresh.
+    //  The closed form below was found by curve-fitting data
+    //  (max error = ~0.086036, mean sq. error = ~0.0013387).  A simpler fit,
+    //  0.5985*triad_size - triad_size*sqrt(thresh), is less accurate
+    //  (max error = ~0.16486, mean sq. error = ~0.0041041).
+    float linear_term = 0.6113 * triad_size;
+    float root_term   = 1.122 * triad_size * sqrt(0.000416 + thresh);
+
+    return -0.05168 + linear_term - root_term;
+}
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Returns:    A curve-fitted estimate of the inverse Gaussian weight
+    //              sum for the given sigma.
+    //  Background: the unnormalized center weight is 1.0, so the normalized
+    //  center weight equals the inverse of the weight sum.  For a large
+    //  enough blur (9+) the Gaussian integral gives an asymptotic answer:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //  which, because erf(-x) == -erf(x), reduces to
+    //      erf((0.5/sqrt(2.0))/sigma)
+    //  The fit used here is both faster and closer than that asymptote.  It
+    //  was built from 64 blur sizes in [3, 131) and 255 equally-spaced sigmas
+    //  in (0, blurN_std_dev), so results for smaller sigmas are biased toward
+    //  smaller blurs.  The max error is 0.0031793913.
+    //  Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    float double_exp_fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    float reciprocal_fit = 0.399334576340352/sigma;
+
+    return min(double_exp_fit, reciprocal_fit);
+}
+
+st_gauss get_blur_fastest_w1_8(float sigma)
+{
+    //  Requires:   sigma is the Gaussian standard deviation, in texels.
+    //  Returns:    For texel pairs (1,2), (3,4), (5,6) and (7,8) on one side
+    //              of the center: the summed weight of each pair (w1_8) and
+    //              the position at which to sample it (w1_8_ratio), in texels
+    //              from the center.
+    //  The unnormalized weight of a texel at distance d is
+    //  exp(-d*d * 0.5/(sigma*sigma)); the center texel's weight is 1.0.
+    float denom_inv = 0.5/(sigma*sigma);
+
+    //  Weights of the nearer texel (1, 3, 5, 7) and the farther texel
+    //  (2, 4, 6, 8) of each pair:
+    vec4 near_weights = exp(vec4(-1.0,  -9.0, -25.0, -49.0) * denom_inv);
+    vec4 far_weights  = exp(vec4(-4.0, -16.0, -36.0, -64.0) * denom_inv);
+
+    st_gauss blur_weights;
+
+    blur_weights.w1_8 = near_weights + far_weights;
+
+    //  Start at the nearer texel and move toward the farther one by the
+    //  farther texel's share of the pair weight:
+    blur_weights.w1_8_ratio = far_weights/blur_weights.w1_8 + vec4(1.0, 3.0, 5.0, 7.0);
+
+    return blur_weights;
+}
+
+vec2 get_resized_mask_tile_size(vec2 estimated_viewport_size, vec2 estimated_mask_resize_output_size, bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   1.) estimated_mask_resize_output_size is the estimated
+    //                  size of the mask-resize output.
+    //              2.) solemnly_swear_same_inputs_for_every_pass selects
+    //                  whether the y size is re-derived from the x size.
+    //  Returns:    The floored (integer-valued) size of one resized tile.
+    //  Note:       estimated_viewport_size is accepted but never read here.
+
+    //  Tile aspect, taken from the source LUT (stated tile properties must
+    //  be correct):
+    float aspect_y_over_x = mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    float aspect_x_over_y = 1.0/aspect_y_over_x;
+    vec2  aspect_vec      = vec2(1.0, aspect_y_over_x);
+
+    //  Desired tile width, capped at the LUT width so we never upsize:
+    float wanted_x = min(mask_triads_per_tile * global.mask_triad_size_desired, mask_resize_src_lut_size.x);
+
+    //  Enforce the minimum and maximum tile size in both dimensions:
+    vec2 lower_bound = mask_min_allowed_tile_size * aspect_vec;
+    vec2 upper_bound = estimated_mask_resize_output_size / mask_resize_num_tiles;
+    vec2 bounded     = clamp(wanted_x * aspect_vec, lower_bound, upper_bound);
+
+    //  Shrink whichever dimension exceeds what the other allows at the
+    //  tile's aspect.  The y limit only takes x into account when the
+    //  caller swears every pass sees the same inputs:
+    float x_from_y = bounded.y * aspect_x_over_y;
+    float y_from_x = mix(bounded.y, bounded.x * aspect_y_over_x, float(solemnly_swear_same_inputs_for_every_pass));
+    vec2 rebounded = min(bounded, vec2(x_from_y, y_from_x));
+
+    //  Tiled sampling needs integer tile sizes in both directions.  Floor
+    //  (so we never round up), nudged by FIX_ZERO to avoid a rounding bug
+    //  where floor decreases whole numbers:
+    return floor(rebounded + vec2(FIX_ZERO(0.0)));
+}
+
+//  Vertex entry point: computes the runtime bloom sigma once per vertex and
+//  hands the resulting blur weights and vertical uv offsets to the fragment
+//  stage.
+void main()
+{
+    gl_Position = global.MVP * Position;
+    tex_uv      = TexCoord * 1.0001;
+
+    //  uv distance between output pixels, computed like
+    //  blurs/shaders/vertex-shader-blur-fast-vertical.h:
+    vec2 src_per_out = params.SourceSize.xy/params.OutputSize.xy;
+    vec2 texel_step  = src_per_out/params.SourceSize.xy;
+
+    //  Derive a runtime sigma from the resized mask tile width:
+    vec2  out_size      = params.OutputSize.xy;
+    float tile_width    = get_resized_mask_tile_size(out_size, out_size * mask_resize_viewport_scale, false).x;
+    bloom_sigma_runtime = get_min_sigma_to_blur_triad(tile_width / mask_triads_per_tile, bloom_diff_thresh_);
+
+    //  Pass the tap weights through unchanged; scale the tap positions
+    //  into uv offsets along y:
+    st_gauss taps = get_blur_fastest_w1_8(bloom_sigma_runtime);
+    w1_8          = taps.w1_8;
+    w1_8_ratio    = taps.w1_8_ratio * texel_step.y;
+
+    weight_sum_inv = get_fast_gaussian_weight_sum_inv(bloom_sigma_runtime);
+}
+
+#pragma stage fragment
+#pragma format R8G8B8A8_SRGB
+layout(location = 0) in vec2 tex_uv;
+layout(location = 1) in float bloom_sigma_runtime;
+layout(location = 2) in vec4 w1_8;
+layout(location = 3) in vec4 w1_8_ratio;
+layout(location = 4) in float weight_sum_inv;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+//  1D blur along y built from nine texture fetches: one center tap plus four
+//  mirrored taps above and below it.
+//  Requires:   1.) w1_8 holds the weight of each side tap; components x..w
+//                  pair up with the same components of w1_8_ratio.
+//              2.) w1_8_ratio holds the uv offsets along y for those taps.
+//              3.) weight_sum_inv is the factor applied to the final sum.
+//  Returns:    The weighted rgb sum of all taps, scaled by weight_sum_inv.
+vec3 tex2Dblur17fastest(sampler2D tex, vec2 tex_uv, float weight_sum_inv, vec4 w1_8, vec4 w1_8_ratio)
+{
+    //  Vertical offsets, one per tap pair:
+    vec2 off_x = vec2(0.0, w1_8_ratio.x);
+    vec2 off_y = vec2(0.0, w1_8_ratio.y);
+    vec2 off_z = vec2(0.0, w1_8_ratio.z);
+    vec2 off_w = vec2(0.0, w1_8_ratio.w);
+
+    //  Accumulate from the negative-y side to the positive-y side.  The
+    //  center tap has a weight of exactly 1.0, so it is added unscaled:
+    vec3 acc = vec3(0.0);
+    acc += w1_8.w * texture(tex, tex_uv - off_w).rgb;
+    acc += w1_8.z * texture(tex, tex_uv - off_z).rgb;
+    acc += w1_8.y * texture(tex, tex_uv - off_y).rgb;
+    acc += w1_8.x * texture(tex, tex_uv - off_x).rgb;
+    acc += texture(tex, tex_uv).rgb;
+    acc += w1_8.x * texture(tex, tex_uv + off_x).rgb;
+    acc += w1_8.y * texture(tex, tex_uv + off_y).rgb;
+    acc += w1_8.z * texture(tex, tex_uv + off_z).rgb;
+    acc += w1_8.w * texture(tex, tex_uv + off_w).rgb;
+
+    return acc * weight_sum_inv;
+}
+
+//  Fragment entry point: outputs Source blurred along y, with alpha 1.0.
+void main()
+{
+    vec3 blurred = tex2Dblur17fastest(Source, tex_uv, weight_sum_inv, w1_8, w1_8_ratio);
+
+    FragColor = vec4(blurred, 1.0);
+}
--- a/crt/shaders/crt-royale/src-fast/crt-royale-brightpass.slang
+++ b/crt/shaders/crt-royale/src-fast/crt-royale-brightpass.slang
@ -0,0 +1,150 @@
+#version 450
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+	vec4 MASKED_SCANLINESSize;
+} params;
+
+#define MASKED_SCANLINEStexture MASKED_SCANLINES
+#define MASKED_SCANLINEStexture_size params.MASKED_SCANLINESSize.xy
+#define MASKED_SCANLINESvideo_size params.MASKED_SCANLINESSize.xy
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#include "bind-shader-params.h"
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//  Threshold passed as the `thresh` argument of get_min_sigma_to_blur_triad():
+float bloom_diff_thresh_ = 1.0/256.0;
+//  Lower bound used when clamping the resized mask tile size: the minimum
+//  allowed triad size times the triads per tile, rounded up.
+float mask_min_allowed_tile_size  = ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 tex_uv;
+layout(location = 1) out float center_weight;
+layout(location = 2) out float undim_mask_contrast_factors;
+
+// copied from bloom-functions.h
+float get_min_sigma_to_blur_triad(float triad_size, float thresh)
+{
+    //  Closed-form estimate of a blur sigma from a triad size and a
+    //  threshold: a constant, a term linear in triad_size, and a term
+    //  scaled by sqrt(0.000416 + thresh).
+    float linear_term = 0.6113 * triad_size;
+    float root_term   = 1.122 * triad_size * sqrt(0.000416 + thresh);
+
+    return -0.05168 + linear_term - root_term;
+}
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Returns the smaller of two closed-form expressions in sigma: a
+    //  double exponential and a scaled reciprocal.
+    float double_exp_fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    float reciprocal_fit = 0.399334576340352/sigma;
+
+    return min(double_exp_fit, reciprocal_fit);
+}
+
+//  Computes the integer-valued size of one resized phosphor mask tile.
+//  Requires:   1.) estimated_mask_resize_output_size is the estimated size of
+//                  the mask-resize output; it bounds the tile size from above.
+//              2.) solemnly_swear_same_inputs_for_every_pass selects whether
+//                  the y size is re-derived from the x size.
+//  Returns:    The floored tile size.
+//  Note:       estimated_viewport_size is accepted but never read here.
+vec2 get_resized_mask_tile_size(vec2 estimated_viewport_size, vec2 estimated_mask_resize_output_size, bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Stated tile properties must be correct:
+    float tile_aspect_ratio_inv = mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    float tile_aspect_ratio     = 1.0/tile_aspect_ratio_inv;
+    vec2  tile_aspect           = vec2(1.0, tile_aspect_ratio_inv);
+
+    //  Tile width implied by the desired triad size:
+    float desired_tile_size_x = mask_triads_per_tile * global.mask_triad_size_desired;
+
+    //  Make sure we're not upsizing:
+    float temp_tile_size_x = min(desired_tile_size_x, mask_resize_src_lut_size.x);
+
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    vec2 temp_tile_size    = temp_tile_size_x * tile_aspect;
+    vec2 min_tile_size     = mask_min_allowed_tile_size * tile_aspect;
+    vec2 max_tile_size     = estimated_mask_resize_output_size / mask_resize_num_tiles;
+    vec2 clamped_tile_size = clamp(temp_tile_size, min_tile_size, max_tile_size);
+
+    //  Shrink whichever dimension exceeds what the other allows at the tile's
+    //  aspect.  With the flag false, mix() selects clamped_tile_size.y, so the
+    //  y size is left as clamped:
+    float x_tile_size_from_y = clamped_tile_size.y * tile_aspect_ratio;
+    float y_tile_size_from_x = mix(clamped_tile_size.y, clamped_tile_size.x * tile_aspect_ratio_inv, float(solemnly_swear_same_inputs_for_every_pass));
+    vec2 reclamped_tile_size = vec2(min(clamped_tile_size.x, x_tile_size_from_y), min(clamped_tile_size.y, y_tile_size_from_x));
+
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    vec2 final_resized_tile_size = floor(reclamped_tile_size + vec2(FIX_ZERO(0.0)));
+
+    return final_resized_tile_size;
+}
+
+//  Vertex entry point: precomputes the blur center weight and the combined
+//  undim/mask-amplify/contrast factor for the fragment stage.
+void main()
+{
+    gl_Position = global.MVP * Position;
+    tex_uv      = TexCoord;
+
+    //  Derive a runtime bloom sigma from the resized mask tile width:
+    vec2  out_size   = params.OutputSize.xy;
+    float tile_width = get_resized_mask_tile_size(out_size, out_size * mask_resize_viewport_scale, false).x;
+    float sigma      = get_min_sigma_to_blur_triad(tile_width / mask_triads_per_tile, bloom_diff_thresh_);
+
+    //  The inverse weight sum doubles as the blur's center-texel weight:
+    center_weight = get_fast_gaussian_weight_sum_inv(sigma);
+
+    //  Inverse of levels_autodim_temp, times the mask amplification, times
+    //  the contrast level:
+    undim_mask_contrast_factors = (1.0/levels_autodim_temp) * get_mask_amplify() * levels_contrast;
+}
+
+#pragma stage fragment
+layout(location = 0) in vec2 tex_uv;
+layout(location = 1) in float center_weight;
+layout(location = 2) in float undim_mask_contrast_factors;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D MASKED_SCANLINES;
+layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+//  Fragment entry point: estimates how much of each masked-scanline pixel
+//  should be handed to the bloom blur and outputs that portion.
+void main()
+{
+    //  Masked scanlines at this pixel:
+    vec3 scan_dim = texture(MASKED_SCANLINEStexture, tex_uv).rgb;
+
+    //  Full intensity, including auto-undimming and mask compensation:
+    vec3 scan_full = scan_dim * undim_mask_contrast_factors;
+
+    //  Sample ORIG_LINEARIZED as the estimate of what a straight blur of
+    //  the masked scanlines would look like, so we can estimate how much
+    //  energy this pixel will receive from blooming neighbors:
+    vec3 blur_estimate = levels_contrast * texture(ORIG_LINEARIZED, tex_uv).rgb;
+
+    //  The most energy we expect from neighbors is whatever the estimate
+    //  holds beyond the center texel's own weighted share:
+    vec3 neighbor_energy = max(vec3(0.0, 0.0, 0.0), blur_estimate - center_weight * scan_full);
+
+    //  Assume neighbors blur 100% of their intensity (blur_ratio = 1.0),
+    //  which is simple and gives better results, but scale all intensities
+    //  by the user's desired underestimate factor:
+    vec3 neighbor_under = bloom_underestimate_levels * neighbor_energy;
+    vec3 scan_under     = bloom_underestimate_levels * scan_full;
+
+    //  Ratio of this pixel's intensity that we want to blur, kept in [0, 1]:
+    vec3 ratio_raw = ((vec3(1.0, 1.0, 1.0) - neighbor_under) / scan_under - vec3(1.0, 1.0, 1.0)) / (center_weight - 1.0);
+    vec3 ratio     = clamp(ratio_raw, 0.0, 1.0);
+
+    //  The brightpass is taken from the auto-dimmed, unamplified masked
+    //  scanlines; global.bloom_excess pushes the ratio toward 1.0:
+    vec3 brightpass_out = scan_dim * mix(ratio, vec3(1.0, 1.0, 1.0), global.bloom_excess);
+
+    FragColor = vec4(brightpass_out, 1.0);
+}
--- a/crt/shaders/crt-royale/src-fast/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
+++ b/crt/shaders/crt-royale/src-fast/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
@ -0,0 +1,97 @@
+#version 450
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "bind-shader-params.h"
+#include "scanline-functions.h"
+
+#define GAMMA_IN(color)    pow(color, vec3(crt_gamma))
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 tex_uv;
+layout(location = 1) out vec2 uv_step;
+layout(location = 2) out float interlaced;
+
+//  Vertex entry point: forwards the texture coordinate, the size of one
+//  source texel in uv units, and the interlacing flag.
+void main()
+{
+    gl_Position = global.MVP * Position;
+    tex_uv      = TexCoord * 1.00001;
+    uv_step     = vec2(1.0)/params.SourceSize.xy;
+
+    //  Detect interlacing from the source height: 1.0 = true, 0.0 = false.
+    interlaced  = float(is_interlaced(params.SourceSize.y));
+}
+
+#pragma stage fragment
+#pragma format R8G8B8A8_SRGB
+layout(location = 0) in vec2 tex_uv;
+layout(location = 1) in vec2 uv_step;
+layout(location = 2) in float interlaced;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+//  Fragment entry point: linearizes the input with GAMMA_IN and, when
+//  interlace detection is enabled, bobs interlaced fields.
+//  Bobbing ensures we can immediately blur without getting artifacts.
+//  Note: TFF/BFF won't matter for sources that double-weave or similar.
+void main()
+{
+    if(!bool(interlace_detect))
+    {
+        //  No interlace handling requested: decode gamma and finish.
+        FragColor = vec4(GAMMA_IN(texture(input_texture, tex_uv).rgb), 1.0);
+    }
+    else
+    {
+        //  Gamma-decode the current line and its vertical neighbors:
+        vec2 line_step = vec2(0.0, uv_step.y);
+
+        vec3 line_here  = GAMMA_IN(texture(input_texture, tex_uv            ).rgb);
+        vec3 line_above = GAMMA_IN(texture(input_texture, tex_uv - line_step).rgb);
+        vec3 line_below = GAMMA_IN(texture(input_texture, tex_uv + line_step).rgb);
+
+        vec3 neighbor_average = 0.5 * (line_above + line_below);
+
+        //  If we're interlacing, work out which field this frame shows.
+        //  The modulus is 2.0 when interlaced and 1.0 otherwise:
+        float field_count  = interlaced + 1.0;
+        float frame_field  = mod(params.FrameCount + global.interlace_bff, field_count);
+        float texel_line_y = tex_uv.y * params.SourceSize.y;
+
+        //  Use under_half to fix a rounding bug around exact texel locations.
+        float line_index   = floor(texel_line_y - under_half);
+        float in_off_field = mod(line_index + frame_field, field_count);
+
+        //  Keep the current line when it belongs to this frame's field,
+        //  otherwise use the average of its neighbors:
+        FragColor = vec4(mix(line_here, neighbor_average, in_off_field), 1.0);
+    }
+}
--- a/crt/shaders/crt-royale/src-fast/crt-royale-mask-resize-horizontal.slang
+++ b/crt/shaders/crt-royale/src-fast/crt-royale-mask-resize-horizontal.slang
@ -0,0 +1,114 @@
+#version 450
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#include "bind-shader-params.h"
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "phosphor-mask-resizing.h"
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 src_tex_uv_wrap;
+layout(location = 1) out vec2 tile_uv_wrap;
+layout(location = 2) out vec2 resize_magnification_scale;
+layout(location = 3) out vec2 src_dxdy;
+layout(location = 4) out vec2 tile_size_uv;
+layout(location = 5) out vec2 input_tiles_per_texture;
+
+//  Vertex entry point: derives the tile and texture coordinates needed to
+//  resize the phosphor mask horizontally in the fragment stage.
+void main()
+{
+    gl_Position = global.MVP * Position;
+
+    vec2 out_size = params.OutputSize.xy;
+    vec2 src_size = params.SourceSize.xy;
+
+    //  Estimate the viewport size first (the user will get the wrong number
+    //  of triads if it's wrong and mask_specify_num_triads is 1.0/true).
+    vec2 viewport_estimate = out_size / mask_resize_viewport_scale;
+
+    //  Final size of the resized mask tiles.  The previous pass probably
+    //  estimated the viewport and MASK_RESIZE output sizes differently, so
+    //  do not swear the inputs were the same. ;)
+    vec2 final_tile_size = get_resized_mask_tile_size(viewport_estimate, out_size, false);
+
+    //  Resized tiles are rendered until the output FBO is full or a limit is
+    //  met, so compute [wrapped] tile uv coords from the output uv coords
+    //  and the number of tiles that fit in the FBO.
+    tile_uv_wrap = TexCoord * (out_size / final_tile_size);
+
+    //  Texel size of an input tile and the values derived from it:
+    vec2 src_tile_size      = vec2(min(mask_resize_src_lut_size.x, src_size.x), final_tile_size.y);
+    tile_size_uv            = src_tile_size / src_size;
+    input_tiles_per_texture = src_size / src_tile_size;
+
+    //  [Wrapped] texture uv coords follow from the [wrapped] tile uv coords
+    //  and the tile size in uv coords; fract() is left to the fragment stage.
+    src_tex_uv_wrap = tile_uv_wrap * tile_size_uv;
+
+    //  Magnification scale and the horizontal one-texel step:
+    resize_magnification_scale = final_tile_size / src_tile_size;
+    src_dxdy                   = vec2(1.0/src_size.x, 0.0);
+}
+
+#pragma stage fragment
+layout(location = 0) in vec2 src_tex_uv_wrap;
+layout(location = 1) in vec2 tile_uv_wrap;
+layout(location = 2) in vec2 resize_magnification_scale;
+layout(location = 3) in vec2 src_dxdy;
+layout(location = 4) in vec2 tile_size_uv;
+layout(location = 5) in vec2 input_tiles_per_texture;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+//  Fragment entry point.  The input contains one mask tile horizontally and
+//  a number vertically.  Resize the tile horizontally to its final screen
+//  size and repeat it until at least mask_resize_num_tiles are drawn, leaving
+//  it unchanged vertically.  Lanczos-resizing the phosphor mask gives much
+//  sharper results than mipmapping, and outputting >= mask_resize_num_tiles
+//  makes for easier tiled sampling later.
+void main()
+{
+    //  Fragments past the required number of tiles are not needed; discard
+    //  them in case our profile allows real branches.
+    bool within_needed_tiles = max(tile_uv_wrap.x, tile_uv_wrap.y) <= mask_resize_num_tiles;
+
+    if(!within_needed_tiles)
+    {
+        discard;
+    }
+    else
+    {
+        vec2 wrapped_uv = fract(src_tex_uv_wrap);
+        vec3 resized    = downsample_horizontal_sinc_tiled(input_texture, wrapped_uv, params.SourceSize.xy, src_dxdy.x, resize_magnification_scale.x, tile_size_uv.x);
+
+        //  The input LUT was linear RGB, and so is our output:
+        FragColor = vec4(resized, 1.0);
+    }
+}
--- a/crt/shaders/crt-royale/src-fast/crt-royale-mask-resize-vertical.slang
+++ b/crt/shaders/crt-royale/src-fast/crt-royale-mask-resize-vertical.slang
@ -0,0 +1,120 @@
+#version 450
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#include "bind-shader-params.h"
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "phosphor-mask-resizing.h"
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 src_tex_uv_wrap;
+layout(location = 1) out vec2 resize_magnification_scale;
+
+//  Vertex entry point: derives the wrapped tile uv coords and the
+//  magnification scale needed to resize the phosphor mask vertically.
+void main()
+{
+    gl_Position = global.MVP * Position;
+
+    vec2 out_size = params.OutputSize.xy;
+
+    //  Estimate the viewport size first (the user will get the wrong number
+    //  of triads if it's wrong and mask_specify_num_triads is 1.0/true).
+    vec2 viewport_estimate = out_size / mask_resize_viewport_scale.yy;
+
+    //  Estimated output size of MASK_RESIZE (the next pass).  The x component
+    //  shouldn't matter: the x result is unused, and we're not swearing it's
+    //  correct (if we did, the x result would influence the y result to
+    //  maintain the tile aspect ratio).
+    vec2 next_pass_output_estimate = out_size;
+
+    //  Final intended [y] size of the resized tiles, then the tile size
+    //  for this pass, which resizes y only:
+    vec2 final_tile_size = get_resized_mask_tile_size(viewport_estimate, next_pass_output_estimate, false);
+    vec2 pass_tile_size  = vec2(min(mask_resize_src_lut_size.x, out_size.x), final_tile_size.y);
+
+    //  Resized tiles are rendered until the output FBO is full or a limit is
+    //  met.  The input LUT is a single mask tile, so texture uv coords equal
+    //  tile uv coords (fract() is left to the fragment stage):
+    src_tex_uv_wrap            = TexCoord * (out_size / pass_tile_size);
+    resize_magnification_scale = pass_tile_size / mask_resize_src_lut_size;
+}
+
+#pragma stage fragment
+layout(location = 0) in vec2 src_tex_uv_wrap;
+layout(location = 1) in vec2 resize_magnification_scale;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_small;
+layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_small;
+layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_small;
+
+//  Fragment entry point.  Resize the input phosphor mask tile to the final
+//  vertical size it will appear on screen.  Keep 1x horizontal size if
+//  possible (output size >= mask_resize_src_lut_size), and otherwise linearly
+//  sample horizontally to fit exactly one tile.  Lanczos-resizing the
+//  phosphor mask gives much sharper results than mipmapping, and resizing
+//  vertically first minimizes the total number of taps required.  We output
+//  a number of resized tiles >= mask_resize_num_tiles for easier tiled
+//  sampling later.
+void main()
+{
+    //  Fragments past the required number of tiles are not needed; discard
+    //  them in case our profile allows real branches.
+    bool within_needed_tiles = src_tex_uv_wrap.y <= mask_resize_num_tiles;
+
+    if(!within_needed_tiles)
+    {
+        discard;
+    }
+    else
+    {
+        float lut_dy     = 1.0/mask_resize_src_lut_size.y;
+        vec2  wrapped_uv = fract(src_tex_uv_wrap);
+        vec3  resized;
+
+        //  mask_type picks the LUT: below 0.5 grille, below 1.5 slot mask,
+        //  anything else shadow mask.
+        if(mask_type < 0.5)
+        {
+            resized = downsample_vertical_sinc_tiled(mask_grille_texture_small, wrapped_uv, mask_resize_src_lut_size, lut_dy, resize_magnification_scale.y, 1.0);
+        }
+        else if(mask_type < 1.5)
+        {
+            resized = downsample_vertical_sinc_tiled(mask_slot_texture_small, wrapped_uv, mask_resize_src_lut_size, lut_dy, resize_magnification_scale.y, 1.0);
+        }
+        else
+        {
+            resized = downsample_vertical_sinc_tiled(mask_shadow_texture_small, wrapped_uv, mask_resize_src_lut_size, lut_dy, resize_magnification_scale.y, 1.0);
+        }
+
+        //  The input LUT was linear RGB, and so is our output:
+        FragColor = vec4(resized, 1.0);
+    }
+}
--- a/crt/shaders/crt-royale/src-fast/crt-royale-scanlines-horizontal-apply-mask.slang
+++ b/crt/shaders/crt-royale/src-fast/crt-royale-scanlines-horizontal-apply-mask.slang
@ -0,0 +1,112 @@
+#version 450
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+//  Push constants for the horizontal-scanline / apply-mask pass.
+//  In this file only OutputSize, SourceSize, VERTICAL_SCANLINESSize and
+//  MASK_RESIZESize are read, and only through their .x/.y components; the
+//  last two are wrapped by the VERTICAL_SCANLINES*/MASK_RESIZE* macros below.
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	vec4 VERTICAL_SCANLINESSize;
+	vec4 MASK_RESIZESize;
+} params;
+
+//  Legacy-style aliases for the two aliased input passes.  For each pass the
+//  "texture_size" and "video_size" names expand to the same .xy of that pass's
+//  Size push constant, and the "texture" name expands to its sampler.
+#define VERTICAL_SCANLINEStexture VERTICAL_SCANLINES
+#define VERTICAL_SCANLINEStexture_size params.VERTICAL_SCANLINESSize.xy
+#define VERTICAL_SCANLINESvideo_size params.VERTICAL_SCANLINESSize.xy
+#define MASK_RESIZEtexture MASK_RESIZE
+#define MASK_RESIZEtexture_size params.MASK_RESIZESize.xy
+#define MASK_RESIZEvideo_size params.MASK_RESIZESize.xy
+
+//  NOTE(review): this divides an x size by a y size (OutputSize.x by
+//  SourceSize.y) and is not referenced anywhere in this file; presumably
+//  it exists for the included headers -- confirm before changing or removing.
+float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y;
+//  Not referenced in this file either; presumably consumed by an include.
+float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#include "bind-shader-params.h"
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+#include "scanline-functions.h"
+#include "phosphor-mask-resizing.h"
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 video_uv;
+layout(location = 1) out vec2 scanline_texture_size_inv;
+layout(location = 2) out vec4 mask_tile_start_uv_and_size;
+layout(location = 3) out vec2 mask_tiles_per_screen;
+
+//  Vertex stage: forwards the texture coordinate and precomputes values that
+//  are constant per draw so the fragment stage doesn't recompute them.
+void main()
+{
+    gl_Position = global.MVP * Position;
+    video_uv    = TexCoord;
+
+    //  Reciprocal of the VERTICAL_SCANLINES size, used for texel stepping:
+    scanline_texture_size_inv = vec2(1.0, 1.0)/VERTICAL_SCANLINEStexture_size;
+
+    //  Compute the resized mask tile's start uv (xy) and uv size (zw) inside
+    //  MASK_RESIZE, plus how many tiles span the output viewport.  The
+    //  "texture" and "video" sizes of MASK_RESIZE are the same value here.
+    mask_tile_start_uv_and_size = get_mask_sampling_parameters(
+        MASK_RESIZEtexture_size,
+        MASK_RESIZEvideo_size,
+        params.OutputSize.xy,
+        mask_tiles_per_screen);
+}
+
+#pragma stage fragment
+layout(location = 0) in vec2 video_uv;
+layout(location = 1) in vec2 scanline_texture_size_inv;
+layout(location = 2) in vec4 mask_tile_start_uv_and_size;
+layout(location = 3) in vec2 mask_tiles_per_screen;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+layout(set = 0, binding = 3) uniform sampler2D VERTICAL_SCANLINES;
+layout(set = 0, binding = 4) uniform sampler2D MASK_RESIZE;
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+//  Fragment stage: horizontally resample the vertically-interpolated
+//  scanlines and multiply the result by the resized phosphor mask.
+//  No other processing is applied to the color in this pass.
+void main()
+{
+    //  Horizontally sample the current row of VERTICAL_SCANLINES:
+    vec3 scanline_color_dim = sample_rgb_scanline_horizontal(VERTICAL_SCANLINEStexture, video_uv, VERTICAL_SCANLINEStexture_size, scanline_texture_size_inv);
+
+    //  Scale the screen uv into wrapped tile units, then map that into the
+    //  sub-rectangle of MASK_RESIZE that holds one resized tile:
+    vec2 mask_tex_uv = convert_phosphor_tile_uv_wrap_to_tex_uv(video_uv * mask_tiles_per_screen, mask_tile_start_uv_and_size);
+
+    //  Sample the resized mask:
+    vec3 phosphor_mask_sample = texture(MASK_RESIZEtexture, mask_tex_uv).rgb;
+
+    //  Apply the phosphor mask and output:
+    FragColor = vec4(scanline_color_dim * phosphor_mask_sample, 1.0);
+}
--- a/crt/shaders/crt-royale/src-fast/crt-royale-scanlines-vertical-interlacing.slang
+++ b/crt/shaders/crt-royale/src-fast/crt-royale-scanlines-vertical-interlacing.slang
@ -0,0 +1,126 @@
+#version 450
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+//  Push constants for the vertical-scanline pass.  This file reads:
+//    OutputSize.y            - to compute the output pixel height in scanlines
+//    FrameCount              - converted to float and passed to
+//                              get_last_scanline_uv()
+//    ORIG_LINEARIZEDSize.xy  - used as the input texture size
+//    ORIG_LINEARIZEDSize.zw  - used as the inverse input texture size
+//  SourceSize and OriginalSize are declared but not read in this file.
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+	vec4 ORIG_LINEARIZEDSize;
+} params;
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "bind-shader-params.h"
+#include "scanline-functions.h"
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 tex_uv;
+layout(location = 1) out vec2 uv_step;                     //  uv size of a texel (x) and scanline (y)
+layout(location = 2) out vec2 il_step_multiple;            //  (1, 1) = progressive, (1, 2) = interlaced
+layout(location = 3) out float pixel_height_in_scanlines;  //  Height of an output pixel in scanlines
+
+//  Vertex stage: derives the per-draw scanline stepping values from the
+//  ORIG_LINEARIZED size and the output height.
+void main()
+{
+    gl_Position = global.MVP * Position;
+    //  The texture coordinate is scaled by 1.00001 before interpolation.
+    tex_uv      = TexCoord * 1.00001;
+
+    vec2 video_size_ = params.ORIG_LINEARIZEDSize.xy;
+
+    //  Detect interlacing: the y step multiple between lines of one field is
+    //  1 when is_interlaced() is false and 2 when it is true.
+    il_step_multiple = vec2(1.0, 1.0 + float(is_interlaced(video_size_.y)));
+
+    //  uv step between one texel (x) and one same-field scanline (y):
+    uv_step = il_step_multiple / video_size_;
+
+    //  Output pixel height measured in same-field scanlines, needed for
+    //  antialiased/integral sampling:
+    pixel_height_in_scanlines = (video_size_.y / params.OutputSize.y) / il_step_multiple.y;
+}
+
+#pragma stage fragment
+#pragma format R8G8B8A8_SRGB
+layout(location = 0) in vec2 tex_uv;
+layout(location = 1) in vec2 uv_step;                      //  uv size of a texel (x) and scanline (y)
+layout(location = 2) in vec2 il_step_multiple;             //  (1, 1) = progressive, (1, 2) = interlaced
+layout(location = 3) in float pixel_height_in_scanlines;   //  Height of an output pixel in scanlines
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
+
+#define input_texture ORIG_LINEARIZED
+
+//  Fragment stage: sums the beam contributions of the three scanlines
+//  nearest to this fragment (the two bracketing lines plus the next closest
+//  outer line) and outputs the sum scaled by levels_autodim_temp.
+void main()
+{
+    //  This pass: Sample multiple scanlines to the final vertical
+    //  resolution.  Temporarily auto-dim the output to avoid clipping.
+
+    //  Read some attributes into local variables:
+    vec2 texture_size_    = params.ORIG_LINEARIZEDSize.xy;
+    vec2 texture_size_inv = params.ORIG_LINEARIZEDSize.zw;
+
+    float frame_count = float(params.FrameCount);
+    float ph = pixel_height_in_scanlines;
+
+    //  Get the uv coords of the previous scanline (in this field), and the
+    //  scanline's distance from this sample, in scanlines.
+    float dist;
+    vec2  scanline_uv = get_last_scanline_uv(tex_uv, texture_size_, texture_size_inv, il_step_multiple, frame_count, dist);
+
+    //  NOTE: Anisotropic filtering creates interlacing artifacts, which is why
+    //  ORIG_LINEARIZED bobbed any interlaced input before this pass.
+    //  Scanline 2 is the one returned above; scanline 3 is one step below it.
+    vec2 v_step          = vec2(0.0, uv_step.y);
+    vec3 scanline2_color = texture(input_texture, scanline_uv         ).rgb;
+    vec3 scanline3_color = texture(input_texture, scanline_uv + v_step).rgb;
+
+    //  Pick exactly one extra "outside" scanline.  dist is in [0, 1], so
+    //  round(dist) is 0 when the sample is nearer scanline 2 (take scanline 1,
+    //  one step above) and 1 when it is nearer scanline 3 (take scanline 4,
+    //  two steps below scanline 2).
+    float dist_round            = round(dist);
+    vec2 sample_1or4_uv_off     = mix(-v_step, 2.0 * v_step, dist_round);
+    vec3 scanline_outside_color = texture(input_texture, scanline_uv + sample_1or4_uv_off).rgb;
+
+    //  dist2 means "positive sample distance from scanline 2, in scanlines".
+    //  The same distance is used for all three color channels.
+    vec3 dist2 = vec3(dist);
+
+    //  Calculate and sum final scanline contributions, starting with lines 2/3.
+    //  There is no normalization step, because we're not interpolating a
+    //  continuous signal.  Instead, each scanline is an additive light source.
+    vec3 scanline2_contrib  = scanline_contrib(dist2, scanline2_color, ph, sigma_range, shape_range);
+    vec3 scanline3_contrib  = scanline_contrib(abs(vec3(1.0,1.0,1.0) - dist2), scanline3_color, ph, sigma_range, shape_range);
+    vec3 scanline_intensity = scanline2_contrib + scanline3_contrib;
+
+    //  Distance to the outside line: dist2 + 1 for scanline 1, or 2 - dist2
+    //  for scanline 4, selected with the same dist_round as the uv offset.
+    vec3 dist1or4 = mix(dist2 + vec3(1.0,1.0,1.0), vec3(2.0,2.0,2.0) - dist2, dist_round);
+    vec3 scanline1or4_contrib = scanline_contrib(dist1or4, scanline_outside_color, ph, sigma_range, shape_range);
+    scanline_intensity += scanline1or4_contrib;
+
+    //  Auto-dim the image to avoid clipping, and output.  A fixed factor
+    //  (levels_autodim_temp) is used rather than a computed minimal one.
+    FragColor = vec4(scanline_intensity * levels_autodim_temp, 1.0);
+}
--- a/crt/shaders/crt-royale/src-fast/phosphor-mask-resizing.h
+++ b/crt/shaders/crt-royale/src-fast/phosphor-mask-resizing.h
@ -0,0 +1,271 @@
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+        //  NOTE(review): this define is not tested anywhere in the visible part
+        //  of this header; presumably kept from the original codepath
+        //  selection -- confirm before removing.
+        #define USE_SINGLE_STATIC_LOOP
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  See if we can get a static min tile size > mask_min_allowed_tile_size:
+//  Smallest tile size, in pixels: the minimum triad size times the number of
+//  triads per tile, rounded up.
+float mask_min_allowed_tile_size  = ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
+float mask_min_expected_tile_size = mask_min_allowed_tile_size;
+
+//  Limit the number of sinc resize taps by the maximum minification factor:
+//  taps = 2 * lobes * (source LUT width / smallest expected tile size).
+float pi_over_lobes = pi/mask_sinc_lobes;
+float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+//  (This is the tap count both downsample_*_sinc_tiled functions loop over.)
+float max_sinc_resize_samples_m4 = ceil(max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+
+//  Finds the first tap of a `samples`-wide resampling window around tex_uv.
+//  tex_uv/tex_size: sample position and source texture size.
+//  dr: uv size of one texel; input_tiles_per_texture_r: tiles per texture
+//  along the resized axis.  vertical selects the y (true) or x (false) axis.
+//  Returns vec2(first tap's tile uv coordinate, distance in texels from the
+//  first tap to the sample position) for the selected axis.
+vec2 get_first_texel_tile_uv_and_dist(vec2 tex_uv, vec2 tex_size, float dr, float input_tiles_per_texture_r, float samples, bool vertical)
+{
+    //  Texel-space position, and the center of the texel just before it
+    //  (under_half is subtracted before flooring):
+    vec2 texel_pos    = tex_uv * tex_size;
+    vec2 texel_before = floor(texel_pos - vec2(under_half)) + vec2(0.5);
+
+    //  Step back to the first tap of the window and measure its distance:
+    vec2 texel_first = texel_before - vec2(samples/2.0 - 1.0);
+    vec2 dist_first  = texel_pos - texel_first;
+
+    //  Convert texels -> wrapped uv -> wrapped tile uv, so fract() can stand
+    //  in for fmod on the tile coordinate:
+    vec2 tile_uv_wrap_first = (texel_first * dr) * input_tiles_per_texture_r;
+
+    //  The first texel may be negative; add 1.0 on each axis where it is.
+    vec2 is_negative   = vec2((tile_uv_wrap_first.x < 0.), (tile_uv_wrap_first.y < 0.));
+    vec2 tile_uv_first = fract(tile_uv_wrap_first) + is_negative;
+
+    //  Pack the chosen axis' tile coordinate and texel distance:
+    if(vertical)
+    {
+        return vec2(tile_uv_first.y, dist_first.y);
+    }
+
+    return vec2(tile_uv_first.x, dist_first.x);
+}
+
+//  Texture fetch used by the sinc resampling loop bodies.
+//  Mipmapping and anisotropic filtering get confused by sinc-resampling; a
+//  workaround would be to force the lowest mip level, but this version simply
+//  performs a regular texture() lookup and does not select a LOD itself.
+vec4 tex2Dlod0try(sampler2D tex, vec2 tex_uv)
+{
+    vec4 texel = texture(tex, tex_uv);
+
+    return texel;
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+
+    //  These macros are expanded inside the downsample_*_sinc_tiled loops and
+    //  rely on names from the expanding scope: i, i_base,
+    //  first_texel_tile_uv_rrrr, tile_dr, tile_size_uv_r, tex, tex_uv,
+    //  magnification_scale, first_dist_unscaled, pixel_color and weight_sum.
+
+    //  Tap indices for 4 consecutive samples (true_i), their wrapped tile
+    //  coordinates (tile_uv_r) and the matching texture coordinates (tex_uv_r).
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        vec4 true_i = vec4(i_base + i) + vec4(0.0, 1.0, 2.0, 3.0); \
+        vec4 tile_uv_r = fract(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        vec4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    //  Per-tap weights from pi_dist: a sinc multiplied by a wider sinc window
+    //  when PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW is defined, otherwise a plain
+    //  sinc.  Both forms are capped at 1.0 with min().
+    //  NOTE(review): that define is not set anywhere in the visible code.
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            vec4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            vec4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), vec4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            vec4 weights = min(sin(pi_dist)/pi_dist, vec4(1.0));
+    #endif
+
+    //  Scales each tap's distance from the output location by
+    //  magnification_scale, computes the weights, and accumulates the four
+    //  weighted samples (new_sample0..3) and the weights themselves.
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        vec4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        vec4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    //  One loop iteration of a vertical resize: the 4 taps keep tex_uv.x and
+    //  vary the y coordinate.
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        vec3 new_sample0 = tex2Dlod0try(tex,                       \
+            vec2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        vec3 new_sample1 = tex2Dlod0try(tex,                       \
+            vec2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        vec3 new_sample2 = tex2Dlod0try(tex,                       \
+            vec2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        vec3 new_sample3 = tex2Dlod0try(tex,                       \
+            vec2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    //  One loop iteration of a horizontal resize: the 4 taps keep tex_uv.y and
+    //  vary the x coordinate.
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        vec3 new_sample0 = tex2Dlod0try(tex,                       \
+            vec2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        vec3 new_sample1 = tex2Dlod0try(tex,                       \
+            vec2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        vec3 new_sample2 = tex2Dlod0try(tex,                       \
+            vec2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        vec3 new_sample3 = tex2Dlod0try(tex,                       \
+            vec2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+//  Resamples a tiled texture along the vertical axis with sinc-weighted taps.
+//  tex/tex_uv/tex_size: source sampler, sample position and source size.
+//  dr: uv size of one source texel along the resized axis.
+//  magnification_scale: multiplies tap distances before weighting.
+//  tile_size_uv_r: uv size of one tile along the resized axis.
+//  Returns the weighted color sum divided by the sum of all tap weights.
+//  The local names below are referenced by the loop body macros, so they
+//  must not be renamed independently of those macros.
+vec3 downsample_vertical_sinc_tiled(sampler2D tex, vec2 tex_uv, vec2 tex_size, float dr, float magnification_scale, float tile_size_uv_r)
+{
+    //  Fixed tap count (a multiple of 4), shared by every fragment:
+    int samples = int(max_sinc_resize_samples_m4);
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+
+    //  true = vertical resize:
+    vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    vec4 first_texel_tile_uv_rrrr    = first_texel_tile_r_and_dist.xxxx;
+    vec4 first_dist_unscaled         = first_texel_tile_r_and_dist.yyyy;
+
+    //  Get the tile sample offset:
+    float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color.  Each iteration handles
+    //  4 taps at once (i_step); i_base stays 0 in this single-loop version.
+    //  See the loop body macros above.
+    int i_base = 0;
+    int i_step = 4;
+    vec4 weight_sum  = vec4(0.0);
+    vec3 pixel_color = vec3(0.0);
+ 
+    for(int i = 0; i < samples; i += i_step)
+    {
+        VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+    }
+ 
+    //  Normalize so the weight_sum == 1.0, and return:
+    //  (the four per-lane weight sums are reduced to one scalar first)
+    vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    vec3 scalar_weight_sum = vec3(weight_sum_reduce.x + weight_sum_reduce.y);
+
+    return (pixel_color/scalar_weight_sum);
+}
+
+//  Resamples a tiled texture along the horizontal axis with sinc-weighted
+//  taps.  Same contract as downsample_vertical_sinc_tiled, but the taps vary
+//  the x coordinate and keep tex_uv.y.
+//  tex/tex_uv/tex_size: source sampler, sample position and source size.
+//  dr: uv size of one source texel along the resized axis.
+//  magnification_scale: multiplies tap distances before weighting.
+//  tile_size_uv_r: uv size of one tile along the resized axis.
+//  Returns the weighted color sum divided by the sum of all tap weights.
+//  The local names below are referenced by the loop body macros, so they
+//  must not be renamed independently of those macros.
+vec3 downsample_horizontal_sinc_tiled(sampler2D tex, vec2 tex_uv, vec2 tex_size, float dr, float magnification_scale, float tile_size_uv_r)
+{
+    //  Fixed tap count (a multiple of 4), shared by every fragment:
+    int samples = int(max_sinc_resize_samples_m4);
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+
+    //  false = horizontal resize:
+    vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    vec4 first_texel_tile_uv_rrrr    = first_texel_tile_r_and_dist.xxxx;
+    vec4 first_dist_unscaled         = first_texel_tile_r_and_dist.yyyy;
+
+    //  Get the tile sample offset:
+    float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color.  Each iteration handles
+    //  4 taps at once (i_step); i_base stays 0 in this single-loop version.
+    //  See the loop body macros above.
+    int i_base = 0;
+    int i_step = 4;
+    vec4 weight_sum  = vec4(0.0);
+    vec3 pixel_color = vec3(0.0);
+
+    for(int i = 0; i < samples; i += i_step)
+    {
+        HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+    }
+
+    //  Normalize so the weight_sum == 1.0, and return:
+    //  (the four per-lane weight sums are reduced to one scalar first)
+    vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    vec3 scalar_weight_sum = vec3(weight_sum_reduce.x + weight_sum_reduce.y);
+
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+//  Computes the integer size, in pixels, of one resized phosphor mask tile.
+//  estimated_viewport_size: accepted for interface compatibility; not read.
+//  estimated_mask_resize_output_size: size of the resize pass output, which
+//      bounds the tile so that mask_resize_num_tiles tiles fit in it.
+//  solemnly_swear_same_inputs_for_every_pass: when true, the height derived
+//      from the clamped width is also used as an upper bound.
+vec2 get_resized_mask_tile_size(vec2 estimated_viewport_size, vec2 estimated_mask_resize_output_size, bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Aspect of the source LUT tile (stated tile properties must be correct):
+    float lut_height_over_width = mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    float lut_width_over_height = 1.0/lut_height_over_width;
+    vec2  aspect_scale          = vec2(1.0, lut_height_over_width);
+
+    //  Desired tile width, making sure we're not upsizing past the LUT:
+    float tile_width = min(mask_triads_per_tile * global.mask_triad_size_desired, mask_resize_src_lut_size.x);
+
+    //  Enforce the minimum and maximum tile size in both dimensions:
+    vec2 smallest_tile = mask_min_allowed_tile_size * aspect_scale;
+    vec2 largest_tile  = estimated_mask_resize_output_size / mask_resize_num_tiles;
+    vec2 bounded_tile  = clamp(tile_width * aspect_scale, smallest_tile, largest_tile);
+
+    //  Clamping each axis independently can break the aspect ratio, so limit
+    //  each axis again by the size implied by the other one:
+    float width_from_height  = bounded_tile.y * lut_width_over_height;
+    float height_from_width  = mix(bounded_tile.y, bounded_tile.x * lut_height_over_width, float(solemnly_swear_same_inputs_for_every_pass));
+    vec2 aspect_limited_tile = min(bounded_tile, vec2(width_from_height, height_from_width));
+
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), with the
+    //  FIX_ZERO offset guarding against floor decreasing whole numbers:
+    return floor(aspect_limited_tile + vec2(FIX_ZERO(0.0)));
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+//  Computes where one resized mask tile lives inside MASK_RESIZE.
+//  mask_resize_texture_size / mask_resize_video_size: sizes of the resize
+//      pass output.  true_viewport_size: the final output size.
+//  mask_tiles_per_screen (out): number of tiles spanning the viewport.
+//  Returns vec4(tile start uv, tile uv size).
+vec4 get_mask_sampling_parameters(vec2 mask_resize_texture_size, vec2 mask_resize_video_size, vec2 true_viewport_size, out vec2 mask_tiles_per_screen)
+{
+    vec2 tile_size = get_resized_mask_tile_size(true_viewport_size, mask_resize_video_size, false);
+
+    //  mask_tiles_per_screen must be based on the *true* viewport size:
+    mask_tiles_per_screen = true_viewport_size / tile_size;
+
+    //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+    //  size and starts at a nonzero offset to allow for border texels:
+    vec2 tile_uv_size  = tile_size / mask_resize_texture_size;
+    vec2 tile_start_uv = (mask_start_texels/tile_size) * tile_uv_size;
+
+    return vec4(tile_start_uv, tile_uv_size);
+}
+
+//  Maps a wrapped tile coordinate to a uv inside the resized mask texture.
+//  mask_tile_start_uv_and_size: xy = tile start uv, zw = tile uv size.
+vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(vec2 tile_uv_wrap, vec4 mask_tile_start_uv_and_size)
+{
+    vec2 tile_start = mask_tile_start_uv_and_size.xy;
+    vec2 tile_size  = mask_tile_start_uv_and_size.zw;
+
+    //  Keep only the position within the current tile, then scale and offset:
+    return tile_start + fract(tile_uv_wrap) * tile_size;
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
--- a/crt/shaders/crt-royale/src-fast/scanline-functions.h
+++ b/crt/shaders/crt-royale/src-fast/scanline-functions.h
@ -0,0 +1,309 @@
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "special-functions.h"
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+//  Returns the per-channel beam sigma for a color: beam_min_sigma plus
+//  sigma_range times a shaping curve chosen by beam_spot_shape_function.
+vec3 get_gaussian_sigma(vec3 color, float sigma_range)
+{
+    vec3 shaping;
+
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power function: color^beam_spot_power
+        shaping = pow(color, vec3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical function: sqrt(1 - (color - 1)^2)
+        vec3 color_minus_1 = color - vec3(1.0);
+        shaping = sqrt(vec3(1.0) - color_minus_1*color_minus_1);
+    }
+
+    return vec3(beam_min_sigma) + sigma_range * shaping;
+}
+
+//  Returns the per-channel generalized Gaussian shape (beta) for a color:
+//  beam_min_shape + shape_range * color^beam_shape_power.
+vec3 get_generalized_gaussian_beta(vec3 color, float shape_range)
+{
+    vec3 shaped_color = pow(color, vec3(beam_shape_power));
+
+    return beam_min_shape + shape_range * shaped_color;
+}
+
+//  Scanline contribution from integrating a Gaussian beam across one output
+//  pixel: the erf() difference between the pixel's two edges (dist +/- half
+//  the pixel height), halved, scaled by color and divided by pixel_height.
+vec3 scanline_gaussian_integral_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range)
+{
+    vec3 sigma           = get_gaussian_sigma(color, sigma_range);
+    vec3 inv_sigma_sqrt2 = 1.0/(sigma*sqrt(2.0));
+    vec3 half_pixel      = vec3(pixel_height * 0.5);
+
+    vec3 erf_far_edge  = erf((dist + half_pixel)*inv_sigma_sqrt2);
+    vec3 erf_near_edge = erf((dist - half_pixel)*inv_sigma_sqrt2);
+
+    return color * 0.5*(erf_far_edge - erf_near_edge)/pixel_height;
+}
+
+//  Scanline contribution from integrating a generalized Gaussian beam across
+//  one output pixel.  alpha = sqrt(2) * sigma and beta is the shape; the
+//  integral at each pixel edge is sign(d) * normalized_ligamma_impl(1/beta,
+//  (|d|/alpha)^beta, ...), and the edge difference is halved, scaled by color
+//  and divided by pixel_height.
+vec3 scanline_generalized_gaussian_integral_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range, float shape_range)
+{
+    vec3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    vec3 beta  = get_generalized_gaussian_beta(color, shape_range);
+
+    //  Reciprocals reused by both edges:
+    vec3 inv_alpha      = vec3(1.0)/alpha;
+    vec3 inv_beta       = vec3(1.0)/beta;
+    vec3 half_pixel     = vec3(pixel_height * 0.5);
+    vec3 inv_gamma_term = vec3(1.0)/gamma_impl(inv_beta, beta);
+
+    //  Signed distances to the two pixel edges:
+    vec3 far_edge  = dist + half_pixel;
+    vec3 near_edge = dist - half_pixel;
+
+    vec3 integral_far  = sign(far_edge) * normalized_ligamma_impl(inv_beta, pow(abs(far_edge)*inv_alpha, beta), beta, inv_gamma_term);
+    vec3 integral_near = sign(near_edge) * normalized_ligamma_impl(inv_beta, pow(abs(near_edge)*inv_alpha, beta), beta, inv_gamma_term);
+
+    return color * 0.5*(integral_far - integral_near)/pixel_height;
+}
+
+//  Scanline contribution from point-sampling a Gaussian beam at dist.
+//  With beam_antialias_level > 0.5, three samples (at dist and 1/3 of a
+//  pixel to either side) are averaged; otherwise a single sample is used.
+vec3 scanline_gaussian_sampled_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range)
+{
+    vec3 sigma     = get_gaussian_sigma(color, sigma_range);
+    vec3 inv_sigma = vec3(1.0)/sigma;
+
+    //  exp(-d^2 * exponent_scale) * normalization is the Gaussian density:
+    vec3 exponent_scale = 0.5 * inv_sigma * inv_sigma;
+    vec3 normalization  = inv_sigma/sqrt(2.0*pi);
+
+    vec3 center_weight = exp(-( dist* dist)*exponent_scale);
+
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel away in each direction as well:
+        vec3 third_pixel = vec3(pixel_height/3.0);
+        vec3 dist_far    = dist + third_pixel;
+        vec3 dist_near   = abs(dist - third_pixel);
+
+        //  Average three pure Gaussian samples:
+        vec3 scale       = color/3.0  * normalization;
+        vec3 far_weight  = exp(-(dist_far*dist_far)*exponent_scale);
+        vec3 near_weight = exp(-(dist_near*dist_near)*exponent_scale);
+
+        return scale * (center_weight + far_weight + near_weight);
+    }
+
+    return color*center_weight*normalization;
+}
+
+//  Scanline contribution from point-sampling a generalized Gaussian beam,
+//  exp(-(|d|/alpha)^beta) scaled by color * beta / (2 * alpha * Gamma(1/beta)).
+//  With beam_antialias_level > 0.5, three samples (at dist and 1/3 of a
+//  pixel to either side) are averaged; otherwise a single sample is used.
+vec3 scanline_generalized_gaussian_sampled_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range, float shape_range)
+{
+    vec3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    vec3 beta  = get_generalized_gaussian_beta(color, shape_range);
+
+    //  Avoid repeated divides:
+    vec3 inv_alpha = vec3(1.0)/alpha;
+    vec3 inv_beta  = vec3(1.0)/beta;
+    vec3 scale     = color * beta * 0.5 * inv_alpha / gamma_impl(inv_beta, beta);
+
+    vec3 center_weight = exp(-pow(abs( dist*inv_alpha), beta));
+
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel closer to and farther from the scanline too.
+        vec3 third_pixel = vec3(pixel_height/3.0);
+        vec3 dist_far    = dist + third_pixel;
+        vec3 dist_near   = abs(dist - third_pixel);
+
+        //  Average three generalized Gaussian samples:
+        vec3 far_weight  = exp(-pow(abs(dist_far*inv_alpha), beta));
+        vec3 near_weight = exp(-pow(abs(dist_near*inv_alpha), beta));
+
+        return scale/3.0 * (center_weight + far_weight + near_weight);
+    }
+
+    return scale * center_weight;
+}
+
+//  Dispatches to one of the four beam models: generalized or pure Gaussian
+//  (beam_generalized_gaussian), integrated over the pixel when
+//  beam_antialias_level > 1.5 and point-sampled otherwise.
+vec3 scanline_contrib(vec3 dist, vec3 color, float pixel_height, float sigma_range, float shape_range)
+{
+    bool integrate_over_pixel = (beam_antialias_level > 1.5);
+
+    if(beam_generalized_gaussian)
+    {
+        if(integrate_over_pixel)
+        {
+            return scanline_generalized_gaussian_integral_contrib(dist, color, pixel_height, sigma_range, shape_range);
+        }
+
+        return scanline_generalized_gaussian_sampled_contrib(dist, color, pixel_height, sigma_range, shape_range);
+    }
+
+    //  Pure Gaussian variants do not take shape_range:
+    if(integrate_over_pixel)
+    {
+        return scanline_gaussian_integral_contrib(dist, color, pixel_height, sigma_range);
+    }
+
+    return scanline_gaussian_sampled_contrib(dist, color, pixel_height, sigma_range);
+}
+
+
+// 2 - Apply mask only.
+//  Weighted sum of four colors (color0..3 weighted by weights.xyzw),
+//  clamped below at zero.
+vec3 get_raw_interpolated_color(vec3 color0, vec3 color1, vec3 color2, vec3 color3, vec4 weights)
+{
+    //  A 4-column matrix times the weight vector is the weighted sum:
+    mat4x3 taps    = mat4x3(color0, color1, color2, color3);
+    vec3   blended = taps * weights;
+
+    //  Use max to avoid bizarre artifacts from negative colors:
+    return max(blended, 0.0);
+}
+
+//  Blends four linear-RGB taps twice -- once directly and once after
+//  encoding each tap with 1/lcd_gamma -- and mixes the two results by
+//  global.beam_horiz_linear_rgb_weight (0 = gamma-space blend only,
+//  1 = linear blend only).
+vec3 get_interpolated_linear_color(vec3 color0, vec3 color1, vec3 color2, vec3 color3, vec4 weights)
+{
+    float intermediate_gamma = lcd_gamma;
+    vec3  encode_exponent    = vec3(1.0/intermediate_gamma);
+
+    //  Inputs: color0-3 are colors in linear RGB.
+    vec3 linear_mixed_color = get_raw_interpolated_color(color0, color1, color2, color3, weights);
+
+    //  NOTE(review): the gamma-space blend is not decoded back with
+    //  intermediate_gamma before being mixed with the linear blend -- confirm
+    //  whether that is intended.
+    vec3 gamma_mixed_color = get_raw_interpolated_color(
+        pow(color0, encode_exponent),
+        pow(color1, encode_exponent),
+        pow(color2, encode_exponent),
+        pow(color3, encode_exponent),
+        weights);
+
+    return mix(gamma_mixed_color, linear_mixed_color, global.beam_horiz_linear_rgb_weight);
+}
+
+//  Fetches the horizontal taps around scanline_uv and blends them with
+//  weights.  The two inner taps are always read; the two outer taps are only
+//  read when beam_horiz_filter > 0.5 and are black otherwise.
+vec3 get_scanline_color(sampler2D tex, vec2 scanline_uv, vec2 uv_step_x, vec4 weights)
+{
+    vec3 outer_left  = vec3(0.0);
+    vec3 outer_right = vec3(0.0);
+
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left  = texture(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = texture(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+
+    vec3 inner_left  = texture(tex, scanline_uv).rgb;
+    vec3 inner_right = texture(tex, scanline_uv + uv_step_x).rgb;
+
+    return get_interpolated_linear_color(outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+//  Horizontally resamples one scanline row at tex_uv.  The filter is chosen
+//  by beam_horiz_filter: < 0.5 Quilez (2 taps), < 1.5 Gaussian, otherwise
+//  Lanczos2.  Weights are normalized to sum to 1 before blending.
+vec3 sample_single_scanline_horizontal(sampler2D tex, vec2 tex_uv, vec2 tex_size, vec2 texture_size_inv)
+{
+    vec2 texel_pos = tex_uv * tex_size;
+
+    //  Center of the texel to the left of the sample.  Use under_half to fix
+    //  a rounding bug right around exact texel locations.  Only x is snapped;
+    //  y keeps the current row.
+    vec2 snapped_texel = floor(texel_pos - vec2(under_half)) + vec2(0.5);
+    vec2 left_tap      = vec2(snapped_texel.x, texel_pos.y);
+    vec2 left_tap_uv   = left_tap * texture_size_inv;
+
+    //  Distances from the sample to the four taps (left-outer, left, right,
+    //  right-outer), in texels:
+    float left_dist = texel_pos.x - left_tap.x;
+    vec4 tap_dists  = vec4(1.0 + left_dist, left_dist, 1.0 - left_dist, 2.0 - left_dist);
+
+    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
+    vec4 weights;
+
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez: quintic smoothstep between the two inner taps.
+        float t           = tap_dists.y;
+        float right_share = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        weights           = vec4(0.0, 1.0 - right_share, right_share, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian with standard deviation beam_horiz_sigma:
+        float exponent_scale = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        weights              = exp(-(tap_dists*tap_dists)*exponent_scale);
+    }
+    else
+    {
+        //  Lanczos2 (FIX_ZERO is applied to the scaled distances first):
+        vec4 pi_dists = FIX_ZERO(tap_dists * pi);
+        weights       = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5)/(pi_dists * pi_dists);
+    }
+
+    //  Ensure the weight sum == 1.0:
+    vec4 normalized_weights = weights/dot(weights, vec4(1.0));
+
+    //  Get the interpolated horizontal scanline color, stepping one texel in x:
+    return get_scanline_color(tex, left_tap_uv, vec2(texture_size_inv.x, 0.0), normalized_weights);
+}
+
+vec3 sample_rgb_scanline_horizontal(sampler2D tex, vec2 tex_uv, vec2 tex_size, vec2 texture_size_inv)
+{
+    //  Returns:    The horizontally filtered scanline color at tex_uv.  This is
+    //              a direct pass-through to sample_single_scanline_horizontal()
+    //              with the arguments forwarded unchanged.
+    //  TODO: Add function requirements.
+    vec3 scanline_rgb = sample_single_scanline_horizontal(tex, tex_uv, tex_size, texture_size_inv);
+
+    return scanline_rgb;
+}
+
+// Monolithic
+vec2 get_last_scanline_uv(vec2 tex_uv, vec2 tex_size, vec2 texture_size_inv,
+                          vec2 il_step_multiple, float frame_count, out float dist)
+{
+    //  Requires:   1.) tex_uv is the location being shaded, in uv coordinates.
+    //              2.) tex_size scales uv coordinates to texel coordinates, and
+    //                  texture_size_inv scales texel coordinates back to uv.
+    //              3.) il_step_multiple.y is the number of texel rows between
+    //                  consecutive scanlines of one field (presumably 1.0 for
+    //                  progressive and 2.0 for interlaced -- confirm at caller).
+    //              4.) frame_count is the current frame number.
+    //  Returns:    The uv of the center of the previous scanline in the current
+    //              field.  dist receives the sample's vertical distance from
+    //              that scanline, in units of scanlines.
+    //  floor(step * 0.75) is 0.0 for a step of 1.0 and 1.0 for a step of 2.0,
+    //  so the field offset only alternates with frame parity in the latter
+    //  case.  global.interlace_bff shifts which parity comes first.
+    float frame_parity = mod(frame_count + float(global.interlace_bff), 2.0);
+    float field_offset = floor(il_step_multiple.y * 0.75) * frame_parity;
+
+    vec2 sample_texel = tex_uv * tex_size;
+
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    vec2 texel_index = floor(sample_texel - vec2(under_half));
+
+    //  Count how many rows we sit past the last row belonging to this field,
+    //  then step back up by that many rows:
+    float rows_past_field = mod(texel_index.y + field_offset, il_step_multiple.y);
+    vec2 field_row_index  = texel_index - vec2(0.0, rows_past_field);
+
+    //  Snap to the center of that scanline:
+    vec2 field_row_center = field_row_index + vec2(0.5);
+
+    //  Report the sample's distance from the scanline, in units of scanlines:
+    dist = (sample_texel.y - field_row_center.y)/il_step_multiple.y;
+
+    return field_row_center * texture_size_inv;
+}
+
+bool is_interlaced(float num_lines)
+{
+    //  Requires:   num_lines is the number of lines in the source.
+    //  Returns:    true if the source is probably interlaced, judging only by
+    //              its line count; always false when interlace_detect is off.
+    //  Reference line counts:
+    //      NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+    //      NTSC Emulators: Typically 224 or 240 lines
+    //      PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+    //      PAL Emulators: ?
+    //      ATSC: 720p, 1080i, 1080p
+    //  Cutoff assumptions:
+    //      1.) Only active lines matter.
+    //      2.) Anything > 288 and <= 576 lines is probably interlaced.
+    //      3.) Anything > 576 lines is probably not interlaced...
+    //      4.) ...except 1080 lines, which is a crapshoot, so the user decides
+    //          through interlace_1080i.
+    //      5.) The main program might use calculated video sizes, so the float
+    //          thresholds are nudged by half a line.
+    bool interlaced = false;
+
+    if(interlace_detect)
+    {
+        bool in_sd_range  = (num_lines > 288.5) && (num_lines < 576.5);
+        bool is_1080_tall = (num_lines > 1079.5) && (num_lines < 1080.5);
+        bool hd_interlace = bool(interlace_1080i) && is_1080_tall;
+
+        interlaced = in_sd_range || hd_interlace;
+    }
+
+    return interlaced;
+}
+
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
--- a/crt/shaders/crt-royale/src-fast/special-functions.h
+++ b/crt/shaders/crt-royale/src-fast/special-functions.h
@ -0,0 +1,182 @@
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+vec3 erf6(vec3 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              The approximation has a max absolute error of 2.5*10**-5
+    //              with solid numerical robustness and efficiency.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    //  The formula is defined for x >= 0, so evaluate it on |x| and restore
+    //  the sign at the end (erf is an odd function):
+    vec3 t = vec3(1.0)/(vec3(1.0) + 0.47047*abs(x));
+
+    //  Cubic in t (Horner form) times the Gaussian falloff:
+    vec3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec3 tail = poly*exp(-(x*x));
+
+    vec3 magnitude = vec3(1.0) - tail;
+
+    return magnitude * sign(x);
+}
+
+vec3 erft(vec3 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf() approximated by a scaled hyperbolic tangent.  The
+    //              error is visually noticeable, but it's blazing fast and
+    //              perceptually close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): The original author's nVidia 8800GTS returned
+    //              garbage output.
+    vec3 scaled_x = 1.202760580 * x;
+
+    return tanh(scaled_x);
+}
+
+vec3 erf(vec3 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Some approximation of erf(x), chosen at compile time:
+    //              erft() if ERF_FAST_APPROXIMATION is defined, else erf6().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+
+vec3 gamma_impl(vec3 s, vec3 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller precomputes this value so it
+    //                  can be reused elsewhere.
+    //  Returns:    The approximate gamma function (real-numbered factorial),
+    //              using the Lanczos approximation with two coefficients:
+    //                  gamma(s) ~= ((s + 0.5 + g)/e)**(s + 0.5) *
+    //                              (c0 + c1/(s + 1)) / s
+    //              The coefficients were calculated with Paul Godfrey's method:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              Per the original author, g ~= 1.12906830989 is optimal for
+    //              s in [0, 36], giving a maximum relative error of 0.000463
+    //              over 2**16 equally spaced evaluations.  Three coefficients
+    //              (0.0000346 error) would not hurt latency, but two allow
+    //              more parallelism with outside instructions.
+    const float lanczos_g = 1.12906830989;
+    const float coeff0    = 0.8109119309638332633713423362694399653724431;
+    const float coeff1    = 0.4808354605142681877121661197951496120000040;
+    const float euler_e   = 2.71828182845904523536028747135266249775724709;
+
+    vec3 s_plus_half = s + vec3(0.5);
+    vec3 series      = coeff0 + coeff1/(s + vec3(1.0));
+    vec3 power_base  = (s_plus_half + lanczos_g)/euler_e;
+    vec3 power_term  = pow(power_base, s_plus_half);
+
+    return (power_term * series) * s_inv;
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z, truncated to 4 terms:
+    //                  z**s * sum(k = 0..3) of (-z)**k / (k! * (s + k))
+    //  The sum is unrolled below with each k! * (s + k) denominator expanded
+    //  into constants, one term per line:
+    vec3 z_squared = z*z;
+
+    vec3 series = s_inv;                            //  k = 0:  1/(0! * s)
+    series -= z/(s + vec3(1.0));                    //  k = 1:  1! * (s + 1)
+    series += z_squared/(2.0*s + vec3(4.0));        //  k = 2:  2! * (s + 2)
+    series -= z * z_squared/(6.0*s + vec3(18.0));   //  k = 3:  3! * (s + 3)
+
+    return pow(z, s) * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+vec3 uigamma_large_z_impl(vec3 s, vec3 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function, truncated to 4 levels:
+    //                  z**s * e**(-z) / level1
+    //              where, for i = 1..3,
+    //                  level_i = (2i - 1) + z - s + i*(s - i)/level_(i+1)
+    //              and the innermost level is simply 7 + z - s.
+    //  Build the denominator from the innermost level outward:
+    vec3 level4 = vec3(7.0) + z - s;
+    vec3 level3 = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/level4;
+    vec3 level2 = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/level3;
+    vec3 level1 = vec3(1.0) + z - s + (s - vec3(1.0))/level2;
+
+    vec3 prefactor = pow(z, s) * exp(-z);
+
+    return prefactor / level1;
+}
+
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+vec3 normalized_ligamma_impl(vec3 s, vec3 z,
+    vec3 s_inv, vec3 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    The approximate normalized lower incomplete gamma function
+    //              for s < 0.5.  Because only s < 0.5 matters, two branches
+    //              (not four) based on z suffice.  Each branch uses four
+    //              terms, with a max relative error of ~0.00182.  The branch
+    //              threshold and specifics were adapted for fewer terms from
+    //              Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both branches are always evaluated and then selected per component:
+    //  Real branches test slower even when available.
+    bvec3 use_large_branch = greaterThan(z, vec3(0.775075));
+    bvec3 use_small_branch = not(use_large_branch);
+
+    //  Large z: one minus the normalized upper incomplete gamma function.
+    vec3 from_large = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+
+    //  Small z: the series for the lower incomplete gamma, normalized.
+    vec3 from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+
+    //  Each mask component is exactly 1.0 or 0.0, so this picks one branch:
+    return from_large * vec3(use_large_branch) + from_small * vec3(use_small_branch);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+