同步上游

NVSharpen 分离为预处理和后处理变体
hooke007 · Mar 10, 2022 · 1c10309 · 1c10309
1 parent da1bcdd
commit 1c10309
Show file tree

Hide file tree

Showing 7 changed files with 543 additions and 183 deletions.
diff --git a/portable_config/shaders/AMD-FSR.glsl b/portable_config/shaders/AMD-FSR.glsl
@@ -24,7 +24,8 @@
 // Changelog
 // Made it compatible with pre-OpenGL 4.0 renderers
 // Made it directly operate on LUMA plane, since the original shader was operating on LUMA by deriving it from RGB. This should cause a major increase in performance, especially on OpenGL 4.0+ renderers (4+2 texture lookups vs. 12+5)
-// Removed transparency preservation mechanism since the alpha channel is a separate source plan than LUMA
+// Removed transparency preservation mechanism since the alpha channel is a separate source plane from LUMA
+// Added optional performance-saving lossy optimizations to EASU (Credit: atyuwen, https://atyuwen.github.io/posts/optimizing-fsr/)
 // 
 // Notes
 // Per AMD's guidelines only upscales content up to 4x (e.g., 1080p -> 2160p, 720p -> 1440p etc.) and everything else in between,
@@ -39,8 +40,22 @@
 //!HEIGHT OUTPUT.h OUTPUT.h LUMA.h 2 * < * LUMA.h 2 * OUTPUT.h LUMA.h 2 * > * + OUTPUT.h OUTPUT.h LUMA.h 2 * = * +
 //!COMPONENTS 1
 
+// User variables - EASU
+#define FSR_PQ 0 // Whether the source content has PQ gamma or not. Needs to be set to the same value for both passes. 0 or 1.
+#define FSR_EASU_DERING 1 // If set to 0, disables deringing for a small increase in performance. 0 or 1.
+#define FSR_EASU_SIMPLE_ANALYSIS 0 // If set to 1, uses a simpler single-pass direction and length analysis for an increase in performance. 0 or 1.
+#define FSR_EASU_QUIT_EARLY 0 // If set to 1, uses bilinear filtering for non-edge pixels and skips EASU on those regions for an increase in performance. 0 or 1.
+
 // Shader code
 
+#ifndef FSR_EASU_DIR_THRESHOLD // Only picks a default when the threshold has not been defined already.
+	#if (FSR_EASU_QUIT_EARLY == 1)
+		#define FSR_EASU_DIR_THRESHOLD 64.0 // Squared gradient below 1/64 is treated as "no edge" and takes the bilinear early-out in hook().
+	#elif (FSR_EASU_QUIT_EARLY == 0)
+		#define FSR_EASU_DIR_THRESHOLD 32768.0 // Keeps the 1/32768 near-zero cutoff that was hard-coded before this option existed.
+	#endif // NOTE: any FSR_EASU_QUIT_EARLY value other than 0 or 1 leaves the threshold undefined.
+#endif
+
 float APrxLoRcpF1(float a) {
 	return uintBitsToFloat(uint(0x7ef07ebb) - floatBitsToUint(a));
 }
@@ -57,16 +72,24 @@ float AMax3F1(float x, float y, float z) {
 	return max(x, max(y, z));
 }
 
+#if (FSR_PQ == 1)
+
+float ToGamma2(float a) { // Despite the name, raises 'a' to the 4th power; FromGamma2 in the RCAS pass undoes it with sqrt(sqrt()).
+	return pow(a, 4.0);
+}
+
+#endif
+
  // Filtering for a given tap for the scalar.
  void FsrEasuTap(
-	inout float aC,  // Accumulated color, with negative lobe.
+	inout float aC,	// Accumulated color, with negative lobe.
 	inout float aW, // Accumulated weight.
 	vec2 off,       // Pixel offset from resolve position to tap.
 	vec2 dir,       // Gradient direction.
 	vec2 len,       // Length.
 	float lob,      // Negative lobe strength.
-	float clp,      // Clipping point.
-	float c){        // Tap color.
+	float clp,		// Clipping point.
+	float c){		// Tap color.
 	// Rotate offset by direction.
 	vec2 v;
 	v.x = (off.x * ( dir.x)) + (off.y * dir.y);
@@ -84,8 +107,8 @@ float AMax3F1(float x, float y, float z) {
 	// The general form of the 'base' is,
 	//  (a*(b*x^2-1)^2-(a-1))
 	// Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
-	float wB = float(2.0 / 5.0) * d2 + float(-1.0);
-	float wA = lob * d2 + float(-1.0);
+	float wB = float(2.0 / 5.0) * d2 + -1.0;
+	float wA = lob * d2 + -1.0;
 	wB *= wB;
 	wA *= wA;
 	wB = float(25.0 / 16.0) * wB + float(-(25.0 / 16.0 - 1.0));
@@ -100,16 +123,42 @@ void FsrEasuSet(
 	inout vec2 dir,
 	inout float len,
 	vec2 pp,
+#if (FSR_EASU_SIMPLE_ANALYSIS == 1)
+	float b, float c,
+	float i, float j, float f, float e,
+	float k, float l, float h, float g,
+	float o, float n
+#elif (FSR_EASU_SIMPLE_ANALYSIS == 0)
 	bool biS, bool biT, bool biU, bool biV,
-	float lA, float lB, float lC, float lD, float lE){
+	float lA, float lB, float lC, float lD, float lE
+#endif
+	){
 	// Compute bilinear weight, branches factor out as predicates are compiler time immediates.
 	//  s t
 	//  u v
-	float w = float(0.0);
-	if(biS) w = (float(1.0) - pp.x) * (float(1.0) - pp.y);
-	if(biT) w = pp.x * (float(1.0) - pp.y);
-	if(biU) w = (float(1.0) - pp.x) * pp.y;
-	if(biV) w = pp.x * pp.y;
+#if (FSR_EASU_SIMPLE_ANALYSIS == 1)
+	vec4 w = vec4(0.0);
+	w.x = (1.0 - pp.x) * (1.0 - pp.y);
+	w.y =        pp.x  * (1.0 - pp.y);
+	w.z = (1.0 - pp.x) *        pp.y;
+	w.w =        pp.x  *        pp.y;
+
+	float lA = dot(w, vec4(b, c, f, g));
+	float lB = dot(w, vec4(e, f, i, j));
+	float lC = dot(w, vec4(f, g, j, k));
+	float lD = dot(w, vec4(g, h, k, l));
+	float lE = dot(w, vec4(j, k, n, o));
+#elif (FSR_EASU_SIMPLE_ANALYSIS == 0)
+	float w = 0.0;
+	if (biS)
+		w = (1.0 - pp.x) * (1.0 - pp.y);
+	if (biT)
+		w =        pp.x  * (1.0 - pp.y);
+	if (biU)
+		w = (1.0 - pp.x) *        pp.y;
+	if (biV)
+		w =        pp.x  *        pp.y;
+#endif
 	// Direction is the '+' diff.
 	//    a
 	//  b c d
@@ -121,25 +170,44 @@ void FsrEasuSet(
 	float lenX = max(abs(dc), abs(cb));
 	lenX = APrxLoRcpF1(lenX);
 	float dirX = lD - lB;
-	dir.x += dirX * w;
-	lenX = clamp(abs(dirX) * lenX, float(0.0), float(1.0));
+	lenX = clamp(abs(dirX) * lenX, 0.0, 1.0);
 	lenX *= lenX;
-	len += lenX * w;
 	// Repeat for the y axis.
 	float ec = lE - lC;
 	float ca = lC - lA;
 	float lenY = max(abs(ec), abs(ca));
 	lenY = APrxLoRcpF1(lenY);
 	float dirY = lE - lA;
-	dir.y += dirY * w;
-	lenY = clamp(abs(dirY) * lenY, float(0.0), float(1.0));
+	lenY = clamp(abs(dirY) * lenY, 0.0, 1.0);
 	lenY *= lenY;
-	len += lenY * w;
+#if (FSR_EASU_SIMPLE_ANALYSIS == 1)
+	len = lenX + lenY;
+	dir = vec2(dirX, dirY);
+#elif (FSR_EASU_SIMPLE_ANALYSIS == 0)
+	dir += vec2(dirX, dirY) * w;
+	len += dot(vec2(w), vec2(lenX, lenY));
+#endif
 }
 
 vec4 hook() {
+	// Result
+	vec4 pix = vec4(0.0, 0.0, 0.0, 1.0);
+
 	//------------------------------------------------------------------------------------------------------------------------------
-	// Get position of 'f'.
+	//      +---+---+
+	//      |   |   |
+	//      +--(0)--+
+	//      | b | c |
+	//  +---F---+---+---+
+	//  | e | f | g | h |
+	//  +--(1)--+--(2)--+
+	//  | i | j | k | l |
+	//  +---+---+---+---+
+	//      | n | o |
+	//      +--(3)--+
+	//      |   |   |
+	//      +---+---+
+	// Get position of 'F'.
 	vec2 pp = HOOKED_pos * HOOKED_size - vec2(0.5);
 	vec2 fp = floor(pp);
 	pp -= fp;
@@ -152,16 +220,8 @@ vec4 hook() {
 	// Gather 4 ordering.
 	//  a b
 	//  r g
-	// For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
-	//    a b    <- unused (z)
-	//    r g
-	//  a b a b
-	//  r g r g
-	//    a b
-	//    r g    <- unused (z)
 	// Allowing dead-code removal to remove the 'z's.
-
- #if (defined(HOOKED_gather) && (__VERSION__ >= 400 || (GL_ES && __VERSION__ >= 310)))
+#if (defined(HOOKED_gather) && (__VERSION__ >= 400 || (GL_ES && __VERSION__ >= 310)))
 	vec4 bczzL = HOOKED_gather(vec2((fp + vec2(1.0, -1.0)) * HOOKED_pt), 0);
 	vec4 ijfeL = HOOKED_gather(vec2((fp + vec2(0.0,  1.0)) * HOOKED_pt), 0);
 	vec4 klhgL = HOOKED_gather(vec2((fp + vec2(2.0,  1.0)) * HOOKED_pt), 0);
@@ -203,48 +263,78 @@ vec4 hook() {
 	float gL = klhgL.w;
 	float oL = zzonL.z;
 	float nL = zzonL.w;
+
+#if (FSR_PQ == 1)
+	// Not the most performance-friendly solution, but should work until mpv adds proper gamma transformation functions for shaders
+	bL = ToGamma2(bL);
+	cL = ToGamma2(cL);
+	iL = ToGamma2(iL);
+	jL = ToGamma2(jL);
+	fL = ToGamma2(fL);
+	eL = ToGamma2(eL);
+	kL = ToGamma2(kL);
+	lL = ToGamma2(lL);
+	hL = ToGamma2(hL);
+	gL = ToGamma2(gL);
+	oL = ToGamma2(oL);
+	nL = ToGamma2(nL);
+#endif
+
 	// Accumulate for bilinear interpolation.
 	vec2 dir = vec2(0.0);
-	float len = float(0.0);
+	float len = 0.0;
+#if (FSR_EASU_SIMPLE_ANALYSIS == 1)
+	FsrEasuSet(dir, len, pp, bL, cL, iL, jL, fL, eL, kL, lL, hL, gL, oL, nL);
+#elif (FSR_EASU_SIMPLE_ANALYSIS == 0)
 	FsrEasuSet(dir, len, pp, true, false, false, false, bL, eL, fL, gL, jL);
 	FsrEasuSet(dir, len, pp, false, true, false, false, cL, fL, gL, hL, kL);
 	FsrEasuSet(dir, len, pp, false, false, true, false, fL, iL, jL, kL, nL);
 	FsrEasuSet(dir, len, pp, false, false, false, true, gL, jL, kL, lL, oL);
+#endif
 	//------------------------------------------------------------------------------------------------------------------------------
 	// Normalize with approximation, and cleanup close to zero.
 	vec2 dir2 = dir * dir;
 	float dirR = dir2.x + dir2.y;
-	bool zro = dirR < float(1.0 / 32768.0);
+	bool zro = dirR < float(1.0 / FSR_EASU_DIR_THRESHOLD);
 	dirR = APrxLoRsqF1(dirR);
-	dirR = zro ? float(1.0) : dirR;
-	dir.x = zro ? float(1.0) : dir.x;
+#if (FSR_EASU_QUIT_EARLY == 1)
+	if (zro) {
+		vec4 w = vec4(0.0);
+		w.x = (1.0 - pp.x) * (1.0 - pp.y);
+		w.y =        pp.x  * (1.0 - pp.y);
+		w.z = (1.0 - pp.x) *        pp.y;
+		w.w =        pp.x  *        pp.y;
+
+		pix.r = clamp(dot(w, vec4(fL, gL, jL, kL)), 0.0, 1.0);
+		return pix;
+	}
+#elif (FSR_EASU_QUIT_EARLY == 0)
+	dirR = zro ? 1.0 : dirR;
+	dir.x = zro ? 1.0 : dir.x;
+#endif
 	dir *= vec2(dirR);
 	// Transform from {0 to 2} to {0 to 1} range, and shape with square.
-	len = len * float(0.5);
+	len = len * 0.5;
 	len *= len;
 	// Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
 	float stretch = (dir.x * dir.x + dir.y * dir.y) * APrxLoRcpF1(max(abs(dir.x), abs(dir.y)));
 	// Anisotropic length after rotation,
 	//  x := 1.0 lerp to 'stretch' on edges
 	//  y := 1.0 lerp to 2x on edges
-	vec2 len2 = vec2(float(1.0) + (stretch - float(1.0)) * len, float(1.0) + float(-0.5) * len);
+	vec2 len2 = vec2(1.0 + (stretch - 1.0) * len, 1.0 + -0.5 * len);
 	// Based on the amount of 'edge',
 	// the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
-	float lob = float(0.5) + float((1.0 / 4.0 - 0.04) - 0.5) * len;
+	float lob = 0.5 + float((1.0 / 4.0 - 0.04) - 0.5) * len;
 	// Set distance^2 clipping point to the end of the adjustable window.
 	float clp = APrxLoRcpF1(lob);
 	//------------------------------------------------------------------------------------------------------------------------------
-	// Accumulation mixed with min/max of 4 nearest.
+	// Accumulation
 	//    b c
 	//  e f g h
 	//  i j k l
 	//    n o
-	float min1 = min(AMin3F1(fL, gL, jL), kL);
-	float max1 = max(AMax3F1(fL, gL, jL), kL);
-
-	// Accumulation.
 	float aC = 0.0;
-	float aW = float(0.0);
+	float aW = 0.0;
 	FsrEasuTap(aC, aW, vec2( 0.0,-1.0) - pp, dir, len2, lob, clp, bL); // b
 	FsrEasuTap(aC, aW, vec2( 1.0,-1.0) - pp, dir, len2, lob, clp, cL); // c
 	FsrEasuTap(aC, aW, vec2(-1.0, 1.0) - pp, dir, len2, lob, clp, iL); // i
@@ -259,8 +349,13 @@ vec4 hook() {
 	FsrEasuTap(aC, aW, vec2( 0.0, 2.0) - pp, dir, len2, lob, clp, nL); // n
 	//------------------------------------------------------------------------------------------------------------------------------
 	// Normalize and dering.
-	vec4 pix = vec4(0.0, 0.0, 0.0, 1.0);
-	pix.r = min(max1, max(min1, aC * float(1.0 / aW)));
+	pix.r = aC / aW;
+#if (FSR_EASU_DERING == 1)
+	float min1 = min(AMin3F1(fL, gL, jL), kL);
+	float max1 = max(AMax3F1(fL, gL, jL), kL);
+	pix.r = clamp(pix.r, min1, max1);
+#endif
+	pix.r = clamp(pix.r, 0.0, 1.0);
 
 	return pix;
 }
@@ -273,16 +368,17 @@ vec4 hook() {
 //!COMPONENTS 1
 
 // User variables - RCAS
-#define SHARPNESS 0.25 // Controls the amount of sharpening. The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. 0.0 to N>0.
-#define FSR_RCAS_DENOISE 1 // If set to 1, applies denoising in addition to sharpening. Can be disabled for better performance. 0 or 1.
+#define SHARPNESS 0.2 // Controls the amount of sharpening. The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. 0.0 to 2.0.
+#define FSR_RCAS_DENOISE 1 // If set to 1, lessens the sharpening on noisy areas. Can be disabled for better performance. 0 or 1.
+#define FSR_PQ 0 // Whether the source content has PQ gamma or not. Needs to be set to the same value for both passes. 0 or 1.
 
 // Shader code
 
 #define FSR_RCAS_LIMIT (0.25 - (1.0 / 16.0)) // This is set at the limit of providing unnatural results for sharpening.
 
 float APrxMedRcpF1(float a) {
 	float b = uintBitsToFloat(uint(0x7ef19fff) - floatBitsToUint(a));
-	return b * (-b * a + float(2.0));
+	return b * (-b * a + 2.0);
 }
 
 float AMax3F1(float x, float y, float z) {
@@ -293,6 +389,14 @@ float AMin3F1(float x, float y, float z) {
 	return min(x, min(y, z));
 }
 
+#if (FSR_PQ == 1)
+
+float FromGamma2(float a) { // Fourth root of 'a'; inverse of the pow(a, 4.0) applied by ToGamma2 in the EASU pass.
+	return sqrt(sqrt(a));
+}
+
+#endif
+
 vec4 hook() {
 	// Algorithm uses minimal 3x3 pixel neighborhood.
 	//    b 
@@ -323,24 +427,27 @@ vec4 hook() {
 	vec2 peakC = vec2(1.0, -1.0 * 4.0);
 
 	// Limiters, these need to be high precision RCPs.
-	float hitMinL = min(mn1L, e) * (float(1.0) / (float(4.0) * mx1L));
-	float hitMaxL = (peakC.x - max(mx1L, e)) * (float(1.0) / (float(4.0) * mn1L + peakC.y));
+	float hitMinL = min(mn1L, e) / (4.0 * mx1L);
+	float hitMaxL = (peakC.x - max(mx1L, e)) / (4.0 * mn1L + peakC.y);
 	float lobeL = max(-hitMinL, hitMaxL);
-	float lobe = max(float(-FSR_RCAS_LIMIT), min(lobeL, float(0.0))) * exp2(-max(float(SHARPNESS), float(0.0)));
+	float lobe = max(float(-FSR_RCAS_LIMIT), min(lobeL, 0.0)) * exp2(-clamp(float(SHARPNESS), 0.0, 2.0));
 
 	// Apply noise removal.
 #if (FSR_RCAS_DENOISE == 1)
 	// Noise detection.
-	float nz = float(0.25) * b + float(0.25) * d + float(0.25) * f + float(0.25) * h - e;
+	float nz = 0.25 * b + 0.25 * d + 0.25 * f + 0.25 * h - e;
 	nz = clamp(abs(nz) * APrxMedRcpF1(AMax3F1(AMax3F1(b, d, e), f, h) - AMin3F1(AMin3F1(b, d, e), f, h)), 0.0, 1.0);
-	nz = float(-0.5) * nz + float(1.0);
+	nz = -0.5 * nz + 1.0;
 	lobe *= nz;
 #endif
 
 	// Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
-	float rcpL = APrxMedRcpF1(float(4.0) * lobe + float(1.0));
+	float rcpL = APrxMedRcpF1(4.0 * lobe + 1.0);
 	vec4 pix = vec4(0.0, 0.0, 0.0, 1.0);
 	pix.r = float((lobe * b + lobe * d + lobe * h + lobe * f + e) * rcpL);
+#if (FSR_PQ == 1)
+	pix.r = FromGamma2(pix.r);
+#endif
 
 	return pix;
 }