diff --git a/portable_config/shaders/AMD-FSR.glsl b/portable_config/shaders/AMD-FSR.glsl index de81bff1..f1794d72 100644 --- a/portable_config/shaders/AMD-FSR.glsl +++ b/portable_config/shaders/AMD-FSR.glsl @@ -24,7 +24,8 @@ // Changelog // Made it compatible with pre-OpenGL 4.0 renderers // Made it directly operate on LUMA plane, since the original shader was operating on LUMA by deriving it from RGB. This should cause a major increase in performance, especially on OpenGL 4.0+ renderers (4+2 texture lookups vs. 12+5) -// Removed transparency preservation mechanism since the alpha channel is a separate source plan than LUMA +// Removed transparency preservation mechanism since the alpha channel is a separate source plane from LUMA +// Added optional performance-saving lossy optimizations to EASU (Credit: atyuwen, https://atyuwen.github.io/posts/optimizing-fsr/) // // Notes // Per AMD's guidelines only upscales content up to 4x (e.g., 1080p -> 2160p, 720p -> 1440p etc.) and everything else in between, @@ -39,8 +40,22 @@ //!HEIGHT OUTPUT.h OUTPUT.h LUMA.h 2 * < * LUMA.h 2 * OUTPUT.h LUMA.h 2 * > * + OUTPUT.h OUTPUT.h LUMA.h 2 * = * + //!COMPONENTS 1 +// User variables - EASU +#define FSR_PQ 0 // Whether the source content has PQ gamma or not. Needs to be set to the same value for both passes. 0 or 1. +#define FSR_EASU_DERING 1 // If set to 0, disables deringing for a small increase in performance. 0 or 1. +#define FSR_EASU_SIMPLE_ANALYSIS 0 // If set to 1, uses a simpler single-pass direction and length analysis for an increase in performance. 0 or 1. +#define FSR_EASU_QUIT_EARLY 0 // If set to 1, uses bilinear filtering for non-edge pixels and skips EASU on those regions for an increase in performance. 0 or 1. + // Shader code +#ifndef FSR_EASU_DIR_THRESHOLD + #if (FSR_EASU_QUIT_EARLY == 1) + #define FSR_EASU_DIR_THRESHOLD 64.0 + #elif (FSR_EASU_QUIT_EARLY == 0) + #define FSR_EASU_DIR_THRESHOLD 32768.0 + #endif +#endif + float APrxLoRcpF1(float a) { return uintBitsToFloat(uint(0x7ef07ebb) - floatBitsToUint(a)); } @@ -57,16 +72,24 @@ float AMax3F1(float x, float y, float z) { return max(x, max(y, z)); } +#if (FSR_PQ == 1) + +float ToGamma2(float a) { + return pow(a, 4.0); +} + +#endif + // Filtering for a given tap for the scalar. void FsrEasuTap( - inout float aC, // Accumulated color, with negative lobe. + inout float aC, // Accumulated color, with negative lobe. inout float aW, // Accumulated weight. vec2 off, // Pixel offset from resolve position to tap. vec2 dir, // Gradient direction. vec2 len, // Length. float lob, // Negative lobe strength. - float clp, // Clipping point. - float c){ // Tap color. + float clp, // Clipping point. + float c){ // Tap color. // Rotate offset by direction. vec2 v; v.x = (off.x * ( dir.x)) + (off.y * dir.y); @@ -84,8 +107,8 @@ float AMax3F1(float x, float y, float z) { // The general form of the 'base' is, // (a*(b*x^2-1)^2-(a-1)) // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. 
- float wB = float(2.0 / 5.0) * d2 + float(-1.0); - float wA = lob * d2 + float(-1.0); + float wB = float(2.0 / 5.0) * d2 + -1.0; + float wA = lob * d2 + -1.0; wB *= wB; wA *= wA; wB = float(25.0 / 16.0) * wB + float(-(25.0 / 16.0 - 1.0)); @@ -100,16 +123,42 @@ void FsrEasuSet( inout vec2 dir, inout float len, vec2 pp, +#if (FSR_EASU_SIMPLE_ANALYSIS == 1) + float b, float c, + float i, float j, float f, float e, + float k, float l, float h, float g, + float o, float n +#elif (FSR_EASU_SIMPLE_ANALYSIS == 0) bool biS, bool biT, bool biU, bool biV, - float lA, float lB, float lC, float lD, float lE){ + float lA, float lB, float lC, float lD, float lE +#endif + ){ // Compute bilinear weight, branches factor out as predicates are compiler time immediates. // s t // u v - float w = float(0.0); - if(biS) w = (float(1.0) - pp.x) * (float(1.0) - pp.y); - if(biT) w = pp.x * (float(1.0) - pp.y); - if(biU) w = (float(1.0) - pp.x) * pp.y; - if(biV) w = pp.x * pp.y; +#if (FSR_EASU_SIMPLE_ANALYSIS == 1) + vec4 w = vec4(0.0); + w.x = (1.0 - pp.x) * (1.0 - pp.y); + w.y = pp.x * (1.0 - pp.y); + w.z = (1.0 - pp.x) * pp.y; + w.w = pp.x * pp.y; + + float lA = dot(w, vec4(b, c, f, g)); + float lB = dot(w, vec4(e, f, i, j)); + float lC = dot(w, vec4(f, g, j, k)); + float lD = dot(w, vec4(g, h, k, l)); + float lE = dot(w, vec4(j, k, n, o)); +#elif (FSR_EASU_SIMPLE_ANALYSIS == 0) + float w = 0.0; + if (biS) + w = (1.0 - pp.x) * (1.0 - pp.y); + if (biT) + w = pp.x * (1.0 - pp.y); + if (biU) + w = (1.0 - pp.x) * pp.y; + if (biV) + w = pp.x * pp.y; +#endif // Direction is the '+' diff. // a // b c d @@ -121,25 +170,44 @@ void FsrEasuSet( float lenX = max(abs(dc), abs(cb)); lenX = APrxLoRcpF1(lenX); float dirX = lD - lB; - dir.x += dirX * w; - lenX = clamp(abs(dirX) * lenX, float(0.0), float(1.0)); + lenX = clamp(abs(dirX) * lenX, 0.0, 1.0); lenX *= lenX; - len += lenX * w; // Repeat for the y axis. float ec = lE - lC; float ca = lC - lA; float lenY = max(abs(ec), abs(ca)); lenY = APrxLoRcpF1(lenY); float dirY = lE - lA; - dir.y += dirY * w; - lenY = clamp(abs(dirY) * lenY, float(0.0), float(1.0)); + lenY = clamp(abs(dirY) * lenY, 0.0, 1.0); lenY *= lenY; - len += lenY * w; +#if (FSR_EASU_SIMPLE_ANALYSIS == 1) + len = lenX + lenY; + dir = vec2(dirX, dirY); +#elif (FSR_EASU_SIMPLE_ANALYSIS == 0) + dir += vec2(dirX, dirY) * w; + len += dot(vec2(w), vec2(lenX, lenY)); +#endif } vec4 hook() { + // Result + vec4 pix = vec4(0.0, 0.0, 0.0, 1.0); + //------------------------------------------------------------------------------------------------------------------------------ - // Get position of 'f'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + // Get position of 'F'. vec2 pp = HOOKED_pos * HOOKED_size - vec2(0.5); vec2 fp = floor(pp); pp -= fp; @@ -152,16 +220,8 @@ vec4 hook() { // Gather 4 ordering. // a b // r g - // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, - // a b <- unused (z) - // r g - // a b a b - // r g r g - // a b - // r g <- unused (z) // Allowing dead-code removal to remove the 'z's. 
- - #if (defined(HOOKED_gather) && (__VERSION__ >= 400 || (GL_ES && __VERSION__ >= 310))) +#if (defined(HOOKED_gather) && (__VERSION__ >= 400 || (GL_ES && __VERSION__ >= 310))) vec4 bczzL = HOOKED_gather(vec2((fp + vec2(1.0, -1.0)) * HOOKED_pt), 0); vec4 ijfeL = HOOKED_gather(vec2((fp + vec2(0.0, 1.0)) * HOOKED_pt), 0); vec4 klhgL = HOOKED_gather(vec2((fp + vec2(2.0, 1.0)) * HOOKED_pt), 0); @@ -203,48 +263,78 @@ vec4 hook() { float gL = klhgL.w; float oL = zzonL.z; float nL = zzonL.w; + +#if (FSR_PQ == 1) + // Not the most performance-friendly solution, but should work until mpv adds proper gamma transformation functions for shaders + bL = ToGamma2(bL); + cL = ToGamma2(cL); + iL = ToGamma2(iL); + jL = ToGamma2(jL); + fL = ToGamma2(fL); + eL = ToGamma2(eL); + kL = ToGamma2(kL); + lL = ToGamma2(lL); + hL = ToGamma2(hL); + gL = ToGamma2(gL); + oL = ToGamma2(oL); + nL = ToGamma2(nL); +#endif + // Accumulate for bilinear interpolation. vec2 dir = vec2(0.0); - float len = float(0.0); + float len = 0.0; +#if (FSR_EASU_SIMPLE_ANALYSIS == 1) + FsrEasuSet(dir, len, pp, bL, cL, iL, jL, fL, eL, kL, lL, hL, gL, oL, nL); +#elif (FSR_EASU_SIMPLE_ANALYSIS == 0) FsrEasuSet(dir, len, pp, true, false, false, false, bL, eL, fL, gL, jL); FsrEasuSet(dir, len, pp, false, true, false, false, cL, fL, gL, hL, kL); FsrEasuSet(dir, len, pp, false, false, true, false, fL, iL, jL, kL, nL); FsrEasuSet(dir, len, pp, false, false, false, true, gL, jL, kL, lL, oL); +#endif //------------------------------------------------------------------------------------------------------------------------------ // Normalize with approximation, and cleanup close to zero. vec2 dir2 = dir * dir; float dirR = dir2.x + dir2.y; - bool zro = dirR < float(1.0 / 32768.0); + bool zro = dirR < float(1.0 / FSR_EASU_DIR_THRESHOLD); dirR = APrxLoRsqF1(dirR); - dirR = zro ? float(1.0) : dirR; - dir.x = zro ? float(1.0) : dir.x; +#if (FSR_EASU_QUIT_EARLY == 1) + if (zro) { + vec4 w = vec4(0.0); + w.x = (1.0 - pp.x) * (1.0 - pp.y); + w.y = pp.x * (1.0 - pp.y); + w.z = (1.0 - pp.x) * pp.y; + w.w = pp.x * pp.y; + + pix.r = clamp(dot(w, vec4(fL, gL, jL, kL)), 0.0, 1.0); + return pix; + } +#elif (FSR_EASU_QUIT_EARLY == 0) + dirR = zro ? 1.0 : dirR; + dir.x = zro ? 1.0 : dir.x; +#endif dir *= vec2(dirR); // Transform from {0 to 2} to {0 to 1} range, and shape with square. - len = len * float(0.5); + len = len * 0.5; len *= len; // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}. float stretch = (dir.x * dir.x + dir.y * dir.y) * APrxLoRcpF1(max(abs(dir.x), abs(dir.y))); // Anisotropic length after rotation, // x := 1.0 lerp to 'stretch' on edges // y := 1.0 lerp to 2x on edges - vec2 len2 = vec2(float(1.0) + (stretch - float(1.0)) * len, float(1.0) + float(-0.5) * len); + vec2 len2 = vec2(1.0 + (stretch - 1.0) * len, 1.0 + -0.5 * len); // Based on the amount of 'edge', // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}. - float lob = float(0.5) + float((1.0 / 4.0 - 0.04) - 0.5) * len; + float lob = 0.5 + float((1.0 / 4.0 - 0.04) - 0.5) * len; // Set distance^2 clipping point to the end of the adjustable window. float clp = APrxLoRcpF1(lob); //------------------------------------------------------------------------------------------------------------------------------ - // Accumulation mixed with min/max of 4 nearest. + // Accumulation // b c // e f g h // i j k l // n o - float min1 = min(AMin3F1(fL, gL, jL), kL); - float max1 = max(AMax3F1(fL, gL, jL), kL); - - // Accumulation. 
float aC = 0.0; - float aW = float(0.0); + float aW = 0.0; FsrEasuTap(aC, aW, vec2( 0.0,-1.0) - pp, dir, len2, lob, clp, bL); // b FsrEasuTap(aC, aW, vec2( 1.0,-1.0) - pp, dir, len2, lob, clp, cL); // c FsrEasuTap(aC, aW, vec2(-1.0, 1.0) - pp, dir, len2, lob, clp, iL); // i @@ -259,8 +349,13 @@ vec4 hook() { FsrEasuTap(aC, aW, vec2( 0.0, 2.0) - pp, dir, len2, lob, clp, nL); // n //------------------------------------------------------------------------------------------------------------------------------ // Normalize and dering. - vec4 pix = vec4(0.0, 0.0, 0.0, 1.0); - pix.r = min(max1, max(min1, aC * float(1.0 / aW))); + pix.r = aC / aW; +#if (FSR_EASU_DERING == 1) + float min1 = min(AMin3F1(fL, gL, jL), kL); + float max1 = max(AMax3F1(fL, gL, jL), kL); + pix.r = clamp(pix.r, min1, max1); +#endif + pix.r = clamp(pix.r, 0.0, 1.0); return pix; } @@ -273,8 +368,9 @@ vec4 hook() { //!COMPONENTS 1 // User variables - RCAS -#define SHARPNESS 0.25 // Controls the amount of sharpening. The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. 0.0 to N>0. -#define FSR_RCAS_DENOISE 1 // If set to 1, applies denoising in addition to sharpening. Can be disabled for better performance. 0 or 1. +#define SHARPNESS 0.2 // Controls the amount of sharpening. The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. 0.0 to 2.0. +#define FSR_RCAS_DENOISE 1 // If set to 1, lessens the sharpening on noisy areas. Can be disabled for better performance. 0 or 1. +#define FSR_PQ 0 // Whether the source content has PQ gamma or not. Needs to be set to the same value for both passes. 0 or 1. // Shader code @@ -282,7 +378,7 @@ vec4 hook() { float APrxMedRcpF1(float a) { float b = uintBitsToFloat(uint(0x7ef19fff) - floatBitsToUint(a)); - return b * (-b * a + float(2.0)); + return b * (-b * a + 2.0); } float AMax3F1(float x, float y, float z) { @@ -293,6 +389,14 @@ float AMin3F1(float x, float y, float z) { return min(x, min(y, z)); } +#if (FSR_PQ == 1) + +float FromGamma2(float a) { + return sqrt(sqrt(a)); +} + +#endif + vec4 hook() { // Algorithm uses minimal 3x3 pixel neighborhood. // b @@ -323,24 +427,27 @@ vec4 hook() { vec2 peakC = vec2(1.0, -1.0 * 4.0); // Limiters, these need to be high precision RCPs. - float hitMinL = min(mn1L, e) * (float(1.0) / (float(4.0) * mx1L)); - float hitMaxL = (peakC.x - max(mx1L, e)) * (float(1.0) / (float(4.0) * mn1L + peakC.y)); + float hitMinL = min(mn1L, e) / (4.0 * mx1L); + float hitMaxL = (peakC.x - max(mx1L, e)) / (4.0 * mn1L + peakC.y); float lobeL = max(-hitMinL, hitMaxL); - float lobe = max(float(-FSR_RCAS_LIMIT), min(lobeL, float(0.0))) * exp2(-max(float(SHARPNESS), float(0.0))); + float lobe = max(float(-FSR_RCAS_LIMIT), min(lobeL, 0.0)) * exp2(-clamp(float(SHARPNESS), 0.0, 2.0)); // Apply noise removal. #if (FSR_RCAS_DENOISE == 1) // Noise detection. - float nz = float(0.25) * b + float(0.25) * d + float(0.25) * f + float(0.25) * h - e; + float nz = 0.25 * b + 0.25 * d + 0.25 * f + 0.25 * h - e; nz = clamp(abs(nz) * APrxMedRcpF1(AMax3F1(AMax3F1(b, d, e), f, h) - AMin3F1(AMin3F1(b, d, e), f, h)), 0.0, 1.0); - nz = float(-0.5) * nz + float(1.0); + nz = -0.5 * nz + 1.0; lobe *= nz; #endif // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. 
- float rcpL = APrxMedRcpF1(float(4.0) * lobe + float(1.0)); + float rcpL = APrxMedRcpF1(4.0 * lobe + 1.0); vec4 pix = vec4(0.0, 0.0, 0.0, 1.0); pix.r = float((lobe * b + lobe * d + lobe * h + lobe * f + e) * rcpL); +#if (FSR_PQ == 1) + pix.r = FromGamma2(pix.r); +#endif return pix; } \ No newline at end of file diff --git a/portable_config/shaders/NVScaler.glsl b/portable_config/shaders/NVScaler.glsl index c29642c9..25c4f64f 100644 --- a/portable_config/shaders/NVScaler.glsl +++ b/portable_config/shaders/NVScaler.glsl @@ -75,12 +75,10 @@ const float kSharpStrengthScale = kSharpStrengthMax - kSharpStrengthMin; const float kSharpLimitScale = kSharpLimitMax - kSharpLimitMin; const float kContrastBoost = 1.0f; const float kEps = 1.0f; +#define kScaleX (HOOKED_size.x / target_size.x) +#define kScaleY (HOOKED_size.y / target_size.y) #define kSrcNormX HOOKED_pt.x #define kSrcNormY HOOKED_pt.y -#define kDstNormX (1.f / target_size.x) -#define kDstNormY (1.f / target_size.y) -#define kScaleX (input_size.x / target_size.x) -#define kScaleY (input_size.y / target_size.y) // HLSL to GLSL macros #define saturate(x) clamp(x, 0, 1) @@ -108,26 +106,26 @@ vec4 GetEdgeMap(float p[4][4], int i, int j) { float e_0_90 = 0; float e_45_135 = 0; - if (g_0_90_max + g_45_135_max == 0) - { - return vec4(0, 0, 0, 0); - } + if (g_0_90_max + g_45_135_max == 0) + { + return vec4(0, 0, 0, 0); + } - e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f); - e_45_135 = 1.0f - e_0_90; + e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f); + e_45_135 = 1.0f - e_0_90; - bool c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min); - bool c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min); - bool c_g_0_90 = g_0_90_max == g_0; - bool c_g_45_135 = g_45_135_max == g_45; + bool c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min); + bool c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min); + bool c_g_0_90 = g_0_90_max == g_0; + bool c_g_45_135 = g_45_135_max == g_45; - float f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f; - float f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f; + float f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f; + float f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f; - float weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f; - float weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f; - float weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f; - float weight_135 = (c_45_135 && !c_g_45_135) ? f_e_45_135 : 0.0f; + float weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f; + float weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f; + float weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f; + float weight_135 = (c_45_135 && !c_g_45_135) ? 
f_e_45_135 : 0.0f; return vec4(weight_0, weight_90, weight_45, weight_135); } @@ -140,7 +138,6 @@ void LoadFilterBanksSh(int i0, int di) { int phase = i >> 1; int vIdx = i & 1; - // vec4 v = vec4(NVTEX_LOAD(coef_scaler, ivec2(vIdx, phase))); vec4 v = vec4(texelFetch(coef_scaler, ivec2(vIdx, phase), 0)); int filterOffset = vIdx * 4; shCoefScaler[phase][filterOffset + 0] = v.x; @@ -151,7 +148,6 @@ void LoadFilterBanksSh(int i0, int di) { shCoefScaler[phase][3] = v.w; } - // v = vec4(NVTEX_LOAD(coef_usm, ivec2(vIdx, phase))); v = vec4(texelFetch(coef_usm, ivec2(vIdx, phase), 0)); shCoefUSM[phase][filterOffset + 0] = v.x; shCoefUSM[phase][filterOffset + 1] = v.y; @@ -242,8 +238,8 @@ float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_i float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int, vec4 w) { float f = 0.f; - if (w.x > 0.0f) - { + if (w.x > 0.0f) + { // 0 deg filter float interp0Deg[6]; { @@ -252,11 +248,11 @@ float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int p interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac); } } - f += EvalPoly6(interp0Deg, phase_y_frac_int) * w.x; - } + f += EvalPoly6(interp0Deg, phase_y_frac_int) * w.x; + } - if (w.y > 0.0f) - { + if (w.y > 0.0f) + { // 90 deg filter float interp90Deg[6]; { @@ -266,9 +262,9 @@ float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int p } } f += EvalPoly6(interp90Deg, phase_x_frac_int) * w.y; - } - if (w.z > 0.0f) - { + } + if (w.z > 0.0f) + { //45 deg filter float pphase_b45; pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac); @@ -307,10 +303,10 @@ float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int p } } f += EvalPoly6(interp45Deg, int(pphase_p45 * 64)) * w.z; - } - + } + if (w.w > 0.0f) - { + { //135 deg filter float pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac); @@ -460,80 +456,79 @@ void hook() groupMemoryBarrier(); barrier(); - // output coord within a tile - const ivec2 pos = ivec2(uint(threadIdx) % uint(NIS_BLOCK_WIDTH), uint(threadIdx) / uint(NIS_BLOCK_WIDTH)); - // x coord inside the output image - const int dstX = dstBlockX + pos.x; - // x coord inside the input image - const float srcX = (0.5f + dstX) * kScaleX - 0.5f; - // nearest integer part - const int px = int(floor(srcX) - srcBlockStartX); - // fractional part - const float fx = srcX - floor(srcX); - // discretized phase - const int fx_int = int(fx * kPhaseCount); - - for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k) - { - // y coord inside the output image - const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH); - // y coord inside the input image - const float srcY = (0.5f + dstY) * kScaleY - 0.5f; - - // nearest integer part - const int py = int(floor(srcY) - srcBlockStartY); - // fractional part - const float fy = srcY - floor(srcY); - // discretized phase - const int fy_int = int(fy * kPhaseCount); - - // generate weights for directional filters - const int startEdgeMapIdx = py * kEdgeMapPitch + px; - vec4 edge[2][2]; - for (int i = 0; i < 2; i++) - { - for (int j = 0; j < 2; j++) - { - // need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid - edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j]; - } - } - const vec4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT; - - // load 6x6 support to regs - const int startTileIdx = py * kTilePitch + px; - float p[6][6]; - { - for (int i = 0; 
i < 6; ++i) - { - for (int j = 0; j < 6; ++j) - { - p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j]; - } - } - } - - // weigth for luma - const float baseWeight = NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w; - - // final luma is a weighted product of directional & normal filters - float opY = 0; - - // get traditional scaler filter output - opY += FilterNormal(p, fx_int, fy_int) * baseWeight; - - // get directional filter bank output - opY += AddDirFilters(p, fx, fy, fx_int, fy_int, w); - - // do bilinear tap for luma upscaling - vec4 op = vec4(0.0, 0.0, 0.0, 1.0); - op.r = HOOKED_tex(vec2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY)).r; - - const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - op.r; - op += corr; - - imageStore(out_image, ivec2(dstX, dstY), op); - } + // output coord within a tile + const ivec2 pos = ivec2(uint(threadIdx) % uint(NIS_BLOCK_WIDTH), uint(threadIdx) / uint(NIS_BLOCK_WIDTH)); + // x coord inside the output image + const int dstX = dstBlockX + pos.x; + // x coord inside the input image + const float srcX = (0.5f + dstX) * kScaleX - 0.5f; + // nearest integer part + const int px = int(floor(srcX) - srcBlockStartX); + // fractional part + const float fx = srcX - floor(srcX); + // discretized phase + const int fx_int = int(fx * kPhaseCount); + + for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k) + { + // y coord inside the output image + const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH); + // y coord inside the input image + const float srcY = (0.5f + dstY) * kScaleY - 0.5f; + + // nearest integer part + const int py = int(floor(srcY) - srcBlockStartY); + // fractional part + const float fy = srcY - floor(srcY); + // discretized phase + const int fy_int = int(fy * kPhaseCount); + + // generate weights for directional filters + const int startEdgeMapIdx = py * kEdgeMapPitch + px; + vec4 edge[2][2]; + for (int i = 0; i < 2; i++) + { + for (int j = 0; j < 2; j++) + { + // need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid + edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j]; + } + } + const vec4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT; + + // load 6x6 support to regs + const int startTileIdx = py * kTilePitch + px; + float p[6][6]; + { + for (int i = 0; i < 6; ++i) + { + for (int j = 0; j < 6; ++j) + { + p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j]; + } + } + } + + // weight for luma + const float baseWeight = NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w; + + // final luma is a weighted product of directional & normal filters + float opY = 0; + + // get traditional scaler filter output + opY += FilterNormal(p, fx_int, fy_int) * baseWeight; + + // get directional filter bank output + opY += AddDirFilters(p, fx, fy, fx_int, fy_int, w); + + // do bilinear tap for luma upscaling + vec4 op = HOOKED_tex(vec2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY)); + + const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - op.r; + op.x += corr; + + imageStore(out_image, ivec2(dstX, dstY), op); + } } //!TEXTURE coef_scaler diff --git a/portable_config/shaders/NVSharpen.glsl b/portable_config/shaders/NVSharpen.glsl index 20a3ff38..77d476c8 100644 --- a/portable_config/shaders/NVSharpen.glsl +++ b/portable_config/shaders/NVSharpen.glsl @@ -22,10 +22,15 @@ // NVIDIA Image Scaling v1.0.2 by NVIDIA // ported to mpv by agyild -//!HOOK OUTPUT +// Changelog +// Made it directly operate on LUMA plane, since the original shader was operating +// on LUMA by 
deriving it from RGB. + +//!HOOK LUMA //!BIND HOOKED //!DESC NVIDIA Image Sharpening v1.0.2 //!COMPUTE 32 32 256 1 +//!WHEN OUTPUT.w OUTPUT.h * LUMA.w LUMA.h * / 1.0 > ! OUTPUT.w OUTPUT.h * LUMA.w LUMA.h * / 1.0 < ! * // User variables #define SHARPNESS 0.25 // Amount of sharpening. 0.0 to 1.0. @@ -38,7 +43,6 @@ #define kSupportSize 5 #define kNumPixelsX (NIS_BLOCK_WIDTH + kSupportSize + 1) #define kNumPixelsY (NIS_BLOCK_HEIGHT + kSupportSize + 1) -#define NIS_SCALE_FLOAT 1.0f const float sharpen_slider = clamp(SHARPNESS, 0.0f, 1.0f) - 0.5f; const float MaxScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.75f; const float MinScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.0f; @@ -72,13 +76,6 @@ const float kEps = 1.0f / 255.0f; shared float shPixelsY[kNumPixelsY][kNumPixelsX]; // Shader code -float getY(vec3 rgba) { -#if (NIS_HDR_MODE == 1) - return float(0.262f) * rgba.x + float(0.678f) * rgba.y + float(0.0593f) * rgba.z; -#else - return float(0.2126f) * rgba.x + float(0.7152f) * rgba.y + float(0.0722f) * rgba.z; -#endif -} vec4 GetEdgeMap(float p[5][5], int i, int j) { const float g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]); @@ -217,8 +214,8 @@ void hook() { for (int dx = 0; dx < 2; dx++) { const float tx = (dstBlockX + pos.x + dx + kShift) * kSrcNormX; const float ty = (dstBlockY + pos.y + dy + kShift) * kSrcNormY; - const vec4 px = HOOKED_tex(vec2(tx, ty)); - shPixelsY[pos.y + dy][pos.x + dx] = getY(px.xyz); + const float px = HOOKED_tex(vec2(tx, ty)).r; + shPixelsY[pos.y + dy][pos.x + dx] = px; } } } @@ -250,14 +247,12 @@ void hook() { // final USM is a weighted sum filter outputs const float usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w); - // do bilinear tap and correct rgb texel so it produces new sharpened luma + // do bilinear tap and correct luma texel so it produces new sharpened luma const int dstX = dstBlockX + pos.x; const int dstY = dstBlockY + pos.y; vec4 op = HOOKED_tex(vec2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY)); op.x += usmY; - op.y += usmY; - op.z += usmY; imageStore(out_image, ivec2(dstX, dstY), op); } diff --git a/portable_config/shaders/NVSharpen_rgb.glsl b/portable_config/shaders/NVSharpen_rgb.glsl new file mode 100644 index 00000000..f8edda39 --- /dev/null +++ b/portable_config/shaders/NVSharpen_rgb.glsl @@ -0,0 +1,264 @@ +// The MIT License(MIT) +// +// Copyright(c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files(the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies of +// the Software, and to permit persons to whom the Software is furnished to do so, +// subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +// MOD from NVSharpen.glsl + +//!HOOK OUTPUT +//!BIND HOOKED +//!DESC NVIDIA Image Sharpening v1.0.2 +//!COMPUTE 32 32 256 1 + +// User variables +#define SHARPNESS 0.25 // Amount of sharpening. 0.0 to 1.0. +#define NIS_THREAD_GROUP_SIZE 256 // May be set to 128 for better performance on NVIDIA hardware, otherwise set to 256. Don't forget to modify the COMPUTE directive accordingly as well (e.g., COMPUTE 32 32 128 1). +#define NIS_HDR_MODE 0 // Must be set to 1 for content with PQ colorspace. 0 or 1. + +// Constant variables +#define NIS_BLOCK_WIDTH 32 +#define NIS_BLOCK_HEIGHT 32 +#define kSupportSize 5 +#define kNumPixelsX (NIS_BLOCK_WIDTH + kSupportSize + 1) +#define kNumPixelsY (NIS_BLOCK_HEIGHT + kSupportSize + 1) +#define NIS_SCALE_FLOAT 1.0f +const float sharpen_slider = clamp(SHARPNESS, 0.0f, 1.0f) - 0.5f; +const float MaxScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.75f; +const float MinScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.0f; +const float LimitScale = (sharpen_slider >= 0.0f) ? 1.25f : 1.0f; +const float kDetectRatio = 2 * 1127.f / 1024.f; +const float kDetectThres = (bool(NIS_HDR_MODE) ? 32.0f : 64.0f) / 1024.0f; +const float kMinContrastRatio = bool(NIS_HDR_MODE) ? 1.5f : 2.0f; +const float kMaxContrastRatio = bool(NIS_HDR_MODE) ? 5.0f : 10.0f; +const float kSharpStartY = bool(NIS_HDR_MODE) ? 0.35f : 0.45f; +const float kSharpEndY = bool(NIS_HDR_MODE) ? 0.55f : 0.9f; +const float kSharpStrengthMin = max(0.0f, 0.4f + sharpen_slider * MinScale * (bool(NIS_HDR_MODE) ? 1.1f : 1.2)); +const float kSharpStrengthMax = ((bool(NIS_HDR_MODE) ? 2.2f : 1.6f) + sharpen_slider * MaxScale * 1.8f); +const float kSharpLimitMin = max((bool(NIS_HDR_MODE) ? 0.06f :0.1f), (bool(NIS_HDR_MODE) ? 0.1f : 0.14f) + sharpen_slider * LimitScale * (bool(NIS_HDR_MODE) ? 0.28f : 0.32f)); // +const float kSharpLimitMax = ((bool(NIS_HDR_MODE) ? 
0.6f : 0.5f) + sharpen_slider * LimitScale * 0.6f); +const float kRatioNorm = 1.0f / (kMaxContrastRatio - kMinContrastRatio); +const float kSharpScaleY = 1.0f / (kSharpEndY - kSharpStartY); +const float kSharpStrengthScale = kSharpStrengthMax - kSharpStrengthMin; +const float kSharpLimitScale = kSharpLimitMax - kSharpLimitMin; +const float kContrastBoost = 1.0f; +const float kEps = 1.0f / 255.0f; +#define kSrcNormX HOOKED_pt.x +#define kSrcNormY HOOKED_pt.y +#define kDstNormX kSrcNormX +#define kDstNormY kSrcNormY + +// HLSL to GLSL macros +#define saturate(x) clamp(x, 0, 1) +#define lerp(a, b, x) mix(a, b, x) + +// CS Shared variables +shared float shPixelsY[kNumPixelsY][kNumPixelsX]; + +// Shader code +float getY(vec3 rgba) { +#if (NIS_HDR_MODE == 1) + return float(0.262f) * rgba.x + float(0.678f) * rgba.y + float(0.0593f) * rgba.z; +#else + return float(0.2126f) * rgba.x + float(0.7152f) * rgba.y + float(0.0722f) * rgba.z; +#endif +} + +vec4 GetEdgeMap(float p[5][5], int i, int j) { + const float g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]); + const float g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]); + const float g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]); + const float g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]); + + const float g_0_90_max = max(g_0, g_90); + const float g_0_90_min = min(g_0, g_90); + const float g_45_135_max = max(g_45, g_135); + const float g_45_135_min = min(g_45, g_135); + + float e_0_90 = 0; + float e_45_135 = 0; + + if (g_0_90_max + g_45_135_max == 0) + { + return vec4(0, 0, 0, 0); + } + + e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f); + e_45_135 = 1.0f - e_0_90; + + bool c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min); + bool c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min); + bool c_g_0_90 = g_0_90_max == g_0; + bool c_g_45_135 = g_45_135_max == g_45; + + float f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f; + float f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f; + + float weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f; + float weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f; + float weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f; + float weight_135 = (c_45_135 && !c_g_45_135) ? 
f_e_45_135 : 0.0f; + + return vec4(weight_0, weight_90, weight_45, weight_135); +} + +float CalcLTIFast(const float y[5]) { + const float a_min = min(min(y[0], y[1]), y[2]); + const float a_max = max(max(y[0], y[1]), y[2]); + + const float b_min = min(min(y[2], y[3]), y[4]); + const float b_max = max(max(y[2], y[3]), y[4]); + + const float a_cont = a_max - a_min; + const float b_cont = b_max - b_min; + + const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps); + return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost; +} + +float EvalUSM(const float pxl[5], const float sharpnessStrength, const float sharpnessLimit) { + // USM profile + float y_usm = -0.6001f * pxl[1] + 1.2002f * pxl[2] - 0.6001f * pxl[3]; + // boost USM profile + y_usm *= sharpnessStrength; + // clamp to the limit + y_usm = min(sharpnessLimit, max(-sharpnessLimit, y_usm)); + // reduce ringing + y_usm *= CalcLTIFast(pxl); + + return y_usm; +} + +vec4 GetDirUSM(const float p[5][5]) { + // sharpness boost & limit are the same for all directions + const float scaleY = 1.0f - saturate((p[2][2] - kSharpStartY) * kSharpScaleY); + // scale the ramp to sharpen as a function of luma + const float sharpnessStrength = scaleY * kSharpStrengthScale + kSharpStrengthMin; + // scale the ramp to limit USM as a function of luma + const float sharpnessLimit = (scaleY * kSharpLimitScale + kSharpLimitMin) * p[2][2]; + + vec4 rval; + // 0 deg filter + float interp0Deg[5]; + { + for (int i = 0; i < 5; ++i) + { + interp0Deg[i] = p[i][2]; + } + } + + rval.x = EvalUSM(interp0Deg, sharpnessStrength, sharpnessLimit); + + // 90 deg filter + float interp90Deg[5]; + { + for (int i = 0; i < 5; ++i) + { + interp90Deg[i] = p[2][i]; + } + } + + rval.y = EvalUSM(interp90Deg, sharpnessStrength, sharpnessLimit); + + //45 deg filter + float interp45Deg[5]; + interp45Deg[0] = p[1][1]; + interp45Deg[1] = lerp(p[2][1], p[1][2], 0.5f); + interp45Deg[2] = p[2][2]; + interp45Deg[3] = lerp(p[3][2], p[2][3], 0.5f); + interp45Deg[4] = p[3][3]; + + rval.z = EvalUSM(interp45Deg, sharpnessStrength, sharpnessLimit); + + //135 deg filter + float interp135Deg[5]; + interp135Deg[0] = p[3][1]; + interp135Deg[1] = lerp(p[3][2], p[2][1], 0.5f); + interp135Deg[2] = p[2][2]; + interp135Deg[3] = lerp(p[2][3], p[1][2], 0.5f); + interp135Deg[4] = p[1][3]; + + rval.w = EvalUSM(interp135Deg, sharpnessStrength, sharpnessLimit); + return rval; +} + +void hook() { + uvec2 blockIdx = gl_WorkGroupID.xy; + uint threadIdx = gl_LocalInvocationID.x; + + const int dstBlockX = int(NIS_BLOCK_WIDTH * blockIdx.x); + const int dstBlockY = int(NIS_BLOCK_HEIGHT * blockIdx.y); + + // fill in input luma tile in batches of 2x2 pixels + // we use texture gather to get extra support necessary + // to compute 2x2 edge map outputs too + const float kShift = 0.5f - kSupportSize / 2; + + for (int i = int(threadIdx) * 2; i < kNumPixelsX * kNumPixelsY / 2; i += NIS_THREAD_GROUP_SIZE * 2) { + uvec2 pos = uvec2(uint(i) % uint(kNumPixelsX), uint(i) / uint(kNumPixelsX) * 2); + + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + const float tx = (dstBlockX + pos.x + dx + kShift) * kSrcNormX; + const float ty = (dstBlockY + pos.y + dy + kShift) * kSrcNormY; + const vec4 px = HOOKED_tex(vec2(tx, ty)); + shPixelsY[pos.y + dy][pos.x + dx] = getY(px.xyz); + } + } + } + + groupMemoryBarrier(); + barrier(); + + for (int k = int(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE) + { + const ivec2 pos = ivec2(uint(k) % 
uint(NIS_BLOCK_WIDTH), uint(k) / uint(NIS_BLOCK_WIDTH)); + + // load 5x5 support to regs + float p[5][5]; + + for (int i = 0; i < 5; ++i) + { + for (int j = 0; j < 5; ++j) + { + p[i][j] = shPixelsY[pos.y + i][pos.x + j]; + } + } + + // get directional filter bank output + vec4 dirUSM = GetDirUSM(p); + + // generate weights for directional filters + vec4 w = GetEdgeMap(p, kSupportSize / 2 - 1, kSupportSize / 2 - 1); + + // final USM is a weighted sum filter outputs + const float usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w); + + // do bilinear tap and correct rgb texel so it produces new sharpened luma + const int dstX = dstBlockX + pos.x; + const int dstY = dstBlockY + pos.y; + + vec4 op = HOOKED_tex(vec2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY)); + op.x += usmY; + op.y += usmY; + op.z += usmY; + + imageStore(out_image, ivec2(dstX, dstY), op); + } +} + diff --git a/portable_config/shaders/adaptive-sharpen.glsl b/portable_config/shaders/adaptive-sharpen.glsl index f0b46f50..23250cdd 100644 --- a/portable_config/shaders/adaptive-sharpen.glsl +++ b/portable_config/shaders/adaptive-sharpen.glsl @@ -23,7 +23,7 @@ // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Adaptive sharpen - version 2021-10-17 -// Tuned for use post-resize, EXPECTS FULL RANGE GAMMA LIGHT (requires ps >= 3.0) +// Tuned for use post-resize //!HOOK OUTPUT //!BIND HOOKED @@ -248,7 +248,7 @@ vec4 hook() { float sharpdiff_lim = sat(c0_Y + sharpdiff) - c0_Y; float satmul = (c0_Y + max(sharpdiff_lim*0.9, sharpdiff_lim)*0.3 + 0.03)/(c0_Y + 0.03); - vec3 res = c0_Y + (sharpdiff_lim*3.0 + sharpdiff)/4.0 + (c[0] - c0_Y)*satmul; + vec3 res = c0_Y + sharpdiff + (c[0] - c0_Y)*satmul; return vec4(res, HOOKED_texOff(0).a); } diff --git a/portable_config/shaders/adaptive-sharpen_anime.glsl b/portable_config/shaders/adaptive-sharpen_anime.glsl index 6ff80be0..7864b15d 100644 --- a/portable_config/shaders/adaptive-sharpen_anime.glsl +++ b/portable_config/shaders/adaptive-sharpen_anime.glsl @@ -22,8 +22,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// Adaptive sharpen - version 2021-10-17 -// Tuned for use post-resize, EXPECTS FULL RANGE GAMMA LIGHT (requires ps >= 3.0) +// MOD from adaptive-sharpen.glsl //!HOOK OUTPUT //!BIND HOOKED @@ -248,7 +247,7 @@ vec4 hook() { float sharpdiff_lim = sat(c0_Y + sharpdiff) - c0_Y; float satmul = (c0_Y + max(sharpdiff_lim*0.9, sharpdiff_lim)*0.3 + 0.03)/(c0_Y + 0.03); - vec3 res = c0_Y + (sharpdiff_lim*3.0 + sharpdiff)/4.0 + (c[0] - c0_Y)*satmul; + vec3 res = c0_Y + sharpdiff + (c[0] - c0_Y)*satmul; return vec4(res, HOOKED_texOff(0).a); } diff --git a/portable_config/shaders/adaptive-sharpen_luma.glsl b/portable_config/shaders/adaptive-sharpen_luma.glsl index cd66a17b..10c7bbbf 100644 --- a/portable_config/shaders/adaptive-sharpen_luma.glsl +++ b/portable_config/shaders/adaptive-sharpen_luma.glsl @@ -22,12 +22,11 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// Adaptive sharpen - version 2021-10-17 -// Tuned for use post-resize, EXPECTS FULL RANGE GAMMA LIGHT (requires ps >= 3.0) +// MOD from adaptive-sharpen.glsl //!HOOK LUMA //!BIND HOOKED -//!DESC adaptive-sharpen (luma) +//!DESC adaptive-sharpen_luma //--------------------------------------- Settings ------------------------------------------------ @@ -246,7 +245,8 @@ vec4 hook() { float sharpdiff_lim = sat(c0_Y + sharpdiff) - c0_Y; float satmul = (c0_Y + max(sharpdiff_lim*0.9, sharpdiff_lim)*0.3 + 0.03)/(c0_Y + 0.03); - vec3 res = c0_Y + (sharpdiff_lim*3.0 + sharpdiff)/4.0 + (c[0] - c0_Y)*satmul; + vec3 res = c0_Y + sharpdiff + (c[0] - c0_Y)*satmul; return vec4(res, HOOKED_texOff(0).a); } +